diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7abf997dd..033cee5bf 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -333,6 +333,40 @@ if(NOT GNURADIO_TRELLIS_FOUND)
 endif()
+###############################################################################
+# volk_gnsssdr module
+# In order to use the volk_gnsssdr module a target needs both of:
+# 1) include_directories(..${VOLK_GNSSSDR_INCLUDE_DIRS}..)
+# 2) target_link_libraries(..${VOLK_GNSSSDR_LIBRARIES}..)
+###############################################################################
+
+if(ENABLE_VOLK_GNSSSDR)
+    message(STATUS "The volk_gnsssdr module with custom protokernels coded by gnss-sdr will be compiled.")
+    message(STATUS "You can disable it with 'cmake -DENABLE_VOLK_GNSSSDR=OFF ../'" )
+
+    # Configure the library in-tree as a subproject of this build.
+    set(VOLK_GNSSSDR_BASE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/src/algorithms/libs/volk_gnsssdr)
+    add_subdirectory(${VOLK_GNSSSDR_BASE_PATH})
+
+    # Headers are taken from the source tree and from the subproject's build tree.
+    set(VOLK_GNSSSDR_INCLUDE_DIRS
+        ${VOLK_GNSSSDR_BASE_PATH}/include
+        ${CMAKE_CURRENT_BINARY_DIR}/src/algorithms/libs/volk_gnsssdr/include
+    )
+
+    # Link through the volk_gnsssdr target instead of a file path such as
+    # ${VOLK_GNSSSDR_BASE_PATH}/lib/Debug/libvolk_gnsssdr.dylib
+    set(VOLK_GNSSSDR_LIBRARIES volk_gnsssdr)
+
+    # STATUS keeps these lines on stdout with the rest of the configure log.
+    message(STATUS " * INCLUDES: ${VOLK_GNSSSDR_INCLUDE_DIRS} ")
+    message(STATUS " * LIBS: ${VOLK_GNSSSDR_LIBRARIES} ")
+    message(STATUS "END OF: Setup volk_gnsssdr as a subproject.")
+else()
+    message(STATUS "The volk_gnsssdr module with custom protokernels coded by gnss-sdr is not enabled. Some configurations that use custom protokernels will not work." )
+    message(STATUS "Enable it with 'cmake -D ENABLE_VOLK_GNSSSDR=ON ../'." )
+endif()
+
 ################################################################################
 # gflags - http://code.google.com/p/gflags/
diff --git a/src/algorithms/libs/volk_gnsssdr/CMakeLists.txt b/src/algorithms/libs/volk_gnsssdr/CMakeLists.txt
new file mode 100644
index 000000000..77481beda
--- /dev/null
+++ b/src/algorithms/libs/volk_gnsssdr/CMakeLists.txt
@@ -0,0 +1,183 @@
+#
+# Copyright 2011 Free Software Foundation, Inc.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+########################################################################
+# Project setup
+########################################################################
+cmake_minimum_required(VERSION 2.6)
+# Default to a Release build only when the caller has not chosen a build type.
+if(NOT DEFINED CMAKE_BUILD_TYPE)
+    set(CMAKE_BUILD_TYPE Release)
+endif()
+set(CMAKE_BUILD_TYPE ${CMAKE_BUILD_TYPE} CACHE STRING "Choose build type: None Debug Release RelWithDebInfo MinSizeRel")
+project(volk_gnsssdr)
+enable_language(CXX)
+enable_language(C)
+enable_testing()
+set(VERSION 0.1)
+set(LIBVER 0.0.0)
+
+set(CMAKE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) #allows this to be a sub-project
+set(CMAKE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}) #allows this to be a sub-project
+set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake) #location for custom "Modules"
+
+########################################################################
+# Environment setup
+########################################################################
+if(NOT DEFINED BOOST_ROOT)
+    set(BOOST_ROOT ${CMAKE_INSTALL_PREFIX})
+endif()
+
+if(NOT DEFINED CROSSCOMPILE_MULTILIB)
+    set(CROSSCOMPILE_MULTILIB "")
+endif()
+set(CROSSCOMPILE_MULTILIB ${CROSSCOMPILE_MULTILIB} CACHE STRING "Define \"true\" if you have and want to use multiple C development libs installed for cross compile")
+
+
+########################################################################
+# Dependencies setup
+########################################################################
+include(GrPython) #sets PYTHON_EXECUTABLE and PYTHON_DASH_B
+VOLK_PYTHON_CHECK_MODULE("python >= 2.5" sys "sys.version.split()[0] >= '2.5'" PYTHON_MIN_VER_FOUND)
+VOLK_PYTHON_CHECK_MODULE("Cheetah >= 2.0.0" Cheetah "Cheetah.Version >= '2.0.0'" CHEETAH_FOUND)
+
+if(NOT PYTHON_MIN_VER_FOUND)
+    message(FATAL_ERROR "Python 2.5 or greater required to build VOLK")
+endif()
+
+if(NOT CHEETAH_FOUND)
+    message(FATAL_ERROR "Cheetah templates required to build VOLK")
+endif()
+
+# On MSVC, Boost is linked dynamically unless BOOST_ALL_DYN_LINK was set to a
+# false value by the caller.
+if(MSVC)
+    if(NOT DEFINED BOOST_ALL_DYN_LINK)
+        set(BOOST_ALL_DYN_LINK TRUE)
+    endif()
+    set(BOOST_ALL_DYN_LINK "${BOOST_ALL_DYN_LINK}" CACHE BOOL "boost enable dynamic linking")
+    if(BOOST_ALL_DYN_LINK)
+        add_definitions(-DBOOST_ALL_DYN_LINK) #setup boost auto-linking in msvc
+    else()
+        unset(BOOST_REQUIRED_COMPONENTS) #empty components list for static link
+    endif()
+endif()
+include(VolkBoost)
+
+if(NOT Boost_FOUND)
+    message(FATAL_ERROR "VOLK Requires boost to build")
+endif()
+
+option(ENABLE_ORC "Enable Orc" True)
+if(ENABLE_ORC)
+    find_package(ORC)
+else()
+    message(STATUS "Disabling use of ORC")
+endif()
+
+########################################################################
+# Setup the package config file
+########################################################################
+#set variables found in the pc.in file
+set(prefix ${CMAKE_INSTALL_PREFIX})
+set(exec_prefix "\${prefix}")
+set(libdir "\${exec_prefix}/lib${LIB_SUFFIX}")
+set(includedir "\${prefix}/include")
+
+configure_file(
+    ${CMAKE_CURRENT_SOURCE_DIR}/volk_gnsssdr.pc.in
+    ${CMAKE_CURRENT_BINARY_DIR}/volk_gnsssdr.pc
+@ONLY)
+
+install(
+    FILES ${CMAKE_CURRENT_BINARY_DIR}/volk_gnsssdr.pc
+    DESTINATION lib${LIB_SUFFIX}/pkgconfig
+    COMPONENT "volk_gnsssdr_devel"
+)
+
+########################################################################
+# Install all headers in the include directories
+########################################################################
+set(VOLK_RUNTIME_DIR bin)
+set(VOLK_LIBRARY_DIR lib${LIB_SUFFIX})
+set(VOLK_INCLUDE_DIR include)
+
+install(
+    DIRECTORY ${CMAKE_SOURCE_DIR}/kernels/volk_gnsssdr
+    DESTINATION include COMPONENT "volk_gnsssdr_devel"
+    FILES_MATCHING PATTERN "*.h"
+)
+
+# Headers under CMAKE_SOURCE_DIR come from the source tree; those under
+# CMAKE_BINARY_DIR are taken from the build tree.
+install(FILES
+    ${CMAKE_SOURCE_DIR}/include/volk_gnsssdr/volk_gnsssdr_prefs.h
+    ${CMAKE_SOURCE_DIR}/include/volk_gnsssdr/volk_gnsssdr_complex.h
+    ${CMAKE_SOURCE_DIR}/include/volk_gnsssdr/volk_gnsssdr_common.h
+    ${CMAKE_BINARY_DIR}/include/volk_gnsssdr/volk_gnsssdr.h
+    ${CMAKE_BINARY_DIR}/include/volk_gnsssdr/volk_gnsssdr_cpu.h
+    ${CMAKE_BINARY_DIR}/include/volk_gnsssdr/volk_gnsssdr_config_fixed.h
+    ${CMAKE_BINARY_DIR}/include/volk_gnsssdr/volk_gnsssdr_typedefs.h
+    ${CMAKE_SOURCE_DIR}/include/volk_gnsssdr/volk_gnsssdr_malloc.h
+    DESTINATION include/volk_gnsssdr
+    COMPONENT "volk_gnsssdr_devel"
+)
+
+########################################################################
+# Install cmake search routine for external use
+########################################################################
+
+if(NOT CMAKE_MODULES_DIR)
+    set(CMAKE_MODULES_DIR lib${LIB_SUFFIX}/cmake)
+endif()
+
+install(
+    FILES ${CMAKE_CURRENT_SOURCE_DIR}/cmake/VolkConfig.cmake
+    DESTINATION ${CMAKE_MODULES_DIR}/volk_gnsssdr
+    COMPONENT "volk_gnsssdr_devel"
+)
+
+########################################################################
+# On Apple only, set install name and use rpath correctly, if not already set
+########################################################################
+# The install name and rpath use VOLK_LIBRARY_DIR (defined above). The
+# previously referenced GR_LIBRARY_DIR is not defined anywhere in this file,
+# which left both paths pointing at the bare install prefix.
+if(APPLE)
+    if(NOT CMAKE_INSTALL_NAME_DIR)
+        set(CMAKE_INSTALL_NAME_DIR
+            ${CMAKE_INSTALL_PREFIX}/${VOLK_LIBRARY_DIR} CACHE
+            PATH "Library Install Name Destination Directory" FORCE)
+    endif()
+    if(NOT CMAKE_INSTALL_RPATH)
+        set(CMAKE_INSTALL_RPATH
+            ${CMAKE_INSTALL_PREFIX}/${VOLK_LIBRARY_DIR} CACHE
+            PATH "Library Install RPath" FORCE)
+    endif()
+    if(NOT CMAKE_BUILD_WITH_INSTALL_RPATH)
+        set(CMAKE_BUILD_WITH_INSTALL_RPATH ON CACHE
+            BOOL "Do Build Using Library Install RPath" FORCE)
+    endif()
+endif()
+
+########################################################################
+# Setup the library
+########################################################################
+add_subdirectory(lib)
+
+########################################################################
+# And the utility apps
+########################################################################
+# apps/ builds the volk_gnsssdr_profile and volk_gnsssdr-config-info executables.
+add_subdirectory(apps)
+# NOTE(review): presumably the kernel scaffolding tool; its CMakeLists is not visible here.
+add_subdirectory(python/volk_gnsssdr_modtool)
+
+########################################################################
+# Print summary
+########################################################################
+message(STATUS "Using install prefix: ${CMAKE_INSTALL_PREFIX}")
diff --git a/src/algorithms/libs/volk_gnsssdr/apps/CMakeLists.txt b/src/algorithms/libs/volk_gnsssdr/apps/CMakeLists.txt
new file mode 100644
index 000000000..3158c4280
--- /dev/null
+++ b/src/algorithms/libs/volk_gnsssdr/apps/CMakeLists.txt
@@ -0,0 +1,61 @@
+#
+# Copyright 2011-2013 Free Software Foundation, Inc.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+########################################################################
+# Setup profiler
+########################################################################
+# Both utilities need Boost; nothing is built when it was not found.
+if(Boost_FOUND)
+
+# PROJECT_SOURCE_DIR / PROJECT_BINARY_DIR refer to the volk_gnsssdr project
+# root (where project(volk_gnsssdr) is called), so these paths do not depend
+# on the parent overriding the built-in CMAKE_SOURCE_DIR / CMAKE_BINARY_DIR.
+if(MSVC)
+    include_directories(${PROJECT_SOURCE_DIR}/cmake/msvc)
+endif()
+
+include_directories(
+    ${CMAKE_CURRENT_SOURCE_DIR}
+    ${CMAKE_CURRENT_BINARY_DIR}
+    ${PROJECT_SOURCE_DIR}/include
+    ${PROJECT_BINARY_DIR}/include
+    ${PROJECT_SOURCE_DIR}/lib
+    ${PROJECT_BINARY_DIR}/lib
+    ${Boost_INCLUDE_DIRS}
+)
+
+# MAKE volk_gnsssdr_profile
+# The profiler is compiled together with the library's qa_utils.cc.
+add_executable(volk_gnsssdr_profile
+    ${CMAKE_CURRENT_SOURCE_DIR}/volk_gnsssdr_profile.cc
+    ${PROJECT_SOURCE_DIR}/lib/qa_utils.cc
+)
+
+target_link_libraries(volk_gnsssdr_profile volk_gnsssdr ${Boost_LIBRARIES})
+
+install(
+    TARGETS volk_gnsssdr_profile
+    DESTINATION bin
+    COMPONENT "volk_gnsssdr"
+)
+
+# MAKE volk_gnsssdr-config-info
+add_executable(volk_gnsssdr-config-info volk_gnsssdr-config-info.cc)
+target_link_libraries(volk_gnsssdr-config-info volk_gnsssdr ${Boost_LIBRARIES})
+
+install(
+    TARGETS volk_gnsssdr-config-info
+    DESTINATION bin
+    COMPONENT "volk_gnsssdr"
+)
+
+endif()
diff --git a/src/algorithms/libs/volk_gnsssdr/apps/volk_gnsssdr-config-info.cc b/src/algorithms/libs/volk_gnsssdr/apps/volk_gnsssdr-config-info.cc
new file mode 100644
index 000000000..ec8c09525
--- /dev/null
+++ b/src/algorithms/libs/volk_gnsssdr/apps/volk_gnsssdr-config-info.cc
@@ -0,0 +1,96 @@
+/* -*- c++ -*- */
+/*
+ * Copyright 2013 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING.  If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+// NOTE(review): the include targets were missing from this file. The standard
+// and Boost headers below follow from what the code uses; <config.h> and
+// <volk_gnsssdr/constants.h> mirror the upstream VOLK layout - confirm the
+// project header names.
+#if HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <volk_gnsssdr/constants.h>
+#include "volk_gnsssdr/volk_gnsssdr.h"
+#include <boost/program_options.hpp>
+#include <iostream>
+
+namespace po = boost::program_options;
+
+/*
+ * Prints build and runtime information about the volk_gnsssdr library.
+ * Each recognised option prints one item on stdout. Returns 1 when the
+ * command line cannot be parsed, when no option is given, or when --help is
+ * requested; returns 0 otherwise.
+ */
+int
+main(int argc, char **argv)
+{
+  po::options_description desc("Program options: volk_gnsssdr-config-info [options]");
+  po::variables_map vm;
+
+  desc.add_options()
+    ("help,h", "print help message")
+    ("prefix", "print VOLK installation prefix")
+    ("builddate", "print VOLK build date (RFC2822 format)")
+    ("cc", "print VOLK C compiler version")
+    ("cflags", "print VOLK CFLAGS")
+    ("all-machines", "print VOLK machines built into library")
+    ("avail-machines", "print VOLK machines the current platform can use")
+    ("machine", "print the VOLK machine that will be used")
+    ("version,v", "print VOLK version")
+    ;
+
+  try {
+    po::store(po::parse_command_line(argc, argv, desc), vm);
+    po::notify(vm);
+  }
+  catch (po::error& error){
+    std::cerr << "Error: " << error.what() << std::endl << std::endl;
+    std::cerr << desc << std::endl;
+    return 1;
+  }
+
+  // With nothing to print, show the usage text instead.
+  if(vm.size() == 0 || vm.count("help")) {
+    std::cout << desc << std::endl;
+    return 1;
+  }
+
+  if(vm.count("prefix"))
+    std::cout << volk_gnsssdr_prefix() << std::endl;
+
+  if(vm.count("builddate"))
+    std::cout << volk_gnsssdr_build_date() << std::endl;
+
+  if(vm.count("version"))
+    std::cout << volk_gnsssdr_version() << std::endl;
+
+  if(vm.count("cc"))
+    std::cout << volk_gnsssdr_c_compiler() << std::endl;
+
+  if(vm.count("cflags"))
+    std::cout << volk_gnsssdr_compiler_flags() << std::endl;
+
+  // stick an extra ';' to make output of this and avail-machines the
+  // same structure for easier parsing
+  if(vm.count("all-machines"))
+    std::cout << volk_gnsssdr_available_machines() << ";" << std::endl;
+
+  if(vm.count("avail-machines")) {
+    volk_gnsssdr_list_machines();
+  }
+
+  if(vm.count("machine")) {
+    std::cout << volk_gnsssdr_get_machine() << std::endl;
+  }
+
+  return 0;
+}
diff --git a/src/algorithms/libs/volk_gnsssdr/apps/volk_gnsssdr_profile.cc b/src/algorithms/libs/volk_gnsssdr/apps/volk_gnsssdr_profile.cc
new file mode 100644
index 000000000..ed5c0d53d
--- /dev/null
+++ b/src/algorithms/libs/volk_gnsssdr/apps/volk_gnsssdr_profile.cc
@@ -0,0 +1,239 @@
+/* -*- c++ -*- */
+/*
+ * Copyright 2012-2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING.  If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#include "qa_utils.h"
+
+// NOTE(review): the include targets and template arguments were missing from
+// this file. Standard and Boost headers follow from what the code uses; the
+// two project headers and the key type of tpair mirror the upstream VOLK
+// profiler - confirm against qa_utils.h.
+#include <volk_gnsssdr/volk_gnsssdr.h>
+#include <volk_gnsssdr/volk_gnsssdr_prefs.h>
+
+#include <ciso646>
+#include <vector>
+#include <string>
+#include <utility>
+#include <boost/filesystem.hpp>
+#include <boost/foreach.hpp>
+#include <boost/program_options.hpp>
+#include <boost/xpressive/xpressive.hpp>
+#include <iostream>
+#include <fstream>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+namespace fs = boost::filesystem;
+
+/*
+ * Serialises the profiling results as a JSON document of the form
+ * { "volk_tests": [ { name, vlen, iter, best_arch_a, best_arch_u,
+ *                     results: { <impl>: { name, time, units } } } ] }.
+ *
+ * json_file  output stream the document is written to
+ * results    one entry per profiled kernel
+ */
+void write_json(std::ofstream &json_file, std::vector<volk_gnsssdr_test_results_t> results) {
+    json_file << "{" << std::endl;
+    json_file << " \"volk_tests\": [" << std::endl;
+    size_t len = results.size();
+    size_t i = 0;
+    BOOST_FOREACH(volk_gnsssdr_test_results_t &result, results) {
+        json_file << "  {" << std::endl;
+        json_file << "   \"name\": \"" << result.name << "\"," << std::endl;
+        json_file << "   \"vlen\": " << result.vlen << "," << std::endl;
+        json_file << "   \"iter\": " << result.iter << "," << std::endl;
+        json_file << "   \"best_arch_a\": \"" << result.best_arch_a
+            << "\"," << std::endl;
+        json_file << "   \"best_arch_u\": \"" << result.best_arch_u
+            << "\"," << std::endl;
+        json_file << "   \"results\": {" << std::endl;
+        size_t results_len = result.results.size();
+        size_t ri = 0;
+        typedef std::pair<std::string, volk_gnsssdr_test_time_t> tpair;
+        BOOST_FOREACH(tpair pair, result.results) {
+            volk_gnsssdr_test_time_t time = pair.second;
+            json_file << "    \"" << time.name << "\": {" << std::endl;
+            json_file << "     \"name\": \"" << time.name << "\"," << std::endl;
+            json_file << "     \"time\": " << time.time << "," << std::endl;
+            json_file << "     \"units\": \"" << time.units << "\"" << std::endl;
+            json_file << "    }" ;
+            // JSON forbids a trailing comma after the last member.
+            if(ri+1 != results_len) {
+                json_file << ",";
+            }
+            json_file << std::endl;
+            ri++;
+        }
+        json_file << "   }" << std::endl;
+        json_file << "  }";
+        if(i+1 != len) {
+            json_file << ",";
+        }
+        json_file << std::endl;
+        i++;
+    }
+    json_file << " ]" << std::endl;
+    json_file << "}" << std::endl;
+}
+
+/*
+ * Profiles the volk_gnsssdr kernels listed below and records the fastest
+ * implementation of each one.
+ *
+ *   -b / --benchmark    run all kernels (benchmark mode)
+ *   -R / --tests-regex  only run kernels matching the expression; the
+ *                       config file is then not rewritten
+ *   -j / --json FILE    additionally write the results as JSON to FILE
+ *
+ * Returns 0 on success and 1 on a command-line error or when the config
+ * file cannot be opened for writing.
+ */
+int main(int argc, char *argv[]) {
+    // Adding program options
+    boost::program_options::options_description desc("Options");
+    desc.add_options()
+      ("help,h", "Print help messages")
+      ("benchmark,b",
+            boost::program_options::value<bool>()->default_value( false )
+                                                ->implicit_value( true ),
+            "Run all kernels (benchmark mode)")
+      ("tests-regex,R",
+            boost::program_options::value<std::string>(),
+            "Run tests matching regular expression.")
+      ("json,j",
+            boost::program_options::value<std::string>(),
+            "JSON output file")
+      ;
+
+    // Handle the options that were given
+    boost::program_options::variables_map vm;
+    bool benchmark_mode;
+    std::string kernel_regex;
+    bool store_results = true;
+    std::ofstream json_file;
+
+    try {
+        boost::program_options::store(boost::program_options::parse_command_line(argc, argv, desc), vm);
+        boost::program_options::notify(vm);
+        benchmark_mode = vm.count("benchmark")?vm["benchmark"].as<bool>():false;
+        if ( vm.count("tests-regex" ) ) {
+            kernel_regex = vm["tests-regex"].as<std::string>();
+            store_results = false;
+            std::cout << "Warning: using a regexp will not save results to a config" << std::endl;
+        }
+        else {
+            kernel_regex = ".*";
+            store_results = true;
+        }
+    } catch (boost::program_options::error& error) {
+        std::cerr << "Error: " << error.what() << std::endl << std::endl;
+        std::cerr << desc << std::endl;
+        return 1;
+    }
+    /** --help option
+     */
+    if ( vm.count("help") )
+    {
+        std::cout << "The VOLK profiler." << std::endl
+                  << desc << std::endl;
+        return 0;
+    }
+
+    if ( vm.count("json") )
+    {
+        json_file.open( vm["json"].as<std::string>().c_str() );
+    }
+
+
+    // Run tests
+    std::vector<volk_gnsssdr_test_results_t> results;
+
+    //VOLK_PROFILE(volk_gnsssdr_16i_x5_add_quad_16i_x4, 1e-4, 2046, 10000, &results, benchmark_mode, kernel_regex);
+    //VOLK_PROFILE(volk_gnsssdr_16i_branch_4_state_8, 1e-4, 2046, 10000, &results, benchmark_mode, kernel_regex);
+    //VOLK_PROFILE(volk_gnsssdr_16i_max_star_16i, 0, 0, 204602, 10000, &results, benchmark_mode, kernel_regex);
+    //VOLK_PROFILE(volk_gnsssdr_16i_max_star_horizontal_16i, 0, 0, 204602, 10000, &results, benchmark_mode, kernel_regex);
+    //VOLK_PROFILE(volk_gnsssdr_16i_permute_and_scalar_add, 1e-4, 0, 2046, 10000, &results, benchmark_mode, kernel_regex);
+    //VOLK_PROFILE(volk_gnsssdr_16i_x4_quad_max_star_16i, 1e-4, 0, 2046, 10000, &results, benchmark_mode, kernel_regex);
+    //VOLK_PROFILE(volk_gnsssdr_32fc_x2_conjugate_dot_prod_32fc, 1e-4, 0, 2046, 10000, &results, benchmark_mode, kernel_regex);
+    //VOLK_PROFILE(volk_gnsssdr_32fc_s32f_x2_power_spectral_density_32f, 1e-4, 2046, 10000, &results, benchmark_mode, kernel_regex);
+    //VOLK_PROFILE(volk_gnsssdr_32f_s32f_32f_fm_detect_32f, 1e-4, 2046, 10000, &results, benchmark_mode, kernel_regex);
+    //VOLK_PROFILE(volk_gnsssdr_32u_popcnt, 0, 0, 2046, 10000, &results, benchmark_mode, kernel_regex);
+    //VOLK_PROFILE(volk_gnsssdr_64u_popcnt, 0, 0, 2046, 10000, &results, benchmark_mode, kernel_regex);
+    //VOLK_PROFILE(volk_gnsssdr_32fc_s32fc_multiply_32fc, 1e-4, lv_32fc_t(1.0, 0.5), 204602, 1000, &results, benchmark_mode, kernel_regex);
+
+    //GNSS-SDR PROTO-KERNELS
+    //lv_32fc_t sfv = lv_cmake((float)1, (float)2);
+    //example: VOLK_PROFILE(volk_gnsssdr_8ic_s8ic_multiply_8ic, 1e-4, sfv, 204602, 1000, &results, benchmark_mode, kernel_regex);
+
+    //CAN NOT BE TESTED YET BECAUSE VOLK MODULE DOES NOT SUPPORT IT:
+    //VOLK_PROFILE(volk_gnsssdr_s32f_x2_update_local_carrier_32fc, 1e-4, 0, 16007, 1, &results, benchmark_mode, kernel_regex);
+    //VOLK_PROFILE(volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc, 1e-4, 0, 7, 1, &results, benchmark_mode, kernel_regex);
+
+    VOLK_PROFILE(volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex);
+    VOLK_PROFILE(volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex);
+    VOLK_PROFILE(volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex);
+    VOLK_PROFILE(volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex);
+
+    VOLK_PROFILE(volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex);
+    VOLK_PROFILE(volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex);
+    VOLK_PROFILE(volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex);
+
+    VOLK_PROFILE(volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex);
+    VOLK_PROFILE(volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex);
+    VOLK_PROFILE(volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex);
+    VOLK_PROFILE(volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex);
+
+    VOLK_PROFILE(volk_gnsssdr_32fc_convert_16ic, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex);
+    VOLK_PROFILE(volk_gnsssdr_32fc_convert_8ic, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex);
+    VOLK_PROFILE(volk_gnsssdr_32fc_s32f_convert_8ic, 1e-4, 5, 16000, 250, &results, benchmark_mode, kernel_regex);
+
+    /*VOLK_PROFILE(volk_gnsssdr_32f_accumulator_s32f, 1e-4, 0, 204602, 10000, &results, benchmark_mode, kernel_regex);
+    VOLK_PROFILE(volk_gnsssdr_8i_accumulator_s8i, 1e-4, 0, 204602, 10000, &results, benchmark_mode, kernel_regex);
+    VOLK_PROFILE(volk_gnsssdr_32f_index_max_16u, 3, 0, 204602, 5000, &results, benchmark_mode, kernel_regex);
+    VOLK_PROFILE(volk_gnsssdr_8i_index_max_16u, 3, 0, 204602, 5000, &results, benchmark_mode, kernel_regex);
+    VOLK_PROFILE(volk_gnsssdr_8i_max_s8i, 3, 0, 204602, 5000, &results, benchmark_mode, kernel_regex);
+    VOLK_PROFILE(volk_gnsssdr_32f_x2_add_32f, 1e-4, 0, 204602, 10000, &results, benchmark_mode, kernel_regex);
+    VOLK_PROFILE(volk_gnsssdr_8i_x2_add_8i, 1e-4, 0, 204602, 10000, &results, benchmark_mode, kernel_regex);
+    VOLK_PROFILE(volk_gnsssdr_32fc_conjugate_32fc, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
+    VOLK_PROFILE(volk_gnsssdr_8ic_conjugate_8ic, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
+    VOLK_PROFILE(volk_gnsssdr_32fc_magnitude_squared_32f, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
+    VOLK_PROFILE(volk_gnsssdr_8ic_magnitude_squared_8i, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
+    VOLK_PROFILE(volk_gnsssdr_32fc_s32fc_multiply_32fc, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
+    VOLK_PROFILE(volk_gnsssdr_8ic_s8ic_multiply_8ic, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
+    VOLK_PROFILE(volk_gnsssdr_32fc_x2_dot_prod_32fc, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
+    VOLK_PROFILE(volk_gnsssdr_8ic_x2_dot_prod_8ic, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
+    VOLK_PROFILE(volk_gnsssdr_32fc_x2_multiply_32fc, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
+    VOLK_PROFILE(volk_gnsssdr_8ic_x2_multiply_8ic, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
+    VOLK_PROFILE(volk_gnsssdr_8u_x2_multiply_8u, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
+    VOLK_PROFILE(volk_gnsssdr_64f_accumulator_64f, 1e-4, 0, 16000, 1000, &results, benchmark_mode, kernel_regex);
+    VOLK_PROFILE(volk_gnsssdr_32f_s32f_convert_16i, 1e-4, 1, 204602, 250, &results, benchmark_mode, kernel_regex);
+    VOLK_PROFILE(volk_gnsssdr_16i_s32f_convert_32f, 1e-4, 1, 204602, 250, &results, benchmark_mode, kernel_regex);*/
+
+    // Write the JSON report when one was requested. Previously the file was
+    // opened above but write_json() was never called, leaving it empty.
+    if ( vm.count("json") )
+    {
+        if(json_file.is_open()) {
+            write_json(json_file, results);
+            json_file.close();
+        }
+        else {
+            std::cerr << "Error opening file " << vm["json"].as<std::string>() << std::endl;
+        }
+    }
+
+    // Until we can update the config on a kernel by kernel basis
+    // do not overwrite volk_config when using a regex.
+    if(store_results) {
+        char path[1024];
+        volk_gnsssdr_get_config_path(path);
+
+        const fs::path config_path(path);
+
+        if (not fs::exists(config_path.branch_path()))
+        {
+            std::cout << "Creating " << config_path.branch_path() << "..." << std::endl;
+            fs::create_directories(config_path.branch_path());
+        }
+
+        std::cout << "Writing " << config_path << "..." << std::endl;
+        std::ofstream config(config_path.string().c_str());
+        if(!config.is_open()) { //either we don't have write access or we don't have the dir yet
+            std::cout << "Error opening file " << config_path << std::endl;
+            // Nothing can be written to a stream that failed to open.
+            return 1;
+        }
+
+        // NOTE(review): the original used one backslash-continued literal whose
+        // exact leading whitespace cannot be recovered; the header lines are
+        // written starting at column 0.
+        config << "#this file is generated by volk_profile.\n"
+                  "#the function name is followed by the preferred architecture.\n";
+
+        BOOST_FOREACH(volk_gnsssdr_test_results_t result, results) {
+            config << result.config_name << " "
+                << result.best_arch_a << " "
+                << result.best_arch_u << std::endl;
+        }
+        config.close();
+    }
+    else {
+        std::cout << "Warning: config not generated" << std::endl;
+    }
+
+    return 0;
+}
diff --git a/src/algorithms/libs/volk_gnsssdr/cmake/CMakeParseArgumentsCopy.cmake b/src/algorithms/libs/volk_gnsssdr/cmake/CMakeParseArgumentsCopy.cmake
new file mode 100644
index 000000000..7ce4c49ae
--- /dev/null
+++ b/src/algorithms/libs/volk_gnsssdr/cmake/CMakeParseArgumentsCopy.cmake
@@ -0,0 +1,138 @@
+# CMAKE_PARSE_ARGUMENTS(<prefix> <options> <one_value_keywords> <multi_value_keywords> args...)
+#
+# CMAKE_PARSE_ARGUMENTS() is intended to be used in macros or functions for
+# parsing the arguments given to that macro or function.
+# It processes the arguments and defines a set of variables which hold the
+# values of the respective options.
+#
+# The <options> argument contains all options for the respective macro,
+# i.e.
keywords which can be used when calling the macro without any value
+# following, like e.g. the OPTIONAL keyword of the install() command.
+#
+# The <one_value_keywords> argument contains all keywords for this macro
+# which are followed by one value, like e.g. DESTINATION keyword of the
+# install() command.
+#
+# The <multi_value_keywords> argument contains all keywords for this macro
+# which can be followed by more than one value, like e.g. the TARGETS or
+# FILES keywords of the install() command.
+#
+# When done, CMAKE_PARSE_ARGUMENTS() will have defined for each of the
+# keywords listed in <options>, <one_value_keywords> and
+# <multi_value_keywords> a variable composed of the given <prefix>
+# followed by "_" and the name of the respective keyword.
+# These variables will then hold the respective value from the argument list.
+# For the <options> keywords this will be TRUE or FALSE.
+#
+# All remaining arguments are collected in a variable
+# <prefix>_UNPARSED_ARGUMENTS, this can be checked afterwards to see whether
+# your macro was called with unrecognized parameters.
+#
+# As an example here a my_install() macro, which takes similar arguments as the
+# real install() command:
+#
+#   function(MY_INSTALL)
+#     set(options OPTIONAL FAST)
+#     set(oneValueArgs DESTINATION RENAME)
+#     set(multiValueArgs TARGETS CONFIGURATIONS)
+#     cmake_parse_arguments(MY_INSTALL "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN} )
+#     ...
+#
+# Assume my_install() has been called like this:
+#   my_install(TARGETS foo bar DESTINATION bin OPTIONAL blub)
+#
+# After the cmake_parse_arguments() call the macro will have set the following
+# variables:
+#   MY_INSTALL_OPTIONAL = TRUE
+#   MY_INSTALL_FAST = FALSE (this option was not used when calling my_install())
+#   MY_INSTALL_DESTINATION = "bin"
+#   MY_INSTALL_RENAME = "" (was not used)
+#   MY_INSTALL_TARGETS = "foo;bar"
+#   MY_INSTALL_CONFIGURATIONS = "" (was not used)
+#   MY_INSTALL_UNPARSED_ARGUMENTS = "blub" (no value expected after "OPTIONAL")
+#
+# You can then continue and process these variables.
+#
+# Keywords terminate lists of values, e.g.
if directly after a one_value_keyword
+# another recognized keyword follows, this is interpreted as the beginning of
+# the new option.
+# E.g. my_install(TARGETS foo DESTINATION OPTIONAL) would not result in
+# MY_INSTALL_DESTINATION being set to "OPTIONAL"; instead MY_INSTALL_DESTINATION
+# would be empty and MY_INSTALL_OPTIONAL would be set to TRUE.
+
+#=============================================================================
+# Copyright 2010 Alexander Neundorf
+#
+# Distributed under the OSI-approved BSD License (the "License");
+# see accompanying file Copyright.txt for details.
+#
+# This software is distributed WITHOUT ANY WARRANTY; without even the
+# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# See the License for more information.
+#=============================================================================
+# (To distribute this file outside of CMake, substitute the full
+#  License text for the above reference.)
+
+
+# Include guard: the function below is defined only once per configure run.
+if(__CMAKE_PARSE_ARGUMENTS_INCLUDED)
+  return()
+endif()
+set(__CMAKE_PARSE_ARGUMENTS_INCLUDED TRUE)
+
+
+# Parses ${ARGN} against three keyword lists and sets ${prefix}_<keyword>
+# variables plus ${prefix}_UNPARSED_ARGUMENTS in the caller's scope.
+#   prefix           prefix of every result variable
+#   _optionNames     keywords that take no value (result is TRUE/FALSE)
+#   _singleArgNames  keywords followed by exactly one value
+#   _multiArgNames   keywords followed by any number of values
+function(CMAKE_PARSE_ARGUMENTS prefix _optionNames _singleArgNames _multiArgNames)
+  # first set all result variables to empty/FALSE
+  foreach(arg_name ${_singleArgNames} ${_multiArgNames})
+    set(${prefix}_${arg_name})
+  endforeach(arg_name)
+
+  foreach(option ${_optionNames})
+    set(${prefix}_${option} FALSE)
+  endforeach(option)
+
+  set(${prefix}_UNPARSED_ARGUMENTS)
+
+  # insideValues is FALSE, "SINGLE" or "MULTI": whether, and how, the next
+  # non-keyword argument belongs to the keyword stored in currentArgName.
+  set(insideValues FALSE)
+  set(currentArgName)
+
+  # now iterate over all arguments and fill the result variables
+  foreach(currentArg ${ARGN})
+    list(FIND _optionNames "${currentArg}" optionIndex)  # ... then this marks the end of the arguments belonging to this keyword
+    list(FIND _singleArgNames "${currentArg}" singleArgIndex)  # ... then this marks the end of the arguments belonging to this keyword
+    list(FIND _multiArgNames "${currentArg}" multiArgIndex)  # ... then this marks the end of the arguments belonging to this keyword
+
+    # Not a keyword: a value for the current keyword, or an unparsed argument.
+    if(${optionIndex} EQUAL -1  AND  ${singleArgIndex} EQUAL -1  AND  ${multiArgIndex} EQUAL -1)
+      if(insideValues)
+        if("${insideValues}" STREQUAL "SINGLE")
+          # A single-value keyword takes one value only; stop collecting.
+          set(${prefix}_${currentArgName} ${currentArg})
+          set(insideValues FALSE)
+        elseif("${insideValues}" STREQUAL "MULTI")
+          list(APPEND ${prefix}_${currentArgName} ${currentArg})
+        endif()
+      else(insideValues)
+        list(APPEND ${prefix}_UNPARSED_ARGUMENTS ${currentArg})
+      endif(insideValues)
+    else()
+      # A keyword: it ends the previous keyword's values and starts its own.
+      if(NOT ${optionIndex} EQUAL -1)
+        set(${prefix}_${currentArg} TRUE)
+        set(insideValues FALSE)
+      elseif(NOT ${singleArgIndex} EQUAL -1)
+        set(currentArgName ${currentArg})
+        set(${prefix}_${currentArgName})
+        set(insideValues "SINGLE")
+      elseif(NOT ${multiArgIndex} EQUAL -1)
+        set(currentArgName ${currentArg})
+        set(${prefix}_${currentArgName})
+        set(insideValues "MULTI")
+      endif()
+    endif()
+
+  endforeach(currentArg)
+
+  # propagate the result variables to the caller:
+  foreach(arg_name ${_singleArgNames} ${_multiArgNames} ${_optionNames})
+    set(${prefix}_${arg_name}  ${${prefix}_${arg_name}} PARENT_SCOPE)
+  endforeach(arg_name)
+  set(${prefix}_UNPARSED_ARGUMENTS ${${prefix}_UNPARSED_ARGUMENTS} PARENT_SCOPE)
+
+endfunction(CMAKE_PARSE_ARGUMENTS _options _singleArgs _multiArgs)
diff --git a/src/algorithms/libs/volk_gnsssdr/cmake/FindORC.cmake b/src/algorithms/libs/volk_gnsssdr/cmake/FindORC.cmake
new file mode 100644
index 000000000..f21513f72
--- /dev/null
+++ b/src/algorithms/libs/volk_gnsssdr/cmake/FindORC.cmake
@@ -0,0 +1,36 @@
+FIND_PACKAGE(PkgConfig)
+PKG_CHECK_MODULES(PC_ORC "orc-0.4 > 0.4.11")
+
+# Locate the ORC compiler (orcc), the ORC headers and the orc-0.4 library.
+# PC_ORC_* values are used as hints; ORC_ROOT and the install prefix are
+# searched as well. Sets ORC_INCLUDE_DIRS, ORC_LIBRARIES and ORC_LIBRARY_DIRS.
+FIND_PROGRAM(ORCC_EXECUTABLE orcc
+             HINTS ${PC_ORC_TOOLSDIR}
+             PATHS ${ORC_ROOT}/bin ${CMAKE_INSTALL_PREFIX}/bin)
+
+FIND_PATH(ORC_INCLUDE_DIR NAMES orc/orc.h
+          HINTS ${PC_ORC_INCLUDEDIR}
+          PATHS ${ORC_ROOT}/include/orc-0.4 ${CMAKE_INSTALL_PREFIX}/include/orc-0.4)
+
+
+FIND_PATH(ORC_LIBRARY_DIR NAMES ${CMAKE_SHARED_LIBRARY_PREFIX}orc-0.4${CMAKE_SHARED_LIBRARY_SUFFIX}
+          HINTS ${PC_ORC_LIBDIR}
+          PATHS ${ORC_ROOT}/lib${LIB_SUFFIX} ${CMAKE_INSTALL_PREFIX}/lib${LIB_SUFFIX})
+
+FIND_LIBRARY(ORC_LIB orc-0.4
+             HINTS ${PC_ORC_LIBRARY_DIRS}
+             PATHS ${ORC_ROOT}/lib${LIB_SUFFIX} ${CMAKE_INSTALL_PREFIX}/lib${LIB_SUFFIX})
+
+LIST(APPEND ORC_LIBRARY
+     ${ORC_LIB}
+)
+
+
+SET(ORC_INCLUDE_DIRS ${ORC_INCLUDE_DIR})
+SET(ORC_LIBRARIES ${ORC_LIBRARY})
+SET(ORC_LIBRARY_DIRS ${ORC_LIBRARY_DIR})
+
+INCLUDE(FindPackageHandleStandardArgs)
+FIND_PACKAGE_HANDLE_STANDARD_ARGS(ORC "orc files" ORC_LIBRARY ORC_INCLUDE_DIR ORCC_EXECUTABLE)
+
+mark_as_advanced(ORC_INCLUDE_DIR ORC_LIBRARY ORCC_EXECUTABLE)
diff --git a/src/algorithms/libs/volk_gnsssdr/cmake/GrPython.cmake b/src/algorithms/libs/volk_gnsssdr/cmake/GrPython.cmake
new file mode 100644
index 000000000..b7b561b7b
--- /dev/null
+++ b/src/algorithms/libs/volk_gnsssdr/cmake/GrPython.cmake
@@ -0,0 +1,234 @@
+# Copyright 2010-2011,2013 Free Software Foundation, Inc.
+#
+# This file is part of GNU Radio
+#
+# GNU Radio is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# GNU Radio is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with GNU Radio; see the file COPYING.  If not, write to
+# the Free Software Foundation, Inc., 51 Franklin Street,
+# Boston, MA 02110-1301, USA.
if(DEFINED __INCLUDED_VOLK_PYTHON_CMAKE)
    return()
endif()
set(__INCLUDED_VOLK_PYTHON_CMAKE TRUE)

########################################################################
# Setup the python interpreter:
# This allows the user to specify a specific interpreter,
# or finds the interpreter via the built-in cmake module.
########################################################################
#this allows the user to override PYTHON_EXECUTABLE
if(PYTHON_EXECUTABLE)

    set(PYTHONINTERP_FOUND TRUE)

#otherwise if not set, try to automatically find it
else()

    #use the built-in find script
    find_package(PythonInterp 2)

    #and if that fails use the find program routine
    if(NOT PYTHONINTERP_FOUND)
        find_program(PYTHON_EXECUTABLE NAMES python python2 python2.7 python2.6 python2.5)
        if(PYTHON_EXECUTABLE)
            set(PYTHONINTERP_FOUND TRUE)
        endif()
    endif()

endif()

#make the path to the executable appear in the cmake gui
set(PYTHON_EXECUTABLE ${PYTHON_EXECUTABLE} CACHE FILEPATH "python interpreter")

#make sure we can use -B with python (introduced in 2.6)
if(PYTHON_EXECUTABLE)
    execute_process(
        COMMAND ${PYTHON_EXECUTABLE} -B -c ""
        OUTPUT_QUIET ERROR_QUIET
        RESULT_VARIABLE PYTHON_HAS_DASH_B_RESULT
    )
    if(PYTHON_HAS_DASH_B_RESULT EQUAL 0)
        set(PYTHON_DASH_B "-B")
    endif()
endif()

########################################################################
# Check for the existence of a python module:
# - desc a string description of the check
# - mod the name of the module to import
# - cmd an additional command to run
# - have the result variable to set
#
# This is a macro on purpose: ${have} is set to TRUE/FALSE directly in
# the caller's scope. The embedded script first tries "import <mod>",
# falls back to evaluating <mod> as an expression, then asserts <cmd>;
# any failure makes the interpreter exit non-zero.
########################################################################
macro(VOLK_PYTHON_CHECK_MODULE desc mod cmd have)
    message(STATUS "")
    message(STATUS "Python checking for ${desc}")
    execute_process(
        COMMAND ${PYTHON_EXECUTABLE} -c "
#########################################
try: import ${mod}
except:
    try: ${mod}
    except: exit(-1)
try: assert ${cmd}
except: exit(-1)
#########################################"
        RESULT_VARIABLE ${have}
    )
    if(${have} EQUAL 0)
        message(STATUS "Python checking for ${desc} - found")
        set(${have} TRUE)
    else()
        message(STATUS "Python checking for ${desc} - not found")
        set(${have} FALSE)
    endif()
endmacro()

########################################################################
# Sets the python installation directory VOLK_PYTHON_DIR
########################################################################
# print(...) with a single argument is valid on both Python 2 and 3.
# NOTE(review): distutils was removed from the standard library in
# Python 3.12; with such an interpreter VOLK_PYTHON_DIR ends up empty.
execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "
from distutils import sysconfig
print(sysconfig.get_python_lib(plat_specific=True, prefix=''))
" OUTPUT_VARIABLE VOLK_PYTHON_DIR OUTPUT_STRIP_TRAILING_WHITESPACE
)
# Quoted so that an empty probe result does not turn into a
# wrong-number-of-arguments error that aborts the configure step.
file(TO_CMAKE_PATH "${VOLK_PYTHON_DIR}" VOLK_PYTHON_DIR)

########################################################################
# Create an always-built target with a unique name
# Usage: VOLK_UNIQUE_TARGET(<description> <dependencies list>)
#
# The target name is "<desc> <relative binary dir> <5 hex digits>" with
# every non-word character replaced by "_"; the hex digits are an md5
# prefix of the relative directory plus the dependency list.
########################################################################
function(VOLK_UNIQUE_TARGET desc)
    file(RELATIVE_PATH reldir ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR})
    # hashlib requires bytes on Python 3; on Python 2 the str is already
    # a byte string, and for ASCII input encode() returns it unchanged,
    # so generated target names stay the same as before.
    execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import re, hashlib
text = '${reldir}${ARGN}'
try: text = text.encode('utf-8')
except UnicodeError: pass
unique = hashlib.md5(text).hexdigest()[:5]
print(re.sub('\\W', '_', '${desc} ${reldir} ' + unique))"
    OUTPUT_VARIABLE _target OUTPUT_STRIP_TRAILING_WHITESPACE)
    add_custom_target(${_target} ALL DEPENDS ${ARGN})
endfunction()

########################################################################
# Install python sources (also builds and installs byte-compiled python)
#
# Usage:
#   VOLK_PYTHON_INSTALL(FILES <py files...> DESTINATION <dir> [COMPONENT <c>])
#   VOLK_PYTHON_INSTALL(PROGRAMS <scripts...> DESTINATION <dir> [COMPONENT <c>])
# FILES are installed as-is together with generated .pyc/.pyo files;
# PROGRAMS are installed as copies with a shebang line prepended.
########################################################################
function(VOLK_PYTHON_INSTALL)
    include(CMakeParseArgumentsCopy)
    cmake_parse_arguments(VOLK_PYTHON_INSTALL "" "DESTINATION;COMPONENT" "FILES;PROGRAMS" ${ARGN})

    ####################################################################
    if(VOLK_PYTHON_INSTALL_FILES)
    ####################################################################
        install(${ARGN}) #installs regular python files

        #create a list of all generated files
        unset(pysrcfiles)
        unset(pycfiles)
        unset(pyofiles)
        foreach(pyfile ${VOLK_PYTHON_INSTALL_FILES})
            get_filename_component(pyfile ${pyfile} ABSOLUTE)
            list(APPEND pysrcfiles ${pyfile})

            #determine if this file is in the source or binary directory
            #(the shorter relative path identifies the containing tree)
            file(RELATIVE_PATH source_rel_path ${CMAKE_CURRENT_SOURCE_DIR} ${pyfile})
            string(LENGTH "${source_rel_path}" source_rel_path_len)
            file(RELATIVE_PATH binary_rel_path ${CMAKE_CURRENT_BINARY_DIR} ${pyfile})
            string(LENGTH "${binary_rel_path}" binary_rel_path_len)

            #and set the generated path appropriately
            if(${source_rel_path_len} GREATER ${binary_rel_path_len})
                set(pygenfile ${CMAKE_CURRENT_BINARY_DIR}/${binary_rel_path})
            else()
                set(pygenfile ${CMAKE_CURRENT_BINARY_DIR}/${source_rel_path})
            endif()
            list(APPEND pycfiles ${pygenfile}c)
            list(APPEND pyofiles ${pygenfile}o)

            #ensure generation path exists
            get_filename_component(pygen_path ${pygenfile} PATH)
            file(MAKE_DIRECTORY ${pygen_path})

        endforeach()

        #the command to generate the pyc files
        add_custom_command(
            DEPENDS ${pysrcfiles} OUTPUT ${pycfiles}
            COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_BINARY_DIR}/python_compile_helper.py ${pysrcfiles} ${pycfiles}
            VERBATIM
        )

        #the command to generate the pyo files
        add_custom_command(
            DEPENDS ${pysrcfiles} OUTPUT ${pyofiles}
            COMMAND ${PYTHON_EXECUTABLE} -O ${CMAKE_BINARY_DIR}/python_compile_helper.py ${pysrcfiles} ${pyofiles}
            VERBATIM
        )

        #create install rule and add generated files to target list
        set(python_install_gen_targets ${pycfiles} ${pyofiles})
        install(FILES ${python_install_gen_targets}
            DESTINATION ${VOLK_PYTHON_INSTALL_DESTINATION}
            COMPONENT ${VOLK_PYTHON_INSTALL_COMPONENT}
        )


    ####################################################################
    elseif(VOLK_PYTHON_INSTALL_PROGRAMS)
    ####################################################################
        file(TO_NATIVE_PATH ${PYTHON_EXECUTABLE} pyexe_native)

        #the build-host interpreter path is meaningless on the target
        if(CMAKE_CROSSCOMPILING)
            set(pyexe_native "/usr/bin/env python")
        endif()

        foreach(pyfile ${VOLK_PYTHON_INSTALL_PROGRAMS})
            get_filename_component(pyfile_name ${pyfile} NAME)
            get_filename_component(pyfile ${pyfile} ABSOLUTE)
            string(REPLACE "${CMAKE_SOURCE_DIR}" "${CMAKE_BINARY_DIR}" pyexefile "${pyfile}.exe")
            list(APPEND python_install_gen_targets ${pyexefile})

            get_filename_component(pyexefile_path ${pyexefile} PATH)
            file(MAKE_DIRECTORY ${pyexefile_path})

            #write a copy of the script with a shebang line prepended
            add_custom_command(
                OUTPUT ${pyexefile} DEPENDS ${pyfile}
                COMMAND ${PYTHON_EXECUTABLE} -c
                "open('${pyexefile}','w').write('\#!${pyexe_native}\\n'+open('${pyfile}').read())"
                COMMENT "Shebangin ${pyfile_name}"
                VERBATIM
            )

            #on windows, python files need an extension to execute
            get_filename_component(pyfile_ext ${pyfile} EXT)
            if(WIN32 AND NOT pyfile_ext)
                set(pyfile_name "${pyfile_name}.py")
            endif()

            install(PROGRAMS ${pyexefile} RENAME ${pyfile_name}
                DESTINATION ${VOLK_PYTHON_INSTALL_DESTINATION}
                COMPONENT ${VOLK_PYTHON_INSTALL_COMPONENT}
            )
        endforeach()

    endif()

    VOLK_UNIQUE_TARGET("pygen" ${python_install_gen_targets})

endfunction()

########################################################################
# Write the python helper script that generates byte code files
#
# The helper receives N source paths followed by N output paths.
# "//" keeps the split index an integer on Python 3 as well as Python 2.
########################################################################
file(WRITE ${CMAKE_BINARY_DIR}/python_compile_helper.py "
import sys, py_compile
files = sys.argv[1:]
half = len(files) // 2
srcs, gens = files[:half], files[half:]
for src, gen in zip(srcs, gens):
    py_compile.compile(file=src, cfile=gen, doraise=True)
")
# diff --git
# a/src/algorithms/libs/volk_gnsssdr/cmake/VolkBoost.cmake b/src/algorithms/libs/volk_gnsssdr/cmake/VolkBoost.cmake
# new file mode 100644
# index 000000000..318820e10
# --- /dev/null
# +++ b/src/algorithms/libs/volk_gnsssdr/cmake/VolkBoost.cmake
# @@ -0,0 +1,98 @@
# Copyright 2010-2011 Free Software Foundation, Inc.
#
# This file is part of GNU Radio
#
# GNU Radio is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3, or (at your option)
# any later version.
#
# GNU Radio is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with GNU Radio; see the file COPYING.  If not, write to
# the Free Software Foundation, Inc., 51 Franklin Street,
# Boston, MA 02110-1301, USA.

if(DEFINED __INCLUDED_VOLK_BOOST_CMAKE)
    return()
endif()
set(__INCLUDED_VOLK_BOOST_CMAKE TRUE)

########################################################################
# Setup Boost and handle some system specific things
########################################################################

set(BOOST_REQUIRED_COMPONENTS
    filesystem
    system
    unit_test_framework
    program_options
)

if(UNIX AND NOT BOOST_ROOT AND EXISTS "/usr/lib64")
    list(APPEND BOOST_LIBRARYDIR "/usr/lib64") #fedora 64-bit fix
endif()

if(MSVC)
    set(BOOST_REQUIRED_COMPONENTS ${BOOST_REQUIRED_COMPONENTS} chrono)

    if(NOT DEFINED BOOST_ALL_DYN_LINK)
        set(BOOST_ALL_DYN_LINK TRUE)
    endif()
    set(BOOST_ALL_DYN_LINK "${BOOST_ALL_DYN_LINK}" CACHE BOOL "boost enable dynamic linking")
    if(BOOST_ALL_DYN_LINK)
        add_definitions(-DBOOST_ALL_DYN_LINK) #setup boost auto-linking in msvc
    else()
        unset(BOOST_REQUIRED_COMPONENTS) #empty components list for static link
    endif()
endif()

# This list cannot be used to disable specific versions. FindBoost uses
# it internally to learn about Boost releases newer than the ones it
# already knows, so it must be defined BEFORE find_package(Boost) is
# called and extended as newer Boost versions are released.
# To disable Boost versions, see Boost_NOGO_VERSIONS below.
set(Boost_ADDITIONAL_VERSIONS
    "1.35.0" "1.35" "1.36.0" "1.36" "1.37.0" "1.37" "1.38.0" "1.38" "1.39.0" "1.39"
    "1.40.0" "1.40" "1.41.0" "1.41" "1.42.0" "1.42" "1.43.0" "1.43" "1.44.0" "1.44"
    "1.45.0" "1.45" "1.46.0" "1.46" "1.47.0" "1.47" "1.48.0" "1.48" "1.49.0" "1.49"
    "1.50.0" "1.50" "1.51.0" "1.51" "1.52.0" "1.52" "1.53.0" "1.53" "1.54.0" "1.54"
    "1.55.0" "1.55" "1.56.0" "1.56" "1.57.0" "1.57" "1.58.0" "1.58" "1.59.0" "1.59"
    "1.60.0" "1.60" "1.61.0" "1.61" "1.62.0" "1.62" "1.63.0" "1.63" "1.64.0" "1.64"
    "1.65.0" "1.65" "1.66.0" "1.66" "1.67.0" "1.67" "1.68.0" "1.68" "1.69.0" "1.69"
)

find_package(Boost "1.35" COMPONENTS ${BOOST_REQUIRED_COMPONENTS})

# Boost 1.52 disabled, see https://svn.boost.org/trac/boost/ticket/7669
# Similar problems with Boost 1.46 and 1.47.

option(ENABLE_BAD_BOOST "Enable known bad versions of Boost" OFF)
if(ENABLE_BAD_BOOST)
    message(STATUS "Enabling use of known bad versions of Boost.")
endif()

# For any unsuitable Boost version, add the version number below in
# the following format: XXYYZZ
# Where:
#     XX is the major version ('10' for version 1)
#     YY is the minor version number ('46' for 1.46)
#     ZZ is the patch version number (typically just '00')
set(Boost_NOGO_VERSIONS
    104600 104601 104700 105200
)

# Numeric XXYYZZ version used for the comparison. Newer FindBoost
# releases may report Boost_VERSION as "x.y.z" and provide the numeric
# form in Boost_VERSION_MACRO instead; prefer that one when it exists.
if(DEFINED Boost_VERSION_MACRO)
    set(volk_boost_version_num "${Boost_VERSION_MACRO}")
else()
    set(volk_boost_version_num "${Boost_VERSION}")
endif()

foreach(ver ${Boost_NOGO_VERSIONS})
    # Both sides are quoted: when Boost was not found the version is empty,
    # and an unquoted expansion would be a hard configure error.
    if("${volk_boost_version_num}" EQUAL "${ver}")
        if(NOT ENABLE_BAD_BOOST)
            message(STATUS "WARNING: Found a known bad version of Boost (v${Boost_VERSION}). Disabling.")
            set(Boost_FOUND FALSE)
        else()
            message(STATUS "WARNING: Found a known bad version of Boost (v${Boost_VERSION}). Continuing anyway.")
            set(Boost_FOUND TRUE)
        endif()
    endif()
endforeach()

# diff --git a/src/algorithms/libs/volk_gnsssdr/cmake/VolkConfig.cmake b/src/algorithms/libs/volk_gnsssdr/cmake/VolkConfig.cmake
# new file mode 100644
# index 000000000..7d58b1923
# --- /dev/null
# +++ b/src/algorithms/libs/volk_gnsssdr/cmake/VolkConfig.cmake
# @@ -0,0 +1,26 @@

# Locate an installed volk_gnsssdr (headers + library).
# Result variables: VOLK_FOUND, VOLK_INCLUDE_DIRS, VOLK_LIBRARIES.
# The VOLK_DIR environment variable and pkg-config results are used as hints.
include(FindPkgConfig)
pkg_check_modules(PC_VOLK volk_gnsssdr)

find_path(
    VOLK_INCLUDE_DIRS
    NAMES volk_gnsssdr/volk_gnsssdr.h
    HINTS $ENV{VOLK_DIR}/include
        ${PC_VOLK_INCLUDEDIR}
    PATHS /usr/local/include
          /usr/include
)

find_library(
    VOLK_LIBRARIES
    NAMES volk_gnsssdr
    HINTS $ENV{VOLK_DIR}/lib
        ${PC_VOLK_LIBDIR}
    PATHS /usr/local/lib
          /usr/local/lib64
          /usr/lib
          /usr/lib64
)

include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(VOLK DEFAULT_MSG VOLK_LIBRARIES VOLK_INCLUDE_DIRS)
mark_as_advanced(VOLK_LIBRARIES VOLK_INCLUDE_DIRS)

# diff --git a/src/algorithms/libs/volk_gnsssdr/cmake/msvc/config.h b/src/algorithms/libs/volk_gnsssdr/cmake/msvc/config.h
# new file mode 100644
# index 000000000..43792c783
# --- /dev/null
# +++ b/src/algorithms/libs/volk_gnsssdr/cmake/msvc/config.h
# @@ -0,0 +1,58 @@
#ifndef _MSC_VER // [
#error "Use this header only with Microsoft Visual C++ compilers!"
+#endif // _MSC_VER ] + +#ifndef _MSC_CONFIG_H_ // [ +#define _MSC_CONFIG_H_ + +//////////////////////////////////////////////////////////////////////// +// enable inline functions for C code +//////////////////////////////////////////////////////////////////////// +#ifndef __cplusplus +# define inline __inline +#endif + +//////////////////////////////////////////////////////////////////////// +// signed size_t +//////////////////////////////////////////////////////////////////////// +#include +typedef ptrdiff_t ssize_t; + +//////////////////////////////////////////////////////////////////////// +// rint functions +//////////////////////////////////////////////////////////////////////// +#include +static inline long lrint(double x){return (long)(x > 0.0 ? x + 0.5 : x - 0.5);} +static inline long lrintf(float x){return (long)(x > 0.0f ? x + 0.5f : x - 0.5f);} +static inline long long llrint(double x){return (long long)(x > 0.0 ? x + 0.5 : x - 0.5);} +static inline long long llrintf(float x){return (long long)(x > 0.0f ? x + 0.5f : x - 0.5f);} +static inline double rint(double x){return (x > 0.0)? floor(x + 0.5) : ceil(x - 0.5);} +static inline float rintf(float x){return (x > 0.0f)? 
floorf(x + 0.5f) : ceilf(x - 0.5f);} + +//////////////////////////////////////////////////////////////////////// +// math constants +//////////////////////////////////////////////////////////////////////// +#define INFINITY HUGE_VAL + +# define M_E 2.7182818284590452354 /* e */ +# define M_LOG2E 1.4426950408889634074 /* log_2 e */ +# define M_LOG10E 0.43429448190325182765 /* log_10 e */ +# define M_LN2 0.69314718055994530942 /* log_e 2 */ +# define M_LN10 2.30258509299404568402 /* log_e 10 */ +# define M_PI 3.14159265358979323846 /* pi */ +# define M_PI_2 1.57079632679489661923 /* pi/2 */ +# define M_PI_4 0.78539816339744830962 /* pi/4 */ +# define M_1_PI 0.31830988618379067154 /* 1/pi */ +# define M_2_PI 0.63661977236758134308 /* 2/pi */ +# define M_2_SQRTPI 1.12837916709551257390 /* 2/sqrt(pi) */ +# define M_SQRT2 1.41421356237309504880 /* sqrt(2) */ +# define M_SQRT1_2 0.70710678118654752440 /* 1/sqrt(2) */ + +//////////////////////////////////////////////////////////////////////// +// random and srandom +//////////////////////////////////////////////////////////////////////// +#include +static inline long int random (void) { return rand(); } +static inline void srandom (unsigned int seed) { srand(seed); } + +#endif // _MSC_CONFIG_H_ ] diff --git a/src/algorithms/libs/volk_gnsssdr/cmake/msvc/inttypes.h b/src/algorithms/libs/volk_gnsssdr/cmake/msvc/inttypes.h new file mode 100644 index 000000000..0a1b60fc1 --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/cmake/msvc/inttypes.h @@ -0,0 +1,301 @@ +// ISO C9x compliant inttypes.h for Microsoft Visual Studio +// Based on ISO/IEC 9899:TC2 Committee draft (May 6, 2005) WG14/N1124 +// +// Copyright (c) 2006 Alexander Chemeris +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. The name of the author may be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +// EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +/////////////////////////////////////////////////////////////////////////////// + +#ifndef _MSC_VER // [ +#error "Use this header only with Microsoft Visual C++ compilers!" 
+#endif // _MSC_VER ] + +#ifndef _MSC_INTTYPES_H_ // [ +#define _MSC_INTTYPES_H_ + +#if _MSC_VER > 1000 +#pragma once +#endif + +#include + +// 7.8 Format conversion of integer types + +typedef struct { + intmax_t quot; + intmax_t rem; +} imaxdiv_t; + +// 7.8.1 Macros for format specifiers + +// The fprintf macros for signed integers are: +#define PRId8 "d" +#define PRIi8 "i" +#define PRIdLEAST8 "d" +#define PRIiLEAST8 "i" +#define PRIdFAST8 "d" +#define PRIiFAST8 "i" + +#define PRId16 "hd" +#define PRIi16 "hi" +#define PRIdLEAST16 "hd" +#define PRIiLEAST16 "hi" +#define PRIdFAST16 "hd" +#define PRIiFAST16 "hi" + +#define PRId32 "I32d" +#define PRIi32 "I32i" +#define PRIdLEAST32 "I32d" +#define PRIiLEAST32 "I32i" +#define PRIdFAST32 "I32d" +#define PRIiFAST32 "I32i" + +#define PRId64 "I64d" +#define PRIi64 "I64i" +#define PRIdLEAST64 "I64d" +#define PRIiLEAST64 "I64i" +#define PRIdFAST64 "I64d" +#define PRIiFAST64 "I64i" + +#define PRIdMAX "I64d" +#define PRIiMAX "I64i" + +#define PRIdPTR "Id" +#define PRIiPTR "Ii" + +// The fprintf macros for unsigned integers are: +#define PRIo8 "o" +#define PRIu8 "u" +#define PRIx8 "x" +#define PRIX8 "X" +#define PRIoLEAST8 "o" +#define PRIuLEAST8 "u" +#define PRIxLEAST8 "x" +#define PRIXLEAST8 "X" +#define PRIoFAST8 "o" +#define PRIuFAST8 "u" +#define PRIxFAST8 "x" +#define PRIXFAST8 "X" + +#define PRIo16 "ho" +#define PRIu16 "hu" +#define PRIx16 "hx" +#define PRIX16 "hX" +#define PRIoLEAST16 "ho" +#define PRIuLEAST16 "hu" +#define PRIxLEAST16 "hx" +#define PRIXLEAST16 "hX" +#define PRIoFAST16 "ho" +#define PRIuFAST16 "hu" +#define PRIxFAST16 "hx" +#define PRIXFAST16 "hX" + +#define PRIo32 "I32o" +#define PRIu32 "I32u" +#define PRIx32 "I32x" +#define PRIX32 "I32X" +#define PRIoLEAST32 "I32o" +#define PRIuLEAST32 "I32u" +#define PRIxLEAST32 "I32x" +#define PRIXLEAST32 "I32X" +#define PRIoFAST32 "I32o" +#define PRIuFAST32 "I32u" +#define PRIxFAST32 "I32x" +#define PRIXFAST32 "I32X" + +#define PRIo64 "I64o" +#define PRIu64 "I64u" 
+#define PRIx64 "I64x" +#define PRIX64 "I64X" +#define PRIoLEAST64 "I64o" +#define PRIuLEAST64 "I64u" +#define PRIxLEAST64 "I64x" +#define PRIXLEAST64 "I64X" +#define PRIoFAST64 "I64o" +#define PRIuFAST64 "I64u" +#define PRIxFAST64 "I64x" +#define PRIXFAST64 "I64X" + +#define PRIoMAX "I64o" +#define PRIuMAX "I64u" +#define PRIxMAX "I64x" +#define PRIXMAX "I64X" + +#define PRIoPTR "Io" +#define PRIuPTR "Iu" +#define PRIxPTR "Ix" +#define PRIXPTR "IX" + +// The fscanf macros for signed integers are: +#define SCNd8 "d" +#define SCNi8 "i" +#define SCNdLEAST8 "d" +#define SCNiLEAST8 "i" +#define SCNdFAST8 "d" +#define SCNiFAST8 "i" + +#define SCNd16 "hd" +#define SCNi16 "hi" +#define SCNdLEAST16 "hd" +#define SCNiLEAST16 "hi" +#define SCNdFAST16 "hd" +#define SCNiFAST16 "hi" + +#define SCNd32 "ld" +#define SCNi32 "li" +#define SCNdLEAST32 "ld" +#define SCNiLEAST32 "li" +#define SCNdFAST32 "ld" +#define SCNiFAST32 "li" + +#define SCNd64 "I64d" +#define SCNi64 "I64i" +#define SCNdLEAST64 "I64d" +#define SCNiLEAST64 "I64i" +#define SCNdFAST64 "I64d" +#define SCNiFAST64 "I64i" + +#define SCNdMAX "I64d" +#define SCNiMAX "I64i" + +#ifdef _WIN64 // [ +# define SCNdPTR "I64d" +# define SCNiPTR "I64i" +#else // _WIN64 ][ +# define SCNdPTR "ld" +# define SCNiPTR "li" +#endif // _WIN64 ] + +// The fscanf macros for unsigned integers are: +#define SCNo8 "o" +#define SCNu8 "u" +#define SCNx8 "x" +#define SCNX8 "X" +#define SCNoLEAST8 "o" +#define SCNuLEAST8 "u" +#define SCNxLEAST8 "x" +#define SCNXLEAST8 "X" +#define SCNoFAST8 "o" +#define SCNuFAST8 "u" +#define SCNxFAST8 "x" +#define SCNXFAST8 "X" + +#define SCNo16 "ho" +#define SCNu16 "hu" +#define SCNx16 "hx" +#define SCNX16 "hX" +#define SCNoLEAST16 "ho" +#define SCNuLEAST16 "hu" +#define SCNxLEAST16 "hx" +#define SCNXLEAST16 "hX" +#define SCNoFAST16 "ho" +#define SCNuFAST16 "hu" +#define SCNxFAST16 "hx" +#define SCNXFAST16 "hX" + +#define SCNo32 "lo" +#define SCNu32 "lu" +#define SCNx32 "lx" +#define SCNX32 "lX" +#define 
SCNoLEAST32 "lo" +#define SCNuLEAST32 "lu" +#define SCNxLEAST32 "lx" +#define SCNXLEAST32 "lX" +#define SCNoFAST32 "lo" +#define SCNuFAST32 "lu" +#define SCNxFAST32 "lx" +#define SCNXFAST32 "lX" + +#define SCNo64 "I64o" +#define SCNu64 "I64u" +#define SCNx64 "I64x" +#define SCNX64 "I64X" +#define SCNoLEAST64 "I64o" +#define SCNuLEAST64 "I64u" +#define SCNxLEAST64 "I64x" +#define SCNXLEAST64 "I64X" +#define SCNoFAST64 "I64o" +#define SCNuFAST64 "I64u" +#define SCNxFAST64 "I64x" +#define SCNXFAST64 "I64X" + +#define SCNoMAX "I64o" +#define SCNuMAX "I64u" +#define SCNxMAX "I64x" +#define SCNXMAX "I64X" + +#ifdef _WIN64 // [ +# define SCNoPTR "I64o" +# define SCNuPTR "I64u" +# define SCNxPTR "I64x" +# define SCNXPTR "I64X" +#else // _WIN64 ][ +# define SCNoPTR "lo" +# define SCNuPTR "lu" +# define SCNxPTR "lx" +# define SCNXPTR "lX" +#endif // _WIN64 ] + +// 7.8.2 Functions for greatest-width integer types + +// 7.8.2.1 The imaxabs function +#define imaxabs _abs64 + +// 7.8.2.2 The imaxdiv function + +// This is modified version of div() function from Microsoft's div.c found +// in %MSVC.NET%\crt\src\div.c +#ifdef STATIC_IMAXDIV // [ +static +#else // STATIC_IMAXDIV ][ +_inline +#endif // STATIC_IMAXDIV ] +imaxdiv_t __cdecl imaxdiv(intmax_t numer, intmax_t denom) +{ + imaxdiv_t result; + + result.quot = numer / denom; + result.rem = numer % denom; + + if (numer < 0 && result.rem > 0) { + // did division wrong; must fix up + ++result.quot; + result.rem -= denom; + } + + return result; +} + +// 7.8.2.3 The strtoimax and strtoumax functions +#define strtoimax _strtoi64 +#define strtoumax _strtoui64 + +// 7.8.2.4 The wcstoimax and wcstoumax functions +#define wcstoimax _wcstoi64 +#define wcstoumax _wcstoui64 + + +#endif // _MSC_INTTYPES_H_ ] diff --git a/src/algorithms/libs/volk_gnsssdr/cmake/msvc/stdbool.h b/src/algorithms/libs/volk_gnsssdr/cmake/msvc/stdbool.h new file mode 100644 index 000000000..ca4581d37 --- /dev/null +++ 
b/src/algorithms/libs/volk_gnsssdr/cmake/msvc/stdbool.h @@ -0,0 +1,45 @@ +/* + * Copyright (C) 2005, 2006 Apple Computer, Inc. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public License + * along with this library; see the file COPYING.LIB. If not, write to + * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, + * Boston, MA 02110-1301, USA. + * + */ + +#ifndef STDBOOL_WIN32_H +#define STDBOOL_WIN32_H + +#ifndef _MSC_VER // [ +#error "Use this header only with Microsoft Visual C++ compilers!" +#endif // _MSC_VER ] + +#ifndef __cplusplus + +typedef unsigned char bool; + +#define true 1 +#define false 0 + +#ifndef CASSERT +#define CASSERT(exp, name) typedef int dummy##name [(exp) ? 
1 : -1]; +#endif + +CASSERT(sizeof(bool) == 1, bool_is_one_byte) +CASSERT(true, true_is_true) +CASSERT(!false, false_is_false) + +#endif + +#endif diff --git a/src/algorithms/libs/volk_gnsssdr/cmake/msvc/stdint.h b/src/algorithms/libs/volk_gnsssdr/cmake/msvc/stdint.h new file mode 100644 index 000000000..108bc8982 --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/cmake/msvc/stdint.h @@ -0,0 +1,251 @@ +// ISO C9x compliant stdint.h for Microsoft Visual Studio +// Based on ISO/IEC 9899:TC2 Committee draft (May 6, 2005) WG14/N1124 +// +// Copyright (c) 2006-2008 Alexander Chemeris +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. The name of the author may be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO +// EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +/////////////////////////////////////////////////////////////////////////////// + +#ifndef _MSC_VER // [ +#error "Use this header only with Microsoft Visual C++ compilers!" +#endif // _MSC_VER ] + +#ifndef _MSC_STDINT_H_ // [ +#define _MSC_STDINT_H_ + +#if _MSC_VER > 1000 +#pragma once +#endif + +#include + +// For Visual Studio 6 in C++ mode and for many Visual Studio versions when +// compiling for ARM we should wrap include with 'extern "C++" {}' +// or compiler give many errors like this: +// error C2733: second C linkage of overloaded function 'wmemchr' not allowed +#ifdef __cplusplus +extern "C" { +#endif +# include +#ifdef __cplusplus +} +#endif + +// Define _W64 macros to mark types changing their size, like intptr_t. +#ifndef _W64 +# if !defined(__midl) && (defined(_X86_) || defined(_M_IX86)) && _MSC_VER >= 1300 +# define _W64 __w64 +# else +# define _W64 +# endif +#endif + + +// 7.18.1 Integer types + +// 7.18.1.1 Exact-width integer types + +// Visual Studio 6 and Embedded Visual C++ 4 doesn't +// realize that, e.g. char has the same size as __int8 +// so we give up on __intX for them. 
+#if (_MSC_VER < 1300) + typedef signed char int8_t; + typedef signed short int16_t; + typedef signed int int32_t; + typedef unsigned char uint8_t; + typedef unsigned short uint16_t; + typedef unsigned int uint32_t; +#else + typedef signed __int8 int8_t; + typedef signed __int16 int16_t; + typedef signed __int32 int32_t; + typedef unsigned __int8 uint8_t; + typedef unsigned __int16 uint16_t; + typedef unsigned __int32 uint32_t; +#endif +typedef signed __int64 int64_t; +typedef unsigned __int64 uint64_t; + + +// 7.18.1.2 Minimum-width integer types +typedef int8_t int_least8_t; +typedef int16_t int_least16_t; +typedef int32_t int_least32_t; +typedef int64_t int_least64_t; +typedef uint8_t uint_least8_t; +typedef uint16_t uint_least16_t; +typedef uint32_t uint_least32_t; +typedef uint64_t uint_least64_t; + +// 7.18.1.3 Fastest minimum-width integer types +typedef int8_t int_fast8_t; +typedef int16_t int_fast16_t; +typedef int32_t int_fast32_t; +typedef int64_t int_fast64_t; +typedef uint8_t uint_fast8_t; +typedef uint16_t uint_fast16_t; +typedef uint32_t uint_fast32_t; +typedef uint64_t uint_fast64_t; + +// 7.18.1.4 Integer types capable of holding object pointers +#ifdef _WIN64 // [ + typedef signed __int64 intptr_t; + typedef unsigned __int64 uintptr_t; +#else // _WIN64 ][ + typedef _W64 signed int intptr_t; + typedef _W64 unsigned int uintptr_t; +#endif // _WIN64 ] + +// 7.18.1.5 Greatest-width integer types +typedef int64_t intmax_t; +typedef uint64_t uintmax_t; + + +// 7.18.2 Limits of specified-width integer types + +#if !defined(__cplusplus) || defined(__STDC_LIMIT_MACROS) // [ See footnote 220 at page 257 and footnote 221 at page 259 + +// 7.18.2.1 Limits of exact-width integer types +#define INT8_MIN ((int8_t)_I8_MIN) +#define INT8_MAX _I8_MAX +#define INT16_MIN ((int16_t)_I16_MIN) +#define INT16_MAX _I16_MAX +#define INT32_MIN ((int32_t)_I32_MIN) +#define INT32_MAX _I32_MAX +#define INT64_MIN ((int64_t)_I64_MIN) +#define INT64_MAX _I64_MAX +#define 
UINT8_MAX _UI8_MAX +#define UINT16_MAX _UI16_MAX +#define UINT32_MAX _UI32_MAX +#define UINT64_MAX _UI64_MAX + +// 7.18.2.2 Limits of minimum-width integer types +#define INT_LEAST8_MIN INT8_MIN +#define INT_LEAST8_MAX INT8_MAX +#define INT_LEAST16_MIN INT16_MIN +#define INT_LEAST16_MAX INT16_MAX +#define INT_LEAST32_MIN INT32_MIN +#define INT_LEAST32_MAX INT32_MAX +#define INT_LEAST64_MIN INT64_MIN +#define INT_LEAST64_MAX INT64_MAX +#define UINT_LEAST8_MAX UINT8_MAX +#define UINT_LEAST16_MAX UINT16_MAX +#define UINT_LEAST32_MAX UINT32_MAX +#define UINT_LEAST64_MAX UINT64_MAX + +// 7.18.2.3 Limits of fastest minimum-width integer types +#define INT_FAST8_MIN INT8_MIN +#define INT_FAST8_MAX INT8_MAX +#define INT_FAST16_MIN INT16_MIN +#define INT_FAST16_MAX INT16_MAX +#define INT_FAST32_MIN INT32_MIN +#define INT_FAST32_MAX INT32_MAX +#define INT_FAST64_MIN INT64_MIN +#define INT_FAST64_MAX INT64_MAX +#define UINT_FAST8_MAX UINT8_MAX +#define UINT_FAST16_MAX UINT16_MAX +#define UINT_FAST32_MAX UINT32_MAX +#define UINT_FAST64_MAX UINT64_MAX + +// 7.18.2.4 Limits of integer types capable of holding object pointers +#ifdef _WIN64 // [ +# define INTPTR_MIN INT64_MIN +# define INTPTR_MAX INT64_MAX +# define UINTPTR_MAX UINT64_MAX +#else // _WIN64 ][ +# define INTPTR_MIN INT32_MIN +# define INTPTR_MAX INT32_MAX +# define UINTPTR_MAX UINT32_MAX +#endif // _WIN64 ] + +// 7.18.2.5 Limits of greatest-width integer types +#define INTMAX_MIN INT64_MIN +#define INTMAX_MAX INT64_MAX +#define UINTMAX_MAX UINT64_MAX + +// 7.18.3 Limits of other integer types + +#ifdef _WIN64 // [ +# define PTRDIFF_MIN _I64_MIN +# define PTRDIFF_MAX _I64_MAX +#else // _WIN64 ][ +# define PTRDIFF_MIN _I32_MIN +# define PTRDIFF_MAX _I32_MAX +#endif // _WIN64 ] + +#define SIG_ATOMIC_MIN INT_MIN +#define SIG_ATOMIC_MAX INT_MAX + +#ifndef SIZE_MAX // [ +# ifdef _WIN64 // [ +# define SIZE_MAX _UI64_MAX +# else // _WIN64 ][ +# define SIZE_MAX _UI32_MAX +# endif // _WIN64 ] +#endif // SIZE_MAX ] + +// 
WCHAR_MIN and WCHAR_MAX are also defined in +#ifndef WCHAR_MIN // [ +# define WCHAR_MIN 0 +#endif // WCHAR_MIN ] +#ifndef WCHAR_MAX // [ +# define WCHAR_MAX _UI16_MAX +#endif // WCHAR_MAX ] + +#define WINT_MIN 0 +#define WINT_MAX _UI16_MAX + +#endif // __STDC_LIMIT_MACROS ] + + +// 7.18.4 Limits of other integer types + +#if !defined(__cplusplus) || defined(__STDC_CONSTANT_MACROS) // [ See footnote 224 at page 260 + +// 7.18.4.1 Macros for minimum-width integer constants + +#define INT8_C(val) val##i8 +#define INT16_C(val) val##i16 +#define INT32_C(val) val##i32 +#define INT64_C(val) val##i64 + +#define UINT8_C(val) val##ui8 +#define UINT16_C(val) val##ui16 +#define UINT32_C(val) val##ui32 +#define UINT64_C(val) val##ui64 + +// 7.18.4.2 Macros for greatest-width integer constants +#ifndef INTMAX_C +#define INTMAX_C INT64_C +#endif +#ifndef UINTMAX_C +#define UINTMAX_C UINT64_C +#endif + +#endif // __STDC_CONSTANT_MACROS ] + + +#endif // _MSC_STDINT_H_ ] diff --git a/src/algorithms/libs/volk_gnsssdr/gen/archs.xml b/src/algorithms/libs/volk_gnsssdr/gen/archs.xml new file mode 100644 index 000000000..e570fe5d2 --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/gen/archs.xml @@ -0,0 +1,204 @@ + + + + + + + + -maltivec + 16 + + + + + -mfloat-abi=softfp + + + + -mfloat-abi=hard + + + + -mfpu=neon + -funsafe-math-optimizations + 16 + + + + + -m32 + + + + + 0x80000001 + + + 3 + 0x80000001 + 29 + + -m64 + -m64 + + + + + 3 + 0x80000001 + 31 + + -m3dnow + -m3dnow + 8 + + + + + 3 + 0x80000001 + 5 + + -msse4.2 + -msse4.2 + 16 + + + + + 2 + 0x00000001 + 23 + + -mpopcnt + -mpopcnt + /arch:AVX + + + + + 3 + 0x00000001 + 23 + + -mmmx + -mmmx + /arch:SSE + 8 + + + + + 3 + 0x00000001 + 25 + + -msse + -msse + /arch:SSE + _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); + xmmintrin.h + 16 + + + + + 3 + 0x00000001 + 26 + + -msse2 + -msse2 + /arch:SSE2 + 16 + + + + + + + + + + + + 2 + 0x00000001 + 0 + + -msse3 + -msse3 + /arch:AVX + _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); + 
pmmintrin.h + 16 + + + + + 2 + 0x00000001 + 9 + + -mssse3 + -mssse3 + /arch:AVX + 16 + + + + + 2 + 0x80000001 + 6 + + -msse4a + -msse4a + 16 + + + + + 2 + 0x00000001 + 19 + + -msse4.1 + -msse4.1 + /arch:AVX + 16 + + + + + 2 + 0x00000001 + 20 + + -msse4.2 + -msse4.2 + /arch:AVX + 16 + + + + + 2 + 0x00000001 + 28 + + + + 2 + 0x00000001 + 27 + + + + -mavx + -mavx + /arch:AVX + 32 + + + diff --git a/src/algorithms/libs/volk_gnsssdr/gen/machines.xml b/src/algorithms/libs/volk_gnsssdr/gen/machines.xml new file mode 100644 index 000000000..357bf7519 --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/gen/machines.xml @@ -0,0 +1,55 @@ + + + +generic orc| + + + + + +generic neon softfp|hardfp orc| + + + + +generic 32|64| mmx| sse sse2 orc| + + + +generic 32|64 mmx sse sse2 sse3 orc| + + + +generic 32|64 mmx sse sse2 sse3 ssse3 orc| + + + +generic 32|64 mmx sse sse2 sse3 sse4_a popcount orc| + + + +generic 32|64 mmx sse sse2 sse3 ssse3 sse4_1 orc| + + + +generic 32|64 mmx sse sse2 sse3 ssse3 sse4_1 sse4_2 popcount orc| + + + + +generic 32|64| mmx| sse sse2 sse3 ssse3 sse4_1 sse4_2 popcount avx orc| + + + +generic altivec + + + diff --git a/src/algorithms/libs/volk_gnsssdr/gen/volk_gnsssdr_arch_defs.py b/src/algorithms/libs/volk_gnsssdr/gen/volk_gnsssdr_arch_defs.py new file mode 100644 index 000000000..3c75e1374 --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/gen/volk_gnsssdr_arch_defs.py @@ -0,0 +1,85 @@ +# +# Copyright 2012 Free Software Foundation, Inc. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+
+archs = list()      # every registered arch, in archs.xml order
+arch_dict = dict()  # the same arch objects, keyed by arch.name
+
+class arch_class:  # one <arch> entry: name/environment/include/alignment, per-compiler flags, checks
+    def __init__(self, flags, checks, **kwargs):
+        for key, cast, failval in (
+            ('name', str, None),
+            ('environment', str, None),
+            ('include', str, None),
+            ('alignment', int, 1)
+        ):
+            try: setattr(self, key, cast(kwargs[key]))
+            except: setattr(self, key, failval)  # key missing or not castable -> fall back to the default
+        self.checks = checks
+        assert(self.name)
+        self._flags = flags  # dict: compiler name -> list of flag strings
+
+    def is_supported(self, compiler):
+        if not self._flags.keys(): return True  # an arch that declares no flags is accepted for every compiler
+        return compiler in self._flags.keys()
+
+    def get_flags(self, compiler):
+        try: return self._flags[compiler]
+        except KeyError: return list()
+
+    def __repr__(self): return self.name
+
+def register_arch(**kwargs):  # build an arch_class and index it in archs / arch_dict
+    arch = arch_class(**kwargs)
+    archs.append(arch)
+    arch_dict[arch.name] = arch
+
+########################################################################
+# register the arches
+########################################################################
+#TODO skip the XML and put it here
+from xml.dom import minidom
+import os
+gendir = os.path.dirname(__file__)
+archs_xml = minidom.parse(os.path.join(gendir, 'archs.xml')).getElementsByTagName('arch')
+for arch_xml in archs_xml:
+    kwargs = dict()
+    for attr in arch_xml.attributes.keys():
+        kwargs[attr] = arch_xml.attributes[attr].value
+    for node in arch_xml.childNodes:
+        try:
+            name = node.tagName
+            val = arch_xml.getElementsByTagName(name)[0].firstChild.data
+            kwargs[name] = val
+        except: pass  # best effort: child nodes without a tagName or without text content are skipped
+    checks = list()
+    for check_xml in arch_xml.getElementsByTagName("check"):
+        name = check_xml.attributes["name"].value
+        params = list()
+        for param_xml in check_xml.getElementsByTagName("param"):
+            params.append(param_xml.firstChild.data)
+        checks.append([name, params])
+    flags = dict()
+    for flag_xml in arch_xml.getElementsByTagName("flag"):
+        name = flag_xml.attributes["compiler"].value
+        if name not in flags: flags[name] = list()  # 'in' instead of has_key(): valid on Python 2 and 3
+        flags[name].append(flag_xml.firstChild.data)
+    #force kwargs keys to be of type str, not unicode for py25
+    kwargs = dict((str(k), v) for k, v in kwargs.items())  # items() exists on Python 2 and 3
+    register_arch(flags=flags, checks=checks, **kwargs)
+
+if __name__ == '__main__':
+    print(archs)
diff --git a/src/algorithms/libs/volk_gnsssdr/gen/volk_gnsssdr_compile_utils.py b/src/algorithms/libs/volk_gnsssdr/gen/volk_gnsssdr_compile_utils.py
new file mode 100644
index 000000000..05de9a546
--- /dev/null
+++ b/src/algorithms/libs/volk_gnsssdr/gen/volk_gnsssdr_compile_utils.py
@@ -0,0 +1,58 @@
+#!/usr/bin/env python
+#
+# Copyright 2012 Free Software Foundation, Inc.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+
+import optparse
+import volk_gnsssdr_arch_defs
+import volk_gnsssdr_machine_defs
+
+def do_arch_flags_list(compiler):  # prints "name,flag,flag;name,flag;..." for archs supported by compiler
+    output = list()
+    for arch in volk_gnsssdr_arch_defs.archs:
+        if not arch.is_supported(compiler): continue
+        fields = [arch.name] + arch.get_flags(compiler)
+        output.append(','.join(fields))
+    print(';'.join(output))  # single-argument call form: same output on Python 2 and 3
+
+def do_machines_list(arch_names):  # prints the ';'-joined machines whose archs are all in arch_names
+    output = list()
+    for machine in volk_gnsssdr_machine_defs.machines:
+        machine_arch_set = set(machine.arch_names)
+        if set(arch_names).intersection(machine_arch_set) == machine_arch_set:
+            output.append(machine.name)
+    print(';'.join(output))
+
+def do_machine_flags_list(compiler, machine_name):  # prints the space-joined flags of every arch of one machine
+    output = list()
+    machine = volk_gnsssdr_machine_defs.machine_dict[machine_name]
+    for arch in machine.archs:
+        output.extend(arch.get_flags(compiler))
+    print(' '.join(output))
+
+def main():
+    parser = optparse.OptionParser()
+    parser.add_option('--mode', type='string')
+    parser.add_option('--compiler', type='string')
+    parser.add_option('--archs', type='string')
+    parser.add_option('--machine', type='string')
+    (opts, args) = parser.parse_args()
+
+    if opts.mode == 'arch_flags': return do_arch_flags_list(opts.compiler.lower())
+    if opts.mode == 'machines': return do_machines_list(opts.archs.split(';'))
+    if opts.mode == 'machine_flags': return do_machine_flags_list(opts.compiler.lower(), opts.machine)
+
+if __name__ == '__main__': main()
diff --git a/src/algorithms/libs/volk_gnsssdr/gen/volk_gnsssdr_kernel_defs.py b/src/algorithms/libs/volk_gnsssdr/gen/volk_gnsssdr_kernel_defs.py
new file mode 100644
index 000000000..b3f03f627
--- /dev/null
+++ b/src/algorithms/libs/volk_gnsssdr/gen/volk_gnsssdr_kernel_defs.py
@@ -0,0 +1,209 @@
+#
+# Copyright 2011-2012 Free Software Foundation, Inc.
+#
+# This file is part of GNU Radio
+#
+# GNU Radio is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# GNU Radio is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with GNU Radio; see the file COPYING. If not, write to
+# the Free Software Foundation, Inc., 51 Franklin Street,
+# Boston, MA 02110-1301, USA.
+#
+
+import os
+import re
+import sys
+import glob
+
+########################################################################
+# Strip comments from a c/cpp file.
+# Input is code string, output is code string without comments.
+# http://stackoverflow.com/questions/241327/python-snippet-to-remove-c-and-c-comments
+########################################################################
+def comment_remover(text):
+    def replacer(match):
+        s = match.group(0)
+        if s.startswith('/'):
+            return ""
+        else:
+            return s  # quoted string/char literal: keep untouched
+    pattern = re.compile(
+        r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"',
+        re.DOTALL | re.MULTILINE
+    )
+    return re.sub(pattern, replacer, text)
+
+########################################################################
+# Split code into nested sections according to ifdef preprocessor macros
+########################################################################
+def split_into_nested_ifdef_sections(code):
+    sections = list()
+    section = ''
+    header = 'text'
+    in_section_depth = 0
+    for i, line in enumerate(code.splitlines()):
+        m = re.match(r'^(\s*)#(\s*)(\w+)(.*)$', line)  # raw string: same pattern, no invalid-escape warning
+        line_is = 'normal'
+        if m:
+            p0, p1, fcn, stuff = m.groups()
+            if fcn in ('if', 'ifndef', 'ifdef'): line_is = 'if'
+            if fcn in ('else', 'elif'): line_is = 'else'
+            if fcn in ('endif',): line_is = 'end'
+
+        if line_is == 'if': in_section_depth += 1
+        if line_is == 'end': in_section_depth -= 1
+
+        if in_section_depth == 1 and line_is == 'if':
+            sections.append((header, section))
+            section = ''
+            header = line
+            continue
+
+        if in_section_depth == 1 and line_is == 'else':
+            sections.append((header, section))
+            section = ''
+            header = line
+            continue
+
+        if in_section_depth == 0 and line_is == 'end':
+            sections.append((header, section))
+            section = ''
+            header = 'text'
+            continue
+
+        section += line + '\n'
+
+    sections.append((header, section)) #and pack remainder into sections
+    sections = [sec for sec in sections if sec[1].strip()] #filter empty sections
+
+    #recurse into non-text sections to fill subsections
+    for i, (header, section) in enumerate(sections):
+        if header == 'text': continue
+        sections[i] = (header, split_into_nested_ifdef_sections(section))
+
+    return sections
+
+########################################################################
+# Recursive print of sections to test code above
+########################################################################
+def print_sections(sections, indent = ' '):
+    for header, body in sections:
+        if header == 'text':
+            print(indent + ' ' + ('\n'+indent).join(body.splitlines()))  # same text as py2 "print a, b"
+            continue
+        print(indent.replace(' ', '-') + '> ' + header)
+        print_sections(body, indent + ' ')
+
+########################################################################
+# Flatten a section to just body text
+########################################################################
+def flatten_section_text(sections):
+    output = ''
+    for hdr, bdy in sections:
+        if hdr != 'text': output += flatten_section_text(bdy)
+        else: output += bdy
+    return output
+
+########################################################################
+# Extract kernel info from section, represent as an implementation
+########################################################################
+class impl_class:
+    def __init__(self, kern_name, header, body):
+        #extract LV_HAVE_*
+        self.deps = set(map(str.lower, re.findall(r'LV_HAVE_(\w+)', header)))
+        #extract function suffix and args
+        body = flatten_section_text(body)
+        try:
+            fcn_matcher = re.compile('^.*(%s\\w*)\\s*\\((.*)$'%kern_name, re.DOTALL | re.MULTILINE)
+            body = body.split('{')[0].rsplit(')', 1)[0] #get the part before the open ){ bracket
+            m = fcn_matcher.match(body)
+            impl_name, the_rest = m.groups()
+            self.name = impl_name.replace(kern_name+'_', '')
+            self.args = list()
+            fcn_args = the_rest.split(',')
+            for fcn_arg in fcn_args:
+                arg_matcher = re.compile(r'^\s*(.*\W)\s*(\w+)\s*$', re.DOTALL | re.MULTILINE)
+                m = arg_matcher.match(fcn_arg)
+                arg_type, arg_name = m.groups()
+                self.args.append((arg_type, arg_name))
+        except Exception as ex:
+            raise Exception('I cant parse the function prototype from: %s in %s\n%s'%(kern_name, body, ex))
+
+        assert self.name
+        self.is_aligned = self.name.startswith('a_')
+
+    def __repr__(self):
+        return self.name
+
+########################################################################
+# Get sets of LV_HAVE_* from the code
+########################################################################
+def extract_lv_haves(code):
+    haves = list()
+    for line in code.splitlines():
+        if not line.strip().startswith('#'): continue
+        have_set = set(map(str.lower, re.findall(r'LV_HAVE_(\w+)', line)))
+        if have_set: haves.append(have_set)
+    return haves
+
+########################################################################
+# Represent a processing kernel, parse from file
+########################################################################
+class kernel_class:
+    def __init__(self, kernel_file):
+        self.name = os.path.splitext(os.path.basename(kernel_file))[0]
+        self.pname = self.name.replace('volk_gnsssdr_', 'p_')
+        with open(kernel_file, 'r') as kernel_src: code = kernel_src.read()  # close the handle deterministically
+        code = comment_remover(code)
+        sections = split_into_nested_ifdef_sections(code)
+        self._impls = list()
+        for header, section in sections:
+            if 'ifndef' not in header.lower(): continue
+            for sub_hdr, body in section:
+                if 'if' not in sub_hdr.lower(): continue
+                if 'LV_HAVE_' not in sub_hdr: continue
+                self._impls.append(impl_class(
+                    kern_name=self.name, header=sub_hdr, body=body,
+                ))
+        assert(self._impls)
+        self.has_dispatcher = False
+        for impl in self._impls:
+            if impl.name == 'dispatcher':
+                self._impls.remove(impl)
+                self.has_dispatcher = True
+                break  # list was mutated: stop iterating immediately
+        self.args = self._impls[0].args
+        self.arglist_types = ', '.join([a[0] for a in self.args])
+        self.arglist_full = ', '.join(['%s %s'%a for a in self.args])
+        self.arglist_names = ', '.join([a[1] for a in self.args])
+
+    def get_impls(self, archs):
+        archs = set(archs)
+        impls = list()
+        for impl in self._impls:
+            if impl.deps.intersection(archs) == impl.deps:
+                impls.append(impl)
+        return impls
+
+    def __repr__(self):
+        return self.name
+
+########################################################################
+# Extract information from the VOLK kernels
+########################################################################
+__file__ = os.path.abspath(__file__)
+srcdir = os.path.dirname(os.path.dirname(__file__))
+kernel_files = glob.glob(os.path.join(srcdir, "kernels", "volk_gnsssdr", "*.h"))
+kernels = list(map(kernel_class, kernel_files))  # real list: on Python 3 map() is a one-shot iterator
+
+if __name__ == '__main__':
+    print(kernels)
diff --git a/src/algorithms/libs/volk_gnsssdr/gen/volk_gnsssdr_machine_defs.py b/src/algorithms/libs/volk_gnsssdr/gen/volk_gnsssdr_machine_defs.py
new file mode 100644
index 000000000..174106634
--- /dev/null
+++ b/src/algorithms/libs/volk_gnsssdr/gen/volk_gnsssdr_machine_defs.py
@@ -0,0 +1,74 @@
+#
+# Copyright 2012 Free Software Foundation, Inc.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+
+from volk_gnsssdr_arch_defs import arch_dict
+
+machines = list()      # every registered machine, in registration order
+machine_dict = dict()  # the same machine objects, keyed by machine.name
+
+class machine_class:  # a named set of archs; alignment is the largest alignment among them
+    def __init__(self, name, archs):
+        self.name = name
+        self.archs = list()
+        self.arch_names = list()
+        for arch_name in archs:
+            if not arch_name: continue
+            arch = arch_dict[arch_name]
+            self.archs.append(arch)
+            self.arch_names.append(arch_name)
+        self.alignment = max(map(lambda a: a.alignment, self.archs))
+
+    def __repr__(self): return self.name
+
+def register_machine(name, archs):  # expands 'a|b|' alternatives recursively, then registers one machine
+    for i, arch_name in enumerate(archs):
+        if '|' in arch_name: #handle special arch names with the '|'
+            for arch_sub in arch_name.split('|'):
+                if arch_sub:
+                    register_machine(name+'_'+arch_sub, archs[:i] + [arch_sub] + archs[i+1:])
+                else:
+                    register_machine(name, archs[:i] + archs[i+1:])  # empty alternative: variant without this arch
+            return
+    machine = machine_class(name=name, archs=archs)
+    machines.append(machine)
+    machine_dict[machine.name] = machine
+
+########################################################################
+# register the machines
+########################################################################
+#TODO skip the XML and put it here
+from xml.dom import minidom
+import os
+gendir = os.path.dirname(__file__)
+machines_xml = minidom.parse(os.path.join(gendir, 'machines.xml')).getElementsByTagName('machine')
+for machine_xml in machines_xml:
+    kwargs = dict()
+    for attr in machine_xml.attributes.keys():
+        kwargs[attr] = machine_xml.attributes[attr].value
+    for node in machine_xml.childNodes:
+        try:
+            name = node.tagName
+            val = machine_xml.getElementsByTagName(name)[0].firstChild.data
+            kwargs[name] = val
+        except: pass  # best effort: child nodes without a tagName or without text content are skipped
+    kwargs['archs'] = kwargs['archs'].split()
+    #force kwargs keys to be of type str, not unicode for py25
+    kwargs = dict((str(k), v) for k, v in kwargs.items())  # items() exists on Python 2 and 3
+    register_machine(**kwargs)
+
+if __name__ == '__main__':
+    print(machines)
diff --git a/src/algorithms/libs/volk_gnsssdr/gen/volk_gnsssdr_tmpl_utils.py b/src/algorithms/libs/volk_gnsssdr/gen/volk_gnsssdr_tmpl_utils.py
new file mode 100644
index 000000000..c4577af62
--- /dev/null
+++ b/src/algorithms/libs/volk_gnsssdr/gen/volk_gnsssdr_tmpl_utils.py
@@ -0,0 +1,74 @@
+#!/usr/bin/env python
+#
+# Copyright 2012 Free Software Foundation, Inc.
+#
+# This file is part of GNU Radio
+#
+# GNU Radio is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# GNU Radio is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with GNU Radio; see the file COPYING. If not, write to
+# the Free Software Foundation, Inc., 51 Franklin Street,
+# Boston, MA 02110-1301, USA.
+#
+
+import os
+import re
+import sys
+import optparse
+import volk_gnsssdr_arch_defs
+import volk_gnsssdr_machine_defs
+import volk_gnsssdr_kernel_defs
+from Cheetah import Template
+
+def __escape_pre_processor(code):  # backslash-escape C preprocessor '#' lines so Cheetah leaves them alone
+    out = list()
+    for line in code.splitlines():
+        m = re.match(r'^(\s*)#(\s*)(\w+)(.*)$', line)  # raw string: same pattern, no invalid-escape warning
+        if m:
+            p0, p1, fcn, stuff = m.groups()
+            conly = fcn in ('include', 'define', 'ifdef', 'ifndef', 'endif', 'elif', 'pragma')
+            both = fcn in ('if', 'else')
+            istmpl = '$' in stuff
+            if 'defined' in stuff: istmpl = False
+            if conly or (both and not istmpl):
+                line = '%s\\#%s%s%s'%(p0, p1, fcn, stuff)
+        out.append(line)
+    return '\n'.join(out)
+
+def __parse_tmpl(_tmpl, **kwargs):  # render a template string with arch/machine/kernel tables in scope
+    defs = {
+        'archs': volk_gnsssdr_arch_defs.archs,
+        'arch_dict': volk_gnsssdr_arch_defs.arch_dict,
+        'machines': volk_gnsssdr_machine_defs.machines,
+        'machine_dict': volk_gnsssdr_machine_defs.machine_dict,
+        'kernels': volk_gnsssdr_kernel_defs.kernels,
+    }
+    defs.update(kwargs)
+    _tmpl = __escape_pre_processor(_tmpl)
+    _tmpl = """
+
+/* this file was generated by volk_gnsssdr template utils, do not edit! */
+
+""" + _tmpl
+    return str(Template.Template(_tmpl, defs))
+
+def main():
+    parser = optparse.OptionParser()
+    parser.add_option('--input', type='string')
+    parser.add_option('--output', type='string')
+    (opts, args) = parser.parse_args()
+
+    output = __parse_tmpl(open(opts.input).read(), args=args)
+    if opts.output: open(opts.output, 'w').write(output)
+    else: print(output)  # no --output given: emit to stdout (call form works on Python 2 and 3)
+
+if __name__ == '__main__': main()
diff --git a/src/algorithms/libs/volk_gnsssdr/include/volk_gnsssdr/constants.h b/src/algorithms/libs/volk_gnsssdr/include/volk_gnsssdr/constants.h
new file mode 100644
index 000000000..f08960557
--- /dev/null
+++ b/src/algorithms/libs/volk_gnsssdr/include/volk_gnsssdr/constants.h
@@ -0,0 +1,39 @@
+/* -*- c++ -*- */
+/*
+ * Copyright 2006,2009,2013 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#ifndef INCLUDED_VOLK_CONSTANTS_H
+#define INCLUDED_VOLK_CONSTANTS_H
+
+#include <volk_gnsssdr/volk_gnsssdr_common.h> /* VOLK_API, __VOLK_DECL_BEGIN/END (operand restored - TODO confirm) */
+
+__VOLK_DECL_BEGIN
+
+VOLK_API char* volk_gnsssdr_prefix();
+VOLK_API char* volk_gnsssdr_build_date();
+VOLK_API char* volk_gnsssdr_version();
+VOLK_API char* volk_gnsssdr_c_compiler();
+VOLK_API char* volk_gnsssdr_compiler_flags();
+VOLK_API char* volk_gnsssdr_available_machines();
+
+__VOLK_DECL_END
+
+#endif /* INCLUDED_VOLK_CONSTANTS_H */
diff --git a/src/algorithms/libs/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_common.h b/src/algorithms/libs/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_common.h
new file mode 100644
index 000000000..c48057cd9
--- /dev/null
+++ b/src/algorithms/libs/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_common.h
@@ -0,0 +1,96 @@
+#ifndef INCLUDED_LIBVOLK_COMMON_H
+#define INCLUDED_LIBVOLK_COMMON_H
+
+////////////////////////////////////////////////////////////////////////
+// Cross-platform attribute macros
+////////////////////////////////////////////////////////////////////////
+#if defined __GNUC__
+#  define __VOLK_ATTR_ALIGNED(x) __attribute__((aligned(x)))
+#  define __VOLK_ATTR_UNUSED __attribute__((unused))
+#  define __VOLK_ATTR_INLINE __attribute__((always_inline))
+#  define __VOLK_ATTR_DEPRECATED __attribute__((deprecated))
+#  if __GNUC__ >= 4
+#    define __VOLK_ATTR_EXPORT __attribute__((visibility("default")))
+#    define __VOLK_ATTR_IMPORT __attribute__((visibility("default")))
+#  else
+#    define __VOLK_ATTR_EXPORT
+#    define __VOLK_ATTR_IMPORT
+#  endif
+#elif _MSC_VER
+#  define __VOLK_ATTR_ALIGNED(x) __declspec(align(x))
+#  define __VOLK_ATTR_UNUSED
+#  define __VOLK_ATTR_INLINE __forceinline
+#  define __VOLK_ATTR_DEPRECATED __declspec(deprecated)
+#  define __VOLK_ATTR_EXPORT __declspec(dllexport)
+#  define __VOLK_ATTR_IMPORT __declspec(dllimport)
+#else
+#  define __VOLK_ATTR_ALIGNED(x)
+#  define __VOLK_ATTR_UNUSED
+#  define __VOLK_ATTR_INLINE
+#  define __VOLK_ATTR_DEPRECATED
+#  define __VOLK_ATTR_EXPORT
+#  define __VOLK_ATTR_IMPORT
+#endif
+
+////////////////////////////////////////////////////////////////////////
+// Ignore annoying warnings in MSVC
+////////////////////////////////////////////////////////////////////////
+#if defined(_MSC_VER)
+#  pragma warning(disable: 4244) //'conversion' conversion from 'type1' to 'type2', possible loss of data
+#  pragma warning(disable: 4305) //'identifier' : truncation from 'type1' to 'type2'
+#endif
+
+////////////////////////////////////////////////////////////////////////
+// C-linkage declaration macros
+// FIXME: due to the usage of complex.h, require gcc for c-linkage
+////////////////////////////////////////////////////////////////////////
+#if defined(__cplusplus) && (__GNUC__)
+#  define __VOLK_DECL_BEGIN extern "C" {
+#  define __VOLK_DECL_END }
+#else
+#  define __VOLK_DECL_BEGIN
+#  define __VOLK_DECL_END
+#endif
+
+////////////////////////////////////////////////////////////////////////
+// Define VOLK_API for library symbols
+// http://gcc.gnu.org/wiki/Visibility
+////////////////////////////////////////////////////////////////////////
+#ifdef volk_gnsssdr_EXPORTS
+#  define VOLK_API __VOLK_ATTR_EXPORT
+#else
+#  define VOLK_API __VOLK_ATTR_IMPORT
+#endif
+
+////////////////////////////////////////////////////////////////////////
+// The bit128 union used by some
+////////////////////////////////////////////////////////////////////////
+#include <inttypes.h> // uint16_t / uint32_t used by bit128 (operand restored - TODO confirm against upstream)
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h> // __m128
+#endif
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h> // __m128i, __m128d
+#endif
+
+union bit128{
+  uint16_t i16[8];
+  uint32_t i[4];
+  float f[4];
+  double d[2];
+
+  #ifdef LV_HAVE_SSE
+  __m128 float_vec;
+  #endif
+
+  #ifdef LV_HAVE_SSE2
+  __m128i int_vec;
+  __m128d double_vec;
+  #endif
+};
+
+#define bit128_p(x) ((union bit128 *)(x))
+
+#endif /*INCLUDED_LIBVOLK_COMMON_H*/
diff --git a/src/algorithms/libs/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_complex.h b/src/algorithms/libs/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_complex.h
new file mode 100644
index 000000000..5bd925044
--- /dev/null
+++ b/src/algorithms/libs/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_complex.h
@@ -0,0 +1,86 @@
+#ifndef INCLUDE_VOLK_COMPLEX_H
+#define INCLUDE_VOLK_COMPLEX_H
+
+/*!
+ * \brief Provide typedefs and operators for all complex types in C and C++.
+ *
+ * The typedefs encompass all signed integer and floating point types.
+ * Each operator function is intended to work across all data types.
+ * Under C++, these operators are defined as inline templates.
+ * Under C, these operators are defined as preprocessor macros.
+ * The use of macros makes the operators agnostic to the type.
+ *
+ * The following operator functions are defined:
+ * - lv_cmake - make a complex type from components
+ * - lv_creal - get the real part of the complex number
+ * - lv_cimag - get the imaginary part of the complex number
+ * - lv_conj - take the conjugate of the complex number
+ */
+
+#ifdef __cplusplus
+
+#include <complex>
+#include <stdint.h>
+
+typedef std::complex<int8_t> lv_8sc_t;
+typedef std::complex<int16_t> lv_16sc_t;
+typedef std::complex<int32_t> lv_32sc_t;
+typedef std::complex<int64_t> lv_64sc_t;
+typedef std::complex<float> lv_32fc_t;
+typedef std::complex<double> lv_64fc_t;
+
+template <typename T> inline std::complex<T> lv_cmake(const T &r, const T &i){
+    return std::complex<T>(r, i);
+}
+
+template <typename T> inline typename T::value_type lv_creal(const T &x){
+    return x.real();
+}
+
+template <typename T> inline typename T::value_type lv_cimag(const T &x){
+    return x.imag();
+}
+
+template <typename T> inline T lv_conj(const T &x){
+    return std::conj(x);
+}
+
+#else /* __cplusplus */
+
+#include <complex.h>
+
+typedef char complex lv_8sc_t;
+typedef short complex lv_16sc_t;
+typedef long complex lv_32sc_t;
+typedef long long complex lv_64sc_t;
+typedef float complex lv_32fc_t;
+typedef double complex lv_64fc_t;
+
+#define lv_cmake(r, i) ((r) + _Complex_I*(i))
+
+// When GNUC is available, use the complex extensions.
+// The extensions always return the correct value type.
+// http://gcc.gnu.org/onlinedocs/gcc/Complex.html
+#ifdef __GNUC__
+
+#define lv_creal(x) (__real__(x))
+
+#define lv_cimag(x) (__imag__(x))
+
+#define lv_conj(x) (~(x))
+
+// When not available, use the c99 complex function family,
+// which always returns double regardless of the input type.
+#else /* __GNUC__ */
+
+#define lv_creal(x) (creal(x))
+
+#define lv_cimag(x) (cimag(x))
+
+#define lv_conj(x) (conj(x))
+
+#endif /* __GNUC__ */
+
+#endif /* __cplusplus */
+
+#endif /* INCLUDE_VOLK_COMPLEX_H */
diff --git a/src/algorithms/libs/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_malloc.h b/src/algorithms/libs/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_malloc.h
new file mode 100644
index 000000000..7136bc135
--- /dev/null
+++ b/src/algorithms/libs/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_malloc.h
@@ -0,0 +1,66 @@
+/* -*- c -*- */
+/*
+ * Copyright 2014 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+ * GNU Radio is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU Radio is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Radio; see the file COPYING. If not, write to
+ * the Free Software Foundation, Inc., 51 Franklin Street,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#ifndef INCLUDED_VOLK_MALLOC_H
+#define INCLUDED_VOLK_MALLOC_H
+
+#include <stdlib.h>
+#include <volk_gnsssdr/volk_gnsssdr_common.h>
+
+__VOLK_DECL_BEGIN
+
+/*!
+ * \brief Allocate \p size bytes of data aligned to \p alignment.
+ *
+ * \details
+ * Because we don't have a standard method to allocate buffers in
+ * memory that are guaranteed to be on an alignment, VOLK handles this
+ * itself. The volk_gnsssdr_malloc function behaves like malloc in that it
+ * returns a pointer to the allocated memory.
However, it also takes
+ * in an alignment specification, which is usually something like 16 or
+ * 32 to ensure that the aligned memory is located on a particular
+ * byte boundary for use with SIMD.
+ *
+ * Internally, the volk_gnsssdr_malloc first checks if the compiler is C11
+ * compliant and uses the new aligned_alloc method. If not, it checks
+ * if the system is POSIX compliant and uses posix_memalign. If that
+ * fails, volk_gnsssdr_malloc handles the memory allocation and alignment
+ * internally.
+ *
+ * Because of the ways in which volk_gnsssdr_malloc may allocate memory, it is
+ * important to always free volk_gnsssdr_malloc pointers using volk_gnsssdr_free.
+ *
+ * \param size The number of bytes to allocate.
+ * \param alignment The byte alignment of the allocated memory.
+ * \return pointer to aligned memory.
+ */
+VOLK_API void *volk_gnsssdr_malloc(size_t size, size_t alignment);
+
+/*!
+ * \brief Frees memory allocated by volk_gnsssdr_malloc.
+ * \param aptr The aligned pointer allocated by volk_gnsssdr_malloc.
+ */ +VOLK_API void volk_gnsssdr_free(void *aptr); + +__VOLK_DECL_END + +#endif /* INCLUDED_VOLK_MALLOC_H */ diff --git a/src/algorithms/libs/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_prefs.h b/src/algorithms/libs/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_prefs.h new file mode 100644 index 000000000..6e13fc07a --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_prefs.h @@ -0,0 +1,28 @@ +#ifndef INCLUDED_VOLK_PREFS_H +#define INCLUDED_VOLK_PREFS_H + +#include +#include + +__VOLK_DECL_BEGIN + +typedef struct volk_gnsssdr_arch_pref +{ + char name[128]; //name of the kernel + char impl_a[128]; //best aligned impl + char impl_u[128]; //best unaligned impl +} volk_gnsssdr_arch_pref_t; + +//////////////////////////////////////////////////////////////////////// +// get path to volk_gnsssdr_config profiling info +//////////////////////////////////////////////////////////////////////// +VOLK_API void volk_gnsssdr_get_config_path(char *); + +//////////////////////////////////////////////////////////////////////// +// load prefs into global prefs struct +//////////////////////////////////////////////////////////////////////// +VOLK_API size_t volk_gnsssdr_load_preferences(volk_gnsssdr_arch_pref_t **); + +__VOLK_DECL_END + +#endif //INCLUDED_VOLK_PREFS_H diff --git a/src/algorithms/libs/volk_gnsssdr/kernels/CommonMacros/CommonMacros.h b/src/algorithms/libs/volk_gnsssdr/kernels/CommonMacros/CommonMacros.h new file mode 100644 index 000000000..ec9937ef5 --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/kernels/CommonMacros/CommonMacros.h @@ -0,0 +1,174 @@ +/*! + * \file CommonMacros.h + * \brief Common macros used inside the volk protokernels. + * \authors
    + *
  • Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com + *
+ * + * ------------------------------------------------------------------------- + * + * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) + * + * GNSS-SDR is a software defined Global Navigation + * Satellite Systems receiver + * + * This file is part of GNSS-SDR. + * + * GNSS-SDR is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * at your option) any later version. + * + * GNSS-SDR is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNSS-SDR. If not, see . + * + * ------------------------------------------------------------------------- + */ +#ifndef INCLUDED_gnsssdr_CommonMacros_u_H +#define INCLUDED_gnsssdr_CommonMacros_u_H + + #ifdef LV_HAVE_SSE4_1 + /*! 
+ \brief Macros for U_SSE4_1 + */ + + #ifndef CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1 /* de-interleave: real gets the even 16-bit lanes and imag the odd 16-bit lanes of input1/input2, with input1 values in even output lanes and input2 values in odd output lanes (blend mask 85 = 0b01010101) */ + #define CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(input1, input2, real, imag)\ + imag = _mm_srli_si128 (input1, 2);\ + imag = _mm_blend_epi16 (input2, imag, 85);\ + real = _mm_slli_si128 (input2, 2);\ + real = _mm_blend_epi16 (real, input1, 85); + #endif /* CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1 */ + + #ifndef CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1 /* widens the eight int16 lanes of input to int32 (low four, then high four), adds the two halves and converts the sum to float in output_ps; input is shifted in place and so is clobbered */ + #define CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(input, input_i_1, input_i_2, output_i32, output_ps)\ + input_i_1 = _mm_cvtepi16_epi32(input);\ + input = _mm_srli_si128 (input, 8);\ + input_i_2 = _mm_cvtepi16_epi32(input);\ + output_i32 = _mm_add_epi32 (input_i_1, input_i_2);\ + output_ps = _mm_cvtepi32_ps(output_i32); + #endif /* CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1 */ + + #ifndef CM_8IC_CONVERT_AND_ACC_32FC_U_SSE4_1 /* widens the sixteen int8 lanes of input to int32 four at a time, sums the four groups and converts the sum to float in output_ps; input is shifted in place and so is clobbered */ + #define CM_8IC_CONVERT_AND_ACC_32FC_U_SSE4_1(input, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)\ + input_i_1 = _mm_cvtepi8_epi32(input);\ + input = _mm_srli_si128 (input, 4);\ + input_i_2 = _mm_cvtepi8_epi32(input);\ + input = _mm_srli_si128 (input, 4);\ + output_i32_1 = _mm_add_epi32 (input_i_1, input_i_2);\ + input_i_1 = _mm_cvtepi8_epi32(input);\ + input = _mm_srli_si128 (input, 4);\ + input_i_2 = _mm_cvtepi8_epi32(input);\ + input = _mm_srli_si128 (input, 4);\ + output_i32_2 = _mm_add_epi32 (input_i_1, input_i_2);\ + output_i32 = _mm_add_epi32 (output_i32_1, output_i32_2);\ + output_ps = _mm_cvtepi32_ps(output_i32); + #endif /* CM_8IC_CONVERT_AND_ACC_32FC_U_SSE4_1 */ + + #endif /* LV_HAVE_SSE4_1 */ + + #ifdef LV_HAVE_SSE2 + /*! + \brief Macros for U_SSE2 + */ + + #ifdef LV_HAVE_SSSE3 + /*!
+ \brief Macros for U_SSSE3 + */ + + #ifndef CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3 /* builds 16-bit real_output and imag_output from 8-bit operands: _mm_sign_epi8 transfers signs onto y, _mm_maddubs_epi16 multiplies by x_abs and adds adjacent byte products; x_abs, check_sign_sequence and rearrange_sequence must be prepared by the caller */ + #define CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3(y, x, check_sign_sequence, rearrange_sequence, y_aux, x_abs, real_output, imag_output)\ + y_aux = _mm_sign_epi8 (y, x);\ + y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence);\ + real_output = _mm_maddubs_epi16 (x_abs, y_aux);\ + \ + y_aux = _mm_shuffle_epi8 (y, rearrange_sequence);\ + y_aux = _mm_sign_epi8 (y_aux, x);\ + imag_output = _mm_maddubs_epi16 (x_abs, y_aux); + #endif /* CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3 */ + + #endif /* LV_HAVE_SSSE3 */ + + #ifndef CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2 /* lane-wise complex multiply on split real/imag int16 vectors: real_output = realx*realy - imagx*imagy, imag_output = realx*imagy + imagx*realy; only the low 16 bits of each product are kept */ + #define CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output)\ + realx_mult_realy = _mm_mullo_epi16 (realx, realy);\ + imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);\ + realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);\ + imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);\ + real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);\ + imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + #endif /* CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2 */ + + #ifndef CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2 /* real = input AND mult1; imag = (input shifted right by one byte) AND mult1 */ + #define CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(input, mult1, real, imag)\ + imag = _mm_srli_si128 (input, 1);\ + imag = _mm_and_si128 (imag, mult1);\ + real = _mm_and_si128 (input, mult1); + #endif /* CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2 */ + + #ifndef CM_8IC_CONVERT_AND_ACC_32FC_U_SSE2 /* SSE2 widening of int8 to int32 (unpack into the top byte, then arithmetic shift right by 24); output_ps_1 is the float sum for the low eight bytes of input, output_ps_2 for the high eight bytes */ + #define CM_8IC_CONVERT_AND_ACC_32FC_U_SSE2(input, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)\ + input_i_1 = _mm_unpacklo_epi8(_mm_setzero_si128(), input);\ + input_i_2 = _mm_unpacklo_epi16(_mm_setzero_si128(), input_i_1);\ + input_i_1 = _mm_unpackhi_epi16(_mm_setzero_si128(), input_i_1);\ + input_i_1 = _mm_srai_epi32(input_i_1, 24);\ + input_i_2 =
_mm_srai_epi32(input_i_2, 24);\ + output_i32 = _mm_add_epi32(input_i_1, input_i_2);\ + output_ps_1 = _mm_cvtepi32_ps(output_i32);\ + \ + input_i_1 = _mm_unpackhi_epi8(_mm_setzero_si128(), input);\ + input_i_2 = _mm_unpacklo_epi16(_mm_setzero_si128(), input_i_1);\ + input_i_1 = _mm_unpackhi_epi16(_mm_setzero_si128(), input_i_1);\ + input_i_1 = _mm_srai_epi32(input_i_1, 24);\ + input_i_2 = _mm_srai_epi32(input_i_2, 24);\ + output_i32 = _mm_add_epi32(input_i_1, input_i_2);\ + output_ps_2 = _mm_cvtepi32_ps(output_i32); + #endif /* CM_8IC_CONVERT_AND_ACC_32FC_U_SSE2 */ + + #ifndef CM_8IC_CONTROLMINUS128_8IC_U_SSE2 /* adds 1 to every byte lane of y that equals the matching lane of minus128 (the compare yields -1 there, which is then subtracted) */ + #define CM_8IC_CONTROLMINUS128_8IC_U_SSE2(y, minus128, minus128control)\ + minus128control = _mm_cmpeq_epi8 (y, minus128);\ + y = _mm_sub_epi8 (y, minus128control); + #endif /* CM_8IC_CONTROLMINUS128_8IC_U_SSE2 */ + + #endif /* LV_HAVE_SSE2 */ + + #ifdef LV_HAVE_GENERIC + /*! + \brief Macros for U_GENERIC + */ + + #endif /* LV_HAVE_GENERIC */ +#endif /* INCLUDED_gnsssdr_CommonMacros_u_H */ + + +#ifndef INCLUDED_gnsssdr_CommonMacros_a_H +#define INCLUDED_gnsssdr_CommonMacros_a_H + + #ifdef LV_HAVE_SSE4_1 + /*! + \brief Macros for A_SSE4_1 + */ + + #endif /* LV_HAVE_SSE4_1 */ + + #ifdef LV_HAVE_SSE2 + /*! + \brief Macros for A_SSE2 + */ + + #endif /* LV_HAVE_SSE2 */ + + #ifdef LV_HAVE_GENERIC + /*! + \brief Macros for A_GENERIC + */ + + #endif /* LV_HAVE_GENERIC */ +#endif /* INCLUDED_gnsssdr_CommonMacros_a_H */ diff --git a/src/algorithms/libs/volk_gnsssdr/kernels/CommonMacros/CommonMacros_16ic_cw_epl_corr_32fc.h b/src/algorithms/libs/volk_gnsssdr/kernels/CommonMacros/CommonMacros_16ic_cw_epl_corr_32fc.h new file mode 100644 index 000000000..4fa054480 --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/kernels/CommonMacros/CommonMacros_16ic_cw_epl_corr_32fc.h @@ -0,0 +1,76 @@ +/*! + * \file CommonMacros_16ic_cw_corr_32fc.h + * \brief Common macros used inside the 16ic_cw_corr_32fc volk protokernels. + * \authors
    + *
  • Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com + *
+ * + * ------------------------------------------------------------------------- + * + * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) + * + * GNSS-SDR is a software defined Global Navigation + * Satellite Systems receiver + * + * This file is part of GNSS-SDR. + * + * GNSS-SDR is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * at your option) any later version. + * + * GNSS-SDR is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNSS-SDR. If not, see . + * + * ------------------------------------------------------------------------- + */ +#ifndef INCLUDED_gnsssdr_CommonMacros_16ic_cw_corr_32fc_u_H +#define INCLUDED_gnsssdr_CommonMacros_16ic_cw_corr_32fc_u_H +#include "CommonMacros/CommonMacros.h" + + #ifdef LV_HAVE_SSE4_1 + /*! 
+ \brief Macros for U_SSE4_1 + */ + + #ifndef CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1 /* composite: de-interleaves y1/y2 into realy/imagy, complex-multiplies them by the baseband sample, then widens and accumulates the real and imaginary products into real_output_ps and imag_output_ps */ + #define CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)\ + CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy)\ + CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(real_bb_signal_sample, imag_bb_signal_sample, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output)\ + CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(real_output, input_i_1, input_i_2, output_i32, real_output_ps)\ + CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(imag_output, input_i_1, input_i_2, output_i32, imag_output_ps) + #endif /* CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1 */ + + #endif /* LV_HAVE_SSE4_1 */ + + #ifdef LV_HAVE_GENERIC + /*! + \brief Macros for U_GENERIC + */ + + #endif /* LV_HAVE_GENERIC */ +#endif /* INCLUDED_gnsssdr_CommonMacros_16ic_cw_corr_32fc_u_H */ + + +#ifndef INCLUDED_gnsssdr_CommonMacros_16ic_cw_corr_32fc_a_H +#define INCLUDED_gnsssdr_CommonMacros_16ic_cw_corr_32fc_a_H + + #ifdef LV_HAVE_SSE4_1 + /*! + \brief Macros for A_SSE4_1 + */ + + #endif /* LV_HAVE_SSE4_1 */ + + #ifdef LV_HAVE_GENERIC + /*! + \brief Macros for A_GENERIC + */ + + #endif /* LV_HAVE_GENERIC */ +#endif /* INCLUDED_gnsssdr_CommonMacros_16ic_cw_corr_32fc_a_H */ diff --git a/src/algorithms/libs/volk_gnsssdr/kernels/CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h b/src/algorithms/libs/volk_gnsssdr/kernels/CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h new file mode 100644 index 000000000..a8a778057 --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/kernels/CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h @@ -0,0 +1,114 @@ +/*! + * \file CommonMacros_8ic_cw_corr_32fc.h + * \brief Common macros used inside the 8ic_cw_corr_32fc volk protokernels.
+ * \authors
    + *
  • Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com + *
+ * + * ------------------------------------------------------------------------- + * + * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) + * + * GNSS-SDR is a software defined Global Navigation + * Satellite Systems receiver + * + * This file is part of GNSS-SDR. + * + * GNSS-SDR is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * at your option) any later version. + * + * GNSS-SDR is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNSS-SDR. If not, see . + * + * ------------------------------------------------------------------------- + */ +#ifndef INCLUDED_gnsssdr_CommonMacros_8ic_cw_corr_32fc_u_H +#define INCLUDED_gnsssdr_CommonMacros_8ic_cw_corr_32fc_u_H +#include "CommonMacros/CommonMacros.h" + + #ifdef LV_HAVE_SSE4_1 + /*! 
+ \brief Macros for U_SSE4_1 + */ + + #ifndef CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1 /* composite for 8-bit data: splits y, complex-multiplies by the baseband sample, merges real_output and the byte-shifted imag_output into output under mask mult1, then widens and accumulates to output_ps; NOTE: output is not a macro parameter and must exist in the caller's scope */ + #define CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)\ + CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)\ + CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(real_bb_signal_sample, imag_bb_signal_sample, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output)\ + \ + imag_output = _mm_slli_si128 (imag_output, 1);\ + output = _mm_blendv_epi8 (imag_output, real_output, mult1);\ + \ + CM_8IC_CONVERT_AND_ACC_32FC_U_SSE4_1(output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps) + #endif /* CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1 */ + + #ifndef CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1 /* same steps as the UNSAFE variant, but first applies CM_8IC_CONTROLMINUS128_8IC_U_SSE2 to y */ + #define CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)\ + CM_8IC_CONTROLMINUS128_8IC_U_SSE2(y, minus128, minus128control)\ + CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output)\ + CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(real_output, input_i_1, input_i_2, output_i32, real_output_ps)\ + CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(imag_output, input_i_1, input_i_2, output_i32, imag_output_ps) + #endif /* CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1 */ + + #ifndef CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1 /* SSSE3 scalar product of y with bb_signal_sample_aux, followed by widening and accumulation of the real and imaginary parts to float */ + #define CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output,
input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)\ + CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output)\ + CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(real_output, input_i_1, input_i_2, output_i32, real_output_ps)\ + CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(imag_output, input_i_1, input_i_2, output_i32, imag_output_ps) + #endif /* CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1 */ + + #endif /* LV_HAVE_SSE4_1 */ + + #ifdef LV_HAVE_SSE2 + /*! + \brief Macros for U_SSE2 + */ + + #ifndef CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2 /* SSE2 composite: splits y, complex-multiplies by the baseband sample, masks and merges both parts into output, then widens to two float vectors output_ps_1 and output_ps_2; NOTE: output is not a macro parameter and must exist in the caller's scope */ + #define CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)\ + CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)\ + CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(real_bb_signal_sample, imag_bb_signal_sample, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output)\ + \ + real_output = _mm_and_si128 (real_output, mult1);\ + imag_output = _mm_and_si128 (imag_output, mult1);\ + imag_output = _mm_slli_si128 (imag_output, 1);\ + output = _mm_or_si128 (real_output, imag_output);\ + \ + CM_8IC_CONVERT_AND_ACC_32FC_U_SSE2(output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2) + #endif /* CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2 */ + + #endif /* LV_HAVE_SSE2 */ + + #ifdef LV_HAVE_GENERIC + /*! + \brief Macros for U_GENERIC + */ + + #endif /* LV_HAVE_GENERIC */ +#endif /* INCLUDED_gnsssdr_CommonMacros_8ic_cw_corr_32fc_u_H */ + + +#ifndef INCLUDED_gnsssdr_CommonMacros_8ic_cw_corr_32fc_a_H +#define INCLUDED_gnsssdr_CommonMacros_8ic_cw_corr_32fc_a_H + + #ifdef LV_HAVE_SSE4_1 + /*!
+ \brief Macros for A_SSE4_1 + */ + + #endif /* LV_HAVE_SSE4_1 */ + + #ifdef LV_HAVE_GENERIC + /*! + \brief Macros for A_GENERIC + */ + + #endif /* LV_HAVE_GENERIC */ +#endif /* INCLUDED_gnsssdr_CommonMacros_8ic_cw_corr_32fc_a_H */ diff --git a/src/algorithms/libs/volk_gnsssdr/kernels/CommonMacros/README.txt b/src/algorithms/libs/volk_gnsssdr/kernels/CommonMacros/README.txt new file mode 100644 index 000000000..3d610256a --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/kernels/CommonMacros/README.txt @@ -0,0 +1,34 @@ +#################################################################### +Common Macros inside volk_gnsssdr module +#################################################################### + +First of all, sorry for making you need to read this: macros are evil, they can not be debugged, you do not know where the errors come from, syntax is annoying.. BUT this is the only way I found that allows to share one piece of code between various proto-kernels without performance penalties. +Inline functions have been tested, and they introduce a really small time penalty, but it becomes huge because of long loops, with thousands of samples. + +#################################################################### +Syntax +#################################################################### + +In order to allow better understanding of the code I created the macros with an specific syntax. + +1) Inside CommonMacros.h you will find macros for common operations. I will explain the syntax with an example: + +example: CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output) + +First of all, you find the characters “CM”, which means CommonMacros. After that the type and the amount of inputs is placed: “_16IC_X4” (16 bits complex integers, four inputs). 
The syntax for type is the same as the one used with volk protokernels; refer to the GNU Radio documentation for more help. Then comes the name of the macro (“_SCALAR_PRODUCT”), and after that the type and the amount of outputs (“_16IC_X2”). Finally comes the minimum SSE version needed to run (“_U_SSE2”). In the arguments you will find (from left to right) the inputs (four inputs: realx, imagx, realy, imagy), some variables that the macro needs to work (realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy) and finally the outputs (two outputs: real_output, imag_output). +The variables that the macro needs are specified when calling it in order to avoid after-compile problems: if you want to use a macro you will need to declare all the variables it needs beforehand, or you will not be able to compile. + +2) Inside all the other headers, CommonMacros_XXXXXX.h, you will find macros for a specific group of proto-kernels. The syntax is the same as in CommonMacros.h. + +#################################################################### +Workflow +#################################################################### + +In order to use the macros easily, I usually test the code without macros inside a testing proto-kernel, where you are able to test it, debug it and use breakpoints. +When it works I place the code inside a macro and I test it again. + +#################################################################### +Why macros +#################################################################### +1) They are the only way I could find for sharing code between proto-kernels without performance penalty. +2) It is true that they are really difficult to debug, but if you work with them responsibly it is not so hard. Volk_gnsssdr checks the results of all the SSE proto-kernel implementations against the results of the generic implementation, so if your macro is not working you will notice it after profiling it.
\ No newline at end of file diff --git a/src/algorithms/libs/volk_gnsssdr/kernels/README.txt b/src/algorithms/libs/volk_gnsssdr/kernels/README.txt new file mode 100644 index 000000000..69ee93d06 --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/kernels/README.txt @@ -0,0 +1,67 @@ +######################################################################## +# How to create custom kernel dispatchers +######################################################################## +A kernel dispatcher is a kernel implementation that calls other kernel implementations. +By default, a dispatcher is generated by the build system for every kernel such that: + * the best aligned implementation is called when all pointer arguments are aligned, + * and otherwise the best unaligned implementation is called. + +The author of a VOLK kernel may create a custom dispatcher, +to be called in place of the automatically generated one. +A custom dispatcher may be useful to handle head and tail cases, +or to implement different alignment and bounds checking logic.
+ +######################################################################## +# Code for an example dispatcher w/ tail case +######################################################################## +#include + +#ifdef LV_HAVE_DISPATCHER + +static inline void volk_gnsssdr_32f_x2_add_32f_dispatcher(float* cVector, const float* aVector, const float* bVector, unsigned int num_points) +{ + const unsigned int num_points_r = num_points%4; + const unsigned int num_points_x = num_points - num_points_r; + + if (volk_gnsssdr_is_aligned(VOLK_OR_PTR(cVector, VOLK_OR_PTR(aVector, bVector)))) + { + volk_gnsssdr_32f_x2_add_32f_a(cVector, aVector, bVector, num_points_x); + } + else + { + volk_gnsssdr_32f_x2_add_32f_u(cVector, aVector, bVector, num_points_x); + } + + volk_gnsssdr_32f_x2_add_32f_g(cVector+num_points_x, aVector+num_points_x, bVector+num_points_x, num_points_r); +} + +#endif //LV_HAVE_DISPATCHER + +######################################################################## +# Code for an example dispatcher w/ tail case and accumulator +######################################################################## +#include + +#ifdef LV_HAVE_DISPATCHER + +static inline void volk_gnsssdr_32f_x2_dot_prod_32f_dispatcher(float * result, const float * input, const float * taps, unsigned int num_points) +{ + const unsigned int num_points_r = num_points%16; + const unsigned int num_points_x = num_points - num_points_r; + + if (volk_gnsssdr_is_aligned(VOLK_OR_PTR(input, taps))) + { + volk_gnsssdr_32f_x2_dot_prod_32f_a(result, input, taps, num_points_x); + } + else + { + volk_gnsssdr_32f_x2_dot_prod_32f_u(result, input, taps, num_points_x); + } + + float result_tail = 0; + volk_gnsssdr_32f_x2_dot_prod_32f_g(&result_tail, input+num_points_x, taps+num_points_x, num_points_r); + + *result += result_tail; +} + +#endif //LV_HAVE_DISPATCHER diff --git a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16i_s32f_convert_32f.h 
b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16i_s32f_convert_32f.h new file mode 100644 index 000000000..ccb13171c --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16i_s32f_convert_32f.h @@ -0,0 +1,241 @@ +#ifndef INCLUDED_volk_gnsssdr_16i_s32f_convert_32f_u_H +#define INCLUDED_volk_gnsssdr_16i_s32f_convert_32f_u_H + +#include +#include + +#ifdef LV_HAVE_SSE4_1 +#include + + /*! + \brief Converts the input 16 bit integer data into floating point data, and divides each floating point output value by the scalar value (the vectorized loop multiplies by 1.0/scalar, the tail loop divides by scalar) + \param inputVector The 16 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value each output point is divided by + \param num_points The number of data values to be converted + \note Neither buffer needs to be aligned: unaligned load and store intrinsics are used + */ +static inline void volk_gnsssdr_16i_s32f_convert_32f_u_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + + float* outputVectorPtr = outputVector; + __m128 invScalar = _mm_set_ps1(1.0/scalar); + int16_t* inputPtr = (int16_t*)inputVector; + __m128i inputVal; + __m128i inputVal2; + __m128 ret; + + for(;number < eighthPoints; number++){ + + // Load the 8 values + inputVal = _mm_loadu_si128((__m128i*)inputPtr); + + // Shift the input data to the right by 64 bits ( 8 bytes ) + inputVal2 = _mm_srli_si128(inputVal, 8); + + // Convert the lower 4 values into 32 bit words + inputVal = _mm_cvtepi16_epi32(inputVal); + inputVal2 = _mm_cvtepi16_epi32(inputVal2); + + ret = _mm_cvtepi32_ps(inputVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + ret = _mm_cvtepi32_ps(inputVal2); + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + + outputVectorPtr += 4; + + inputPtr += 8; + } + +
number = eighthPoints * 8; + for(; number < num_points; number++){ + outputVector[number] =((float)(inputVector[number])) / scalar; + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#ifdef LV_HAVE_SSE +#include + + /*! + \brief Converts the input 16 bit integer data into floating point data, and divides each floating point output value by the scalar value (the vectorized loop multiplies by 1.0/scalar, the tail loop divides by scalar) + \param inputVector The 16 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value each output point is divided by + \param num_points The number of data values to be converted + \note Output buffer does NOT need to be aligned (it is written with _mm_storeu_ps); the input is read element by element + */ +static inline void volk_gnsssdr_16i_s32f_convert_32f_u_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* outputVectorPtr = outputVector; + __m128 invScalar = _mm_set_ps1(1.0/scalar); + int16_t* inputPtr = (int16_t*)inputVector; + __m128 ret; + + for(;number < quarterPoints; number++){ + ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0])); + + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + + inputPtr += 4; + outputVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (float)(inputVector[number]) / scalar; + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*!
+ \brief Converts the input 16 bit integer data into floating point data, and divides each floating point output value by the scalar value + \param inputVector The 16 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value each output point is divided by + \param num_points The number of data values to be converted + \note Output buffer does NOT need to be properly aligned + */ +static inline void volk_gnsssdr_16i_s32f_convert_32f_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ + float* outputVectorPtr = outputVector; + const int16_t* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_gnsssdr_16i_s32f_convert_32f_u_H */ +#ifndef INCLUDED_volk_gnsssdr_16i_s32f_convert_32f_a_H +#define INCLUDED_volk_gnsssdr_16i_s32f_convert_32f_a_H + +#include +#include + +#ifdef LV_HAVE_SSE4_1 +#include + + /*!
+ \brief Converts the input 16 bit integer data into floating point data, and divides each floating point output value by the scalar value (despite the _a_ name, data is still loaded and stored with the unaligned intrinsics) + \param inputVector The 16 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value each output point is divided by + \param num_points The number of data values to be converted + */ +static inline void volk_gnsssdr_16i_s32f_convert_32f_a_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int eighthPoints = num_points / 8; + + float* outputVectorPtr = outputVector; + __m128 invScalar = _mm_set_ps1(1.0/scalar); + int16_t* inputPtr = (int16_t*)inputVector; + __m128i inputVal; + __m128i inputVal2; + __m128 ret; + + for(;number < eighthPoints; number++){ + + // Load the 8 values + inputVal = _mm_loadu_si128((__m128i*)inputPtr); + + // Shift the input data to the right by 64 bits ( 8 bytes ) + inputVal2 = _mm_srli_si128(inputVal, 8); + + // Convert the lower 4 values into 32 bit words + inputVal = _mm_cvtepi16_epi32(inputVal); + inputVal2 = _mm_cvtepi16_epi32(inputVal2); + + ret = _mm_cvtepi32_ps(inputVal); + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + outputVectorPtr += 4; + + ret = _mm_cvtepi32_ps(inputVal2); + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + + outputVectorPtr += 4; + + inputPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + outputVector[number] =((float)(inputVector[number])) / scalar; + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#ifdef LV_HAVE_SSE +#include + + /*!
+ \brief Converts the input 16 bit integer data into floating point data, and divides each floating point output value by the scalar value (despite the _a_ name, the output is still written with _mm_storeu_ps) + \param inputVector The 16 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value each output point is divided by + \param num_points The number of data values to be converted + */ +static inline void volk_gnsssdr_16i_s32f_convert_32f_a_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* outputVectorPtr = outputVector; + __m128 invScalar = _mm_set_ps1(1.0/scalar); + int16_t* inputPtr = (int16_t*)inputVector; + __m128 ret; + + for(;number < quarterPoints; number++){ + ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0])); + + ret = _mm_mul_ps(ret, invScalar); + _mm_storeu_ps(outputVectorPtr, ret); + + inputPtr += 4; + outputVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + outputVector[number] = (float)(inputVector[number]) / scalar; + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*!
+ \brief Converts the input 16 bit integer data into floating point data, and divides each floating point output value by the scalar value + \param inputVector The 16 bit input data buffer + \param outputVector The floating point output data buffer + \param scalar The value each output point is divided by + \param num_points The number of data values to be converted + */ +static inline void volk_gnsssdr_16i_s32f_convert_32f_a_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ + float* outputVectorPtr = outputVector; + const int16_t* inputVectorPtr = inputVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar; + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_gnsssdr_16i_s32f_convert_32f_a_H */ diff --git a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3.h b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3.h new file mode 100644 index 000000000..95b0b093d --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3.h @@ -0,0 +1,461 @@ +/*! + * \file volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3.h + * \brief Volk protokernel: performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation with 32 bits vectors + * \authors
<ul> + *
<li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com + * </ul>
+ * + * Volk protokernel that performs the carrier wipe-off mixing and the + * Early, Prompt, and Late correlation with 32 bits vectors (16 bits for the + * real part and 16 bits for the imaginary part): + * - The carrier wipe-off is done by multiplying the input signal by the + * carrier (multiplication of 32 bits vectors). It returns the input + * signal in base band (BB) + * - Early values are calculated by multiplying the input signal in BB by the + * early code (multiplication of 32 bits vectors), accumulating the results + * - Prompt values are calculated by multiplying the input signal in BB by the + * prompt code (multiplication of 32 bits vectors), accumulating the results + * - Late values are calculated by multiplying the input signal in BB by the + * late code (multiplication of 32 bits vectors), accumulating the results + * + * ------------------------------------------------------------------------- + * + * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) + * + * GNSS-SDR is a software defined Global Navigation + * Satellite Systems receiver + * + * This file is part of GNSS-SDR. + * + * GNSS-SDR is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * GNSS-SDR is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+ * + * ------------------------------------------------------------------------- + */ + +#ifndef INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_u_H +#define INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_u_H + +#include +#include +#include +#include +#include + +#ifdef LV_HAVE_SSE4_1 +#include "smmintrin.h" +#include "CommonMacros/CommonMacros_16ic_cw_epl_corr_32fc.h" +#include "CommonMacros/CommonMacros.h" + /*! + \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation + \param input The input signal input + \param carrier The carrier signal input + \param E_code Early PRN code replica input + \param P_code Prompt PRN code replica input + \param L_code Late PRN code replica input + \param E_out Early correlation output + \param P_out Prompt correlation output + \param L_out Late correlation output + \param num_points The number of complex values in vectors + */ +static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_u_sse4_1(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points) +{ + const unsigned int sse_iters = num_points / 8; + + __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample; + __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output; + + __m128 real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc; + __m128i input_i_1, input_i_2, output_i32; + __m128 real_output_ps, imag_output_ps; + + float E_out_real = 0; + float E_out_imag = 0; + float P_out_real = 0; + float P_out_imag = 0; + float L_out_real = 0; + float L_out_imag = 0; + + const lv_16sc_t* input_ptr = input; + const lv_16sc_t* carrier_ptr = carrier; + + const lv_16sc_t* E_code_ptr = E_code; + lv_32fc_t* E_out_ptr = E_out; + const lv_16sc_t* 
L_code_ptr = L_code; + lv_32fc_t* L_out_ptr = L_out; + const lv_16sc_t* P_code_ptr = P_code; + lv_32fc_t* P_out_ptr = P_out; + + *E_out_ptr = 0; + *P_out_ptr = 0; + *L_out_ptr = 0; + + mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); + + real_E_code_acc = _mm_setzero_ps(); + imag_E_code_acc = _mm_setzero_ps(); + real_P_code_acc = _mm_setzero_ps(); + imag_P_code_acc = _mm_setzero_ps(); + real_L_code_acc = _mm_setzero_ps(); + imag_L_code_acc = _mm_setzero_ps(); + + if (sse_iters>0) + { + for(int number = 0;number < sse_iters; number++){ + + //Perform the carrier wipe-off + x1 = _mm_lddqu_si128((__m128i*)input_ptr); + input_ptr += 4; + x2 = _mm_lddqu_si128((__m128i*)input_ptr); + + y1 = _mm_lddqu_si128((__m128i*)carrier_ptr); + carrier_ptr += 4; + y2 = _mm_lddqu_si128((__m128i*)carrier_ptr); + + CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(x1, x2, realx, imagx) + CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy) + CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample) + + //Get early values + y1 = _mm_lddqu_si128((__m128i*)E_code_ptr); + E_code_ptr += 4; + y2 = _mm_lddqu_si128((__m128i*)E_code_ptr); + + CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) + + //Adds the float 32 results + real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps); + imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps); + + //Get prompt values + y1 = _mm_lddqu_si128((__m128i*)P_code_ptr); + P_code_ptr += 4; + y2 = _mm_lddqu_si128((__m128i*)P_code_ptr); + + CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, 
imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) + + real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps); + imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps); + + //Get late values + y1 = _mm_lddqu_si128((__m128i*)L_code_ptr); + L_code_ptr += 4; + y2 = _mm_lddqu_si128((__m128i*)L_code_ptr); + + CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) + + real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps); + imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps); + + input_ptr += 4; + carrier_ptr += 4; + E_code_ptr += 4; + P_code_ptr += 4; + L_code_ptr += 4; + } + + __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4]; + + _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store 
the results back into the dot product vector + + for (int i = 0; i<4; ++i) + { + E_out_real += real_E_dotProductVector[i]; + E_out_imag += imag_E_dotProductVector[i]; + P_out_real += real_P_dotProductVector[i]; + P_out_imag += imag_P_dotProductVector[i]; + L_out_real += real_L_dotProductVector[i]; + L_out_imag += imag_L_dotProductVector[i]; + } + *E_out_ptr = lv_cmake(E_out_real, E_out_imag); + *P_out_ptr = lv_cmake(P_out_real, P_out_imag); + *L_out_ptr = lv_cmake(L_out_real, L_out_imag); + } + + lv_16sc_t bb_signal_sample; + for(int i=0; i < num_points%8; ++i) + { + //Perform the carrier wipe-off + bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); + // Now get early, late, and prompt values for each + *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); + *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); + *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); + } + +} +#endif /* LV_HAVE_SSE4_1 */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation + \param input The input signal input + \param carrier The carrier signal input + \param E_code Early PRN code replica input + \param P_code Prompt PRN code replica input + \param L_code Late PRN code replica input + \param E_out Early correlation output + \param P_out Prompt correlation output + \param L_out Late correlation output + \param num_points The number of complex values in vectors + */ +static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_generic(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points) +{ + lv_16sc_t bb_signal_sample; + lv_16sc_t tmp1; + lv_16sc_t tmp2; + lv_16sc_t tmp3; + + bb_signal_sample = lv_cmake(0, 0); + + *E_out = 0; + *P_out = 0; + *L_out = 0; + // perform Early, Prompt and Late correlation + + for(int i=0; i < num_points; ++i) 
+ { + //Perform the carrier wipe-off + bb_signal_sample = input[i] * carrier[i]; + + tmp1 = bb_signal_sample * E_code[i]; + tmp2 = bb_signal_sample * P_code[i]; + tmp3 = bb_signal_sample * L_code[i]; + + // Now get early, late, and prompt values for each + *E_out += (lv_32fc_t)tmp1; + *P_out += (lv_32fc_t)tmp2; + *L_out += (lv_32fc_t)tmp3; + } +} +#endif /* LV_HAVE_GENERIC */ +#endif /* INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_u_H */ + + +#ifndef INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_a_H +#define INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_a_H + +#include +#include +#include +#include +#include + +#ifdef LV_HAVE_SSE4_1 +#include "smmintrin.h" +#include "CommonMacros/CommonMacros_16ic_cw_epl_corr_32fc.h" +#include "CommonMacros/CommonMacros.h" +/*! + \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation + \param input The input signal input + \param carrier The carrier signal input + \param E_code Early PRN code replica input + \param P_code Prompt PRN code replica input + \param L_code Late PRN code replica input + \param E_out Early correlation output + \param P_out Prompt correlation output + \param L_out Late correlation output + \param num_points The number of complex values in vectors + */ +static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_a_sse4_1(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points) +{ + const unsigned int sse_iters = num_points / 8; + + __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample; + __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output; + + __m128 real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc; + __m128i input_i_1, input_i_2, output_i32; 
+ __m128 real_output_ps, imag_output_ps; + + float E_out_real = 0; + float E_out_imag = 0; + float P_out_real = 0; + float P_out_imag = 0; + float L_out_real = 0; + float L_out_imag = 0; + + const lv_16sc_t* input_ptr = input; + const lv_16sc_t* carrier_ptr = carrier; + + const lv_16sc_t* E_code_ptr = E_code; + lv_32fc_t* E_out_ptr = E_out; + const lv_16sc_t* L_code_ptr = L_code; + lv_32fc_t* L_out_ptr = L_out; + const lv_16sc_t* P_code_ptr = P_code; + lv_32fc_t* P_out_ptr = P_out; + + *E_out_ptr = 0; + *P_out_ptr = 0; + *L_out_ptr = 0; + + mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); + + real_E_code_acc = _mm_setzero_ps(); + imag_E_code_acc = _mm_setzero_ps(); + real_P_code_acc = _mm_setzero_ps(); + imag_P_code_acc = _mm_setzero_ps(); + real_L_code_acc = _mm_setzero_ps(); + imag_L_code_acc = _mm_setzero_ps(); + + if (sse_iters>0) + { + for(int number = 0;number < sse_iters; number++){ + + //Perform the carrier wipe-off + x1 = _mm_load_si128((__m128i*)input_ptr); + input_ptr += 4; + x2 = _mm_load_si128((__m128i*)input_ptr); + + y1 = _mm_load_si128((__m128i*)carrier_ptr); + carrier_ptr += 4; + y2 = _mm_load_si128((__m128i*)carrier_ptr); + + CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(x1, x2, realx, imagx) + CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy) + CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample) + + //Get early values + y1 = _mm_load_si128((__m128i*)E_code_ptr); + E_code_ptr += 4; + y2 = _mm_load_si128((__m128i*)E_code_ptr); + + CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) + + //Adds the float 32 results + 
real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps); + imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps); + + //Get prompt values + y1 = _mm_load_si128((__m128i*)P_code_ptr); + P_code_ptr += 4; + y2 = _mm_load_si128((__m128i*)P_code_ptr); + + CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) + + real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps); + imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps); + + //Get late values + y1 = _mm_load_si128((__m128i*)L_code_ptr); + L_code_ptr += 4; + y2 = _mm_load_si128((__m128i*)L_code_ptr); + + CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) + + real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps); + imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps); + + input_ptr += 4; + carrier_ptr += 4; + E_code_ptr += 4; + P_code_ptr += 4; + L_code_ptr += 4; + } + + __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4]; + + _mm_store_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector + _mm_store_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector + _mm_store_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back 
into the dot product vector + _mm_store_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector + _mm_store_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector + _mm_store_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector + + for (int i = 0; i<4; ++i) + { + E_out_real += real_E_dotProductVector[i]; + E_out_imag += imag_E_dotProductVector[i]; + P_out_real += real_P_dotProductVector[i]; + P_out_imag += imag_P_dotProductVector[i]; + L_out_real += real_L_dotProductVector[i]; + L_out_imag += imag_L_dotProductVector[i]; + } + *E_out_ptr = lv_cmake(E_out_real, E_out_imag); + *P_out_ptr = lv_cmake(P_out_real, P_out_imag); + *L_out_ptr = lv_cmake(L_out_real, L_out_imag); + } + + lv_16sc_t bb_signal_sample; + for(int i=0; i < num_points%8; ++i) + { + //Perform the carrier wipe-off + bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); + // Now get early, late, and prompt values for each + *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); + *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); + *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); + } + +} +#endif /* LV_HAVE_SSE4_1 */ + +#ifdef LV_HAVE_GENERIC +/*! 
+ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation + \param input The input signal input + \param carrier The carrier signal input + \param E_code Early PRN code replica input + \param P_code Prompt PRN code replica input + \param L_code Late PRN code replica input + \param E_out Early correlation output + \param P_out Prompt correlation output + \param L_out Late correlation output + \param num_points The number of complex values in vectors + */ +static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_a_generic(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points) +{ + lv_16sc_t bb_signal_sample; + lv_16sc_t tmp1; + lv_16sc_t tmp2; + lv_16sc_t tmp3; + + bb_signal_sample = lv_cmake(0, 0); + + *E_out = 0; + *P_out = 0; + *L_out = 0; + // perform Early, Prompt and Late correlation + + for(int i=0; i < num_points; ++i) + { + //Perform the carrier wipe-off + bb_signal_sample = input[i] * carrier[i]; + + tmp1 = bb_signal_sample * E_code[i]; + tmp2 = bb_signal_sample * P_code[i]; + tmp3 = bb_signal_sample * L_code[i]; + + // Now get early, late, and prompt values for each + *E_out += (lv_32fc_t)tmp1; + *P_out += (lv_32fc_t)tmp2; + *L_out += (lv_32fc_t)tmp3; + } +} +#endif /* LV_HAVE_GENERIC */ +#endif /* INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_a_H */ diff --git a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3.h b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3.h new file mode 100644 index 000000000..34d1fd715 --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3.h @@ -0,0 +1,1568 @@ +/*! 
+ * \file volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3.h + * \brief Volk protokernel: performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation with 32 bits vectors using different methods: inside u_sse4_1_first there is one method, inside u_sse4_1_second there is another... This protokernel has been created to test the performance of different methods. + * \authors
<ul> + *
<li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com + * </ul>
+ * + * Volk protokernel that performs the carrier wipe-off mixing and the + * Early, Prompt, and Late correlation with 32 bits vectors (16 bits for the + * real part and 16 bits for the imaginary part): + * - The carrier wipe-off is done by multiplying the input signal by the + * carrier (multiplication of 32 bits vectors). It returns the input + * signal in base band (BB) + * - Early values are calculated by multiplying the input signal in BB by the + * early code (multiplication of 32 bits vectors), accumulating the results + * - Prompt values are calculated by multiplying the input signal in BB by the + * prompt code (multiplication of 32 bits vectors), accumulating the results + * - Late values are calculated by multiplying the input signal in BB by the + * late code (multiplication of 32 bits vectors), accumulating the results + * + * ------------------------------------------------------------------------- + * + * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) + * + * GNSS-SDR is a software defined Global Navigation + * Satellite Systems receiver + * + * This file is part of GNSS-SDR. + * + * GNSS-SDR is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * GNSS-SDR is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+ * + * ------------------------------------------------------------------------- + */ + +#ifndef INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_u_H +#define INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_u_H + +#include +#include +#include +#include +#include + +#ifdef LV_HAVE_SSE4_1 +#include "smmintrin.h" + /*! + \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation; each output receives one accumulated complex value + \param input The input signal input + \param carrier The carrier signal input + \param E_code Early PRN code replica input + \param P_code Prompt PRN code replica input + \param L_code Late PRN code replica input + \param E_out Early correlation output + \param P_out Prompt correlation output + \param L_out Late correlation output + \param num_points The number of complex values in vectors + */ +static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_u_sse4_1_first(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points) +{ + const unsigned int sse_iters = num_points / 4; + + __m128i x, y, yaux, yl, yh, tmp1, tmp2, z, bb_signal_sample, bb_signal_sample_suffled; + + __m128 z_ps_1, z_ps_2, z_E, z_P, z_L; + __m128i z_i_1, z_i_2; + + lv_32fc_t dotProduct_E = 0; // Start from zero: with num_points < 4 the SIMD block is skipped and the tail loop accumulates straight into these + lv_32fc_t dotProduct_P = 0; + lv_32fc_t dotProduct_L = 0; + + z_E = _mm_setzero_ps(); + z_P = _mm_setzero_ps(); + z_L = _mm_setzero_ps(); + + const lv_16sc_t* _input = input; + const lv_16sc_t* _carrier = carrier; + const lv_16sc_t* _E_code = E_code; + const lv_16sc_t* _P_code = P_code; + const lv_16sc_t* _L_code = L_code; + + if (sse_iters>0) + { + for(int number = 0;number < sse_iters; number++) + { + //Perform the carrier wipe-off + x = _mm_lddqu_si128((__m128i*)_input); // Load the ar + ai, br + bi as ar,ai,br,bi + y = _mm_lddqu_si128((__m128i*)_carrier); // Load the cr + ci, dr + di as cr,ci,dr,di + + // Load yl with cr,cr,dr,dr + 
// Load yh with ci,ci,di,di + yaux = _mm_shuffle_epi8 (y, _mm_set_epi8 (15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0)); + yl = _mm_unpacklo_epi16(yaux, yaux); + yh = _mm_unpackhi_epi16(yaux, yaux); + + tmp1 = _mm_mullo_epi16(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + x = _mm_shuffle_epi8 (x, _mm_set_epi8 (13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2)); // Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm_mullo_epi16(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + tmp2 = _mm_mullo_epi16(tmp2,_mm_set_epi16 (1, -1, 1, -1, 1, -1, 1, -1)); + bb_signal_sample = _mm_add_epi16(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + bb_signal_sample_suffled = _mm_shuffle_epi8 (bb_signal_sample, _mm_set_epi8 (13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2)); // Re-arrange bb_signal_sample to be ai,ar,bi,br + + // correlation E,P,L (3x vector scalar product) + // Early + y = _mm_lddqu_si128((__m128i*)_E_code); // Load the cr + ci, dr + di as cr,ci,dr,di + + yaux = _mm_shuffle_epi8 (y, _mm_set_epi8 (15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0)); + yl = _mm_unpacklo_epi16(yaux, yaux); + yh = _mm_unpackhi_epi16(yaux, yaux); + + tmp1 = _mm_mullo_epi16(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + tmp2 = _mm_mullo_epi16(bb_signal_sample_suffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + tmp2 = _mm_mullo_epi16(tmp2,_mm_set_epi16 (1, -1, 1, -1, 1, -1, 1, -1)); + z = _mm_add_epi16(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + z_i_1 = _mm_cvtepi16_epi32(z); + z_ps_1 = _mm_cvtepi32_ps(z_i_1); + z = _mm_srli_si128 (z, 8); + z_i_2 = _mm_cvtepi16_epi32(z); + z_ps_2 = _mm_cvtepi32_ps(z_i_2); + + z_E = _mm_add_ps(z_E, z_ps_1); // Add the complex multiplication results together + z_E = _mm_add_ps(z_E, z_ps_2); // Add the complex multiplication results together + + // Prompt + y = _mm_lddqu_si128((__m128i*)_P_code); // Load the cr + ci, dr + di as cr,ci,dr,di + + yaux = _mm_shuffle_epi8 (y, _mm_set_epi8 (15, 14, 11, 
10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0)); + yl = _mm_unpacklo_epi16(yaux, yaux); + yh = _mm_unpackhi_epi16(yaux, yaux); + + tmp1 = _mm_mullo_epi16(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + tmp2 = _mm_mullo_epi16(bb_signal_sample_suffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + tmp2 = _mm_mullo_epi16(tmp2,_mm_set_epi16 (1, -1, 1, -1, 1, -1, 1, -1)); + z = _mm_add_epi16(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + z_i_1 = _mm_cvtepi16_epi32(z); + z_ps_1 = _mm_cvtepi32_ps(z_i_1); + z = _mm_srli_si128 (z, 8); + z_i_2 = _mm_cvtepi16_epi32(z); + z_ps_2 = _mm_cvtepi32_ps(z_i_2); + + z_P = _mm_add_ps(z_P, z_ps_1); // Add the complex multiplication results together + z_P = _mm_add_ps(z_P, z_ps_2); // Add the complex multiplication results together + + // Late + y = _mm_lddqu_si128((__m128i*)_L_code); // Load the cr + ci, dr + di as cr,ci,dr,di + + yaux = _mm_shuffle_epi8 (y, _mm_set_epi8 (15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0)); + yl = _mm_unpacklo_epi16(yaux, yaux); + yh = _mm_unpackhi_epi16(yaux, yaux); + + tmp1 = _mm_mullo_epi16(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + tmp2 = _mm_mullo_epi16(bb_signal_sample_suffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + tmp2 = _mm_mullo_epi16(tmp2,_mm_set_epi16 (1, -1, 1, -1, 1, -1, 1, -1)); + z = _mm_add_epi16(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + z_i_1 = _mm_cvtepi16_epi32(z); + z_ps_1 = _mm_cvtepi32_ps(z_i_1); + z = _mm_srli_si128 (z, 8); + z_i_2 = _mm_cvtepi16_epi32(z); + z_ps_2 = _mm_cvtepi32_ps(z_i_2); + + z_L = _mm_add_ps(z_L, z_ps_1); // Add the complex multiplication results together + z_L = _mm_add_ps(z_L, z_ps_2); // Add the complex multiplication results together + + _input += 4; + _carrier += 4; + _E_code += 4; + _L_code += 4; + _P_code += 4; + } + + __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_E[2]; + __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_P[2]; + __VOLK_ATTR_ALIGNED(16) lv_32fc_t 
dotProductVector_L[2]; + + _mm_storeu_ps((float*)dotProductVector_E,z_E); // Store the results back into the dot product vector + _mm_storeu_ps((float*)dotProductVector_P,z_P); // Store the results back into the dot product vector + _mm_storeu_ps((float*)dotProductVector_L,z_L); // Store the results back into the dot product vector + + dotProduct_E = ( dotProductVector_E[0] + dotProductVector_E[1] ); + dotProduct_P = ( dotProductVector_P[0] + dotProductVector_P[1] ); + dotProduct_L = ( dotProductVector_L[0] + dotProductVector_L[1] ); + } + + for(int i=0; i < num_points%4; ++i) + { + dotProduct_E += (lv_32fc_t)((*_input) * (*_E_code++)*(*_carrier)); + dotProduct_P += (lv_32fc_t)((*_input) * (*_P_code++)*(*_carrier)); + dotProduct_L += (lv_32fc_t)((*_input++) * (*_L_code++)*(*_carrier++)); + } + + *E_out = dotProduct_E; + *P_out = dotProduct_P; + *L_out = dotProduct_L; + + + +} +#endif /* LV_HAVE_SSE4_1 */ + +#ifdef LV_HAVE_SSE4_1 +#include "smmintrin.h" +/*! + \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation + \param input The input signal input + \param carrier The carrier signal input + \param E_code Early PRN code replica input + \param P_code Early PRN code replica input + \param L_code Early PRN code replica input + \param E_out Early correlation output + \param P_out Early correlation output + \param L_out Early correlation output + \param num_points The number of complex values in vectors + */ +static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_u_sse4_1_second(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points) +{ + const unsigned int sse_iters = num_points / 8; + + __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample; + __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, 
real_output, imag_output; + + __m128 real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc; + __m128i real_output_i_1, real_output_i_2, imag_output_i_1, imag_output_i_2; + __m128 real_output_ps_1, real_output_ps_2, imag_output_ps_1, imag_output_ps_2; + + float E_out_real = 0; + float E_out_imag = 0; + float P_out_real = 0; + float P_out_imag = 0; + float L_out_real = 0; + float L_out_imag = 0; + + const lv_16sc_t* input_ptr = input; + const lv_16sc_t* carrier_ptr = carrier; + + const lv_16sc_t* E_code_ptr = E_code; + lv_32fc_t* E_out_ptr = E_out; + const lv_16sc_t* L_code_ptr = L_code; + lv_32fc_t* L_out_ptr = L_out; + const lv_16sc_t* P_code_ptr = P_code; + lv_32fc_t* P_out_ptr = P_out; + + *E_out_ptr = 0; + *P_out_ptr = 0; + *L_out_ptr = 0; + + mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); + + real_E_code_acc = _mm_setzero_ps(); + imag_E_code_acc = _mm_setzero_ps(); + real_P_code_acc = _mm_setzero_ps(); + imag_P_code_acc = _mm_setzero_ps(); + real_L_code_acc = _mm_setzero_ps(); + imag_L_code_acc = _mm_setzero_ps(); + + if (sse_iters>0) + { + for(int number = 0;number < sse_iters; number++){ + + //Perform the carrier wipe-off + x1 = _mm_lddqu_si128((__m128i*)input_ptr); + input_ptr += 4; + x2 = _mm_lddqu_si128((__m128i*)input_ptr); + + y1 = _mm_lddqu_si128((__m128i*)carrier_ptr); + carrier_ptr += 4; + y2 = _mm_lddqu_si128((__m128i*)carrier_ptr); + + imagx = _mm_srli_si128 (x1, 2); + imagx = _mm_blend_epi16 (x2, imagx, 85); + realx = _mm_slli_si128 (x2, 2); + realx = _mm_blend_epi16 (realx, x1, 85); + + imagy = _mm_srli_si128 (y1, 2); + imagy = _mm_blend_epi16 (y2, imagy, 85); + realy = _mm_slli_si128 (y2, 2); + realy = _mm_blend_epi16 (realy, y1, 85); + + realx_mult_realy = _mm_mullo_epi16 (realx, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); + realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); + + 
real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + //Get early values + y1 = _mm_lddqu_si128((__m128i*)E_code_ptr); + E_code_ptr += 4; + y2 = _mm_lddqu_si128((__m128i*)E_code_ptr); + + imagy = _mm_srli_si128 (y1, 2); + imagy = _mm_blend_epi16 (y2, imagy, 85); + realy = _mm_slli_si128 (y2, 2); + realy = _mm_blend_epi16 (realy, y1, 85); + + realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); + realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); + + real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + real_output_i_1 = _mm_cvtepi16_epi32(real_output); + real_output_ps_1 = _mm_cvtepi32_ps(real_output_i_1); + real_output = _mm_srli_si128 (real_output, 8); + real_output_i_2 = _mm_cvtepi16_epi32(real_output); + real_output_ps_2 = _mm_cvtepi32_ps(real_output_i_2); + + imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); + imag_output_ps_1 = _mm_cvtepi32_ps(imag_output_i_1); + imag_output = _mm_srli_si128 (imag_output, 8); + imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); + imag_output_ps_2 = _mm_cvtepi32_ps(imag_output_i_2); + + real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps_1); + real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps_2); + imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps_1); + imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps_2); + + //Get prompt values + y1 = _mm_lddqu_si128((__m128i*)P_code_ptr); + P_code_ptr += 4; + y2 = _mm_lddqu_si128((__m128i*)P_code_ptr); + + imagy = _mm_srli_si128 (y1, 2); + imagy = _mm_blend_epi16 (y2, imagy, 85); + realy = _mm_slli_si128 (y2, 2); + realy = _mm_blend_epi16 (realy, y1, 85); + + realx_mult_realy = _mm_mullo_epi16 
(real_bb_signal_sample, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); + realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); + + real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + real_output_i_1 = _mm_cvtepi16_epi32(real_output); + real_output_ps_1 = _mm_cvtepi32_ps(real_output_i_1); + real_output = _mm_srli_si128 (real_output, 8); + real_output_i_2 = _mm_cvtepi16_epi32(real_output); + real_output_ps_2 = _mm_cvtepi32_ps(real_output_i_2); + + imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); + imag_output_ps_1 = _mm_cvtepi32_ps(imag_output_i_1); + imag_output = _mm_srli_si128 (imag_output, 8); + imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); + imag_output_ps_2 = _mm_cvtepi32_ps(imag_output_i_2); + + real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps_1); + real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps_2); + imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps_1); + imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps_2); + + //Get late values + y1 = _mm_lddqu_si128((__m128i*)L_code_ptr); + L_code_ptr += 4; + y2 = _mm_lddqu_si128((__m128i*)L_code_ptr); + + imagy = _mm_srli_si128 (y1, 2); + imagy = _mm_blend_epi16 (y2, imagy, 85); + realy = _mm_slli_si128 (y2, 2); + realy = _mm_blend_epi16 (realy, y1, 85); + + realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); + realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); + + real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + real_output_i_1 = _mm_cvtepi16_epi32(real_output); + real_output_ps_1 = _mm_cvtepi32_ps(real_output_i_1); 
+ real_output = _mm_srli_si128 (real_output, 8); + real_output_i_2 = _mm_cvtepi16_epi32(real_output); + real_output_ps_2 = _mm_cvtepi32_ps(real_output_i_2); + + imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); + imag_output_ps_1 = _mm_cvtepi32_ps(imag_output_i_1); + imag_output = _mm_srli_si128 (imag_output, 8); + imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); + imag_output_ps_2 = _mm_cvtepi32_ps(imag_output_i_2); + + real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps_1); + real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps_2); + imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps_1); + imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps_2); + + input_ptr += 4; + carrier_ptr += 4; + E_code_ptr += 4; + L_code_ptr += 4; + P_code_ptr += 4; + } + + __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4]; + + _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector + + for (int i = 0; i<4; ++i) + { + E_out_real += real_E_dotProductVector[i]; + E_out_imag += 
imag_E_dotProductVector[i]; + P_out_real += real_P_dotProductVector[i]; + P_out_imag += imag_P_dotProductVector[i]; + L_out_real += real_L_dotProductVector[i]; + L_out_imag += imag_L_dotProductVector[i]; + } + *E_out_ptr = lv_cmake(E_out_real, E_out_imag); + *P_out_ptr = lv_cmake(P_out_real, P_out_imag); + *L_out_ptr = lv_cmake(L_out_real, L_out_imag); + } + + lv_16sc_t bb_signal_sample; + for(int i=0; i < num_points%8; ++i) + { + //Perform the carrier wipe-off + bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); + // Now get early, late, and prompt values for each + *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); + *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); + *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#ifdef LV_HAVE_SSE4_1 +#include "smmintrin.h" +/*! + \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation + \param input The input signal input + \param carrier The carrier signal input + \param E_code Early PRN code replica input + \param P_code Prompt PRN code replica input + \param L_code Late PRN code replica input + \param E_out Early correlation output + \param P_out Prompt correlation output + \param L_out Late correlation output + \param num_points The number of complex values in vectors + */ +static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_u_sse4_1_third(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points) +{ + const unsigned int sse_iters = num_points / 8; + unsigned int index = 0; + unsigned int indexPlus4 = 0; + + __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample; + __m128i realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, real_output_i32, imag_output_i32; + + 
__m128 real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc; + __m128i real_output_i_1, real_output_i_2, imag_output_i_1, imag_output_i_2; + __m128 real_output_ps, imag_output_ps; + + float E_out_real = 0; + float E_out_imag = 0; + float P_out_real = 0; + float P_out_imag = 0; + float L_out_real = 0; + float L_out_imag = 0; + + const lv_16sc_t* input_ptr = input; + const lv_16sc_t* carrier_ptr = carrier; + + const lv_16sc_t* E_code_ptr = E_code; + lv_32fc_t* E_out_ptr = E_out; + const lv_16sc_t* L_code_ptr = L_code; + lv_32fc_t* L_out_ptr = L_out; + const lv_16sc_t* P_code_ptr = P_code; + lv_32fc_t* P_out_ptr = P_out; + + *E_out_ptr = 0; + *P_out_ptr = 0; + *L_out_ptr = 0; + + real_E_code_acc = _mm_setzero_ps(); + imag_E_code_acc = _mm_setzero_ps(); + real_P_code_acc = _mm_setzero_ps(); + imag_P_code_acc = _mm_setzero_ps(); + real_L_code_acc = _mm_setzero_ps(); + imag_L_code_acc = _mm_setzero_ps(); + + if (sse_iters>0) + { + for(index = 0;index < 8*sse_iters; index+=8){ + indexPlus4 = index + 4; + //Perform the carrier wipe-off + x1 = _mm_lddqu_si128((__m128i*)&input_ptr[index]); + x2 = _mm_lddqu_si128((__m128i*)&input_ptr[indexPlus4]); + + y1 = _mm_lddqu_si128((__m128i*)&carrier_ptr[index]); + y2 = _mm_lddqu_si128((__m128i*)&carrier_ptr[indexPlus4]); + + imagx = _mm_srli_si128 (x1, 2); + imagx = _mm_blend_epi16 (x2, imagx, 85); + realx = _mm_slli_si128 (x2, 2); + realx = _mm_blend_epi16 (realx, x1, 85); + + imagy = _mm_srli_si128 (y1, 2); + imagy = _mm_blend_epi16 (y2, imagy, 85); + realy = _mm_slli_si128 (y2, 2); + realy = _mm_blend_epi16 (realy, y1, 85); + + realx_mult_realy = _mm_mullo_epi16 (realx, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); + realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); + + real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, 
imagx_mult_realy); + + //Get early values + y1 = _mm_lddqu_si128((__m128i*)&E_code_ptr[index]); + y2 = _mm_lddqu_si128((__m128i*)&E_code_ptr[indexPlus4]); + + imagy = _mm_srli_si128 (y1, 2); + imagy = _mm_blend_epi16 (y2, imagy, 85); + realy = _mm_slli_si128 (y2, 2); + realy = _mm_blend_epi16 (realy, y1, 85); + + realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); + realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); + + real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + real_output_i_1 = _mm_cvtepi16_epi32(real_output); + real_output = _mm_srli_si128 (real_output, 8); + real_output_i_2 = _mm_cvtepi16_epi32(real_output); + real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); + real_output_ps = _mm_cvtepi32_ps(real_output_i32); + + imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); + imag_output = _mm_srli_si128 (imag_output, 8); + imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); + imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); + imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); + + real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps); + imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps); + + //Get prompt values + y1 = _mm_lddqu_si128((__m128i*)&P_code_ptr[index]); + y2 = _mm_lddqu_si128((__m128i*)&P_code_ptr[indexPlus4]); + + imagy = _mm_srli_si128 (y1, 2); + imagy = _mm_blend_epi16 (y2, imagy, 85); + realy = _mm_slli_si128 (y2, 2); + realy = _mm_blend_epi16 (realy, y1, 85); + + realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); + realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); + + 
real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + real_output_i_1 = _mm_cvtepi16_epi32(real_output); + real_output = _mm_srli_si128 (real_output, 8); + real_output_i_2 = _mm_cvtepi16_epi32(real_output); + real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); + real_output_ps = _mm_cvtepi32_ps(real_output_i32); + + imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); + imag_output = _mm_srli_si128 (imag_output, 8); + imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); + imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); + imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); + + real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps); + imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps); + + //Get late values + y1 = _mm_lddqu_si128((__m128i*)&L_code_ptr[index]); + y2 = _mm_lddqu_si128((__m128i*)&L_code_ptr[indexPlus4]); + + imagy = _mm_srli_si128 (y1, 2); + imagy = _mm_blend_epi16 (y2, imagy, 85); + realy = _mm_slli_si128 (y2, 2); + realy = _mm_blend_epi16 (realy, y1, 85); + + realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); + realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); + + real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + real_output_i_1 = _mm_cvtepi16_epi32(real_output); + real_output = _mm_srli_si128 (real_output, 8); + real_output_i_2 = _mm_cvtepi16_epi32(real_output); + real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); + real_output_ps = _mm_cvtepi32_ps(real_output_i32); + + imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); + imag_output = _mm_srli_si128 (imag_output, 8); + imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); + imag_output_i32 = 
_mm_add_epi32 (imag_output_i_1, imag_output_i_2); + imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); + + real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps); + imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps); + } + + __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4]; + + _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector + + for (int i = 0; i<4; ++i) + { + E_out_real += real_E_dotProductVector[i]; + E_out_imag += imag_E_dotProductVector[i]; + P_out_real += real_P_dotProductVector[i]; + P_out_imag += imag_P_dotProductVector[i]; + L_out_real += real_L_dotProductVector[i]; + L_out_imag += imag_L_dotProductVector[i]; + } + *E_out_ptr = lv_cmake(E_out_real, E_out_imag); + *P_out_ptr = lv_cmake(P_out_real, P_out_imag); + *L_out_ptr = lv_cmake(L_out_real, L_out_imag); + } + + lv_16sc_t bb_signal_sample; + for(; index < num_points; index++) + { + //Perform the carrier wipe-off + bb_signal_sample = input_ptr[index] * carrier_ptr[index]; + // Now get early, late, and prompt values for each + 
*E_out_ptr += (lv_32fc_t) (bb_signal_sample * E_code_ptr[index]); + *P_out_ptr += (lv_32fc_t) (bb_signal_sample * P_code_ptr[index]); + *L_out_ptr += (lv_32fc_t) (bb_signal_sample * L_code_ptr[index]); + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#ifdef LV_HAVE_SSE4_1 +#include "smmintrin.h" +/*! + \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation + \param input The input signal input + \param carrier The carrier signal input + \param E_code Early PRN code replica input + \param P_code Prompt PRN code replica input + \param L_code Late PRN code replica input + \param E_out Early correlation output + \param P_out Prompt correlation output + \param L_out Late correlation output + \param num_points The number of complex values in vectors + */ +static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_u_sse4_1_fourth(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points) +{ + const unsigned int sse_iters = num_points / 8; + + __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample; + __m128i realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, real_output_i32, imag_output_i32; + + __m128 real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc; + __m128i real_output_i_1, real_output_i_2, imag_output_i_1, imag_output_i_2; + __m128 real_output_ps, imag_output_ps; + + float E_out_real = 0; + float E_out_imag = 0; + float P_out_real = 0; + float P_out_imag = 0; + float L_out_real = 0; + float L_out_imag = 0; + + const lv_16sc_t* input_ptr = input; + const lv_16sc_t* carrier_ptr = carrier; + + const lv_16sc_t* E_code_ptr = E_code; + lv_32fc_t* E_out_ptr = E_out; + const lv_16sc_t* L_code_ptr = L_code; + lv_32fc_t* L_out_ptr = L_out; + const lv_16sc_t* P_code_ptr 
= P_code; + lv_32fc_t* P_out_ptr = P_out; + + *E_out_ptr = 0; + *P_out_ptr = 0; + *L_out_ptr = 0; + + real_E_code_acc = _mm_setzero_ps(); + imag_E_code_acc = _mm_setzero_ps(); + real_P_code_acc = _mm_setzero_ps(); + imag_P_code_acc = _mm_setzero_ps(); + real_L_code_acc = _mm_setzero_ps(); + imag_L_code_acc = _mm_setzero_ps(); + + if (sse_iters>0) + { + for(int number = 0;number < sse_iters; number++){ + + //Perform the carrier wipe-off + x1 = _mm_lddqu_si128((__m128i*)input_ptr); + input_ptr += 4; + x2 = _mm_lddqu_si128((__m128i*)input_ptr); + + y1 = _mm_lddqu_si128((__m128i*)carrier_ptr); + carrier_ptr += 4; + y2 = _mm_lddqu_si128((__m128i*)carrier_ptr); + + imagx = _mm_srli_si128 (x1, 2); + imagx = _mm_blend_epi16 (x2, imagx, 85); + realx = _mm_slli_si128 (x2, 2); + realx = _mm_blend_epi16 (realx, x1, 85); + + imagy = _mm_srli_si128 (y1, 2); + imagy = _mm_blend_epi16 (y2, imagy, 85); + realy = _mm_slli_si128 (y2, 2); + realy = _mm_blend_epi16 (realy, y1, 85); + + realx_mult_realy = _mm_mullo_epi16 (realx, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); + realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); + + real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + //Get early values + y1 = _mm_lddqu_si128((__m128i*)E_code_ptr); + E_code_ptr += 4; + y2 = _mm_lddqu_si128((__m128i*)E_code_ptr); + + imagy = _mm_srli_si128 (y1, 2); + imagy = _mm_blend_epi16 (y2, imagy, 85); + realy = _mm_slli_si128 (y2, 2); + realy = _mm_blend_epi16 (realy, y1, 85); + + realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); + realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); + + real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_output 
= _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + real_output_i_1 = _mm_cvtepi16_epi32(real_output); + real_output = _mm_srli_si128 (real_output, 8); + real_output_i_2 = _mm_cvtepi16_epi32(real_output); + real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); + real_output_ps = _mm_cvtepi32_ps(real_output_i32); + + imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); + imag_output = _mm_srli_si128 (imag_output, 8); + imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); + imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); + imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); + + real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps); + imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps); + + //Get prompt values + y1 = _mm_lddqu_si128((__m128i*)P_code_ptr); + P_code_ptr += 4; + y2 = _mm_lddqu_si128((__m128i*)P_code_ptr); + + imagy = _mm_srli_si128 (y1, 2); + imagy = _mm_blend_epi16 (y2, imagy, 85); + realy = _mm_slli_si128 (y2, 2); + realy = _mm_blend_epi16 (realy, y1, 85); + + realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); + realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); + + real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + real_output_i_1 = _mm_cvtepi16_epi32(real_output); + real_output = _mm_srli_si128 (real_output, 8); + real_output_i_2 = _mm_cvtepi16_epi32(real_output); + real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); + real_output_ps = _mm_cvtepi32_ps(real_output_i32); + + imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); + imag_output = _mm_srli_si128 (imag_output, 8); + imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); + imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); + imag_output_ps = 
_mm_cvtepi32_ps(imag_output_i32); + + real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps); + imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps); + + //Get late values + y1 = _mm_lddqu_si128((__m128i*)L_code_ptr); + L_code_ptr += 4; + y2 = _mm_lddqu_si128((__m128i*)L_code_ptr); + + imagy = _mm_srli_si128 (y1, 2); + imagy = _mm_blend_epi16 (y2, imagy, 85); + realy = _mm_slli_si128 (y2, 2); + realy = _mm_blend_epi16 (realy, y1, 85); + + realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); + realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); + + real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + real_output_i_1 = _mm_cvtepi16_epi32(real_output); + real_output = _mm_srli_si128 (real_output, 8); + real_output_i_2 = _mm_cvtepi16_epi32(real_output); + real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); + real_output_ps = _mm_cvtepi32_ps(real_output_i32); + + imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); + imag_output = _mm_srli_si128 (imag_output, 8); + imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); + imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); + imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); + + real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps); + imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps); + + input_ptr += 4; + carrier_ptr += 4; + E_code_ptr += 4; + L_code_ptr += 4; + P_code_ptr += 4; + } + + __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) 
float imag_L_dotProductVector[4]; + + _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector + + for (int i = 0; i<4; ++i) + { + E_out_real += real_E_dotProductVector[i]; + E_out_imag += imag_E_dotProductVector[i]; + P_out_real += real_P_dotProductVector[i]; + P_out_imag += imag_P_dotProductVector[i]; + L_out_real += real_L_dotProductVector[i]; + L_out_imag += imag_L_dotProductVector[i]; + } + *E_out_ptr = lv_cmake(E_out_real, E_out_imag); + *P_out_ptr = lv_cmake(P_out_real, P_out_imag); + *L_out_ptr = lv_cmake(L_out_real, L_out_imag); + } + + lv_16sc_t bb_signal_sample; + for(int i=0; i < num_points%8; ++i) + { + //Perform the carrier wipe-off + bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); + // Now get early, late, and prompt values for each + *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); + *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); + *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#ifdef LV_HAVE_SSE4_1 +#include "smmintrin.h" +#include "CommonMacros/CommonMacros.h" +/*! 
+ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation + \param input The input signal input + \param carrier The carrier signal input + \param E_code Early PRN code replica input + \param P_code Prompt PRN code replica input + \param L_code Late PRN code replica input + \param E_out Early correlation output + \param P_out Prompt correlation output + \param L_out Late correlation output + \param num_points The number of complex values in vectors + */ + +static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_u_sse4_1_fifth(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points) +{ + const unsigned int sse_iters = num_points / 8; + + __m128i realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy; + __m128i input_i_1, input_i_2, output_i32; + + __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample; + __m128i realx, imagx, realy, imagy, real_output, imag_output; + + __m128 real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc; + __m128 real_output_ps, imag_output_ps; + + float E_out_real = 0; + float E_out_imag = 0; + float P_out_real = 0; + float P_out_imag = 0; + float L_out_real = 0; + float L_out_imag = 0; + + const lv_16sc_t* input_ptr = input; + const lv_16sc_t* carrier_ptr = carrier; + + const lv_16sc_t* E_code_ptr = E_code; + lv_32fc_t* E_out_ptr = E_out; + const lv_16sc_t* L_code_ptr = L_code; + lv_32fc_t* L_out_ptr = L_out; + const lv_16sc_t* P_code_ptr = P_code; + lv_32fc_t* P_out_ptr = P_out; + + *E_out_ptr = 0; + *P_out_ptr = 0; + *L_out_ptr = 0; + + real_E_code_acc = _mm_setzero_ps(); + imag_E_code_acc = _mm_setzero_ps(); + real_P_code_acc = _mm_setzero_ps(); + imag_P_code_acc = _mm_setzero_ps(); + real_L_code_acc = _mm_setzero_ps(); + imag_L_code_acc = _mm_setzero_ps(); + + if 
(sse_iters>0) + { + for(int number = 0;number < sse_iters; number++){ + + //Perform the carrier wipe-off + x1 = _mm_lddqu_si128((__m128i*)input_ptr); + input_ptr += 4; + x2 = _mm_lddqu_si128((__m128i*)input_ptr); + + y1 = _mm_lddqu_si128((__m128i*)carrier_ptr); + carrier_ptr += 4; + y2 = _mm_lddqu_si128((__m128i*)carrier_ptr); + + CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(x1, x2, realx, imagx) + CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy) + CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample) + + //Get early values + y1 = _mm_lddqu_si128((__m128i*)E_code_ptr); + E_code_ptr += 4; + y2 = _mm_lddqu_si128((__m128i*)E_code_ptr); + + CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy) + CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(real_bb_signal_sample, imag_bb_signal_sample, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output) + + CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(real_output, input_i_1, input_i_2, output_i32, real_output_ps) + CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(imag_output, input_i_1, input_i_2, output_i32, imag_output_ps) + + real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps); + imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps); + + //Get prompt values + y1 = _mm_lddqu_si128((__m128i*)P_code_ptr); + P_code_ptr += 4; + y2 = _mm_lddqu_si128((__m128i*)P_code_ptr); + + CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy) + CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(real_bb_signal_sample, imag_bb_signal_sample, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output) + + CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(real_output, input_i_1, input_i_2, output_i32, real_output_ps) + 
CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(imag_output, input_i_1, input_i_2, output_i32, imag_output_ps) + + real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps); + imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps); + + //Get late values + y1 = _mm_lddqu_si128((__m128i*)L_code_ptr); + L_code_ptr += 4; + y2 = _mm_lddqu_si128((__m128i*)L_code_ptr); + + CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy) + CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(real_bb_signal_sample, imag_bb_signal_sample, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output) + + CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(real_output, input_i_1, input_i_2, output_i32, real_output_ps) + CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(imag_output, input_i_1, input_i_2, output_i32, imag_output_ps) + + real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps); + imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps); + + input_ptr += 4; + carrier_ptr += 4; + E_code_ptr += 4; + L_code_ptr += 4; + P_code_ptr += 4; + } + + __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4]; + + _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector + 
_mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector + + for (int i = 0; i<4; ++i) + { + E_out_real += real_E_dotProductVector[i]; + E_out_imag += imag_E_dotProductVector[i]; + P_out_real += real_P_dotProductVector[i]; + P_out_imag += imag_P_dotProductVector[i]; + L_out_real += real_L_dotProductVector[i]; + L_out_imag += imag_L_dotProductVector[i]; + } + *E_out_ptr = lv_cmake(E_out_real, E_out_imag); + *P_out_ptr = lv_cmake(P_out_real, P_out_imag); + *L_out_ptr = lv_cmake(L_out_real, L_out_imag); + } + + lv_16sc_t bb_signal_sample; + for(int i=0; i < num_points%8; ++i) + { + //Perform the carrier wipe-off + bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); + // Now get early, late, and prompt values for each + *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); + *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); + *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#ifdef LV_HAVE_SSE4_1 +#include "smmintrin.h" +#include "CommonMacros/CommonMacros_16ic_cw_epl_corr_32fc.h" +#include "CommonMacros/CommonMacros.h" +/*! 
+ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation + \param input The input signal input + \param carrier The carrier signal input + \param E_code Early PRN code replica input + \param P_code Prompt PRN code replica input + \param L_code Late PRN code replica input + \param E_out Early correlation output + \param P_out Prompt correlation output + \param L_out Late correlation output + \param num_points The number of complex values in vectors + */ + +static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_u_sse4_1_sixth(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points) +{ + const unsigned int sse_iters = num_points / 8; + + __m128i realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy; + __m128i input_i_1, input_i_2, output_i32; + + __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample; + __m128i realx, imagx, realy, imagy, real_output, imag_output; + + __m128 real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc; + __m128 real_output_ps, imag_output_ps; + + float E_out_real = 0; + float E_out_imag = 0; + float P_out_real = 0; + float P_out_imag = 0; + float L_out_real = 0; + float L_out_imag = 0; + + const lv_16sc_t* input_ptr = input; + const lv_16sc_t* carrier_ptr = carrier; + + const lv_16sc_t* E_code_ptr = E_code; + lv_32fc_t* E_out_ptr = E_out; + const lv_16sc_t* L_code_ptr = L_code; + lv_32fc_t* L_out_ptr = L_out; + const lv_16sc_t* P_code_ptr = P_code; + lv_32fc_t* P_out_ptr = P_out; + + *E_out_ptr = 0; + *P_out_ptr = 0; + *L_out_ptr = 0; + + real_E_code_acc = _mm_setzero_ps(); + imag_E_code_acc = _mm_setzero_ps(); + real_P_code_acc = _mm_setzero_ps(); + imag_P_code_acc = _mm_setzero_ps(); + real_L_code_acc = _mm_setzero_ps(); + imag_L_code_acc = _mm_setzero_ps(); + + if 
(sse_iters>0) + { + for(int number = 0;number < sse_iters; number++){ + + //Perform the carrier wipe-off + x1 = _mm_lddqu_si128((__m128i*)input_ptr); + input_ptr += 4; + x2 = _mm_lddqu_si128((__m128i*)input_ptr); + + y1 = _mm_lddqu_si128((__m128i*)carrier_ptr); + carrier_ptr += 4; + y2 = _mm_lddqu_si128((__m128i*)carrier_ptr); + + CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(x1, x2, realx, imagx) + CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy) + CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample) + + //Get early values + y1 = _mm_lddqu_si128((__m128i*)E_code_ptr); + E_code_ptr += 4; + y2 = _mm_lddqu_si128((__m128i*)E_code_ptr); + + CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) + + real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps); + imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps); + + //Get prompt values + y1 = _mm_lddqu_si128((__m128i*)P_code_ptr); + P_code_ptr += 4; + y2 = _mm_lddqu_si128((__m128i*)P_code_ptr); + + CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) + + real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps); + imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps); + + //Get late values + y1 = _mm_lddqu_si128((__m128i*)L_code_ptr); + L_code_ptr += 4; + y2 = _mm_lddqu_si128((__m128i*)L_code_ptr); + + CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, 
imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) + + real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps); + imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps); + + input_ptr += 4; + carrier_ptr += 4; + E_code_ptr += 4; + L_code_ptr += 4; + P_code_ptr += 4; + } + + __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4]; + + _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector + + for (int i = 0; i<4; ++i) + { + E_out_real += real_E_dotProductVector[i]; + E_out_imag += imag_E_dotProductVector[i]; + P_out_real += real_P_dotProductVector[i]; + P_out_imag += imag_P_dotProductVector[i]; + L_out_real += real_L_dotProductVector[i]; + L_out_imag += imag_L_dotProductVector[i]; + } + *E_out_ptr = lv_cmake(E_out_real, E_out_imag); + *P_out_ptr = lv_cmake(P_out_real, P_out_imag); + *L_out_ptr = lv_cmake(L_out_real, L_out_imag); + } + + lv_16sc_t bb_signal_sample; + for(int 
i=0; i < num_points%8; ++i) + { + //Perform the carrier wipe-off + bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); + // Now get early, late, and prompt values for each + *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); + *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); + *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation + \param input The input signal input + \param carrier The carrier signal input + \param E_code Early PRN code replica input + \param P_code Early PRN code replica input + \param L_code Early PRN code replica input + \param E_out Early correlation output + \param P_out Early correlation output + \param L_out Early correlation output + \param num_points The number of complex values in vectors + */ +static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_generic(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points) +{ + lv_16sc_t bb_signal_sample; + lv_16sc_t tmp1; + lv_16sc_t tmp2; + lv_16sc_t tmp3; + + bb_signal_sample = lv_cmake(0, 0); + + *E_out = 0; + *P_out = 0; + *L_out = 0; + // perform Early, Prompt and Late correlation + + for(int i=0; i < num_points; ++i) + { + //Perform the carrier wipe-off + bb_signal_sample = input[i] * carrier[i]; + + tmp1 = bb_signal_sample * E_code[i]; + tmp2 = bb_signal_sample * P_code[i]; + tmp3 = bb_signal_sample * L_code[i]; + + // Now get early, late, and prompt values for each + *E_out += (lv_32fc_t)tmp1; + *P_out += (lv_32fc_t)tmp2; + *L_out += (lv_32fc_t)tmp3; + } +} +#endif /* LV_HAVE_GENERIC */ +#endif /* INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_u_H */ + + +#ifndef 
INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_a_H +#define INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_a_H + +#include +#include +#include +#include +#include +// +//#ifdef LV_HAVE_SSE4_1 +//#include "smmintrin.h" +///*! +// \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation +// \param input The input signal input +// \param carrier The carrier signal input +// \param E_code Early PRN code replica input +// \param P_code Early PRN code replica input +// \param L_code Early PRN code replica input +// \param E_out Early correlation output +// \param P_out Early correlation output +// \param L_out Early correlation output +// \param num_points The number of complex values in vectors +// */ +//static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_a_sse4_1(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points) +//{ +// const unsigned int sse_iters = num_points / 8; +// +// __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample; +// __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output; +// +// __m128 real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc; +// __m128i real_output_i_1, real_output_i_2, imag_output_i_1, imag_output_i_2; +// __m128 real_output_ps_1, real_output_ps_2, imag_output_ps_1, imag_output_ps_2; +// +// float E_out_real = 0; +// float E_out_imag = 0; +// float P_out_real = 0; +// float P_out_imag = 0; +// float L_out_real = 0; +// float L_out_imag = 0; +// +// const lv_16sc_t* input_ptr = input; +// const lv_16sc_t* carrier_ptr = carrier; +// +// const lv_16sc_t* E_code_ptr = E_code; +// lv_32fc_t* E_out_ptr = E_out; +// const lv_16sc_t* L_code_ptr = L_code; +// lv_32fc_t* 
L_out_ptr = L_out; +// const lv_16sc_t* P_code_ptr = P_code; +// lv_32fc_t* P_out_ptr = P_out; +// +// *E_out_ptr = 0; +// *P_out_ptr = 0; +// *L_out_ptr = 0; +// +// mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); +// +// real_E_code_acc = _mm_setzero_ps(); +// imag_E_code_acc = _mm_setzero_ps(); +// real_P_code_acc = _mm_setzero_ps(); +// imag_P_code_acc = _mm_setzero_ps(); +// real_L_code_acc = _mm_setzero_ps(); +// imag_L_code_acc = _mm_setzero_ps(); +// +// if (sse_iters>0) +// { +// for(int number = 0;number < sse_iters; number++){ +// +// //Perform the carrier wipe-off +// x1 = _mm_lddqu_si128((__m128i*)input_ptr); +// input_ptr += 4; +// x2 = _mm_lddqu_si128((__m128i*)input_ptr); +// +// y1 = _mm_lddqu_si128((__m128i*)carrier_ptr); +// carrier_ptr += 4; +// y2 = _mm_lddqu_si128((__m128i*)carrier_ptr); +// +// imagx = _mm_srli_si128 (x1, 2); +// imagx = _mm_blend_epi16 (x2, imagx, 85); +// realx = _mm_slli_si128 (x2, 2); +// realx = _mm_blend_epi16 (realx, x1, 85); +// +// imagy = _mm_srli_si128 (y1, 2); +// imagy = _mm_blend_epi16 (y2, imagy, 85); +// realy = _mm_slli_si128 (y2, 2); +// realy = _mm_blend_epi16 (realy, y1, 85); +// +// realx_mult_realy = _mm_mullo_epi16 (realx, realy); +// imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); +// realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); +// imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); +// +// real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); +// imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); +// +// //Get early values +// y1 = _mm_lddqu_si128((__m128i*)E_code_ptr); +// E_code_ptr += 4; +// y2 = _mm_lddqu_si128((__m128i*)E_code_ptr); +// +// imagy = _mm_srli_si128 (y1, 2); +// imagy = _mm_blend_epi16 (y2, imagy, 85); +// realy = _mm_slli_si128 (y2, 2); +// realy = _mm_blend_epi16 (realy, y1, 85); +// +// realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); +// imagx_mult_imagy = 
_mm_mullo_epi16 (imag_bb_signal_sample, imagy); +// realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); +// imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); +// +// real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); +// imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); +// +// real_output_i_1 = _mm_cvtepi16_epi32(real_output); +// real_output_ps_1 = _mm_cvtepi32_ps(real_output_i_1); +// real_output = _mm_srli_si128 (real_output, 8); +// real_output_i_2 = _mm_cvtepi16_epi32(real_output); +// real_output_ps_2 = _mm_cvtepi32_ps(real_output_i_2); +// +// imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); +// imag_output_ps_1 = _mm_cvtepi32_ps(imag_output_i_1); +// imag_output = _mm_srli_si128 (imag_output, 8); +// imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); +// imag_output_ps_2 = _mm_cvtepi32_ps(imag_output_i_2); +// +// real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps_1); +// real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps_2); +// imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps_1); +// imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps_2); +// +// //Get prompt values +// y1 = _mm_lddqu_si128((__m128i*)P_code_ptr); +// P_code_ptr += 4; +// y2 = _mm_lddqu_si128((__m128i*)P_code_ptr); +// +// imagy = _mm_srli_si128 (y1, 2); +// imagy = _mm_blend_epi16 (y2, imagy, 85); +// realy = _mm_slli_si128 (y2, 2); +// realy = _mm_blend_epi16 (realy, y1, 85); +// +// realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); +// imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); +// realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); +// imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); +// +// real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); +// imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); +// +// real_output_i_1 = _mm_cvtepi16_epi32(real_output); +// real_output_ps_1 
= _mm_cvtepi32_ps(real_output_i_1); +// real_output = _mm_srli_si128 (real_output, 8); +// real_output_i_2 = _mm_cvtepi16_epi32(real_output); +// real_output_ps_2 = _mm_cvtepi32_ps(real_output_i_2); +// +// imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); +// imag_output_ps_1 = _mm_cvtepi32_ps(imag_output_i_1); +// imag_output = _mm_srli_si128 (imag_output, 8); +// imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); +// imag_output_ps_2 = _mm_cvtepi32_ps(imag_output_i_2); +// +// real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps_1); +// real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps_2); +// imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps_1); +// imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps_2); +// +// //Get late values +// y1 = _mm_lddqu_si128((__m128i*)L_code_ptr); +// L_code_ptr += 4; +// y2 = _mm_lddqu_si128((__m128i*)L_code_ptr); +// +// imagy = _mm_srli_si128 (y1, 2); +// imagy = _mm_blend_epi16 (y2, imagy, 85); +// realy = _mm_slli_si128 (y2, 2); +// realy = _mm_blend_epi16 (realy, y1, 85); +// +// realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); +// imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); +// realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); +// imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); +// +// real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); +// imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); +// +// real_output_i_1 = _mm_cvtepi16_epi32(real_output); +// real_output_ps_1 = _mm_cvtepi32_ps(real_output_i_1); +// real_output = _mm_srli_si128 (real_output, 8); +// real_output_i_2 = _mm_cvtepi16_epi32(real_output); +// real_output_ps_2 = _mm_cvtepi32_ps(real_output_i_2); +// +// imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); +// imag_output_ps_1 = _mm_cvtepi32_ps(imag_output_i_1); +// imag_output = _mm_srli_si128 (imag_output, 8); +// imag_output_i_2 = 
_mm_cvtepi16_epi32(imag_output); +// imag_output_ps_2 = _mm_cvtepi32_ps(imag_output_i_2); +// +// real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps_1); +// real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps_2); +// imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps_1); +// imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps_2); +// +// input_ptr += 4; +// carrier_ptr += 4; +// E_code_ptr += 4; +// L_code_ptr += 4; +// P_code_ptr += 4; +// } +// +// __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; +// __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; +// __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; +// __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; +// __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; +// __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4]; +// +// _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector +// _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector +// _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector +// _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector +// _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector +// _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector +// +// for (int i = 0; i<4; ++i) +// { +// E_out_real += real_E_dotProductVector[i]; +// E_out_imag += imag_E_dotProductVector[i]; +// P_out_real += real_P_dotProductVector[i]; +// P_out_imag += imag_P_dotProductVector[i]; +// L_out_real += real_L_dotProductVector[i]; +// L_out_imag += imag_L_dotProductVector[i]; +// } +// *E_out_ptr = lv_cmake(E_out_real, E_out_imag); +// *P_out_ptr = 
lv_cmake(P_out_real, P_out_imag); +// *L_out_ptr = lv_cmake(L_out_real, L_out_imag); +// } +// +// lv_16sc_t bb_signal_sample; +// for(int i=0; i < num_points%8; ++i) +// { +// //Perform the carrier wipe-off +// bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); +// // Now get early, late, and prompt values for each +// *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); +// *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); +// *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); +// } +//} +//#endif /* LV_HAVE_SSE4_1 */ +// +#ifdef LV_HAVE_GENERIC +/*! + \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation + \param input The input signal input + \param carrier The carrier signal input + \param E_code Early PRN code replica input + \param P_code Early PRN code replica input + \param L_code Early PRN code replica input + \param E_out Early correlation output + \param P_out Early correlation output + \param L_out Early correlation output + \param num_points The number of complex values in vectors + */ +static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_a_generic(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points) +{ + lv_16sc_t bb_signal_sample; + lv_16sc_t tmp1; + lv_16sc_t tmp2; + lv_16sc_t tmp3; + + bb_signal_sample = lv_cmake(0, 0); + + *E_out = 0; + *P_out = 0; + *L_out = 0; + // perform Early, Prompt and Late correlation + + for(int i=0; i < num_points; ++i) + { + //Perform the carrier wipe-off + bb_signal_sample = input[i] * carrier[i]; + + tmp1 = bb_signal_sample * E_code[i]; + tmp2 = bb_signal_sample * P_code[i]; + tmp3 = bb_signal_sample * L_code[i]; + + // Now get early, late, and prompt values for each + *E_out += (lv_32fc_t)tmp1; + *P_out += (lv_32fc_t)tmp2; + *L_out += (lv_32fc_t)tmp3; + } +} +#endif /* 
LV_HAVE_GENERIC */ +#endif /* INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_a_H */ diff --git a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5.h b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5.h new file mode 100644 index 000000000..af207b92f --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5.h @@ -0,0 +1,595 @@ +/*! + * \file volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5.h + * \brief Volk protokernel: performs the carrier wipe-off mixing and the Very early, Early, Prompt, Late and very late correlation with 32 bits vectors and returns float32 values. + * \authors
<ul> + * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com + * </ul>
+ * + * Volk protokernel that performs the carrier wipe-off mixing and the + * Very Early, Early, Prompt, Late and Very Late correlation with 32 bits vectors (16 bits the + * real part and 16 bits the imaginary part) and accumulates into float32 values, returning them: + * - The carrier wipe-off is done by multiplying the input signal by the + * carrier (multiplication of 32 bits vectors). It returns the input + * signal in base band (BB) + * - Very Early values are calculated by multiplying the input signal in BB by the + * very early code (multiplication of 32 bits vectors), converting that to float32 and accumulating the results + * - Early values are calculated by multiplying the input signal in BB by the + * early code (multiplication of 32 bits vectors), converting that to float32 and accumulating the results + * - Prompt values are calculated by multiplying the input signal in BB by the + * prompt code (multiplication of 32 bits vectors), converting that to float32 and accumulating the results + * - Late values are calculated by multiplying the input signal in BB by the + * late code (multiplication of 32 bits vectors), converting that to float32 and accumulating the results + * - Very Late values are calculated by multiplying the input signal in BB by the + * very late code (multiplication of 32 bits vectors), converting that to float32 and accumulating the results + * + * ------------------------------------------------------------------------- + * + * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) + * + * GNSS-SDR is a software defined Global Navigation + * Satellite Systems receiver + * + * This file is part of GNSS-SDR. + * + * GNSS-SDR is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version.
+ * + * GNSS-SDR is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNSS-SDR. If not, see . + * + * ------------------------------------------------------------------------- + */ + +#ifndef INCLUDED_gnsssdr_volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_u_H +#define INCLUDED_gnsssdr_volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_u_H + +#include +#include +#include +#include +#include + +#ifdef LV_HAVE_SSE4_1 +#include "smmintrin.h" +#include "CommonMacros/CommonMacros_16ic_cw_epl_corr_32fc.h" +#include "CommonMacros/CommonMacros.h" + /*! + \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Vate correlation + \param input The input signal input + \param carrier The carrier signal input + \param VE_code Very Early PRN code replica input + \param E_code Early PRN code replica input + \param P_code Prompt PRN code replica input + \param L_code Late PRN code replica input + \param VL_code Very Late PRN code replica input + \param VE_out Very Early correlation output + \param E_out Early correlation output + \param P_out Prompt correlation output + \param L_out Late correlation output + \param VL_out Very Late correlation output + \param num_points The number of complex values in vectors + */ +static inline void volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_u_sse4_1(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* VE_code, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, const lv_16sc_t* VL_code, unsigned int num_points) +{ + const unsigned int sse_iters = num_points / 8; + + __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample; 
+ __m128i realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output; + + __m128 real_VE_code_acc, imag_VE_code_acc, real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc, real_VL_code_acc, imag_VL_code_acc; + __m128i input_i_1, input_i_2, output_i32; + __m128 real_output_ps, imag_output_ps; + + float VE_out_real = 0; + float VE_out_imag = 0; + float E_out_real = 0; + float E_out_imag = 0; + float P_out_real = 0; + float P_out_imag = 0; + float L_out_real = 0; + float L_out_imag = 0; + float VL_out_real = 0; + float VL_out_imag = 0; + + const lv_16sc_t* input_ptr = input; + const lv_16sc_t* carrier_ptr = carrier; + + const lv_16sc_t* VE_code_ptr = VE_code; + lv_32fc_t* VE_out_ptr = VE_out; + const lv_16sc_t* E_code_ptr = E_code; + lv_32fc_t* E_out_ptr = E_out; + const lv_16sc_t* L_code_ptr = L_code; + lv_32fc_t* L_out_ptr = L_out; + const lv_16sc_t* P_code_ptr = P_code; + lv_32fc_t* P_out_ptr = P_out; + const lv_16sc_t* VL_code_ptr = VL_code; + lv_32fc_t* VL_out_ptr = VL_out; + + *VE_out_ptr = 0; + *E_out_ptr = 0; + *P_out_ptr = 0; + *L_out_ptr = 0; + *VL_out_ptr = 0; + + real_VE_code_acc = _mm_setzero_ps(); + imag_VE_code_acc = _mm_setzero_ps(); + real_E_code_acc = _mm_setzero_ps(); + imag_E_code_acc = _mm_setzero_ps(); + real_P_code_acc = _mm_setzero_ps(); + imag_P_code_acc = _mm_setzero_ps(); + real_L_code_acc = _mm_setzero_ps(); + imag_L_code_acc = _mm_setzero_ps(); + real_VL_code_acc = _mm_setzero_ps(); + imag_VL_code_acc = _mm_setzero_ps(); + + if (sse_iters>0) + { + for(int number = 0;number < sse_iters; number++){ + + //Perform the carrier wipe-off + x1 = _mm_lddqu_si128((__m128i*)input_ptr); + input_ptr += 4; + x2 = _mm_lddqu_si128((__m128i*)input_ptr); + + y1 = _mm_lddqu_si128((__m128i*)carrier_ptr); + carrier_ptr += 4; + y2 = _mm_lddqu_si128((__m128i*)carrier_ptr); + + CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(x1, x2, realx, 
imagx) + CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy) + CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample) + + //Get very early values + y1 = _mm_lddqu_si128((__m128i*)VE_code_ptr); + VE_code_ptr += 4; + y2 = _mm_lddqu_si128((__m128i*)VE_code_ptr); + + CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) + + real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps); + imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps); + + //Get early values + y1 = _mm_lddqu_si128((__m128i*)E_code_ptr); + E_code_ptr += 4; + y2 = _mm_lddqu_si128((__m128i*)E_code_ptr); + + CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) + + real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps); + imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps); + + //Get prompt values + y1 = _mm_lddqu_si128((__m128i*)P_code_ptr); + P_code_ptr += 4; + y2 = _mm_lddqu_si128((__m128i*)P_code_ptr); + + CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) + + real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps); + imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps); + + //Get late values + y1 = _mm_lddqu_si128((__m128i*)L_code_ptr); + L_code_ptr += 4; + 
y2 = _mm_lddqu_si128((__m128i*)L_code_ptr); + + CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) + + real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps); + imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps); + + //Get very late values + y1 = _mm_lddqu_si128((__m128i*)VL_code_ptr); + VL_code_ptr += 4; + y2 = _mm_lddqu_si128((__m128i*)VL_code_ptr); + + CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) + + real_VL_code_acc = _mm_add_ps (real_VL_code_acc, real_output_ps); + imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps); + + input_ptr += 4; + carrier_ptr += 4; + VE_code_ptr += 4; + E_code_ptr += 4; + P_code_ptr += 4; + L_code_ptr += 4; + VL_code_ptr += 4; + } + + __VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4]; + + _mm_storeu_ps((float*)real_VE_dotProductVector,real_VE_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc); // Store the results back into the dot product vector + 
_mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)real_VL_dotProductVector,real_VL_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc); // Store the results back into the dot product vector + + for (int i = 0; i<4; ++i) + { + VE_out_real += real_VE_dotProductVector[i]; + VE_out_imag += imag_VE_dotProductVector[i]; + E_out_real += real_E_dotProductVector[i]; + E_out_imag += imag_E_dotProductVector[i]; + P_out_real += real_P_dotProductVector[i]; + P_out_imag += imag_P_dotProductVector[i]; + L_out_real += real_L_dotProductVector[i]; + L_out_imag += imag_L_dotProductVector[i]; + VL_out_real += real_VL_dotProductVector[i]; + VL_out_imag += imag_VL_dotProductVector[i]; + } + *VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag); + *E_out_ptr = lv_cmake(E_out_real, E_out_imag); + *P_out_ptr = lv_cmake(P_out_real, P_out_imag); + *L_out_ptr = lv_cmake(L_out_real, L_out_imag); + *VL_out_ptr = lv_cmake(VL_out_real, VL_out_imag); + } + + lv_16sc_t bb_signal_sample; + for(int i=0; i < num_points%8; ++i) + { + //Perform the carrier wipe-off + bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); + // Now get early, late, and prompt values for each + *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VE_code_ptr++)); + *E_out_ptr 
+= (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); + *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); + *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); + *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VL_code_ptr++)); + } + +} +#endif /* LV_HAVE_SSE4_1 */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Vate correlation + \param input The input signal input + \param carrier The carrier signal input + \param VE_code Very Early PRN code replica input + \param E_code Early PRN code replica input + \param P_code Prompt PRN code replica input + \param L_code Late PRN code replica input + \param VL_code Very Late PRN code replica input + \param VE_out Very Early correlation output + \param E_out Early correlation output + \param P_out Prompt correlation output + \param L_out Late correlation output + \param VL_out Very Late correlation output + \param num_points The number of complex values in vectors + */ +static inline void volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* VE_code, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, const lv_16sc_t* VL_code, unsigned int num_points) +{ + lv_16sc_t bb_signal_sample; + lv_16sc_t tmp1; + lv_16sc_t tmp2; + lv_16sc_t tmp3; + lv_16sc_t tmp4; + lv_16sc_t tmp5; + + bb_signal_sample = lv_cmake(0, 0); + + *VE_out = 0; + *E_out = 0; + *P_out = 0; + *L_out = 0; + *VL_out = 0; + // perform Early, Prompt and Late correlation + + for(int i=0; i < num_points; ++i) + { + //Perform the carrier wipe-off + bb_signal_sample = input[i] * carrier[i]; + + tmp1 = bb_signal_sample * VE_code[i]; + tmp2 = bb_signal_sample * E_code[i]; + tmp3 = bb_signal_sample * P_code[i]; + tmp4 = bb_signal_sample * L_code[i]; + tmp5 = bb_signal_sample * VL_code[i]; + + // 
Now get early, late, and prompt values for each + *VE_out += (lv_32fc_t)tmp1; + *E_out += (lv_32fc_t)tmp2; + *P_out += (lv_32fc_t)tmp3; + *L_out += (lv_32fc_t)tmp4; + *VL_out += (lv_32fc_t)tmp5; + } +} +#endif /* LV_HAVE_GENERIC */ +#endif /* INCLUDED_gnsssdr_volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_u_H */ + + +#ifndef INCLUDED_gnsssdr_volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_a_H +#define INCLUDED_gnsssdr_volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_a_H + +#include +#include +#include +#include +#include + +#ifdef LV_HAVE_SSE4_1 +#include "smmintrin.h" +#include "CommonMacros/CommonMacros_16ic_cw_epl_corr_32fc.h" +#include "CommonMacros/CommonMacros.h" +/*! + \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Vate correlation + \param input The input signal input + \param carrier The carrier signal input + \param VE_code Very Early PRN code replica input + \param E_code Early PRN code replica input + \param P_code Prompt PRN code replica input + \param L_code Late PRN code replica input + \param VL_code Very Late PRN code replica input + \param VE_out Very Early correlation output + \param E_out Early correlation output + \param P_out Prompt correlation output + \param L_out Late correlation output + \param VL_out Very Late correlation output + \param num_points The number of complex values in vectors + */ +static inline void volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_a_sse4_1(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* VE_code, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, const lv_16sc_t* VL_code, unsigned int num_points) +{ + const unsigned int sse_iters = num_points / 8; + + __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample; + __m128i realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output; + + 
__m128 real_VE_code_acc, imag_VE_code_acc, real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc, real_VL_code_acc, imag_VL_code_acc; + __m128i input_i_1, input_i_2, output_i32; + __m128 real_output_ps, imag_output_ps; + + float VE_out_real = 0; + float VE_out_imag = 0; + float E_out_real = 0; + float E_out_imag = 0; + float P_out_real = 0; + float P_out_imag = 0; + float L_out_real = 0; + float L_out_imag = 0; + float VL_out_real = 0; + float VL_out_imag = 0; + + const lv_16sc_t* input_ptr = input; + const lv_16sc_t* carrier_ptr = carrier; + + const lv_16sc_t* VE_code_ptr = VE_code; + lv_32fc_t* VE_out_ptr = VE_out; + const lv_16sc_t* E_code_ptr = E_code; + lv_32fc_t* E_out_ptr = E_out; + const lv_16sc_t* L_code_ptr = L_code; + lv_32fc_t* L_out_ptr = L_out; + const lv_16sc_t* P_code_ptr = P_code; + lv_32fc_t* P_out_ptr = P_out; + const lv_16sc_t* VL_code_ptr = VL_code; + lv_32fc_t* VL_out_ptr = VL_out; + + *VE_out_ptr = 0; + *E_out_ptr = 0; + *P_out_ptr = 0; + *L_out_ptr = 0; + *VL_out_ptr = 0; + + real_VE_code_acc = _mm_setzero_ps(); + imag_VE_code_acc = _mm_setzero_ps(); + real_E_code_acc = _mm_setzero_ps(); + imag_E_code_acc = _mm_setzero_ps(); + real_P_code_acc = _mm_setzero_ps(); + imag_P_code_acc = _mm_setzero_ps(); + real_L_code_acc = _mm_setzero_ps(); + imag_L_code_acc = _mm_setzero_ps(); + real_VL_code_acc = _mm_setzero_ps(); + imag_VL_code_acc = _mm_setzero_ps(); + + if (sse_iters>0) + { + for(int number = 0;number < sse_iters; number++){ + + //Perform the carrier wipe-off + x1 = _mm_load_si128((__m128i*)input_ptr); + input_ptr += 4; + x2 = _mm_load_si128((__m128i*)input_ptr); + + y1 = _mm_load_si128((__m128i*)carrier_ptr); + carrier_ptr += 4; + y2 = _mm_load_si128((__m128i*)carrier_ptr); + + CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(x1, x2, realx, imagx) + CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy) + CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, 
imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample) + + //Get very early values + y1 = _mm_load_si128((__m128i*)VE_code_ptr); + VE_code_ptr += 4; + y2 = _mm_load_si128((__m128i*)VE_code_ptr); + + CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) + + real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps); + imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps); + + //Get early values + y1 = _mm_load_si128((__m128i*)E_code_ptr); + E_code_ptr += 4; + y2 = _mm_load_si128((__m128i*)E_code_ptr); + + CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) + + real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps); + imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps); + + //Get prompt values + y1 = _mm_load_si128((__m128i*)P_code_ptr); + P_code_ptr += 4; + y2 = _mm_load_si128((__m128i*)P_code_ptr); + + CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) + + real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps); + imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps); + + //Get late values + y1 = _mm_load_si128((__m128i*)L_code_ptr); + L_code_ptr += 4; + y2 = _mm_load_si128((__m128i*)L_code_ptr); + + CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, 
imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) + + real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps); + imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps); + + //Get very late values + y1 = _mm_load_si128((__m128i*)VL_code_ptr); + VL_code_ptr += 4; + y2 = _mm_load_si128((__m128i*)VL_code_ptr); + + CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) + + real_VL_code_acc = _mm_add_ps (real_VL_code_acc, real_output_ps); + imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps); + + input_ptr += 4; + carrier_ptr += 4; + VE_code_ptr += 4; + E_code_ptr += 4; + P_code_ptr += 4; + L_code_ptr += 4; + VL_code_ptr += 4; + } + + __VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4]; + + _mm_store_ps((float*)real_VE_dotProductVector,real_VE_code_acc); // Store the results back into the dot product vector + _mm_store_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc); // Store the results back into the dot product vector + _mm_store_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector + 
_mm_store_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector + _mm_store_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector + _mm_store_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector + _mm_store_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector + _mm_store_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector + _mm_store_ps((float*)real_VL_dotProductVector,real_VL_code_acc); // Store the results back into the dot product vector + _mm_store_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc); // Store the results back into the dot product vector + + for (int i = 0; i<4; ++i) + { + VE_out_real += real_VE_dotProductVector[i]; + VE_out_imag += imag_VE_dotProductVector[i]; + E_out_real += real_E_dotProductVector[i]; + E_out_imag += imag_E_dotProductVector[i]; + P_out_real += real_P_dotProductVector[i]; + P_out_imag += imag_P_dotProductVector[i]; + L_out_real += real_L_dotProductVector[i]; + L_out_imag += imag_L_dotProductVector[i]; + VL_out_real += real_VL_dotProductVector[i]; + VL_out_imag += imag_VL_dotProductVector[i]; + } + *VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag); + *E_out_ptr = lv_cmake(E_out_real, E_out_imag); + *P_out_ptr = lv_cmake(P_out_real, P_out_imag); + *L_out_ptr = lv_cmake(L_out_real, L_out_imag); + *VL_out_ptr = lv_cmake(VL_out_real, VL_out_imag); + } + + lv_16sc_t bb_signal_sample; + for(int i=0; i < num_points%8; ++i) + { + //Perform the carrier wipe-off + bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); + // Now get early, late, and prompt values for each + *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VE_code_ptr++)); + *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); + *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); + 
*L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); + *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VL_code_ptr++)); + } + +} +#endif /* LV_HAVE_SSE4_1 */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Vate correlation + \param input The input signal input + \param carrier The carrier signal input + \param VE_code Very Early PRN code replica input + \param E_code Early PRN code replica input + \param P_code Prompt PRN code replica input + \param L_code Late PRN code replica input + \param VL_code Very Late PRN code replica input + \param VE_out Very Early correlation output + \param E_out Early correlation output + \param P_out Prompt correlation output + \param L_out Late correlation output + \param VL_out Very Late correlation output + \param num_points The number of complex values in vectors + */ +static inline void volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_a_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* VE_code, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, const lv_16sc_t* VL_code, unsigned int num_points) +{ + lv_16sc_t bb_signal_sample; + lv_16sc_t tmp1; + lv_16sc_t tmp2; + lv_16sc_t tmp3; + lv_16sc_t tmp4; + lv_16sc_t tmp5; + + bb_signal_sample = lv_cmake(0, 0); + + *VE_out = 0; + *E_out = 0; + *P_out = 0; + *L_out = 0; + *VL_out = 0; + // perform Early, Prompt and Late correlation + + for(int i=0; i < num_points; ++i) + { + //Perform the carrier wipe-off + bb_signal_sample = input[i] * carrier[i]; + + tmp1 = bb_signal_sample * VE_code[i]; + tmp2 = bb_signal_sample * E_code[i]; + tmp3 = bb_signal_sample * P_code[i]; + tmp4 = bb_signal_sample * L_code[i]; + tmp5 = bb_signal_sample * VL_code[i]; + + // Now get early, late, and prompt values for each + *VE_out += (lv_32fc_t)tmp1; + *E_out += (lv_32fc_t)tmp2; + *P_out += 
(lv_32fc_t)tmp3; + *L_out += (lv_32fc_t)tmp4; + *VL_out += (lv_32fc_t)tmp5; + } +} +#endif /* LV_HAVE_GENERIC */ +#endif /* INCLUDED_gnsssdr_volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_a_H */ diff --git a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_accumulator_s32f.h b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_accumulator_s32f.h new file mode 100644 index 000000000..82f1b3efd --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_accumulator_s32f.h @@ -0,0 +1,68 @@ +#ifndef INCLUDED_volk_gnsssdr_32f_accumulator_s32f_a_H +#define INCLUDED_volk_gnsssdr_32f_accumulator_s32f_a_H + +#include +#include +#include + +#ifdef LV_HAVE_SSE +#include +/*! + \brief Accumulates the values in the input buffer + \param result The accumulated result + \param inputBuffer The buffer of data to be accumulated + \param num_points The number of values in inputBuffer to be accumulated +*/ +static inline void volk_gnsssdr_32f_accumulator_s32f_a_sse(float* result, const float* inputBuffer, unsigned int num_points){ + float returnValue = 0; + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* aPtr = inputBuffer; + __VOLK_ATTR_ALIGNED(16) float tempBuffer[4]; + + __m128 accumulator = _mm_setzero_ps(); + __m128 aVal = _mm_setzero_ps(); + + for(;number < quarterPoints; number++){ + aVal = _mm_load_ps(aPtr); + accumulator = _mm_add_ps(accumulator, aVal); + aPtr += 4; + } + _mm_store_ps(tempBuffer,accumulator); // Store the results back into the C container + returnValue = tempBuffer[0]; + returnValue += tempBuffer[1]; + returnValue += tempBuffer[2]; + returnValue += tempBuffer[3]; + + number = quarterPoints * 4; + for(;number < num_points; number++){ + returnValue += (*aPtr++); + } + *result = returnValue; +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC +/*! 
+ \brief Accumulates the values in the input buffer + \param result The accumulated result + \param inputBuffer The buffer of data to be accumulated + \param num_points The number of values in inputBuffer to be accumulated +*/ +static inline void volk_gnsssdr_32f_accumulator_s32f_generic(float* result, const float* inputBuffer, unsigned int num_points){ + const float* aPtr = inputBuffer; + unsigned int number = 0; + float returnValue = 0; + + for(;number < num_points; number++){ + returnValue += (*aPtr++); + } + *result = returnValue; +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_gnsssdr_32f_accumulator_s32f_a_H */ diff --git a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_index_max_16u.h b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_index_max_16u.h new file mode 100644 index 000000000..c815609b2 --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_index_max_16u.h @@ -0,0 +1,149 @@ +#ifndef INCLUDED_volk_gnsssdr_32f_index_max_16u_a_H +#define INCLUDED_volk_gnsssdr_32f_index_max_16u_a_H + +#include +#include +#include +#include + +#ifdef LV_HAVE_SSE4_1 +#include + +static inline void volk_gnsssdr_32f_index_max_16u_a_sse4_1(unsigned int* target, const float* src0, unsigned int num_points) { + if(num_points > 0){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* inputPtr = (float*)src0; + + __m128 indexIncrementValues = _mm_set1_ps(4); + __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4); + + float max = src0[0]; + float index = 0; + __m128 maxValues = _mm_set1_ps(max); + __m128 maxValuesIndex = _mm_setzero_ps(); + __m128 compareResults; + __m128 currentValues; + + __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; + __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; + + for(;number < quarterPoints; number++){ + + currentValues = _mm_load_ps(inputPtr); inputPtr += 4; + currentIndexes = _mm_add_ps(currentIndexes, 
indexIncrementValues); + + compareResults = _mm_cmpgt_ps(maxValues, currentValues); + + maxValuesIndex = _mm_blendv_ps(currentIndexes, maxValuesIndex, compareResults); + maxValues = _mm_blendv_ps(currentValues, maxValues, compareResults); + } + + // Calculate the largest value from the remaining 4 points + _mm_store_ps(maxValuesBuffer, maxValues); + _mm_store_ps(maxIndexesBuffer, maxValuesIndex); + + for(number = 0; number < 4; number++){ + if(maxValuesBuffer[number] > max){ + index = maxIndexesBuffer[number]; + max = maxValuesBuffer[number]; + } + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + if(src0[number] > max){ + index = number; + max = src0[number]; + } + } + target[0] = (unsigned int)index; + } +} + +#endif /*LV_HAVE_SSE4_1*/ + +#ifdef LV_HAVE_SSE +#include + +static inline void volk_gnsssdr_32f_index_max_16u_a_sse(unsigned int* target, const float* src0, unsigned int num_points) { + if(num_points > 0){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* inputPtr = (float*)src0; + + __m128 indexIncrementValues = _mm_set1_ps(4); + __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4); + + float max = src0[0]; + float index = 0; + __m128 maxValues = _mm_set1_ps(max); + __m128 maxValuesIndex = _mm_setzero_ps(); + __m128 compareResults; + __m128 currentValues; + + __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; + __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; + + for(;number < quarterPoints; number++){ + + currentValues = _mm_load_ps(inputPtr); inputPtr += 4; + currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); + + compareResults = _mm_cmpgt_ps(maxValues, currentValues); + + maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, maxValuesIndex) , _mm_andnot_ps(compareResults, currentIndexes)); + + maxValues = _mm_or_ps(_mm_and_ps(compareResults, maxValues) , _mm_andnot_ps(compareResults, currentValues)); + } + + // Calculate the largest value from the remaining 4 points + 
_mm_store_ps(maxValuesBuffer, maxValues); + _mm_store_ps(maxIndexesBuffer, maxValuesIndex); + + for(number = 0; number < 4; number++){ + if(maxValuesBuffer[number] > max){ + index = maxIndexesBuffer[number]; + max = maxValuesBuffer[number]; + } + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + if(src0[number] > max){ + index = number; + max = src0[number]; + } + } + target[0] = (unsigned int)index; + } +} + +#endif /*LV_HAVE_SSE*/ + +#ifdef LV_HAVE_GENERIC +static inline void volk_gnsssdr_32f_index_max_16u_generic(unsigned int* target, const float* src0, unsigned int num_points) { + if(num_points > 0){ + float max = src0[0]; + unsigned int index = 0; + + unsigned int i = 1; + + for(; i < num_points; ++i) { + + if(src0[i] > max){ + index = i; + max = src0[i]; + } + + } + target[0] = index; + } +} + +#endif /*LV_HAVE_GENERIC*/ + + +#endif /*INCLUDED_volk_gnsssdr_32f_index_max_16u_a_H*/ diff --git a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_s32f_convert_16i.h b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_s32f_convert_16i.h new file mode 100644 index 000000000..cd83cef9d --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_s32f_convert_16i.h @@ -0,0 +1,302 @@ +#ifndef INCLUDED_volk_gnsssdr_32f_s32f_convert_16i_u_H +#define INCLUDED_volk_gnsssdr_32f_s32f_convert_16i_u_H + +#include +#include +#include + +#ifdef LV_HAVE_SSE2 +#include + /*! 
+ \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 16 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + \note Input buffer does NOT need to be properly aligned + */ +static inline void volk_gnsssdr_32f_s32f_convert_16i_u_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int eighthPoints = num_points / 8; + + const float* inputVectorPtr = (const float*)inputVector; + int16_t* outputVectorPtr = outputVector; + + float min_val = -32768; + float max_val = 32767; + float r; + + __m128 vScalar = _mm_set_ps1(scalar); + __m128 inputVal1, inputVal2; + __m128i intInputVal1, intInputVal2; + __m128 ret1, ret2; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); + + for(;number < eighthPoints; number++){ + inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; + inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; + + // Scale and clip + ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); + ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val); + + intInputVal1 = _mm_cvtps_epi32(ret1); + intInputVal2 = _mm_cvtps_epi32(ret2); + + intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); + + _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + r = inputVector[number] * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + outputVector[number] = (int16_t)rintf(r); + } +} +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_SSE +#include + /*! 
+ \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 16 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + \note Input buffer does NOT need to be properly aligned + */ +static inline void volk_gnsssdr_32f_s32f_convert_16i_u_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + int16_t* outputVectorPtr = outputVector; + + float min_val = -32768; + float max_val = 32767; + float r; + + __m128 vScalar = _mm_set_ps1(scalar); + __m128 ret; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); + + __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; + + for(;number < quarterPoints; number++){ + ret = _mm_loadu_ps(inputVectorPtr); + inputVectorPtr += 4; + + // Scale and clip + ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); + + _mm_store_ps(outputFloatBuffer, ret); + *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]); + *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]); + *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]); + *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]); + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + r = inputVector[number] * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + outputVector[number] = (int16_t)rintf(r); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! 
+ \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 16 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + \note Input buffer does NOT need to be properly aligned + */ +static inline void volk_gnsssdr_32f_s32f_convert_16i_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + int16_t* outputVectorPtr = outputVector; + const float* inputVectorPtr = inputVector; + unsigned int number = 0; + float min_val = -32768; + float max_val = 32767; + float r; + + for(number = 0; number < num_points; number++){ + r = *inputVectorPtr++ * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + *outputVectorPtr++ = (int16_t)rintf(r); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_gnsssdr_32f_s32f_convert_16i_u_H */ +#ifndef INCLUDED_volk_gnsssdr_32f_s32f_convert_16i_a_H +#define INCLUDED_volk_gnsssdr_32f_s32f_convert_16i_a_H + +#include +#include +#include +#include + +#ifdef LV_HAVE_SSE2 +#include + /*! 
+ \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 16 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_gnsssdr_32f_s32f_convert_16i_a_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int eighthPoints = num_points / 8; + + const float* inputVectorPtr = (const float*)inputVector; + int16_t* outputVectorPtr = outputVector; + + float min_val = -32768; + float max_val = 32767; + float r; + + __m128 vScalar = _mm_set_ps1(scalar); + __m128 inputVal1, inputVal2; + __m128i intInputVal1, intInputVal2; + __m128 ret1, ret2; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); + + for(;number < eighthPoints; number++){ + inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; + inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; + + // Scale and clip + ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); + ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val); + + intInputVal1 = _mm_cvtps_epi32(ret1); + intInputVal2 = _mm_cvtps_epi32(ret2); + + intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); + + _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 8; + } + + number = eighthPoints * 8; + for(; number < num_points; number++){ + r = inputVector[number] * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + outputVector[number] = (int16_t)rintf(r); + } +} +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_SSE +#include + /*! 
+ \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 16 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_gnsssdr_32f_s32f_convert_16i_a_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + unsigned int number = 0; + + const unsigned int quarterPoints = num_points / 4; + + const float* inputVectorPtr = (const float*)inputVector; + int16_t* outputVectorPtr = outputVector; + + float min_val = -32768; + float max_val = 32767; + float r; + + __m128 vScalar = _mm_set_ps1(scalar); + __m128 ret; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); + + __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; + + for(;number < quarterPoints; number++){ + ret = _mm_load_ps(inputVectorPtr); + inputVectorPtr += 4; + + // Scale and clip + ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); + + _mm_store_ps(outputFloatBuffer, ret); + *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]); + *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]); + *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]); + *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]); + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + r = inputVector[number] * scalar; + if(r > max_val) + r = max_val; + else if(r < min_val) + r = min_val; + outputVector[number] = (int16_t)rintf(r); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! 
+ \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value + \param inputVector The floating point input data buffer + \param outputVector The 16 bit output data buffer + \param scalar The value multiplied against each point in the input buffer + \param num_points The number of data values to be converted + */ +static inline void volk_gnsssdr_32f_s32f_convert_16i_a_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ + int16_t* outputVectorPtr = outputVector; + const float* inputVectorPtr = inputVector; + unsigned int number = 0; + float min_val = -32768; + float max_val = 32767; + float r; + + for(number = 0; number < num_points; number++){ + r = *inputVectorPtr++ * scalar; + if(r < min_val) + r = min_val; + else if(r > max_val) + r = max_val; + *outputVectorPtr++ = (int16_t)rintf(r); + } +} +#endif /* LV_HAVE_GENERIC */ + + + + +#endif /* INCLUDED_volk_gnsssdr_32f_s32f_convert_16i_a_H */ diff --git a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_x2_add_32f.h b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_x2_add_32f.h new file mode 100644 index 000000000..ee647b2d7 --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_x2_add_32f.h @@ -0,0 +1,147 @@ +#ifndef INCLUDED_volk_gnsssdr_32f_x2_add_32f_u_H +#define INCLUDED_volk_gnsssdr_32f_x2_add_32f_u_H + +#include +#include + +#ifdef LV_HAVE_SSE +#include +/*! 
+ \brief Adds the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be added + \param bVector One of the vectors to be added + \param num_points The number of values in aVector and bVector to be added together and stored into cVector +*/ +static inline void volk_gnsssdr_32f_x2_add_32f_u_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m128 aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_loadu_ps(aPtr); + bVal = _mm_loadu_ps(bPtr); + + cVal = _mm_add_ps(aVal, bVal); + + _mm_storeu_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) + (*bPtr++); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC +/*! 
+ \brief Adds the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be added + \param bVector One of the vectors to be added + \param num_points The number of values in aVector and bVector to be added together and stored into cVector +*/ +static inline void volk_gnsssdr_32f_x2_add_32f_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) + (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + +#endif /* INCLUDED_volk_gnsssdr_32f_x2_add_32f_u_H */ +#ifndef INCLUDED_volk_gnsssdr_32f_x2_add_32f_a_H +#define INCLUDED_volk_gnsssdr_32f_x2_add_32f_a_H + +#include +#include + +#ifdef LV_HAVE_SSE +#include +/*! + \brief Adds the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be added + \param bVector One of the vectors to be added + \param num_points The number of values in aVector and bVector to be added together and stored into cVector +*/ +static inline void volk_gnsssdr_32f_x2_add_32f_a_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + + __m128 aVal, bVal, cVal; + for(;number < quarterPoints; number++){ + + aVal = _mm_load_ps(aPtr); + bVal = _mm_load_ps(bPtr); + + cVal = _mm_add_ps(aVal, bVal); + + _mm_store_ps(cPtr,cVal); // Store the results back into the C container + + aPtr += 4; + bPtr += 4; + cPtr += 4; + } + + number = quarterPoints * 4; + for(;number < num_points; number++){ + *cPtr++ = (*aPtr++) + (*bPtr++); + } 
+} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Adds the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be added + \param bVector One of the vectors to be added + \param num_points The number of values in aVector and bVector to be added together and stored into cVector +*/ +static inline void volk_gnsssdr_32f_x2_add_32f_a_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + float* cPtr = cVector; + const float* aPtr = aVector; + const float* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) + (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + +#ifdef LV_HAVE_ORC +/*! + \brief Adds the two input vectors and store their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be added + \param bVector One of the vectors to be added + \param num_points The number of values in aVector and bVector to be added together and stored into cVector +*/ +extern void volk_gnsssdr_32f_x2_add_32f_a_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); +static inline void volk_gnsssdr_32f_x2_add_32f_u_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ + volk_gnsssdr_32f_x2_add_32f_a_orc_impl(cVector, aVector, bVector, num_points); +} +#endif /* LV_HAVE_ORC */ + + +#endif /* INCLUDED_volk_gnsssdr_32f_x2_add_32f_a_H */ diff --git a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_conjugate_32fc.h b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_conjugate_32fc.h new file mode 100644 index 000000000..a3b8848aa --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_conjugate_32fc.h @@ -0,0 +1,127 @@ +#ifndef 
INCLUDED_volk_gnsssdr_32fc_conjugate_32fc_u_H +#define INCLUDED_volk_gnsssdr_32fc_conjugate_32fc_u_H + +#include +#include +#include +#include + +#ifdef LV_HAVE_SSE3 +#include + /*! + \brief Takes the conjugate of a complex vector. + \param cVector The vector where the results will be stored + \param aVector Vector to be conjugated + \param num_points The number of complex values in aVector to be conjugated and stored into cVector + */ +static inline void volk_gnsssdr_32fc_conjugate_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + + __m128 x; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; + + __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f); + + for(;number < halfPoints; number++){ + + x = _mm_loadu_ps((float*)a); // Load the complex data as ar,ai,br,bi + + x = _mm_xor_ps(x, conjugator); // conjugate register + + _mm_storeu_ps((float*)c,x); // Store the results back into the C container + + a += 2; + c += 2; + } + + if((num_points % 2) != 0) { + *c = lv_conj(*a); + } +} +#endif /* LV_HAVE_SSE3 */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Takes the conjugate of a complex vector. 
+ \param cVector The vector where the results will be stored + \param aVector Vector to be conjugated + \param num_points The number of complex values in aVector to be conjugated and stored into cVector + */ +static inline void volk_gnsssdr_32fc_conjugate_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){ + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = lv_conj(*aPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + + +#endif /* INCLUDED_volk_gnsssdr_32fc_conjugate_32fc_u_H */ +#ifndef INCLUDED_volk_gnsssdr_32fc_conjugate_32fc_a_H +#define INCLUDED_volk_gnsssdr_32fc_conjugate_32fc_a_H + +#include +#include +#include +#include + +#ifdef LV_HAVE_SSE3 +#include + /*! + \brief Takes the conjugate of a complex vector. + \param cVector The vector where the results will be stored + \param aVector Vector to be conjugated + \param num_points The number of complex values in aVector to be conjugated and stored into cVector + */ +static inline void volk_gnsssdr_32fc_conjugate_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + + __m128 x; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; + + __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f); + + for(;number < halfPoints; number++){ + + x = _mm_load_ps((float*)a); // Load the complex data as ar,ai,br,bi + + x = _mm_xor_ps(x, conjugator); // conjugate register + + _mm_store_ps((float*)c,x); // Store the results back into the C container + + a += 2; + c += 2; + } + + if((num_points % 2) != 0) { + *c = lv_conj(*a); + } +} +#endif /* LV_HAVE_SSE3 */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Takes the conjugate of a complex vector. 
+ \param cVector The vector where the results will be stored + \param aVector Vector to be conjugated + \param num_points The number of complex values in aVector to be conjugated and stored into cVector + */ +static inline void volk_gnsssdr_32fc_conjugate_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){ + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = lv_conj(*aPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + +#endif /* INCLUDED_volk_gnsssdr_32fc_conjugate_32fc_a_H */ diff --git a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_16ic.h b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_16ic.h new file mode 100644 index 000000000..ade8b6763 --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_16ic.h @@ -0,0 +1,295 @@ +/*! + * \file volk_gnsssdr_32fc_convert_16ic.h + * \brief Volk protokernel: converts float32 complex values to 16 integer complex values taking care of overflow + * \authors
    + *
  • Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com + *
+ *
+ * -------------------------------------------------------------------------
+ *
+ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+ *
+ * GNSS-SDR is a software defined Global Navigation
+ * Satellite Systems receiver
+ *
+ * This file is part of GNSS-SDR.
+ *
+ * GNSS-SDR is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * GNSS-SDR is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * -------------------------------------------------------------------------
+ */
+
+#ifndef INCLUDED_volk_gnsssdr_32fc_convert_16ic_u_H
+#define INCLUDED_volk_gnsssdr_32fc_convert_16ic_u_H
+
+#include
+#include
+#include
+
+#ifdef LV_HAVE_SSE2
+#include
+/*!
+ \brief Converts a float vector of 64 bits (32 bits each part) into a 32 integer vector (16 bits each part) + \param inputVector The floating point input data buffer + \param outputVector The 16 bit output data buffer + \param num_points The number of data values to be converted + */ +static inline void volk_gnsssdr_32fc_convert_16ic_u_sse2(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){ + const unsigned int sse_iters = num_points/4; + + float* inputVectorPtr = (float*)inputVector; + int16_t* outputVectorPtr = (int16_t*)outputVector; + + float min_val = -32768; + float max_val = 32767; + + __m128 inputVal1, inputVal2; + __m128i intInputVal1, intInputVal2; + __m128 ret1, ret2; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); + + for(unsigned int i = 0;i < sse_iters; i++){ + inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; + inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; + + // Clip + ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val); + ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val); + + intInputVal1 = _mm_cvtps_epi32(ret1); + intInputVal2 = _mm_cvtps_epi32(ret2); + + intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); + + _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 8; + } + + for(unsigned int i = 0; i < (num_points%4)*2; i++){ + if(inputVectorPtr[i] > max_val) + inputVectorPtr[i] = max_val; + else if(inputVectorPtr[i] < min_val) + inputVectorPtr[i] = min_val; + outputVectorPtr[i] = (int16_t)rintf(inputVectorPtr[i]); + } +} +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_SSE +#include +/*! 
+ \brief Converts a float vector of 64 bits (32 bits each part) into a 32 integer vector (16 bits each part) + \param inputVector The floating point input data buffer + \param outputVector The 16 bit output data buffer + \param num_points The number of data values to be converted + */ +static inline void volk_gnsssdr_32fc_convert_16ic_u_sse(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){ + const unsigned int sse_iters = num_points/4; + + float* inputVectorPtr = (float*)inputVector; + int16_t* outputVectorPtr = (int16_t*)outputVector; + + float min_val = -32768; + float max_val = 32767; + + __m128 inputVal1, inputVal2; + __m128i intInputVal1, intInputVal2; + __m128 ret1, ret2; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); + + for(unsigned int i = 0;i < sse_iters; i++){ + inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; + inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; + + // Clip + ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val); + ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val); + + intInputVal1 = _mm_cvtps_epi32(ret1); + intInputVal2 = _mm_cvtps_epi32(ret2); + + intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); + + _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 8; + } + + for(unsigned int i = 0; i < (num_points%4)*2; i++){ + if(inputVectorPtr[i] > max_val) + inputVectorPtr[i] = max_val; + else if(inputVectorPtr[i] < min_val) + inputVectorPtr[i] = min_val; + outputVectorPtr[i] = (int16_t)rintf(inputVectorPtr[i]); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC +/*! 
+ \brief Converts a float vector of 64 bits (32 bits each part) into a 32 integer vector (16 bits each part) + \param inputVector The floating point input data buffer + \param outputVector The 16 bit output data buffer + \param num_points The number of data values to be converted + */ +static inline void volk_gnsssdr_32fc_convert_16ic_generic(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){ + float* inputVectorPtr = (float*)inputVector; + int16_t* outputVectorPtr = (int16_t*)outputVector; + float min_val = -32768; + float max_val = 32767; + + for(unsigned int i = 0; i < num_points*2; i++){ + if(inputVectorPtr[i] > max_val) + inputVectorPtr[i] = max_val; + else if(inputVectorPtr[i] < min_val) + inputVectorPtr[i] = min_val; + outputVectorPtr[i] = (int16_t)rintf(inputVectorPtr[i]); + } +} +#endif /* LV_HAVE_GENERIC */ +#endif /* INCLUDED_volk_gnsssdr_32fc_convert_16ic_u_H */ + + +#ifndef INCLUDED_volk_gnsssdr_32fc_convert_16ic_a_H +#define INCLUDED_volk_gnsssdr_32fc_convert_16ic_a_H + +#include +#include +#include +#include + +#ifdef LV_HAVE_SSE2 +#include +/*! 
+ \brief Converts a float vector of 64 bits (32 bits each part) into a 32 integer vector (16 bits each part) + \param inputVector The floating point input data buffer + \param outputVector The 16 bit output data buffer + \param num_points The number of data values to be converted + */ +static inline void volk_gnsssdr_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){ + const unsigned int sse_iters = num_points/4; + + float* inputVectorPtr = (float*)inputVector; + int16_t* outputVectorPtr = (int16_t*)outputVector; + + float min_val = -32768; + float max_val = 32767; + + __m128 inputVal1, inputVal2; + __m128i intInputVal1, intInputVal2; + __m128 ret1, ret2; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); + + for(unsigned int i = 0;i < sse_iters; i++){ + inputVal1 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; + inputVal2 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; + + // Clip + ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val); + ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val); + + intInputVal1 = _mm_cvtps_epi32(ret1); + intInputVal2 = _mm_cvtps_epi32(ret2); + + intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); + + _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 8; + } + + for(unsigned int i = 0; i < (num_points%4)*2; i++){ + if(inputVectorPtr[i] > max_val) + inputVectorPtr[i] = max_val; + else if(inputVectorPtr[i] < min_val) + inputVectorPtr[i] = min_val; + outputVectorPtr[i] = (int16_t)rintf(inputVectorPtr[i]); + } +} +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_SSE +#include +/*! 
+ \brief Converts a float vector of 64 bits (32 bits each part) into a 32 integer vector (16 bits each part) + \param inputVector The floating point input data buffer + \param outputVector The 16 bit output data buffer + \param num_points The number of data values to be converted + */ +static inline void volk_gnsssdr_32fc_convert_16ic_a_sse(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){ + const unsigned int sse_iters = num_points/4; + + float* inputVectorPtr = (float*)inputVector; + int16_t* outputVectorPtr = (int16_t*)outputVector; + + float min_val = -32768; + float max_val = 32767; + + __m128 inputVal1, inputVal2; + __m128i intInputVal1, intInputVal2; + __m128 ret1, ret2; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); + + for(unsigned int i = 0;i < sse_iters; i++){ + inputVal1 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; + inputVal2 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; + + // Clip + ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val); + ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val); + + intInputVal1 = _mm_cvtps_epi32(ret1); + intInputVal2 = _mm_cvtps_epi32(ret2); + + intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); + + _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); + outputVectorPtr += 8; + } + + for(unsigned int i = 0; i < (num_points%4)*2; i++){ + if(inputVectorPtr[i] > max_val) + inputVectorPtr[i] = max_val; + else if(inputVectorPtr[i] < min_val) + inputVectorPtr[i] = min_val; + outputVectorPtr[i] = (int16_t)rintf(inputVectorPtr[i]); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC +/*! 
+ \brief Converts a float vector of 64 bits (32 bits each part) into a 32 integer vector (16 bits each part) + \param inputVector The floating point input data buffer + \param outputVector The 16 bit output data buffer + \param num_points The number of data values to be converted + */ +static inline void volk_gnsssdr_32fc_convert_16ic_a_generic(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){ + float* inputVectorPtr = (float*)inputVector; + int16_t* outputVectorPtr = (int16_t*)outputVector; + float min_val = -32768; + float max_val = 32767; + + for(unsigned int i = 0; i < num_points*2; i++){ + if(inputVectorPtr[i] > max_val) + inputVectorPtr[i] = max_val; + else if(inputVectorPtr[i] < min_val) + inputVectorPtr[i] = min_val; + outputVectorPtr[i] = (int16_t)rintf(inputVectorPtr[i]); + } +} +#endif /* LV_HAVE_GENERIC */ +#endif /* INCLUDED_volk_gnsssdr_32fc_convert_16ic_a_H */ diff --git a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_8ic.h b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_8ic.h new file mode 100755 index 000000000..5a97b4827 --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_8ic.h @@ -0,0 +1,213 @@ +/*! + * \file volk_gnsssdr_32fc_convert_8ic.h + * \brief Volk protokernel: converts float32 complex values to 8 integer complex values taking care of overflow + * \authors
    + *
  • Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com + *
+ *
+ * -------------------------------------------------------------------------
+ *
+ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+ *
+ * GNSS-SDR is a software defined Global Navigation
+ * Satellite Systems receiver
+ *
+ * This file is part of GNSS-SDR.
+ *
+ * GNSS-SDR is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * GNSS-SDR is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * -------------------------------------------------------------------------
+ */
+
+#ifndef INCLUDED_volk_gnsssdr_32fc_convert_8ic_u_H
+#define INCLUDED_volk_gnsssdr_32fc_convert_8ic_u_H
+
+#include
+#include
+#include
+
+#ifdef LV_HAVE_SSE2
+#include
+/*!
+ \brief Converts a float vector of 64 bits (32 bits each part) into a 16 integer vector (8 bits each part) + \param inputVector The floating point input data buffer + \param outputVector The 16 bit output data buffer + \param num_points The number of data values to be converted + */ +static inline void volk_gnsssdr_32fc_convert_8ic_u_sse2(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){ + const unsigned int sse_iters = num_points/8; + + float* inputVectorPtr = (float*)inputVector; + int8_t* outputVectorPtr = (int8_t*)outputVector; + + float min_val = -128; + float max_val = 127; + + __m128 inputVal1, inputVal2, inputVal3, inputVal4; + __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4; + __m128i int8InputVal; + __m128 ret1, ret2, ret3, ret4; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); + + for(unsigned int i = 0;i < sse_iters; i++){ + inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; + inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; + inputVal3 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; + inputVal4 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; + + // Clip + ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val); + ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val); + ret3 = _mm_max_ps(_mm_min_ps(inputVal3, vmax_val), vmin_val); + ret4 = _mm_max_ps(_mm_min_ps(inputVal4, vmax_val), vmin_val); + + intInputVal1 = _mm_cvtps_epi32(ret1); + intInputVal2 = _mm_cvtps_epi32(ret2); + intInputVal3 = _mm_cvtps_epi32(ret3); + intInputVal4 = _mm_cvtps_epi32(ret4); + + intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); + intInputVal2 = _mm_packs_epi32(intInputVal3, intInputVal4); + int8InputVal = _mm_packs_epi16(intInputVal1, intInputVal2); + + _mm_storeu_si128((__m128i*)outputVectorPtr, int8InputVal); + outputVectorPtr += 16; + } + + for(unsigned int i = 0; i < (num_points%4)*4; i++){ + 
if(inputVectorPtr[i] > max_val) + inputVectorPtr[i] = max_val; + else if(inputVectorPtr[i] < min_val) + inputVectorPtr[i] = min_val; + outputVectorPtr[i] = (int8_t)rintf(inputVectorPtr[i]); + } +} +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Converts a float vector of 64 bits (32 bits each part) into a 16 integer vector (8 bits each part) + \param inputVector The floating point input data buffer + \param outputVector The 16 bit output data buffer + \param num_points The number of data values to be converted + */ +static inline void volk_gnsssdr_32fc_convert_8ic_generic(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){ + float* inputVectorPtr = (float*)inputVector; + int8_t* outputVectorPtr = (int8_t*)outputVector; + float min_val = -128; + float max_val = 127; + + for(unsigned int i = 0; i < num_points*2; i++){ + if(inputVectorPtr[i] > max_val) + inputVectorPtr[i] = max_val; + else if(inputVectorPtr[i] < min_val) + inputVectorPtr[i] = min_val; + outputVectorPtr[i] = (int8_t)rintf(inputVectorPtr[i]); + } +} +#endif /* LV_HAVE_GENERIC */ +#endif /* INCLUDED_volk_gnsssdr_32fc_convert_8ic_u_H */ + + +#ifndef INCLUDED_volk_gnsssdr_32fc_convert_8ic_a_H +#define INCLUDED_volk_gnsssdr_32fc_convert_8ic_a_H + +#include +#include +#include +#include + +#ifdef LV_HAVE_SSE2 +#include +/*! 
+ \brief Converts a float vector of 64 bits (32 bits each part) into a 16 integer vector (8 bits each part) + \param inputVector The floating point input data buffer + \param outputVector The 16 bit output data buffer + \param num_points The number of data values to be converted + */ +static inline void volk_gnsssdr_32fc_convert_8ic_a_sse2(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){ + const unsigned int sse_iters = num_points/8; + + float* inputVectorPtr = (float*)inputVector; + int8_t* outputVectorPtr = (int8_t*)outputVector; + + float min_val = -128; + float max_val = 127; + + __m128 inputVal1, inputVal2, inputVal3, inputVal4; + __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4; + __m128i int8InputVal; + __m128 ret1, ret2, ret3, ret4; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); + + for(unsigned int i = 0;i < sse_iters; i++){ + inputVal1 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; + inputVal2 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; + inputVal3 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; + inputVal4 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; + + // Clip + ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val); + ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val); + ret3 = _mm_max_ps(_mm_min_ps(inputVal3, vmax_val), vmin_val); + ret4 = _mm_max_ps(_mm_min_ps(inputVal4, vmax_val), vmin_val); + + intInputVal1 = _mm_cvtps_epi32(ret1); + intInputVal2 = _mm_cvtps_epi32(ret2); + intInputVal3 = _mm_cvtps_epi32(ret3); + intInputVal4 = _mm_cvtps_epi32(ret4); + + intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); + intInputVal2 = _mm_packs_epi32(intInputVal3, intInputVal4); + int8InputVal = _mm_packs_epi16(intInputVal1, intInputVal2); + + _mm_store_si128((__m128i*)outputVectorPtr, int8InputVal); + outputVectorPtr += 16; + } + + for(unsigned int i = 0; i < (num_points%4)*4; i++){ + 
if(inputVectorPtr[i] > max_val) + inputVectorPtr[i] = max_val; + else if(inputVectorPtr[i] < min_val) + inputVectorPtr[i] = min_val; + outputVectorPtr[i] = (int8_t)rintf(inputVectorPtr[i]); + } +} +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Converts a float vector of 64 bits (32 bits each part) into a 16 integer vector (8 bits each part) + \param inputVector The floating point input data buffer + \param outputVector The 16 bit output data buffer + \param num_points The number of data values to be converted + */ +static inline void volk_gnsssdr_32fc_convert_8ic_a_generic(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){ + float* inputVectorPtr = (float*)inputVector; + int8_t* outputVectorPtr = (int8_t*)outputVector; + float min_val = -128; + float max_val = 127; + + for(unsigned int i = 0; i < num_points*2; i++){ + if(inputVectorPtr[i] > max_val) + inputVectorPtr[i] = max_val; + else if(inputVectorPtr[i] < min_val) + inputVectorPtr[i] = min_val; + outputVectorPtr[i] = (int8_t)rintf(inputVectorPtr[i]); + } +} +#endif /* LV_HAVE_GENERIC */ +#endif /* INCLUDED_volk_gnsssdr_32fc_convert_8ic_a_H */ diff --git a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_magnitude_squared_32f.h b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_magnitude_squared_32f.h new file mode 100644 index 000000000..ce28f866e --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_magnitude_squared_32f.h @@ -0,0 +1,228 @@ +#ifndef INCLUDED_volk_gnsssdr_32fc_magnitude_squared_32f_u_H +#define INCLUDED_volk_gnsssdr_32fc_magnitude_squared_32f_u_H + +#include +#include +#include + +#ifdef LV_HAVE_SSE3 +#include + /*! 
+ \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +static inline void volk_gnsssdr_32fc_magnitude_squared_32f_u_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + __m128 cplxValue1, cplxValue2, result; + for(;number < quarterPoints; number++){ + cplxValue1 = _mm_loadu_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_loadu_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values + cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values + + result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values + + _mm_storeu_ps(magnitudeVectorPtr, result); + magnitudeVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); + } +} +#endif /* LV_HAVE_SSE3 */ + +#ifdef LV_HAVE_SSE +#include + /*! 
+ \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +static inline void volk_gnsssdr_32fc_magnitude_squared_32f_u_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + __m128 cplxValue1, cplxValue2, iValue, qValue, result; + for(;number < quarterPoints; number++){ + cplxValue1 = _mm_loadu_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_loadu_ps(complexVectorPtr); + complexVectorPtr += 4; + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + // Arrange in q1q2q3q4 format + qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); + + iValue = _mm_mul_ps(iValue, iValue); // Square the I values + qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values + + result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values + + _mm_storeu_ps(magnitudeVectorPtr, result); + magnitudeVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! 
+ \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +static inline void volk_gnsssdr_32fc_magnitude_squared_32f_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + unsigned int number = 0; + for(number = 0; number < num_points; number++){ + const float real = *complexVectorPtr++; + const float imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (real*real) + (imag*imag); + } +} +#endif /* LV_HAVE_GENERIC */ + +#endif /* INCLUDED_volk_gnsssdr_32fc_magnitude_32f_u_H */ +#ifndef INCLUDED_volk_gnsssdr_32fc_magnitude_squared_32f_a_H +#define INCLUDED_volk_gnsssdr_32fc_magnitude_squared_32f_a_H + +#include +#include +#include + +#ifdef LV_HAVE_SSE3 +#include + /*! 
+ \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +static inline void volk_gnsssdr_32fc_magnitude_squared_32f_a_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + __m128 cplxValue1, cplxValue2, result; + for(;number < quarterPoints; number++){ + cplxValue1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values + cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values + + result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values + + _mm_store_ps(magnitudeVectorPtr, result); + magnitudeVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); + } +} +#endif /* LV_HAVE_SSE3 */ + +#ifdef LV_HAVE_SSE +#include + /*! 
+ \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +static inline void volk_gnsssdr_32fc_magnitude_squared_32f_a_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int quarterPoints = num_points / 4; + + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + + __m128 cplxValue1, cplxValue2, iValue, qValue, result; + for(;number < quarterPoints; number++){ + cplxValue1 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + cplxValue2 = _mm_load_ps(complexVectorPtr); + complexVectorPtr += 4; + + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); + // Arrange in q1q2q3q4 format + qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); + + iValue = _mm_mul_ps(iValue, iValue); // Square the I values + qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values + + result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values + + _mm_store_ps(magnitudeVectorPtr, result); + magnitudeVectorPtr += 4; + } + + number = quarterPoints * 4; + for(; number < num_points; number++){ + float val1Real = *complexVectorPtr++; + float val1Imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! 
+ \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +static inline void volk_gnsssdr_32fc_magnitude_squared_32f_a_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ + const float* complexVectorPtr = (float*)complexVector; + float* magnitudeVectorPtr = magnitudeVector; + unsigned int number = 0; + for(number = 0; number < num_points; number++){ + const float real = *complexVectorPtr++; + const float imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (real*real) + (imag*imag); + } +} +#endif /* LV_HAVE_GENERIC */ + +#endif /* INCLUDED_volk_gnsssdr_32fc_magnitude_32f_a_H */ diff --git a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32f_convert_8ic.h b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32f_convert_8ic.h new file mode 100644 index 000000000..9c33c9870 --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32f_convert_8ic.h @@ -0,0 +1,231 @@ +/*! + * \file volk_gnsssdr_32fc_s32f_convert_8ic.h + * \brief Volk protokernel: converts float32 complex values to 8 integer complex values taking care of overflow + * \authors
    + *
  • Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com + *
+ *
+ * -------------------------------------------------------------------------
+ *
+ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+ *
+ * GNSS-SDR is a software defined Global Navigation
+ * Satellite Systems receiver
+ *
+ * This file is part of GNSS-SDR.
+ *
+ * GNSS-SDR is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * GNSS-SDR is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * -------------------------------------------------------------------------
+ */
+
+#ifndef INCLUDED_volk_gnsssdr_32fc_s32f_convert_8ic_u_H
+#define INCLUDED_volk_gnsssdr_32fc_s32f_convert_8ic_u_H
+
+#include
+#include
+#include
+
+#ifdef LV_HAVE_SSE2
+#include
+/*!
+ \brief Converts a float vector of 64 bits (32 bits each part) into a 16 integer vector (8 bits each part) + \param inputVector The floating point input data buffer + \param outputVector The 16 bit output data buffer + \param num_points The number of data values to be converted + */ +static inline void volk_gnsssdr_32fc_s32f_convert_8ic_u_sse2(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, const float scalar, unsigned int num_points){ + const unsigned int sse_iters = num_points/8; + + float* inputVectorPtr = (float*)inputVector; + int8_t* outputVectorPtr = (int8_t*)outputVector; + __m128 invScalar = _mm_set_ps1(1.0/scalar); + + float min_val = -128; + float max_val = 127; + + __m128 inputVal1, inputVal2, inputVal3, inputVal4; + __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4; + __m128i int8InputVal; + __m128 ret1, ret2, ret3, ret4; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); + + for(unsigned int i = 0;i < sse_iters; i++){ + inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; + inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; + inputVal3 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; + inputVal4 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; + + inputVal1 = _mm_mul_ps(inputVal1, invScalar); + inputVal2 = _mm_mul_ps(inputVal2, invScalar); + inputVal3 = _mm_mul_ps(inputVal3, invScalar); + inputVal4 = _mm_mul_ps(inputVal4, invScalar); + // Clip + ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val); + ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val); + ret3 = _mm_max_ps(_mm_min_ps(inputVal3, vmax_val), vmin_val); + ret4 = _mm_max_ps(_mm_min_ps(inputVal4, vmax_val), vmin_val); + + intInputVal1 = _mm_cvtps_epi32(ret1); + intInputVal2 = _mm_cvtps_epi32(ret2); + intInputVal3 = _mm_cvtps_epi32(ret3); + intInputVal4 = _mm_cvtps_epi32(ret4); + + intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); + intInputVal2 = 
_mm_packs_epi32(intInputVal3, intInputVal4); + int8InputVal = _mm_packs_epi16(intInputVal1, intInputVal2); + + _mm_storeu_si128((__m128i*)outputVectorPtr, int8InputVal); + outputVectorPtr += 16; + } + + float scaled = 0; + for(unsigned int i = 0; i < (num_points%4)*4; i++){ + scaled = inputVectorPtr[i]/scalar; + if(scaled > max_val) + scaled = max_val; + else if(scaled < min_val) + scaled = min_val; + outputVectorPtr[i] = (int8_t)rintf(scaled); + } +} +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Converts a float vector of 64 bits (32 bits each part) into a 16 integer vector (8 bits each part) + \param inputVector The floating point input data buffer + \param outputVector The 16 bit output data buffer + \param num_points The number of data values to be converted + */ +static inline void volk_gnsssdr_32fc_s32f_convert_8ic_generic(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, const float scalar, unsigned int num_points){ + float* inputVectorPtr = (float*)inputVector; + int8_t* outputVectorPtr = (int8_t*)outputVector; + float scaled = 0; + float min_val = -128; + float max_val = 127; + + for(unsigned int i = 0; i < num_points*2; i++){ + scaled = (inputVectorPtr[i])/scalar; + if(scaled > max_val) + scaled = max_val; + else if(scaled < min_val) + scaled = min_val; + outputVectorPtr[i] = (int8_t)rintf(scaled); + } +} +#endif /* LV_HAVE_GENERIC */ +#endif /* INCLUDED_volk_gnsssdr_32fc_s32f_convert_8ic_u_H */ + + +#ifndef INCLUDED_volk_gnsssdr_32fc_s32f_convert_8ic_a_H +#define INCLUDED_volk_gnsssdr_32fc_s32f_convert_8ic_a_H + +#include +#include +#include +#include + +#ifdef LV_HAVE_SSE2 +#include +/*! 
+ \brief Converts a float vector of 64 bits (32 bits each part) into a 16 integer vector (8 bits each part) + \param inputVector The floating point input data buffer + \param outputVector The 16 bit output data buffer + \param num_points The number of data values to be converted + */ +static inline void volk_gnsssdr_32fc_s32f_convert_8ic_a_sse2(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, const float scalar, unsigned int num_points){ + const unsigned int sse_iters = num_points/8; + + float* inputVectorPtr = (float*)inputVector; + int8_t* outputVectorPtr = (int8_t*)outputVector; + __m128 invScalar = _mm_set_ps1(1.0/scalar); + + float min_val = -128; + float max_val = 127; + + __m128 inputVal1, inputVal2, inputVal3, inputVal4; + __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4; + __m128i int8InputVal; + __m128 ret1, ret2, ret3, ret4; + __m128 vmin_val = _mm_set_ps1(min_val); + __m128 vmax_val = _mm_set_ps1(max_val); + + for(unsigned int i = 0;i < sse_iters; i++){ + inputVal1 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; + inputVal2 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; + inputVal3 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; + inputVal4 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; + + inputVal1 = _mm_mul_ps(inputVal1, invScalar); + inputVal2 = _mm_mul_ps(inputVal2, invScalar); + inputVal3 = _mm_mul_ps(inputVal3, invScalar); + inputVal4 = _mm_mul_ps(inputVal4, invScalar); + // Clip + ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val); + ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val); + ret3 = _mm_max_ps(_mm_min_ps(inputVal3, vmax_val), vmin_val); + ret4 = _mm_max_ps(_mm_min_ps(inputVal4, vmax_val), vmin_val); + + intInputVal1 = _mm_cvtps_epi32(ret1); + intInputVal2 = _mm_cvtps_epi32(ret2); + intInputVal3 = _mm_cvtps_epi32(ret3); + intInputVal4 = _mm_cvtps_epi32(ret4); + + intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); + intInputVal2 = 
_mm_packs_epi32(intInputVal3, intInputVal4); + int8InputVal = _mm_packs_epi16(intInputVal1, intInputVal2); + + _mm_store_si128((__m128i*)outputVectorPtr, int8InputVal); + outputVectorPtr += 16; + } + + float scaled = 0; + for(unsigned int i = 0; i < (num_points%4)*4; i++){ + scaled = inputVectorPtr[i]/scalar; + if(scaled > max_val) + scaled = max_val; + else if(scaled < min_val) + scaled = min_val; + outputVectorPtr[i] = (int8_t)rintf(scaled); + } +} +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Converts a float vector of 64 bits (32 bits each part) into a 16 integer vector (8 bits each part) + \param inputVector The floating point input data buffer + \param outputVector The 16 bit output data buffer + \param num_points The number of data values to be converted + */ +static inline void volk_gnsssdr_32fc_s32f_convert_8ic_a_generic(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, const float scalar, unsigned int num_points){ + float* inputVectorPtr = (float*)inputVector; + int8_t* outputVectorPtr = (int8_t*)outputVector; + float scaled = 0; + float min_val = -128; + float max_val = 127; + + for(unsigned int i = 0; i < num_points*2; i++){ + scaled = inputVectorPtr[i]/scalar; + if(scaled > max_val) + scaled = max_val; + else if(scaled < min_val) + scaled = min_val; + outputVectorPtr[i] = (int8_t)rintf(scaled); + } +} +#endif /* LV_HAVE_GENERIC */ +#endif /* INCLUDED_volk_gnsssdr_32fc_s32f_convert_8ic_a_H */ diff --git a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc.h b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc.h new file mode 100644 index 000000000..0b5761176 --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc.h @@ -0,0 +1,266 @@ +/*! 
+ * \file volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc.h
+ * \brief Volk protokernel: replaces the tracking function for update_local_code
+ * \authors
    + *
  • Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com + *
+ *
+ * Volk protokernel that replaces the tracking function for update_local_code
+ *
+ * -------------------------------------------------------------------------
+ *
+ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+ *
+ * GNSS-SDR is a software defined Global Navigation
+ * Satellite Systems receiver
+ *
+ * This file is part of GNSS-SDR.
+ *
+ * GNSS-SDR is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * GNSS-SDR is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * -------------------------------------------------------------------------
+ */
+
+#ifndef INCLUDED_volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_u_H
+#define INCLUDED_volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_u_H
+
+#include
+#include
+#include
+#include
+
+#ifdef LV_HAVE_SSE4_1
+#include
+ /*!
+ \brief Takes the conjugate of a complex vector.
+ \param cVector The vector where the results will be stored + \param aVector Vector to be conjugated + \param num_points The number of complex values in aVector to be conjugated and stored into cVector + */ +static inline void volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_u_sse4_1(lv_32fc_t* d_very_early_code, const float d_very_early_late_spc_chips, const float code_length_half_chips, const float code_phase_step_half_chips, const float tcode_half_chips_input, const lv_32fc_t* d_ca_code, unsigned int num_points){ + +// float* pointer1 = (float*)&d_very_early_late_spc_chips; +// *pointer1 = 1; +// float* pointer2 = (float*)&code_length_half_chips; +// *pointer2 = 6; +// float* pointer3 = (float*)&code_phase_step_half_chips; +// *pointer3 = 7; +// float* pointer4 = (float*)&tcode_half_chips_input; +// *pointer4 = 8; + + const unsigned int sse_iters = num_points / 4; + + __m128 tquot, fmod_num, fmod_result, associated_chip_index_array; + + __m128 tcode_half_chips_array = _mm_set_ps (tcode_half_chips_input+3*code_phase_step_half_chips, tcode_half_chips_input+2*code_phase_step_half_chips, tcode_half_chips_input+code_phase_step_half_chips, tcode_half_chips_input); + __m128 code_phase_step_half_chips_array = _mm_set1_ps (code_phase_step_half_chips*4); + __m128 d_very_early_late_spc_chips_Multiplied_by_2 = _mm_set1_ps (2*d_very_early_late_spc_chips); + __m128 code_length_half_chips_array = _mm_set1_ps (code_length_half_chips); + __m128 twos = _mm_set1_ps (2); + __m128i associated_chip_index_array_int; + + __VOLK_ATTR_ALIGNED(16) int32_t output[4]; + + for (unsigned int i = 0; i < sse_iters; i++) + { + //fmod = numer - tquot * denom; tquot = numer/denom truncated + //associated_chip_index = 2 + round(fmod(tcode_half_chips - 2*d_very_early_late_spc_chips, code_length_half_chips)); + fmod_num = _mm_sub_ps (tcode_half_chips_array, d_very_early_late_spc_chips_Multiplied_by_2); + tquot = _mm_div_ps (fmod_num, code_length_half_chips_array); + tquot = _mm_round_ps (tquot, 
(_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) ); + fmod_result = _mm_sub_ps (fmod_num, _mm_mul_ps (tquot, code_length_half_chips_array)); + + associated_chip_index_array = _mm_round_ps (fmod_result, (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC)); + associated_chip_index_array = _mm_add_ps(twos, associated_chip_index_array); + associated_chip_index_array_int = _mm_cvtps_epi32 (associated_chip_index_array); + _mm_storeu_si128 ((__m128i*)output, associated_chip_index_array_int); + + //d_very_early_code[i] = d_ca_code[associated_chip_index]; + *d_very_early_code++ = d_ca_code[output[0]]; + *d_very_early_code++ = d_ca_code[output[1]]; + *d_very_early_code++ = d_ca_code[output[2]]; + *d_very_early_code++ = d_ca_code[output[3]]; + + //tcode_half_chips = tcode_half_chips + code_phase_step_half_chips; + tcode_half_chips_array = _mm_add_ps (tcode_half_chips_array, code_phase_step_half_chips_array); + } + + if (num_points%4!=0) + { + __VOLK_ATTR_ALIGNED(16) float tcode_half_chips_stored[4]; + _mm_storeu_si128 ((__m128i*)tcode_half_chips_stored, tcode_half_chips_array); + + int associated_chip_index; + float tcode_half_chips = tcode_half_chips_stored[0]; + float d_very_early_late_spc_chips_multiplied_by_2 = 2*d_very_early_late_spc_chips; + + for (unsigned int i = 0; i < num_points%4; i++) + { + associated_chip_index = 2 + round(fmod(tcode_half_chips - d_very_early_late_spc_chips_multiplied_by_2, code_length_half_chips)); + d_very_early_code[i] = d_ca_code[associated_chip_index]; + tcode_half_chips = tcode_half_chips + code_phase_step_half_chips; + } + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Takes the conjugate of a complex vector. 
+ \param cVector The vector where the results will be stored + \param aVector Vector to be conjugated + \param num_points The number of complex values in aVector to be conjugated and stored into cVector + */ +static inline void volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_generic(lv_32fc_t* d_very_early_code, const float d_very_early_late_spc_chips, const float code_length_half_chips, const float code_phase_step_half_chips, const float tcode_half_chips_input, const lv_32fc_t* d_ca_code, unsigned int num_points){ + + float* pointer1 = (float*)&d_very_early_late_spc_chips; + *pointer1 = 1; + float* pointer2 = (float*)&code_length_half_chips; + *pointer2 = 6; + float* pointer3 = (float*)&code_phase_step_half_chips; + *pointer3 = 7; + float* pointer4 = (float*)&tcode_half_chips_input; + *pointer4 = 8; + + int associated_chip_index; + float tcode_half_chips = tcode_half_chips_input; + float d_very_early_late_spc_chips_multiplied_by_2 = 2*d_very_early_late_spc_chips; + + for (unsigned int i = 0; i < num_points; i++) + { + associated_chip_index = 2 + round(fmod(tcode_half_chips - d_very_early_late_spc_chips_multiplied_by_2, code_length_half_chips)); + d_very_early_code[i] = d_ca_code[associated_chip_index]; + tcode_half_chips = tcode_half_chips + code_phase_step_half_chips; + } +} +#endif /* LV_HAVE_GENERIC */ + + +#endif /* INCLUDED_volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_u_H */ +#ifndef INCLUDED_volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_a_H +#define INCLUDED_volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_a_H + +#include +#include +#include +#include + +#ifdef LV_HAVE_SSE4_1 +#include + /*! + \brief Takes the conjugate of a complex vector. 
+ \param cVector The vector where the results will be stored + \param aVector Vector to be conjugated + \param num_points The number of complex values in aVector to be conjugated and stored into cVector + */ +static inline void volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_a_sse4_1(lv_32fc_t* d_very_early_code, const float d_very_early_late_spc_chips, const float code_length_half_chips, const float code_phase_step_half_chips, const float tcode_half_chips_input, const lv_32fc_t* d_ca_code, unsigned int num_points){ + + // float* pointer1 = (float*)&d_very_early_late_spc_chips; + // *pointer1 = 1; + // float* pointer2 = (float*)&code_length_half_chips; + // *pointer2 = 6; + // float* pointer3 = (float*)&code_phase_step_half_chips; + // *pointer3 = 7; + // float* pointer4 = (float*)&tcode_half_chips_input; + // *pointer4 = 8; + + const unsigned int sse_iters = num_points / 4; + + __m128 tquot, fmod_num, fmod_result, associated_chip_index_array; + + __m128 tcode_half_chips_array = _mm_set_ps (tcode_half_chips_input+3*code_phase_step_half_chips, tcode_half_chips_input+2*code_phase_step_half_chips, tcode_half_chips_input+code_phase_step_half_chips, tcode_half_chips_input); + __m128 code_phase_step_half_chips_array = _mm_set1_ps (code_phase_step_half_chips*4); + __m128 d_very_early_late_spc_chips_Multiplied_by_2 = _mm_set1_ps (2*d_very_early_late_spc_chips); + __m128 code_length_half_chips_array = _mm_set1_ps (code_length_half_chips); + __m128 twos = _mm_set1_ps (2); + __m128i associated_chip_index_array_int; + + __VOLK_ATTR_ALIGNED(16) int32_t output[4]; + + for (unsigned int i = 0; i < sse_iters; i++) + { + //fmod = numer - tquot * denom; tquot = numer/denom truncated + //associated_chip_index = 2 + round(fmod(tcode_half_chips - 2*d_very_early_late_spc_chips, code_length_half_chips)); + fmod_num = _mm_sub_ps (tcode_half_chips_array, d_very_early_late_spc_chips_Multiplied_by_2); + tquot = _mm_div_ps (fmod_num, code_length_half_chips_array); + tquot = _mm_round_ps 
(tquot, (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) ); + fmod_result = _mm_sub_ps (fmod_num, _mm_mul_ps (tquot, code_length_half_chips_array)); + + associated_chip_index_array = _mm_round_ps (fmod_result, (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC)); + associated_chip_index_array = _mm_add_ps(twos, associated_chip_index_array); + associated_chip_index_array_int = _mm_cvtps_epi32 (associated_chip_index_array); + _mm_store_si128 ((__m128i*)output, associated_chip_index_array_int); + + //d_very_early_code[i] = d_ca_code[associated_chip_index]; + *d_very_early_code++ = d_ca_code[output[0]]; + *d_very_early_code++ = d_ca_code[output[1]]; + *d_very_early_code++ = d_ca_code[output[2]]; + *d_very_early_code++ = d_ca_code[output[3]]; + + //tcode_half_chips = tcode_half_chips + code_phase_step_half_chips; + tcode_half_chips_array = _mm_add_ps (tcode_half_chips_array, code_phase_step_half_chips_array); + } + + if (num_points%4!=0) + { + __VOLK_ATTR_ALIGNED(16) float tcode_half_chips_stored[4]; + _mm_store_si128 ((__m128i*)tcode_half_chips_stored, tcode_half_chips_array); + + int associated_chip_index; + float tcode_half_chips = tcode_half_chips_stored[0]; + float d_very_early_late_spc_chips_multiplied_by_2 = 2*d_very_early_late_spc_chips; + + for (unsigned int i = 0; i < num_points%4; i++) + { + associated_chip_index = 2 + round(fmod(tcode_half_chips - d_very_early_late_spc_chips_multiplied_by_2, code_length_half_chips)); + d_very_early_code[i] = d_ca_code[associated_chip_index]; + tcode_half_chips = tcode_half_chips + code_phase_step_half_chips; + } + } + +} +#endif /* LV_HAVE_SSE4_1 */ + +#ifdef LV_HAVE_GENERIC + /*! + \brief Takes the conjugate of a complex vector. 
+ \param cVector The vector where the results will be stored + \param aVector Vector to be conjugated + \param num_points The number of complex values in aVector to be conjugated and stored into cVector + */ +static inline void volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_a_generic(lv_32fc_t* d_very_early_code, const float d_very_early_late_spc_chips, const float code_length_half_chips, const float code_phase_step_half_chips, const float tcode_half_chips_input, const lv_32fc_t* d_ca_code, unsigned int num_points){ + + // float* pointer1 = (float*)&d_very_early_late_spc_chips; + // *pointer1 = 1; + // float* pointer2 = (float*)&code_length_half_chips; + // *pointer2 = 6; + // float* pointer3 = (float*)&code_phase_step_half_chips; + // *pointer3 = 7; + // float* pointer4 = (float*)&tcode_half_chips_input; + // *pointer4 = 8; + + int associated_chip_index; + float tcode_half_chips = tcode_half_chips_input; + float d_very_early_late_spc_chips_multiplied_by_2 = 2*d_very_early_late_spc_chips; + + for (unsigned int i = 0; i < num_points; i++) + { + associated_chip_index = 2 + round(fmod(tcode_half_chips - d_very_early_late_spc_chips_multiplied_by_2, code_length_half_chips)); + d_very_early_code[i] = d_ca_code[associated_chip_index]; + tcode_half_chips = tcode_half_chips + code_phase_step_half_chips; + } +} +#endif /* LV_HAVE_GENERIC */ + +#endif /* INCLUDED_volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_a_H */ diff --git a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32fc_multiply_32fc.h b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32fc_multiply_32fc.h new file mode 100644 index 000000000..d5135d89f --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32fc_multiply_32fc.h @@ -0,0 +1,178 @@ +#ifndef INCLUDED_volk_gnsssdr_32fc_s32fc_multiply_32fc_u_H +#define INCLUDED_volk_gnsssdr_32fc_s32fc_multiply_32fc_u_H + +#include +#include +#include +#include + +#ifdef 
LV_HAVE_SSE3 +#include +/*! + \brief Multiplies the input vector by a scalar and stores the results in the third vector + \param cVector The vector where the results will be stored + \param aVector The vector to be multiplied + \param scalar The complex scalar to multiply aVector + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_gnsssdr_32fc_s32fc_multiply_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + + __m128 x, yl, yh, z, tmp1, tmp2; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; + + // Set up constant scalar vector + yl = _mm_set_ps1(lv_creal(scalar)); + yh = _mm_set_ps1(lv_cimag(scalar)); + + for(;number < halfPoints; number++){ + + x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi + + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + _mm_storeu_ps((float*)c,z); // Store the results back into the C container + + a += 2; + c += 2; + } + + if((num_points % 2) != 0) { + *c = (*a) * scalar; + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC +/*! 
+ \brief Multiplies the input vector by a scalar and stores the results in the third vector + \param cVector The vector where the results will be stored + \param aVector The vector to be multiplied + \param scalar The complex scalar to multiply aVector + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector +*/ +static inline void volk_gnsssdr_32fc_s32fc_multiply_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + unsigned int number = num_points; + + // unwrap loop + while (number >= 8){ + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + number -= 8; + } + + // clean up any remaining + while (number-- > 0) + *cPtr++ = *aPtr++ * scalar; +} +#endif /* LV_HAVE_GENERIC */ + + +#endif /* INCLUDED_volk_gnsssdr_32fc_x2_multiply_32fc_u_H */ +#ifndef INCLUDED_volk_gnsssdr_32fc_s32fc_multiply_32fc_a_H +#define INCLUDED_volk_gnsssdr_32fc_s32fc_multiply_32fc_a_H + +#include +#include +#include +#include + +#ifdef LV_HAVE_SSE3 +#include + /*! 
+ \brief Multiplies the two input complex vectors and stores their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_gnsssdr_32fc_s32fc_multiply_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + + __m128 x, yl, yh, z, tmp1, tmp2; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; + + // Set up constant scalar vector + yl = _mm_set_ps1(lv_creal(scalar)); + yh = _mm_set_ps1(lv_cimag(scalar)); + + for(;number < halfPoints; number++){ + + x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi + + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + _mm_store_ps((float*)c,z); // Store the results back into the C container + + a += 2; + c += 2; + } + + if((num_points % 2) != 0) { + *c = (*a) * scalar; + } +} +#endif /* LV_HAVE_SSE */ + + +#ifdef LV_HAVE_GENERIC + /*! 
+ \brief Multiplies the two input complex vectors and stores their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_gnsssdr_32fc_s32fc_multiply_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + unsigned int number = num_points; + + // unwrap loop + while (number >= 8){ + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + number -= 8; + } + + // clean up any remaining + while (number-- > 0) + *cPtr++ = *aPtr++ * scalar; +} +#endif /* LV_HAVE_GENERIC */ + + + + + +#endif /* INCLUDED_volk_gnsssdr_32fc_x2_multiply_32fc_a_H */ diff --git a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_dot_prod_32fc.h b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_dot_prod_32fc.h new file mode 100644 index 000000000..08a10aa6e --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_dot_prod_32fc.h @@ -0,0 +1,763 @@ +#ifndef INCLUDED_volk_gnsssdr_32fc_x2_dot_prod_32fc_u_H +#define INCLUDED_volk_gnsssdr_32fc_x2_dot_prod_32fc_u_H + +#include +#include +#include +#include + + +#ifdef LV_HAVE_GENERIC + + +static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { + + float * res = (float*) result; + float * in = (float*) input; + float * tp = (float*) taps; + unsigned int 
n_2_ccomplex_blocks = num_points/2; + unsigned int isodd = num_points & 1; + + float sum0[2] = {0,0}; + float sum1[2] = {0,0}; + unsigned int i = 0; + + for(i = 0; i < n_2_ccomplex_blocks; ++i) { + sum0[0] += in[0] * tp[0] - in[1] * tp[1]; + sum0[1] += in[0] * tp[1] + in[1] * tp[0]; + sum1[0] += in[2] * tp[2] - in[3] * tp[3]; + sum1[1] += in[2] * tp[3] + in[3] * tp[2]; + + in += 4; + tp += 4; + } + + res[0] = sum0[0] + sum1[0]; + res[1] = sum0[1] + sum1[1]; + + // Cleanup if we had an odd number of points + for(i = 0; i < isodd; ++i) { + *result += input[num_points - 1] * taps[num_points - 1]; + } +} + +#endif /*LV_HAVE_GENERIC*/ + + + +#if LV_HAVE_SSE && LV_HAVE_64 + +static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_u_sse_64(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { + + const unsigned int num_bytes = num_points*8; + unsigned int isodd = num_points & 1; + + asm + ( + "# ccomplex_dotprod_generic (float* result, const float *input,\n\t" + "# const float *taps, unsigned num_bytes)\n\t" + "# float sum0 = 0;\n\t" + "# float sum1 = 0;\n\t" + "# float sum2 = 0;\n\t" + "# float sum3 = 0;\n\t" + "# do {\n\t" + "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t" + "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t" + "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t" + "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t" + "# input += 4;\n\t" + "# taps += 4; \n\t" + "# } while (--n_2_ccomplex_blocks != 0);\n\t" + "# result[0] = sum0 + sum2;\n\t" + "# result[1] = sum1 + sum3;\n\t" + "# TODO: prefetch and better scheduling\n\t" + " xor %%r9, %%r9\n\t" + " xor %%r10, %%r10\n\t" + " movq %%rcx, %%rax\n\t" + " movq %%rcx, %%r8\n\t" + " movq %[rsi], %%r9\n\t" + " movq %[rdx], %%r10\n\t" + " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" + " movups 0(%%r9), %%xmm0\n\t" + " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" + " movups 0(%%r10), %%xmm2\n\t" + " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t" + " 
shr $4, %%r8\n\t" + " jmp .%=L1_test\n\t" + " # 4 taps / loop\n\t" + " # something like ?? cycles / loop\n\t" + ".%=Loop1: \n\t" + "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" + "# movups (%%r9), %%xmmA\n\t" + "# movups (%%r10), %%xmmB\n\t" + "# movups %%xmmA, %%xmmZ\n\t" + "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" + "# mulps %%xmmB, %%xmmA\n\t" + "# mulps %%xmmZ, %%xmmB\n\t" + "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" + "# xorps %%xmmPN, %%xmmA\n\t" + "# movups %%xmmA, %%xmmZ\n\t" + "# unpcklps %%xmmB, %%xmmA\n\t" + "# unpckhps %%xmmB, %%xmmZ\n\t" + "# movups %%xmmZ, %%xmmY\n\t" + "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" + "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" + "# addps %%xmmZ, %%xmmA\n\t" + "# addps %%xmmA, %%xmmC\n\t" + "# A=xmm0, B=xmm2, Z=xmm4\n\t" + "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" + " movups 16(%%r9), %%xmm1\n\t" + " movups %%xmm0, %%xmm4\n\t" + " mulps %%xmm2, %%xmm0\n\t" + " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" + " movups 16(%%r10), %%xmm3\n\t" + " movups %%xmm1, %%xmm5\n\t" + " addps %%xmm0, %%xmm6\n\t" + " mulps %%xmm3, %%xmm1\n\t" + " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" + " addps %%xmm1, %%xmm6\n\t" + " mulps %%xmm4, %%xmm2\n\t" + " movups 32(%%r9), %%xmm0\n\t" + " addps %%xmm2, %%xmm7\n\t" + " mulps %%xmm5, %%xmm3\n\t" + " add $32, %%r9\n\t" + " movups 32(%%r10), %%xmm2\n\t" + " addps %%xmm3, %%xmm7\n\t" + " add $32, %%r10\n\t" + ".%=L1_test:\n\t" + " dec %%rax\n\t" + " jge .%=Loop1\n\t" + " # We've handled the bulk of multiplies up to here.\n\t" + " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" + " # If so, we've got 2 more taps to do.\n\t" + " and $1, %%r8\n\t" + " je .%=Leven\n\t" + " # The count was odd, do 2 more taps.\n\t" + " # Note that we've already got mm0/mm2 preloaded\n\t" + " # from the main loop.\n\t" + " movups %%xmm0, %%xmm4\n\t" + " mulps %%xmm2, %%xmm0\n\t" + " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" + 
" addps %%xmm0, %%xmm6\n\t" + " mulps %%xmm4, %%xmm2\n\t" + " addps %%xmm2, %%xmm7\n\t" + ".%=Leven:\n\t" + " # neg inversor\n\t" + " xorps %%xmm1, %%xmm1\n\t" + " mov $0x80000000, %%r9\n\t" + " movd %%r9, %%xmm1\n\t" + " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" + " # pfpnacc\n\t" + " xorps %%xmm1, %%xmm6\n\t" + " movups %%xmm6, %%xmm2\n\t" + " unpcklps %%xmm7, %%xmm6\n\t" + " unpckhps %%xmm7, %%xmm2\n\t" + " movups %%xmm2, %%xmm3\n\t" + " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" + " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" + " addps %%xmm2, %%xmm6\n\t" + " # xmm6 = r1 i2 r3 i4\n\t" + " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" + " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t" + " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) to memory\n\t" + : + :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result) + :"rax", "r8", "r9", "r10" + ); + + + if(isodd) { + *result += input[num_points - 1] * taps[num_points - 1]; + } + + return; + +} + +#endif /* LV_HAVE_SSE && LV_HAVE_64 */ + + + + +#ifdef LV_HAVE_SSE3 + +#include + +static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_u_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { + + lv_32fc_t dotProduct; + memset(&dotProduct, 0x0, 2*sizeof(float)); + + unsigned int number = 0; + const unsigned int halfPoints = num_points/2; + unsigned int isodd = num_points & 1; + + __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal; + + const lv_32fc_t* a = input; + const lv_32fc_t* b = taps; + + dotProdVal = _mm_setzero_ps(); + + for(;number < halfPoints; number++){ + + x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi + y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + x = _mm_shuffle_ps(x,x,0xB1); // 
Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + dotProdVal = _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together + + a += 2; + b += 2; + } + + __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2]; + + _mm_storeu_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector + + dotProduct += ( dotProductVector[0] + dotProductVector[1] ); + + if(isodd) { + dotProduct += input[num_points - 1] * taps[num_points - 1]; + } + + *result = dotProduct; +} + +#endif /*LV_HAVE_SSE3*/ + +#ifdef LV_HAVE_SSE4_1 + +#include + +static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_u_sse4_1(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { + + unsigned int i = 0; + const unsigned int qtr_points = num_points/4; + const unsigned int isodd = num_points & 3; + + __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1; + float *p_input, *p_taps; + __m64 *p_result; + + p_result = (__m64*)result; + p_input = (float*)input; + p_taps = (float*)taps; + + static const __m128i neg = {0x000000000000000080000000}; + + real0 = _mm_setzero_ps(); + real1 = _mm_setzero_ps(); + im0 = _mm_setzero_ps(); + im1 = _mm_setzero_ps(); + + for(; i < qtr_points; ++i) { + xmm0 = _mm_loadu_ps(p_input); + xmm1 = _mm_loadu_ps(p_taps); + + p_input += 4; + p_taps += 4; + + xmm2 = _mm_loadu_ps(p_input); + xmm3 = _mm_loadu_ps(p_taps); + + p_input += 4; + p_taps += 4; + + xmm4 = _mm_unpackhi_ps(xmm0, xmm2); + xmm5 = _mm_unpackhi_ps(xmm1, xmm3); + xmm0 = _mm_unpacklo_ps(xmm0, xmm2); + xmm2 = _mm_unpacklo_ps(xmm1, xmm3); + + //imaginary vector from input + xmm1 = _mm_unpackhi_ps(xmm0, xmm4); + //real vector from input + xmm3 = _mm_unpacklo_ps(xmm0, xmm4); + //imaginary vector from taps + xmm0 = _mm_unpackhi_ps(xmm2, xmm5); + //real vector from taps + xmm2 = 
_mm_unpacklo_ps(xmm2, xmm5); + + xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1); + xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1); + + xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2); + xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2); + + real0 = _mm_add_ps(xmm4, real0); + real1 = _mm_add_ps(xmm5, real1); + im0 = _mm_add_ps(xmm6, im0); + im1 = _mm_add_ps(xmm7, im1); + } + + real1 = _mm_xor_ps(real1, bit128_p(&neg)->float_vec); + + im0 = _mm_add_ps(im0, im1); + real0 = _mm_add_ps(real0, real1); + + im0 = _mm_add_ps(im0, real0); + + _mm_storel_pi(p_result, im0); + + for(i = num_points-isodd; i < num_points; i++) { + *result += input[i] * taps[i]; + } +} + +#endif /*LV_HAVE_SSE4_1*/ + + + + +#endif /*INCLUDED_volk_gnsssdr_32fc_x2_dot_prod_32fc_u_H*/ +#ifndef INCLUDED_volk_gnsssdr_32fc_x2_dot_prod_32fc_a_H +#define INCLUDED_volk_gnsssdr_32fc_x2_dot_prod_32fc_a_H + +#include +#include +#include +#include + + +#ifdef LV_HAVE_GENERIC + + +static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_a_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { + + const unsigned int num_bytes = num_points*8; + + float * res = (float*) result; + float * in = (float*) input; + float * tp = (float*) taps; + unsigned int n_2_ccomplex_blocks = num_bytes >> 4; + unsigned int isodd = num_points & 1; + + float sum0[2] = {0,0}; + float sum1[2] = {0,0}; + unsigned int i = 0; + + for(i = 0; i < n_2_ccomplex_blocks; ++i) { + sum0[0] += in[0] * tp[0] - in[1] * tp[1]; + sum0[1] += in[0] * tp[1] + in[1] * tp[0]; + sum1[0] += in[2] * tp[2] - in[3] * tp[3]; + sum1[1] += in[2] * tp[3] + in[3] * tp[2]; + + in += 4; + tp += 4; + } + + res[0] = sum0[0] + sum1[0]; + res[1] = sum0[1] + sum1[1]; + + for(i = 0; i < isodd; ++i) { + *result += input[num_points - 1] * taps[num_points - 1]; + } +} + +#endif /*LV_HAVE_GENERIC*/ + + +#if LV_HAVE_SSE && LV_HAVE_64 + + +static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_a_sse_64(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) 
{ + + const unsigned int num_bytes = num_points*8; + unsigned int isodd = num_points & 1; + + asm + ( + "# ccomplex_dotprod_generic (float* result, const float *input,\n\t" + "# const float *taps, unsigned num_bytes)\n\t" + "# float sum0 = 0;\n\t" + "# float sum1 = 0;\n\t" + "# float sum2 = 0;\n\t" + "# float sum3 = 0;\n\t" + "# do {\n\t" + "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t" + "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t" + "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t" + "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t" + "# input += 4;\n\t" + "# taps += 4; \n\t" + "# } while (--n_2_ccomplex_blocks != 0);\n\t" + "# result[0] = sum0 + sum2;\n\t" + "# result[1] = sum1 + sum3;\n\t" + "# TODO: prefetch and better scheduling\n\t" + " xor %%r9, %%r9\n\t" + " xor %%r10, %%r10\n\t" + " movq %%rcx, %%rax\n\t" + " movq %%rcx, %%r8\n\t" + " movq %[rsi], %%r9\n\t" + " movq %[rdx], %%r10\n\t" + " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" + " movaps 0(%%r9), %%xmm0\n\t" + " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" + " movaps 0(%%r10), %%xmm2\n\t" + " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t" + " shr $4, %%r8\n\t" + " jmp .%=L1_test\n\t" + " # 4 taps / loop\n\t" + " # something like ?? 
cycles / loop\n\t" + ".%=Loop1: \n\t" + "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" + "# movaps (%%r9), %%xmmA\n\t" + "# movaps (%%r10), %%xmmB\n\t" + "# movaps %%xmmA, %%xmmZ\n\t" + "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" + "# mulps %%xmmB, %%xmmA\n\t" + "# mulps %%xmmZ, %%xmmB\n\t" + "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" + "# xorps %%xmmPN, %%xmmA\n\t" + "# movaps %%xmmA, %%xmmZ\n\t" + "# unpcklps %%xmmB, %%xmmA\n\t" + "# unpckhps %%xmmB, %%xmmZ\n\t" + "# movaps %%xmmZ, %%xmmY\n\t" + "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" + "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" + "# addps %%xmmZ, %%xmmA\n\t" + "# addps %%xmmA, %%xmmC\n\t" + "# A=xmm0, B=xmm2, Z=xmm4\n\t" + "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" + " movaps 16(%%r9), %%xmm1\n\t" + " movaps %%xmm0, %%xmm4\n\t" + " mulps %%xmm2, %%xmm0\n\t" + " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" + " movaps 16(%%r10), %%xmm3\n\t" + " movaps %%xmm1, %%xmm5\n\t" + " addps %%xmm0, %%xmm6\n\t" + " mulps %%xmm3, %%xmm1\n\t" + " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" + " addps %%xmm1, %%xmm6\n\t" + " mulps %%xmm4, %%xmm2\n\t" + " movaps 32(%%r9), %%xmm0\n\t" + " addps %%xmm2, %%xmm7\n\t" + " mulps %%xmm5, %%xmm3\n\t" + " add $32, %%r9\n\t" + " movaps 32(%%r10), %%xmm2\n\t" + " addps %%xmm3, %%xmm7\n\t" + " add $32, %%r10\n\t" + ".%=L1_test:\n\t" + " dec %%rax\n\t" + " jge .%=Loop1\n\t" + " # We've handled the bulk of multiplies up to here.\n\t" + " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" + " # If so, we've got 2 more taps to do.\n\t" + " and $1, %%r8\n\t" + " je .%=Leven\n\t" + " # The count was odd, do 2 more taps.\n\t" + " # Note that we've already got mm0/mm2 preloaded\n\t" + " # from the main loop.\n\t" + " movaps %%xmm0, %%xmm4\n\t" + " mulps %%xmm2, %%xmm0\n\t" + " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" + " addps %%xmm0, %%xmm6\n\t" + " mulps %%xmm4, %%xmm2\n\t" + " addps %%xmm2, %%xmm7\n\t" + 
".%=Leven:\n\t" + " # neg inversor\n\t" + " xorps %%xmm1, %%xmm1\n\t" + " mov $0x80000000, %%r9\n\t" + " movd %%r9, %%xmm1\n\t" + " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" + " # pfpnacc\n\t" + " xorps %%xmm1, %%xmm6\n\t" + " movaps %%xmm6, %%xmm2\n\t" + " unpcklps %%xmm7, %%xmm6\n\t" + " unpckhps %%xmm7, %%xmm2\n\t" + " movaps %%xmm2, %%xmm3\n\t" + " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" + " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" + " addps %%xmm2, %%xmm6\n\t" + " # xmm6 = r1 i2 r3 i4\n\t" + " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" + " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t" + " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) to memory\n\t" + : + :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result) + :"rax", "r8", "r9", "r10" + ); + + + if(isodd) { + *result += input[num_points - 1] * taps[num_points - 1]; + } + + return; + +} + +#endif + +#if LV_HAVE_SSE && LV_HAVE_32 + +static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_a_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { + + volk_gnsssdr_32fc_x2_dot_prod_32fc_a_generic(result, input, taps, num_points); + +#if 0 + const unsigned int num_bytes = num_points*8; + unsigned int isodd = num_points & 1; + + asm volatile + ( + " #pushl %%ebp\n\t" + " #movl %%esp, %%ebp\n\t" + " movl 12(%%ebp), %%eax # input\n\t" + " movl 16(%%ebp), %%edx # taps\n\t" + " movl 20(%%ebp), %%ecx # n_bytes\n\t" + " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" + " movaps 0(%%eax), %%xmm0\n\t" + " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" + " movaps 0(%%edx), %%xmm2\n\t" + " shrl $5, %%ecx # ecx = n_2_ccomplex_blocks / 2\n\t" + " jmp .%=L1_test\n\t" + " # 4 taps / loop\n\t" + " # something like ?? 
cycles / loop\n\t" + ".%=Loop1: \n\t" + "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" + "# movaps (%%eax), %%xmmA\n\t" + "# movaps (%%edx), %%xmmB\n\t" + "# movaps %%xmmA, %%xmmZ\n\t" + "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" + "# mulps %%xmmB, %%xmmA\n\t" + "# mulps %%xmmZ, %%xmmB\n\t" + "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" + "# xorps %%xmmPN, %%xmmA\n\t" + "# movaps %%xmmA, %%xmmZ\n\t" + "# unpcklps %%xmmB, %%xmmA\n\t" + "# unpckhps %%xmmB, %%xmmZ\n\t" + "# movaps %%xmmZ, %%xmmY\n\t" + "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" + "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" + "# addps %%xmmZ, %%xmmA\n\t" + "# addps %%xmmA, %%xmmC\n\t" + "# A=xmm0, B=xmm2, Z=xmm4\n\t" + "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" + " movaps 16(%%eax), %%xmm1\n\t" + " movaps %%xmm0, %%xmm4\n\t" + " mulps %%xmm2, %%xmm0\n\t" + " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" + " movaps 16(%%edx), %%xmm3\n\t" + " movaps %%xmm1, %%xmm5\n\t" + " addps %%xmm0, %%xmm6\n\t" + " mulps %%xmm3, %%xmm1\n\t" + " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" + " addps %%xmm1, %%xmm6\n\t" + " mulps %%xmm4, %%xmm2\n\t" + " movaps 32(%%eax), %%xmm0\n\t" + " addps %%xmm2, %%xmm7\n\t" + " mulps %%xmm5, %%xmm3\n\t" + " addl $32, %%eax\n\t" + " movaps 32(%%edx), %%xmm2\n\t" + " addps %%xmm3, %%xmm7\n\t" + " addl $32, %%edx\n\t" + ".%=L1_test:\n\t" + " decl %%ecx\n\t" + " jge .%=Loop1\n\t" + " # We've handled the bulk of multiplies up to here.\n\t" + " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" + " # If so, we've got 2 more taps to do.\n\t" + " movl 20(%%ebp), %%ecx # n_2_ccomplex_blocks\n\t" + " shrl $4, %%ecx\n\t" + " andl $1, %%ecx\n\t" + " je .%=Leven\n\t" + " # The count was odd, do 2 more taps.\n\t" + " # Note that we've already got mm0/mm2 preloaded\n\t" + " # from the main loop.\n\t" + " movaps %%xmm0, %%xmm4\n\t" + " mulps %%xmm2, %%xmm0\n\t" + " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" + " 
addps %%xmm0, %%xmm6\n\t" + " mulps %%xmm4, %%xmm2\n\t" + " addps %%xmm2, %%xmm7\n\t" + ".%=Leven:\n\t" + " # neg inversor\n\t" + " movl 8(%%ebp), %%eax \n\t" + " xorps %%xmm1, %%xmm1\n\t" + " movl $0x80000000, (%%eax)\n\t" + " movss (%%eax), %%xmm1\n\t" + " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" + " # pfpnacc\n\t" + " xorps %%xmm1, %%xmm6\n\t" + " movaps %%xmm6, %%xmm2\n\t" + " unpcklps %%xmm7, %%xmm6\n\t" + " unpckhps %%xmm7, %%xmm2\n\t" + " movaps %%xmm2, %%xmm3\n\t" + " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" + " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" + " addps %%xmm2, %%xmm6\n\t" + " # xmm6 = r1 i2 r3 i4\n\t" + " #movl 8(%%ebp), %%eax # @result\n\t" + " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" + " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t" + " movlps %%xmm6, (%%eax) # store low 2x32 bits (complex) to memory\n\t" + " #popl %%ebp\n\t" + : + : + : "eax", "ecx", "edx" + ); + + + int getem = num_bytes % 16; + + if(isodd) { + *result += (input[num_points - 1] * taps[num_points - 1]); + } + + return; +#endif +} + +#endif /*LV_HAVE_SSE*/ + +#ifdef LV_HAVE_SSE3 + +#include + +static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_a_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { + + const unsigned int num_bytes = num_points*8; + unsigned int isodd = num_points & 1; + + lv_32fc_t dotProduct; + memset(&dotProduct, 0x0, 2*sizeof(float)); + + unsigned int number = 0; + const unsigned int halfPoints = num_bytes >> 4; + + __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal; + + const lv_32fc_t* a = input; + const lv_32fc_t* b = taps; + + dotProdVal = _mm_setzero_ps(); + + for(;number < halfPoints; number++){ + + x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi + y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = 
_mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + dotProdVal = _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together + + a += 2; + b += 2; + } + + __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2]; + + _mm_store_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector + + dotProduct += ( dotProductVector[0] + dotProductVector[1] ); + + if(isodd) { + dotProduct += input[num_points - 1] * taps[num_points - 1]; + } + + *result = dotProduct; +} + +#endif /*LV_HAVE_SSE3*/ + +#ifdef LV_HAVE_SSE4_1 + +#include + +static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { + + unsigned int i = 0; + const unsigned int qtr_points = num_points/4; + const unsigned int isodd = num_points & 3; + + __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1; + float *p_input, *p_taps; + __m64 *p_result; + + static const __m128i neg = {0x000000000000000080000000}; + + p_result = (__m64*)result; + p_input = (float*)input; + p_taps = (float*)taps; + + real0 = _mm_setzero_ps(); + real1 = _mm_setzero_ps(); + im0 = _mm_setzero_ps(); + im1 = _mm_setzero_ps(); + + for(; i < qtr_points; ++i) { + xmm0 = _mm_load_ps(p_input); + xmm1 = _mm_load_ps(p_taps); + + p_input += 4; + p_taps += 4; + + xmm2 = _mm_load_ps(p_input); + xmm3 = _mm_load_ps(p_taps); + + p_input += 4; + p_taps += 4; + + xmm4 = _mm_unpackhi_ps(xmm0, xmm2); + xmm5 = _mm_unpackhi_ps(xmm1, xmm3); + xmm0 = _mm_unpacklo_ps(xmm0, xmm2); + xmm2 = _mm_unpacklo_ps(xmm1, xmm3); + + //imaginary vector from input + xmm1 = _mm_unpackhi_ps(xmm0, xmm4); + //real vector from input + xmm3 = _mm_unpacklo_ps(xmm0, xmm4); + //imaginary vector from 
taps + xmm0 = _mm_unpackhi_ps(xmm2, xmm5); + //real vector from taps + xmm2 = _mm_unpacklo_ps(xmm2, xmm5); + + xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1); + xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1); + + xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2); + xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2); + + real0 = _mm_add_ps(xmm4, real0); + real1 = _mm_add_ps(xmm5, real1); + im0 = _mm_add_ps(xmm6, im0); + im1 = _mm_add_ps(xmm7, im1); + } + + real1 = _mm_xor_ps(real1, bit128_p(&neg)->float_vec); + + im0 = _mm_add_ps(im0, im1); + real0 = _mm_add_ps(real0, real1); + + im0 = _mm_add_ps(im0, real0); + + _mm_storel_pi(p_result, im0); + + for(i = num_points-isodd; i < num_points; i++) { + *result += input[i] * taps[i]; + } +} + +#endif /*LV_HAVE_SSE4_1*/ + +#endif /*INCLUDED_volk_gnsssdr_32fc_x2_dot_prod_32fc_a_H*/ diff --git a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_multiply_32fc.h b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_multiply_32fc.h new file mode 100644 index 000000000..e2b17c401 --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_multiply_32fc.h @@ -0,0 +1,170 @@ +#ifndef INCLUDED_volk_gnsssdr_32fc_x2_multiply_32fc_u_H +#define INCLUDED_volk_gnsssdr_32fc_x2_multiply_32fc_u_H + +#include +#include +#include +#include + +#ifdef LV_HAVE_SSE3 +#include + /*! 
+ \brief Multiplies the two input complex vectors and stores their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_gnsssdr_32fc_x2_multiply_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + + __m128 x, y, yl, yh, z, tmp1, tmp2; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; + const lv_32fc_t* b = bVector; + + for(;number < halfPoints; number++){ + + x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi + y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + _mm_storeu_ps((float*)c,z); // Store the results back into the C container + + a += 2; + b += 2; + c += 2; + } + + if((num_points % 2) != 0) { + *c = (*a) * (*b); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! 
+ \brief Multiplies the two input complex vectors and stores their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_gnsssdr_32fc_x2_multiply_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + const lv_32fc_t* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) * (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + + +#endif /* INCLUDED_volk_gnsssdr_32fc_x2_multiply_32fc_u_H */ +#ifndef INCLUDED_volk_gnsssdr_32fc_x2_multiply_32fc_a_H +#define INCLUDED_volk_gnsssdr_32fc_x2_multiply_32fc_a_H + +#include +#include +#include +#include + +#ifdef LV_HAVE_SSE3 +#include + /*! 
+ \brief Multiplies the two input complex vectors and stores their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_gnsssdr_32fc_x2_multiply_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + + __m128 x, y, yl, yh, z, tmp1, tmp2; + lv_32fc_t* c = cVector; + const lv_32fc_t* a = aVector; + const lv_32fc_t* b = bVector; + for(;number < halfPoints; number++){ + + x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi + y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + _mm_store_ps((float*)c,z); // Store the results back into the C container + + a += 2; + b += 2; + c += 2; + } + + if((num_points % 2) != 0) { + *c = (*a) * (*b); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC + /*! 
+ \brief Multiplies the two input complex vectors and stores their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_gnsssdr_32fc_x2_multiply_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ + lv_32fc_t* cPtr = cVector; + const lv_32fc_t* aPtr = aVector; + const lv_32fc_t* bPtr= bVector; + unsigned int number = 0; + + for(number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) * (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + +#ifdef LV_HAVE_ORC + /*! + \brief Multiplies the two input complex vectors and stores their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +extern void volk_gnsssdr_32fc_x2_multiply_32fc_a_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points); +static inline void volk_gnsssdr_32fc_x2_multiply_32fc_u_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ + volk_gnsssdr_32fc_x2_multiply_32fc_a_orc_impl(cVector, aVector, bVector, num_points); +} +#endif /* LV_HAVE_ORC */ + + + + + +#endif /* INCLUDED_volk_gnsssdr_32fc_x2_multiply_32fc_a_H */ diff --git a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3.h b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3.h new file mode 100644 index 000000000..7e05be9cf --- /dev/null +++ 
b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3.h @@ -0,0 +1,409 @@ +#ifndef INCLUDED_gnsssdr_volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_u_H +#define INCLUDED_gnsssdr_volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_u_H + +#include +#include +#include +#include +#include + +/*! + * TODO: Code the SSE4 version and benchmark it + */ +#ifdef LV_HAVE_SSE3 +#include + + + /*! + \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation + \param input The input signal input + \param carrier The carrier signal input + \param E_code Early PRN code replica input + \param P_code Early PRN code replica input + \param L_code Early PRN code replica input + \param E_out Early correlation output + \param P_out Early correlation output + \param L_out Early correlation output + \param num_points The number of complex values in vectors + */ +static inline void volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_u_sse3(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, unsigned int num_points) +{ + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + + lv_32fc_t dotProduct_E; + memset(&dotProduct_E, 0x0, 2*sizeof(float)); + lv_32fc_t dotProduct_P; + memset(&dotProduct_P, 0x0, 2*sizeof(float)); + lv_32fc_t dotProduct_L; + memset(&dotProduct_L, 0x0, 2*sizeof(float)); + + // Aux vars + __m128 x, y, yl, yh, z, tmp1, tmp2, z_E, z_P, z_L; + + z_E = _mm_setzero_ps(); + z_P = _mm_setzero_ps(); + z_L = _mm_setzero_ps(); + + //input and output vectors + //lv_32fc_t* _input_BB = input_BB; + const lv_32fc_t* _input = input; + const lv_32fc_t* _carrier = carrier; + const lv_32fc_t* _E_code = E_code; + const lv_32fc_t* _P_code = P_code; + const lv_32fc_t* _L_code = L_code; + + for(;number < halfPoints; number++) + { + // carrier wipe-off (vector point-to-point product) + x = 
_mm_loadu_ps((float*)_input); // Load the ar + ai, br + bi as ar,ai,br,bi + y = _mm_loadu_ps((float*)_carrier); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + //_mm_storeu_ps((float*)_input_BB,z); // Store the results back into the _input_BB container + + // correlation E,P,L (3x vector scalar product) + // Early + //x = _mm_load_ps((float*)_input_BB); // Load the ar + ai, br + bi as ar,ai,br,bi + x = z; + + y = _mm_load_ps((float*)_E_code); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + z_E = _mm_add_ps(z_E, z); // Add the complex multiplication results together + + // Prompt + //x = _mm_load_ps((float*)_input_BB); // Load the ar + ai, br + bi as ar,ai,br,bi + y = _mm_load_ps((float*)_P_code); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di + + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di 
+ + z_P = _mm_add_ps(z_P, z); // Add the complex multiplication results together + + // Late + //x = _mm_load_ps((float*)_input_BB); // Load the ar + ai, br + bi as ar,ai,br,bi + y = _mm_load_ps((float*)_L_code); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di + + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + z_L = _mm_add_ps(z_L, z); // Add the complex multiplication results together + + /*pointer increment*/ + _carrier += 2; + _input += 2; + //_input_BB += 2; + _E_code += 2; + _P_code += 2; + _L_code +=2; + } + + __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_E[2]; + __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_P[2]; + __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_L[2]; + //__VOLK_ATTR_ALIGNED(16) lv_32fc_t _input_BB; + + _mm_store_ps((float*)dotProductVector_E,z_E); // Store the results back into the dot product vector + _mm_store_ps((float*)dotProductVector_P,z_P); // Store the results back into the dot product vector + _mm_store_ps((float*)dotProductVector_L,z_L); // Store the results back into the dot product vector + + dotProduct_E += ( dotProductVector_E[0] + dotProductVector_E[1] ); + dotProduct_P += ( dotProductVector_P[0] + dotProductVector_P[1] ); + dotProduct_L += ( dotProductVector_L[0] + dotProductVector_L[1] ); + + if((num_points % 2) != 0) + { + //_input_BB = (*_input) * (*_carrier); + dotProduct_E += (*_input) * (*_E_code)*(*_carrier); + dotProduct_P += (*_input) * (*_P_code)*(*_carrier); + dotProduct_L += (*_input) * (*_L_code)*(*_carrier); + } + + *E_out = dotProduct_E; + *P_out = dotProduct_P; + *L_out = dotProduct_L; +} + 
+#endif /* LV_HAVE_SSE3 */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation + \param input The input signal input + \param carrier The carrier signal input + \param E_code Early PRN code replica input + \param P_code Early PRN code replica input + \param L_code Early PRN code replica input + \param E_out Early correlation output + \param P_out Early correlation output + \param L_out Early correlation output + \param num_points The number of complex values in vectors + */ +static inline void volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_generic(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, unsigned int num_points) +{ + lv_32fc_t bb_signal_sample; + + bb_signal_sample = lv_cmake(0, 0); + + *E_out = 0; + *P_out = 0; + *L_out = 0; + // perform Early, Prompt and Late correlation + for(int i=0; i < num_points; ++i) + { + //Perform the carrier wipe-off + bb_signal_sample = input[i] * carrier[i]; + // Now get early, late, and prompt values for each + *E_out += bb_signal_sample * E_code[i]; + *P_out += bb_signal_sample * P_code[i]; + *L_out += bb_signal_sample * L_code[i]; + } +} + +#endif /* LV_HAVE_GENERIC */ + +#endif /* INCLUDED_gnsssdr_volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_u_H */ + + +#ifndef INCLUDED_gnsssdr_volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_a_H +#define INCLUDED_gnsssdr_volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_a_H + +#include +#include +#include +#include +#include + +#ifdef LV_HAVE_SSE3 +#include +/*! 
+ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation + \param input The input signal input + \param carrier The carrier signal input + \param E_code Early PRN code replica input + \param P_code Early PRN code replica input + \param L_code Early PRN code replica input + \param E_out Early correlation output + \param P_out Early correlation output + \param L_out Early correlation output + \param num_points The number of complex values in vectors + */ +static inline void volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_a_sse3(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, unsigned int num_points) +{ + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + + lv_32fc_t dotProduct_E; + memset(&dotProduct_E, 0x0, 2*sizeof(float)); + lv_32fc_t dotProduct_P; + memset(&dotProduct_P, 0x0, 2*sizeof(float)); + lv_32fc_t dotProduct_L; + memset(&dotProduct_L, 0x0, 2*sizeof(float)); + + // Aux vars + __m128 x, y, yl, yh, z, tmp1, tmp2, z_E, z_P, z_L; + + z_E = _mm_setzero_ps(); + z_P = _mm_setzero_ps(); + z_L = _mm_setzero_ps(); + + //input and output vectors + //lv_32fc_t* _input_BB = input_BB; + const lv_32fc_t* _input = input; + const lv_32fc_t* _carrier = carrier; + const lv_32fc_t* _E_code = E_code; + const lv_32fc_t* _P_code = P_code; + const lv_32fc_t* _L_code = L_code; + + for(;number < halfPoints; number++) + { + // carrier wipe-off (vector point-to-point product) + x = _mm_load_ps((float*)_input); // Load the ar + ai, br + bi as ar,ai,br,bi + y = _mm_load_ps((float*)_carrier); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm_mul_ps(x,yh); // 
tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + //_mm_storeu_ps((float*)_input_BB,z); // Store the results back into the _input_BB container + + // correlation E,P,L (3x vector scalar product) + // Early + //x = _mm_load_ps((float*)_input_BB); // Load the ar + ai, br + bi as ar,ai,br,bi + x = z; + + y = _mm_load_ps((float*)_E_code); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + z_E = _mm_add_ps(z_E, z); // Add the complex multiplication results together + + // Prompt + //x = _mm_load_ps((float*)_input_BB); // Load the ar + ai, br + bi as ar,ai,br,bi + y = _mm_load_ps((float*)_P_code); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di + + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + z_P = _mm_add_ps(z_P, z); // Add the complex multiplication results together + + // Late + //x = _mm_load_ps((float*)_input_BB); // Load the ar + ai, br + bi as ar,ai,br,bi + y = _mm_load_ps((float*)_L_code); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di + + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be 
ai,ar,bi,br + + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + + z_L = _mm_add_ps(z_L, z); // Add the complex multiplication results together + + /*pointer increment*/ + _carrier += 2; + _input += 2; + //_input_BB += 2; + _E_code += 2; + _P_code += 2; + _L_code +=2; + } + + __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_E[2]; + __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_P[2]; + __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_L[2]; + //__VOLK_ATTR_ALIGNED(16) lv_32fc_t _input_BB; + + _mm_store_ps((float*)dotProductVector_E,z_E); // Store the results back into the dot product vector + _mm_store_ps((float*)dotProductVector_P,z_P); // Store the results back into the dot product vector + _mm_store_ps((float*)dotProductVector_L,z_L); // Store the results back into the dot product vector + + dotProduct_E += ( dotProductVector_E[0] + dotProductVector_E[1] ); + dotProduct_P += ( dotProductVector_P[0] + dotProductVector_P[1] ); + dotProduct_L += ( dotProductVector_L[0] + dotProductVector_L[1] ); + + if((num_points % 2) != 0) + { + //_input_BB = (*_input) * (*_carrier); + dotProduct_E += (*_input) * (*_E_code)*(*_carrier); + dotProduct_P += (*_input) * (*_P_code)*(*_carrier); + dotProduct_L += (*_input) * (*_L_code)*(*_carrier); + } + + *E_out = dotProduct_E; + *P_out = dotProduct_P; + *L_out = dotProduct_L; +} + +#endif /* LV_HAVE_SSE3 */ + +#ifdef LV_HAVE_GENERIC +/*! 
+ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation + \param input The input signal input + \param carrier The carrier signal input + \param E_code Early PRN code replica input + \param P_code Early PRN code replica input + \param L_code Early PRN code replica input + \param E_out Early correlation output + \param P_out Early correlation output + \param L_out Early correlation output + \param num_points The number of complex values in vectors + */ +static inline void volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_a_generic(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, unsigned int num_points) +{ + lv_32fc_t bb_signal_sample; + + bb_signal_sample = lv_cmake(0, 0); + + *E_out = 0; + *P_out = 0; + *L_out = 0; + // perform Early, Prompt and Late correlation + for(int i=0; i < num_points; ++i) + { + //Perform the carrier wipe-off + bb_signal_sample = input[i] * carrier[i]; + // Now get early, late, and prompt values for each + *E_out += bb_signal_sample * E_code[i]; + *P_out += bb_signal_sample * P_code[i]; + *L_out += bb_signal_sample * L_code[i]; + } +} + +#endif /* LV_HAVE_GENERIC */ + +#endif /* INCLUDED_gnsssdr_volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_a_H */ diff --git a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5.h b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5.h new file mode 100644 index 000000000..7b66b6491 --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5.h @@ -0,0 +1,848 @@ +/*! + * \file volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5 + * \brief Volk protokernel: performs the carrier wipe-off mixing and the VE, Early, Prompt, Late and VL correlation with 64 bits vectors + * \authors
<ul> + *
<li> Javier Arribas, 2011. jarribas(at)cttc.es + *
<li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com + * </ul>
+ * + * Volk protokernel that performs the carrier wipe-off mixing and the + * VE, Early, Prompt, Late and VL correlation with 64 bits vectors (32 bits the + * real part and 32 bits the imaginary part): + * - The carrier wipe-off is done by multiplying the input signal by the + * carrier (multiplication of 64 bits vectors). It returns the input + * signal in baseband (BB) + * - VE values are calculated by multiplying the input signal in BB by the + * VE code (multiplication of 64 bits vectors), accumulating the results + * - Early values are calculated by multiplying the input signal in BB by the + * early code (multiplication of 64 bits vectors), accumulating the results + * - Prompt values are calculated by multiplying the input signal in BB by the + * prompt code (multiplication of 64 bits vectors), accumulating the results + * - Late values are calculated by multiplying the input signal in BB by the + * late code (multiplication of 64 bits vectors), accumulating the results + * - VL values are calculated by multiplying the input signal in BB by the + * VL code (multiplication of 64 bits vectors), accumulating the results + * + * ------------------------------------------------------------------------- + * + * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) + * + * GNSS-SDR is a software defined Global Navigation + * Satellite Systems receiver + * + * This file is part of GNSS-SDR. + * + * GNSS-SDR is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * GNSS-SDR is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with GNSS-SDR. If not, see . + * + * ------------------------------------------------------------------------- + */ + +#ifndef INCLUDED_gnsssdr_volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_u_H +#define INCLUDED_gnsssdr_volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_u_H + +#include +#include +#include +#include +#include + +#ifdef LV_HAVE_AVX +#include +/*! + \brief Performs the carrier wipe-off mixing and the VE, Early, Prompt, Late and VL correlation + \param input The input signal input + \param carrier The carrier signal input + \param VE_code VE PRN code replica input + \param E_code Early PRN code replica input + \param P_code Early PRN code replica input + \param L_code Early PRN code replica input + \param VL_code VL PRN code replica input + \param VE_out VE correlation output + \param E_out Early correlation output + \param P_out Early correlation output + \param L_out Early correlation output + \param VL_out VL correlation output + \param num_points The number of complex values in vectors + */ +static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_u_avx(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* VE_code, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, const lv_32fc_t* VL_code, unsigned int num_points) +{ + unsigned int number = 0; + const unsigned int halfPoints = num_points / 4; + + lv_32fc_t dotProduct_VE; + lv_32fc_t dotProduct_E; + lv_32fc_t dotProduct_P; + lv_32fc_t dotProduct_L; + lv_32fc_t dotProduct_VL; + + // Aux vars + __m256 x, y, yl, yh, z, tmp1, tmp2, z_VE, z_E, z_P, z_L, z_VL; + __m256 bb_signal_sample, bb_signal_sample_shuffled; + + z_VE = _mm256_setzero_ps(); + z_E = _mm256_setzero_ps(); + z_P = _mm256_setzero_ps(); + z_L = _mm256_setzero_ps(); + z_VL = _mm256_setzero_ps(); + + //input and output vectors + const 
lv_32fc_t* _input = input; + const lv_32fc_t* _carrier = carrier; + const lv_32fc_t* _VE_code = VE_code; + const lv_32fc_t* _E_code = E_code; + const lv_32fc_t* _P_code = P_code; + const lv_32fc_t* _L_code = L_code; + const lv_32fc_t* _VL_code = VL_code; + + for(;number < halfPoints; number++) + { + // carrier wipe-off (vector point-to-point product) + x = _mm256_loadu_ps((float*)_input); // Load the ar + ai, br + bi as ar,ai,br,bi + y = _mm256_loadu_ps((float*)_carrier); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + bb_signal_sample = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + bb_signal_sample_shuffled = _mm256_shuffle_ps(bb_signal_sample,bb_signal_sample,0xB1); // Re-arrange bb_signal_sample to be ai,ar,bi,br + + // correlation VE,E,P,L,VL (5x vector scalar product) + // VE + y = _mm256_loadu_ps((float*)_VE_code); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + z_VE = _mm256_add_ps(z_VE, z); // Add the complex multiplication results together + + // Early + y = _mm256_loadu_ps((float*)_E_code); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + 
tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + z_E = _mm256_add_ps(z_E, z); // Add the complex multiplication results together + + // Prompt + y = _mm256_loadu_ps((float*)_P_code); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + z_P = _mm256_add_ps(z_P, z); // Add the complex multiplication results together + + // Late + y = _mm256_loadu_ps((float*)_L_code); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + z_L = _mm256_add_ps(z_L, z); // Add the complex multiplication results together + + // VL + y = _mm256_loadu_ps((float*)_VL_code); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + z_VL = _mm256_add_ps(z_VL, z); // Add the complex multiplication results together + + /*pointer increment*/ + _carrier += 4; + _input += 4; + _VE_code 
+= 4; + _E_code += 4; + _P_code += 4; + _L_code += 4; + _VL_code += 4; + } + + __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_VE[4]; + __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_E[4]; + __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_P[4]; + __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_L[4]; + __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_VL[4]; + + _mm256_storeu_ps((float*)dotProductVector_VE,z_VE); // Store the results back into the dot product vector + _mm256_storeu_ps((float*)dotProductVector_E,z_E); // Store the results back into the dot product vector + _mm256_storeu_ps((float*)dotProductVector_P,z_P); // Store the results back into the dot product vector + _mm256_storeu_ps((float*)dotProductVector_L,z_L); // Store the results back into the dot product vector + _mm256_storeu_ps((float*)dotProductVector_VL,z_VL); // Store the results back into the dot product vector + + dotProduct_VE = ( dotProductVector_VE[0] + dotProductVector_VE[1] + dotProductVector_VE[2] + dotProductVector_VE[3] ); + dotProduct_E = ( dotProductVector_E[0] + dotProductVector_E[1] + dotProductVector_E[2] + dotProductVector_E[3] ); + dotProduct_P = ( dotProductVector_P[0] + dotProductVector_P[1] + dotProductVector_P[2] + dotProductVector_P[3] ); + dotProduct_L = ( dotProductVector_L[0] + dotProductVector_L[1] + dotProductVector_L[2] + dotProductVector_L[3] ); + dotProduct_VL = ( dotProductVector_VL[0] + dotProductVector_VL[1] + dotProductVector_VL[2] + dotProductVector_VL[3] ); + + for (int i = 0; i<(num_points % 4); ++i) + { + dotProduct_VE += (*_input) * (*_VE_code++) * (*_carrier); + dotProduct_E += (*_input) * (*_E_code++) * (*_carrier); + dotProduct_P += (*_input) * (*_P_code++) * (*_carrier); + dotProduct_L += (*_input) * (*_L_code++) * (*_carrier); + dotProduct_VL += (*_input++) * (*_VL_code++) * (*_carrier++); + } + + *VE_out = dotProduct_VE; + *E_out = dotProduct_E; + *P_out = dotProduct_P; + *L_out = dotProduct_L; + *VL_out = dotProduct_VL; +} +#endif /* 
LV_HAVE_AVX */ + +#ifdef LV_HAVE_SSE3 +#include + /*! + \brief Performs the carrier wipe-off mixing and the VE, Early, Prompt, Late and VL correlation + \param input The input signal input + \param carrier The carrier signal input + \param VE_code VE PRN code replica input + \param E_code Early PRN code replica input + \param P_code Early PRN code replica input + \param L_code Early PRN code replica input + \param VL_code VL PRN code replica input + \param VE_out VE correlation output + \param E_out Early correlation output + \param P_out Early correlation output + \param L_out Early correlation output + \param VL_out VL correlation output + \param num_points The number of complex values in vectors + */ +static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_u_sse3(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* VE_code, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, const lv_32fc_t* VL_code, unsigned int num_points) +{ + unsigned int number = 0; + const unsigned int halfPoints = num_points / 2; + + lv_32fc_t dotProduct_VE; + lv_32fc_t dotProduct_E; + lv_32fc_t dotProduct_P; + lv_32fc_t dotProduct_L; + lv_32fc_t dotProduct_VL; + + // Aux vars + __m128 x, y, yl, yh, z, tmp1, tmp2, z_VE, z_E, z_P, z_L, z_VL; + __m128 bb_signal_sample, bb_signal_sample_shuffled; + + z_VE = _mm_setzero_ps(); + z_E = _mm_setzero_ps(); + z_P = _mm_setzero_ps(); + z_L = _mm_setzero_ps(); + z_VL = _mm_setzero_ps(); + + //input and output vectors + const lv_32fc_t* _input = input; + const lv_32fc_t* _carrier = carrier; + const lv_32fc_t* _VE_code = VE_code; + const lv_32fc_t* _E_code = E_code; + const lv_32fc_t* _P_code = P_code; + const lv_32fc_t* _L_code = L_code; + const lv_32fc_t* _VL_code = VL_code; + + for(;number < halfPoints; number++) + { + // carrier wipe-off (vector point-to-point product) + x = _mm_loadu_ps((float*)_input); // Load the 
ar + ai, br + bi as ar,ai,br,bi + y = _mm_loadu_ps((float*)_carrier); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + + x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br + + tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + bb_signal_sample = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + bb_signal_sample_shuffled = _mm_shuffle_ps(bb_signal_sample,bb_signal_sample,0xB1); // Re-arrange bb_signal_sample to be ai,ar,bi,br + + // correlation VE,E,P,L,VL (5x vector scalar product) + // VE + y = _mm_loadu_ps((float*)_VE_code); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + z_VE = _mm_add_ps(z_VE, z); // Add the complex multiplication results together + + // Early + y = _mm_loadu_ps((float*)_E_code); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + z_E = _mm_add_ps(z_E, z); // Add the complex multiplication results together + + // Prompt + y = _mm_loadu_ps((float*)_P_code); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di + + 
tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + z_P = _mm_add_ps(z_P, z); // Add the complex multiplication results together + + // Late + y = _mm_loadu_ps((float*)_L_code); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + z_L = _mm_add_ps(z_L, z); // Add the complex multiplication results together + + // VL + //x = _mm_load_ps((float*)_input_BB); // Load the ar + ai, br + bi as ar,ai,br,bi + y = _mm_loadu_ps((float*)_VL_code); // Load the cr + ci, dr + di as cr,ci,dr,di + + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di + + tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + + z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + z_VL = _mm_add_ps(z_VL, z); // Add the complex multiplication results together + + /*pointer increment*/ + _carrier += 2; + _input += 2; + _VE_code += 2; + _E_code += 2; + _P_code += 2; + _L_code +=2; + _VL_code +=2; + } + + __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_VE[2]; + __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_E[2]; + __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_P[2]; + __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_L[2]; + __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_VL[2]; + + _mm_storeu_ps((float*)dotProductVector_VE,z_VE); // Store the results 
back into the dot product vector + _mm_storeu_ps((float*)dotProductVector_E,z_E); // Store the results back into the dot product vector + _mm_storeu_ps((float*)dotProductVector_P,z_P); // Store the results back into the dot product vector + _mm_storeu_ps((float*)dotProductVector_L,z_L); // Store the results back into the dot product vector + _mm_storeu_ps((float*)dotProductVector_VL,z_VL); // Store the results back into the dot product vector + + dotProduct_VE = ( dotProductVector_VE[0] + dotProductVector_VE[1] ); + dotProduct_E = ( dotProductVector_E[0] + dotProductVector_E[1] ); + dotProduct_P = ( dotProductVector_P[0] + dotProductVector_P[1] ); + dotProduct_L = ( dotProductVector_L[0] + dotProductVector_L[1] ); + dotProduct_VL = ( dotProductVector_VL[0] + dotProductVector_VL[1] ); + + if((num_points % 2) != 0) + { + dotProduct_VE += (*_input) * (*_VE_code)*(*_carrier); + dotProduct_E += (*_input) * (*_E_code)*(*_carrier); + dotProduct_P += (*_input) * (*_P_code)*(*_carrier); + dotProduct_L += (*_input) * (*_L_code)*(*_carrier); + dotProduct_VL += (*_input) * (*_VL_code)*(*_carrier); + } + + *VE_out = dotProduct_VE; + *E_out = dotProduct_E; + *P_out = dotProduct_P; + *L_out = dotProduct_L; + *VL_out = dotProduct_VL; +} +#endif /* LV_HAVE_SSE3 */ + +#ifdef LV_HAVE_GENERIC +/*! 
+ \brief Performs the carrier wipe-off mixing and the VE, Early, Prompt, Late and VL correlation + \param input The input signal input + \param carrier The carrier signal input + \param VE_code VE PRN code replica input + \param E_code Early PRN code replica input + \param P_code Early PRN code replica input + \param L_code Early PRN code replica input + \param VL_code VL PRN code replica input + \param VE_out VE correlation output + \param E_out Early correlation output + \param P_out Early correlation output + \param L_out Early correlation output + \param VL_out VL correlation output + \param num_points The number of complex values in vectors + */ +static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* VE_code, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, const lv_32fc_t* VL_code, unsigned int num_points) +{ + lv_32fc_t bb_signal_sample; + + bb_signal_sample = lv_cmake(0, 0); + + *VE_out = 0; + *E_out = 0; + *P_out = 0; + *L_out = 0; + *VL_out = 0; + // perform Early, Prompt and Late correlation + for(int i=0; i < num_points; ++i) + { + //Perform the carrier wipe-off + bb_signal_sample = input[i] * carrier[i]; + // Now get early, late, and prompt values for each + *VE_out += bb_signal_sample * VE_code[i]; + *E_out += bb_signal_sample * E_code[i]; + *P_out += bb_signal_sample * P_code[i]; + *L_out += bb_signal_sample * L_code[i]; + *VL_out += bb_signal_sample * VL_code[i]; + } +} + +#endif /* LV_HAVE_GENERIC */ + +#endif /* INCLUDED_gnsssdr_volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_u_H */ + + +#ifndef INCLUDED_gnsssdr_volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_a_H +#define INCLUDED_gnsssdr_volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_a_H + +#include +#include +#include +#include +#include + +#ifdef LV_HAVE_AVX +#include +/*! 
+ \brief Performs the carrier wipe-off mixing and the VE, Early, Prompt, Late and VL correlation
+ \param input The input signal input
+ \param carrier The carrier signal input
+ \param VE_code VE PRN code replica input
+ \param E_code Early PRN code replica input
+ \param P_code Prompt PRN code replica input
+ \param L_code Late PRN code replica input
+ \param VL_code VL PRN code replica input
+ \param VE_out VE correlation output
+ \param E_out Early correlation output
+ \param P_out Prompt correlation output
+ \param L_out Late correlation output
+ \param VL_out VL correlation output
+ \param num_points The number of complex values in vectors
+
+ Every input buffer is read with _mm256_load_ps (aligned 256-bit load), so all
+ seven input pointers must be 32-byte aligned. Each output receives the sum over
+ all points of input[i] * carrier[i] * code[i].
+ */
+static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_a_avx(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* VE_code, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, const lv_32fc_t* VL_code, unsigned int num_points)
+{
+    unsigned int number = 0;
+    const unsigned int halfPoints = num_points / 4; // number of full AVX iterations; each one consumes 4 complex samples
+
+    lv_32fc_t dotProduct_VE;
+    lv_32fc_t dotProduct_E;
+    lv_32fc_t dotProduct_P;
+    lv_32fc_t dotProduct_L;
+    lv_32fc_t dotProduct_VL;
+
+    // Aux vars
+    __m256 x, y, yl, yh, z, tmp1, tmp2, z_VE, z_E, z_P, z_L, z_VL;
+    __m256 bb_signal_sample, bb_signal_sample_shuffled;
+
+    // Per-correlator vector accumulators (4 complex partial sums each)
+    z_VE = _mm256_setzero_ps();
+    z_E = _mm256_setzero_ps();
+    z_P = _mm256_setzero_ps();
+    z_L = _mm256_setzero_ps();
+    z_VL = _mm256_setzero_ps();
+
+    // Working copies of the input pointers; they are advanced as samples are consumed
+    const lv_32fc_t* _input = input;
+    const lv_32fc_t* _carrier = carrier;
+    const lv_32fc_t* _VE_code = VE_code;
+    const lv_32fc_t* _E_code = E_code;
+    const lv_32fc_t* _P_code = P_code;
+    const lv_32fc_t* _L_code = L_code;
+    const lv_32fc_t* _VL_code = VL_code;
+
+    for(;number < halfPoints; number++)
+        {
+            // carrier wipe-off (vector point-to-point product)
+            x = _mm256_load_ps((float*)_input); // Load the ar + ai, br + bi, ... as ar,ai,br,bi,...
+            y = _mm256_load_ps((float*)_carrier); // Load the cr + ci, dr + di, ... as cr,ci,dr,di,...
+
+            yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,...
+            yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,...
+
+            tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr,...
+
+            x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br,...
+
+            tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di,...
+
+            bb_signal_sample = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di,...
+            bb_signal_sample_shuffled = _mm256_shuffle_ps(bb_signal_sample,bb_signal_sample,0xB1); // Same products with re/im swapped; reused by all five correlators below
+
+            // correlation VE,E,P,L,VL (5x vector scalar product)
+            // Each correlator repeats the complex-multiply pattern above with
+            // bb_signal_sample in the role of x and the code replica in the role of y.
+            // VE
+            y = _mm256_load_ps((float*)_VE_code); // Load the code replica as cr,ci,dr,di,...
+
+            yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,...
+            yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,...
+
+            tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr,...
+            tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di,...
+
+            z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di,...
+            z_VE = _mm256_add_ps(z_VE, z); // Add the complex multiplication results together
+
+            // Early
+            y = _mm256_load_ps((float*)_E_code); // Load the code replica as cr,ci,dr,di,...
+
+            yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,...
+            yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,...
+
+            tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr,...
+            tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di,...
+
+            z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di,...
+            z_E = _mm256_add_ps(z_E, z); // Add the complex multiplication results together
+
+            // Prompt
+            y = _mm256_load_ps((float*)_P_code); // Load the code replica as cr,ci,dr,di,...
+
+            yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,...
+            yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,...
+
+            tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr,...
+            tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di,...
+
+            z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di,...
+            z_P = _mm256_add_ps(z_P, z); // Add the complex multiplication results together
+
+            // Late
+            y = _mm256_load_ps((float*)_L_code); // Load the code replica as cr,ci,dr,di,...
+
+            yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,...
+            yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,...
+
+            tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr,...
+            tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di,...
+
+            z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di,...
+            z_L = _mm256_add_ps(z_L, z); // Add the complex multiplication results together
+
+            // VL
+            y = _mm256_load_ps((float*)_VL_code); // Load the code replica as cr,ci,dr,di,...
+
+            yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,...
+            yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,...
+
+            tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr,...
+            tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di,...
+
+            z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di,...
+            z_VL = _mm256_add_ps(z_VL, z); // Add the complex multiplication results together
+
+            /*pointer increment*/
+            _carrier += 4;
+            _input += 4;
+            _VE_code += 4;
+            _E_code += 4;
+            _P_code += 4;
+            _L_code += 4;
+            _VL_code += 4;
+        }
+
+    // 32-byte aligned scratch arrays so the aligned stores below are valid
+    __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_VE[4];
+    __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_E[4];
+    __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_P[4];
+    __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_L[4];
+    __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_VL[4];
+
+    _mm256_store_ps((float*)dotProductVector_VE,z_VE); // Store the results back into the dot product vector
+    _mm256_store_ps((float*)dotProductVector_E,z_E); // Store the results back into the dot product vector
+    _mm256_store_ps((float*)dotProductVector_P,z_P); // Store the results back into the dot product vector
+    _mm256_store_ps((float*)dotProductVector_L,z_L); // Store the results back into the dot product vector
+    _mm256_store_ps((float*)dotProductVector_VL,z_VL); // Store the results back into the dot product vector
+
+    // Horizontal reduction: fold the 4 complex partial sums of each correlator into one value
+    dotProduct_VE = ( dotProductVector_VE[0] + dotProductVector_VE[1] + dotProductVector_VE[2] + dotProductVector_VE[3] );
+    dotProduct_E = ( dotProductVector_E[0] + dotProductVector_E[1] + dotProductVector_E[2] + dotProductVector_E[3] );
+    dotProduct_P = ( dotProductVector_P[0] + dotProductVector_P[1] + dotProductVector_P[2] + dotProductVector_P[3] );
+    dotProduct_L = ( dotProductVector_L[0] + dotProductVector_L[1] + dotProductVector_L[2] + dotProductVector_L[3] );
+    dotProduct_VL = ( dotProductVector_VL[0] + dotProductVector_VL[1] + dotProductVector_VL[2] + dotProductVector_VL[3] );
+
+    // Scalar tail: the up-to-3 samples left after the last full vector.
+    // _input and _carrier are advanced only in the last statement so that all
+    // five correlators see the same sample.
+    for (int i = 0; i<(num_points % 4); ++i)
+        {
+            dotProduct_VE += (*_input) * (*_VE_code++) * (*_carrier);
+            dotProduct_E += (*_input) * (*_E_code++) * (*_carrier);
+            dotProduct_P += (*_input) * (*_P_code++) * (*_carrier);
+            dotProduct_L += (*_input) * (*_L_code++) * (*_carrier);
+            dotProduct_VL += (*_input++) * (*_VL_code++) * (*_carrier++);
+        }
+
+    *VE_out = dotProduct_VE;
+    *E_out = dotProduct_E;
+    *P_out = dotProduct_P;
+    *L_out = dotProduct_L;
+    *VL_out = dotProduct_VL;
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_SSE3
+#include
+/*!
+ \brief Performs the carrier wipe-off mixing and the VE, Early, Prompt, Late and VL correlation
+ \param input The input signal input
+ \param carrier The carrier signal input
+ \param VE_code VE PRN code replica input
+ \param E_code Early PRN code replica input
+ \param P_code Prompt PRN code replica input
+ \param L_code Late PRN code replica input
+ \param VL_code VL PRN code replica input
+ \param VE_out VE correlation output
+ \param E_out Early correlation output
+ \param P_out Prompt correlation output
+ \param L_out Late correlation output
+ \param VL_out VL correlation output
+ \param num_points The number of complex values in vectors
+
+ Every input buffer is read with _mm_load_ps (aligned 128-bit load), so all
+ seven input pointers must be 16-byte aligned. Each output receives the sum over
+ all points of input[i] * carrier[i] * code[i].
+ */
+static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_a_sse3(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* VE_code, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, const lv_32fc_t* VL_code, unsigned int num_points)
+{
+    unsigned int number = 0;
+    const unsigned int halfPoints = num_points / 2; // number of full SSE iterations; each one consumes 2 complex samples
+
+    lv_32fc_t dotProduct_VE;
+    lv_32fc_t dotProduct_E;
+    lv_32fc_t dotProduct_P;
+    lv_32fc_t dotProduct_L;
+    lv_32fc_t dotProduct_VL;
+
+    // Aux vars
+    __m128 x, y, yl, yh, z, tmp1, tmp2, z_VE, z_E, z_P, z_L, z_VL;
+    __m128 bb_signal_sample, bb_signal_sample_shuffled;
+
+    // Per-correlator vector accumulators (2 complex partial sums each)
+    z_VE = _mm_setzero_ps();
+    z_E = _mm_setzero_ps();
+    z_P = _mm_setzero_ps();
+    z_L = _mm_setzero_ps();
+    z_VL = _mm_setzero_ps();
+
+    // Working copies of the input pointers; they are advanced as samples are consumed
+    const lv_32fc_t* _input = input;
+    const lv_32fc_t* _carrier = carrier;
+    const lv_32fc_t* _VE_code = VE_code;
+    const lv_32fc_t* _E_code = E_code;
+    const lv_32fc_t* _P_code = P_code;
+    const lv_32fc_t* _L_code = L_code;
+    const lv_32fc_t* _VL_code = VL_code;
+
+    for(;number < halfPoints; number++)
+        {
+            // carrier wipe-off (vector point-to-point product)
+            x = _mm_load_ps((float*)_input); // Load the ar + ai, br + bi as ar,ai,br,bi
+            y = _mm_load_ps((float*)_carrier); // Load the cr + ci, dr + di as cr,ci,dr,di
+
+            yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+            yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
+
+            tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+
+            x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
+
+            tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+            bb_signal_sample = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+            bb_signal_sample_shuffled = _mm_shuffle_ps(bb_signal_sample,bb_signal_sample,0xB1); // Same products with re/im swapped; reused by all five correlators below
+
+            // correlation VE,E,P,L,VL (5x vector scalar product)
+            // Each correlator repeats the complex-multiply pattern above with
+            // bb_signal_sample in the role of x and the code replica in the role of y.
+            // VE
+            y = _mm_load_ps((float*)_VE_code); // Load the cr + ci, dr + di as cr,ci,dr,di
+
+            yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+            yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
+
+            tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+            tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+            z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+            z_VE = _mm_add_ps(z_VE, z); // Add the complex multiplication results together
+
+            // Early
+            y = _mm_load_ps((float*)_E_code); // Load the cr + ci, dr + di as cr,ci,dr,di
+
+            yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+            yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
+
+            tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+            tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+            z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+            z_E = _mm_add_ps(z_E, z); // Add the complex multiplication results together
+
+            // Prompt
+            y = _mm_load_ps((float*)_P_code); // Load the cr + ci, dr + di as cr,ci,dr,di
+
+            yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+            yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
+
+            tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+            tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+            z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+            z_P = _mm_add_ps(z_P, z); // Add the complex multiplication results together
+
+            // Late
+            y = _mm_load_ps((float*)_L_code); // Load the cr + ci, dr + di as cr,ci,dr,di
+
+            yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+            yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
+
+            tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+            tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+            z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+            z_L = _mm_add_ps(z_L, z); // Add the complex multiplication results together
+
+            // VL
+            y = _mm_load_ps((float*)_VL_code); // Load the cr + ci, dr + di as cr,ci,dr,di
+
+            yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
+            yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
+
+            tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+            tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+            z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+            z_VL = _mm_add_ps(z_VL, z); // Add the complex multiplication results together
+
+            /*pointer increment*/
+            _carrier += 2;
+            _input += 2;
+            _VE_code += 2;
+            _E_code += 2;
+            _P_code += 2;
+            _L_code +=2;
+            _VL_code +=2;
+        }
+
+    // 16-byte aligned scratch arrays so the aligned stores below are valid
+    __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_VE[2];
+    __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_E[2];
+    __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_P[2];
+    __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_L[2];
+    __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_VL[2];
+
+    _mm_store_ps((float*)dotProductVector_VE,z_VE); // Store the results back into the dot product vector
+    _mm_store_ps((float*)dotProductVector_E,z_E); // Store the results back into the dot product vector
+    _mm_store_ps((float*)dotProductVector_P,z_P); // Store the results back into the dot product vector
+    _mm_store_ps((float*)dotProductVector_L,z_L); // Store the results back into the dot product vector
+    _mm_store_ps((float*)dotProductVector_VL,z_VL); // Store the results back into the dot product vector
+
+    // Horizontal reduction: fold the 2 complex partial sums of each correlator into one value
+    dotProduct_VE = ( dotProductVector_VE[0] + dotProductVector_VE[1] );
+    dotProduct_E = ( dotProductVector_E[0] + dotProductVector_E[1] );
+    dotProduct_P = ( dotProductVector_P[0] + dotProductVector_P[1] );
+    dotProduct_L = ( dotProductVector_L[0] + dotProductVector_L[1] );
+    dotProduct_VL = ( dotProductVector_VL[0] + dotProductVector_VL[1] );
+
+    // Scalar tail: the single leftover sample when num_points is odd
+    if((num_points % 2) != 0)
+        {
+            dotProduct_VE += (*_input) * (*_VE_code)*(*_carrier);
+            dotProduct_E += (*_input) * (*_E_code)*(*_carrier);
+            dotProduct_P += (*_input) * (*_P_code)*(*_carrier);
+            dotProduct_L += (*_input) * (*_L_code)*(*_carrier);
+            dotProduct_VL += (*_input) * (*_VL_code)*(*_carrier);
+        }
+
+    *VE_out = dotProduct_VE;
+    *E_out = dotProduct_E;
+    *P_out = dotProduct_P;
+    *L_out = dotProduct_L;
+    *VL_out = dotProduct_VL;
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Performs the carrier wipe-off mixing and the VE, Early, Prompt, Late and VL correlation + \param input The input signal input + \param carrier The carrier signal input + \param VE_code VE PRN code replica input + \param E_code Early PRN code replica input + \param P_code Early PRN code replica input + \param L_code Early PRN code replica input + \param VL_code VL PRN code replica input + \param VE_out VE correlation output + \param E_out Early correlation output + \param P_out Early correlation output + \param L_out Early correlation output + \param VL_out VL correlation output + \param num_points The number of complex values in vectors + */ +static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_a_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* VE_code, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, const lv_32fc_t* VL_code, unsigned int num_points) +{ + lv_32fc_t bb_signal_sample; + + bb_signal_sample = lv_cmake(0, 0); + + *VE_out = 0; + *E_out = 0; + *P_out = 0; + *L_out = 0; + *VL_out = 0; + // perform Early, Prompt and Late correlation + for(int i=0; i < num_points; ++i) + { + //Perform the carrier wipe-off + bb_signal_sample = input[i] * carrier[i]; + // Now get early, late, and prompt values for each + *VE_out += bb_signal_sample * VE_code[i]; + *E_out += bb_signal_sample * E_code[i]; + *P_out += bb_signal_sample * P_code[i]; + *L_out += bb_signal_sample * L_code[i]; + *VL_out += bb_signal_sample * VL_code[i]; + } +} +#endif /* LV_HAVE_GENERIC */ +#endif /* INCLUDED_gnsssdr_volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_a_H */ diff --git a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_64f_accumulator_64f.h b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_64f_accumulator_64f.h new file mode 100644 index 000000000..6acb25390 --- /dev/null +++ 
b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_64f_accumulator_64f.h @@ -0,0 +1,243 @@ +/*! + * \file volk_gnsssdr_64f_accumulator_64f.h + * \brief Volk protokernel: 64 bits (double) scalar accumulator + * \authors
+ *          <ul>
+ *          <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+ *          </ul>
+ *
+ * Volk protokernel that implements an accumulator of double values
+ *
+ * -------------------------------------------------------------------------
+ *
+ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+ *
+ * GNSS-SDR is a software defined Global Navigation
+ * Satellite Systems receiver
+ *
+ * This file is part of GNSS-SDR.
+ *
+ * GNSS-SDR is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * GNSS-SDR is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * -------------------------------------------------------------------------
+ */
+
+#ifndef INCLUDED_volk_gnsssdr_64f_accumulator_64f_u_H
+#define INCLUDED_volk_gnsssdr_64f_accumulator_64f_u_H
+
+#include
+#include
+#include
+
+#ifdef LV_HAVE_AVX
+#include
+/*!
+ \brief Accumulates the values in the input buffer + \param result The accumulated result + \param inputBuffer The buffer of data to be accumulated + \param num_points The number of values in inputBuffer to be accumulated + */ +static inline void volk_gnsssdr_64f_accumulator_64f_u_avx(double* result,const double* inputBuffer, unsigned int num_points){ + double returnValue = 0; + const unsigned int sse_iters = num_points / 4; + + const double* aPtr = inputBuffer; + + __VOLK_ATTR_ALIGNED(32) double tempBuffer[4]; + __m256d accumulator = _mm256_setzero_pd(); + __m256d aVal = _mm256_setzero_pd(); + + for(unsigned int number = 0; number < sse_iters; number++) + { + aVal = _mm256_loadu_pd(aPtr); + accumulator = _mm256_add_pd(accumulator, aVal); + aPtr += 4; + } + + _mm256_storeu_pd((double*)tempBuffer,accumulator); + + for(int i = 0; i<4; ++i){ + returnValue += tempBuffer[i]; + } + + for(int i = 0; i<(num_points % 4); ++i){ + returnValue += (*aPtr++); + } + + *result = returnValue; +} +#endif /* LV_HAVE_AVX */ + +#ifdef LV_HAVE_SSE3 +#include +/*! 
+ \brief Accumulates the values in the input buffer + \param result The accumulated result + \param inputBuffer The buffer of data to be accumulated + \param num_points The number of values in inputBuffer to be accumulated + */ +static inline void volk_gnsssdr_64f_accumulator_64f_u_sse3(double* result,const double* inputBuffer, unsigned int num_points){ + double returnValue = 0; + const unsigned int sse_iters = num_points / 2; + + const double* aPtr = inputBuffer; + + __VOLK_ATTR_ALIGNED(16) double tempBuffer[2]; + __m128d accumulator = _mm_setzero_pd(); + __m128d aVal = _mm_setzero_pd(); + + for(unsigned int number = 0; number < sse_iters; number++) + { + aVal = _mm_loadu_pd(aPtr); + accumulator = _mm_add_pd(accumulator, aVal); + aPtr += 2; + } + + _mm_storeu_pd((double*)tempBuffer,accumulator); + + for(int i = 0; i<2; ++i){ + returnValue += tempBuffer[i]; + } + + for(int i = 0; i<(num_points % 2); ++i){ + returnValue += (*aPtr++); + } + + *result = returnValue; +} +#endif /* LV_HAVE_SSE3 */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Accumulates the values in the input buffer + \param result The accumulated result + \param inputBuffer The buffer of data to be accumulated + \param num_points The number of values in inputBuffer to be accumulated + */ +static inline void volk_gnsssdr_64f_accumulator_64f_generic(double* result,const double* inputBuffer, unsigned int num_points){ + const double* aPtr = inputBuffer; + double returnValue = 0; + + for(unsigned int number = 0;number < num_points; number++){ + returnValue += (*aPtr++); + } + *result = returnValue; +} +#endif /* LV_HAVE_GENERIC */ + +#endif /* INCLUDED_volk_gnsssdr_64f_accumulator_64f_u_H */ + + +#ifndef INCLUDED_volk_gnsssdr_64f_accumulator_64f_a_H +#define INCLUDED_volk_gnsssdr_64f_accumulator_64f_a_H + +#include +#include +#include + +#ifdef LV_HAVE_AVX +#include +/*! 
+ \brief Accumulates the values in the input buffer + \param result The accumulated result + \param inputBuffer The buffer of data to be accumulated + \param num_points The number of values in inputBuffer to be accumulated + */ +static inline void volk_gnsssdr_64f_accumulator_64f_a_avx(double* result,const double* inputBuffer, unsigned int num_points){ + double returnValue = 0; + const unsigned int sse_iters = num_points / 4; + + const double* aPtr = inputBuffer; + + __VOLK_ATTR_ALIGNED(32) double tempBuffer[4]; + __m256d accumulator = _mm256_setzero_pd(); + __m256d aVal = _mm256_setzero_pd(); + + for(unsigned int number = 0; number < sse_iters; number++) + { + aVal = _mm256_load_pd(aPtr); + accumulator = _mm256_add_pd(accumulator, aVal); + aPtr += 4; + } + + _mm256_store_pd((double*)tempBuffer,accumulator); + + for(int i = 0; i<4; ++i){ + returnValue += tempBuffer[i]; + } + + for(int i = 0; i<(num_points % 4); ++i){ + returnValue += (*aPtr++); + } + + *result = returnValue; +} +#endif /* LV_HAVE_AVX */ + +#ifdef LV_HAVE_SSE3 +#include +/*! 
+ \brief Accumulates the values in the input buffer + \param result The accumulated result + \param inputBuffer The buffer of data to be accumulated + \param num_points The number of values in inputBuffer to be accumulated + */ +static inline void volk_gnsssdr_64f_accumulator_64f_a_sse3(double* result,const double* inputBuffer, unsigned int num_points){ + double returnValue = 0; + const unsigned int sse_iters = num_points / 2; + + const double* aPtr = inputBuffer; + + __VOLK_ATTR_ALIGNED(16) double tempBuffer[2]; + __m128d accumulator = _mm_setzero_pd(); + __m128d aVal = _mm_setzero_pd(); + + for(unsigned int number = 0; number < sse_iters; number++) + { + aVal = _mm_load_pd(aPtr); + accumulator = _mm_add_pd(accumulator, aVal); + aPtr += 2; + } + + _mm_store_pd((double*)tempBuffer,accumulator); + + for(int i = 0; i<2; ++i){ + returnValue += tempBuffer[i]; + } + + for(int i = 0; i<(num_points % 2); ++i){ + returnValue += (*aPtr++); + } + + *result = returnValue; +} +#endif /* LV_HAVE_SSE3 */ + +#ifdef LV_HAVE_GENERIC +/*! 
+ \brief Accumulates the values in the input buffer + \param result The accumulated result + \param inputBuffer The buffer of data to be accumulated + \param num_points The number of values in inputBuffer to be accumulated + */ +static inline void volk_gnsssdr_64f_accumulator_64f_a_generic(double* result,const double* inputBuffer, unsigned int num_points){ + const double* aPtr = inputBuffer; + double returnValue = 0; + + for(unsigned int number = 0;number < num_points; number++){ + returnValue += (*aPtr++); + } + *result = returnValue; +} +#endif /* LV_HAVE_GENERIC */ +#endif /* INCLUDED_volk_gnsssdr_64f_accumulator_64f_a_H */ \ No newline at end of file diff --git a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_accumulator_s8i.h b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_accumulator_s8i.h new file mode 100644 index 000000000..c9079b652 --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_accumulator_s8i.h @@ -0,0 +1,183 @@ +/*! + * \file volk_gnsssdr_8i_accumulator_s8i.h + * \brief Volk protokernel: 8 bits (char) scalar accumulator + * \authors
+ *          <ul>
+ *          <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+ *          </ul>
+ *
+ * Volk protokernel that implements an accumulator of char values
+ *
+ * -------------------------------------------------------------------------
+ *
+ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+ *
+ * GNSS-SDR is a software defined Global Navigation
+ * Satellite Systems receiver
+ *
+ * This file is part of GNSS-SDR.
+ *
+ * GNSS-SDR is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * GNSS-SDR is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * -------------------------------------------------------------------------
+ */
+
+#ifndef INCLUDED_volk_gnsssdr_8i_accumulator_s8i_u_H
+#define INCLUDED_volk_gnsssdr_8i_accumulator_s8i_u_H
+
+#include
+#include
+#include
+
+#ifdef LV_HAVE_SSE3
+#include
+/*!
+ \brief Accumulates the values in the input buffer + \param result The accumulated result + \param inputBuffer The buffer of data to be accumulated + \param num_points The number of values in inputBuffer to be accumulated + */ +static inline void volk_gnsssdr_8i_accumulator_s8i_u_sse3(char* result, const char* inputBuffer, unsigned int num_points){ + char returnValue = 0; + const unsigned int sse_iters = num_points / 16; + + const char* aPtr = inputBuffer; + + __VOLK_ATTR_ALIGNED(16) char tempBuffer[16]; + __m128i accumulator = _mm_setzero_si128(); + __m128i aVal = _mm_setzero_si128(); + + for(unsigned int number = 0; number < sse_iters; number++){ + aVal = _mm_lddqu_si128((__m128i*)aPtr); + accumulator = _mm_add_epi8(accumulator, aVal); + aPtr += 16; + } + _mm_storeu_si128((__m128i*)tempBuffer,accumulator); + + for(int i = 0; i<16; ++i){ + returnValue += tempBuffer[i]; + } + + for(int i = 0; i<(num_points % 16); ++i){ + returnValue += (*aPtr++); + } + + *result = returnValue; +} +#endif /* LV_HAVE_SSE3 */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Accumulates the values in the input buffer + \param result The accumulated result + \param inputBuffer The buffer of data to be accumulated + \param num_points The number of values in inputBuffer to be accumulated + */ +static inline void volk_gnsssdr_8i_accumulator_s8i_generic(char* result, const char* inputBuffer, unsigned int num_points){ + const char* aPtr = inputBuffer; + char returnValue = 0; + + for(unsigned int number = 0;number < num_points; number++){ + returnValue += (*aPtr++); + } + *result = returnValue; +} +#endif /* LV_HAVE_GENERIC */ + +#endif /* INCLUDED_volk_gnsssdr_8i_accumulator_s8i_u_H */ + + +#ifndef INCLUDED_volk_gnsssdr_8i_accumulator_s8i_a_H +#define INCLUDED_volk_gnsssdr_8i_accumulator_s8i_a_H + +#include +#include +#include + +#ifdef LV_HAVE_SSE3 +#include +/*! 
+ \brief Accumulates the values in the input buffer + \param result The accumulated result + \param inputBuffer The buffer of data to be accumulated + \param num_points The number of values in inputBuffer to be accumulated + */ +static inline void volk_gnsssdr_8i_accumulator_s8i_a_sse3(char* result, const char* inputBuffer, unsigned int num_points){ + char returnValue = 0; + const unsigned int sse_iters = num_points / 16; + + const char* aPtr = inputBuffer; + + __VOLK_ATTR_ALIGNED(16) char tempBuffer[16]; + __m128i accumulator = _mm_setzero_si128(); + __m128i aVal = _mm_setzero_si128(); + + for(unsigned int number = 0; number < sse_iters; number++){ + aVal = _mm_load_si128((__m128i*)aPtr); + accumulator = _mm_add_epi8(accumulator, aVal); + aPtr += 16; + } + _mm_store_si128((__m128i*)tempBuffer,accumulator); + + for(int i = 0; i<16; ++i){ + returnValue += tempBuffer[i]; + } + + for(int i = 0; i<(num_points % 16); ++i){ + returnValue += (*aPtr++); + } + + *result = returnValue; +} +#endif /* LV_HAVE_SSE3 */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Accumulates the values in the input buffer + \param result The accumulated result + \param inputBuffer The buffer of data to be accumulated + \param num_points The number of values in inputBuffer to be accumulated + */ +static inline void volk_gnsssdr_8i_accumulator_s8i_a_generic(char* result, const char* inputBuffer, unsigned int num_points){ + const char* aPtr = inputBuffer; + char returnValue = 0; + + for(unsigned int number = 0;number < num_points; number++){ + returnValue += (*aPtr++); + } + *result = returnValue; +} +#endif /* LV_HAVE_GENERIC */ + +#ifdef LV_HAVE_ORC +/*! 
+ \brief Accumulates the values in the input buffer
+ \param result The accumulated result
+ \param inputBuffer The buffer of data to be accumulated
+ \param num_points The number of values in inputBuffer to be accumulated
+
+ Thin wrapper: the external ORC routine writes a 16-bit result, and this
+ function copies one byte of it into *result.
+ NOTE(review): this wrapper is named _u_orc but sits in the _a_H section and
+ calls the _a_orc_impl routine - confirm the intended naming.
+ */
+extern void volk_gnsssdr_8i_accumulator_s8i_a_orc_impl(short* result, const char* inputBuffer, unsigned int num_points);
+static inline void volk_gnsssdr_8i_accumulator_s8i_u_orc(char* result, const char* inputBuffer, unsigned int num_points){
+
+    short res = 0;
+    // resc addresses the second byte (offset 1) of the 16-bit result.
+    // NOTE(review): on a little-endian target that is the high-order byte, so
+    // the value returned is bits 8..15 of res - confirm this matches the
+    // output layout of the ORC program.
+    char* resc = (char*)&res;
+    resc++;
+
+    volk_gnsssdr_8i_accumulator_s8i_a_orc_impl(&res, inputBuffer, num_points);
+
+    *result = *resc;
+}
+#endif /* LV_HAVE_ORC */
+
+#endif /* INCLUDED_volk_gnsssdr_8i_accumulator_s8i_a_H */
+
diff --git a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_index_max_16u.h b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_index_max_16u.h
new file mode 100644
index 000000000..0bb85a1dc
--- /dev/null
+++ b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_index_max_16u.h
@@ -0,0 +1,493 @@
+/*!
+ * \file volk_gnsssdr_8i_index_max_16u.h
+ * \brief Volk protokernel: calculates the index of the maximum value in a group of 8 bits (char) scalars
+ * \authors
+ *          <ul>
+ *          <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+ *          </ul>
+ *
+ * Volk protokernel that returns the index of the maximum value of a group of 8 bits (char) scalars
+ *
+ * -------------------------------------------------------------------------
+ *
+ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+ *
+ * GNSS-SDR is a software defined Global Navigation
+ * Satellite Systems receiver
+ *
+ * This file is part of GNSS-SDR.
+ *
+ * GNSS-SDR is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * GNSS-SDR is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * -------------------------------------------------------------------------
+ */
+
+#ifndef INCLUDED_volk_gnsssdr_8i_index_max_16u_u_H
+#define INCLUDED_volk_gnsssdr_8i_index_max_16u_u_H
+
+#include
+#include
+#include
+
+#ifdef LV_HAVE_AVX
+#include "immintrin.h"
+/*!
+ \brief Returns the index of the max value in src0 + \param target The index of the max value in src0 + \param src0 The buffer of data to be analysed + \param num_points The number of values in src0 to be analysed + */ +static inline void volk_gnsssdr_8i_index_max_16u_u_avx(unsigned int* target, const char* src0, unsigned int num_points) { + if(num_points > 0){ + const unsigned int sse_iters = num_points / 32; + + char* basePtr = (char*)src0; + char* inputPtr = (char*)src0; + char max = src0[0]; + unsigned int index = 0; + __VOLK_ATTR_ALIGNED(32) char currentValuesBuffer[32]; + __m256i ones, compareResults, currentValues; + __m128i compareResultslo, compareResultshi, maxValues, lo, hi; + + ones = _mm256_set1_epi8(0xFF); + maxValues = _mm_set1_epi8(max); + + for(unsigned int number = 0; number < sse_iters; number++) + { + currentValues = _mm256_lddqu_si256((__m256i*)inputPtr); + + lo = _mm256_castsi256_si128(currentValues); + hi = _mm256_extractf128_si256(currentValues,1); + + compareResultslo = _mm_cmpgt_epi8(maxValues, lo); + compareResultshi = _mm_cmpgt_epi8(maxValues, hi); + + //compareResults = _mm256_set_m128i(compareResultshi , compareResultslo); //not defined in some versions of immintrin.h + compareResults = _mm256_insertf128_si256(_mm256_castsi128_si256(compareResultslo),(compareResultshi),1); + + if (!_mm256_testc_si256(compareResults, ones)) + { + _mm256_storeu_si256((__m256i*)¤tValuesBuffer, currentValues); + + for(int i = 0; i < 32; i++) + { + if(currentValuesBuffer[i] > max) + { + index = inputPtr - basePtr + i; + max = currentValuesBuffer[i]; + } + } + maxValues = _mm_set1_epi8(max); + } + + inputPtr += 32; + } + + for(int i = 0; i<(num_points % 32); ++i) + { + if(src0[i] > max) + { + index = i; + max = src0[i]; + } + } + target[0] = index; + } +} + +#endif /*LV_HAVE_AVX*/ + +#ifdef LV_HAVE_SSE4_1 +#include +/*! 
+ \brief Returns the index of the max value in src0 + \param target The index of the max value in src0 + \param src0 The buffer of data to be analysed + \param num_points The number of values in src0 to be analysed + */ +static inline void volk_gnsssdr_8i_index_max_16u_u_sse4_1(unsigned int* target, const char* src0, unsigned int num_points) { + if(num_points > 0){ + const unsigned int sse_iters = num_points / 16; + + char* basePtr = (char*)src0; + char* inputPtr = (char*)src0; + char max = src0[0]; + unsigned int index = 0; + __VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16]; + __m128i maxValues, compareResults, currentValues; + + maxValues = _mm_set1_epi8(max); + + for(unsigned int number = 0; number < sse_iters; number++) + { + currentValues = _mm_lddqu_si128((__m128i*)inputPtr); + + compareResults = _mm_cmpgt_epi8(maxValues, currentValues); + + if (!_mm_test_all_ones(compareResults)) + { + _mm_storeu_si128((__m128i*)¤tValuesBuffer, currentValues); + + for(int i = 0; i < 16; i++) + { + if(currentValuesBuffer[i] > max) + { + index = inputPtr - basePtr + i; + max = currentValuesBuffer[i]; + } + } + maxValues = _mm_set1_epi8(max); + } + + inputPtr += 16; + } + + for(int i = 0; i<(num_points % 16); ++i) + { + if(src0[i] > max) + { + index = i; + max = src0[i]; + } + } + target[0] = index; + } +} + +#endif /*LV_HAVE_SSE4_1*/ + +#ifdef LV_HAVE_SSE2 +#include +/*! 
+ \brief Returns the index of the max value in src0 + \param target The index of the max value in src0 + \param src0 The buffer of data to be analysed + \param num_points The number of values in src0 to be analysed + */ +static inline void volk_gnsssdr_8i_index_max_16u_u_sse2(unsigned int* target, const char* src0, unsigned int num_points) { + if(num_points > 0){ + const unsigned int sse_iters = num_points / 16; + + char* basePtr = (char*)src0; + char* inputPtr = (char*)src0; + char max = src0[0]; + unsigned int index = 0; + unsigned short mask; + __VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16]; + __m128i maxValues, compareResults, currentValues; + + maxValues = _mm_set1_epi8(max); + + for(unsigned int number = 0; number < sse_iters; number++) + { + currentValues = _mm_loadu_si128((__m128i*)inputPtr); + compareResults = _mm_cmpgt_epi8(maxValues, currentValues); + mask = _mm_movemask_epi8(compareResults); + + if (mask != 0xFFFF) + { + _mm_storeu_si128((__m128i*)¤tValuesBuffer, currentValues); + mask = ~mask; + int i = 0; + while (mask > 0) + { + if ((mask & 1) == 1) + { + if(currentValuesBuffer[i] > max) + { + index = inputPtr - basePtr + i; + max = currentValuesBuffer[i]; + } + } + i++; + mask >>= 1; + } + maxValues = _mm_set1_epi8(max); + } + inputPtr += 16; + } + + for(int i = 0; i<(num_points % 16); ++i) + { + if(src0[i] > max) + { + index = i; + max = src0[i]; + } + } + target[0] = index; + } +} + +#endif /*LV_HAVE_SSE2*/ + +#ifdef LV_HAVE_GENERIC +/*! 
+ \brief Returns the index of the max value in src0
+ \param target The index of the max value in src0
+ \param src0 The buffer of data to be analysed
+ \param num_points The number of values in src0 to be analysed
+
+ Portable reference version. Returns the first index holding the maximum;
+ if num_points is 0 nothing is written to target.
+ */
+static inline void volk_gnsssdr_8i_index_max_16u_generic(unsigned int* target, const char* src0, unsigned int num_points) {
+
+    if(num_points > 0)
+    {
+        char max = src0[0];
+        unsigned int index = 0;
+
+        for(unsigned int i = 1; i < num_points; ++i)
+        {
+            if(src0[i] > max)
+            {
+                index = i;
+                max = src0[i];
+            }
+        }
+        target[0] = index;
+    }
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+#endif /*INCLUDED_volk_gnsssdr_8i_index_max_16u_u_H*/
+
+
+#ifndef INCLUDED_volk_gnsssdr_8i_index_max_16u_a_H
+#define INCLUDED_volk_gnsssdr_8i_index_max_16u_a_H
+
+/* NOTE(review): the three header names below were lost in extraction and are
+   reconstructed from usage (__VOLK_ATTR_ALIGNED); confirm against upstream. */
+#include <volk_gnsssdr/volk_gnsssdr_common.h>
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_AVX
+#include "immintrin.h"
+/*!
+ \brief Returns the index of the max value in src0
+ \param target The index of the max value in src0
+ \param src0 The buffer of data to be analysed
+ \param num_points The number of values in src0 to be analysed
+
+ Aligned AVX version: src0 is read with _mm256_load_si256, which requires a
+ 32-byte aligned address. Returns the first index holding the maximum; if
+ num_points is 0 nothing is written to target.
+ */
+static inline void volk_gnsssdr_8i_index_max_16u_a_avx(unsigned int* target, const char* src0, unsigned int num_points) {
+    if(num_points > 0){
+        const unsigned int sse_iters = num_points / 32;
+
+        char* basePtr = (char*)src0;
+        char* inputPtr = (char*)src0;
+        char max = src0[0];
+        unsigned int index = 0;
+        __VOLK_ATTR_ALIGNED(32) char currentValuesBuffer[32];
+        __m256i ones, compareResults, currentValues;
+        __m128i compareResultslo, compareResultshi, maxValues, lo, hi;
+
+        ones = _mm256_set1_epi8(0xFF);
+        maxValues = _mm_set1_epi8(max);
+
+        for(unsigned int number = 0; number < sse_iters; number++)
+        {
+            currentValues = _mm256_load_si256((__m256i*)inputPtr);
+
+            // The byte compare is done on the two 128-bit halves
+            lo = _mm256_castsi256_si128(currentValues);
+            hi = _mm256_extractf128_si256(currentValues,1);
+
+            compareResultslo = _mm_cmpgt_epi8(maxValues, lo);
+            compareResultshi = _mm_cmpgt_epi8(maxValues, hi);
+
+            // _mm256_set_m128i is not defined in some versions of immintrin.h,
+            // so the halves are recombined with cast + insertf128 instead.
+            compareResults = _mm256_insertf128_si256(_mm256_castsi128_si256(compareResultslo),(compareResultshi),1);
+
+            // Scan the block element by element only if some lane is >= max
+            if (!_mm256_testc_si256(compareResults, ones))
+            {
+                _mm256_store_si256((__m256i*)&currentValuesBuffer, currentValues);
+
+                for(int i = 0; i < 32; i++)
+                {
+                    if(currentValuesBuffer[i] > max)
+                    {
+                        index = inputPtr - basePtr + i;
+                        max = currentValuesBuffer[i];
+                    }
+                }
+                maxValues = _mm_set1_epi8(max);
+            }
+
+            inputPtr += 32;
+        }
+
+        // Scalar tail: the elements that follow the last full 32-value block
+        // (the previous code re-scanned the first num_points % 32 elements).
+        for(unsigned int i = sse_iters * 32; i < num_points; ++i)
+        {
+            if(src0[i] > max)
+            {
+                index = i;
+                max = src0[i];
+            }
+        }
+        target[0] = index;
+    }
+}
+
+#endif /*LV_HAVE_AVX*/
+
+#ifdef LV_HAVE_SSE4_1
+#include "smmintrin.h"
+#include "emmintrin.h"
+/*!
+ \brief Returns the index of the max value in src0
+ \param target The index of the max value in src0
+ \param src0 The buffer of data to be analysed
+ \param num_points The number of values in src0 to be analysed
+
+ Aligned SSE4.1 version: src0 is read with _mm_load_si128, which requires a
+ 16-byte aligned address. Returns the first index holding the maximum; if
+ num_points is 0 nothing is written to target.
+ */
+static inline void volk_gnsssdr_8i_index_max_16u_a_sse4_1(unsigned int* target, const char* src0, unsigned int num_points) {
+    if(num_points > 0){
+        const unsigned int sse_iters = num_points / 16;
+
+        char* basePtr = (char*)src0;
+        char* inputPtr = (char*)src0;
+        char max = src0[0];
+        unsigned int index = 0;
+        __VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16];
+        __m128i maxValues, compareResults, currentValues;
+
+        maxValues = _mm_set1_epi8(max);
+
+        for(unsigned int number = 0; number < sse_iters; number++)
+        {
+            currentValues = _mm_load_si128((__m128i*)inputPtr);
+
+            compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
+
+            // Scan the block element by element only if some lane is >= max
+            if (!_mm_test_all_ones(compareResults))
+            {
+                _mm_store_si128((__m128i*)&currentValuesBuffer, currentValues);
+
+                for(int i = 0; i < 16; i++)
+                {
+                    if(currentValuesBuffer[i] > max)
+                    {
+                        index = inputPtr - basePtr + i;
+                        max = currentValuesBuffer[i];
+                    }
+                }
+                maxValues = _mm_set1_epi8(max);
+            }
+
+            inputPtr += 16;
+        }
+
+        // Scalar tail: the elements that follow the last full 16-value block
+        // (the previous code re-scanned the first num_points % 16 elements).
+        for(unsigned int i = sse_iters * 16; i < num_points; ++i)
+        {
+            if(src0[i] > max)
+            {
+                index = i;
+                max = src0[i];
+            }
+        }
+        target[0] = index;
+    }
+}
+
+#endif /*LV_HAVE_SSE4_1*/
+
+#ifdef LV_HAVE_SSE2
+#include "emmintrin.h"
+/*!
+ \brief Returns the index of the max value in src0
+ \param target The index of the max value in src0
+ \param src0 The buffer of data to be analysed
+ \param num_points The number of values in src0 to be analysed
+
+ Aligned SSE2 version: src0 is read with _mm_load_si128, which requires a
+ 16-byte aligned address. Returns the first index holding the maximum; if
+ num_points is 0 nothing is written to target.
+ */
+static inline void volk_gnsssdr_8i_index_max_16u_a_sse2(unsigned int* target, const char* src0, unsigned int num_points) {
+    if(num_points > 0){
+        const unsigned int sse_iters = num_points / 16;
+
+        char* basePtr = (char*)src0;
+        char* inputPtr = (char*)src0;
+        char max = src0[0];
+        unsigned int index = 0;
+        unsigned short mask;
+        __VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16];
+        __m128i maxValues, compareResults, currentValues;
+
+        maxValues = _mm_set1_epi8(max);
+
+        for(unsigned int number = 0; number < sse_iters; number++)
+        {
+            currentValues = _mm_load_si128((__m128i*)inputPtr);
+            compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
+            // One bit per lane: set when the running max is strictly greater
+            mask = _mm_movemask_epi8(compareResults);
+
+            if (mask != 0xFFFF)
+            {
+                _mm_store_si128((__m128i*)&currentValuesBuffer, currentValues);
+                // After inversion a set bit marks a lane that is >= max
+                mask = ~mask;
+                int i = 0;
+                while (mask > 0)
+                {
+                    if ((mask & 1) == 1)
+                    {
+                        if(currentValuesBuffer[i] > max)
+                        {
+                            index = inputPtr - basePtr + i;
+                            max = currentValuesBuffer[i];
+                        }
+                    }
+                    i++;
+                    mask >>= 1;
+                }
+                maxValues = _mm_set1_epi8(max);
+            }
+            inputPtr += 16;
+        }
+
+        // Scalar tail: the elements that follow the last full 16-value block
+        // (the previous code re-scanned the first num_points % 16 elements).
+        for(unsigned int i = sse_iters * 16; i < num_points; ++i)
+        {
+            if(src0[i] > max)
+            {
+                index = i;
+                max = src0[i];
+            }
+        }
+        target[0] = index;
+    }
+}
+
+#endif /*LV_HAVE_SSE2*/
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Returns the index of the max value in src0
+ \param target The index of the max value in src0
+ \param src0 The buffer of data to be analysed
+ \param num_points The number of values in src0 to be analysed
+
+ Portable reference version. The first index holding the maximum wins;
+ when num_points is 0 target is left untouched.
+ */
+static inline void volk_gnsssdr_8i_index_max_16u_a_generic(unsigned int* target, const char* src0, unsigned int num_points) {
+    if(num_points == 0)
+    {
+        return;
+    }
+
+    // 'best' always designates the largest value seen so far
+    unsigned int best = 0;
+    unsigned int pos = 1;
+    while(pos < num_points)
+    {
+        if(src0[pos] > src0[best])
+        {
+            best = pos;
+        }
+        ++pos;
+    }
+    target[0] = best;
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+#endif /*INCLUDED_volk_gnsssdr_8i_index_max_16u_a_H*/
diff --git a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_max_s8i.h b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_max_s8i.h
new file mode 100644
index 000000000..ef362fd57
--- /dev/null
+++ b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_max_s8i.h
@@ -0,0 +1,327 @@
+/*!
+ * \file volk_gnsssdr_8i_max_s8i.h
+ * \brief Volk protokernel: calculates the maximum value in a group of 8 bits (char) scalars
+ * \authors
+ *          <ul>
+ *          <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+ *          </ul>
+ *
+ * Volk protokernel that returns the maximum value of a group of 8 bits (char) scalars
+ *
+ * -------------------------------------------------------------------------
+ *
+ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+ *
+ * GNSS-SDR is a software defined Global Navigation
+ * Satellite Systems receiver
+ *
+ * This file is part of GNSS-SDR.
+ *
+ * GNSS-SDR is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * GNSS-SDR is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * -------------------------------------------------------------------------
+ */
+
+#ifndef INCLUDED_volk_gnsssdr_8i_max_s8i_u_H
+#define INCLUDED_volk_gnsssdr_8i_max_s8i_u_H
+
+/* NOTE(review): the three header names below were lost in extraction and are
+   reconstructed from usage (__VOLK_ATTR_ALIGNED); confirm against upstream. */
+#include <volk_gnsssdr/volk_gnsssdr_common.h>
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+/*!
+ \brief Returns the max value in src0
+ \param target The max value in src0
+ \param src0 The buffer of data to be analysed
+ \param num_points The number of values in src0 to be analysed
+
+ Unaligned SSE4.1 version (src0 is read with unaligned loads).
+ NOTE(review): target is received by value, so the result assigned at the end
+ never reaches the caller; it presumably has to become a pointer. That is an
+ interface change and is left untouched here.
+ */
+static inline void volk_gnsssdr_8i_max_s8i_u_sse4_1(char target, const char* src0, unsigned int num_points) {
+    if(num_points > 0){
+        const unsigned int sse_iters = num_points / 16;
+
+        char* inputPtr = (char*)src0;
+        char max = src0[0];
+        __VOLK_ATTR_ALIGNED(16) char maxValuesBuffer[16];
+        __m128i maxValues, compareResults, currentValues;
+
+        maxValues = _mm_set1_epi8(max);
+
+        for(unsigned int number = 0; number < sse_iters; number++)
+        {
+            currentValues = _mm_loadu_si128((__m128i*)inputPtr);
+            compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
+            // Per lane: keep the running max where it is greater, else take the new value
+            maxValues = _mm_blendv_epi8(currentValues, maxValues, compareResults);
+            inputPtr += 16;
+        }
+
+        _mm_storeu_si128((__m128i*)maxValuesBuffer, maxValues);
+
+        // Reduce the 16 per-lane maxima to a single value
+        for(int i = 0; i<16; ++i)
+        {
+            if(maxValuesBuffer[i] > max)
+            {
+                max = maxValuesBuffer[i];
+            }
+        }
+
+        // Scalar tail: the elements that follow the last full 16-value block
+        // (the previous code re-scanned the first num_points % 16 elements).
+        for(unsigned int i = sse_iters * 16; i < num_points; ++i)
+        {
+            if(src0[i] > max)
+            {
+                max = src0[i];
+            }
+        }
+        target = max;
+    }
+}
+
+#endif /*LV_HAVE_SSE4_1*/
+
+#ifdef LV_HAVE_SSE2
+#include <emmintrin.h>
+/*!
+ \brief Returns the max value in src0
+ \param target The max value in src0
+ \param src0 The buffer of data to be analysed
+ \param num_points The number of values in src0 to be analysed
+
+ Unaligned SSE2 version (src0 is read with unaligned loads).
+ NOTE(review): target is received by value, so the result assigned at the end
+ never reaches the caller; it presumably has to become a pointer. That is an
+ interface change and is left untouched here.
+ */
+static inline void volk_gnsssdr_8i_max_s8i_u_sse2(char target, const char* src0, unsigned int num_points) {
+    if(num_points > 0){
+        const unsigned int sse_iters = num_points / 16;
+
+        char* inputPtr = (char*)src0;
+        char max = src0[0];
+        unsigned short mask;
+        __VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16];
+        __m128i maxValues, compareResults, currentValues;
+
+        maxValues = _mm_set1_epi8(max);
+
+        for(unsigned int number = 0; number < sse_iters; number++)
+        {
+            currentValues = _mm_loadu_si128((__m128i*)inputPtr);
+            compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
+            // One bit per lane: set when the running max is strictly greater
+            mask = _mm_movemask_epi8(compareResults);
+
+            if (mask != 0xFFFF)
+            {
+                _mm_storeu_si128((__m128i*)&currentValuesBuffer, currentValues);
+                // After inversion a set bit marks a lane that is >= max
+                mask = ~mask;
+                int i = 0;
+                while (mask > 0)
+                {
+                    if ((mask & 1) == 1)
+                    {
+                        if(currentValuesBuffer[i] > max)
+                        {
+                            max = currentValuesBuffer[i];
+                        }
+                    }
+                    i++;
+                    mask >>= 1;
+                }
+                maxValues = _mm_set1_epi8(max);
+            }
+            inputPtr += 16;
+        }
+
+        // Scalar tail: the elements that follow the last full 16-value block
+        // (the previous code re-scanned the first num_points % 16 elements).
+        for(unsigned int i = sse_iters * 16; i < num_points; ++i)
+        {
+            if(src0[i] > max)
+            {
+                max = src0[i];
+            }
+        }
+        target = max;
+    }
+}
+
+#endif /*LV_HAVE_SSE2*/
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Returns the max value in src0
+ \param target The max value in src0
+ \param src0 The buffer of data to be analysed
+ \param num_points The number of values in src0 to be analysed
+
+ Portable reference version.
+ NOTE(review): target is received by value, so the result assigned at the end
+ never reaches the caller; it presumably has to become a pointer. That is an
+ interface change and is left untouched here.
+ */
+static inline void volk_gnsssdr_8i_max_s8i_generic(char target, const char* src0, unsigned int num_points) {
+    if(num_points > 0)
+    {
+        char max = src0[0];
+
+        for(unsigned int i = 1; i < num_points; ++i)
+        {
+            if(src0[i] > max)
+            {
+                max = src0[i];
+            }
+        }
+        target = max;
+    }
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+#endif /*INCLUDED_volk_gnsssdr_8i_max_s8i_u_H*/
+
+
+#ifndef INCLUDED_volk_gnsssdr_8i_max_s8i_a_H
+#define INCLUDED_volk_gnsssdr_8i_max_s8i_a_H
+
+/* NOTE(review): the three header names below were lost in extraction and are
+   reconstructed from usage (__VOLK_ATTR_ALIGNED); confirm against upstream. */
+#include <volk_gnsssdr/volk_gnsssdr_common.h>
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include "smmintrin.h"
+/*!
+ \brief Returns the max value in src0
+ \param target The max value in src0
+ \param src0 The buffer of data to be analysed
+ \param num_points The number of values in src0 to be analysed
+
+ Aligned SSE4.1 version: src0 is read with _mm_load_si128, which requires a
+ 16-byte aligned address.
+ NOTE(review): target is received by value, so the result assigned at the end
+ never reaches the caller; it presumably has to become a pointer. That is an
+ interface change and is left untouched here.
+ */
+static inline void volk_gnsssdr_8i_max_s8i_a_sse4_1(char target, const char* src0, unsigned int num_points) {
+    if(num_points > 0){
+        const unsigned int sse_iters = num_points / 16;
+
+        char* inputPtr = (char*)src0;
+        char max = src0[0];
+        __VOLK_ATTR_ALIGNED(16) char maxValuesBuffer[16];
+        __m128i maxValues, compareResults, currentValues;
+
+        maxValues = _mm_set1_epi8(max);
+
+        for(unsigned int number = 0; number < sse_iters; number++)
+        {
+            currentValues = _mm_load_si128((__m128i*)inputPtr);
+            compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
+            // Per lane: keep the running max where it is greater, else take the new value
+            maxValues = _mm_blendv_epi8(currentValues, maxValues, compareResults);
+            inputPtr += 16;
+        }
+
+        _mm_store_si128((__m128i*)maxValuesBuffer, maxValues);
+
+        // Reduce the 16 per-lane maxima to a single value
+        for(int i = 0; i<16; ++i)
+        {
+            if(maxValuesBuffer[i] > max)
+            {
+                max = maxValuesBuffer[i];
+            }
+        }
+
+        // Scalar tail: the elements that follow the last full 16-value block
+        // (the previous code re-scanned the first num_points % 16 elements).
+        for(unsigned int i = sse_iters * 16; i < num_points; ++i)
+        {
+            if(src0[i] > max)
+            {
+                max = src0[i];
+            }
+        }
+        target = max;
+    }
+}
+
+#endif /*LV_HAVE_SSE4_1*/
+
+#ifdef LV_HAVE_SSE2
+#include "emmintrin.h"
+/*!
+ \brief Returns the max value in src0
+ \param target The max value in src0
+ \param src0 The buffer of data to be analysed
+ \param num_points The number of values in src0 to be analysed
+
+ Aligned SSE2 version: src0 is read with _mm_load_si128, which requires a
+ 16-byte aligned address.
+ NOTE(review): target is received by value, so the result assigned at the end
+ never reaches the caller; it presumably has to become a pointer. That is an
+ interface change and is left untouched here.
+ */
+static inline void volk_gnsssdr_8i_max_s8i_a_sse2(char target, const char* src0, unsigned int num_points) {
+    if(num_points > 0){
+        const unsigned int sse_iters = num_points / 16;
+
+        char* inputPtr = (char*)src0;
+        char max = src0[0];
+        unsigned short mask;
+        __VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16];
+        __m128i maxValues, compareResults, currentValues;
+
+        maxValues = _mm_set1_epi8(max);
+
+        for(unsigned int number = 0; number < sse_iters; number++)
+        {
+            currentValues = _mm_load_si128((__m128i*)inputPtr);
+            compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
+            // One bit per lane: set when the running max is strictly greater
+            mask = _mm_movemask_epi8(compareResults);
+
+            if (mask != 0xFFFF)
+            {
+                _mm_store_si128((__m128i*)&currentValuesBuffer, currentValues);
+                // After inversion a set bit marks a lane that is >= max
+                mask = ~mask;
+                int i = 0;
+                while (mask > 0)
+                {
+                    if ((mask & 1) == 1)
+                    {
+                        if(currentValuesBuffer[i] > max)
+                        {
+                            max = currentValuesBuffer[i];
+                        }
+                    }
+                    i++;
+                    mask >>= 1;
+                }
+                maxValues = _mm_set1_epi8(max);
+            }
+            inputPtr += 16;
+        }
+
+        // Scalar tail: the elements that follow the last full 16-value block
+        // (the previous code re-scanned the first num_points % 16 elements).
+        for(unsigned int i = sse_iters * 16; i < num_points; ++i)
+        {
+            if(src0[i] > max)
+            {
+                max = src0[i];
+            }
+        }
+        target = max;
+    }
+}
+
+#endif /*LV_HAVE_SSE2*/
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Returns the max value in src0
+ \param target The max value in src0
+ \param src0 The buffer of data to be analysed
+ \param num_points The number of values in src0 to be analysed
+
+ Portable reference version.
+ NOTE(review): target is received by value, so the result assigned at the end
+ never reaches the caller; it presumably has to become a pointer. That is an
+ interface change and is left untouched here.
+ */
+static inline void volk_gnsssdr_8i_max_s8i_a_generic(char target, const char* src0, unsigned int num_points) {
+    // A single guard is enough; the duplicated nested check was redundant
+    if(num_points > 0)
+    {
+        char max = src0[0];
+
+        for(unsigned int i = 1; i < num_points; ++i)
+        {
+            if(src0[i] > max)
+            {
+                max = src0[i];
+            }
+        }
+        target = max;
+    }
+}
+
+#endif /*LV_HAVE_GENERIC*/
+
+#endif /*INCLUDED_volk_gnsssdr_8i_max_s8i_a_H*/
\ No newline at end of file
diff --git a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_x2_add_8i.h b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_x2_add_8i.h
new file mode 100644
index 000000000..4a2bd5ab2
--- /dev/null
+++ b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_x2_add_8i.h
@@ -0,0 +1,184 @@
+/*!
+ * \file volk_gnsssdr_8i_x2_add_8i.h
+ * \brief Volk protokernel: adds pairs of 8 bits (char) scalars
+ * \authors
+ *          <ul>
+ *          <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+ *          </ul>
+ *
+ * Volk protokernel that adds pairs of 8 bits (char) scalars
+ *
+ * -------------------------------------------------------------------------
+ *
+ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+ *
+ * GNSS-SDR is a software defined Global Navigation
+ * Satellite Systems receiver
+ *
+ * This file is part of GNSS-SDR.
+ *
+ * GNSS-SDR is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * GNSS-SDR is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * -------------------------------------------------------------------------
+ */
+
+#ifndef INCLUDED_volk_gnsssdr_8i_x2_add_8i_u_H
+#define INCLUDED_volk_gnsssdr_8i_x2_add_8i_u_H
+
+/* NOTE(review): the two header names below were lost in extraction and are
+   reconstructed; confirm against upstream. */
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include "pmmintrin.h"
+/*!
+ \brief Adds the two input vectors and stores their results in the third vector
+ \param cVector The vector where the results will be stored
+ \param aVector One of the vectors to be added
+ \param bVector One of the vectors to be added
+ \param num_points The number of values in aVector and bVector to be added together and stored into cVector
+
+ Unaligned SSE2 version: 16 values are added per iteration with
+ _mm_add_epi8 (wrap-around 8-bit addition); the remaining
+ num_points % 16 values are added one by one.
+ */
+static inline void volk_gnsssdr_8i_x2_add_8i_u_sse2(char* cVector, const char* aVector, const char* bVector, unsigned int num_points){
+
+    const unsigned int sse_iters = num_points / 16;
+
+    char* cPtr = cVector;
+    const char* aPtr = aVector;
+    const char* bPtr= bVector;
+
+    __m128i aVal, bVal, cVal;
+
+    for(unsigned int number = 0; number < sse_iters; number++){
+
+        // _mm_loadu_si128 is the SSE2 unaligned load; _mm_lddqu_si128 needs SSE3
+        aVal = _mm_loadu_si128((__m128i*)aPtr);
+        bVal = _mm_loadu_si128((__m128i*)bPtr);
+
+        cVal = _mm_add_epi8(aVal, bVal);
+
+        _mm_storeu_si128((__m128i*)cPtr,cVal); // Store the results back into the C container
+
+        aPtr += 16;
+        bPtr += 16;
+        cPtr += 16;
+    }
+
+    // Scalar tail; the pointers already sit just past the vectorised part
+    for(unsigned int i = 0; i<(num_points % 16); ++i)
+    {
+        *cPtr++ = (*aPtr++) + (*bPtr++);
+    }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Adds the two input vectors and stores their results in the third vector
+ \param cVector The vector where the results will be stored
+ \param aVector One of the vectors to be added
+ \param bVector One of the vectors to be added
+ \param num_points The number of values in aVector and bVector to be added together and stored into cVector
+
+ Portable reference version.
+ */
+static inline void volk_gnsssdr_8i_x2_add_8i_generic(char* cVector, const char* aVector, const char* bVector, unsigned int num_points){
+    char* cPtr = cVector;
+    const char* aPtr = aVector;
+    const char* bPtr= bVector;
+    unsigned int number = 0;
+
+    for(number = 0; number < num_points; number++){
+        *cPtr++ = (*aPtr++) + (*bPtr++);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_gnsssdr_8i_x2_add_8i_u_H */
+
+
+#ifndef INCLUDED_volk_gnsssdr_8i_x2_add_8i_a_H
+#define INCLUDED_volk_gnsssdr_8i_x2_add_8i_a_H
+
+/* NOTE(review): the two header names below were lost in extraction and are
+   reconstructed; confirm against upstream. */
+#include <inttypes.h>
+#include <stdio.h>
+
+#ifdef LV_HAVE_SSE2
+#include "emmintrin.h"
+/*!
+ \brief Adds the two input vectors and stores their results in the third vector
+ \param cVector The vector where the results will be stored
+ \param aVector One of the vectors to be added
+ \param bVector One of the vectors to be added
+ \param num_points The number of values in aVector and bVector to be added together and stored into cVector
+
+ Aligned SSE2 version: the three buffers are accessed with
+ _mm_load_si128 / _mm_store_si128, which require 16-byte aligned addresses.
+ Only SSE2 intrinsics are used, hence emmintrin.h.
+ */
+static inline void volk_gnsssdr_8i_x2_add_8i_a_sse2(char* cVector, const char* aVector, const char* bVector, unsigned int num_points){
+
+    const unsigned int sse_iters = num_points / 16;
+
+    char* cPtr = cVector;
+    const char* aPtr = aVector;
+    const char* bPtr= bVector;
+
+    __m128i aVal, bVal, cVal;
+
+    for(unsigned int number = 0; number < sse_iters; number++){
+
+        aVal = _mm_load_si128((__m128i*)aPtr);
+        bVal = _mm_load_si128((__m128i*)bPtr);
+
+        cVal = _mm_add_epi8(aVal, bVal);
+
+        _mm_store_si128((__m128i*)cPtr,cVal); // Store the results back into the C container
+
+        aPtr += 16;
+        bPtr += 16;
+        cPtr += 16;
+    }
+
+    // Scalar tail; the pointers already sit just past the vectorised part
+    for(unsigned int i = 0; i<(num_points % 16); ++i)
+    {
+        *cPtr++ = (*aPtr++) + (*bPtr++);
+    }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Adds the two input vectors and stores their results in the third vector
+ \param cVector The vector where the results will be stored
+ \param aVector One of the vectors to be added
+ \param bVector One of the vectors to be added
+ \param num_points The number of values in aVector and bVector to be added together and stored into cVector
+
+ Portable reference version.
+ */
+static inline void volk_gnsssdr_8i_x2_add_8i_a_generic(char* cVector, const char* aVector, const char* bVector, unsigned int num_points){
+    char* cPtr = cVector;
+    const char* aPtr = aVector;
+    const char* bPtr= bVector;
+    unsigned int number = 0;
+
+    for(number = 0; number < num_points; number++){
+        *cPtr++ = (*aPtr++) + (*bPtr++);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_ORC
+/*!
+ \brief Adds the two input vectors and stores their results in the third vector
+ \param cVector The vector where the results will be stored
+ \param aVector One of the vectors to be added
+ \param bVector One of the vectors to be added
+ \param num_points The number of values in aVector and bVector to be added together and stored into cVector
+
+ Forwards to the ORC implementation, which is declared extern here and
+ defined outside this header.
+ */
+extern void volk_gnsssdr_8i_x2_add_8i_a_orc_impl(char* cVector, const char* aVector, const char* bVector, unsigned int num_points);
+static inline void volk_gnsssdr_8i_x2_add_8i_u_orc(char* cVector, const char* aVector, const char* bVector, unsigned int num_points){
+    volk_gnsssdr_8i_x2_add_8i_a_orc_impl(cVector, aVector, bVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+#endif /* INCLUDED_volk_gnsssdr_8i_x2_add_8i_a_H */
diff --git a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_conjugate_8ic.h b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_conjugate_8ic.h
new file mode 100644
index 000000000..231796274
--- /dev/null
+++ b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_conjugate_8ic.h
@@ -0,0 +1,326 @@
+/*!
+ * \file volk_gnsssdr_8ic_conjugate_8ic.h
+ * \brief Volk protokernel: calculates the conjugate of a 16 bits vector
+ * \authors <ul>
+ *          <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+ *          </ul>
+ *
+ * Volk protokernel that calculates the conjugate of a
+ * 16 bits vector (8 bits the real part and 8 bits the imaginary part)
+ *
+ * -------------------------------------------------------------------------
+ *
+ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+ *
+ * GNSS-SDR is a software defined Global Navigation
+ * Satellite Systems receiver
+ *
+ * This file is part of GNSS-SDR.
+ *
+ * GNSS-SDR is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * GNSS-SDR is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * -------------------------------------------------------------------------
+ */
+
+#ifndef INCLUDED_volk_gnsssdr_8ic_conjugate_8ic_u_H
+#define INCLUDED_volk_gnsssdr_8ic_conjugate_8ic_u_H
+
+/* NOTE(review): the three header names below were lost in extraction and are
+   reconstructed from usage (lv_8sc_t, lv_conj); confirm against upstream. */
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk_gnsssdr/volk_gnsssdr_complex.h>
+
+#ifdef LV_HAVE_AVX
+#include "immintrin.h"
+/*!
+ \brief Takes the conjugate of a vector of 8-bit complex samples (lv_8sc_t).
+ \param cVector The vector where the results will be stored
+ \param aVector Vector to be conjugated
+ \param num_points The number of complex values in aVector to be conjugated and stored into cVector
+
+ Unaligned AVX version. Each iteration loads 32 bytes and advances 16
+ complex samples; the imaginary byte of every sample is negated in two's
+ complement (XOR with 0xFF, then add 1) while the real byte is left as is.
+ */
+static inline void volk_gnsssdr_8ic_conjugate_8ic_u_avx(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){
+    const unsigned int sse_iters = num_points / 16;
+
+    lv_8sc_t* c = cVector;
+    const lv_8sc_t* a = aVector;
+
+    __m256 tmp;
+    __m128i tmp128lo, tmp128hi;
+    __m256 conjugator1 = _mm256_castsi256_ps(_mm256_setr_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255));
+    __m128i conjugator2 = _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1);
+
+    for (unsigned int i = 0; i < sse_iters; ++i)
+    {
+        tmp = _mm256_loadu_ps((float*)a);
+        // Flip every bit of the imaginary bytes
+        tmp = _mm256_xor_ps(tmp, conjugator1);
+        // The 8-bit add is done on the two 128-bit halves
+        tmp128lo = _mm256_castsi256_si128(_mm256_castps_si256(tmp));
+        tmp128lo = _mm_add_epi8(tmp128lo, conjugator2);
+        tmp128hi = _mm256_extractf128_si256(_mm256_castps_si256(tmp),1);
+        tmp128hi = _mm_add_epi8(tmp128hi, conjugator2);
+        // _mm256_set_m128i is not defined in some versions of immintrin.h, so
+        // recombine with insertf128; the explicit cast back to __m256 avoids
+        // relying on implicit vector-type conversion.
+        tmp = _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(tmp128lo),(tmp128hi),1));
+        _mm256_storeu_ps((float*)c, tmp);
+
+        a += 16;
+        c += 16;
+    }
+
+    // Scalar tail; the pointers already sit just past the vectorised part
+    for (unsigned int i = 0; i<(num_points % 16); ++i)
+    {
+        *c++ = lv_conj(*a++);
+    }
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_SSSE3
+#include "tmmintrin.h"
+/*!
+ \brief Takes the conjugate of a vector of 8-bit complex samples (lv_8sc_t).
+ \param cVector The vector where the results will be stored
+ \param aVector Vector to be conjugated
+ \param num_points The number of complex values in aVector to be conjugated and stored into cVector
+
+ Unaligned SSSE3 version. Each iteration loads 16 bytes and advances 8
+ complex samples; _mm_sign_epi8 with a (1, -1) pattern negates the
+ imaginary byte of every sample.
+ */
+static inline void volk_gnsssdr_8ic_conjugate_8ic_u_ssse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){
+    const unsigned int sse_iters = num_points / 8;
+
+    lv_8sc_t* c = cVector;
+    const lv_8sc_t* a = aVector;
+    __m128i tmp;
+
+    __m128i conjugator = _mm_setr_epi8(1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1);
+
+    for (unsigned int i = 0; i < sse_iters; ++i)
+    {
+        tmp = _mm_lddqu_si128((__m128i*)a);
+        tmp = _mm_sign_epi8(tmp, conjugator);
+        _mm_storeu_si128((__m128i*)c, tmp);
+        a += 8;
+        c += 8;
+    }
+
+    // Scalar tail; the pointers already sit just past the vectorised part
+    for (unsigned int i = 0; i<(num_points % 8); ++i)
+    {
+        *c++ = lv_conj(*a++);
+    }
+
+}
+#endif /* LV_HAVE_SSSE3 */
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+/*!
+ \brief Takes the conjugate of a vector of 8-bit complex samples (lv_8sc_t).
+ \param cVector The vector where the results will be stored
+ \param aVector Vector to be conjugated
+ \param num_points The number of complex values in aVector to be conjugated and stored into cVector
+
+ Unaligned SSE3 version. Each iteration loads 16 bytes and advances 8
+ complex samples; the imaginary byte of every sample is negated in two's
+ complement (XOR with 0xFF, then add 1).
+ */
+static inline void volk_gnsssdr_8ic_conjugate_8ic_u_sse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){
+    const unsigned int sse_iters = num_points / 8;
+
+    lv_8sc_t* c = cVector;
+    const lv_8sc_t* a = aVector;
+    __m128i tmp;
+
+    __m128i conjugator1 = _mm_setr_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
+    __m128i conjugator2 = _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1);
+
+    for (unsigned int i = 0; i < sse_iters; ++i)
+    {
+        tmp = _mm_lddqu_si128((__m128i*)a);
+        tmp = _mm_xor_si128(tmp, conjugator1);
+        tmp = _mm_add_epi8(tmp, conjugator2);
+        _mm_storeu_si128((__m128i*)c, tmp);
+        a += 8;
+        c += 8;
+    }
+
+    // Scalar tail; the pointers already sit just past the vectorised part
+    for (unsigned int i = 0; i<(num_points % 8); ++i)
+    {
+        *c++ = lv_conj(*a++);
+    }
+
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Takes the conjugate of a vector of 8-bit complex samples (lv_8sc_t).
+ \param cVector The vector where the results will be stored
+ \param aVector Vector to be conjugated
+ \param num_points The number of complex values in aVector to be conjugated and stored into cVector
+
+ Portable reference version built on lv_conj.
+ */
+static inline void volk_gnsssdr_8ic_conjugate_8ic_generic(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){
+    lv_8sc_t* cPtr = cVector;
+    const lv_8sc_t* aPtr = aVector;
+    unsigned int number = 0;
+
+    for(number = 0; number < num_points; number++){
+        *cPtr++ = lv_conj(*aPtr++);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_volk_gnsssdr_8ic_conjugate_8ic_u_H */
+
+
+#ifndef INCLUDED_volk_gnsssdr_8ic_conjugate_8ic_a_H
+#define INCLUDED_volk_gnsssdr_8ic_conjugate_8ic_a_H
+
+/* NOTE(review): the three header names below were lost in extraction and are
+   reconstructed from usage (lv_8sc_t, lv_conj); confirm against upstream. */
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk_gnsssdr/volk_gnsssdr_complex.h>
+
+#ifdef LV_HAVE_AVX
+#include "immintrin.h"
+/*!
+ \brief Takes the conjugate of a vector of 8-bit complex samples (lv_8sc_t).
+ \param cVector The vector where the results will be stored
+ \param aVector Vector to be conjugated
+ \param num_points The number of complex values in aVector to be conjugated and stored into cVector
+
+ Aligned AVX version: both buffers are accessed with _mm256_load_ps /
+ _mm256_store_ps, which require 32-byte aligned addresses. The imaginary
+ byte of every sample is negated in two's complement (XOR with 0xFF, add 1).
+ */
+static inline void volk_gnsssdr_8ic_conjugate_8ic_a_avx(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){
+    const unsigned int sse_iters = num_points / 16;
+
+    lv_8sc_t* c = cVector;
+    const lv_8sc_t* a = aVector;
+
+    __m256 tmp;
+    __m128i tmp128lo, tmp128hi;
+    __m256 conjugator1 = _mm256_castsi256_ps(_mm256_setr_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255));
+    __m128i conjugator2 = _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1);
+
+    for (unsigned int i = 0; i < sse_iters; ++i)
+    {
+        tmp = _mm256_load_ps((float*)a);
+        // Flip every bit of the imaginary bytes
+        tmp = _mm256_xor_ps(tmp, conjugator1);
+        // The 8-bit add is done on the two 128-bit halves
+        tmp128lo = _mm256_castsi256_si128(_mm256_castps_si256(tmp));
+        tmp128lo = _mm_add_epi8(tmp128lo, conjugator2);
+        tmp128hi = _mm256_extractf128_si256(_mm256_castps_si256(tmp),1);
+        tmp128hi = _mm_add_epi8(tmp128hi, conjugator2);
+        // _mm256_set_m128i is not defined in some versions of immintrin.h, so
+        // recombine with insertf128; the explicit cast back to __m256 avoids
+        // relying on implicit vector-type conversion.
+        tmp = _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(tmp128lo),(tmp128hi),1));
+        _mm256_store_ps((float*)c, tmp);
+
+        a += 16;
+        c += 16;
+    }
+
+    // Scalar tail; the pointers already sit just past the vectorised part
+    for (unsigned int i = 0; i<(num_points % 16); ++i)
+    {
+        *c++ = lv_conj(*a++);
+    }
+}
+#endif /* LV_HAVE_AVX */
+
+#ifdef LV_HAVE_SSSE3
+#include "tmmintrin.h"
+/*!
+ \brief Takes the conjugate of a vector of 8-bit complex samples (lv_8sc_t).
+ \param cVector The vector where the results will be stored
+ \param aVector Vector to be conjugated
+ \param num_points The number of complex values in aVector to be conjugated and stored into cVector
+
+ Aligned SSSE3 version: both buffers are accessed with _mm_load_si128 /
+ _mm_store_si128, which require 16-byte aligned addresses. _mm_sign_epi8
+ with a (1, -1) pattern negates the imaginary byte of every sample.
+ */
+static inline void volk_gnsssdr_8ic_conjugate_8ic_a_ssse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){
+    const unsigned int sse_iters = num_points / 8;
+
+    lv_8sc_t* c = cVector;
+    const lv_8sc_t* a = aVector;
+    __m128i tmp;
+
+    __m128i conjugator = _mm_setr_epi8(1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1);
+
+    for (unsigned int i = 0; i < sse_iters; ++i)
+    {
+        tmp = _mm_load_si128((__m128i*)a);
+        tmp = _mm_sign_epi8(tmp, conjugator);
+        _mm_store_si128((__m128i*)c, tmp);
+        a += 8;
+        c += 8;
+    }
+
+    // Scalar tail; the pointers already sit just past the vectorised part
+    for (unsigned int i = 0; i<(num_points % 8); ++i)
+    {
+        *c++ = lv_conj(*a++);
+    }
+
+}
+#endif /* LV_HAVE_SSSE3 */
+
+#ifdef LV_HAVE_SSE3
+#include <pmmintrin.h>
+/*!
+ \brief Takes the conjugate of a vector of 8-bit complex samples (lv_8sc_t).
+ \param cVector The vector where the results will be stored
+ \param aVector Vector to be conjugated
+ \param num_points The number of complex values in aVector to be conjugated and stored into cVector
+
+ Aligned SSE3 version: both buffers are accessed with _mm_load_si128 /
+ _mm_store_si128, which require 16-byte aligned addresses. The imaginary
+ byte of every sample is negated in two's complement (XOR with 0xFF, add 1).
+ */
+static inline void volk_gnsssdr_8ic_conjugate_8ic_a_sse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){
+    const unsigned int sse_iters = num_points / 8;
+
+    lv_8sc_t* c = cVector;
+    const lv_8sc_t* a = aVector;
+    __m128i tmp;
+
+    __m128i conjugator1 = _mm_setr_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
+    __m128i conjugator2 = _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1);
+
+    for (unsigned int i = 0; i < sse_iters; ++i)
+    {
+        tmp = _mm_load_si128((__m128i*)a);
+        tmp = _mm_xor_si128(tmp, conjugator1);
+        tmp = _mm_add_epi8(tmp, conjugator2);
+        _mm_store_si128((__m128i*)c, tmp);
+        a += 8;
+        c += 8;
+    }
+
+    // Scalar tail; the pointers already sit just past the vectorised part
+    for (unsigned int i = 0; i<(num_points % 8); ++i)
+    {
+        *c++ = lv_conj(*a++);
+    }
+
+}
+#endif /* LV_HAVE_SSE3 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Takes the conjugate of a vector of 8-bit complex samples (lv_8sc_t).
+ \param cVector The vector where the results will be stored
+ \param aVector Vector to be conjugated
+ \param num_points The number of complex values in aVector to be conjugated and stored into cVector
+
+ Portable reference version built on lv_conj.
+ */
+static inline void volk_gnsssdr_8ic_conjugate_8ic_a_generic(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){
+    lv_8sc_t* cPtr = cVector;
+    const lv_8sc_t* aPtr = aVector;
+    unsigned int number = 0;
+
+    for(number = 0; number < num_points; number++){
+        *cPtr++ = lv_conj(*aPtr++);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+
+#ifdef LV_HAVE_ORC
+/*!
+ \brief Takes the conjugate of a vector of 8-bit complex samples (lv_8sc_t).
+ \param cVector The vector where the results will be stored
+ \param aVector Vector to be conjugated
+ \param num_points The number of complex values in aVector to be conjugated and stored into cVector
+
+ Forwards to the ORC implementation, which is declared extern here and
+ defined outside this header.
+ */
+extern void volk_gnsssdr_8ic_conjugate_8ic_a_orc_impl(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points);
+static inline void volk_gnsssdr_8ic_conjugate_8ic_u_orc(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){
+    volk_gnsssdr_8ic_conjugate_8ic_a_orc_impl(cVector, aVector, num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+#endif /* INCLUDED_volk_gnsssdr_8ic_conjugate_8ic_a_H */
diff --git a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_magnitude_squared_8i.h b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_magnitude_squared_8i.h
new file mode 100644
index 000000000..1eab648fe
--- /dev/null
+++ b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_magnitude_squared_8i.h
@@ -0,0 +1,320 @@
+/*!
+ * \file volk_gnsssdr_8ic_magnitude_squared_8i.h
+ * \brief Volk protokernel: calculates the magnitude squared of a 16 bits vector
+ * \authors
    + *
  • Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com + *
+ * + * Volk protokernel that calculates the magnitude squared of a + * 16 bits vector (8 bits the real part and 8 bits the imaginary part) + * result = (real*real) + (imag*imag) + * + * ------------------------------------------------------------------------- + * + * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) + * + * GNSS-SDR is a software defined Global Navigation + * Satellite Systems receiver + * + * This file is part of GNSS-SDR. + * + * GNSS-SDR is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * at your option) any later version. + * + * GNSS-SDR is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNSS-SDR. If not, see . + * + * ------------------------------------------------------------------------- + */ + +#ifndef INCLUDED_volk_gnsssdr_8ic_magnitude_squared_8i_u_H +#define INCLUDED_volk_gnsssdr_8ic_magnitude_squared_8i_u_H + +#include +#include +#include + +#ifdef LV_HAVE_SSE3 +#include +#include "tmmintrin.h" +/*! 
+ \brief Calculates the magnitude squared of complexVector and stores the results in magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +static inline void volk_gnsssdr_8ic_magnitude_squared_8i_u_sse3(char* magnitudeVector, const lv_8sc_t* complexVector, unsigned int num_points){ + + const unsigned int sse_iters = num_points / 16; + + const char* complexVectorPtr = (char*)complexVector; + char* magnitudeVectorPtr = magnitudeVector; + + __m128i zero, result8; + __m128i avector, avectorhi, avectorlo, avectorlomult, avectorhimult, aadded, maska; + __m128i bvector, bvectorhi, bvectorlo, bvectorlomult, bvectorhimult, badded, maskb; + + zero = _mm_setzero_si128(); + maska = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); + maskb = _mm_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); + + for(int number = 0;number < sse_iters; number++) + { + avector = _mm_lddqu_si128((__m128i*)complexVectorPtr); + avectorlo = _mm_unpacklo_epi8 (avector, zero); + avectorhi = _mm_unpackhi_epi8 (avector, zero); + avectorlomult = _mm_mullo_epi16 (avectorlo, avectorlo); + avectorhimult = _mm_mullo_epi16 (avectorhi, avectorhi); + aadded = _mm_hadd_epi16 (avectorlomult, avectorhimult); + + complexVectorPtr += 16; + + bvector = _mm_lddqu_si128((__m128i*)complexVectorPtr); + bvectorlo = _mm_unpacklo_epi8 (bvector, zero); + bvectorhi = _mm_unpackhi_epi8 (bvector, zero); + bvectorlomult = _mm_mullo_epi16 (bvectorlo, bvectorlo); + bvectorhimult = _mm_mullo_epi16 (bvectorhi, bvectorhi); + badded = _mm_hadd_epi16 (bvectorlomult, bvectorhimult); + + complexVectorPtr += 16; + + result8 = _mm_or_si128(_mm_shuffle_epi8(aadded, maska), _mm_shuffle_epi8(badded, maskb)); + + _mm_storeu_si128((__m128i*)magnitudeVectorPtr, 
result8); + + magnitudeVectorPtr += 16; + + + } + + for (int i = 0; i<(num_points % 16); ++i) + { + const char valReal = *complexVectorPtr++; + const char valImag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (valReal * valReal) + (valImag * valImag); + } +} +#endif /* LV_HAVE_SSE3 */ + +//#ifdef LV_HAVE_SSE +//#include +///*! +// \brief Calculates the magnitude squared of complexVector and stores the results in magnitudeVector +// \param complexVector The vector containing the complex input values +// \param magnitudeVector The vector containing the real output values +// \param num_points The number of complex values in complexVector to be calculated and stored into cVector +// */ +//static inline void volk_gnsssdr_8ic_magnitude_squared_8i_u_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ +// unsigned int number = 0; +// const unsigned int quarterPoints = num_points / 4; +// +// const float* complexVectorPtr = (float*)complexVector; +// float* magnitudeVectorPtr = magnitudeVector; +// +// __m128 cplxValue1, cplxValue2, iValue, qValue, result; +// for(;number < quarterPoints; number++){ +// cplxValue1 = _mm_loadu_ps(complexVectorPtr); +// complexVectorPtr += 4; +// +// cplxValue2 = _mm_loadu_ps(complexVectorPtr); +// complexVectorPtr += 4; +// +// // Arrange in i1i2i3i4 format +// iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); +// // Arrange in q1q2q3q4 format +// qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); +// +// iValue = _mm_mul_ps(iValue, iValue); // Square the I values +// qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values +// +// result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values +// +// _mm_storeu_ps(magnitudeVectorPtr, result); +// magnitudeVectorPtr += 4; +// } +// +// number = quarterPoints * 4; +// for(; number < num_points; number++){ +// float val1Real = *complexVectorPtr++; +// float val1Imag = *complexVectorPtr++; +// 
*magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); +// } +//} +//#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Calculates the magnitude squared of complexVector and stores the results in magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +static inline void volk_gnsssdr_8ic_magnitude_squared_8i_generic(char* magnitudeVector, const lv_8sc_t* complexVector, unsigned int num_points){ + const char* complexVectorPtr = (char*)complexVector; + char* magnitudeVectorPtr = magnitudeVector; + + for(int number = 0; number < num_points; number++){ + const char real = *complexVectorPtr++; + const char imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (real*real) + (imag*imag); + } +} +#endif /* LV_HAVE_GENERIC */ + +#endif /* INCLUDED_volk_gnsssdr_32fc_magnitude_32f_u_H */ + + +#ifndef INCLUDED_volk_gnsssdr_8ic_magnitude_squared_8i_a_H +#define INCLUDED_volk_gnsssdr_8ic_magnitude_squared_8i_a_H + +#include +#include +#include + +#ifdef LV_HAVE_SSE3 +#include +/*! 
+ \brief Calculates the magnitude squared of complexVector and stores the results in magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +static inline void volk_gnsssdr_8ic_magnitude_squared_8i_a_sse3(char* magnitudeVector, const lv_8sc_t* complexVector, unsigned int num_points){ + + const unsigned int sse_iters = num_points / 16; + + const char* complexVectorPtr = (char*)complexVector; + char* magnitudeVectorPtr = magnitudeVector; + + __m128i zero, result8; + __m128i avector, avectorhi, avectorlo, avectorlomult, avectorhimult, aadded, maska; + __m128i bvector, bvectorhi, bvectorlo, bvectorlomult, bvectorhimult, badded, maskb; + + zero = _mm_setzero_si128(); + maska = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); + maskb = _mm_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); + + for(int number = 0;number < sse_iters; number++) + { + avector = _mm_load_si128((__m128i*)complexVectorPtr); + avectorlo = _mm_unpacklo_epi8 (avector, zero); + avectorhi = _mm_unpackhi_epi8 (avector, zero); + avectorlomult = _mm_mullo_epi16 (avectorlo, avectorlo); + avectorhimult = _mm_mullo_epi16 (avectorhi, avectorhi); + aadded = _mm_hadd_epi16 (avectorlomult, avectorhimult); + + complexVectorPtr += 16; + + bvector = _mm_load_si128((__m128i*)complexVectorPtr); + bvectorlo = _mm_unpacklo_epi8 (bvector, zero); + bvectorhi = _mm_unpackhi_epi8 (bvector, zero); + bvectorlomult = _mm_mullo_epi16 (bvectorlo, bvectorlo); + bvectorhimult = _mm_mullo_epi16 (bvectorhi, bvectorhi); + badded = _mm_hadd_epi16 (bvectorlomult, bvectorhimult); + + complexVectorPtr += 16; + + result8 = _mm_or_si128(_mm_shuffle_epi8(aadded, maska), _mm_shuffle_epi8(badded, maskb)); + + _mm_store_si128((__m128i*)magnitudeVectorPtr, 
result8); + + magnitudeVectorPtr += 16; + + + } + + for (int i = 0; i<(num_points % 16); ++i) + { + const char valReal = *complexVectorPtr++; + const char valImag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (valReal * valReal) + (valImag * valImag); + } +} +#endif /* LV_HAVE_SSE3 */ + +//#ifdef LV_HAVE_SSE +//#include +///*! +// \brief Calculates the magnitude squared of complexVector and stores the results in magnitudeVector +// \param complexVector The vector containing the complex input values +// \param magnitudeVector The vector containing the real output values +// \param num_points The number of complex values in complexVector to be calculated and stored into cVector +// */ +//static inline void volk_gnsssdr_8ic_magnitude_squared_8i_a_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ +// unsigned int number = 0; +// const unsigned int quarterPoints = num_points / 4; +// +// const float* complexVectorPtr = (float*)complexVector; +// float* magnitudeVectorPtr = magnitudeVector; +// +// __m128 cplxValue1, cplxValue2, iValue, qValue, result; +// for(;number < quarterPoints; number++){ +// cplxValue1 = _mm_load_ps(complexVectorPtr); +// complexVectorPtr += 4; +// +// cplxValue2 = _mm_load_ps(complexVectorPtr); +// complexVectorPtr += 4; +// +// // Arrange in i1i2i3i4 format +// iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); +// // Arrange in q1q2q3q4 format +// qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); +// +// iValue = _mm_mul_ps(iValue, iValue); // Square the I values +// qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values +// +// result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values +// +// _mm_store_ps(magnitudeVectorPtr, result); +// magnitudeVectorPtr += 4; +// } +// +// number = quarterPoints * 4; +// for(; number < num_points; number++){ +// float val1Real = *complexVectorPtr++; +// float val1Imag = *complexVectorPtr++; +// *magnitudeVectorPtr++ 
= (val1Real * val1Real) + (val1Imag * val1Imag); +// } +//} +//#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Calculates the magnitude squared of complexVector and stores the results in magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +static inline void volk_gnsssdr_8ic_magnitude_squared_8i_a_generic(char* magnitudeVector, const lv_8sc_t* complexVector, unsigned int num_points){ + const char* complexVectorPtr = (char*)complexVector; + char* magnitudeVectorPtr = magnitudeVector; + + for(int number = 0; number < num_points; number++){ + const char real = *complexVectorPtr++; + const char imag = *complexVectorPtr++; + *magnitudeVectorPtr++ = (real*real) + (imag*imag); + } +} +#endif /* LV_HAVE_GENERIC */ + +#ifdef LV_HAVE_ORC +/*! + \brief Calculates the magnitude squared of complexVector and stores the results in magnitudeVector + \param complexVector The vector containing the complex input values + \param magnitudeVector The vector containing the real output values + \param num_points The number of complex values in complexVector to be calculated and stored into cVector + */ +extern void volk_gnsssdr_8ic_magnitude_squared_8i_a_orc_impl(char* magnitudeVector, const lv_8sc_t* complexVector, unsigned int num_points); +static inline void volk_gnsssdr_8ic_magnitude_squared_8i_u_orc(char* magnitudeVector, const lv_8sc_t* complexVector, unsigned int num_points){ + volk_gnsssdr_8ic_magnitude_squared_8i_a_orc_impl(magnitudeVector, complexVector, num_points); +} +#endif /* LV_HAVE_ORC */ + +#endif /* INCLUDED_volk_gnsssdr_32fc_magnitude_32f_a_H */ diff --git a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_s8ic_multiply_8ic.h b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_s8ic_multiply_8ic.h 
new file mode 100644 index 000000000..e0578f13a --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_s8ic_multiply_8ic.h @@ -0,0 +1,271 @@ +/*! + * \file volk_gnsssdr_8ic_s8ic_multiply_8ic.h + * \brief Volk protokernel: multiplies a group of 16 bits vectors by one constant vector + * \authors
    + *
  • Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com + *
+ * + * Volk protokernel that multiplies a group of 16 bits vectors + * (8 bits the real part and 8 bits the imaginary part) by one constant vector + * + * ------------------------------------------------------------------------- + * + * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) + * + * GNSS-SDR is a software defined Global Navigation + * Satellite Systems receiver + * + * This file is part of GNSS-SDR. + * + * GNSS-SDR is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * at your option) any later version. + * + * GNSS-SDR is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNSS-SDR. If not, see . + * + * ------------------------------------------------------------------------- + */ + +#ifndef INCLUDED_volk_gnsssdr_8ic_s8ic_multiply_8ic_u_H +#define INCLUDED_volk_gnsssdr_8ic_s8ic_multiply_8ic_u_H + +#include +#include +#include +#include + +#ifdef LV_HAVE_SSE3 +#include +/*! 
+ \brief Multiplies the input vector by a scalar and stores the results in the third vector + \param cVector The vector where the results will be stored + \param aVector The vector to be multiplied + \param scalar The complex scalar to multiply aVector + \param num_points The number of complex values in aVector to be multiplied by sacalar and stored into cVector + */ +static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_u_sse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t scalar, unsigned int num_points){ + + const unsigned int sse_iters = num_points / 8; + + __m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc; + + lv_8sc_t* c = cVector; + const lv_8sc_t* a = aVector; + + mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); + + y = _mm_set1_epi16 (*(short*)&scalar); + imagy = _mm_srli_si128 (y, 1); + imagy = _mm_and_si128 (imagy, mult1); + realy = _mm_and_si128 (y, mult1); + + for(int number = 0;number < sse_iters; number++){ + + x = _mm_lddqu_si128((__m128i*)a); + + imagx = _mm_srli_si128 (x, 1); + imagx = _mm_and_si128 (imagx, mult1); + realx = _mm_and_si128 (x, mult1); + + realx_mult_realy = _mm_mullo_epi16 (realx, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); + realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); + + realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + realc = _mm_and_si128 (realc, mult1); + imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + imagc = _mm_and_si128 (imagc, mult1); + imagc = _mm_slli_si128 (imagc, 1); + + totalc = _mm_or_si128 (realc, imagc); + + _mm_storeu_si128((__m128i*)c, totalc); + + a += 8; + c += 8; + } + + for (int i = 0; i<(num_points % 8); ++i) + { + *c++ = (*a++) * scalar; + } + +} +#endif /* LV_HAVE_SSE3 */ + +#ifdef LV_HAVE_GENERIC +/*! 
+ \brief Multiplies the input vector by a scalar and stores the results in the third vector + \param cVector The vector where the results will be stored + \param aVector The vector to be multiplied + \param scalar The complex scalar to multiply aVector + \param num_points The number of complex values in aVector to be multiplied by sacalar and stored into cVector + */ +static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_generic(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t scalar, unsigned int num_points){ + + /*lv_8sc_t* cPtr = cVector; + const lv_8sc_t* aPtr = aVector; + + for (int i = 0; i= 8){ + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + number -= 8; + } + + // clean up any remaining + while (number-- > 0) + *cPtr++ = *aPtr++ * scalar; +} +#endif /* LV_HAVE_GENERIC */ + +#endif /* INCLUDED_volk_gnsssdr_32fc_x2_multiply_32fc_u_H */ + + +#ifndef INCLUDED_volk_gnsssdr_8ic_s8ic_multiply_8ic_a_H +#define INCLUDED_volk_gnsssdr_8ic_s8ic_multiply_8ic_a_H + +#include +#include +#include +#include + +#ifdef LV_HAVE_SSE3 +#include +/*! 
+ \brief Multiplies the input vector by a scalar and stores the results in the third vector + \param cVector The vector where the results will be stored + \param aVector The vector to be multiplied + \param scalar The complex scalar to multiply aVector + \param num_points The number of complex values in aVector to be multiplied by sacalar and stored into cVector + */ +static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_a_sse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t scalar, unsigned int num_points){ + + const unsigned int sse_iters = num_points / 8; + + __m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc; + + lv_8sc_t* c = cVector; + const lv_8sc_t* a = aVector; + + mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); + + y = _mm_set1_epi16 (*(short*)&scalar); + imagy = _mm_srli_si128 (y, 1); + imagy = _mm_and_si128 (imagy, mult1); + realy = _mm_and_si128 (y, mult1); + + for(int number = 0;number < sse_iters; number++){ + + x = _mm_load_si128((__m128i*)a); + + imagx = _mm_srli_si128 (x, 1); + imagx = _mm_and_si128 (imagx, mult1); + realx = _mm_and_si128 (x, mult1); + + realx_mult_realy = _mm_mullo_epi16 (realx, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); + realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); + + realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + realc = _mm_and_si128 (realc, mult1); + imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + imagc = _mm_and_si128 (imagc, mult1); + imagc = _mm_slli_si128 (imagc, 1); + + totalc = _mm_or_si128 (realc, imagc); + + _mm_store_si128((__m128i*)c, totalc); + + a += 8; + c += 8; + } + + for (int i = 0; i<(num_points % 8); ++i) + { + *c++ = (*a++) * scalar; + } + +} +#endif /* LV_HAVE_SSE3 */ + +#ifdef LV_HAVE_GENERIC +/*! 
+ \brief Multiplies the input vector by a scalar and stores the results in the third vector + \param cVector The vector where the results will be stored + \param aVector The vector to be multiplied + \param scalar The complex scalar to multiply aVector + \param num_points The number of complex values in aVector to be multiplied by sacalar and stored into cVector + */ +static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_a_generic(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t scalar, unsigned int num_points){ + + /*lv_8sc_t* cPtr = cVector; + const lv_8sc_t* aPtr = aVector; + + for (int i = 0; i= 8){ + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + *cPtr++ = (*aPtr++) * scalar; + number -= 8; + } + + // clean up any remaining + while (number-- > 0) + *cPtr++ = *aPtr++ * scalar; +} +#endif /* LV_HAVE_GENERIC */ + +#ifdef LV_HAVE_ORC +/*! 
+ \brief Multiplies the input vector by a scalar and stores the results in the third vector
+ \param cVector The vector where the results will be stored
+ \param aVector The vector to be multiplied
+ \param scalar The complex scalar to multiply aVector
+ \param num_points The number of complex values in aVector to be multiplied by sacalar and stored into cVector
+ */
+/* ORC implementation: only declared here, its definition is not part of this header. It takes the scalar as two separate chars (real part, imaginary part). */
+extern void volk_gnsssdr_8ic_s8ic_multiply_8ic_a_orc_impl(lv_8sc_t* cVector, const lv_8sc_t* aVector, const char scalarreal, const char scalarimag, unsigned int num_points);
+/* Wrapper with the same signature as the other kernels of this file: splits the complex scalar with lv_creal/lv_cimag and forwards everything to the ORC implementation. */
+static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_u_orc(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t scalar, unsigned int num_points){
+    volk_gnsssdr_8ic_s8ic_multiply_8ic_a_orc_impl(cVector, aVector, lv_creal(scalar), lv_cimag(scalar), num_points);
+}
+#endif /* LV_HAVE_ORC */
+
+#endif /* INCLUDED_volk_gnsssdr_8ic_s8ic_multiply_8ic_a_H */
diff --git a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_dot_prod_8ic.h b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_dot_prod_8ic.h
new file mode 100644
index 000000000..696b0a31f
--- /dev/null
+++ b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_dot_prod_8ic.h
@@ -0,0 +1,499 @@
+/*!
+ * \file volk_gnsssdr_8ic_x2_dot_prod_8ic.h
+ * \brief Volk protokernel: multiplies two 16 bits vectors and accumulates them
+ * \authors
    + *
  • Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com + *
+ * + * Volk protokernel that multiplies two 16 bits vectors (8 bits the real part + * and 8 bits the imaginary part) and accumulates them + * + * ------------------------------------------------------------------------- + * + * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) + * + * GNSS-SDR is a software defined Global Navigation + * Satellite Systems receiver + * + * This file is part of GNSS-SDR. + * + * GNSS-SDR is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * at your option) any later version. + * + * GNSS-SDR is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNSS-SDR. If not, see . + * + * ------------------------------------------------------------------------- + */ + +#ifndef INCLUDED_volk_gnsssdr_8ic_x2_dot_prod_8ic_u_H +#define INCLUDED_volk_gnsssdr_8ic_x2_dot_prod_8ic_u_H + +#include +#include +#include +#include + +#ifdef LV_HAVE_GENERIC +/*! 
+ \brief Multiplies the two input complex vectors and accumulates them, storing the result in the third vector + \param cVector The vector where the accumulated result will be stored + \param aVector One of the vectors to be multiplied and accumulated + \param bVector One of the vectors to be multiplied and accumulated + \param num_points The number of complex values in aVector and bVector to be multiplied together, accumulated and stored into cVector + */ +static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_generic(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points) { + + /*lv_8sc_t* cPtr = result; + const lv_8sc_t* aPtr = input; + const lv_8sc_t* bPtr = taps; + + for(int number = 0; number < num_points; number++){ + *cPtr += (*aPtr++) * (*bPtr++); + }*/ + + char * res = (char*) result; + char * in = (char*) input; + char * tp = (char*) taps; + unsigned int n_2_ccomplex_blocks = num_points/2; + unsigned int isodd = num_points & 1; + + char sum0[2] = {0,0}; + char sum1[2] = {0,0}; + unsigned int i = 0; + + for(i = 0; i < n_2_ccomplex_blocks; ++i) { + sum0[0] += in[0] * tp[0] - in[1] * tp[1]; + sum0[1] += in[0] * tp[1] + in[1] * tp[0]; + sum1[0] += in[2] * tp[2] - in[3] * tp[3]; + sum1[1] += in[2] * tp[3] + in[3] * tp[2]; + + in += 4; + tp += 4; + } + + res[0] = sum0[0] + sum1[0]; + res[1] = sum0[1] + sum1[1]; + + // Cleanup if we had an odd number of points + for(i = 0; i < isodd; ++i) { + *result += input[num_points - 1] * taps[num_points - 1]; + } +} + +#endif /*LV_HAVE_GENERIC*/ + +#ifdef LV_HAVE_SSE2 +#include "emmintrin.h" +/*! 
+ \brief Multiplies the two input complex vectors and accumulates them, storing the result in the third vector + \param cVector The vector where the accumulated result will be stored + \param aVector One of the vectors to be multiplied and accumulated + \param bVector One of the vectors to be multiplied and accumulated + \param num_points The number of complex values in aVector and bVector to be multiplied together, accumulated and stored into cVector + */ +static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse2(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points) { + + lv_8sc_t dotProduct; + memset(&dotProduct, 0x0, 2*sizeof(char)); + + const lv_8sc_t* a = input; + const lv_8sc_t* b = taps; + + const unsigned int sse_iters = num_points/8; + + if (sse_iters>0) + { + __m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc, realcacc, imagcacc; + + mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); + realcacc = _mm_setzero_si128(); + imagcacc = _mm_setzero_si128(); + + for(int number = 0; number < sse_iters; number++){ + + x = _mm_lddqu_si128((__m128i*)a); + y = _mm_lddqu_si128((__m128i*)b); + + imagx = _mm_srli_si128 (x, 1); + imagx = _mm_and_si128 (imagx, mult1); + realx = _mm_and_si128 (x, mult1); + + imagy = _mm_srli_si128 (y, 1); + imagy = _mm_and_si128 (imagy, mult1); + realy = _mm_and_si128 (y, mult1); + + realx_mult_realy = _mm_mullo_epi16 (realx, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); + realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); + + realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + realcacc = _mm_add_epi16 (realcacc, realc); + imagcacc = _mm_add_epi16 (imagcacc, imagc); + + a += 8; + b += 8; + } + + realcacc = _mm_and_si128 (realcacc, mult1); + 
imagcacc = _mm_and_si128 (imagcacc, mult1); + imagcacc = _mm_slli_si128 (imagcacc, 1); + + totalc = _mm_or_si128 (realcacc, imagcacc); + + __VOLK_ATTR_ALIGNED(16) lv_8sc_t dotProductVector[8]; + + _mm_storeu_si128((__m128i*)dotProductVector,totalc); // Store the results back into the dot product vector + + for (int i = 0; i<8; ++i) + { + dotProduct += dotProductVector[i]; + } + } + + for (int i = 0; i<(num_points % 8); ++i) + { + dotProduct += (*a++) * (*b++); + } + + *result = dotProduct; +} + +#endif /*LV_HAVE_SSE2*/ + +#ifdef LV_HAVE_SSE4_1 +#include "smmintrin.h" +/*! + \brief Multiplies the two input complex vectors and accumulates them, storing the result in the third vector + \param cVector The vector where the accumulated result will be stored + \param aVector One of the vectors to be multiplied and accumulated + \param bVector One of the vectors to be multiplied and accumulated + \param num_points The number of complex values in aVector and bVector to be multiplied together, accumulated and stored into cVector + */ +static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse4_1(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points) { + + lv_8sc_t dotProduct; + memset(&dotProduct, 0x0, 2*sizeof(char)); + + const lv_8sc_t* a = input; + const lv_8sc_t* b = taps; + + const unsigned int sse_iters = num_points/8; + + if (sse_iters>0) + { + __m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc, realcacc, imagcacc; + + mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); + realcacc = _mm_setzero_si128(); + imagcacc = _mm_setzero_si128(); + + for(int number = 0; number < sse_iters; number++){ + + x = _mm_lddqu_si128((__m128i*)a); + y = _mm_lddqu_si128((__m128i*)b); + + imagx = _mm_srli_si128 (x, 1); + imagx = _mm_and_si128 (imagx, mult1); + realx = _mm_and_si128 (x, mult1); + + imagy = _mm_srli_si128 (y, 1); + imagy 
= _mm_and_si128 (imagy, mult1); + realy = _mm_and_si128 (y, mult1); + + realx_mult_realy = _mm_mullo_epi16 (realx, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); + realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); + + realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + realcacc = _mm_add_epi16 (realcacc, realc); + imagcacc = _mm_add_epi16 (imagcacc, imagc); + + a += 8; + b += 8; + } + + imagcacc = _mm_slli_si128 (imagcacc, 1); + + totalc = _mm_blendv_epi8 (imagcacc, realcacc, mult1); + + __VOLK_ATTR_ALIGNED(16) lv_8sc_t dotProductVector[8]; + + _mm_storeu_si128((__m128i*)dotProductVector,totalc); // Store the results back into the dot product vector + + for (int i = 0; i<8; ++i) + { + dotProduct += dotProductVector[i]; + } + } + + for (int i = 0; i<(num_points % 8); ++i) + { + dotProduct += (*a++) * (*b++); + } + + *result = dotProduct; +} + +#endif /*LV_HAVE_SSE4_1*/ + +#endif /*INCLUDED_volk_gnsssdr_8ic_x2_dot_prod_8ic_u_H*/ + + +#ifndef INCLUDED_volk_gnsssdr_8ic_x2_dot_prod_8ic_a_H +#define INCLUDED_volk_gnsssdr_8ic_x2_dot_prod_8ic_a_H + +#include +#include +#include +#include + + +#ifdef LV_HAVE_GENERIC +/*! 
+ \brief Multiplies the two input complex vectors and accumulates them, storing the result in the third vector + \param cVector The vector where the accumulated result will be stored + \param aVector One of the vectors to be multiplied and accumulated + \param bVector One of the vectors to be multiplied and accumulated + \param num_points The number of complex values in aVector and bVector to be multiplied together, accumulated and stored into cVector + */ +static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_generic(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points) { + + /*lv_8sc_t* cPtr = result; + const lv_8sc_t* aPtr = input; + const lv_8sc_t* bPtr = taps; + + for(int number = 0; number < num_points; number++){ + *cPtr += (*aPtr++) * (*bPtr++); + }*/ + + char * res = (char*) result; + char * in = (char*) input; + char * tp = (char*) taps; + unsigned int n_2_ccomplex_blocks = num_points/2; + unsigned int isodd = num_points & 1; + + char sum0[2] = {0,0}; + char sum1[2] = {0,0}; + unsigned int i = 0; + + for(i = 0; i < n_2_ccomplex_blocks; ++i) { + sum0[0] += in[0] * tp[0] - in[1] * tp[1]; + sum0[1] += in[0] * tp[1] + in[1] * tp[0]; + sum1[0] += in[2] * tp[2] - in[3] * tp[3]; + sum1[1] += in[2] * tp[3] + in[3] * tp[2]; + + in += 4; + tp += 4; + } + + res[0] = sum0[0] + sum1[0]; + res[1] = sum0[1] + sum1[1]; + + // Cleanup if we had an odd number of points + for(i = 0; i < isodd; ++i) { + *result += input[num_points - 1] * taps[num_points - 1]; + } +} + +#endif /*LV_HAVE_GENERIC*/ + +#ifdef LV_HAVE_SSE2 +#include "emmintrin.h" +/*! 
+ \brief Multiplies the two input complex vectors and accumulates them, storing the result in the third vector + \param cVector The vector where the accumulated result will be stored + \param aVector One of the vectors to be multiplied and accumulated + \param bVector One of the vectors to be multiplied and accumulated + \param num_points The number of complex values in aVector and bVector to be multiplied together, accumulated and stored into cVector + */ +static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_sse2(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points) { + + lv_8sc_t dotProduct; + memset(&dotProduct, 0x0, 2*sizeof(char)); + + const lv_8sc_t* a = input; + const lv_8sc_t* b = taps; + + const unsigned int sse_iters = num_points/8; + + if (sse_iters>0) + { + __m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc, realcacc, imagcacc; + + mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); + realcacc = _mm_setzero_si128(); + imagcacc = _mm_setzero_si128(); + + for(int number = 0; number < sse_iters; number++){ + + x = _mm_load_si128((__m128i*)a); + y = _mm_load_si128((__m128i*)b); + + imagx = _mm_srli_si128 (x, 1); + imagx = _mm_and_si128 (imagx, mult1); + realx = _mm_and_si128 (x, mult1); + + imagy = _mm_srli_si128 (y, 1); + imagy = _mm_and_si128 (imagy, mult1); + realy = _mm_and_si128 (y, mult1); + + realx_mult_realy = _mm_mullo_epi16 (realx, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); + realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); + + realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + realcacc = _mm_add_epi16 (realcacc, realc); + imagcacc = _mm_add_epi16 (imagcacc, imagc); + + a += 8; + b += 8; + } + + realcacc = _mm_and_si128 (realcacc, mult1); + 
imagcacc = _mm_and_si128 (imagcacc, mult1); + imagcacc = _mm_slli_si128 (imagcacc, 1); + + totalc = _mm_or_si128 (realcacc, imagcacc); + + __VOLK_ATTR_ALIGNED(16) lv_8sc_t dotProductVector[8]; + + _mm_store_si128((__m128i*)dotProductVector,totalc); // Store the results back into the dot product vector + + for (int i = 0; i<8; ++i) + { + dotProduct += dotProductVector[i]; + } + } + + for (int i = 0; i<(num_points % 8); ++i) + { + dotProduct += (*a++) * (*b++); + } + + *result = dotProduct; +} + +#endif /*LV_HAVE_SSE2*/ + +#ifdef LV_HAVE_SSE4_1 +#include "smmintrin.h" +/*! + \brief Multiplies the two input complex vectors and accumulates them, storing the result in the third vector + \param cVector The vector where the accumulated result will be stored + \param aVector One of the vectors to be multiplied and accumulated + \param bVector One of the vectors to be multiplied and accumulated + \param num_points The number of complex values in aVector and bVector to be multiplied together, accumulated and stored into cVector + */ +static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_sse4_1(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points) { + + lv_8sc_t dotProduct; + memset(&dotProduct, 0x0, 2*sizeof(char)); + + const lv_8sc_t* a = input; + const lv_8sc_t* b = taps; + + const unsigned int sse_iters = num_points/8; + + if (sse_iters>0) + { + __m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc, realcacc, imagcacc; + + mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); + realcacc = _mm_setzero_si128(); + imagcacc = _mm_setzero_si128(); + + for(int number = 0; number < sse_iters; number++){ + + x = _mm_load_si128((__m128i*)a); + y = _mm_load_si128((__m128i*)b); + + imagx = _mm_srli_si128 (x, 1); + imagx = _mm_and_si128 (imagx, mult1); + realx = _mm_and_si128 (x, mult1); + + imagy = _mm_srli_si128 (y, 1); + imagy = 
_mm_and_si128 (imagy, mult1); + realy = _mm_and_si128 (y, mult1); + + realx_mult_realy = _mm_mullo_epi16 (realx, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); + realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); + + realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + realcacc = _mm_add_epi16 (realcacc, realc); + imagcacc = _mm_add_epi16 (imagcacc, imagc); + + a += 8; + b += 8; + } + + imagcacc = _mm_slli_si128 (imagcacc, 1); + + totalc = _mm_blendv_epi8 (imagcacc, realcacc, mult1); + + __VOLK_ATTR_ALIGNED(16) lv_8sc_t dotProductVector[8]; + + _mm_store_si128((__m128i*)dotProductVector,totalc); // Store the results back into the dot product vector + + for (int i = 0; i<8; ++i) + { + dotProduct += dotProductVector[i]; + } + } + + for (int i = 0; i<(num_points % 8); ++i) + { + dotProduct += (*a++) * (*b++); + } + + *result = dotProduct; +} + +#endif /*LV_HAVE_SSE4_1*/ + +#ifdef LV_HAVE_ORC +/*! 
+ \brief Computes the dot product of two vectors of 8-bit complex samples by delegating to the ORC implementation
+ \param result Where the single complex result is stored
+ \param input One of the vectors to be multiplied and accumulated
+ \param taps The other vector to be multiplied and accumulated
+ \param num_points The number of complex values, forwarded unchanged to the ORC implementation
+ */
+extern void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_orc_impl(short* resRealShort, short* resImagShort, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points);
+static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_orc(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points){
+
+    // The ORC routine reports each component through a short; only one byte of each is used below.
+    // resRealChar / resImagChar point at the byte at offset 1 of the corresponding short.
+    // NOTE(review): which half of the 16-bit value that byte holds depends on the target's byte order
+    // and on what the ORC implementation writes -- confirm against the .orc source.
+    short resReal = 0;
+    char* resRealChar = (char*)&resReal;
+    resRealChar++;
+
+    short resImag = 0;
+    char* resImagChar = (char*)&resImag;
+    resImagChar++;
+
+    volk_gnsssdr_8ic_x2_dot_prod_8ic_a_orc_impl(&resReal, &resImag, input, taps, num_points);
+
+    // Rebuild the 8-bit complex result from the two selected bytes
+    *result = lv_cmake(*resRealChar, *resImagChar);
+}
+#endif /* LV_HAVE_ORC */
+
+#endif /*INCLUDED_volk_gnsssdr_8ic_x2_dot_prod_8ic_a_H*/
diff --git a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_multiply_8ic.h b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_multiply_8ic.h
new file mode 100644
index 000000000..f8af2eb82
--- /dev/null
+++ b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_multiply_8ic.h
@@ -0,0 +1,346 @@
+/*!
+ * \file volk_gnsssdr_8ic_x2_multiply_8ic.h
+ * \brief Volk protokernel: multiplies two 16 bits vectors
+ * \authors
+ *          - Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+ *
+ * + * Volk protokernel that multiplies two vectors of 16-bit complex samples (8 bits for the real part + * and 8 bits for the imaginary part) + * + * ------------------------------------------------------------------------- + * + * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) + * + * GNSS-SDR is a software defined Global Navigation + * Satellite Systems receiver + * + * This file is part of GNSS-SDR. + * + * GNSS-SDR is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * GNSS-SDR is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>. + * + * ------------------------------------------------------------------------- + */ + +#ifndef INCLUDED_volk_gnsssdr_8ic_x2_multiply_8ic_u_H +#define INCLUDED_volk_gnsssdr_8ic_x2_multiply_8ic_u_H + +#include +#include +#include + +#ifdef LV_HAVE_SSE2 +#include "emmintrin.h" +/*! 
+ \brief Multiplies the two input complex vectors and stores their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_gnsssdr_8ic_x2_multiply_8ic_u_sse2(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ + + const unsigned int sse_iters = num_points / 8; + + __m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc; + lv_8sc_t* c = cVector; + const lv_8sc_t* a = aVector; + const lv_8sc_t* b = bVector; + + mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); + + for(int number = 0;number < sse_iters; number++){ + + x = _mm_lddqu_si128((__m128i*)a); + y = _mm_lddqu_si128((__m128i*)b); + + imagx = _mm_srli_si128 (x, 1); + imagx = _mm_and_si128 (imagx, mult1); + realx = _mm_and_si128 (x, mult1); + + imagy = _mm_srli_si128 (y, 1); + imagy = _mm_and_si128 (imagy, mult1); + realy = _mm_and_si128 (y, mult1); + + realx_mult_realy = _mm_mullo_epi16 (realx, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); + realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); + + realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + realc = _mm_and_si128 (realc, mult1); + imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + imagc = _mm_and_si128 (imagc, mult1); + imagc = _mm_slli_si128 (imagc, 1); + + totalc = _mm_or_si128 (realc, imagc); + + _mm_storeu_si128((__m128i*)c, totalc); + + a += 8; + b += 8; + c += 8; + } + + for (int i = 0; i<(num_points % 8); ++i) + { + *c++ = (*a++) * (*b++); + } +} +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_SSE4_1 +#include 
"smmintrin.h" +/*! + \brief Multiplies the two input complex vectors and stores their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_gnsssdr_8ic_x2_multiply_8ic_u_sse4_1(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ + + const unsigned int sse_iters = num_points / 8; + + __m128i x, y, zero; + __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc; + lv_8sc_t* c = cVector; + const lv_8sc_t* a = aVector; + const lv_8sc_t* b = bVector; + + zero = _mm_setzero_si128(); + mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); + + for(int number = 0;number < sse_iters; number++){ + + x = _mm_lddqu_si128((__m128i*)a); + y = _mm_lddqu_si128((__m128i*)b); + + imagx = _mm_srli_si128 (x, 1); + imagx = _mm_and_si128 (imagx, mult1); + realx = _mm_and_si128 (x, mult1); + + imagy = _mm_srli_si128 (y, 1); + imagy = _mm_and_si128 (imagy, mult1); + realy = _mm_and_si128 (y, mult1); + + realx_mult_realy = _mm_mullo_epi16 (realx, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); + realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); + + realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + imagc = _mm_slli_si128 (imagc, 1); + + totalc = _mm_blendv_epi8 (imagc, realc, mult1); + + _mm_storeu_si128((__m128i*)c, totalc); + + a += 8; + b += 8; + c += 8; + } + + for (int i = 0; i<(num_points % 8); ++i) + { + *c++ = (*a++) * (*b++); + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#ifdef LV_HAVE_GENERIC +/*! 
+ \brief Multiplies the two input complex vectors and stores their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_gnsssdr_8ic_x2_multiply_8ic_generic(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ + lv_8sc_t* cPtr = cVector; + const lv_8sc_t* aPtr = aVector; + const lv_8sc_t* bPtr = bVector; + + for(int number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) * (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + +#endif /* INCLUDED_volk_gnsssdr_8ic_x2_multiply_8ic_u_H */ + + +#ifndef INCLUDED_volk_gnsssdr_8ic_x2_multiply_8ic_a_H +#define INCLUDED_volk_gnsssdr_8ic_x2_multiply_8ic_a_H + +#include +#include +#include + +#ifdef LV_HAVE_SSE2 +#include "emmintrin.h" +/*! 
+ \brief Multiplies the two input complex vectors and stores their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_gnsssdr_8ic_x2_multiply_8ic_a_sse2(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ + + const unsigned int sse_iters = num_points / 8; + + __m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc; + lv_8sc_t* c = cVector; + const lv_8sc_t* a = aVector; + const lv_8sc_t* b = bVector; + + mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); + + for(int number = 0;number < sse_iters; number++){ + + x = _mm_load_si128((__m128i*)a); + y = _mm_load_si128((__m128i*)b); + + imagx = _mm_srli_si128 (x, 1); + imagx = _mm_and_si128 (imagx, mult1); + realx = _mm_and_si128 (x, mult1); + + imagy = _mm_srli_si128 (y, 1); + imagy = _mm_and_si128 (imagy, mult1); + realy = _mm_and_si128 (y, mult1); + + realx_mult_realy = _mm_mullo_epi16 (realx, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); + realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); + + realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + realc = _mm_and_si128 (realc, mult1); + imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + imagc = _mm_and_si128 (imagc, mult1); + imagc = _mm_slli_si128 (imagc, 1); + + totalc = _mm_or_si128 (realc, imagc); + + _mm_store_si128((__m128i*)c, totalc); + + a += 8; + b += 8; + c += 8; + } + + for (int i = 0; i<(num_points % 8); ++i) + { + *c++ = (*a++) * (*b++); + } +} +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_SSE4_1 +#include 
"smmintrin.h" +/*! + \brief Multiplies the two input complex vectors and stores their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_gnsssdr_8ic_x2_multiply_8ic_a_sse4_1(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ + + const unsigned int sse_iters = num_points / 8; + + __m128i x, y, zero; + __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc; + lv_8sc_t* c = cVector; + const lv_8sc_t* a = aVector; + const lv_8sc_t* b = bVector; + + zero = _mm_setzero_si128(); + mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); + + for(int number = 0;number < sse_iters; number++){ + + x = _mm_load_si128((__m128i*)a); + y = _mm_load_si128((__m128i*)b); + + imagx = _mm_srli_si128 (x, 1); + imagx = _mm_and_si128 (imagx, mult1); + realx = _mm_and_si128 (x, mult1); + + imagy = _mm_srli_si128 (y, 1); + imagy = _mm_and_si128 (imagy, mult1); + realy = _mm_and_si128 (y, mult1); + + realx_mult_realy = _mm_mullo_epi16 (realx, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); + realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); + + realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + imagc = _mm_slli_si128 (imagc, 1); + + totalc = _mm_blendv_epi8 (imagc, realc, mult1); + + _mm_store_si128((__m128i*)c, totalc); + + a += 8; + b += 8; + c += 8; + } + + for (int i = 0; i<(num_points % 8); ++i) + { + *c++ = (*a++) * (*b++); + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#ifdef LV_HAVE_GENERIC +/*! 
+ \brief Multiplies the two input complex vectors and stores their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +static inline void volk_gnsssdr_8ic_x2_multiply_8ic_a_generic(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ + lv_8sc_t* cPtr = cVector; + const lv_8sc_t* aPtr = aVector; + const lv_8sc_t* bPtr = bVector; + + for(int number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) * (*bPtr++); + } + +} +#endif /* LV_HAVE_GENERIC */ + +#ifdef LV_HAVE_ORC +/*! + \brief Multiplies the two input complex vectors and stores their results in the third vector + \param cVector The vector where the results will be stored + \param aVector One of the vectors to be multiplied + \param bVector One of the vectors to be multiplied + \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector + */ +extern void volk_gnsssdr_8ic_x2_multiply_8ic_a_orc_impl(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points); +static inline void volk_gnsssdr_8ic_x2_multiply_8ic_u_orc(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ + volk_gnsssdr_8ic_x2_multiply_8ic_a_orc_impl(cVector, aVector, bVector, num_points); +} +#endif /* LV_HAVE_ORC */ + +#endif /* INCLUDED_volk_gnsssdr_8ic_x2_multiply_8ic_a_H */ diff --git a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3.h b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3.h new file mode 100644 index 000000000..b231f0f85 --- /dev/null +++ 
b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3.h @@ -0,0 +1,613 @@ +/*! + * \file volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3.h + * \brief Volk protokernel: performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation with 16 bits vectors, and accumulates the results into float32. + * \authors
+ *          - Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+ *
+ * + * Volk protokernel that performs the carrier wipe-off mixing and the + * Early, Prompt, and Late correlation with 16 bits vectors (8 bits the + * real part and 8 bits the imaginary part), and accumulates the result + * in 32-bit single-precision floating-point values, returning float32 values: + * - The carrier wipe-off is done by multiplying the input signal by the + * carrier (multiplication of 16 bits vectors). It returns the input + * signal in base band (BB) + * - Early values are calculated by multiplying the input signal in BB by the + * early code (multiplication of 16 bits vectors), accumulating the results into float32 values + * - Prompt values are calculated by multiplying the input signal in BB by the + * prompt code (multiplication of 16 bits vectors), accumulating the results into float32 values + * - Late values are calculated by multiplying the input signal in BB by the + * late code (multiplication of 16 bits vectors), accumulating the results into float32 values + * + * ------------------------------------------------------------------------- + * + * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) + * + * GNSS-SDR is a software defined Global Navigation + * Satellite Systems receiver + * + * This file is part of GNSS-SDR. + * + * GNSS-SDR is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * GNSS-SDR is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>. 
+ *
+ * -------------------------------------------------------------------------
+ */
+
+#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_u_H
+#define INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_u_H
+
+#include
+#include
+#include
+#include
+#include
+
+#ifdef LV_HAVE_SSE4_1
+#include "smmintrin.h"
+#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
+#include "CommonMacros/CommonMacros.h"
+/*!
+ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (SSE4.1, unaligned loads)
+ \param input The input signal input
+ \param carrier The carrier signal input
+ \param E_code Early PRN code replica input
+ \param P_code Prompt PRN code replica input
+ \param L_code Late PRN code replica input
+ \param E_out Early correlation output (a single value, reset to 0 before accumulating)
+ \param P_out Prompt correlation output (a single value, reset to 0 before accumulating)
+ \param L_out Late correlation output (a single value, reset to 0 before accumulating)
+ \param num_points The number of complex values in vectors
+ */
+static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_u_sse4_1(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points)
+{
+    // Each 128-bit load consumes 8 complex samples
+    const unsigned int sse_iters = num_points / 8;
+
+    // Temporaries handed by name to the CM_* macros below.
+    // NOTE(review): `output` is declared here but is not referenced anywhere else in the visible body of this function.
+    __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample;
+    __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output;
+
+    __m128 E_code_acc, P_code_acc, L_code_acc;
+    __m128i input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2;
+    __m128 output_ps;
+
+    const lv_8sc_t* input_ptr = input;
+    const lv_8sc_t* carrier_ptr = carrier;
+
+    const lv_8sc_t* E_code_ptr = E_code;
+    lv_32fc_t* E_out_ptr = E_out;
+    const lv_8sc_t* L_code_ptr = L_code;
+    lv_32fc_t* L_out_ptr = L_out;
+    const lv_8sc_t* P_code_ptr = P_code;
+    lv_32fc_t* P_out_ptr = P_out;
+
+    // The outputs are accumulated into, so they start from zero
+    *E_out_ptr = 0;
+    *P_out_ptr = 0;
+    *L_out_ptr = 0;
+
+    // 0xFF in every even byte: low-byte mask for each 16-bit lane
+    mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
+
+    E_code_acc = _mm_setzero_ps();
+    L_code_acc = _mm_setzero_ps();
+    P_code_acc = _mm_setzero_ps();
+
+    if (sse_iters>0)
+    {
+        for(int number = 0;number < sse_iters; number++){
+
+            //Perform the carrier wipe-off
+            x = _mm_lddqu_si128((__m128i*)input_ptr);
+            y = _mm_lddqu_si128((__m128i*)carrier_ptr);
+
+            // The CM_* macros come from the CommonMacros headers included above; their bodies are not visible here.
+            // NOTE(review): they appear to write their results into the trailing arguments
+            // (realx/imagx, real_bb_signal_sample/imag_bb_signal_sample, output_ps) -- confirm against the macro definitions.
+            CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx)
+            CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)
+
+            CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
+
+            //Get early values
+            y = _mm_lddqu_si128((__m128i*)E_code_ptr);
+
+            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
+
+            E_code_acc = _mm_add_ps (E_code_acc, output_ps);
+
+            //Get prompt values
+            y = _mm_lddqu_si128((__m128i*)P_code_ptr);
+
+            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
+
+            P_code_acc = _mm_add_ps (P_code_acc, output_ps);
+
+            //Get late values
+            y = _mm_lddqu_si128((__m128i*)L_code_ptr);
+
+            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
+
+            L_code_acc = _mm_add_ps (L_code_acc, output_ps);
+
+            input_ptr += 8;
+            carrier_ptr += 8;
+            E_code_ptr += 8;
+            P_code_ptr += 8;
+            L_code_ptr += 8;
+        }
+
+        // Each __m128 accumulator is spilled into a two-element lv_32fc_t array and the two elements are summed.
+        // NOTE(review): this assumes one lv_32fc_t occupies two floats -- confirm against the type definition.
+        __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2];
+        __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2];
+        __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2];
+
+        _mm_storeu_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector
+        _mm_storeu_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector
+        _mm_storeu_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector
+
+        for (int i = 0; i<2; ++i)
+        {
+            *E_out_ptr += E_dotProductVector[i];
+            *P_out_ptr += P_dotProductVector[i];
+            *L_out_ptr += L_dotProductVector[i];
+        }
+    }
+
+    // Scalar tail for the samples left over after the last full block of 8.
+    // NOTE(review): both factors of each product below are lv_8sc_t and the cast to lv_32fc_t is applied
+    // to the product, not to the operands -- confirm this matches the intended precision.
+    lv_8sc_t bb_signal_sample;
+    for(int i=0; i < num_points%8; ++i)
+    {
+        //Perform the carrier wipe-off
+        bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
+        // Now get early, late, and prompt values for each
+        *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
+        *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
+        *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
+    }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#ifdef LV_HAVE_SSE2
+#include "emmintrin.h"
+#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
+#include "CommonMacros/CommonMacros.h"
+/*!
+ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation + \param input The input signal input + \param carrier The carrier signal input + \param E_code Early PRN code replica input + \param P_code Early PRN code replica input + \param L_code Early PRN code replica input + \param E_out Early correlation output + \param P_out Early correlation output + \param L_out Early correlation output + \param num_points The number of complex values in vectors + */ +static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_u_sse2(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points) +{ + const unsigned int sse_iters = num_points / 8; + + __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample; + __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output; + + __m128 E_code_acc, P_code_acc, L_code_acc; + __m128i input_i_1, input_i_2, output_i32; + __m128 output_ps_1, output_ps_2; + + const lv_8sc_t* input_ptr = input; + const lv_8sc_t* carrier_ptr = carrier; + + const lv_8sc_t* E_code_ptr = E_code; + lv_32fc_t* E_out_ptr = E_out; + const lv_8sc_t* L_code_ptr = L_code; + lv_32fc_t* L_out_ptr = L_out; + const lv_8sc_t* P_code_ptr = P_code; + lv_32fc_t* P_out_ptr = P_out; + + *E_out_ptr = 0; + *P_out_ptr = 0; + *L_out_ptr = 0; + + mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); + + E_code_acc = _mm_setzero_ps(); + L_code_acc = _mm_setzero_ps(); + P_code_acc = _mm_setzero_ps(); + + if (sse_iters>0) + { + for(int number = 0;number < sse_iters; number++){ + + //Perform the carrier wipe-off + x = _mm_lddqu_si128((__m128i*)input_ptr); + y = _mm_lddqu_si128((__m128i*)carrier_ptr); + + CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx) + 
CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy) + + CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample) + + //Get early values + y = _mm_lddqu_si128((__m128i*)E_code_ptr); + + CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2) + + E_code_acc = _mm_add_ps (E_code_acc, output_ps_1); + E_code_acc = _mm_add_ps (E_code_acc, output_ps_2); + + //Get prompt values + y = _mm_lddqu_si128((__m128i*)P_code_ptr); + + CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2) + + P_code_acc = _mm_add_ps (P_code_acc, output_ps_1); + P_code_acc = _mm_add_ps (P_code_acc, output_ps_2); + + //Get late values + y = _mm_lddqu_si128((__m128i*)L_code_ptr); + + CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2) + + L_code_acc = _mm_add_ps (L_code_acc, output_ps_1); + L_code_acc = _mm_add_ps (L_code_acc, output_ps_2); + + input_ptr += 8; + carrier_ptr += 8; + E_code_ptr += 8; + P_code_ptr += 8; + L_code_ptr += 8; + } + + __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2]; + __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2]; + __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2]; + + _mm_storeu_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector + 
_mm_storeu_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector + + for (int i = 0; i<2; ++i) + { + *E_out_ptr += E_dotProductVector[i]; + *P_out_ptr += P_dotProductVector[i]; + *L_out_ptr += L_dotProductVector[i]; + } + } + + lv_8sc_t bb_signal_sample; + for(int i=0; i < num_points%8; ++i) + { + //Perform the carrier wipe-off + bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); + // Now get early, late, and prompt values for each + *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); + *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); + *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); + } +} +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation + \param input The input signal input + \param carrier The carrier signal input + \param E_code Early PRN code replica input + \param P_code Early PRN code replica input + \param L_code Early PRN code replica input + \param E_out Early correlation output + \param P_out Early correlation output + \param L_out Early correlation output + \param num_points The number of complex values in vectors + */ +static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_generic(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points) +{ + lv_8sc_t bb_signal_sample; + + bb_signal_sample = lv_cmake(0, 0); + + *E_out = 0; + *P_out = 0; + *L_out = 0; + // perform Early, Prompt and Late correlation + for(int i=0; i < num_points; ++i) + { + //Perform the carrier wipe-off + bb_signal_sample = input[i] * carrier[i]; + // Now get early, late, and prompt values for each + *E_out += (lv_32fc_t) (bb_signal_sample * 
E_code[i]); + *P_out += (lv_32fc_t) (bb_signal_sample * P_code[i]); + *L_out += (lv_32fc_t) (bb_signal_sample * L_code[i]); + } +} + +#endif /* LV_HAVE_GENERIC */ + +#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_u_H */ + + +#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_a_H +#define INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_a_H + +#include +#include +#include +#include +#include + +#ifdef LV_HAVE_SSE4_1 +#include "smmintrin.h" +#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h" +#include "CommonMacros/CommonMacros.h" +/*! + \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation + \param input The input signal input + \param carrier The carrier signal input + \param E_code Early PRN code replica input + \param P_code Prompt PRN code replica input + \param L_code Late PRN code replica input + \param E_out Early correlation output + \param P_out Prompt correlation output + \param L_out Late correlation output + \param num_points The number of complex values in vectors + \note The five input vectors are read with _mm_load_si128 (aligned load), 8 complex samples per iteration; the remaining num_points%8 samples are handled by the scalar loop at the end. The three outputs are zeroed before accumulation. + */ +static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_a_sse4_1(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points) +{ + const unsigned int sse_iters = num_points / 8; + + __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample; + __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output; + + __m128 E_code_acc, P_code_acc, L_code_acc; + __m128i input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2; + __m128 output_ps; + + const lv_8sc_t* input_ptr = input; + const lv_8sc_t* carrier_ptr = carrier; + + const lv_8sc_t* E_code_ptr = E_code; + lv_32fc_t* E_out_ptr = E_out; + const lv_8sc_t* L_code_ptr = L_code; + lv_32fc_t* L_out_ptr = L_out; + const lv_8sc_t* 
P_code_ptr = P_code; + lv_32fc_t* P_out_ptr = P_out; + + *E_out_ptr = 0; + *P_out_ptr = 0; + *L_out_ptr = 0; + + mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); + + E_code_acc = _mm_setzero_ps(); + L_code_acc = _mm_setzero_ps(); + P_code_acc = _mm_setzero_ps(); + + if (sse_iters>0) + { + for(int number = 0;number < sse_iters; number++){ + + //Perform the carrier wipe-off + x = _mm_load_si128((__m128i*)input_ptr); + y = _mm_load_si128((__m128i*)carrier_ptr); + + CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx) + CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy) + + CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample) + + //Get early values + y = _mm_load_si128((__m128i*)E_code_ptr); + + CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps) + + E_code_acc = _mm_add_ps (E_code_acc, output_ps); + + //Get prompt values + y = _mm_load_si128((__m128i*)P_code_ptr); + + CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps) + + P_code_acc = _mm_add_ps (P_code_acc, output_ps); + + //Get late values + y = _mm_load_si128((__m128i*)L_code_ptr); + + CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps) + 
+ L_code_acc = _mm_add_ps (L_code_acc, output_ps); + + input_ptr += 8; + carrier_ptr += 8; + E_code_ptr += 8; + P_code_ptr += 8; + L_code_ptr += 8; + } + + __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2]; + __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2]; + __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2]; + + _mm_store_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector + _mm_store_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector + _mm_store_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector + + for (int i = 0; i<2; ++i) + { + *E_out_ptr += E_dotProductVector[i]; + *P_out_ptr += P_dotProductVector[i]; + *L_out_ptr += L_dotProductVector[i]; + } + } + + lv_8sc_t bb_signal_sample; + for(int i=0; i < num_points%8; ++i) + { + //Perform the carrier wipe-off + bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); + // Now get early, late, and prompt values for each + *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); + *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); + *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#ifdef LV_HAVE_SSE2 +#include "emmintrin.h" +#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h" +#include "CommonMacros/CommonMacros.h" +/*! 
+ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation + \param input The input signal input + \param carrier The carrier signal input + \param E_code Early PRN code replica input + \param P_code Early PRN code replica input + \param L_code Early PRN code replica input + \param E_out Early correlation output + \param P_out Early correlation output + \param L_out Early correlation output + \param num_points The number of complex values in vectors + */ +static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_a_sse2(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points) +{ + const unsigned int sse_iters = num_points / 8; + + __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample; + __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output; + + __m128 E_code_acc, P_code_acc, L_code_acc; + __m128i input_i_1, input_i_2, output_i32; + __m128 output_ps_1, output_ps_2; + + const lv_8sc_t* input_ptr = input; + const lv_8sc_t* carrier_ptr = carrier; + + const lv_8sc_t* E_code_ptr = E_code; + lv_32fc_t* E_out_ptr = E_out; + const lv_8sc_t* L_code_ptr = L_code; + lv_32fc_t* L_out_ptr = L_out; + const lv_8sc_t* P_code_ptr = P_code; + lv_32fc_t* P_out_ptr = P_out; + + *E_out_ptr = 0; + *P_out_ptr = 0; + *L_out_ptr = 0; + + mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); + + E_code_acc = _mm_setzero_ps(); + L_code_acc = _mm_setzero_ps(); + P_code_acc = _mm_setzero_ps(); + + if (sse_iters>0) + { + for(int number = 0;number < sse_iters; number++){ + + //Perform the carrier wipe-off + x = _mm_load_si128((__m128i*)input_ptr); + y = _mm_load_si128((__m128i*)carrier_ptr); + + CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx) + 
CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy) + + CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample) + + //Get early values + y = _mm_load_si128((__m128i*)E_code_ptr); + + CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2) + + E_code_acc = _mm_add_ps (E_code_acc, output_ps_1); + E_code_acc = _mm_add_ps (E_code_acc, output_ps_2); + + //Get prompt values + y = _mm_load_si128((__m128i*)P_code_ptr); + + CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2) + + P_code_acc = _mm_add_ps (P_code_acc, output_ps_1); + P_code_acc = _mm_add_ps (P_code_acc, output_ps_2); + + //Get late values + y = _mm_load_si128((__m128i*)L_code_ptr); + + CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2) + + L_code_acc = _mm_add_ps (L_code_acc, output_ps_1); + L_code_acc = _mm_add_ps (L_code_acc, output_ps_2); + + input_ptr += 8; + carrier_ptr += 8; + E_code_ptr += 8; + P_code_ptr += 8; + L_code_ptr += 8; + } + + __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2]; + __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2]; + __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2]; + + _mm_store_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector + 
_mm_store_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector + _mm_store_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector + + for (int i = 0; i<2; ++i) + { + *E_out_ptr += E_dotProductVector[i]; + *P_out_ptr += P_dotProductVector[i]; + *L_out_ptr += L_dotProductVector[i]; + } + } + + lv_8sc_t bb_signal_sample; + for(int i=0; i < num_points%8; ++i) + { + //Perform the carrier wipe-off + bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); + // Now get early, late, and prompt values for each + *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); + *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); + *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); + } +} +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation + \param input The input signal input + \param carrier The carrier signal input + \param E_code Early PRN code replica input + \param P_code Early PRN code replica input + \param L_code Early PRN code replica input + \param E_out Early correlation output + \param P_out Early correlation output + \param L_out Early correlation output + \param num_points The number of complex values in vectors + */ +static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_a_generic(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points) +{ + lv_8sc_t bb_signal_sample; + + bb_signal_sample = lv_cmake(0, 0); + + *E_out = 0; + *P_out = 0; + *L_out = 0; + // perform Early, Prompt and Late correlation + for(int i=0; i < num_points; ++i) + { + //Perform the carrier wipe-off + bb_signal_sample = input[i] * carrier[i]; + // Now get early, late, and prompt values for each + *E_out += (lv_32fc_t) (bb_signal_sample * 
E_code[i]); + *P_out += (lv_32fc_t) (bb_signal_sample * P_code[i]); + *L_out += (lv_32fc_t) (bb_signal_sample * L_code[i]); + } +} + +#endif /* LV_HAVE_GENERIC */ +#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_a_H */ \ No newline at end of file diff --git a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3.h b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3.h new file mode 100644 index 000000000..b58931d8a --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3.h @@ -0,0 +1,874 @@ +/*! + * \file volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3.h + * \brief Volk protokernel: performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation with 16 bits vectors + * \authors
    + *
  • Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com + *
+ * + * Volk protokernel that performs the carrier wipe-off mixing and the + * Early, Prompt, and Late correlation with 16-bit vectors (8 bits for the + * real part and 8 bits for the imaginary part): + * - The carrier wipe-off is done by multiplying the input signal by the + * carrier (multiplication of 16-bit vectors). It returns the input + * signal in baseband (BB) + * - Early values are calculated by multiplying the input signal in BB by the + * early code (multiplication of 16-bit vectors), accumulating the results + * - Prompt values are calculated by multiplying the input signal in BB by the + * prompt code (multiplication of 16-bit vectors), accumulating the results + * - Late values are calculated by multiplying the input signal in BB by the + * late code (multiplication of 16-bit vectors), accumulating the results + * + * ------------------------------------------------------------------------- + * + * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) + * + * GNSS-SDR is a software defined Global Navigation + * Satellite Systems receiver + * + * This file is part of GNSS-SDR. + * + * GNSS-SDR is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * GNSS-SDR is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>. 
+ * + * ------------------------------------------------------------------------- + */ + +#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_u_H +#define INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_u_H + +#include +#include +#include +#include +#include + +#ifdef LV_HAVE_SSE4_1 +#include "smmintrin.h" + /*! + \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation + \param input The input signal input + \param carrier The carrier signal input + \param E_code Early PRN code replica input + \param P_code Early PRN code replica input + \param L_code Early PRN code replica input + \param E_out Early correlation output + \param P_out Early correlation output + \param L_out Early correlation output + \param num_points The number of complex values in vectors + */ +static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_u_sse4_1(lv_8sc_t* E_out, lv_8sc_t* P_out, lv_8sc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points) +{ + const unsigned int sse_iters = num_points / 8; + + __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample, real_E_code_acc, imag_E_code_acc, real_L_code_acc, imag_L_code_acc, real_P_code_acc, imag_P_code_acc; + __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output; + + const lv_8sc_t* input_ptr = input; + const lv_8sc_t* carrier_ptr = carrier; + + const lv_8sc_t* E_code_ptr = E_code; + lv_8sc_t* E_out_ptr = E_out; + const lv_8sc_t* L_code_ptr = L_code; + lv_8sc_t* L_out_ptr = L_out; + const lv_8sc_t* P_code_ptr = P_code; + lv_8sc_t* P_out_ptr = P_out; + + *E_out_ptr = 0; + *P_out_ptr = 0; + *L_out_ptr = 0; + + mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); + + real_E_code_acc = _mm_setzero_si128(); + imag_E_code_acc = _mm_setzero_si128(); + real_L_code_acc = 
_mm_setzero_si128(); + imag_L_code_acc = _mm_setzero_si128(); + real_P_code_acc = _mm_setzero_si128(); + imag_P_code_acc = _mm_setzero_si128(); + + if (sse_iters>0) + { + for(int number = 0;number < sse_iters; number++){ + + //Perform the carrier wipe-off + x = _mm_lddqu_si128((__m128i*)input_ptr); + y = _mm_lddqu_si128((__m128i*)carrier_ptr); + + imagx = _mm_srli_si128 (x, 1); + imagx = _mm_and_si128 (imagx, mult1); + realx = _mm_and_si128 (x, mult1); + + imagy = _mm_srli_si128 (y, 1); + imagy = _mm_and_si128 (imagy, mult1); + realy = _mm_and_si128 (y, mult1); + + realx_mult_realy = _mm_mullo_epi16 (realx, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); + realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); + + real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + //Get early values + y = _mm_lddqu_si128((__m128i*)E_code_ptr); + + imagy = _mm_srli_si128 (y, 1); + imagy = _mm_and_si128 (imagy, mult1); + realy = _mm_and_si128 (y, mult1); + + realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); + realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); + + real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + real_E_code_acc = _mm_add_epi16 (real_E_code_acc, real_output); + imag_E_code_acc = _mm_add_epi16 (imag_E_code_acc, imag_output); + + //Get late values + y = _mm_lddqu_si128((__m128i*)L_code_ptr); + + imagy = _mm_srli_si128 (y, 1); + imagy = _mm_and_si128 (imagy, mult1); + realy = _mm_and_si128 (y, mult1); + + realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); + 
realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); + + real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + real_L_code_acc = _mm_add_epi16 (real_L_code_acc, real_output); + imag_L_code_acc = _mm_add_epi16 (imag_L_code_acc, imag_output); + + //Get prompt values + y = _mm_lddqu_si128((__m128i*)P_code_ptr); + + imagy = _mm_srli_si128 (y, 1); + imagy = _mm_and_si128 (imagy, mult1); + realy = _mm_and_si128 (y, mult1); + + realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); + realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); + + real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + real_P_code_acc = _mm_add_epi16 (real_P_code_acc, real_output); + imag_P_code_acc = _mm_add_epi16 (imag_P_code_acc, imag_output); + + input_ptr += 8; + carrier_ptr += 8; + E_code_ptr += 8; + L_code_ptr += 8; + P_code_ptr += 8; + } + + __VOLK_ATTR_ALIGNED(16) lv_8sc_t E_dotProductVector[8]; + __VOLK_ATTR_ALIGNED(16) lv_8sc_t L_dotProductVector[8]; + __VOLK_ATTR_ALIGNED(16) lv_8sc_t P_dotProductVector[8]; + + imag_E_code_acc = _mm_slli_si128 (imag_E_code_acc, 1); + output = _mm_blendv_epi8 (imag_E_code_acc, real_E_code_acc, mult1); + _mm_storeu_si128((__m128i*)E_dotProductVector, output); + + imag_L_code_acc = _mm_slli_si128 (imag_L_code_acc, 1); + output = _mm_blendv_epi8 (imag_L_code_acc, real_L_code_acc, mult1); + _mm_storeu_si128((__m128i*)L_dotProductVector, output); + + imag_P_code_acc = _mm_slli_si128 (imag_P_code_acc, 1); + output = _mm_blendv_epi8 (imag_P_code_acc, real_P_code_acc, mult1); + _mm_storeu_si128((__m128i*)P_dotProductVector, output); + + for (int i = 0; 
i<8; ++i) + { + *E_out_ptr += E_dotProductVector[i]; + *L_out_ptr += L_dotProductVector[i]; + *P_out_ptr += P_dotProductVector[i]; + } + } + + lv_8sc_t bb_signal_sample; + for(int i=0; i < num_points%8; ++i) + { + //Perform the carrier wipe-off + bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); + // Now get early, late, and prompt values for each + *E_out_ptr += bb_signal_sample * (*E_code_ptr++); + *P_out_ptr += bb_signal_sample * (*P_code_ptr++); + *L_out_ptr += bb_signal_sample * (*L_code_ptr++); + } +} + +#endif /* LV_HAVE_SSE4_1 */ + +#ifdef LV_HAVE_SSE2 +#include "emmintrin.h" +/*! + \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation + \param input The input signal input + \param carrier The carrier signal input + \param E_code Early PRN code replica input + \param P_code Prompt PRN code replica input + \param L_code Late PRN code replica input + \param E_out Early correlation output + \param P_out Prompt correlation output + \param L_out Late correlation output + \param num_points The number of complex values in vectors + \note Unaligned 16-byte loads use _mm_loadu_si128, which is provided by SSE2 (emmintrin.h). _mm_lddqu_si128 is an SSE3 intrinsic (pmmintrin.h) and must not be used in this SSE2-only kernel. + \note 8 complex samples are processed per iteration; the remaining num_points%8 samples are handled by the scalar loop at the end. The three outputs are zeroed before accumulation. + */ +static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_u_sse2(lv_8sc_t* E_out, lv_8sc_t* P_out, lv_8sc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points) +{ + const unsigned int sse_iters = num_points / 8; + + __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample, real_E_code_acc, imag_E_code_acc, real_L_code_acc, imag_L_code_acc, real_P_code_acc, imag_P_code_acc; + __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output; + + const lv_8sc_t* input_ptr = input; + const lv_8sc_t* carrier_ptr = carrier; + + const lv_8sc_t* E_code_ptr = E_code; + lv_8sc_t* E_out_ptr = E_out; + const lv_8sc_t* L_code_ptr = L_code; + lv_8sc_t* L_out_ptr = L_out; + const lv_8sc_t* P_code_ptr = P_code; + lv_8sc_t* 
P_out_ptr = P_out; + + *E_out_ptr = 0; + *P_out_ptr = 0; + *L_out_ptr = 0; + + mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); + + real_E_code_acc = _mm_setzero_si128(); + imag_E_code_acc = _mm_setzero_si128(); + real_L_code_acc = _mm_setzero_si128(); + imag_L_code_acc = _mm_setzero_si128(); + real_P_code_acc = _mm_setzero_si128(); + imag_P_code_acc = _mm_setzero_si128(); + + if (sse_iters>0) + { + for(int number = 0;number < sse_iters; number++){ + + //Perform the carrier wipe-off + x = _mm_loadu_si128((__m128i*)input_ptr); + y = _mm_loadu_si128((__m128i*)carrier_ptr); + + imagx = _mm_srli_si128 (x, 1); + imagx = _mm_and_si128 (imagx, mult1); + realx = _mm_and_si128 (x, mult1); + + imagy = _mm_srli_si128 (y, 1); + imagy = _mm_and_si128 (imagy, mult1); + realy = _mm_and_si128 (y, mult1); + + realx_mult_realy = _mm_mullo_epi16 (realx, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); + realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); + + real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + //Get early values + y = _mm_loadu_si128((__m128i*)E_code_ptr); + + imagy = _mm_srli_si128 (y, 1); + imagy = _mm_and_si128 (imagy, mult1); + realy = _mm_and_si128 (y, mult1); + + realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); + realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); + + real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + real_E_code_acc = _mm_add_epi16 (real_E_code_acc, real_output); + imag_E_code_acc = _mm_add_epi16 (imag_E_code_acc, imag_output); + + //Get late values + y = 
_mm_loadu_si128((__m128i*)L_code_ptr); + + imagy = _mm_srli_si128 (y, 1); + imagy = _mm_and_si128 (imagy, mult1); + realy = _mm_and_si128 (y, mult1); + + realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); + realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); + + real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + real_L_code_acc = _mm_add_epi16 (real_L_code_acc, real_output); + imag_L_code_acc = _mm_add_epi16 (imag_L_code_acc, imag_output); + + //Get prompt values + y = _mm_loadu_si128((__m128i*)P_code_ptr); + + imagy = _mm_srli_si128 (y, 1); + imagy = _mm_and_si128 (imagy, mult1); + realy = _mm_and_si128 (y, mult1); + + realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); + realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); + + real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + real_P_code_acc = _mm_add_epi16 (real_P_code_acc, real_output); + imag_P_code_acc = _mm_add_epi16 (imag_P_code_acc, imag_output); + + input_ptr += 8; + carrier_ptr += 8; + E_code_ptr += 8; + L_code_ptr += 8; + P_code_ptr += 8; + } + + __VOLK_ATTR_ALIGNED(16) lv_8sc_t E_dotProductVector[8]; + __VOLK_ATTR_ALIGNED(16) lv_8sc_t L_dotProductVector[8]; + __VOLK_ATTR_ALIGNED(16) lv_8sc_t P_dotProductVector[8]; + + real_E_code_acc = _mm_and_si128 (real_E_code_acc, mult1); + imag_E_code_acc = _mm_and_si128 (imag_E_code_acc, mult1); + imag_E_code_acc = _mm_slli_si128 (imag_E_code_acc, 1); + output = _mm_or_si128 (real_E_code_acc, imag_E_code_acc); + _mm_storeu_si128((__m128i*)E_dotProductVector, 
output); + + real_L_code_acc = _mm_and_si128 (real_L_code_acc, mult1); + imag_L_code_acc = _mm_and_si128 (imag_L_code_acc, mult1); + imag_L_code_acc = _mm_slli_si128 (imag_L_code_acc, 1); + output = _mm_or_si128 (real_L_code_acc, imag_L_code_acc); + _mm_storeu_si128((__m128i*)L_dotProductVector, output); + + real_P_code_acc = _mm_and_si128 (real_P_code_acc, mult1); + imag_P_code_acc = _mm_and_si128 (imag_P_code_acc, mult1); + imag_P_code_acc = _mm_slli_si128 (imag_P_code_acc, 1); + output = _mm_or_si128 (real_P_code_acc, imag_P_code_acc); + _mm_storeu_si128((__m128i*)P_dotProductVector, output); + + for (int i = 0; i<8; ++i) + { + *E_out_ptr += E_dotProductVector[i]; + *L_out_ptr += L_dotProductVector[i]; + *P_out_ptr += P_dotProductVector[i]; + } + } + + lv_8sc_t bb_signal_sample; + for(int i=0; i < num_points%8; ++i) + { + //Perform the carrier wipe-off + bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); + // Now get early, late, and prompt values for each + *E_out_ptr += bb_signal_sample * (*E_code_ptr++); + *P_out_ptr += bb_signal_sample * (*P_code_ptr++); + *L_out_ptr += bb_signal_sample * (*L_code_ptr++); + } +} + +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_GENERIC +/*! 
+ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation + \param input The input signal input + \param carrier The carrier signal input + \param E_code Early PRN code replica input + \param P_code Prompt PRN code replica input + \param L_code Late PRN code replica input + \param E_out Early correlation output + \param P_out Prompt correlation output + \param L_out Late correlation output + \param num_points The number of complex values in vectors + \note E_out, P_out and L_out are zeroed first and then accumulate one product per input sample. + */ +static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_generic(lv_8sc_t* E_out, lv_8sc_t* P_out, lv_8sc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points) +{ + lv_8sc_t bb_signal_sample; + + bb_signal_sample = lv_cmake(0, 0); + + *E_out = 0; + *P_out = 0; + *L_out = 0; + // perform Early, Prompt and Late correlation (unsigned index matches the type of num_points) + for(unsigned int i=0; i < num_points; ++i) + { + //Perform the carrier wipe-off + bb_signal_sample = input[i] * carrier[i]; + // Now get early, late, and prompt values for each + *E_out += bb_signal_sample * E_code[i]; + *P_out += bb_signal_sample * P_code[i]; + *L_out += bb_signal_sample * L_code[i]; + } +} + +#endif /* LV_HAVE_GENERIC */ + +#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_u_H */ + + +#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_a_H +#define INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_a_H + +#include +#include +#include +#include +#include + +#ifdef LV_HAVE_SSE4_1 +#include "smmintrin.h" +/*! 
+ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation + \param input The input signal input + \param carrier The carrier signal input + \param E_code Early PRN code replica input + \param P_code Early PRN code replica input + \param L_code Early PRN code replica input + \param E_out Early correlation output + \param P_out Early correlation output + \param L_out Early correlation output + \param num_points The number of complex values in vectors + */ +static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_a_sse4_1(lv_8sc_t* E_out, lv_8sc_t* P_out, lv_8sc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points) +{ + const unsigned int sse_iters = num_points / 8; + + __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample, real_E_code_acc, imag_E_code_acc, real_L_code_acc, imag_L_code_acc, real_P_code_acc, imag_P_code_acc; + __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output; + + const lv_8sc_t* input_ptr = input; + const lv_8sc_t* carrier_ptr = carrier; + + const lv_8sc_t* E_code_ptr = E_code; + lv_8sc_t* E_out_ptr = E_out; + const lv_8sc_t* L_code_ptr = L_code; + lv_8sc_t* L_out_ptr = L_out; + const lv_8sc_t* P_code_ptr = P_code; + lv_8sc_t* P_out_ptr = P_out; + + *E_out_ptr = 0; + *P_out_ptr = 0; + *L_out_ptr = 0; + + mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); + + real_E_code_acc = _mm_setzero_si128(); + imag_E_code_acc = _mm_setzero_si128(); + real_L_code_acc = _mm_setzero_si128(); + imag_L_code_acc = _mm_setzero_si128(); + real_P_code_acc = _mm_setzero_si128(); + imag_P_code_acc = _mm_setzero_si128(); + + if (sse_iters>0) + { + for(int number = 0;number < sse_iters; number++){ + + //Perform the carrier wipe-off + x = _mm_load_si128((__m128i*)input_ptr); + y = 
_mm_load_si128((__m128i*)carrier_ptr); + + imagx = _mm_srli_si128 (x, 1); + imagx = _mm_and_si128 (imagx, mult1); + realx = _mm_and_si128 (x, mult1); + + imagy = _mm_srli_si128 (y, 1); + imagy = _mm_and_si128 (imagy, mult1); + realy = _mm_and_si128 (y, mult1); + + realx_mult_realy = _mm_mullo_epi16 (realx, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); + realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); + + real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + //Get early values + y = _mm_load_si128((__m128i*)E_code_ptr); + + imagy = _mm_srli_si128 (y, 1); + imagy = _mm_and_si128 (imagy, mult1); + realy = _mm_and_si128 (y, mult1); + + realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); + realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); + + real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + real_E_code_acc = _mm_add_epi16 (real_E_code_acc, real_output); + imag_E_code_acc = _mm_add_epi16 (imag_E_code_acc, imag_output); + + //Get late values + y = _mm_load_si128((__m128i*)L_code_ptr); + + imagy = _mm_srli_si128 (y, 1); + imagy = _mm_and_si128 (imagy, mult1); + realy = _mm_and_si128 (y, mult1); + + realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); + realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); + + real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + real_L_code_acc = _mm_add_epi16 
(real_L_code_acc, real_output); + imag_L_code_acc = _mm_add_epi16 (imag_L_code_acc, imag_output); + + //Get prompt values + y = _mm_load_si128((__m128i*)P_code_ptr); + + imagy = _mm_srli_si128 (y, 1); + imagy = _mm_and_si128 (imagy, mult1); + realy = _mm_and_si128 (y, mult1); + + realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); + realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); + + real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + real_P_code_acc = _mm_add_epi16 (real_P_code_acc, real_output); + imag_P_code_acc = _mm_add_epi16 (imag_P_code_acc, imag_output); + + input_ptr += 8; + carrier_ptr += 8; + E_code_ptr += 8; + L_code_ptr += 8; + P_code_ptr += 8; + } + + __VOLK_ATTR_ALIGNED(16) lv_8sc_t E_dotProductVector[8]; + __VOLK_ATTR_ALIGNED(16) lv_8sc_t L_dotProductVector[8]; + __VOLK_ATTR_ALIGNED(16) lv_8sc_t P_dotProductVector[8]; + + imag_E_code_acc = _mm_slli_si128 (imag_E_code_acc, 1); + output = _mm_blendv_epi8 (imag_E_code_acc, real_E_code_acc, mult1); + _mm_store_si128((__m128i*)E_dotProductVector, output); + + imag_L_code_acc = _mm_slli_si128 (imag_L_code_acc, 1); + output = _mm_blendv_epi8 (imag_L_code_acc, real_L_code_acc, mult1); + _mm_store_si128((__m128i*)L_dotProductVector, output); + + imag_P_code_acc = _mm_slli_si128 (imag_P_code_acc, 1); + output = _mm_blendv_epi8 (imag_P_code_acc, real_P_code_acc, mult1); + _mm_store_si128((__m128i*)P_dotProductVector, output); + + for (int i = 0; i<8; ++i) + { + *E_out_ptr += E_dotProductVector[i]; + *L_out_ptr += L_dotProductVector[i]; + *P_out_ptr += P_dotProductVector[i]; + } + } + + lv_8sc_t bb_signal_sample; + for(int i=0; i < num_points%8; ++i) + { + //Perform the carrier wipe-off + bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); + // Now get 
early, late, and prompt values for each + *E_out_ptr += bb_signal_sample * (*E_code_ptr++); + *P_out_ptr += bb_signal_sample * (*P_code_ptr++); + *L_out_ptr += bb_signal_sample * (*L_code_ptr++); + } +} + +#endif /* LV_HAVE_SSE4_1 */ + +#ifdef LV_HAVE_SSE2 +#include "emmintrin.h" +/*! + \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation + \param input The input signal input + \param carrier The carrier signal input + \param E_code Early PRN code replica input + \param P_code Early PRN code replica input + \param L_code Early PRN code replica input + \param E_out Early correlation output + \param P_out Early correlation output + \param L_out Early correlation output + \param num_points The number of complex values in vectors + */ +static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_a_sse2(lv_8sc_t* E_out, lv_8sc_t* P_out, lv_8sc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points) +{ + const unsigned int sse_iters = num_points / 8; + + __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample, real_E_code_acc, imag_E_code_acc, real_L_code_acc, imag_L_code_acc, real_P_code_acc, imag_P_code_acc; + __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output; + + const lv_8sc_t* input_ptr = input; + const lv_8sc_t* carrier_ptr = carrier; + + const lv_8sc_t* E_code_ptr = E_code; + lv_8sc_t* E_out_ptr = E_out; + const lv_8sc_t* L_code_ptr = L_code; + lv_8sc_t* L_out_ptr = L_out; + const lv_8sc_t* P_code_ptr = P_code; + lv_8sc_t* P_out_ptr = P_out; + + *E_out_ptr = 0; + *P_out_ptr = 0; + *L_out_ptr = 0; + + mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); + + real_E_code_acc = _mm_setzero_si128(); + imag_E_code_acc = _mm_setzero_si128(); + real_L_code_acc = _mm_setzero_si128(); + imag_L_code_acc = 
_mm_setzero_si128(); + real_P_code_acc = _mm_setzero_si128(); + imag_P_code_acc = _mm_setzero_si128(); + + if (sse_iters>0) + { + for(int number = 0;number < sse_iters; number++){ + + //Perform the carrier wipe-off + x = _mm_load_si128((__m128i*)input_ptr); + y = _mm_load_si128((__m128i*)carrier_ptr); + + imagx = _mm_srli_si128 (x, 1); + imagx = _mm_and_si128 (imagx, mult1); + realx = _mm_and_si128 (x, mult1); + + imagy = _mm_srli_si128 (y, 1); + imagy = _mm_and_si128 (imagy, mult1); + realy = _mm_and_si128 (y, mult1); + + realx_mult_realy = _mm_mullo_epi16 (realx, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); + realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); + + real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + //Get early values + y = _mm_load_si128((__m128i*)E_code_ptr); + + imagy = _mm_srli_si128 (y, 1); + imagy = _mm_and_si128 (imagy, mult1); + realy = _mm_and_si128 (y, mult1); + + realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); + realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); + + real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + real_E_code_acc = _mm_add_epi16 (real_E_code_acc, real_output); + imag_E_code_acc = _mm_add_epi16 (imag_E_code_acc, imag_output); + + //Get late values + y = _mm_load_si128((__m128i*)L_code_ptr); + + imagy = _mm_srli_si128 (y, 1); + imagy = _mm_and_si128 (imagy, mult1); + realy = _mm_and_si128 (y, mult1); + + realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); + realx_mult_imagy = _mm_mullo_epi16 
(real_bb_signal_sample, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); + + real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + real_L_code_acc = _mm_add_epi16 (real_L_code_acc, real_output); + imag_L_code_acc = _mm_add_epi16 (imag_L_code_acc, imag_output); + + //Get prompt values + y = _mm_load_si128((__m128i*)P_code_ptr); + + imagy = _mm_srli_si128 (y, 1); + imagy = _mm_and_si128 (imagy, mult1); + realy = _mm_and_si128 (y, mult1); + + realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); + realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); + + real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + real_P_code_acc = _mm_add_epi16 (real_P_code_acc, real_output); + imag_P_code_acc = _mm_add_epi16 (imag_P_code_acc, imag_output); + + input_ptr += 8; + carrier_ptr += 8; + E_code_ptr += 8; + L_code_ptr += 8; + P_code_ptr += 8; + } + + __VOLK_ATTR_ALIGNED(16) lv_8sc_t E_dotProductVector[8]; + __VOLK_ATTR_ALIGNED(16) lv_8sc_t L_dotProductVector[8]; + __VOLK_ATTR_ALIGNED(16) lv_8sc_t P_dotProductVector[8]; + + real_E_code_acc = _mm_and_si128 (real_E_code_acc, mult1); + imag_E_code_acc = _mm_and_si128 (imag_E_code_acc, mult1); + imag_E_code_acc = _mm_slli_si128 (imag_E_code_acc, 1); + output = _mm_or_si128 (real_E_code_acc, imag_E_code_acc); + _mm_store_si128((__m128i*)E_dotProductVector, output); + + real_L_code_acc = _mm_and_si128 (real_L_code_acc, mult1); + imag_L_code_acc = _mm_and_si128 (imag_L_code_acc, mult1); + imag_L_code_acc = _mm_slli_si128 (imag_L_code_acc, 1); + output = _mm_or_si128 (real_L_code_acc, imag_L_code_acc); + _mm_store_si128((__m128i*)L_dotProductVector, output); + + real_P_code_acc = 
_mm_and_si128 (real_P_code_acc, mult1); + imag_P_code_acc = _mm_and_si128 (imag_P_code_acc, mult1); + imag_P_code_acc = _mm_slli_si128 (imag_P_code_acc, 1); + output = _mm_or_si128 (real_P_code_acc, imag_P_code_acc); + _mm_store_si128((__m128i*)P_dotProductVector, output); + + for (int i = 0; i<8; ++i) + { + *E_out_ptr += E_dotProductVector[i]; + *L_out_ptr += L_dotProductVector[i]; + *P_out_ptr += P_dotProductVector[i]; + } + } + + lv_8sc_t bb_signal_sample; + for(int i=0; i < num_points%8; ++i) + { + //Perform the carrier wipe-off + bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); + // Now get early, late, and prompt values for each + *E_out_ptr += bb_signal_sample * (*E_code_ptr++); + *P_out_ptr += bb_signal_sample * (*P_code_ptr++); + *L_out_ptr += bb_signal_sample * (*L_code_ptr++); + } +} + +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_GENERIC +/*! + \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation + \param input The input signal input + \param carrier The carrier signal input + \param E_code Early PRN code replica input + \param P_code Early PRN code replica input + \param L_code Early PRN code replica input + \param E_out Early correlation output + \param P_out Early correlation output + \param L_out Early correlation output + \param num_points The number of complex values in vectors + */ +static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_a_generic(lv_8sc_t* E_out, lv_8sc_t* P_out, lv_8sc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points) +{ + lv_8sc_t bb_signal_sample; + + bb_signal_sample = lv_cmake(0, 0); + + *E_out = 0; + *P_out = 0; + *L_out = 0; + // perform Early, Prompt and Late correlation + for(int i=0; i < num_points; ++i) + { + //Perform the carrier wipe-off + bb_signal_sample = input[i] * carrier[i]; + // Now get early, late, and prompt values for each + *E_out += bb_signal_sample * 
E_code[i]; + *P_out += bb_signal_sample * P_code[i]; + *L_out += bb_signal_sample * L_code[i]; + } +} + +#endif /* LV_HAVE_GENERIC */ + +#ifdef LV_HAVE_ORC +/*! + \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation + \param input The input signal input + \param carrier The carrier signal input + \param E_code Early PRN code replica input + \param P_code Early PRN code replica input + \param L_code Early PRN code replica input + \param E_out Early correlation output + \param P_out Early correlation output + \param L_out Early correlation output + \param num_points The number of complex values in vectors + */ + +extern void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_first_a_orc_impl(short* E_out_real, short* E_out_imag, short* P_out_real, short* P_out_imag, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, unsigned int num_points); +extern void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_second_a_orc_impl(short* L_out_real, short* L_out_imag, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* L_code, unsigned int num_points); +static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_u_orc(lv_8sc_t* E_out, lv_8sc_t* P_out, lv_8sc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points){ + + short E_out_real = 0; + short E_out_imag = 0; + char* E_out_real_c = (char*)&E_out_real; + E_out_real_c++; + char* E_out_imag_c = (char*)&E_out_imag; + E_out_imag_c++; + + short P_out_real = 0; + short P_out_imag = 0; + char* P_out_real_c = (char*)&P_out_real; + P_out_real_c++; + char* P_out_imag_c = (char*)&P_out_imag; + P_out_imag_c++; + + short L_out_real = 0; + short L_out_imag = 0; + char* L_out_real_c = (char*)&L_out_real; + L_out_real_c++; + char* L_out_imag_c = (char*)&L_out_imag; + L_out_imag_c++; + + volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_first_a_orc_impl( &E_out_real, &E_out_imag, 
&P_out_real, &P_out_imag, input, carrier, E_code, P_code, num_points); + volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_second_a_orc_impl( &L_out_real, &L_out_imag, input, carrier, L_code, num_points); + + //ORC implementation of 8ic_x5_cw_epl_corr_8ic_x3 is done in two different functions because it seems that + //in one function the length of the code gives memory problems (bad access, segmentation fault). + //Also, the maximum number of accumulators that can be used is 4 (and we need 6). + //The "carrier wipe-off" step is done two times: one in the first function and another one in the second. + //Joining all the ORC code in one function would be quicker because the "carrier wipe-off" step would be done just + //one time. + + *E_out = lv_cmake(*E_out_real_c, *E_out_imag_c); + *P_out = lv_cmake(*P_out_real_c, *P_out_imag_c); + *L_out = lv_cmake(*L_out_real_c, *L_out_imag_c); +} +#endif /* LV_HAVE_ORC */ + +#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_a_H */ diff --git a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5.h b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5.h new file mode 100644 index 000000000..8acaa0887 --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5.h @@ -0,0 +1,797 @@ +/*! + * \file volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5.h + * \brief Volk protokernel: performs the carrier wipe-off mixing and the Very early, Early, Prompt, Late and very late correlation with 16 bits vectors, and accumulates the results into float32. In order to avoid overflow, If input, carrier and XX_code have the same number of bits, they must be values between —3 and 3 (2 bits). + * \authors
<ul> + *
<li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com + * </ul>
+ * + * Volk protokernel that performs the carrier wipe-off mixing and the + * Very early, Early, Prompt, Late and very late correlation with 16 bits vectors (8 bits the + * real part and 8 bits the imaginary part), and accumulates the result + * in 32 bits single point values, returning float32 values: + * - The carrier wipe-off is done by multiplying the input signal by the + * carrier (multiplication of 16 bits vectors) It returns the input + * signal in base band (BB) + * - Very Early values are calculated by multiplying the input signal in BB by the + * very early code (multiplication of 16 bits vectors), accumulating the results into float32 values + * - Early values are calculated by multiplying the input signal in BB by the + * early code (multiplication of 16 bits vectors), accumulating the results into float32 values + * - Prompt values are calculated by multiplying the input signal in BB by the + * prompt code (multiplication of 16 bits vectors), accumulating the results into float32 values + * - Late values are calculated by multiplying the input signal in BB by the + * late code (multiplication of 16 bits vectors), accumulating the results into float32 values + * - Very Late values are calculated by multiplying the input signal in BB by the + * very late code (multiplication of 16 bits vectors), accumulating the results into float32 values + * + * ------------------------------------------------------------------------- + * Bits analysis + * + * input = 8 bits + * carrier = 8 bits + * XX_code = 8 bits + * XX_out = 8 bits + * bb_signal_sample = 8 bits + * + * bb_signal_sample = input*carrier -> 17 bits limited to 8 bits = input and carrier must be values between —7 and 7 to avoid overflow (3 bits) + * + * XX_out16 = XX_code*bb_signal_sample -> 17 bits limited to 8 bits = XX_code and bb_signal_sample must be values between —7 and 7 to avoid overflow (3 bits) + * + * conclusion = input and carrier must be values between —1 and 1 (1 bit) and XX_code must 
be values between —7 and 7 to avoid overflow (3 bits)
+ * If input, carrier and XX_code have the same number of bits, they must be values between —3 and 3 to avoid overflow (2 bits).
+ * -------------------------------------------------------------------------
+ *
+ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+ *
+ * GNSS-SDR is a software defined Global Navigation
+ * Satellite Systems receiver
+ *
+ * This file is part of GNSS-SDR.
+ *
+ * GNSS-SDR is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * GNSS-SDR is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * -------------------------------------------------------------------------
+ */
+
+#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_u_H
+#define INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_u_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk_gnsssdr/volk_gnsssdr_complex.h>
+#include <float.h>
+#include <string.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include "smmintrin.h"
+#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
+#include "CommonMacros/CommonMacros.h"
+/*!
+ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation, accumulating in float32 (SSE4.1 path, unaligned loads)
+ \param input The input signal input
+ \param carrier The carrier signal input
+ \param VE_code Very Early PRN code replica input
+ \param E_code Early PRN code replica input
+ \param P_code Prompt PRN code replica input
+ \param L_code Late PRN code replica input
+ \param VL_code Very Late PRN code replica input
+ \param VE_out Very Early correlation output (single value, overwritten)
+ \param E_out Early correlation output (single value, overwritten)
+ \param P_out Prompt correlation output (single value, overwritten)
+ \param L_out Late correlation output (single value, overwritten)
+ \param VL_out Very Late correlation output (single value, overwritten)
+ \param num_points The number of complex values in vectors (blocks of 8 go through the SIMD loop, the num_points % 8 remainder through the scalar loop)
+ */
+static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_u_sse4_1(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
+{
+    const unsigned int sse_iters = num_points / 8;
+
+    __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample;
+    __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output;
+
+    __m128 VE_code_acc, E_code_acc, P_code_acc, L_code_acc, VL_code_acc;
+    __m128i input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2;
+    __m128 output_ps;
+
+    const lv_8sc_t* input_ptr = input;
+    const lv_8sc_t* carrier_ptr = carrier;
+
+    const lv_8sc_t* VE_code_ptr = VE_code;
+    lv_32fc_t* VE_out_ptr = VE_out;
+    const lv_8sc_t* E_code_ptr = E_code;
+    lv_32fc_t* E_out_ptr = E_out;
+    const lv_8sc_t* P_code_ptr = P_code;
+    lv_32fc_t* P_out_ptr = P_out;
+    const lv_8sc_t* L_code_ptr = L_code;
+    lv_32fc_t* L_out_ptr = L_out;
+    const lv_8sc_t* VL_code_ptr = VL_code;
+    lv_32fc_t* VL_out_ptr = VL_out;
+
+    *VE_out_ptr = 0;
+    *E_out_ptr = 0;
+    *P_out_ptr = 0;
+    *L_out_ptr = 0;
+    *VL_out_ptr = 0;
+
+    mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
+
+    VE_code_acc = _mm_setzero_ps();
+    E_code_acc = _mm_setzero_ps();
+    P_code_acc = _mm_setzero_ps();
+    L_code_acc = _mm_setzero_ps();
+    VL_code_acc = _mm_setzero_ps();
+
+    if (sse_iters>0)
+    {
+        for(unsigned int number = 0;number < sse_iters; number++){
+
+            //Perform the carrier wipe-off (8 complex samples per iteration, unaligned loads)
+            x = _mm_lddqu_si128((__m128i*)input_ptr);
+            y = _mm_lddqu_si128((__m128i*)carrier_ptr);
+
+            CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx)
+            CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)
+
+            CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
+
+            //Get very early values
+            y = _mm_lddqu_si128((__m128i*)VE_code_ptr);
+
+            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
+
+            VE_code_acc = _mm_add_ps (VE_code_acc, output_ps);
+
+            //Get early values
+            y = _mm_lddqu_si128((__m128i*)E_code_ptr);
+
+            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
+
+            E_code_acc = _mm_add_ps (E_code_acc, output_ps);
+
+            //Get prompt values
+            y = _mm_lddqu_si128((__m128i*)P_code_ptr);
+
+            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
+
+            P_code_acc = _mm_add_ps (P_code_acc, output_ps);
+
+            //Get late values
+            y = _mm_lddqu_si128((__m128i*)L_code_ptr);
+
+            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
+
+            L_code_acc = _mm_add_ps (L_code_acc, output_ps);
+
+            //Get very late values
+            y = _mm_lddqu_si128((__m128i*)VL_code_ptr);
+
+            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
+
+            VL_code_acc = _mm_add_ps (VL_code_acc, output_ps);
+
+            input_ptr += 8;
+            carrier_ptr += 8;
+            VE_code_ptr += 8;
+            E_code_ptr += 8;
+            P_code_ptr += 8;
+            L_code_ptr += 8;
+            VL_code_ptr += 8;
+        }
+
+        __VOLK_ATTR_ALIGNED(16) lv_32fc_t VE_dotProductVector[2];
+        __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2];
+        __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2];
+        __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2];
+        __VOLK_ATTR_ALIGNED(16) lv_32fc_t VL_dotProductVector[2];
+
+        _mm_storeu_ps((float*)VE_dotProductVector,VE_code_acc); // Store the results back into the dot product vector
+        _mm_storeu_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector
+        _mm_storeu_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector
+        _mm_storeu_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector
+        _mm_storeu_ps((float*)VL_dotProductVector,VL_code_acc); // Store the results back into the dot product vector
+
+        for (int i = 0; i<2; ++i)
+        {
+            *VE_out_ptr += VE_dotProductVector[i];
+            *E_out_ptr += E_dotProductVector[i];
+            *P_out_ptr += P_dotProductVector[i];
+            *L_out_ptr += L_dotProductVector[i];
+            *VL_out_ptr += VL_dotProductVector[i];
+        }
+    }
+
+    lv_8sc_t bb_signal_sample;
+    for(unsigned int i=0; i < num_points%8; ++i)
+    {
+        //Perform the carrier wipe-off
+        bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
+        // Now get very early, early, prompt, late and very late values for each
+        *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VE_code_ptr++));
+        *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
+        *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
+        *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
+        *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VL_code_ptr++));
+    }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#ifdef LV_HAVE_SSE2
+#include "emmintrin.h"
+#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
+#include "CommonMacros/CommonMacros.h"
+/*!
+ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation, accumulating in float32 (SSE2 path, unaligned loads)
+ \param input The input signal input
+ \param carrier The carrier signal input
+ \param VE_code Very Early PRN code replica input
+ \param E_code Early PRN code replica input
+ \param P_code Prompt PRN code replica input
+ \param L_code Late PRN code replica input
+ \param VL_code Very Late PRN code replica input
+ \param VE_out Very Early correlation output (single value, overwritten)
+ \param E_out Early correlation output (single value, overwritten)
+ \param P_out Prompt correlation output (single value, overwritten)
+ \param L_out Late correlation output (single value, overwritten)
+ \param VL_out Very Late correlation output (single value, overwritten)
+ \param num_points The number of complex values in vectors (blocks of 8 go through the SIMD loop, the num_points % 8 remainder through the scalar loop)
+ */
+static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_u_sse2(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
+{
+    const unsigned int sse_iters = num_points / 8;
+
+    __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample;
+    __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output;
+
+    __m128 VE_code_acc, E_code_acc, P_code_acc, L_code_acc, VL_code_acc;
+    __m128i input_i_1, input_i_2, output_i32;
+    __m128 output_ps_1, output_ps_2;
+
+    const lv_8sc_t* input_ptr = input;
+    const lv_8sc_t* carrier_ptr = carrier;
+
+    const lv_8sc_t* VE_code_ptr = VE_code;
+    lv_32fc_t* VE_out_ptr = VE_out;
+    const lv_8sc_t* E_code_ptr = E_code;
+    lv_32fc_t* E_out_ptr = E_out;
+    const lv_8sc_t* P_code_ptr = P_code;
+    lv_32fc_t* P_out_ptr = P_out;
+    const lv_8sc_t* L_code_ptr = L_code;
+    lv_32fc_t* L_out_ptr = L_out;
+    const lv_8sc_t* VL_code_ptr = VL_code;
+    lv_32fc_t* VL_out_ptr = VL_out;
+
+    *VE_out_ptr = 0;
+    *E_out_ptr = 0;
+    *P_out_ptr = 0;
+    *L_out_ptr = 0;
+    *VL_out_ptr = 0;
+
+    mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
+
+    VE_code_acc = _mm_setzero_ps();
+    E_code_acc = _mm_setzero_ps();
+    P_code_acc = _mm_setzero_ps();
+    L_code_acc = _mm_setzero_ps();
+    VL_code_acc = _mm_setzero_ps();
+
+    if (sse_iters>0)
+    {
+        for(unsigned int number = 0;number < sse_iters; number++){
+
+            //Perform the carrier wipe-off (unaligned loads via the SSE2 intrinsic _mm_loadu_si128; _mm_lddqu_si128 needs SSE3)
+            x = _mm_loadu_si128((__m128i*)input_ptr);
+            y = _mm_loadu_si128((__m128i*)carrier_ptr);
+
+            CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx)
+            CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)
+
+            CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
+
+            //Get very early values
+            y = _mm_loadu_si128((__m128i*)VE_code_ptr);
+
+            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
+
+            VE_code_acc = _mm_add_ps (VE_code_acc, output_ps_1);
+            VE_code_acc = _mm_add_ps (VE_code_acc, output_ps_2);
+
+            //Get early values
+            y = _mm_loadu_si128((__m128i*)E_code_ptr);
+
+            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
+
+            E_code_acc = _mm_add_ps (E_code_acc, output_ps_1);
+            E_code_acc = _mm_add_ps (E_code_acc, output_ps_2);
+
+            //Get prompt values
+            y = _mm_loadu_si128((__m128i*)P_code_ptr);
+
+            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
+
+            P_code_acc = _mm_add_ps (P_code_acc, output_ps_1);
+            P_code_acc = _mm_add_ps (P_code_acc, output_ps_2);
+
+            //Get late values
+            y = _mm_loadu_si128((__m128i*)L_code_ptr);
+
+            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
+
+            L_code_acc = _mm_add_ps (L_code_acc, output_ps_1);
+            L_code_acc = _mm_add_ps (L_code_acc, output_ps_2);
+
+            //Get very late values
+            y = _mm_loadu_si128((__m128i*)VL_code_ptr);
+
+            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
+
+            VL_code_acc = _mm_add_ps (VL_code_acc, output_ps_1);
+            VL_code_acc = _mm_add_ps (VL_code_acc, output_ps_2);
+
+            input_ptr += 8;
+            carrier_ptr += 8;
+            VE_code_ptr += 8;
+            E_code_ptr += 8;
+            P_code_ptr += 8;
+            L_code_ptr += 8;
+            VL_code_ptr += 8;
+        }
+
+        __VOLK_ATTR_ALIGNED(16) lv_32fc_t VE_dotProductVector[2];
+        __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2];
+        __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2];
+        __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2];
+        __VOLK_ATTR_ALIGNED(16) lv_32fc_t VL_dotProductVector[2];
+
+        _mm_storeu_ps((float*)VE_dotProductVector,VE_code_acc); // Store the results back into the dot product vector
+        _mm_storeu_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector
+        _mm_storeu_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector
+        _mm_storeu_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector
+        _mm_storeu_ps((float*)VL_dotProductVector,VL_code_acc); // Store the results back into the dot product vector
+
+        for (int i = 0; i<2; ++i)
+        {
+            *VE_out_ptr += VE_dotProductVector[i];
+            *E_out_ptr += E_dotProductVector[i];
+            *P_out_ptr += P_dotProductVector[i];
+            *L_out_ptr += L_dotProductVector[i];
+            *VL_out_ptr += VL_dotProductVector[i];
+        }
+    }
+
+    lv_8sc_t bb_signal_sample;
+    for(unsigned int i=0; i < num_points%8; ++i)
+    {
+        //Perform the carrier wipe-off
+        bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
+        // Now get very early, early, prompt, late and very late values for each
+        *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VE_code_ptr++));
+        *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
+        *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
+        *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
+        *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VL_code_ptr++));
+    }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation, accumulating in float32 (portable scalar path)
+ \param input The input signal input
+ \param carrier The carrier signal input
+ \param VE_code Very Early PRN code replica input
+ \param E_code Early PRN code replica input
+ \param P_code Prompt PRN code replica input
+ \param L_code Late PRN code replica input
+ \param VL_code Very Late PRN code replica input
+ \param VE_out Very Early correlation output (single value, overwritten)
+ \param E_out Early correlation output (single value, overwritten)
+ \param P_out Prompt correlation output (single value, overwritten)
+ \param L_out Late correlation output (single value, overwritten)
+ \param VL_out Very Late correlation output (single value, overwritten)
+ \param num_points The number of complex values in vectors
+ */
+static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
+{
+    lv_8sc_t bb_signal_sample;
+
+    bb_signal_sample = lv_cmake(0, 0);
+
+    *VE_out = 0;
+    *E_out = 0;
+    *P_out = 0;
+    *L_out = 0;
+    *VL_out = 0;
+    // perform very early, Early, Prompt, Late and very late correlation
+    for(unsigned int i=0; i < num_points; ++i)
+    {
+        //Perform the carrier wipe-off
+        bb_signal_sample = input[i] * carrier[i];
+
+        *VE_out += (lv_32fc_t) (bb_signal_sample * VE_code[i]);
+        *E_out += (lv_32fc_t) (bb_signal_sample * E_code[i]);
+        *P_out += (lv_32fc_t) (bb_signal_sample * P_code[i]);
+        *L_out += (lv_32fc_t) (bb_signal_sample * L_code[i]);
+        *VL_out += (lv_32fc_t) (bb_signal_sample * VL_code[i]);
+    }
+}
+
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_u_H */
+
+
+#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_a_H
+#define INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <volk_gnsssdr/volk_gnsssdr_complex.h>
+#include <float.h>
+#include <string.h>
+
+#ifdef LV_HAVE_SSE4_1
+#include "smmintrin.h"
+#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
+#include "CommonMacros/CommonMacros.h"
+/*!
+ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation (SSE4.1 build; all seven input vectors are read with _mm_load_si128, which requires 16-byte aligned addresses)
+ \param input The input signal input
+ \param carrier The carrier signal input
+ \param VE_code Very Early PRN code replica input
+ \param E_code Early PRN code replica input
+ \param P_code Prompt PRN code replica input
+ \param L_code Late PRN code replica input
+ \param VL_code Very Late PRN code replica input
+ \param VE_out Very Early correlation output (single value; reset to 0 and then accumulated)
+ \param E_out Early correlation output (single value; reset to 0 and then accumulated)
+ \param P_out Prompt correlation output (single value; reset to 0 and then accumulated)
+ \param L_out Late correlation output (single value; reset to 0 and then accumulated)
+ \param VL_out Very Late correlation output (single value; reset to 0 and then accumulated)
+ \param num_points The number of complex values in vectors (blocks of 8 go through the vector loop, the remaining num_points % 8 through the scalar loop at the end)
+ */
+static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_a_sse4_1(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
+{
+    const unsigned int sse_iters = num_points / 8; // every vector iteration advances all input pointers by 8 samples
+
+    __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample;
+    __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output;
+
+    __m128 VE_code_acc, E_code_acc, P_code_acc, L_code_acc, VL_code_acc;
+    __m128i input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2;
+    __m128 output_ps;
+
+    const lv_8sc_t* input_ptr = input;
+    const lv_8sc_t* carrier_ptr = carrier;
+
+    const lv_8sc_t* VE_code_ptr = VE_code;
+    lv_32fc_t* VE_out_ptr = VE_out;
+    const lv_8sc_t* E_code_ptr = E_code;
+    lv_32fc_t* E_out_ptr = E_out;
+    const lv_8sc_t* P_code_ptr = P_code;
+    lv_32fc_t* P_out_ptr = P_out;
+    const lv_8sc_t* L_code_ptr = L_code;
+    lv_32fc_t* L_out_ptr = L_out;
+    const lv_8sc_t* VL_code_ptr = VL_code;
+    lv_32fc_t* VL_out_ptr = VL_out;
+
+    *VE_out_ptr = 0;
+    *E_out_ptr = 0;
+    *P_out_ptr = 0;
+    *L_out_ptr = 0;
+    *VL_out_ptr = 0;
+
+    mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); // 0xFF in the low byte of every 16-bit lane; handed to the CommonMacros helpers as their mask argument
+
+    VE_code_acc = _mm_setzero_ps();
+    E_code_acc = _mm_setzero_ps();
+    P_code_acc = _mm_setzero_ps();
+    L_code_acc = _mm_setzero_ps();
+    VL_code_acc = _mm_setzero_ps();
+
+    if (sse_iters>0)
+    {
+        for(int number = 0;number < sse_iters; number++){ // NOTE(review): signed index compared against the unsigned sse_iters
+
+            //Perform the carrier wipe-off: aligned 16-byte loads of the next input and carrier samples
+            x = _mm_load_si128((__m128i*)input_ptr);
+            y = _mm_load_si128((__m128i*)carrier_ptr);
+
+            CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx) // macro from the CommonMacros headers (not visible here); presumably splits x into real/imag 16-bit lanes - confirm there
+            CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)
+
+            CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
+
+            //Get very early values: the macro leaves its float result in output_ps, which is added to the accumulator
+            y = _mm_load_si128((__m128i*)VE_code_ptr);
+
+            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
+
+            VE_code_acc = _mm_add_ps (VE_code_acc, output_ps);
+
+            //Get early values (same steps with the Early code)
+            y = _mm_load_si128((__m128i*)E_code_ptr);
+
+            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
+
+            E_code_acc = _mm_add_ps (E_code_acc, output_ps);
+
+            //Get prompt values (same steps with the Prompt code)
+            y = _mm_load_si128((__m128i*)P_code_ptr);
+
+            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
+
+            P_code_acc = _mm_add_ps (P_code_acc, output_ps);
+
+            //Get late values (same steps with the Late code)
+            y = _mm_load_si128((__m128i*)L_code_ptr);
+
+            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
+
+            L_code_acc = _mm_add_ps (L_code_acc, output_ps);
+
+            //Get very late values (same steps with the Very Late code)
+            y = _mm_load_si128((__m128i*)VL_code_ptr);
+
+            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
+
+            VL_code_acc = _mm_add_ps (VL_code_acc, output_ps);
+
+            input_ptr += 8;
+            carrier_ptr += 8;
+            VE_code_ptr += 8;
+            E_code_ptr += 8;
+            P_code_ptr += 8;
+            L_code_ptr += 8;
+            VL_code_ptr += 8;
+        }
+
+        __VOLK_ATTR_ALIGNED(16) lv_32fc_t VE_dotProductVector[2];
+        __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2];
+        __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2];
+        __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2];
+        __VOLK_ATTR_ALIGNED(16) lv_32fc_t VL_dotProductVector[2];
+
+        _mm_store_ps((float*)VE_dotProductVector,VE_code_acc); // Store the results back into the dot product vector
+        _mm_store_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector
+        _mm_store_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector
+        _mm_store_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector
+        _mm_store_ps((float*)VL_dotProductVector,VL_code_acc); // Store the results back into the dot product vector
+
+        for (int i = 0; i<2; ++i) // each accumulator was stored as two complex partial sums; fold both into the output
+        {
+            *VE_out_ptr += VE_dotProductVector[i];
+            *E_out_ptr += E_dotProductVector[i];
+            *P_out_ptr += P_dotProductVector[i];
+            *L_out_ptr += L_dotProductVector[i];
+            *VL_out_ptr += VL_dotProductVector[i];
+        }
+    }
+
+    lv_8sc_t bb_signal_sample;
+    for(int i=0; i < num_points%8; ++i) // scalar tail: the samples left over after the vector loop
+    {
+        //Perform the carrier wipe-off
+        bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
+        // Now get very early, early, prompt, late and very late values for each
+        *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VE_code_ptr++));
+        *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
+        *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
+        *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
+        *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VL_code_ptr++));
+    }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#ifdef LV_HAVE_SSE2
+#include "emmintrin.h"
+#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
+#include "CommonMacros/CommonMacros.h"
+/*!
+ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation (SSE2 build; all seven input vectors are read with _mm_load_si128, which requires 16-byte aligned addresses)
+ \param input The input signal input
+ \param carrier The carrier signal input
+ \param VE_code Very Early PRN code replica input
+ \param E_code Early PRN code replica input
+ \param P_code Prompt PRN code replica input
+ \param L_code Late PRN code replica input
+ \param VL_code Very Late PRN code replica input
+ \param VE_out Very Early correlation output (single value; reset to 0 and then accumulated)
+ \param E_out Early correlation output (single value; reset to 0 and then accumulated)
+ \param P_out Prompt correlation output (single value; reset to 0 and then accumulated)
+ \param L_out Late correlation output (single value; reset to 0 and then accumulated)
+ \param VL_out Very Late correlation output (single value; reset to 0 and then accumulated)
+ \param num_points The number of complex values in vectors (blocks of 8 go through the vector loop, the remaining num_points % 8 through the scalar loop at the end)
+ */
+static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_a_sse2(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
+{
+    const unsigned int sse_iters = num_points / 8; // every vector iteration advances all input pointers by 8 samples
+
+    __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample;
+    __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output;
+
+    __m128 VE_code_acc, E_code_acc, P_code_acc, L_code_acc, VL_code_acc;
+    __m128i input_i_1, input_i_2, output_i32;
+    __m128 output_ps_1, output_ps_2;
+
+    const lv_8sc_t* input_ptr = input;
+    const lv_8sc_t* carrier_ptr = carrier;
+
+    const lv_8sc_t* VE_code_ptr = VE_code;
+    lv_32fc_t* VE_out_ptr = VE_out;
+    const lv_8sc_t* E_code_ptr = E_code;
+    lv_32fc_t* E_out_ptr = E_out;
+    const lv_8sc_t* P_code_ptr = P_code;
+    lv_32fc_t* P_out_ptr = P_out;
+    const lv_8sc_t* L_code_ptr = L_code;
+    lv_32fc_t* L_out_ptr = L_out;
+    const lv_8sc_t* VL_code_ptr = VL_code;
+    lv_32fc_t* VL_out_ptr = VL_out;
+
+    *VE_out_ptr = 0;
+    *E_out_ptr = 0;
+    *P_out_ptr = 0;
+    *L_out_ptr = 0;
+    *VL_out_ptr = 0;
+
+    mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); // 0xFF in the low byte of every 16-bit lane; handed to the CommonMacros helpers as their mask argument
+
+    VE_code_acc = _mm_setzero_ps();
+    E_code_acc = _mm_setzero_ps();
+    P_code_acc = _mm_setzero_ps();
+    L_code_acc = _mm_setzero_ps();
+    VL_code_acc = _mm_setzero_ps();
+
+    if (sse_iters>0)
+    {
+        for(int number = 0;number < sse_iters; number++){ // NOTE(review): signed index compared against the unsigned sse_iters
+
+            //Perform the carrier wipe-off: aligned 16-byte loads of the next input and carrier samples
+            x = _mm_load_si128((__m128i*)input_ptr);
+            y = _mm_load_si128((__m128i*)carrier_ptr);
+
+            CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx) // macro from the CommonMacros headers (not visible here); presumably splits x into real/imag 16-bit lanes - confirm there
+            CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)
+
+            CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
+
+            //Get very early values: the macro leaves two float results (output_ps_1, output_ps_2), both added to the accumulator
+            y = _mm_load_si128((__m128i*)VE_code_ptr);
+
+            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
+
+            VE_code_acc = _mm_add_ps (VE_code_acc, output_ps_1);
+            VE_code_acc = _mm_add_ps (VE_code_acc, output_ps_2);
+
+            //Get early values (same steps with the Early code)
+            y = _mm_load_si128((__m128i*)E_code_ptr);
+
+            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
+
+            E_code_acc = _mm_add_ps (E_code_acc, output_ps_1);
+            E_code_acc = _mm_add_ps (E_code_acc, output_ps_2);
+
+            //Get prompt values (same steps with the Prompt code)
+            y = _mm_load_si128((__m128i*)P_code_ptr);
+
+            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
+
+            P_code_acc = _mm_add_ps (P_code_acc, output_ps_1);
+            P_code_acc = _mm_add_ps (P_code_acc, output_ps_2);
+
+            //Get late values (same steps with the Late code)
+            y = _mm_load_si128((__m128i*)L_code_ptr);
+
+            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
+
+            L_code_acc = _mm_add_ps (L_code_acc, output_ps_1);
+            L_code_acc = _mm_add_ps (L_code_acc, output_ps_2);
+
+            //Get very late values (same steps with the Very Late code)
+            y = _mm_load_si128((__m128i*)VL_code_ptr);
+
+            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
+
+            VL_code_acc = _mm_add_ps (VL_code_acc, output_ps_1);
+            VL_code_acc = _mm_add_ps (VL_code_acc, output_ps_2);
+
+            input_ptr += 8;
+            carrier_ptr += 8;
+            VE_code_ptr += 8;
+            E_code_ptr += 8;
+            P_code_ptr += 8;
+            L_code_ptr += 8;
+            VL_code_ptr += 8;
+        }
+
+        __VOLK_ATTR_ALIGNED(16) lv_32fc_t VE_dotProductVector[2];
+        __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2];
+        __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2];
+        __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2];
+        __VOLK_ATTR_ALIGNED(16) lv_32fc_t VL_dotProductVector[2];
+
+        _mm_store_ps((float*)VE_dotProductVector,VE_code_acc); // Store the results back into the dot product vector
+        _mm_store_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector
+        _mm_store_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector
+        _mm_store_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector
+        _mm_store_ps((float*)VL_dotProductVector,VL_code_acc); // Store the results back into the dot product vector
+
+        for (int i = 0; i<2; ++i) // each accumulator was stored as two complex partial sums; fold both into the output
+        {
+            *VE_out_ptr += VE_dotProductVector[i];
+            *E_out_ptr += E_dotProductVector[i];
+            *P_out_ptr += P_dotProductVector[i];
+            *L_out_ptr += L_dotProductVector[i];
+            *VL_out_ptr += VL_dotProductVector[i];
+        }
+    }
+
+    lv_8sc_t bb_signal_sample;
+    for(int i=0; i < num_points%8; ++i) // scalar tail: the samples left over after the vector loop
+    {
+        //Perform the carrier wipe-off
+        bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
+        // Now get very early, early, prompt, late and very late values for each
+        *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VE_code_ptr++));
+        *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
+        *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
+        *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
+        *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VL_code_ptr++));
+    }
+}
+#endif /* LV_HAVE_SSE2 */
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation (plain C version, no SIMD)
+ \param input The input signal input
+ \param carrier The carrier signal input
+ \param VE_code Very Early PRN code replica input
+ \param E_code Early PRN code replica input
+ \param P_code Prompt PRN code replica input
+ \param L_code Late PRN code replica input
+ \param VL_code Very Late PRN code replica input
+ \param VE_out Very Early correlation output (single value; reset to 0 and then accumulated)
+ \param E_out Early correlation output (single value; reset to 0 and then accumulated)
+ \param P_out Prompt correlation output (single value; reset to 0 and then accumulated)
+ \param L_out Late correlation output (single value; reset to 0 and then accumulated)
+ \param VL_out Very Late correlation output (single value; reset to 0 and then accumulated)
+ \param num_points The number of complex values in vectors (0 leaves all five outputs at 0)
+ */
+static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_a_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
+{
+    lv_8sc_t bb_signal_sample;
+
+    bb_signal_sample = lv_cmake(0, 0);
+
+    *VE_out = 0;
+    *E_out = 0;
+    *P_out = 0;
+    *L_out = 0;
+    *VL_out = 0;
+    // perform very early, Early, Prompt, Late and very late correlation
+    for(unsigned int i=0; i < num_points; ++i) // unsigned index: same type as num_points, so no signed/unsigned comparison and no signed overflow
+    {
+        //Perform the carrier wipe-off (NOTE(review): the product is stored back into an lv_8sc_t; confirm the inputs are scaled so it fits)
+        bb_signal_sample = input[i] * carrier[i];
+
+        *VE_out += (lv_32fc_t) (bb_signal_sample * VE_code[i]);
+        *E_out += (lv_32fc_t) (bb_signal_sample * E_code[i]);
+        *P_out += (lv_32fc_t) (bb_signal_sample * P_code[i]);
+        *L_out += (lv_32fc_t) (bb_signal_sample * L_code[i]);
+        *VL_out += (lv_32fc_t) (bb_signal_sample * VL_code[i]);
+    }
+}
+
+#endif /* LV_HAVE_GENERIC */
+
+#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_a_H */
\ No newline at end of file
diff --git a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5.h
b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5.h new file mode 100644 index 000000000..d5289165a --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5.h @@ -0,0 +1,1520 @@ +/*! + * \file volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5.h + * \brief Volk protokernel: performs the carrier wipe-off mixing and the Very early, Early, Prompt, Late and very late correlation with 16 bits vectors using different methods: inside u_sse4_1_first there is one method, inside u_sse4_1_second there is another... This protokernel has been created to test the performance of different methods. + * \authors
    + *
  • Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com + *
+ * + * Volk protokernel that performs the carrier wipe-off mixing and the + * Very Early, Early, Prompt, Late and Very Late correlation with 16-bit vectors (8 bits for the + * real part and 8 bits for the imaginary part), and accumulates the result + * in 32-bit single-precision floating-point values, returning float32 values: + * - The carrier wipe-off is done by multiplying the input signal by the + * carrier (multiplication of 16-bit vectors). It returns the input + * signal in baseband (BB) + * - Very Early values are calculated by multiplying the input signal in BB by the + * very early code (multiplication of 16-bit vectors), accumulating the results into float32 values + * - Early values are calculated by multiplying the input signal in BB by the + * early code (multiplication of 16-bit vectors), accumulating the results into float32 values + * - Prompt values are calculated by multiplying the input signal in BB by the + * prompt code (multiplication of 16-bit vectors), accumulating the results into float32 values + * - Late values are calculated by multiplying the input signal in BB by the + * late code (multiplication of 16-bit vectors), accumulating the results into float32 values + * - Very Late values are calculated by multiplying the input signal in BB by the + * very late code (multiplication of 16-bit vectors), accumulating the results into float32 values + * + * ------------------------------------------------------------------------- + * + * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) + * + * GNSS-SDR is a software defined Global Navigation + * Satellite Systems receiver + * + * This file is part of GNSS-SDR. + * + * GNSS-SDR is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * GNSS-SDR is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNSS-SDR. If not, see . + * + * ------------------------------------------------------------------------- + */ + +#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_u_H +#define INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_u_H + +#include +#include +#include +#include +#include + +#ifdef LV_HAVE_SSE4_1 +#include "smmintrin.h" +#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h" +#include "CommonMacros/CommonMacros.h" +/*! + \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation + \param input The input signal input + \param carrier The carrier signal input + \param VE_code Very Early PRN code replica input + \param E_code Early PRN code replica input + \param P_code Prompt PRN code replica input + \param L_code Late PRN code replica input + \param VL_code Very Late PRN code replica input + \param VE_out Very Early correlation output + \param E_out Early correlation output + \param P_out Prompt correlation output + \param L_out Late correlation output + \param VL_out Very Late correlation output + \param num_points The number of complex values in vectors + */ +static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_u_sse4_1_first(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points) +{ + const unsigned int sse_iters = num_points / 8; + + __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample; + __m128i mult1, realx, 
imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output; + + __m128 VE_code_acc, E_code_acc, P_code_acc, L_code_acc, VL_code_acc; + __m128i input_i_1, input_i_2, output_i32; + __m128 output_ps_1, output_ps_2; + + const lv_8sc_t* input_ptr = input; + const lv_8sc_t* carrier_ptr = carrier; + + const lv_8sc_t* VE_code_ptr = VE_code; + lv_32fc_t* VE_out_ptr = VE_out; + const lv_8sc_t* E_code_ptr = E_code; + lv_32fc_t* E_out_ptr = E_out; + const lv_8sc_t* P_code_ptr = P_code; + lv_32fc_t* P_out_ptr = P_out; + const lv_8sc_t* L_code_ptr = L_code; + lv_32fc_t* L_out_ptr = L_out; + const lv_8sc_t* VL_code_ptr = VL_code; + lv_32fc_t* VL_out_ptr = VL_out; + + *VE_out_ptr = 0; + *E_out_ptr = 0; + *P_out_ptr = 0; + *L_out_ptr = 0; + *VL_out_ptr = 0; + + mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); + + VE_code_acc = _mm_setzero_ps(); + E_code_acc = _mm_setzero_ps(); + P_code_acc = _mm_setzero_ps(); + L_code_acc = _mm_setzero_ps(); + VL_code_acc = _mm_setzero_ps(); + + if (sse_iters>0) + { + for(int number = 0;number < sse_iters; number++){ + + //Perform the carrier wipe-off + x = _mm_lddqu_si128((__m128i*)input_ptr); + y = _mm_lddqu_si128((__m128i*)carrier_ptr); + + imagx = _mm_srli_si128 (x, 1); + imagx = _mm_and_si128 (imagx, mult1); + realx = _mm_and_si128 (x, mult1); + + imagy = _mm_srli_si128 (y, 1); + imagy = _mm_and_si128 (imagy, mult1); + realy = _mm_and_si128 (y, mult1); + + realx_mult_realy = _mm_mullo_epi16 (realx, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); + realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); + + real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + //Get very early values + y = _mm_lddqu_si128((__m128i*)VE_code_ptr); + + imagy = _mm_srli_si128 (y, 1); + imagy = 
_mm_and_si128 (imagy, mult1); + realy = _mm_and_si128 (y, mult1); + + realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); + realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); + + real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + imag_output = _mm_slli_si128 (imag_output, 1); + output = _mm_blendv_epi8 (imag_output, real_output, mult1); + + input_i_1 = _mm_cvtepi8_epi32(output); + output = _mm_srli_si128 (output, 4); + input_i_2 = _mm_cvtepi8_epi32(output); + output = _mm_srli_si128 (output, 4); + output_i32 = _mm_add_epi32 (input_i_1, input_i_2); + output_ps_1 = _mm_cvtepi32_ps(output_i32); + + input_i_1 = _mm_cvtepi8_epi32(output); + output = _mm_srli_si128 (output, 4); + input_i_2 = _mm_cvtepi8_epi32(output); + output = _mm_srli_si128 (output, 4); + output_i32 = _mm_add_epi32 (input_i_1, input_i_2); + output_ps_2 = _mm_cvtepi32_ps(output_i32); + + VE_code_acc = _mm_add_ps (VE_code_acc, output_ps_1); + VE_code_acc = _mm_add_ps (VE_code_acc, output_ps_2); + + //Get early values + y = _mm_lddqu_si128((__m128i*)E_code_ptr); + + imagy = _mm_srli_si128 (y, 1); + imagy = _mm_and_si128 (imagy, mult1); + realy = _mm_and_si128 (y, mult1); + + realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); + realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); + + real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + imag_output = _mm_slli_si128 (imag_output, 1); + output = _mm_blendv_epi8 (imag_output, real_output, mult1); + + input_i_1 = _mm_cvtepi8_epi32(output); + output = 
_mm_srli_si128 (output, 4); + input_i_2 = _mm_cvtepi8_epi32(output); + output = _mm_srli_si128 (output, 4); + output_i32 = _mm_add_epi32 (input_i_1, input_i_2); + output_ps_1 = _mm_cvtepi32_ps(output_i32); + + input_i_1 = _mm_cvtepi8_epi32(output); + output = _mm_srli_si128 (output, 4); + input_i_2 = _mm_cvtepi8_epi32(output); + output = _mm_srli_si128 (output, 4); + output_i32 = _mm_add_epi32 (input_i_1, input_i_2); + output_ps_2 = _mm_cvtepi32_ps(output_i32); + + E_code_acc = _mm_add_ps (E_code_acc, output_ps_1); + E_code_acc = _mm_add_ps (E_code_acc, output_ps_2); + + //Get prompt values + y = _mm_lddqu_si128((__m128i*)P_code_ptr); + + imagy = _mm_srli_si128 (y, 1); + imagy = _mm_and_si128 (imagy, mult1); + realy = _mm_and_si128 (y, mult1); + + realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); + realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); + + real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + imag_output = _mm_slli_si128 (imag_output, 1); + output = _mm_blendv_epi8 (imag_output, real_output, mult1); + + input_i_1 = _mm_cvtepi8_epi32(output); + output = _mm_srli_si128 (output, 4); + input_i_2 = _mm_cvtepi8_epi32(output); + output = _mm_srli_si128 (output, 4); + output_i32 = _mm_add_epi32 (input_i_1, input_i_2); + output_ps_1 = _mm_cvtepi32_ps(output_i32); + + input_i_1 = _mm_cvtepi8_epi32(output); + output = _mm_srli_si128 (output, 4); + input_i_2 = _mm_cvtepi8_epi32(output); + output = _mm_srli_si128 (output, 4); + output_i32 = _mm_add_epi32 (input_i_1, input_i_2); + output_ps_2 = _mm_cvtepi32_ps(output_i32); + + P_code_acc = _mm_add_ps (P_code_acc, output_ps_1); + P_code_acc = _mm_add_ps (P_code_acc, output_ps_2); + + //Get late values + y = _mm_lddqu_si128((__m128i*)L_code_ptr); + + imagy = 
_mm_srli_si128 (y, 1); + imagy = _mm_and_si128 (imagy, mult1); + realy = _mm_and_si128 (y, mult1); + + realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); + realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); + + real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + imag_output = _mm_slli_si128 (imag_output, 1); + output = _mm_blendv_epi8 (imag_output, real_output, mult1); + + input_i_1 = _mm_cvtepi8_epi32(output); + output = _mm_srli_si128 (output, 4); + input_i_2 = _mm_cvtepi8_epi32(output); + output = _mm_srli_si128 (output, 4); + output_i32 = _mm_add_epi32 (input_i_1, input_i_2); + output_ps_1 = _mm_cvtepi32_ps(output_i32); + + input_i_1 = _mm_cvtepi8_epi32(output); + output = _mm_srli_si128 (output, 4); + input_i_2 = _mm_cvtepi8_epi32(output); + output = _mm_srli_si128 (output, 4); + output_i32 = _mm_add_epi32 (input_i_1, input_i_2); + output_ps_2 = _mm_cvtepi32_ps(output_i32); + + L_code_acc = _mm_add_ps (L_code_acc, output_ps_1); + L_code_acc = _mm_add_ps (L_code_acc, output_ps_2); + + //Get very late values + y = _mm_lddqu_si128((__m128i*)VL_code_ptr); + + imagy = _mm_srli_si128 (y, 1); + imagy = _mm_and_si128 (imagy, mult1); + realy = _mm_and_si128 (y, mult1); + + realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); + imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); + realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); + imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); + + real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); + imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); + + imag_output = _mm_slli_si128 (imag_output, 1); + output = _mm_blendv_epi8 (imag_output, real_output, mult1); + + input_i_1 = 
_mm_cvtepi8_epi32(output); + output = _mm_srli_si128 (output, 4); + input_i_2 = _mm_cvtepi8_epi32(output); + output = _mm_srli_si128 (output, 4); + output_i32 = _mm_add_epi32 (input_i_1, input_i_2); + output_ps_1 = _mm_cvtepi32_ps(output_i32); + + input_i_1 = _mm_cvtepi8_epi32(output); + output = _mm_srli_si128 (output, 4); + input_i_2 = _mm_cvtepi8_epi32(output); + output = _mm_srli_si128 (output, 4); + output_i32 = _mm_add_epi32 (input_i_1, input_i_2); + output_ps_2 = _mm_cvtepi32_ps(output_i32); + + VL_code_acc = _mm_add_ps (VL_code_acc, output_ps_1); + VL_code_acc = _mm_add_ps (VL_code_acc, output_ps_2); + + input_ptr += 8; + carrier_ptr += 8; + VE_code_ptr += 8; + E_code_ptr += 8; + P_code_ptr += 8; + L_code_ptr += 8; + VL_code_ptr += 8; + } + + __VOLK_ATTR_ALIGNED(16) lv_32fc_t VE_dotProductVector[2]; + __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2]; + __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2]; + __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2]; + __VOLK_ATTR_ALIGNED(16) lv_32fc_t VL_dotProductVector[2]; + + _mm_storeu_ps((float*)VE_dotProductVector,VE_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)VL_dotProductVector,VL_code_acc); // Store the results back into the dot product vector + + for (int i = 0; i<2; ++i) + { + *VE_out_ptr += VE_dotProductVector[i]; + *E_out_ptr += E_dotProductVector[i]; + *P_out_ptr += P_dotProductVector[i]; + *L_out_ptr += L_dotProductVector[i]; + *VL_out_ptr += VL_dotProductVector[i]; + } + } + + lv_8sc_t bb_signal_sample; + for(int i=0; i < num_points%8; ++i) + { + //Perform the carrier wipe-off + bb_signal_sample = (*input_ptr++) * 
(*carrier_ptr++); + // Now get very early, early, prompt, late and very late values for each + *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VE_code_ptr++)); + *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); + *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); + *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); + *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VL_code_ptr++)); + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#ifdef LV_HAVE_SSE4_1 +#include "smmintrin.h" +#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h" +#include "CommonMacros/CommonMacros.h" +/*! + \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation + \param input The input signal input + \param carrier The carrier signal input + \param VE_code Very Early PRN code replica input + \param E_code Early PRN code replica input + \param P_code Prompt PRN code replica input + \param L_code Late PRN code replica input + \param VL_code Very Late PRN code replica input + \param VE_out Very Early correlation output + \param E_out Early correlation output + \param P_out Prompt correlation output + \param L_out Late correlation output + \param VL_out Very Late correlation output + \param num_points The number of complex values in vectors + */ +static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_u_sse4_1_second(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points) +{ + const unsigned int sse_iters = num_points / 8; + + __m128i x, x_abs, y, y_aux, bb_signal_sample_aux, bb_signal_sample_aux_abs;; + __m128i mult1, output, real_output, imag_output; + + __m128 VE_code_acc, E_code_acc, P_code_acc, L_code_acc, VL_code_acc; + __m128i input_i_1, input_i_2, output_i32; + __m128 output_ps_1, 
output_ps_2; + + __m128i check_sign_sequence = _mm_set_epi8 (255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1); + + const lv_8sc_t* input_ptr = input; + const lv_8sc_t* carrier_ptr = carrier; + + const lv_8sc_t* VE_code_ptr = VE_code; + lv_32fc_t* VE_out_ptr = VE_out; + const lv_8sc_t* E_code_ptr = E_code; + lv_32fc_t* E_out_ptr = E_out; + const lv_8sc_t* P_code_ptr = P_code; + lv_32fc_t* P_out_ptr = P_out; + const lv_8sc_t* L_code_ptr = L_code; + lv_32fc_t* L_out_ptr = L_out; + const lv_8sc_t* VL_code_ptr = VL_code; + lv_32fc_t* VL_out_ptr = VL_out; + + *VE_out_ptr = 0; + *E_out_ptr = 0; + *P_out_ptr = 0; + *L_out_ptr = 0; + *VL_out_ptr = 0; + + mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); + + VE_code_acc = _mm_setzero_ps(); + E_code_acc = _mm_setzero_ps(); + P_code_acc = _mm_setzero_ps(); + L_code_acc = _mm_setzero_ps(); + VL_code_acc = _mm_setzero_ps(); + + if (sse_iters>0) + { + for(int number = 0;number < sse_iters; number++){ + + //Perform the carrier wipe-off + x = _mm_lddqu_si128((__m128i*)input_ptr); + y = _mm_lddqu_si128((__m128i*)carrier_ptr); + + x_abs = _mm_abs_epi8 (x); + + y_aux = _mm_sign_epi8 (y, x); + y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); + real_output = _mm_maddubs_epi16 (x_abs, y_aux); + + y_aux = _mm_shuffle_epi8 (y, _mm_set_epi8 (14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1)); + y_aux = _mm_sign_epi8 (y_aux, x); + imag_output = _mm_maddubs_epi16 (x_abs, y_aux); + + imag_output = _mm_slli_si128 (imag_output, 1); + bb_signal_sample_aux = _mm_blendv_epi8 (imag_output, real_output, mult1); + + bb_signal_sample_aux_abs = _mm_abs_epi8 (bb_signal_sample_aux); + + //Get very early values + y = _mm_lddqu_si128((__m128i*)VE_code_ptr); + + y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux); + y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); + real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); + + y_aux = _mm_shuffle_epi8 (y, _mm_set_epi8 (14, 15, 12, 13, 10, 11, 
8, 9, 6, 7, 4, 5, 2, 3, 0, 1)); + y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux); + imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); + + imag_output = _mm_slli_si128 (imag_output, 1); + output = _mm_blendv_epi8 (imag_output, real_output, mult1); + + input_i_1 = _mm_cvtepi8_epi32(output); + output = _mm_srli_si128 (output, 4); + input_i_2 = _mm_cvtepi8_epi32(output); + output = _mm_srli_si128 (output, 4); + output_i32 = _mm_add_epi32 (input_i_1, input_i_2); + output_ps_1 = _mm_cvtepi32_ps(output_i32); + + input_i_1 = _mm_cvtepi8_epi32(output); + output = _mm_srli_si128 (output, 4); + input_i_2 = _mm_cvtepi8_epi32(output); + output = _mm_srli_si128 (output, 4); + output_i32 = _mm_add_epi32 (input_i_1, input_i_2); + output_ps_2 = _mm_cvtepi32_ps(output_i32); + + VE_code_acc = _mm_add_ps (VE_code_acc, output_ps_1); + VE_code_acc = _mm_add_ps (VE_code_acc, output_ps_2); + + //Get early values + y = _mm_lddqu_si128((__m128i*)E_code_ptr); + + y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux); + y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); + real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); + + y_aux = _mm_shuffle_epi8 (y, _mm_set_epi8 (14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1)); + y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux); + imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); + + imag_output = _mm_slli_si128 (imag_output, 1); + output = _mm_blendv_epi8 (imag_output, real_output, mult1); + + input_i_1 = _mm_cvtepi8_epi32(output); + output = _mm_srli_si128 (output, 4); + input_i_2 = _mm_cvtepi8_epi32(output); + output = _mm_srli_si128 (output, 4); + output_i32 = _mm_add_epi32 (input_i_1, input_i_2); + output_ps_1 = _mm_cvtepi32_ps(output_i32); + + input_i_1 = _mm_cvtepi8_epi32(output); + output = _mm_srli_si128 (output, 4); + input_i_2 = _mm_cvtepi8_epi32(output); + output = _mm_srli_si128 (output, 4); + output_i32 = _mm_add_epi32 (input_i_1, input_i_2); + output_ps_2 = 
_mm_cvtepi32_ps(output_i32); + + E_code_acc = _mm_add_ps (E_code_acc, output_ps_1); + E_code_acc = _mm_add_ps (E_code_acc, output_ps_2); + + //Get prompt values + y = _mm_lddqu_si128((__m128i*)P_code_ptr); + + y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux); + y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); + real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); + + y_aux = _mm_shuffle_epi8 (y, _mm_set_epi8 (14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1)); + y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux); + imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); + + imag_output = _mm_slli_si128 (imag_output, 1); + output = _mm_blendv_epi8 (imag_output, real_output, mult1); + + input_i_1 = _mm_cvtepi8_epi32(output); + output = _mm_srli_si128 (output, 4); + input_i_2 = _mm_cvtepi8_epi32(output); + output = _mm_srli_si128 (output, 4); + output_i32 = _mm_add_epi32 (input_i_1, input_i_2); + output_ps_1 = _mm_cvtepi32_ps(output_i32); + + input_i_1 = _mm_cvtepi8_epi32(output); + output = _mm_srli_si128 (output, 4); + input_i_2 = _mm_cvtepi8_epi32(output); + output = _mm_srli_si128 (output, 4); + output_i32 = _mm_add_epi32 (input_i_1, input_i_2); + output_ps_2 = _mm_cvtepi32_ps(output_i32); + + P_code_acc = _mm_add_ps (P_code_acc, output_ps_1); + P_code_acc = _mm_add_ps (P_code_acc, output_ps_2); + + //Get late values + y = _mm_lddqu_si128((__m128i*)L_code_ptr); + + y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux); + y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); + real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); + + y_aux = _mm_shuffle_epi8 (y, _mm_set_epi8 (14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1)); + y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux); + imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); + + imag_output = _mm_slli_si128 (imag_output, 1); + output = _mm_blendv_epi8 (imag_output, real_output, mult1); + + input_i_1 = _mm_cvtepi8_epi32(output); + output = 
_mm_srli_si128 (output, 4); + input_i_2 = _mm_cvtepi8_epi32(output); + output = _mm_srli_si128 (output, 4); + output_i32 = _mm_add_epi32 (input_i_1, input_i_2); + output_ps_1 = _mm_cvtepi32_ps(output_i32); + + input_i_1 = _mm_cvtepi8_epi32(output); + output = _mm_srli_si128 (output, 4); + input_i_2 = _mm_cvtepi8_epi32(output); + output = _mm_srli_si128 (output, 4); + output_i32 = _mm_add_epi32 (input_i_1, input_i_2); + output_ps_2 = _mm_cvtepi32_ps(output_i32); + + L_code_acc = _mm_add_ps (L_code_acc, output_ps_1); + L_code_acc = _mm_add_ps (L_code_acc, output_ps_2); + + //Get very late values + y = _mm_lddqu_si128((__m128i*)VL_code_ptr); + + y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux); + y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); + real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); + + y_aux = _mm_shuffle_epi8 (y, _mm_set_epi8 (14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1)); + y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux); + imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); + + imag_output = _mm_slli_si128 (imag_output, 1); + output = _mm_blendv_epi8 (imag_output, real_output, mult1); + + input_i_1 = _mm_cvtepi8_epi32(output); + output = _mm_srli_si128 (output, 4); + input_i_2 = _mm_cvtepi8_epi32(output); + output = _mm_srli_si128 (output, 4); + output_i32 = _mm_add_epi32 (input_i_1, input_i_2); + output_ps_1 = _mm_cvtepi32_ps(output_i32); + + input_i_1 = _mm_cvtepi8_epi32(output); + output = _mm_srli_si128 (output, 4); + input_i_2 = _mm_cvtepi8_epi32(output); + output = _mm_srli_si128 (output, 4); + output_i32 = _mm_add_epi32 (input_i_1, input_i_2); + output_ps_2 = _mm_cvtepi32_ps(output_i32); + + VL_code_acc = _mm_add_ps (VL_code_acc, output_ps_1); + VL_code_acc = _mm_add_ps (VL_code_acc, output_ps_2); + + input_ptr += 8; + carrier_ptr += 8; + VE_code_ptr += 8; + E_code_ptr += 8; + P_code_ptr += 8; + L_code_ptr += 8; + VL_code_ptr += 8; + } + + __VOLK_ATTR_ALIGNED(16) lv_32fc_t VE_dotProductVector[2]; + 
__VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2]; + __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2]; + __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2]; + __VOLK_ATTR_ALIGNED(16) lv_32fc_t VL_dotProductVector[2]; + + _mm_storeu_ps((float*)VE_dotProductVector,VE_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)VL_dotProductVector,VL_code_acc); // Store the results back into the dot product vector + + for (int i = 0; i<2; ++i) + { + *VE_out_ptr += VE_dotProductVector[i]; + *E_out_ptr += E_dotProductVector[i]; + *P_out_ptr += P_dotProductVector[i]; + *L_out_ptr += L_dotProductVector[i]; + *VL_out_ptr += VL_dotProductVector[i]; + } + } + + lv_8sc_t bb_signal_sample; + for(int i=0; i < num_points%8; ++i) + { + //Perform the carrier wipe-off + bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); + // Now get very early, early, prompt, late and very late values for each + *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VE_code_ptr++)); + *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); + *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); + *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); + *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VL_code_ptr++)); + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#ifdef LV_HAVE_SSE4_1 +#include "smmintrin.h" +#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h" +#include "CommonMacros/CommonMacros.h" +/*! 
+ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation + \param input The input signal input + \param carrier The carrier signal input + \param VE_code Very Early PRN code replica input + \param E_code Early PRN code replica input + \param P_code Prompt PRN code replica input + \param L_code Late PRN code replica input + \param VL_code Very Late PRN code replica input + \param VE_out Very Early correlation output + \param E_out Early correlation output + \param P_out Prompt correlation output + \param L_out Late correlation output + \param VL_out Very Late correlation output + \param num_points The number of complex values in vectors + */ +static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_u_sse4_1_third(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points) +{ + const unsigned int sse_iters = num_points / 8; + + __m128i x, x_abs, y, y_aux, bb_signal_sample_aux, bb_signal_sample_aux_abs;; + __m128i mult1, real_output, imag_output; + + __m128 real_VE_code_acc, imag_VE_code_acc, real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc, real_VL_code_acc, imag_VL_code_acc; + __m128i real_output_i_1, real_output_i_2, imag_output_i_1, imag_output_i_2, real_output_i32, imag_output_i32; + __m128 real_output_ps, imag_output_ps; + + __m128i check_sign_sequence = _mm_set_epi8 (255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1); + __m128i rearrange_sequence = _mm_set_epi8 (14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1); + + const lv_8sc_t* input_ptr = input; + const lv_8sc_t* carrier_ptr = carrier; + + const lv_8sc_t* VE_code_ptr = VE_code; + lv_32fc_t* VE_out_ptr = VE_out; + const lv_8sc_t* E_code_ptr = E_code; + lv_32fc_t* E_out_ptr = 
E_out; + const lv_8sc_t* P_code_ptr = P_code; + lv_32fc_t* P_out_ptr = P_out; + const lv_8sc_t* L_code_ptr = L_code; + lv_32fc_t* L_out_ptr = L_out; + const lv_8sc_t* VL_code_ptr = VL_code; + lv_32fc_t* VL_out_ptr = VL_out; + + float VE_out_real = 0; + float VE_out_imag = 0; + float E_out_real = 0; + float E_out_imag = 0; + float P_out_real = 0; + float P_out_imag = 0; + float L_out_real = 0; + float L_out_imag = 0; + float VL_out_real = 0; + float VL_out_imag = 0; + + mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); + + real_VE_code_acc = _mm_setzero_ps(); + imag_VE_code_acc = _mm_setzero_ps(); + real_E_code_acc = _mm_setzero_ps(); + imag_E_code_acc = _mm_setzero_ps(); + real_P_code_acc = _mm_setzero_ps(); + imag_P_code_acc = _mm_setzero_ps(); + real_L_code_acc = _mm_setzero_ps(); + imag_L_code_acc = _mm_setzero_ps(); + real_VL_code_acc = _mm_setzero_ps(); + imag_VL_code_acc = _mm_setzero_ps(); + + if (sse_iters>0) + { + for(int number = 0;number < sse_iters; number++){ + + //Perform the carrier wipe-off + x = _mm_lddqu_si128((__m128i*)input_ptr); + y = _mm_lddqu_si128((__m128i*)carrier_ptr); + + x_abs = _mm_abs_epi8 (x); + + y_aux = _mm_sign_epi8 (y, x); + y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); + real_output = _mm_maddubs_epi16 (x_abs, y_aux); + + y_aux = _mm_shuffle_epi8 (y, rearrange_sequence); + y_aux = _mm_sign_epi8 (y_aux, x); + imag_output = _mm_maddubs_epi16 (x_abs, y_aux); + + imag_output = _mm_slli_si128 (imag_output, 1); + bb_signal_sample_aux = _mm_blendv_epi8 (imag_output, real_output, mult1); + bb_signal_sample_aux_abs = _mm_abs_epi8 (bb_signal_sample_aux); + + //Get very early values + y = _mm_lddqu_si128((__m128i*)VE_code_ptr); + + y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux); + y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); + real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); + + y_aux = _mm_shuffle_epi8 (y, rearrange_sequence); + y_aux = _mm_sign_epi8 (y_aux, 
bb_signal_sample_aux); + imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); + + real_output_i_1 = _mm_cvtepi16_epi32(real_output); + real_output = _mm_srli_si128 (real_output, 8); + real_output_i_2 = _mm_cvtepi16_epi32(real_output); + real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); + real_output_ps = _mm_cvtepi32_ps(real_output_i32); + + imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); + imag_output = _mm_srli_si128 (imag_output, 8); + imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); + imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); + imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); + + real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps); + imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps); + + //Get early values + y = _mm_lddqu_si128((__m128i*)E_code_ptr); + + y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux); + y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); + real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); + + y_aux = _mm_shuffle_epi8 (y, rearrange_sequence); + y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux); + imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); + + real_output_i_1 = _mm_cvtepi16_epi32(real_output); + real_output = _mm_srli_si128 (real_output, 8); + real_output_i_2 = _mm_cvtepi16_epi32(real_output); + real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); + real_output_ps = _mm_cvtepi32_ps(real_output_i32); + + imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); + imag_output = _mm_srli_si128 (imag_output, 8); + imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); + imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); + imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); + + real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps); + imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps); + + //Get prompt values + y = _mm_lddqu_si128((__m128i*)P_code_ptr); + + y_aux = 
_mm_sign_epi8 (y, bb_signal_sample_aux); + y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); + real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); + + y_aux = _mm_shuffle_epi8 (y, rearrange_sequence); + y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux); + imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); + + real_output_i_1 = _mm_cvtepi16_epi32(real_output); + real_output = _mm_srli_si128 (real_output, 8); + real_output_i_2 = _mm_cvtepi16_epi32(real_output); + real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); + real_output_ps = _mm_cvtepi32_ps(real_output_i32); + + imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); + imag_output = _mm_srli_si128 (imag_output, 8); + imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); + imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); + imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); + + real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps); + imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps); + + //Get late values + y = _mm_lddqu_si128((__m128i*)L_code_ptr); + + y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux); + y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); + real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); + + y_aux = _mm_shuffle_epi8 (y, rearrange_sequence); + y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux); + imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); + + real_output_i_1 = _mm_cvtepi16_epi32(real_output); + real_output = _mm_srli_si128 (real_output, 8); + real_output_i_2 = _mm_cvtepi16_epi32(real_output); + real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); + real_output_ps = _mm_cvtepi32_ps(real_output_i32); + + imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); + imag_output = _mm_srli_si128 (imag_output, 8); + imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); + imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); + imag_output_ps = 
_mm_cvtepi32_ps(imag_output_i32); + + real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps); + imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps); + + //Get very late values + y = _mm_lddqu_si128((__m128i*)VL_code_ptr); + + y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux); + y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); + real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); + + y_aux = _mm_shuffle_epi8 (y, _mm_set_epi8 (14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1)); + y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux); + imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); + + real_output_i_1 = _mm_cvtepi16_epi32(real_output); + real_output = _mm_srli_si128 (real_output, 8); + real_output_i_2 = _mm_cvtepi16_epi32(real_output); + real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); + real_output_ps = _mm_cvtepi32_ps(real_output_i32); + + imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); + imag_output = _mm_srli_si128 (imag_output, 8); + imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); + imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); + imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); + + real_VL_code_acc = _mm_add_ps (real_VL_code_acc, real_output_ps); + imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps); + + input_ptr += 8; + carrier_ptr += 8; + VE_code_ptr += 8; + E_code_ptr += 8; + P_code_ptr += 8; + L_code_ptr += 8; + VL_code_ptr += 8; + } + + __VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float 
real_VL_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4]; + + _mm_storeu_ps((float*)real_VE_dotProductVector,real_VE_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)real_VL_dotProductVector,real_VL_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc); // Store the results back into the dot product vector + + for (int i = 0; i<4; ++i) + { + VE_out_real += real_VE_dotProductVector[i]; + VE_out_imag += imag_VE_dotProductVector[i]; + E_out_real += real_E_dotProductVector[i]; + E_out_imag += imag_E_dotProductVector[i]; + P_out_real += real_P_dotProductVector[i]; + P_out_imag += imag_P_dotProductVector[i]; + L_out_real += real_L_dotProductVector[i]; + L_out_imag += imag_L_dotProductVector[i]; + VL_out_real += real_VL_dotProductVector[i]; + VL_out_imag += imag_VL_dotProductVector[i]; + } + *VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag); + *E_out_ptr = lv_cmake(E_out_real, E_out_imag); + *P_out_ptr = lv_cmake(P_out_real, P_out_imag); + *L_out_ptr = lv_cmake(L_out_real, L_out_imag); + *VL_out_ptr 
= lv_cmake(VL_out_real, VL_out_imag); + } + + lv_16sc_t bb_signal_sample; + for(int i=0; i < num_points%8; ++i) + { + //Perform the carrier wipe-off + bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); + // Now get very early, early, prompt, late and very late values for each + *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VE_code_ptr++)); + *E_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*E_code_ptr++)); + *P_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*P_code_ptr++)); + *L_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*L_code_ptr++)); + *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VL_code_ptr++)); + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#ifdef LV_HAVE_SSE4_1 +#include "smmintrin.h" +#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h" +#include "CommonMacros/CommonMacros.h" +/*! + \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation + \param input The input signal input + \param carrier The carrier signal input + \param VE_code Very Early PRN code replica input + \param E_code Early PRN code replica input + \param P_code Prompt PRN code replica input + \param L_code Late PRN code replica input + \param VL_code Very Late PRN code replica input + \param VE_out Very Early correlation output + \param E_out Early correlation output + \param P_out Prompt correlation output + \param L_out Late correlation output + \param VL_out Very Late correlation output + \param num_points The number of complex values in vectors + */ +static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_u_sse4_1_fourth(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points) +{ + const unsigned int sse_iters = num_points / 8; + + __m128i x, x_abs, y, y_aux, 
bb_signal_sample_aux, bb_signal_sample_aux_abs;; + __m128i real_output, imag_output; + __m128 real_VE_code_acc, imag_VE_code_acc, real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc, real_VL_code_acc, imag_VL_code_acc; + __m128i real_output_i_1, real_output_i_2, imag_output_i_1, imag_output_i_2, real_output_i32, imag_output_i32; + __m128 real_output_ps, imag_output_ps; + __m128i minus128control; + + __m128i minus128 = _mm_set1_epi8 (-128); + __m128i check_sign_sequence = _mm_set_epi8 (255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1); + __m128i rearrange_sequence = _mm_set_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1); + __m128i mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); + + const lv_8sc_t* input_ptr = input; + const lv_8sc_t* carrier_ptr = carrier; + + const lv_8sc_t* VE_code_ptr = VE_code; + lv_32fc_t* VE_out_ptr = VE_out; + const lv_8sc_t* E_code_ptr = E_code; + lv_32fc_t* E_out_ptr = E_out; + const lv_8sc_t* P_code_ptr = P_code; + lv_32fc_t* P_out_ptr = P_out; + const lv_8sc_t* L_code_ptr = L_code; + lv_32fc_t* L_out_ptr = L_out; + const lv_8sc_t* VL_code_ptr = VL_code; + lv_32fc_t* VL_out_ptr = VL_out; + + float VE_out_real = 0; + float VE_out_imag = 0; + float E_out_real = 0; + float E_out_imag = 0; + float P_out_real = 0; + float P_out_imag = 0; + float L_out_real = 0; + float L_out_imag = 0; + float VL_out_real = 0; + float VL_out_imag = 0; + + real_VE_code_acc = _mm_setzero_ps(); + imag_VE_code_acc = _mm_setzero_ps(); + real_E_code_acc = _mm_setzero_ps(); + imag_E_code_acc = _mm_setzero_ps(); + real_P_code_acc = _mm_setzero_ps(); + imag_P_code_acc = _mm_setzero_ps(); + real_L_code_acc = _mm_setzero_ps(); + imag_L_code_acc = _mm_setzero_ps(); + real_VL_code_acc = _mm_setzero_ps(); + imag_VL_code_acc = _mm_setzero_ps(); + + if (sse_iters>0) + { + for(int number = 0;number < sse_iters; number++){ + + //Perform the carrier wipe-off + x = 
_mm_lddqu_si128((__m128i*)input_ptr); + y = _mm_lddqu_si128((__m128i*)carrier_ptr); + + x_abs = _mm_abs_epi8 (x); + + y_aux = _mm_sign_epi8 (y, x); + y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); + real_output = _mm_maddubs_epi16 (x_abs, y_aux); + + y_aux = _mm_shuffle_epi8 (y, rearrange_sequence); + y_aux = _mm_sign_epi8 (y_aux, x); + imag_output = _mm_maddubs_epi16 (x_abs, y_aux); + + imag_output = _mm_slli_si128 (imag_output, 1); + bb_signal_sample_aux = _mm_blendv_epi8 (imag_output, real_output, mult1); + bb_signal_sample_aux_abs = _mm_abs_epi8 (bb_signal_sample_aux); + + //Get very early values + y = _mm_lddqu_si128((__m128i*)VE_code_ptr); + minus128control = _mm_cmpeq_epi8 (y, minus128); + y = _mm_sub_epi8 (y, minus128control); + + y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux); + y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); + real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); + + y_aux = _mm_shuffle_epi8 (y, rearrange_sequence); + y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux); + imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); + + real_output_i_1 = _mm_cvtepi16_epi32(real_output); + real_output = _mm_srli_si128 (real_output, 8); + real_output_i_2 = _mm_cvtepi16_epi32(real_output); + real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); + real_output_ps = _mm_cvtepi32_ps(real_output_i32); + + imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); + imag_output = _mm_srli_si128 (imag_output, 8); + imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); + imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); + imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); + + real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps); + imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps); + + //Get early values + y = _mm_lddqu_si128((__m128i*)E_code_ptr); + minus128control = _mm_cmpeq_epi8 (y, minus128); + y = _mm_sub_epi8 (y, minus128control); + + y_aux = _mm_sign_epi8 (y, 
bb_signal_sample_aux); + y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); + real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); + + y_aux = _mm_shuffle_epi8 (y, rearrange_sequence); + y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux); + imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); + + real_output_i_1 = _mm_cvtepi16_epi32(real_output); + real_output = _mm_srli_si128 (real_output, 8); + real_output_i_2 = _mm_cvtepi16_epi32(real_output); + real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); + real_output_ps = _mm_cvtepi32_ps(real_output_i32); + + imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); + imag_output = _mm_srli_si128 (imag_output, 8); + imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); + imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); + imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); + + real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps); + imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps); + + //Get prompt values + y = _mm_lddqu_si128((__m128i*)P_code_ptr); + minus128control = _mm_cmpeq_epi8 (y, minus128); + y = _mm_sub_epi8 (y, minus128control); + + y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux); + y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); + real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); + + y_aux = _mm_shuffle_epi8 (y, rearrange_sequence); + y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux); + imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); + + real_output_i_1 = _mm_cvtepi16_epi32(real_output); + real_output = _mm_srli_si128 (real_output, 8); + real_output_i_2 = _mm_cvtepi16_epi32(real_output); + real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); + real_output_ps = _mm_cvtepi32_ps(real_output_i32); + + imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); + imag_output = _mm_srli_si128 (imag_output, 8); + imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); + imag_output_i32 = 
_mm_add_epi32 (imag_output_i_1, imag_output_i_2); + imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); + + real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps); + imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps); + + //Get late values + y = _mm_lddqu_si128((__m128i*)L_code_ptr); + minus128control = _mm_cmpeq_epi8 (y, minus128); + y = _mm_sub_epi8 (y, minus128control); + + y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux); + y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); + real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); + + y_aux = _mm_shuffle_epi8 (y, rearrange_sequence); + y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux); + imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); + + real_output_i_1 = _mm_cvtepi16_epi32(real_output); + real_output = _mm_srli_si128 (real_output, 8); + real_output_i_2 = _mm_cvtepi16_epi32(real_output); + real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); + real_output_ps = _mm_cvtepi32_ps(real_output_i32); + + imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); + imag_output = _mm_srli_si128 (imag_output, 8); + imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); + imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); + imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); + + real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps); + imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps); + + //Get very late values + y = _mm_lddqu_si128((__m128i*)VL_code_ptr); + minus128control = _mm_cmpeq_epi8 (y, minus128); + y = _mm_sub_epi8 (y, minus128control); + + + y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux); + y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); + real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); + + y_aux = _mm_shuffle_epi8 (y, _mm_set_epi8 (14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1)); + y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux); + imag_output = _mm_maddubs_epi16 
(bb_signal_sample_aux_abs, y_aux); + + real_output_i_1 = _mm_cvtepi16_epi32(real_output); + real_output = _mm_srli_si128 (real_output, 8); + real_output_i_2 = _mm_cvtepi16_epi32(real_output); + real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); + real_output_ps = _mm_cvtepi32_ps(real_output_i32); + + imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); + imag_output = _mm_srli_si128 (imag_output, 8); + imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); + imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); + imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); + + real_VL_code_acc = _mm_add_ps (real_VL_code_acc, real_output_ps); + imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps); + + input_ptr += 8; + carrier_ptr += 8; + VE_code_ptr += 8; + E_code_ptr += 8; + P_code_ptr += 8; + L_code_ptr += 8; + VL_code_ptr += 8; + } + + __VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4]; + + _mm_storeu_ps((float*)real_VE_dotProductVector,real_VE_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector + 
_mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)real_VL_dotProductVector,real_VL_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc); // Store the results back into the dot product vector + + for (int i = 0; i<4; ++i) + { + VE_out_real += real_VE_dotProductVector[i]; + VE_out_imag += imag_VE_dotProductVector[i]; + E_out_real += real_E_dotProductVector[i]; + E_out_imag += imag_E_dotProductVector[i]; + P_out_real += real_P_dotProductVector[i]; + P_out_imag += imag_P_dotProductVector[i]; + L_out_real += real_L_dotProductVector[i]; + L_out_imag += imag_L_dotProductVector[i]; + VL_out_real += real_VL_dotProductVector[i]; + VL_out_imag += imag_VL_dotProductVector[i]; + } + *VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag); + *E_out_ptr = lv_cmake(E_out_real, E_out_imag); + *P_out_ptr = lv_cmake(P_out_real, P_out_imag); + *L_out_ptr = lv_cmake(L_out_real, L_out_imag); + *VL_out_ptr = lv_cmake(VL_out_real, VL_out_imag); + } + + lv_16sc_t bb_signal_sample; + for(int i=0; i < num_points%8; ++i) + { + //Perform the carrier wipe-off + bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); + // Now get very early, early, prompt, late and very late values for each + *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VE_code_ptr++)); + *E_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*E_code_ptr++)); + *P_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*P_code_ptr++)); + *L_out_ptr += (lv_32fc_t) (bb_signal_sample * 
((lv_16sc_t)*L_code_ptr++)); + *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VL_code_ptr++)); + } +} +#endif /* LV_HAVE_SSE4_1 */ + + +#ifdef LV_HAVE_GENERIC +#include +#include + +/*! + \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation + \param input The input signal input + \param carrier The carrier signal input + \param VE_code Very Early PRN code replica input + \param E_code Early PRN code replica input + \param P_code Prompt PRN code replica input + \param L_code Late PRN code replica input + \param VL_code Very Late PRN code replica input + \param VE_out Very Early correlation output + \param E_out Early correlation output + \param P_out Prompt correlation output + \param L_out Late correlation output + \param VL_out Very Late correlation output + \param num_points The number of complex values in vectors + */ +static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points) +{ + *VE_out = 0; + *E_out = 0; + *P_out = 0; + *L_out = 0; + *VL_out = 0; + + + lv_16sc_t VE_code_value; + lv_16sc_t E_code_value; + lv_16sc_t P_code_value; + lv_16sc_t L_code_value; + lv_16sc_t VL_code_value; + lv_16sc_t bb_signal_sample; + + for(int i=0; i < num_points; ++i) + { + VE_code_value = VE_code[i]; + E_code_value = E_code[i]; + P_code_value = P_code[i]; + L_code_value = L_code[i]; + VL_code_value = VL_code[i]; + + if(lv_creal(VE_code_value) == -128) + { + VE_code_value = lv_cmake(-127, lv_cimag(VE_code_value)); + } + if(lv_cimag(VE_code_value) == -128) + { + VE_code_value = lv_cmake(lv_creal(VE_code_value), -127); + } + + if(lv_creal(E_code_value) == -128) + { + E_code_value = lv_cmake(-127, lv_cimag(E_code_value)); + } + 
if(lv_cimag(E_code_value) == -128) + { + E_code_value = lv_cmake(lv_creal(E_code_value), -127); + } + + if(lv_creal(P_code_value) == -128) + { + P_code_value = lv_cmake(-127, lv_cimag(P_code_value)); + } + if(lv_cimag(P_code_value) == -128) + { + P_code_value = lv_cmake(lv_creal(P_code_value), -127); + } + + if(lv_creal(L_code_value) == -128) + { + L_code_value = lv_cmake(-127, lv_cimag(L_code_value)); + } + if(lv_cimag(L_code_value) == -128) + { + L_code_value = lv_cmake(lv_creal(L_code_value), -127); + } + + if(lv_creal(VL_code_value) == -128) + { + VL_code_value = lv_cmake(-127, lv_cimag(VL_code_value)); + } + if(lv_cimag(VL_code_value) == -128) + { + VL_code_value = lv_cmake(lv_creal(VL_code_value), -127); + } + + //Perform the carrier wipe-off + bb_signal_sample = input[i] * carrier[i]; + // Now get very early, early, prompt, late and very late values for each + *VE_out += (lv_32fc_t) (bb_signal_sample * VE_code_value); + *E_out += (lv_32fc_t) (bb_signal_sample * E_code_value); + *P_out += (lv_32fc_t) (bb_signal_sample * P_code_value); + *L_out += (lv_32fc_t) (bb_signal_sample * L_code_value); + *VL_out += (lv_32fc_t) (bb_signal_sample * VL_code_value); + } +} + +#endif /* LV_HAVE_GENERIC */ + +//#ifdef LV_HAVE_GENERIC +//#include +//#include +//#include +// +//#ifndef MAX +//#define MAX(a,b) ((a) > (b) ? a : b) +//#endif +// +//#ifndef MIN +//#define MIN(a,b) ((a) < (b) ? a : b) +//#endif +// +///*! 
+// \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation +// \param input The input signal input +// \param carrier The carrier signal input +// \param VE_code Very Early PRN code replica input +// \param E_code Early PRN code replica input +// \param P_code Prompt PRN code replica input +// \param L_code Late PRN code replica input +// \param VL_code Very Late PRN code replica input +// \param VE_out Very Early correlation output +// \param E_out Early correlation output +// \param P_out Prompt correlation output +// \param L_out Late correlation output +// \param VL_out Very Late correlation output +// \param num_points The number of complex values in vectors +// */ +//static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points) +//{ +// *VE_out = 0; +// *E_out = 0; +// *P_out = 0; +// *L_out = 0; +// *VL_out = 0; +// +// lv_16sc_t VE_out16; +// lv_16sc_t E_out16; +// lv_16sc_t P_out16; +// lv_16sc_t L_out16; +// lv_16sc_t VL_out16; +// +// int32_t max = 32767; +// int32_t min = -32768; +// +// int16_t real_real; +// int16_t imag_imag; +// int16_t real_imag; +// int16_t imag_real; +// int32_t out_real_32; +// int32_t out_imag_32; +// int16_t out_real_16; +// int16_t out_imag_16; +// int16_t aux1; +// int16_t aux2; +// +// +// lv_8sc_t bb_signal_sample = lv_cmake(0, 0); +// +// // perform very early, Early, Prompt, Late and very late correlation +// for(int i=0; i < num_points; ++i) +// { +// //Perform the carrier wipe-off +// bb_signal_sample = input[i] * carrier[i]; +// +// aux1 = (int16_t)lv_creal(bb_signal_sample); +// aux2 = (int16_t)lv_creal(VE_code[i]); +// real_real = aux1*aux2; +// aux1 = (int16_t)lv_cimag(bb_signal_sample); 
+// aux2 = (int16_t)lv_cimag(VE_code[i]); +// imag_imag = aux1*aux2; +// aux1 = (int16_t)lv_creal(bb_signal_sample); +// aux2 = (int16_t)lv_cimag(VE_code[i]); +// real_imag = aux1*aux2; +// aux1 = (int16_t)lv_cimag(bb_signal_sample); +// aux2 = (int16_t)lv_creal(VE_code[i]); +// imag_real = aux1*aux2; +// out_real_32 = (int32_t)real_real - (int32_t)imag_imag; +// out_imag_32 = (int32_t)real_imag + (int32_t)imag_real; +// out_real_16 = MIN(MAX(out_real_32, min), max); +// out_imag_16 = MIN(MAX(out_imag_32, min), max); +// VE_out16 = lv_cmake(out_real_16, out_imag_16); +// +// +// +// if(lv_creal(L_code[i]) == -128) +// { +// int8_t* L_pointer = (int8_t*)&L_code[i]; +// *L_pointer = -127; +// } +// if(lv_cimag(L_code[i]) == -128) +// { +// int8_t* L_pointer = (int8_t*)&L_code[i]; +// L_pointer++; +// *L_pointer = -127; +// } +// aux1 = (int16_t)lv_creal(bb_signal_sample); +// aux2 = (int16_t)lv_creal(L_code[i]); +// real_real = aux1*aux2; +// aux1 = (int16_t)lv_cimag(bb_signal_sample); +// aux2 = (int16_t)lv_cimag(L_code[i]); +// imag_imag = aux1*aux2; +// aux1 = (int16_t)lv_creal(bb_signal_sample); +// aux2 = (int16_t)lv_cimag(L_code[i]); +// real_imag = aux1*aux2; +// aux1 = (int16_t)lv_cimag(bb_signal_sample); +// aux2 = (int16_t)lv_creal(L_code[i]); +// imag_real = aux1*aux2; +// out_real_32 = (int32_t)real_real - (int32_t)imag_imag; +// out_imag_32 = (int32_t)real_imag + (int32_t)imag_real; +// out_real_16 = MIN(MAX(out_real_32, min), max); +// out_imag_16 = MIN(MAX(out_imag_32, min), max); +// L_out16 = lv_cmake(out_real_16, out_imag_16); +// +// E_out16 = (lv_16sc_t)bb_signal_sample * (lv_16sc_t)E_code[i]; +// P_out16 = (lv_16sc_t)bb_signal_sample * (lv_16sc_t)P_code[i]; +// VL_out16 = (lv_16sc_t)bb_signal_sample * (lv_16sc_t)VL_code[i]; +// +// +// *VE_out += (lv_32fc_t) VE_out16; +// *E_out += (lv_32fc_t) E_out16; +// *P_out += (lv_32fc_t) P_out16; +// *L_out += (lv_32fc_t) L_out16; +// *VL_out += (lv_32fc_t) VL_out16; +// +// //error en la parte real de L 
con 32 muestras +// //*L_out = lv_cmake(12, 12); +// } +//} +// +//#endif /* LV_HAVE_GENERIC */ + +//#ifdef LV_HAVE_GENERIC +///*! +// \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation +// \param input The input signal input +// \param carrier The carrier signal input +// \param VE_code Very Early PRN code replica input +// \param E_code Early PRN code replica input +// \param P_code Prompt PRN code replica input +// \param L_code Late PRN code replica input +// \param VL_code Very Late PRN code replica input +// \param VE_out Very Early correlation output +// \param E_out Early correlation output +// \param P_out Prompt correlation output +// \param L_out Late correlation output +// \param VL_out Very Late correlation output +// \param num_points The number of complex values in vectors +// */ +//static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points) +//{ +// lv_8sc_t bb_signal_sample; +// +// bb_signal_sample = lv_cmake(0, 0); +// +// *VE_out = 0; +// *E_out = 0; +// *P_out = 0; +// *L_out = 0; +// *VL_out = 0; +// // perform very early, Early, Prompt, Late and very late correlation +// for(int i=0; i < num_points; ++i) +// { +// //Perform the carrier wipe-off +// bb_signal_sample = input[i] * carrier[i]; +// +// *VE_out += (lv_32fc_t) (bb_signal_sample * VE_code[i]); +// *E_out += (lv_32fc_t) (bb_signal_sample * E_code[i]); +// *P_out += (lv_32fc_t) (bb_signal_sample * P_code[i]); +// *L_out += (lv_32fc_t) (bb_signal_sample * L_code[i]); +// *VL_out += (lv_32fc_t) (bb_signal_sample * VL_code[i]); +// } +//} +// +//#endif /* LV_HAVE_GENERIC */ + +#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_u_H */ \ 
No newline at end of file diff --git a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5.h b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5.h new file mode 100644 index 000000000..ea2bf0824 --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5.h @@ -0,0 +1,772 @@ +/*! + * \file volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5.h + * \brief Volk protokernel: performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation with 16-bit vectors, and accumulates the results into float32. This protokernel is called "safe" because it checks whether the code inputs contain a -128 value and replaces it with -127. This avoids malfunctioning, but it takes more time than the "unsafe" implementation. In order to avoid overflow, "input" and "carrier" must be values between -7 and 7 and the "XX_code" inputs must be values between -127 and 127. + * \authors
    + *
  • Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com + *
+ * + * Volk protokernel that performs the carrier wipe-off mixing and the + * Very Early, Early, Prompt, Late and Very Late correlation with 16-bit vectors (8 bits the + * real part and 8 bits the imaginary part), and accumulates the result + * in 32-bit single-precision values, returning float32 values: + * - The carrier wipe-off is done by multiplying the input signal by the + * carrier (multiplication of 16-bit vectors). It returns the input + * signal in baseband (BB). + * - Very Early values are calculated by multiplying the input signal in BB by the + * very early code (multiplication of 16-bit vectors), accumulating the results into float32 values + * - Early values are calculated by multiplying the input signal in BB by the + * early code (multiplication of 16-bit vectors), accumulating the results into float32 values + * - Prompt values are calculated by multiplying the input signal in BB by the + * prompt code (multiplication of 16-bit vectors), accumulating the results into float32 values + * - Late values are calculated by multiplying the input signal in BB by the + * late code (multiplication of 16-bit vectors), accumulating the results into float32 values + * - Very Late values are calculated by multiplying the input signal in BB by the + * very late code (multiplication of 16-bit vectors), accumulating the results into float32 values + * + * ------------------------------------------------------------------------- + * Bits analysis + * + * input = 8 bits + * carrier = 8 bits + * XX_code = 8 bits + * XX_out16 = 16 bits + * bb_signal_sample = 8 bits + * + * bb_signal_sample = input*carrier -> 17 bits limited to 8 bits, so input and carrier must be values between -7 and 7 to avoid overflow (3 bits) + * + * XX_out16 = XX_code*bb_signal_sample -> 17 bits limited to 16 bits, so XX_code must be values between -127 and 127 to avoid overflow (7 bits) + * ------------------------------------------------------------------------- + * + * Copyright (C) 2010-2014 
(see AUTHORS file for a list of contributors) + * + * GNSS-SDR is a software defined Global Navigation + * Satellite Systems receiver + * + * This file is part of GNSS-SDR. + * + * GNSS-SDR is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * GNSS-SDR is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>. + * + * ------------------------------------------------------------------------- + */ + +#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_u_H +#define INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_u_H + +#include +#include +#include +#include +#include + +#ifdef LV_HAVE_SSE4_1 +#include "smmintrin.h" +#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h" +#include "CommonMacros/CommonMacros.h" +/*! 
+ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation + \param input The input signal input + \param carrier The carrier signal input + \param VE_code Very Early PRN code replica input + \param E_code Early PRN code replica input + \param P_code Prompt PRN code replica input + \param L_code Late PRN code replica input + \param VL_code Very Late PRN code replica input + \param VE_out Very Early correlation output + \param E_out Early correlation output + \param P_out Prompt correlation output + \param L_out Late correlation output + \param VL_out Very Late correlation output + \param num_points The number of complex values in vectors + */ +static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_u_sse4_1(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points) +{ + const unsigned int sse_iters = num_points / 8; + + __m128i x, x_abs, y, y_aux, bb_signal_sample_aux, bb_signal_sample_aux_abs;; + __m128i real_output, imag_output; + __m128 real_VE_code_acc, imag_VE_code_acc, real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc, real_VL_code_acc, imag_VL_code_acc; + __m128i input_i_1, input_i_2, output_i32; + __m128 real_output_ps, imag_output_ps; + __m128i minus128control; + + __m128i minus128 = _mm_set1_epi8 (-128); + __m128i check_sign_sequence = _mm_set_epi8 (255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1); + __m128i rearrange_sequence = _mm_set_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1); + __m128i mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); + + const lv_8sc_t* input_ptr = input; + const lv_8sc_t* carrier_ptr = carrier; + + const lv_8sc_t* VE_code_ptr = VE_code; + lv_32fc_t* 
VE_out_ptr = VE_out; + const lv_8sc_t* E_code_ptr = E_code; + lv_32fc_t* E_out_ptr = E_out; + const lv_8sc_t* P_code_ptr = P_code; + lv_32fc_t* P_out_ptr = P_out; + const lv_8sc_t* L_code_ptr = L_code; + lv_32fc_t* L_out_ptr = L_out; + const lv_8sc_t* VL_code_ptr = VL_code; + lv_32fc_t* VL_out_ptr = VL_out; + + float VE_out_real = 0; + float VE_out_imag = 0; + float E_out_real = 0; + float E_out_imag = 0; + float P_out_real = 0; + float P_out_imag = 0; + float L_out_real = 0; + float L_out_imag = 0; + float VL_out_real = 0; + float VL_out_imag = 0; + + real_VE_code_acc = _mm_setzero_ps(); + imag_VE_code_acc = _mm_setzero_ps(); + real_E_code_acc = _mm_setzero_ps(); + imag_E_code_acc = _mm_setzero_ps(); + real_P_code_acc = _mm_setzero_ps(); + imag_P_code_acc = _mm_setzero_ps(); + real_L_code_acc = _mm_setzero_ps(); + imag_L_code_acc = _mm_setzero_ps(); + real_VL_code_acc = _mm_setzero_ps(); + imag_VL_code_acc = _mm_setzero_ps(); + + if (sse_iters>0) + { + for(int number = 0;number < sse_iters; number++){ + + //Perform the carrier wipe-off + x = _mm_lddqu_si128((__m128i*)input_ptr); + y = _mm_lddqu_si128((__m128i*)carrier_ptr); + + x_abs = _mm_abs_epi8 (x); + + CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3(y, x, check_sign_sequence, rearrange_sequence, y_aux, x_abs, real_output, imag_output) + + imag_output = _mm_slli_si128 (imag_output, 1); + bb_signal_sample_aux = _mm_blendv_epi8 (imag_output, real_output, mult1); + bb_signal_sample_aux_abs = _mm_abs_epi8 (bb_signal_sample_aux); + + //Get very early values + y = _mm_lddqu_si128((__m128i*)VE_code_ptr); + + CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) + + real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps); + imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps); + + //Get early values + y 
= _mm_lddqu_si128((__m128i*)E_code_ptr); + + CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) + + real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps); + imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps); + + //Get prompt values + y = _mm_lddqu_si128((__m128i*)P_code_ptr); + + CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) + + real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps); + imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps); + + //Get late values + y = _mm_lddqu_si128((__m128i*)L_code_ptr); + + CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) + + real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps); + imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps); + + //Get very late values + y = _mm_lddqu_si128((__m128i*)VL_code_ptr); + + CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) + + real_VL_code_acc = _mm_add_ps (real_VL_code_acc, real_output_ps); + imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps); + + input_ptr += 8; + carrier_ptr += 8; + VE_code_ptr += 8; + E_code_ptr += 8; + P_code_ptr += 8; + L_code_ptr += 8; + VL_code_ptr += 8; + } + + 
__VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4]; + + _mm_storeu_ps((float*)real_VE_dotProductVector,real_VE_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)real_VL_dotProductVector,real_VL_code_acc); // Store the results back into the dot product vector + _mm_storeu_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc); // Store the results back into the dot product vector + + for (int i = 0; i<4; ++i) + { + VE_out_real += real_VE_dotProductVector[i]; + VE_out_imag += imag_VE_dotProductVector[i]; + E_out_real += real_E_dotProductVector[i]; + E_out_imag += 
imag_E_dotProductVector[i]; + P_out_real += real_P_dotProductVector[i]; + P_out_imag += imag_P_dotProductVector[i]; + L_out_real += real_L_dotProductVector[i]; + L_out_imag += imag_L_dotProductVector[i]; + VL_out_real += real_VL_dotProductVector[i]; + VL_out_imag += imag_VL_dotProductVector[i]; + } + *VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag); + *E_out_ptr = lv_cmake(E_out_real, E_out_imag); + *P_out_ptr = lv_cmake(P_out_real, P_out_imag); + *L_out_ptr = lv_cmake(L_out_real, L_out_imag); + *VL_out_ptr = lv_cmake(VL_out_real, VL_out_imag); + } + + if(num_points%8!=0) + { + lv_16sc_t bb_signal_sample; + lv_16sc_t VE_code_value; + lv_16sc_t E_code_value; + lv_16sc_t P_code_value; + lv_16sc_t L_code_value; + lv_16sc_t VL_code_value; + + for(int i=0; i < num_points%8; ++i) + { + VE_code_value = *VE_code_ptr++; + E_code_value = *E_code_ptr++; + P_code_value = *P_code_ptr++; + L_code_value = *L_code_ptr++; + VL_code_value = *VL_code_ptr++; + + if(lv_creal(VE_code_value) == -128) + { + VE_code_value = lv_cmake(-127, lv_cimag(VE_code_value)); + } + if(lv_cimag(VE_code_value) == -128) + { + VE_code_value = lv_cmake(lv_creal(VE_code_value), -127); + } + + if(lv_creal(E_code_value) == -128) + { + E_code_value = lv_cmake(-127, lv_cimag(E_code_value)); + } + if(lv_cimag(E_code_value) == -128) + { + E_code_value = lv_cmake(lv_creal(E_code_value), -127); + } + + if(lv_creal(P_code_value) == -128) + { + P_code_value = lv_cmake(-127, lv_cimag(P_code_value)); + } + if(lv_cimag(P_code_value) == -128) + { + P_code_value = lv_cmake(lv_creal(P_code_value), -127); + } + + if(lv_creal(L_code_value) == -128) + { + L_code_value = lv_cmake(-127, lv_cimag(L_code_value)); + } + if(lv_cimag(L_code_value) == -128) + { + L_code_value = lv_cmake(lv_creal(L_code_value), -127); + } + + //Perform the carrier wipe-off + bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); + // Now get very early, early, prompt, late and very late values for each + *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * 
VE_code_value); + *E_out_ptr += (lv_32fc_t) (bb_signal_sample * E_code_value); + *P_out_ptr += (lv_32fc_t) (bb_signal_sample * P_code_value); + *L_out_ptr += (lv_32fc_t) (bb_signal_sample * L_code_value); + *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * VL_code_value); + } + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#ifdef LV_HAVE_GENERIC +#include +#include + +/*! + \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation + \param input The input signal input + \param carrier The carrier signal input + \param VE_code Very Early PRN code replica input + \param E_code Early PRN code replica input + \param P_code Prompt PRN code replica input + \param L_code Late PRN code replica input + \param VL_code Very Late PRN code replica input + \param VE_out Very Early correlation output + \param E_out Early correlation output + \param P_out Prompt correlation output + \param L_out Late correlation output + \param VL_out Very Late correlation output + \param num_points The number of complex values in vectors + */ +static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points) +{ + *VE_out = 0; + *E_out = 0; + *P_out = 0; + *L_out = 0; + *VL_out = 0; + + lv_16sc_t VE_code_value; + lv_16sc_t E_code_value; + lv_16sc_t P_code_value; + lv_16sc_t L_code_value; + lv_16sc_t VL_code_value; + lv_16sc_t bb_signal_sample; + + for(int i=0; i < num_points; ++i) + { + VE_code_value = VE_code[i]; + E_code_value = E_code[i]; + P_code_value = P_code[i]; + L_code_value = L_code[i]; + VL_code_value = VL_code[i]; + + if(lv_creal(VE_code_value) == -128) + { + VE_code_value = lv_cmake(-127, lv_cimag(VE_code_value)); + } + if(lv_cimag(VE_code_value) == -128) + { + VE_code_value = 
lv_cmake(lv_creal(VE_code_value), -127); + } + + if(lv_creal(E_code_value) == -128) + { + E_code_value = lv_cmake(-127, lv_cimag(E_code_value)); + } + if(lv_cimag(E_code_value) == -128) + { + E_code_value = lv_cmake(lv_creal(E_code_value), -127); + } + + if(lv_creal(P_code_value) == -128) + { + P_code_value = lv_cmake(-127, lv_cimag(P_code_value)); + } + if(lv_cimag(P_code_value) == -128) + { + P_code_value = lv_cmake(lv_creal(P_code_value), -127); + } + + if(lv_creal(L_code_value) == -128) + { + L_code_value = lv_cmake(-127, lv_cimag(L_code_value)); + } + if(lv_cimag(L_code_value) == -128) + { + L_code_value = lv_cmake(lv_creal(L_code_value), -127); + } + + if(lv_creal(VL_code_value) == -128) + { + VL_code_value = lv_cmake(-127, lv_cimag(VL_code_value)); + } + if(lv_cimag(VL_code_value) == -128) + { + VL_code_value = lv_cmake(lv_creal(VL_code_value), -127); + } + + //Perform the carrier wipe-off + bb_signal_sample = input[i] * carrier[i]; + // Now get very early, early, prompt, late and very late values for each + *VE_out += (lv_32fc_t) (bb_signal_sample * VE_code_value); + *E_out += (lv_32fc_t) (bb_signal_sample * E_code_value); + *P_out += (lv_32fc_t) (bb_signal_sample * P_code_value); + *L_out += (lv_32fc_t) (bb_signal_sample * L_code_value); + *VL_out += (lv_32fc_t) (bb_signal_sample * VL_code_value); + } +} +#endif /* LV_HAVE_GENERIC */ +#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_u_H */ + + +#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_a_H +#define INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_a_H + +#include +#include +#include +#include +#include + +#ifdef LV_HAVE_SSE4_1 +#include "smmintrin.h" +#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h" +#include "CommonMacros/CommonMacros.h" +/*! 
+ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation + \param input The input signal input + \param carrier The carrier signal input + \param VE_code Very Early PRN code replica input + \param E_code Early PRN code replica input + \param P_code Prompt PRN code replica input + \param L_code Late PRN code replica input + \param VL_code Very Late PRN code replica input + \param VE_out Very Early correlation output + \param E_out Early correlation output + \param P_out Prompt correlation output + \param L_out Late correlation output + \param VL_out Very Late correlation output + \param num_points The number of complex values in vectors + */ +static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_a_sse4_1(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points) +{ + const unsigned int sse_iters = num_points / 8; + + __m128i x, x_abs, y, y_aux, bb_signal_sample_aux, bb_signal_sample_aux_abs;; + __m128i real_output, imag_output; + __m128 real_VE_code_acc, imag_VE_code_acc, real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc, real_VL_code_acc, imag_VL_code_acc; + __m128i input_i_1, input_i_2, output_i32; + __m128 real_output_ps, imag_output_ps; + __m128i minus128control; + + __m128i minus128 = _mm_set1_epi8 (-128); + __m128i check_sign_sequence = _mm_set_epi8 (255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1); + __m128i rearrange_sequence = _mm_set_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1); + __m128i mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); + + const lv_8sc_t* input_ptr = input; + const lv_8sc_t* carrier_ptr = carrier; + + const lv_8sc_t* VE_code_ptr = VE_code; + lv_32fc_t* 
VE_out_ptr = VE_out; + const lv_8sc_t* E_code_ptr = E_code; + lv_32fc_t* E_out_ptr = E_out; + const lv_8sc_t* P_code_ptr = P_code; + lv_32fc_t* P_out_ptr = P_out; + const lv_8sc_t* L_code_ptr = L_code; + lv_32fc_t* L_out_ptr = L_out; + const lv_8sc_t* VL_code_ptr = VL_code; + lv_32fc_t* VL_out_ptr = VL_out; + + float VE_out_real = 0; + float VE_out_imag = 0; + float E_out_real = 0; + float E_out_imag = 0; + float P_out_real = 0; + float P_out_imag = 0; + float L_out_real = 0; + float L_out_imag = 0; + float VL_out_real = 0; + float VL_out_imag = 0; + + real_VE_code_acc = _mm_setzero_ps(); + imag_VE_code_acc = _mm_setzero_ps(); + real_E_code_acc = _mm_setzero_ps(); + imag_E_code_acc = _mm_setzero_ps(); + real_P_code_acc = _mm_setzero_ps(); + imag_P_code_acc = _mm_setzero_ps(); + real_L_code_acc = _mm_setzero_ps(); + imag_L_code_acc = _mm_setzero_ps(); + real_VL_code_acc = _mm_setzero_ps(); + imag_VL_code_acc = _mm_setzero_ps(); + + if (sse_iters>0) + { + for(int number = 0;number < sse_iters; number++){ + + //Perform the carrier wipe-off + x = _mm_load_si128((__m128i*)input_ptr); + y = _mm_load_si128((__m128i*)carrier_ptr); + + x_abs = _mm_abs_epi8 (x); + + CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3(y, x, check_sign_sequence, rearrange_sequence, y_aux, x_abs, real_output, imag_output) + + imag_output = _mm_slli_si128 (imag_output, 1); + bb_signal_sample_aux = _mm_blendv_epi8 (imag_output, real_output, mult1); + bb_signal_sample_aux_abs = _mm_abs_epi8 (bb_signal_sample_aux); + + //Get very early values + y = _mm_load_si128((__m128i*)VE_code_ptr); + + CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) + + real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps); + imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps); + + //Get early values + y = 
_mm_load_si128((__m128i*)E_code_ptr); + + CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) + + real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps); + imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps); + + //Get prompt values + y = _mm_load_si128((__m128i*)P_code_ptr); + + CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) + + real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps); + imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps); + + //Get late values + y = _mm_load_si128((__m128i*)L_code_ptr); + + CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) + + real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps); + imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps); + + //Get very late values + y = _mm_load_si128((__m128i*)VL_code_ptr); + + CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) + + real_VL_code_acc = _mm_add_ps (real_VL_code_acc, real_output_ps); + imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps); + + input_ptr += 8; + carrier_ptr += 8; + VE_code_ptr += 8; + E_code_ptr += 8; + P_code_ptr += 8; + L_code_ptr += 8; + VL_code_ptr += 8; + } + + 
__VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4]; + + _mm_store_ps((float*)real_VE_dotProductVector,real_VE_code_acc); // Store the results back into the dot product vector + _mm_store_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc); // Store the results back into the dot product vector + _mm_store_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector + _mm_store_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector + _mm_store_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector + _mm_store_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector + _mm_store_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector + _mm_store_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector + _mm_store_ps((float*)real_VL_dotProductVector,real_VL_code_acc); // Store the results back into the dot product vector + _mm_store_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc); // Store the results back into the dot product vector + + for (int i = 0; i<4; ++i) + { + VE_out_real += real_VE_dotProductVector[i]; + VE_out_imag += imag_VE_dotProductVector[i]; + E_out_real += real_E_dotProductVector[i]; + E_out_imag += 
imag_E_dotProductVector[i]; + P_out_real += real_P_dotProductVector[i]; + P_out_imag += imag_P_dotProductVector[i]; + L_out_real += real_L_dotProductVector[i]; + L_out_imag += imag_L_dotProductVector[i]; + VL_out_real += real_VL_dotProductVector[i]; + VL_out_imag += imag_VL_dotProductVector[i]; + } + *VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag); + *E_out_ptr = lv_cmake(E_out_real, E_out_imag); + *P_out_ptr = lv_cmake(P_out_real, P_out_imag); + *L_out_ptr = lv_cmake(L_out_real, L_out_imag); + *VL_out_ptr = lv_cmake(VL_out_real, VL_out_imag); + } + + if(num_points%8!=0) + { + lv_16sc_t bb_signal_sample; + lv_16sc_t VE_code_value; + lv_16sc_t E_code_value; + lv_16sc_t P_code_value; + lv_16sc_t L_code_value; + lv_16sc_t VL_code_value; + + for(int i=0; i < num_points%8; ++i) + { + VE_code_value = *VE_code_ptr++; + E_code_value = *E_code_ptr++; + P_code_value = *P_code_ptr++; + L_code_value = *L_code_ptr++; + VL_code_value = *VL_code_ptr++; + + if(lv_creal(VE_code_value) == -128) + { + VE_code_value = lv_cmake(-127, lv_cimag(VE_code_value)); + } + if(lv_cimag(VE_code_value) == -128) + { + VE_code_value = lv_cmake(lv_creal(VE_code_value), -127); + } + + if(lv_creal(E_code_value) == -128) + { + E_code_value = lv_cmake(-127, lv_cimag(E_code_value)); + } + if(lv_cimag(E_code_value) == -128) + { + E_code_value = lv_cmake(lv_creal(E_code_value), -127); + } + + if(lv_creal(P_code_value) == -128) + { + P_code_value = lv_cmake(-127, lv_cimag(P_code_value)); + } + if(lv_cimag(P_code_value) == -128) + { + P_code_value = lv_cmake(lv_creal(P_code_value), -127); + } + + if(lv_creal(L_code_value) == -128) + { + L_code_value = lv_cmake(-127, lv_cimag(L_code_value)); + } + if(lv_cimag(L_code_value) == -128) + { + L_code_value = lv_cmake(lv_creal(L_code_value), -127); + } + + //Perform the carrier wipe-off + bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); + // Now get very early, early, prompt, late and very late values for each + *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * 
VE_code_value);
+            *E_out_ptr += (lv_32fc_t) (bb_signal_sample * E_code_value);
+            *P_out_ptr += (lv_32fc_t) (bb_signal_sample * P_code_value);
+            *L_out_ptr += (lv_32fc_t) (bb_signal_sample * L_code_value);
+            *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * VL_code_value);
+        }
+    }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#ifdef LV_HAVE_GENERIC
+#include
+#include
+
+/*!
+ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation
+
+ Generic (plain C) implementation. All five outputs are reset to 0 and then
+ accumulated sample by sample. Before a code sample is used, a real or
+ imaginary part equal to -128 is replaced by -127.
+ \param VE_out Very Early correlation output
+ \param E_out Early correlation output
+ \param P_out Prompt correlation output
+ \param L_out Late correlation output
+ \param VL_out Very Late correlation output
+ \param input The input signal input
+ \param carrier The carrier signal input
+ \param VE_code Very Early PRN code replica input
+ \param E_code Early PRN code replica input
+ \param P_code Prompt PRN code replica input
+ \param L_code Late PRN code replica input
+ \param VL_code Very Late PRN code replica input
+ \param num_points The number of complex values in vectors
+ */
+static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_a_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
+{
+    *VE_out = 0;
+    *E_out = 0;
+    *P_out = 0;
+    *L_out = 0;
+    *VL_out = 0;
+
+    lv_16sc_t VE_code_value;
+    lv_16sc_t E_code_value;
+    lv_16sc_t P_code_value;
+    lv_16sc_t L_code_value;
+    lv_16sc_t VL_code_value;
+    lv_16sc_t bb_signal_sample;
+
+    // The index is unsigned so that it has the same type as num_points
+    for(unsigned int i = 0; i < num_points; ++i)
+    {
+        VE_code_value = VE_code[i];
+        E_code_value = E_code[i];
+        P_code_value = P_code[i];
+        L_code_value = L_code[i];
+        VL_code_value = VL_code[i];
+
+        // Replace every -128 component of the code samples by -127
+        if(lv_creal(VE_code_value) == -128)
+        {
+            VE_code_value = lv_cmake(-127, lv_cimag(VE_code_value));
+        }
+        if(lv_cimag(VE_code_value) == -128)
+        {
+            VE_code_value = lv_cmake(lv_creal(VE_code_value), -127);
+        }
+
+        if(lv_creal(E_code_value) == -128)
+        {
+            E_code_value = lv_cmake(-127, lv_cimag(E_code_value));
+        }
+        if(lv_cimag(E_code_value) == -128)
+        {
+            E_code_value = lv_cmake(lv_creal(E_code_value), -127);
+        }
+
+        if(lv_creal(P_code_value) == -128)
+        {
+            P_code_value = lv_cmake(-127, lv_cimag(P_code_value));
+        }
+        if(lv_cimag(P_code_value) == -128)
+        {
+            P_code_value = lv_cmake(lv_creal(P_code_value), -127);
+        }
+
+        if(lv_creal(L_code_value) == -128)
+        {
+            L_code_value = lv_cmake(-127, lv_cimag(L_code_value));
+        }
+        if(lv_cimag(L_code_value) == -128)
+        {
+            L_code_value = lv_cmake(lv_creal(L_code_value), -127);
+        }
+
+        if(lv_creal(VL_code_value) == -128)
+        {
+            VL_code_value = lv_cmake(-127, lv_cimag(VL_code_value));
+        }
+        if(lv_cimag(VL_code_value) == -128)
+        {
+            VL_code_value = lv_cmake(lv_creal(VL_code_value), -127);
+        }
+
+        //Perform the carrier wipe-off
+        bb_signal_sample = input[i] * carrier[i];
+        // Now get very early, early, prompt, late and very late values for each
+        *VE_out += (lv_32fc_t) (bb_signal_sample * VE_code_value);
+        *E_out += (lv_32fc_t) (bb_signal_sample * E_code_value);
+        *P_out += (lv_32fc_t) (bb_signal_sample * P_code_value);
+        *L_out += (lv_32fc_t) (bb_signal_sample * L_code_value);
+        *VL_out += (lv_32fc_t) (bb_signal_sample * VL_code_value);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_a_H */
\ No newline at end of file
diff --git a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5.h b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5.h
new file mode 100644
index 000000000..a1cbd0cb3
--- /dev/null
+++ b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5.h
@@ -0,0 +1,554 @@
+/*!
+ * \file volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5.h
+ * \brief Volk protokernel: performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation with 16 bits vectors, and accumulates the results into float32. This protokernel is called "unsafe" because it does NOT check when the inputs have a -128 value. If you introduce a -128 value the protokernel will NOT operate properly (generic implementation will have different results than volk implementation). In order to avoid overflow, "input" and "carrier" must be values between -7 and 7 and the "XX_code" inputs must be values between -127 and 127.
+ * \authors <ul>
+ *          <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+ *          </ul>
+ *
+ * Volk protokernel that performs the carrier wipe-off mixing and the
+ * Very Early, Early, Prompt, Late and Very Late correlation with 16 bits vectors (8 bits the
+ * real part and 8 bits the imaginary part), and accumulates the result
+ * in 32-bit single-precision floating-point values, returning float32 values:
+ * - The carrier wipe-off is done by multiplying the input signal by the
+ * carrier (multiplication of 16 bits vectors). It returns the input
+ * signal in base band (BB)
+ * - Very Early values are calculated by multiplying the input signal in BB by the
+ * very early code (multiplication of 16 bits vectors), accumulating the results into float32 values
+ * - Early values are calculated by multiplying the input signal in BB by the
+ * early code (multiplication of 16 bits vectors), accumulating the results into float32 values
+ * - Prompt values are calculated by multiplying the input signal in BB by the
+ * prompt code (multiplication of 16 bits vectors), accumulating the results into float32 values
+ * - Late values are calculated by multiplying the input signal in BB by the
+ * late code (multiplication of 16 bits vectors), accumulating the results into float32 values
+ * - Very Late values are calculated by multiplying the input signal in BB by the
+ * very late code (multiplication of 16 bits vectors), accumulating the results into float32 values
+ *
+ * -------------------------------------------------------------------------
+ * Bits analysis
+ *
+ * input = 8 bits
+ * carrier = 8 bits
+ * XX_code = 8 bits
+ * XX_out16 = 16 bits
+ * bb_signal_sample = 8 bits
+ *
+ * bb_signal_sample = input*carrier -> 17 bits limited to 8 bits = input and carrier must be values between -7 and 7 to avoid overflow (3 bits)
+ *
+ * XX_out16 = XX_code*bb_signal_sample -> 17 bits limited to 16 bits = XX_code must be values between -127 and 127 to avoid overflow (7 bits)
+ * -------------------------------------------------------------------------
+ *
+ * Copyright (C) 2010-2014
(see AUTHORS file for a list of contributors)
+ *
+ * GNSS-SDR is a software defined Global Navigation
+ * Satellite Systems receiver
+ *
+ * This file is part of GNSS-SDR.
+ *
+ * GNSS-SDR is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * GNSS-SDR is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * -------------------------------------------------------------------------
+ */
+
+#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_u_H
+#define INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_u_H
+
+#include
+#include
+#include
+#include
+#include
+
+#ifdef LV_HAVE_SSE4_1
+#include "smmintrin.h"
+#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
+#include "CommonMacros/CommonMacros.h"
+/*!
+ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation (SSE4.1, unaligned loads)
+
+ Eight complex samples are processed per SSE iteration and the remaining
+ num_points % 8 samples are processed one at a time. The five outputs are
+ always overwritten, also when num_points is smaller than 8.
+ This is the "unsafe" variant: the inputs are not checked for -128 values.
+ \param VE_out Very Early correlation output
+ \param E_out Early correlation output
+ \param P_out Prompt correlation output
+ \param L_out Late correlation output
+ \param VL_out Very Late correlation output
+ \param input The input signal input
+ \param carrier The carrier signal input
+ \param VE_code Very Early PRN code replica input
+ \param E_code Early PRN code replica input
+ \param P_code Prompt PRN code replica input
+ \param L_code Late PRN code replica input
+ \param VL_code Very Late PRN code replica input
+ \param num_points The number of complex values in vectors
+ */
+static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_u_sse4_1(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
+{
+    const unsigned int sse_iters = num_points / 8;
+
+    __m128i x, x_abs, y, y_aux, bb_signal_sample_aux, bb_signal_sample_aux_abs;
+    __m128i real_output, imag_output;
+    __m128 real_VE_code_acc, imag_VE_code_acc, real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc, real_VL_code_acc, imag_VL_code_acc;
+    __m128i input_i_1, input_i_2, output_i32;
+    __m128 real_output_ps, imag_output_ps;
+
+    // _mm_set_epi8 takes char arguments: -1 is the byte 0xFF without relying
+    // on an out-of-range int to char conversion (as the literal 255 does)
+    __m128i check_sign_sequence = _mm_set_epi8(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
+    __m128i rearrange_sequence = _mm_set_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
+    __m128i mult1 = _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1);
+
+    const lv_8sc_t* input_ptr = input;
+    const lv_8sc_t* carrier_ptr = carrier;
+    const lv_8sc_t* VE_code_ptr = VE_code;
+    const lv_8sc_t* E_code_ptr = E_code;
+    const lv_8sc_t* P_code_ptr = P_code;
+    const lv_8sc_t* L_code_ptr = L_code;
+    const lv_8sc_t* VL_code_ptr = VL_code;
+
+    float VE_out_real = 0;
+    float VE_out_imag = 0;
+    float E_out_real = 0;
+    float E_out_imag = 0;
+    float P_out_real = 0;
+    float P_out_imag = 0;
+    float L_out_real = 0;
+    float L_out_imag = 0;
+    float VL_out_real = 0;
+    float VL_out_imag = 0;
+
+    real_VE_code_acc = _mm_setzero_ps();
+    imag_VE_code_acc = _mm_setzero_ps();
+    real_E_code_acc = _mm_setzero_ps();
+    imag_E_code_acc = _mm_setzero_ps();
+    real_P_code_acc = _mm_setzero_ps();
+    imag_P_code_acc = _mm_setzero_ps();
+    real_L_code_acc = _mm_setzero_ps();
+    imag_L_code_acc = _mm_setzero_ps();
+    real_VL_code_acc = _mm_setzero_ps();
+    imag_VL_code_acc = _mm_setzero_ps();
+
+    // Each iteration loads 16 bytes, i.e. 8 complex samples of 8+8 bits
+    for(unsigned int number = 0; number < sse_iters; number++)
+    {
+        //Perform the carrier wipe-off
+        x = _mm_lddqu_si128((const __m128i*)input_ptr);
+        y = _mm_lddqu_si128((const __m128i*)carrier_ptr);
+
+        x_abs = _mm_abs_epi8 (x);
+
+        CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3(y, x, check_sign_sequence, rearrange_sequence, y_aux, x_abs, real_output, imag_output)
+
+        // Even-indexed bytes are taken from real_output and odd-indexed bytes
+        // from imag_output shifted left by one byte
+        imag_output = _mm_slli_si128 (imag_output, 1);
+        bb_signal_sample_aux = _mm_blendv_epi8 (imag_output, real_output, mult1);
+        bb_signal_sample_aux_abs = _mm_abs_epi8 (bb_signal_sample_aux);
+
+        //Get very early values
+        y = _mm_lddqu_si128((const __m128i*)VE_code_ptr);
+
+        CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+
+        real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps);
+        imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps);
+
+        //Get early values
+        y = _mm_lddqu_si128((const __m128i*)E_code_ptr);
+
+        CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+
+        real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps);
+        imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps);
+
+        //Get prompt values
+        y = _mm_lddqu_si128((const __m128i*)P_code_ptr);
+
+        CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+
+        real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps);
+        imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps);
+
+        //Get late values
+        y = _mm_lddqu_si128((const __m128i*)L_code_ptr);
+
+        CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+
+        real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps);
+        imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps);
+
+        //Get very late values
+        y = _mm_lddqu_si128((const __m128i*)VL_code_ptr);
+
+        CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+
+        real_VL_code_acc = _mm_add_ps (real_VL_code_acc, real_output_ps);
+        imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps);
+
+        input_ptr += 8;
+        carrier_ptr += 8;
+        VE_code_ptr += 8;
+        E_code_ptr += 8;
+        P_code_ptr += 8;
+        L_code_ptr += 8;
+        VL_code_ptr += 8;
+    }
+
+    // The reduction below is done unconditionally: with sse_iters == 0 the
+    // accumulators are still zero, so the outputs are set to 0 before the
+    // scalar tail adds to them (they were left uninitialised before)
+    __VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4];
+    __VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4];
+    __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
+    __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
+    __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
+    __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
+    __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
+    __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
+    __VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4];
+    __VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4];
+
+    // Store the results back into the dot product vectors
+    _mm_storeu_ps((float*)real_VE_dotProductVector,real_VE_code_acc);
+    _mm_storeu_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc);
+    _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc);
+    _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc);
+    _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc);
+    _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc);
+    _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc);
+    _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc);
+    _mm_storeu_ps((float*)real_VL_dotProductVector,real_VL_code_acc);
+    _mm_storeu_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc);
+
+    // Horizontal sum of the four float lanes of every accumulator
+    for (int i = 0; i<4; ++i)
+    {
+        VE_out_real += real_VE_dotProductVector[i];
+        VE_out_imag += imag_VE_dotProductVector[i];
+        E_out_real += real_E_dotProductVector[i];
+        E_out_imag += imag_E_dotProductVector[i];
+        P_out_real += real_P_dotProductVector[i];
+        P_out_imag += imag_P_dotProductVector[i];
+        L_out_real += real_L_dotProductVector[i];
+        L_out_imag += imag_L_dotProductVector[i];
+        VL_out_real += real_VL_dotProductVector[i];
+        VL_out_imag += imag_VL_dotProductVector[i];
+    }
+    *VE_out = lv_cmake(VE_out_real, VE_out_imag);
+    *E_out = lv_cmake(E_out_real, E_out_imag);
+    *P_out = lv_cmake(P_out_real, P_out_imag);
+    *L_out = lv_cmake(L_out_real, L_out_imag);
+    *VL_out = lv_cmake(VL_out_real, VL_out_imag);
+
+    // Scalar tail for the samples that do not fill a whole SSE iteration
+    lv_16sc_t bb_signal_sample;
+    for(unsigned int i = 0; i < num_points % 8; ++i)
+    {
+        //Perform the carrier wipe-off
+        bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
+        // Now get very early, early, prompt, late and very late values for each
+        *VE_out += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VE_code_ptr++));
+        *E_out += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*E_code_ptr++));
+        *P_out += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*P_code_ptr++));
+        *L_out += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*L_code_ptr++));
+        *VL_out += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VL_code_ptr++));
+    }
+}
+#endif /* LV_HAVE_SSE4_1 */
+
+#ifdef LV_HAVE_GENERIC
+#include
+#include
+
+/*!
+ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation
+
+ Generic (plain C) implementation. All five outputs are reset to 0 and then
+ accumulated sample by sample. This is the "unsafe" variant: the inputs are
+ used as they are, without any check for -128 values.
+ \param VE_out Very Early correlation output
+ \param E_out Early correlation output
+ \param P_out Prompt correlation output
+ \param L_out Late correlation output
+ \param VL_out Very Late correlation output
+ \param input The input signal input
+ \param carrier The carrier signal input
+ \param VE_code Very Early PRN code replica input
+ \param E_code Early PRN code replica input
+ \param P_code Prompt PRN code replica input
+ \param L_code Late PRN code replica input
+ \param VL_code Very Late PRN code replica input
+ \param num_points The number of complex values in vectors
+ */
+static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
+{
+    *VE_out = 0;
+    *E_out = 0;
+    *P_out = 0;
+    *L_out = 0;
+    *VL_out = 0;
+
+    lv_16sc_t bb_signal_sample;
+
+    // The index is unsigned so that it has the same type as num_points
+    for(unsigned int i = 0; i < num_points; ++i)
+    {
+        //Perform the carrier wipe-off
+        bb_signal_sample = input[i] * carrier[i];
+        // Now get very early, early, prompt, late and very late values for each
+        *VE_out += (lv_32fc_t) (bb_signal_sample * VE_code[i]);
+        *E_out += (lv_32fc_t) (bb_signal_sample * E_code[i]);
+        *P_out += (lv_32fc_t) (bb_signal_sample * P_code[i]);
+        *L_out += (lv_32fc_t) (bb_signal_sample * L_code[i]);
+        *VL_out += (lv_32fc_t) (bb_signal_sample * VL_code[i]);
+    }
+}
+#endif /* LV_HAVE_GENERIC */
+#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_u_H */
+
+
+#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_a_H
+#define INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_a_H
+
+#include
+#include
+#include
+#include
+#include
+
+#ifdef LV_HAVE_SSE4_1
+#include
"smmintrin.h" +#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h" +#include "CommonMacros/CommonMacros.h" +/*! + \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation + \param input The input signal input + \param carrier The carrier signal input + \param VE_code Very Early PRN code replica input + \param E_code Early PRN code replica input + \param P_code Prompt PRN code replica input + \param L_code Late PRN code replica input + \param VL_code Very Late PRN code replica input + \param VE_out Very Early correlation output + \param E_out Early correlation output + \param P_out Prompt correlation output + \param L_out Late correlation output + \param VL_out Very Late correlation output + \param num_points The number of complex values in vectors + */ +static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_a_sse4_1(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points) +{ + const unsigned int sse_iters = num_points / 8; + + __m128i x, x_abs, y, y_aux, bb_signal_sample_aux, bb_signal_sample_aux_abs;; + __m128i real_output, imag_output; + __m128 real_VE_code_acc, imag_VE_code_acc, real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc, real_VL_code_acc, imag_VL_code_acc; + __m128i input_i_1, input_i_2, output_i32; + __m128 real_output_ps, imag_output_ps; + + __m128i check_sign_sequence = _mm_set_epi8 (255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1); + __m128i rearrange_sequence = _mm_set_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1); + __m128i mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); + + const lv_8sc_t* input_ptr = input; + const lv_8sc_t* carrier_ptr = carrier; + + 
const lv_8sc_t* VE_code_ptr = VE_code; + lv_32fc_t* VE_out_ptr = VE_out; + const lv_8sc_t* E_code_ptr = E_code; + lv_32fc_t* E_out_ptr = E_out; + const lv_8sc_t* P_code_ptr = P_code; + lv_32fc_t* P_out_ptr = P_out; + const lv_8sc_t* L_code_ptr = L_code; + lv_32fc_t* L_out_ptr = L_out; + const lv_8sc_t* VL_code_ptr = VL_code; + lv_32fc_t* VL_out_ptr = VL_out; + + float VE_out_real = 0; + float VE_out_imag = 0; + float E_out_real = 0; + float E_out_imag = 0; + float P_out_real = 0; + float P_out_imag = 0; + float L_out_real = 0; + float L_out_imag = 0; + float VL_out_real = 0; + float VL_out_imag = 0; + + real_VE_code_acc = _mm_setzero_ps(); + imag_VE_code_acc = _mm_setzero_ps(); + real_E_code_acc = _mm_setzero_ps(); + imag_E_code_acc = _mm_setzero_ps(); + real_P_code_acc = _mm_setzero_ps(); + imag_P_code_acc = _mm_setzero_ps(); + real_L_code_acc = _mm_setzero_ps(); + imag_L_code_acc = _mm_setzero_ps(); + real_VL_code_acc = _mm_setzero_ps(); + imag_VL_code_acc = _mm_setzero_ps(); + + if (sse_iters>0) + { + for(int number = 0;number < sse_iters; number++){ + + //Perform the carrier wipe-off + x = _mm_load_si128((__m128i*)input_ptr); + y = _mm_load_si128((__m128i*)carrier_ptr); + + x_abs = _mm_abs_epi8 (x); + + CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3(y, x, check_sign_sequence, rearrange_sequence, y_aux, x_abs, real_output, imag_output) + + imag_output = _mm_slli_si128 (imag_output, 1); + bb_signal_sample_aux = _mm_blendv_epi8 (imag_output, real_output, mult1); + bb_signal_sample_aux_abs = _mm_abs_epi8 (bb_signal_sample_aux); + + //Get very early values + y = _mm_load_si128((__m128i*)VE_code_ptr); + + CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) + + real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps); + imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps); + + 
//Get early values + y = _mm_load_si128((__m128i*)E_code_ptr); + + CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) + + real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps); + imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps); + + //Get prompt values + y = _mm_load_si128((__m128i*)P_code_ptr); + + CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) + + real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps); + imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps); + + //Get late values + y = _mm_load_si128((__m128i*)L_code_ptr); + + CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) + + real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps); + imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps); + + //Get very late values + y = _mm_load_si128((__m128i*)VL_code_ptr); + + CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) + + real_VL_code_acc = _mm_add_ps (real_VL_code_acc, real_output_ps); + imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps); + + input_ptr += 8; + carrier_ptr += 8; + VE_code_ptr += 8; + E_code_ptr += 8; + P_code_ptr += 8; + L_code_ptr += 8; + VL_code_ptr += 8; + } + + __VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float 
imag_VE_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4]; + + _mm_store_ps((float*)real_VE_dotProductVector,real_VE_code_acc); // Store the results back into the dot product vector + _mm_store_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc); // Store the results back into the dot product vector + _mm_store_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector + _mm_store_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector + _mm_store_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector + _mm_store_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector + _mm_store_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector + _mm_store_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector + _mm_store_ps((float*)real_VL_dotProductVector,real_VL_code_acc); // Store the results back into the dot product vector + _mm_store_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc); // Store the results back into the dot product vector + + for (int i = 0; i<4; ++i) + { + VE_out_real += real_VE_dotProductVector[i]; + VE_out_imag += imag_VE_dotProductVector[i]; + E_out_real += real_E_dotProductVector[i]; + E_out_imag += imag_E_dotProductVector[i]; + P_out_real += real_P_dotProductVector[i]; + P_out_imag += 
imag_P_dotProductVector[i]; + L_out_real += real_L_dotProductVector[i]; + L_out_imag += imag_L_dotProductVector[i]; + VL_out_real += real_VL_dotProductVector[i]; + VL_out_imag += imag_VL_dotProductVector[i]; + } + *VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag); + *E_out_ptr = lv_cmake(E_out_real, E_out_imag); + *P_out_ptr = lv_cmake(P_out_real, P_out_imag); + *L_out_ptr = lv_cmake(L_out_real, L_out_imag); + *VL_out_ptr = lv_cmake(VL_out_real, VL_out_imag); + } + + lv_16sc_t bb_signal_sample; + for(int i=0; i < num_points%8; ++i) + { + //Perform the carrier wipe-off + bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); + // Now get very early, early, prompt, late and very late values for each + *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VE_code_ptr++)); + *E_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*E_code_ptr++)); + *P_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*P_code_ptr++)); + *L_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*L_code_ptr++)); + *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VL_code_ptr++)); + } +} +#endif /* LV_HAVE_SSE4_1 */ + +#ifdef LV_HAVE_GENERIC +#include +#include + +/*! 
+ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation + \param input The input signal input + \param carrier The carrier signal input + \param VE_code Very Early PRN code replica input + \param E_code Early PRN code replica input + \param P_code Prompt PRN code replica input + \param L_code Late PRN code replica input + \param VL_code Very Late PRN code replica input + \param VE_out Very Early correlation output + \param E_out Early correlation output + \param P_out Prompt correlation output + \param L_out Late correlation output + \param VL_out Very Late correlation output + \param num_points The number of complex values in vectors + */ +static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_a_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points) +{ + *VE_out = 0; + *E_out = 0; + *P_out = 0; + *L_out = 0; + *VL_out = 0; + + lv_16sc_t bb_signal_sample; + + for(int i=0; i < num_points; ++i) + { + //Perform the carrier wipe-off + bb_signal_sample = input[i] * carrier[i]; + // Now get very early, early, prompt, late and very late values for each + *VE_out += (lv_32fc_t) (bb_signal_sample * VE_code[i]); + *E_out += (lv_32fc_t) (bb_signal_sample * E_code[i]); + *P_out += (lv_32fc_t) (bb_signal_sample * P_code[i]); + *L_out += (lv_32fc_t) (bb_signal_sample * L_code[i]); + *VL_out += (lv_32fc_t) (bb_signal_sample * VL_code[i]); + } +} +#endif /* LV_HAVE_GENERIC */ +#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_a_H */ \ No newline at end of file diff --git a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8u_x2_multiply_8u.h b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8u_x2_multiply_8u.h new file mode 100644 index 
000000000..9bb7c94e3 --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8u_x2_multiply_8u.h @@ -0,0 +1,210 @@ +/*! + * \file volk_gnsssdr_8u_x2_multiply_8u.h + * \brief Volk protokernel: multiplies unsigned char values + * \authors
    + *
  • Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com + *
+ * + * Volk protokernel that multiplies unsigned char values (8-bit data) + * + * ------------------------------------------------------------------------- + * + * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) + * + * GNSS-SDR is a software defined Global Navigation + * Satellite Systems receiver + * + * This file is part of GNSS-SDR. + * + * GNSS-SDR is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * GNSS-SDR is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>. + * + * ------------------------------------------------------------------------- + */ + +#ifndef INCLUDED_volk_gnsssdr_8u_x2_multiply_8u_u_H +#define INCLUDED_volk_gnsssdr_8u_x2_multiply_8u_u_H + +#include +#include + +#ifdef LV_HAVE_SSE3 +#include +#include +/*! 
+ \brief Multiplies the two input unsigned char values and stores their results in the third unisgned char + \param cChar The unsigned char where the results will be stored + \param aChar One of the unsigned char to be multiplied + \param bChar One of the unsigned char to be multiplied + \param num_points The number of unsigned char values in aChar and bChar to be multiplied together and stored into cChar + */ +static inline void volk_gnsssdr_8u_x2_multiply_8u_u_sse3(unsigned char* cChar, const unsigned char* aChar, const unsigned char* bChar, unsigned int num_points){ + + const unsigned int sse_iters = num_points / 16; + + __m128i x, y, x1, x2, y1, y2, mult1, x1_mult_y1, x2_mult_y2, tmp, tmp1, tmp2, totalc; + unsigned char* c = cChar; + const unsigned char* a = aChar; + const unsigned char* b = bChar; + + for(int number = 0;number < sse_iters; number++){ + x = _mm_lddqu_si128((__m128i*)a); + y = _mm_lddqu_si128((__m128i*)b); + + mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); + x1 = _mm_srli_si128 (x, 1); + x1 = _mm_and_si128 (x1, mult1); + x2 = _mm_and_si128 (x, mult1); + + y1 = _mm_srli_si128 (y, 1); + y1 = _mm_and_si128 (y1, mult1); + y2 = _mm_and_si128 (y, mult1); + + x1_mult_y1 = _mm_mullo_epi16 (x1, y1); + x2_mult_y2 = _mm_mullo_epi16 (x2, y2); + + tmp = _mm_and_si128 (x1_mult_y1, mult1); + tmp1 = _mm_slli_si128 (tmp, 1); + tmp2 = _mm_and_si128 (x2_mult_y2, mult1); + totalc = _mm_or_si128 (tmp1, tmp2); + + _mm_storeu_si128((__m128i*)c, totalc); + + a += 16; + b += 16; + c += 16; + } + + for (int i = 0; i<(num_points % 16); ++i) + { + *c++ = (*a++) * (*b++); + } +} +#endif /* LV_HAVE_SSE3 */ + +#ifdef LV_HAVE_GENERIC +/*! 
+ \brief Multiplies the two input unsigned char values and stores their results in the third unisgned char + \param cChar The unsigned char where the results will be stored + \param aChar One of the unsigned char to be multiplied + \param bChar One of the unsigned char to be multiplied + \param num_points The number of unsigned char values in aChar and bChar to be multiplied together and stored into cChar + */ +static inline void volk_gnsssdr_8u_x2_multiply_8u_generic(unsigned char* cChar, const unsigned char* aChar, const unsigned char* bChar, unsigned int num_points){ + unsigned char* cPtr = cChar; + const unsigned char* aPtr = aChar; + const unsigned char* bPtr = bChar; + + for(int number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) * (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + +#endif /* INCLUDED_volk_gnsssdr_8u_x2_multiply_8u_u_H */ + + +#ifndef INCLUDED_volk_gnsssdr_8u_x2_multiply_8u_a_H +#define INCLUDED_volk_gnsssdr_8u_x2_multiply_8u_a_H + +#include +#include + +#ifdef LV_HAVE_SSE3 +#include +#include +/*! 
+ \brief Multiplies the two input unsigned char values and stores their results in the third unisgned char + \param cChar The unsigned char where the results will be stored + \param aChar One of the unsigned char to be multiplied + \param bChar One of the unsigned char to be multiplied + \param num_points The number of unsigned char values in aChar and bChar to be multiplied together and stored into cChar + */ +static inline void volk_gnsssdr_8u_x2_multiply_8u_a_sse3(unsigned char* cChar, const unsigned char* aChar, const unsigned char* bChar, unsigned int num_points){ + + const unsigned int sse_iters = num_points / 16; + + __m128i x, y, x1, x2, y1, y2, mult1, x1_mult_y1, x2_mult_y2, tmp, tmp1, tmp2, totalc; + unsigned char* c = cChar; + const unsigned char* a = aChar; + const unsigned char* b = bChar; + + for(int number = 0;number < sse_iters; number++){ + x = _mm_load_si128((__m128i*)a); + y = _mm_load_si128((__m128i*)b); + + mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); + x1 = _mm_srli_si128 (x, 1); + x1 = _mm_and_si128 (x1, mult1); + x2 = _mm_and_si128 (x, mult1); + + y1 = _mm_srli_si128 (y, 1); + y1 = _mm_and_si128 (y1, mult1); + y2 = _mm_and_si128 (y, mult1); + + x1_mult_y1 = _mm_mullo_epi16 (x1, y1); + x2_mult_y2 = _mm_mullo_epi16 (x2, y2); + + tmp = _mm_and_si128 (x1_mult_y1, mult1); + tmp1 = _mm_slli_si128 (tmp, 1); + tmp2 = _mm_and_si128 (x2_mult_y2, mult1); + totalc = _mm_or_si128 (tmp1, tmp2); + + _mm_store_si128((__m128i*)c, totalc); + + a += 16; + b += 16; + c += 16; + } + + for (int i = 0; i<(num_points % 16); ++i) + { + *c++ = (*a++) * (*b++); + } +} +#endif /* LV_HAVE_SSE */ + +#ifdef LV_HAVE_GENERIC +/*! 
+ \brief Multiplies the two input unsigned char values and stores their results in the third unisgned char + \param cChar The unsigned char where the results will be stored + \param aChar One of the unsigned char to be multiplied + \param bChar One of the unsigned char to be multiplied + \param num_points The number of unsigned char values in aChar and bChar to be multiplied together and stored into cChar + */ +static inline void volk_gnsssdr_8u_x2_multiply_8u_a_generic(unsigned char* cChar, const unsigned char* aChar, const unsigned char* bChar, unsigned int num_points){ + unsigned char* cPtr = cChar; + const unsigned char* aPtr = aChar; + const unsigned char* bPtr = bChar; + + for(int number = 0; number < num_points; number++){ + *cPtr++ = (*aPtr++) * (*bPtr++); + } +} +#endif /* LV_HAVE_GENERIC */ + +#ifdef LV_HAVE_ORC +/*! + \brief Multiplies the two input unsigned char values and stores their results in the third unisgned char + \param cChar The unsigned char where the results will be stored + \param aChar One of the unsigned char to be multiplied + \param bChar One of the unsigned char to be multiplied + \param num_points The number of unsigned char values in aChar and bChar to be multiplied together and stored into cChar + */ +extern void volk_gnsssdr_8u_x2_multiply_8u_a_orc_impl(unsigned char* cVector, const unsigned char* aVector, const unsigned char* bVector, unsigned int num_points); +static inline void volk_gnsssdr_8u_x2_multiply_8u_u_orc(unsigned char* cVector, const unsigned char* aVector, const unsigned char* bVector, unsigned int num_points){ + volk_gnsssdr_8u_x2_multiply_8u_a_orc_impl(cVector, aVector, bVector, num_points); +} +#endif /* LV_HAVE_ORC */ + +#endif /* INCLUDED_volk_gnsssdr_8u_x2_multiply_8u_a_H */ diff --git a/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_s32f_x2_update_local_carrier_32fc.h b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_s32f_x2_update_local_carrier_32fc.h new file mode 100644 
index 000000000..756d2b544 --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_s32f_x2_update_local_carrier_32fc.h @@ -0,0 +1,866 @@ +/*! + * \file volk_gnsssdr_s32f_x2_update_local_carrier_32fc.h + * \brief Volk protokernel: replaces the tracking function for update_local_carrier. Algorithm by Julien Pommier and Giovanni Garberoglio, modified by Andrés Cecilia. + * \authors
    + *
  • Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com + *
+ * + * Volk protokernel that replaces the tracking function for update_local_carrier. Algorithm by Julien Pommier and Giovanni Garberoglio, modified by Andrés Cecilia. + * + * ------------------------------------------------------------------------- + * + * Copyright (C) 2007 Julien Pommier + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + * + *(this is the zlib license) + * + * ------------------------------------------------------------------------- + * + * Copyright (C) 2012 Giovanni Garberoglio + * Interdisciplinary Laboratory for Computational Science (LISC) + * Fondazione Bruno Kessler and University of Trento + * via Sommarive, 18 + * I-38123 Trento (Italy) + * + * ------------------------------------------------------------------------- + * + * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) + * + * GNSS-SDR is a software defined Global Navigation + * Satellite Systems receiver + * + * This file is part of GNSS-SDR. 
+ * + * GNSS-SDR is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * at your option) any later version. + * + * GNSS-SDR is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNSS-SDR. If not, see . + * + * ------------------------------------------------------------------------- + */ + +#ifndef INCLUDED_volk_gnsssdr_32fc_s32f_x2_update_local_carrier_32fc_u_H +#define INCLUDED_volk_gnsssdr_32fc_s32f_x2_update_local_carrier_32fc_u_H + +#include +#include +#include + +#ifdef LV_HAVE_AVX +#include +/*! + \brief Accumulates the values in the input buffer + \param result The accumulated result + \param inputBuffer The buffer of data to be accumulated + \param num_points The number of values in inputBuffer to be accumulated + */ +static inline void volk_gnsssdr_s32f_x2_update_local_carrier_32fc_u_avx(lv_32fc_t* d_carr_sign, const float phase_rad_init, const float phase_step_rad, unsigned int num_points){ + +// float* pointer1 = (float*)&phase_rad_init; +// *pointer1 = 0; +// float* pointer2 = (float*)&phase_step_rad; +// *pointer2 = 0.5; + + const unsigned int sse_iters = num_points / 8; + + __m256 _ps256_minus_cephes_DP1 = _mm256_set1_ps(-0.78515625f); + __m256 _ps256_minus_cephes_DP2 = _mm256_set1_ps(-2.4187564849853515625e-4f); + __m256 _ps256_minus_cephes_DP3 = _mm256_set1_ps(-3.77489497744594108e-8f); + __m256 _ps256_sign_mask = _mm256_set1_ps(-0.f); + __m128i _pi32avx_1 = _mm_set1_epi32(1); + __m128i _pi32avx_inv1 = _mm_set1_epi32(~1); + __m128i _pi32avx_2 = _mm_set1_epi32(2); + __m128i _pi32avx_4 = _mm_set1_epi32(4); + __m256 _ps256_cephes_FOPI = 
_mm256_set1_ps(1.27323954473516f); // 4 / PI + __m256 _ps256_sincof_p0 = _mm256_set1_ps(-1.9515295891E-4f); + __m256 _ps256_sincof_p1 = _mm256_set1_ps( 8.3321608736E-3f); + __m256 _ps256_sincof_p2 = _mm256_set1_ps(-1.6666654611E-1f); + __m256 _ps256_coscof_p0 = _mm256_set1_ps( 2.443315711809948E-005f); + __m256 _ps256_coscof_p1 = _mm256_set1_ps(-1.388731625493765E-003f); + __m256 _ps256_coscof_p2 = _mm256_set1_ps( 4.166664568298827E-002f); + __m256 _ps256_1 = _mm256_set1_ps(1.f); + __m256 _ps256_0p5 = _mm256_set1_ps(0.5f); + + __m256 phase_step_rad_array = _mm256_set1_ps(8*phase_step_rad); + + __m256 phase_rad_array, x, s, c, swap_sign_bit_sin, sign_bit_cos, poly_mask, z, tmp, y, y2, ysin1, ysin2; + __m256 xmm1, xmm2, xmm3, sign_bit_sin; + __m256i imm0, imm2, imm4; + __m128i imm0_1, imm0_2, imm2_1, imm2_2, imm4_1, imm4_2; + __VOLK_ATTR_ALIGNED(32) float sin_value[8]; + __VOLK_ATTR_ALIGNED(32) float cos_value[8]; + + phase_rad_array = _mm256_set_ps (phase_rad_init+7*phase_step_rad, phase_rad_init+6*phase_step_rad, phase_rad_init+5*phase_step_rad, phase_rad_init+4*phase_step_rad, phase_rad_init+3*phase_step_rad, phase_rad_init+2*phase_step_rad, phase_rad_init+phase_step_rad, phase_rad_init); + + for(int i = 0; i < sse_iters; i++) + { + + x = phase_rad_array; + + /* extract the sign bit (upper one) */ + sign_bit_sin = _mm256_and_ps(x, _ps256_sign_mask); + + /* take the absolute value */ + x = _mm256_xor_ps(x, sign_bit_sin); + + /* scale by 4/Pi */ + y = _mm256_mul_ps(x, _ps256_cephes_FOPI); + + /* we use SSE2 routines to perform the integer ops */ + + //COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y),imm2_1,imm2_2); + y = _mm256_cvttps_epi32(y); + imm2_1 = _mm256_extractf128_ps (y, 0); + imm2_2 = _mm256_extractf128_ps (y, 1); + + imm2_1 = _mm_add_epi32(imm2_1, _pi32avx_1); + imm2_2 = _mm_add_epi32(imm2_2, _pi32avx_1); + + imm2_1 = _mm_and_si128(imm2_1, _pi32avx_inv1); + imm2_2 = _mm_and_si128(imm2_2, _pi32avx_inv1); + + //COPY_XMM_TO_IMM(imm2_1,imm2_2,imm2); + 
//_mm256_set_m128i not defined in some versions of immintrin.h + //imm2 = _mm256_set_m128i (imm2_2, imm2_1); + imm2 = _mm256_insertf128_si256(_mm256_castsi128_si256(imm2_1),(imm2_2),1); + + y = _mm256_cvtepi32_ps(imm2); + + imm4_1 = imm2_1; + imm4_2 = imm2_2; + + imm0_1 = _mm_and_si128(imm2_1, _pi32avx_4); + imm0_2 = _mm_and_si128(imm2_2, _pi32avx_4); + + imm0_1 = _mm_slli_epi32(imm0_1, 29); + imm0_2 = _mm_slli_epi32(imm0_2, 29); + + //COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0); + //_mm256_set_m128i not defined in some versions of immintrin.h + //imm0 = _mm256_set_m128i (imm0_2, imm0_1); + imm0 = _mm256_insertf128_si256(_mm256_castsi128_si256(imm0_1),(imm0_2),1); + + imm2_1 = _mm_and_si128(imm2_1, _pi32avx_2); + imm2_2 = _mm_and_si128(imm2_2, _pi32avx_2); + + imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128()); + imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128()); + + //COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2); + //_mm256_set_m128i not defined in some versions of immintrin.h + //imm2 = _mm256_set_m128i (imm2_2, imm2_1); + imm2 = _mm256_insertf128_si256(_mm256_castsi128_si256(imm2_1),(imm2_2),1); + + swap_sign_bit_sin = _mm256_castsi256_ps(imm0); + poly_mask = _mm256_castsi256_ps(imm2); + + /* The magic pass: "Extended precision modular arithmetic" + x = ((x - y * DP1) - y * DP2) - y * DP3; */ + xmm1 = _ps256_minus_cephes_DP1; + xmm2 = _ps256_minus_cephes_DP2; + xmm3 = _ps256_minus_cephes_DP3; + xmm1 = _mm256_mul_ps(y, xmm1); + xmm2 = _mm256_mul_ps(y, xmm2); + xmm3 = _mm256_mul_ps(y, xmm3); + x = _mm256_add_ps(x, xmm1); + x = _mm256_add_ps(x, xmm2); + x = _mm256_add_ps(x, xmm3); + + imm4_1 = _mm_sub_epi32(imm4_1, _pi32avx_2); + imm4_2 = _mm_sub_epi32(imm4_2, _pi32avx_2); + + imm4_1 = _mm_andnot_si128(imm4_1, _pi32avx_4); + imm4_2 = _mm_andnot_si128(imm4_2, _pi32avx_4); + + imm4_1 = _mm_slli_epi32(imm4_1, 29); + imm4_2 = _mm_slli_epi32(imm4_2, 29); + + //COPY_XMM_TO_IMM(imm4_1, imm4_2, imm4); + //_mm256_set_m128i not defined in some versions of immintrin.h + //imm4 = 
_mm256_set_m128i (imm4_2, imm4_1); + imm4 = _mm256_insertf128_si256(_mm256_castsi128_si256(imm4_1),(imm4_2),1); + + sign_bit_cos = _mm256_castsi256_ps(imm4); + + sign_bit_sin = _mm256_xor_ps(sign_bit_sin, swap_sign_bit_sin); + + /* Evaluate the first polynom (0 <= x <= Pi/4) */ + z = _mm256_mul_ps(x,x); + y = _ps256_coscof_p0; + + y = _mm256_mul_ps(y, z); + y = _mm256_add_ps(y, _ps256_coscof_p1); + y = _mm256_mul_ps(y, z); + y = _mm256_add_ps(y, _ps256_coscof_p2); + y = _mm256_mul_ps(y, z); + y = _mm256_mul_ps(y, z); + tmp = _mm256_mul_ps(z, _ps256_0p5); + y = _mm256_sub_ps(y, tmp); + y = _mm256_add_ps(y, _ps256_1); + + /* Evaluate the second polynom (Pi/4 <= x <= 0) */ + + y2 = _ps256_sincof_p0; + y2 = _mm256_mul_ps(y2, z); + y2 = _mm256_add_ps(y2, _ps256_sincof_p1); + y2 = _mm256_mul_ps(y2, z); + y2 = _mm256_add_ps(y2, _ps256_sincof_p2); + y2 = _mm256_mul_ps(y2, z); + y2 = _mm256_mul_ps(y2, x); + y2 = _mm256_add_ps(y2, x); + + /* select the correct result from the two polynoms */ + xmm3 = poly_mask; + ysin2 = _mm256_and_ps(xmm3, y2); + ysin1 = _mm256_andnot_ps(xmm3, y); + y2 = _mm256_sub_ps(y2,ysin2); + y = _mm256_sub_ps(y, ysin1); + + xmm1 = _mm256_add_ps(ysin1,ysin2); + xmm2 = _mm256_add_ps(y,y2); + + /* update the sign */ + s = _mm256_xor_ps(xmm1, sign_bit_sin); + c = _mm256_xor_ps(xmm2, sign_bit_cos); + + //GNSS-SDR needs to return -sin + s = _mm256_xor_ps(s, _ps256_sign_mask); + + _mm256_storeu_ps ((float*)sin_value, s); + _mm256_storeu_ps ((float*)cos_value, c); + + for(int i = 0; i < 8; i++) + { + d_carr_sign[i] = lv_cmake(cos_value[i], sin_value[i]); + } + d_carr_sign += 8; + + phase_rad_array = _mm256_add_ps (phase_rad_array, phase_step_rad_array); + } + + if (num_points%8!=0) + { + __VOLK_ATTR_ALIGNED(32) float phase_rad_store[8]; + _mm256_storeu_si256 ((float*)phase_rad_store, phase_rad_array); + + float phase_rad = phase_rad_store[0]; + + for(int i = 0; i < num_points%8; i++) + { + *d_carr_sign = lv_cmake(cos(phase_rad), -sin(phase_rad)); + 
d_carr_sign++; + phase_rad += phase_step_rad; + } + } +} +#endif /* LV_HAVE_AVX */ + + +#ifdef LV_HAVE_SSE2 +#include +/*! + \brief Accumulates the values in the input buffer + \param result The accumulated result + \param inputBuffer The buffer of data to be accumulated + \param num_points The number of values in inputBuffer to be accumulated +*/ +static inline void volk_gnsssdr_s32f_x2_update_local_carrier_32fc_u_sse2(lv_32fc_t* d_carr_sign, const float phase_rad_init, const float phase_step_rad, unsigned int num_points){ + +// float* pointer1 = (float*)&phase_rad_init; +// *pointer1 = 0; +// float* pointer2 = (float*)&phase_step_rad; +// *pointer2 = 0.5; + + const unsigned int sse_iters = num_points / 4; + + __m128 _ps_minus_cephes_DP1 = _mm_set1_ps(-0.78515625f); + __m128 _ps_minus_cephes_DP2 = _mm_set1_ps(-2.4187564849853515625e-4f); + __m128 _ps_minus_cephes_DP3 = _mm_set1_ps(-3.77489497744594108e-8f); + __m128 _ps_sign_mask = _mm_set1_ps(-0.f); + __m128i _pi32_1 = _mm_set1_epi32(1); + __m128i _pi32_inv1 = _mm_set1_epi32(~1); + __m128i _pi32_2 = _mm_set1_epi32(2); + __m128i _pi32_4 = _mm_set1_epi32(4); + __m128 _ps_cephes_FOPI = _mm_set1_ps(1.27323954473516f); // 4 / PI + __m128 _ps_sincof_p0 = _mm_set1_ps(-1.9515295891E-4f); + __m128 _ps_sincof_p1 = _mm_set1_ps( 8.3321608736E-3f); + __m128 _ps_sincof_p2 = _mm_set1_ps(-1.6666654611E-1f); + __m128 _ps_coscof_p0 = _mm_set1_ps( 2.443315711809948E-005f); + __m128 _ps_coscof_p1 = _mm_set1_ps(-1.388731625493765E-003f); + __m128 _ps_coscof_p2 = _mm_set1_ps( 4.166664568298827E-002f); + __m128 _ps_1 = _mm_set1_ps(1.f); + __m128 _ps_0p5 = _mm_set1_ps(0.5f); + + __m128 phase_step_rad_array = _mm_set1_ps(4*phase_step_rad); + + __m128 phase_rad_array, x, s, c, swap_sign_bit_sin, sign_bit_cos, poly_mask, z, tmp, y, y2, ysin1, ysin2; + __m128 xmm1, xmm2, xmm3, sign_bit_sin; + __m128i emm0, emm2, emm4; + __VOLK_ATTR_ALIGNED(16) float sin_value[4]; + __VOLK_ATTR_ALIGNED(16) float cos_value[4]; + + phase_rad_array = _mm_set_ps 
(phase_rad_init+3*phase_step_rad, phase_rad_init+2*phase_step_rad, phase_rad_init+phase_step_rad, phase_rad_init); + + for(int i = 0; i < sse_iters; i++) + { + x = phase_rad_array; + + /* extract the sign bit (upper one) */ + sign_bit_sin = _mm_and_ps(x, _ps_sign_mask); + + /* take the absolute value */ + x = _mm_xor_ps(x, sign_bit_sin); + + /* scale by 4/Pi */ + y = _mm_mul_ps(x, _ps_cephes_FOPI); + + /* store the integer part of y in emm2 */ + emm2 = _mm_cvttps_epi32(y); + + /* j=(j+1) & (~1) (see the cephes sources) */ + emm2 = _mm_add_epi32(emm2, _pi32_1); + emm2 = _mm_and_si128(emm2, _pi32_inv1); + y = _mm_cvtepi32_ps(emm2); + + emm4 = emm2; + + /* get the swap sign flag for the sine */ + emm0 = _mm_and_si128(emm2, _pi32_4); + emm0 = _mm_slli_epi32(emm0, 29); + swap_sign_bit_sin = _mm_castsi128_ps(emm0); + + /* get the polynom selection mask for the sine*/ + emm2 = _mm_and_si128(emm2, _pi32_2); + emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128()); + poly_mask = _mm_castsi128_ps(emm2); + + /* The magic pass: "Extended precision modular arithmetic" + x = ((x - y * DP1) - y * DP2) - y * DP3; */ + xmm1 = _mm_mul_ps(y, _ps_minus_cephes_DP1); + xmm2 = _mm_mul_ps(y, _ps_minus_cephes_DP2); + xmm3 = _mm_mul_ps(y, _ps_minus_cephes_DP3); + x = _mm_add_ps(_mm_add_ps(x, xmm1), _mm_add_ps(xmm2, xmm3)); + + emm4 = _mm_sub_epi32(emm4, _pi32_2); + emm4 = _mm_andnot_si128(emm4, _pi32_4); + emm4 = _mm_slli_epi32(emm4, 29); + sign_bit_cos = _mm_castsi128_ps(emm4); + + sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin); + + /* Evaluate the first polynom (0 <= x <= Pi/4) */ + z = _mm_mul_ps(x,x); + y = _ps_coscof_p0; + y = _mm_mul_ps(y, z); + y = _mm_add_ps(y, _ps_coscof_p1); + y = _mm_mul_ps(y, z); + y = _mm_add_ps(y, _ps_coscof_p2); + y = _mm_mul_ps(y, _mm_mul_ps(z, z)); + tmp = _mm_mul_ps(z, _ps_0p5); + y = _mm_sub_ps(y, tmp); + y = _mm_add_ps(y, _ps_1); + + /* Evaluate the second polynom (Pi/4 <= x <= 0) */ + y2 = _ps_sincof_p0; + y2 = _mm_mul_ps(y2, z); + y2 = 
_mm_add_ps(y2, _ps_sincof_p1); + y2 = _mm_mul_ps(y2, z); + y2 = _mm_add_ps(y2, _ps_sincof_p2); + y2 = _mm_mul_ps(y2, _mm_mul_ps(z, x)); + y2 = _mm_add_ps(y2, x); + + /* select the correct result from the two polynoms */ + xmm3 = poly_mask; + ysin2 = _mm_and_ps(xmm3, y2); + ysin1 = _mm_andnot_ps(xmm3, y); + y2 = _mm_sub_ps(y2,ysin2); + y = _mm_sub_ps(y, ysin1); + + xmm1 = _mm_add_ps(ysin1,ysin2); + xmm2 = _mm_add_ps(y,y2); + + /* update the sign */ + s = _mm_xor_ps(xmm1, sign_bit_sin); + c = _mm_xor_ps(xmm2, sign_bit_cos); + + //GNSS-SDR needs to return -sin + s = _mm_xor_ps(s, _ps_sign_mask); + + _mm_storeu_ps ((float*)sin_value, s); + _mm_storeu_ps ((float*)cos_value, c); + + for(int i = 0; i < 4; i++) + { + d_carr_sign[i] = lv_cmake(cos_value[i], sin_value[i]); + } + d_carr_sign += 4; + + phase_rad_array = _mm_add_ps (phase_rad_array, phase_step_rad_array); + } + + if (num_points%4!=0) + { + __VOLK_ATTR_ALIGNED(16) float phase_rad_store[4]; + _mm_storeu_si128 ((__m128i*)phase_rad_store, phase_rad_array); + + float phase_rad = phase_rad_store[0]; + + for(int i = 0; i < num_points%4; i++) + { + *d_carr_sign = lv_cmake(cos(phase_rad), -sin(phase_rad)); + d_carr_sign++; + phase_rad += phase_step_rad; + } + } +} +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_GENERIC +/*! 
+ \brief Accumulates the values in the input buffer + \param result The accumulated result + \param inputBuffer The buffer of data to be accumulated + \param num_points The number of values in inputBuffer to be accumulated +*/ +static inline void volk_gnsssdr_s32f_x2_update_local_carrier_32fc_generic(lv_32fc_t* d_carr_sign, const float phase_rad_init, const float phase_step_rad, unsigned int num_points){ + +// float* pointer1 = (float*)&phase_rad_init; +// *pointer1 = 0; +// float* pointer2 = (float*)&phase_step_rad; +// *pointer2 = 0.5; + + float phase_rad = phase_rad_init; + for(int i = 0; i < num_points; i++) + { + *d_carr_sign = lv_cmake(cos(phase_rad), -sin(phase_rad)); + d_carr_sign++; + phase_rad += phase_step_rad; + } +} +#endif /* LV_HAVE_GENERIC */ +#endif /* INCLUDED_volk_gnsssdr_32fc_s32f_x2_update_local_carrier_32fc_u_H */ + + +#ifndef INCLUDED_volk_gnsssdr_32fc_s32f_x2_update_local_carrier_32fc_a_H +#define INCLUDED_volk_gnsssdr_32fc_s32f_x2_update_local_carrier_32fc_a_H + +#include +#include +#include + +#ifdef LV_HAVE_AVX +#include +/*! 
+ \brief Accumulates the values in the input buffer + \param result The accumulated result + \param inputBuffer The buffer of data to be accumulated + \param num_points The number of values in inputBuffer to be accumulated + */ +static inline void volk_gnsssdr_s32f_x2_update_local_carrier_32fc_a_avx(lv_32fc_t* d_carr_sign, const float phase_rad_init, const float phase_step_rad, unsigned int num_points){ + + // float* pointer1 = (float*)&phase_rad_init; + // *pointer1 = 0; + // float* pointer2 = (float*)&phase_step_rad; + // *pointer2 = 0.5; + + const unsigned int sse_iters = num_points / 8; + + __m256 _ps256_minus_cephes_DP1 = _mm256_set1_ps(-0.78515625f); + __m256 _ps256_minus_cephes_DP2 = _mm256_set1_ps(-2.4187564849853515625e-4f); + __m256 _ps256_minus_cephes_DP3 = _mm256_set1_ps(-3.77489497744594108e-8f); + __m256 _ps256_sign_mask = _mm256_set1_ps(-0.f); + __m128i _pi32avx_1 = _mm_set1_epi32(1); + __m128i _pi32avx_inv1 = _mm_set1_epi32(~1); + __m128i _pi32avx_2 = _mm_set1_epi32(2); + __m128i _pi32avx_4 = _mm_set1_epi32(4); + __m256 _ps256_cephes_FOPI = _mm256_set1_ps(1.27323954473516f); // 4 / PI + __m256 _ps256_sincof_p0 = _mm256_set1_ps(-1.9515295891E-4f); + __m256 _ps256_sincof_p1 = _mm256_set1_ps( 8.3321608736E-3f); + __m256 _ps256_sincof_p2 = _mm256_set1_ps(-1.6666654611E-1f); + __m256 _ps256_coscof_p0 = _mm256_set1_ps( 2.443315711809948E-005f); + __m256 _ps256_coscof_p1 = _mm256_set1_ps(-1.388731625493765E-003f); + __m256 _ps256_coscof_p2 = _mm256_set1_ps( 4.166664568298827E-002f); + __m256 _ps256_1 = _mm256_set1_ps(1.f); + __m256 _ps256_0p5 = _mm256_set1_ps(0.5f); + + __m256 phase_step_rad_array = _mm256_set1_ps(8*phase_step_rad); + + __m256 phase_rad_array, x, s, c, swap_sign_bit_sin, sign_bit_cos, poly_mask, z, tmp, y, y2, ysin1, ysin2; + __m256 xmm1, xmm2, xmm3, sign_bit_sin; + __m256i imm0, imm2, imm4; + __m128i imm0_1, imm0_2, imm2_1, imm2_2, imm4_1, imm4_2; + __VOLK_ATTR_ALIGNED(32) float sin_value[8]; + __VOLK_ATTR_ALIGNED(32) float cos_value[8]; 
+ + phase_rad_array = _mm256_set_ps (phase_rad_init+7*phase_step_rad, phase_rad_init+6*phase_step_rad, phase_rad_init+5*phase_step_rad, phase_rad_init+4*phase_step_rad, phase_rad_init+3*phase_step_rad, phase_rad_init+2*phase_step_rad, phase_rad_init+phase_step_rad, phase_rad_init); + + for(int i = 0; i < sse_iters; i++) + { + + x = phase_rad_array; + + /* extract the sign bit (upper one) */ + sign_bit_sin = _mm256_and_ps(x, _ps256_sign_mask); + + /* take the absolute value */ + x = _mm256_xor_ps(x, sign_bit_sin); + + /* scale by 4/Pi */ + y = _mm256_mul_ps(x, _ps256_cephes_FOPI); + + /* we use SSE2 routines to perform the integer ops */ + + //COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y),imm2_1,imm2_2); + y = _mm256_cvttps_epi32(y); + imm2_1 = _mm256_extractf128_ps (y, 0); + imm2_2 = _mm256_extractf128_ps (y, 1); + + imm2_1 = _mm_add_epi32(imm2_1, _pi32avx_1); + imm2_2 = _mm_add_epi32(imm2_2, _pi32avx_1); + + imm2_1 = _mm_and_si128(imm2_1, _pi32avx_inv1); + imm2_2 = _mm_and_si128(imm2_2, _pi32avx_inv1); + + //COPY_XMM_TO_IMM(imm2_1,imm2_2,imm2); + //_mm256_set_m128i not defined in some versions of immintrin.h + //imm2 = _mm256_set_m128i (imm2_2, imm2_1); + imm2 = _mm256_insertf128_si256(_mm256_castsi128_si256(imm2_1),(imm2_2),1); + + y = _mm256_cvtepi32_ps(imm2); + + imm4_1 = imm2_1; + imm4_2 = imm2_2; + + imm0_1 = _mm_and_si128(imm2_1, _pi32avx_4); + imm0_2 = _mm_and_si128(imm2_2, _pi32avx_4); + + imm0_1 = _mm_slli_epi32(imm0_1, 29); + imm0_2 = _mm_slli_epi32(imm0_2, 29); + + //COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0); + //_mm256_set_m128i not defined in some versions of immintrin.h + //imm0 = _mm256_set_m128i (imm0_2, imm0_1); + imm0 = _mm256_insertf128_si256(_mm256_castsi128_si256(imm0_1),(imm0_2),1); + + imm2_1 = _mm_and_si128(imm2_1, _pi32avx_2); + imm2_2 = _mm_and_si128(imm2_2, _pi32avx_2); + + imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128()); + imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128()); + + //COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2); + //_mm256_set_m128i 
not defined in some versions of immintrin.h + //imm2 = _mm256_set_m128i (imm2_2, imm2_1); + imm2 = _mm256_insertf128_si256(_mm256_castsi128_si256(imm2_1),(imm2_2),1); + + swap_sign_bit_sin = _mm256_castsi256_ps(imm0); + poly_mask = _mm256_castsi256_ps(imm2); + + /* The magic pass: "Extended precision modular arithmetic" + x = ((x - y * DP1) - y * DP2) - y * DP3; */ + xmm1 = _ps256_minus_cephes_DP1; + xmm2 = _ps256_minus_cephes_DP2; + xmm3 = _ps256_minus_cephes_DP3; + xmm1 = _mm256_mul_ps(y, xmm1); + xmm2 = _mm256_mul_ps(y, xmm2); + xmm3 = _mm256_mul_ps(y, xmm3); + x = _mm256_add_ps(x, xmm1); + x = _mm256_add_ps(x, xmm2); + x = _mm256_add_ps(x, xmm3); + + imm4_1 = _mm_sub_epi32(imm4_1, _pi32avx_2); + imm4_2 = _mm_sub_epi32(imm4_2, _pi32avx_2); + + imm4_1 = _mm_andnot_si128(imm4_1, _pi32avx_4); + imm4_2 = _mm_andnot_si128(imm4_2, _pi32avx_4); + + imm4_1 = _mm_slli_epi32(imm4_1, 29); + imm4_2 = _mm_slli_epi32(imm4_2, 29); + + //COPY_XMM_TO_IMM(imm4_1, imm4_2, imm4); + //_mm256_set_m128i not defined in some versions of immintrin.h + //imm4 = _mm256_set_m128i (imm4_2, imm4_1); + imm4 = _mm256_insertf128_si256(_mm256_castsi128_si256(imm4_1),(imm4_2),1); + + sign_bit_cos = _mm256_castsi256_ps(imm4); + + sign_bit_sin = _mm256_xor_ps(sign_bit_sin, swap_sign_bit_sin); + + /* Evaluate the first polynom (0 <= x <= Pi/4) */ + z = _mm256_mul_ps(x,x); + y = _ps256_coscof_p0; + + y = _mm256_mul_ps(y, z); + y = _mm256_add_ps(y, _ps256_coscof_p1); + y = _mm256_mul_ps(y, z); + y = _mm256_add_ps(y, _ps256_coscof_p2); + y = _mm256_mul_ps(y, z); + y = _mm256_mul_ps(y, z); + tmp = _mm256_mul_ps(z, _ps256_0p5); + y = _mm256_sub_ps(y, tmp); + y = _mm256_add_ps(y, _ps256_1); + + /* Evaluate the second polynom (Pi/4 <= x <= 0) */ + + y2 = _ps256_sincof_p0; + y2 = _mm256_mul_ps(y2, z); + y2 = _mm256_add_ps(y2, _ps256_sincof_p1); + y2 = _mm256_mul_ps(y2, z); + y2 = _mm256_add_ps(y2, _ps256_sincof_p2); + y2 = _mm256_mul_ps(y2, z); + y2 = _mm256_mul_ps(y2, x); + y2 = _mm256_add_ps(y2, x); + + /* 
select the correct result from the two polynoms */ + xmm3 = poly_mask; + ysin2 = _mm256_and_ps(xmm3, y2); + ysin1 = _mm256_andnot_ps(xmm3, y); + y2 = _mm256_sub_ps(y2,ysin2); + y = _mm256_sub_ps(y, ysin1); + + xmm1 = _mm256_add_ps(ysin1,ysin2); + xmm2 = _mm256_add_ps(y,y2); + + /* update the sign */ + s = _mm256_xor_ps(xmm1, sign_bit_sin); + c = _mm256_xor_ps(xmm2, sign_bit_cos); + + //GNSS-SDR needs to return -sin + s = _mm256_xor_ps(s, _ps256_sign_mask); + + _mm256_store_ps ((float*)sin_value, s); + _mm256_store_ps ((float*)cos_value, c); + + for(int i = 0; i < 8; i++) + { + d_carr_sign[i] = lv_cmake(cos_value[i], sin_value[i]); + } + d_carr_sign += 8; + + phase_rad_array = _mm256_add_ps (phase_rad_array, phase_step_rad_array); + } + + if (num_points%8!=0) + { + __VOLK_ATTR_ALIGNED(32) float phase_rad_store[8]; + _mm256_store_ps ((float*)phase_rad_store, phase_rad_array); + + float phase_rad = phase_rad_store[0]; + + for(int i = 0; i < num_points%8; i++) + { + *d_carr_sign = lv_cmake(cos(phase_rad), -sin(phase_rad)); + d_carr_sign++; + phase_rad += phase_step_rad; + } + } +} +#endif /* LV_HAVE_AVX */ + +#ifdef LV_HAVE_SSE2 +#include +/*! 
+ \brief Accumulates the values in the input buffer + \param result The accumulated result + \param inputBuffer The buffer of data to be accumulated + \param num_points The number of values in inputBuffer to be accumulated + */ +static inline void volk_gnsssdr_s32f_x2_update_local_carrier_32fc_a_sse2(lv_32fc_t* d_carr_sign, const float phase_rad_init, const float phase_step_rad, unsigned int num_points){ + +// float* pointer1 = (float*)&phase_rad_init; +// *pointer1 = 0; +// float* pointer2 = (float*)&phase_step_rad; +// *pointer2 = 0.5; + + const unsigned int sse_iters = num_points / 4; + + __m128 _ps_minus_cephes_DP1 = _mm_set1_ps(-0.78515625f); + __m128 _ps_minus_cephes_DP2 = _mm_set1_ps(-2.4187564849853515625e-4f); + __m128 _ps_minus_cephes_DP3 = _mm_set1_ps(-3.77489497744594108e-8f); + __m128 _ps_sign_mask = _mm_set1_ps(-0.f); + __m128i _pi32_1 = _mm_set1_epi32(1); + __m128i _pi32_inv1 = _mm_set1_epi32(~1); + __m128i _pi32_2 = _mm_set1_epi32(2); + __m128i _pi32_4 = _mm_set1_epi32(4); + __m128 _ps_cephes_FOPI = _mm_set1_ps(1.27323954473516f); // 4 / PI + __m128 _ps_sincof_p0 = _mm_set1_ps(-1.9515295891E-4f); + __m128 _ps_sincof_p1 = _mm_set1_ps( 8.3321608736E-3f); + __m128 _ps_sincof_p2 = _mm_set1_ps(-1.6666654611E-1f); + __m128 _ps_coscof_p0 = _mm_set1_ps( 2.443315711809948E-005f); + __m128 _ps_coscof_p1 = _mm_set1_ps(-1.388731625493765E-003f); + __m128 _ps_coscof_p2 = _mm_set1_ps( 4.166664568298827E-002f); + __m128 _ps_1 = _mm_set1_ps(1.f); + __m128 _ps_0p5 = _mm_set1_ps(0.5f); + + __m128 phase_step_rad_array = _mm_set1_ps(4*phase_step_rad); + + __m128 phase_rad_array, x, s, c, swap_sign_bit_sin, sign_bit_cos, poly_mask, z, tmp, y, y2, ysin1, ysin2; + __m128 xmm1, xmm2, xmm3, sign_bit_sin; + __m128i emm0, emm2, emm4; + __VOLK_ATTR_ALIGNED(16) float sin_value[4]; + __VOLK_ATTR_ALIGNED(16) float cos_value[4]; + + phase_rad_array = _mm_set_ps (phase_rad_init+3*phase_step_rad, phase_rad_init+2*phase_step_rad, phase_rad_init+phase_step_rad, phase_rad_init); + + 
for(int i = 0; i < sse_iters; i++) + { + x = phase_rad_array; + + /* extract the sign bit (upper one) */ + sign_bit_sin = _mm_and_ps(x, _ps_sign_mask); + + /* take the absolute value */ + x = _mm_xor_ps(x, sign_bit_sin); + + /* scale by 4/Pi */ + y = _mm_mul_ps(x, _ps_cephes_FOPI); + + /* store the integer part of y in emm2 */ + emm2 = _mm_cvttps_epi32(y); + + /* j=(j+1) & (~1) (see the cephes sources) */ + emm2 = _mm_add_epi32(emm2, _pi32_1); + emm2 = _mm_and_si128(emm2, _pi32_inv1); + y = _mm_cvtepi32_ps(emm2); + + emm4 = emm2; + + /* get the swap sign flag for the sine */ + emm0 = _mm_and_si128(emm2, _pi32_4); + emm0 = _mm_slli_epi32(emm0, 29); + swap_sign_bit_sin = _mm_castsi128_ps(emm0); + + /* get the polynom selection mask for the sine*/ + emm2 = _mm_and_si128(emm2, _pi32_2); + emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128()); + poly_mask = _mm_castsi128_ps(emm2); + + /* The magic pass: "Extended precision modular arithmetic" + x = ((x - y * DP1) - y * DP2) - y * DP3; */ + xmm1 = _mm_mul_ps(y, _ps_minus_cephes_DP1); + xmm2 = _mm_mul_ps(y, _ps_minus_cephes_DP2); + xmm3 = _mm_mul_ps(y, _ps_minus_cephes_DP3); + x = _mm_add_ps(_mm_add_ps(x, xmm1), _mm_add_ps(xmm2, xmm3)); + + emm4 = _mm_sub_epi32(emm4, _pi32_2); + emm4 = _mm_andnot_si128(emm4, _pi32_4); + emm4 = _mm_slli_epi32(emm4, 29); + sign_bit_cos = _mm_castsi128_ps(emm4); + + sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin); + + /* Evaluate the first polynom (0 <= x <= Pi/4) */ + z = _mm_mul_ps(x,x); + y = _ps_coscof_p0; + y = _mm_mul_ps(y, z); + y = _mm_add_ps(y, _ps_coscof_p1); + y = _mm_mul_ps(y, z); + y = _mm_add_ps(y, _ps_coscof_p2); + y = _mm_mul_ps(y, _mm_mul_ps(z, z)); + tmp = _mm_mul_ps(z, _ps_0p5); + y = _mm_sub_ps(y, tmp); + y = _mm_add_ps(y, _ps_1); + + /* Evaluate the second polynom (Pi/4 <= x <= 0) */ + y2 = _ps_sincof_p0; + y2 = _mm_mul_ps(y2, z); + y2 = _mm_add_ps(y2, _ps_sincof_p1); + y2 = _mm_mul_ps(y2, z); + y2 = _mm_add_ps(y2, _ps_sincof_p2); + y2 = _mm_mul_ps(y2, 
_mm_mul_ps(z, x)); + y2 = _mm_add_ps(y2, x); + + /* select the correct result from the two polynoms */ + xmm3 = poly_mask; + ysin2 = _mm_and_ps(xmm3, y2); + ysin1 = _mm_andnot_ps(xmm3, y); + y2 = _mm_sub_ps(y2,ysin2); + y = _mm_sub_ps(y, ysin1); + + xmm1 = _mm_add_ps(ysin1,ysin2); + xmm2 = _mm_add_ps(y,y2); + + /* update the sign */ + s = _mm_xor_ps(xmm1, sign_bit_sin); + c = _mm_xor_ps(xmm2, sign_bit_cos); + + //GNSS-SDR needs to return -sin + s = _mm_xor_ps(s, _ps_sign_mask); + + _mm_store_ps ((float*)sin_value, s); + _mm_store_ps ((float*)cos_value, c); + + for(int i = 0; i < 4; i++) + { + d_carr_sign[i] = lv_cmake(cos_value[i], sin_value[i]); + } + d_carr_sign += 4; + + phase_rad_array = _mm_add_ps (phase_rad_array, phase_step_rad_array); + } + + if (num_points%4!=0) + { + __VOLK_ATTR_ALIGNED(16) float phase_rad_store[4]; + _mm_store_si128 ((__m128i*)phase_rad_store, phase_rad_array); + + float phase_rad = phase_rad_store[0]; + + for(int i = 0; i < num_points%4; i++) + { + *d_carr_sign = lv_cmake(cos(phase_rad), -sin(phase_rad)); + d_carr_sign++; + phase_rad += phase_step_rad; + } + } +} +#endif /* LV_HAVE_SSE2 */ + +#ifdef LV_HAVE_GENERIC +/*! 
+ \brief Accumulates the values in the input buffer + \param result The accumulated result + \param inputBuffer The buffer of data to be accumulated + \param num_points The number of values in inputBuffer to be accumulated + */ +static inline void volk_gnsssdr_s32f_x2_update_local_carrier_32fc_a_generic(lv_32fc_t* d_carr_sign, const float phase_rad_init, const float phase_step_rad, unsigned int num_points){ + +// float* pointer1 = (float*)&phase_rad_init; +// *pointer1 = 0; +// float* pointer2 = (float*)&phase_step_rad; +// *pointer2 = 0.5; + + float phase_rad = phase_rad_init; + for(int i = 0; i < num_points; i++) + { + *d_carr_sign = lv_cmake(cos(phase_rad), -sin(phase_rad)); + d_carr_sign++; + phase_rad += phase_step_rad; + } +} +#endif /* LV_HAVE_GENERIC */ +#endif /* INCLUDED_volk_gnsssdr_32fc_s32f_x2_update_local_carrier_32fc_a_H */ + diff --git a/src/algorithms/libs/volk_gnsssdr/lib/CMakeLists.txt b/src/algorithms/libs/volk_gnsssdr/lib/CMakeLists.txt new file mode 100644 index 000000000..04cb9ee05 --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/lib/CMakeLists.txt @@ -0,0 +1,578 @@ +# +# Copyright 2011-2012,2014 Free Software Foundation, Inc. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . 
#

########################################################################
# header file detection
########################################################################
include(CheckIncludeFile)

# Probe each optional system header; every header that is found is exposed
# to the sources as a HAVE_<NAME>_H compile definition.
foreach(_volk_header cpuid intrin fenv dlfcn)
    string(TOUPPER "HAVE_${_volk_header}_H" _volk_have_var)
    check_include_file(${_volk_header}.h ${_volk_have_var})
    if(${_volk_have_var})
        add_definitions(-D${_volk_have_var})
    endif()
endforeach()
unset(_volk_have_var)

# dlfcn.h additionally means the dynamic-loading library has to be linked
if(HAVE_DLFCN_H)
    list(APPEND volk_gnsssdr_libraries ${CMAKE_DL_LIBS})
endif()

########################################################################
# Setup the compiler name
########################################################################
if(MSVC)
    # the compiler id is not set for MSVC, so name it explicitly
    set(COMPILER_NAME MSVC)
else()
    set(COMPILER_NAME ${CMAKE_C_COMPILER_ID})
endif()

message(STATUS "Compiler name: ${COMPILER_NAME}")

if(NOT DEFINED COMPILER_NAME)
    message(FATAL_ERROR "COMPILER_NAME undefined. Volk build may not support this compiler.")
endif()

########################################################################
# Special clang flag so flag checks can fail
########################################################################
# When the compiler accepts it, this flag is later passed to every
# architecture flag check through VOLK_FLAG_CHECK_FLAGS.
if(COMPILER_NAME MATCHES "GNU")
    include(CheckCXXCompilerFlag)
    check_cxx_compiler_flag("-Werror=unused-command-line-argument" HAVE_WERROR_UNUSED_CMD_LINE_ARG)
    if(HAVE_WERROR_UNUSED_CMD_LINE_ARG)
        set(VOLK_FLAG_CHECK_FLAGS "-Werror=unused-command-line-argument")
    endif()
endif()

########################################################################
# check for posix_memalign, since some OSs do not internally define
# _XOPEN_SOURCE or _POSIX_C_SOURCE; they leave this to the user.
########################################################################

include(CheckFunctionExists)
include(CheckCXXCompilerFlag)
check_function_exists(posix_memalign HAVE_POSIX_MEMALIGN)

if(HAVE_POSIX_MEMALIGN)
    add_definitions(-DHAVE_POSIX_MEMALIGN)
endif()

########################################################################
# detect x86 flavor of CPU
########################################################################
# Quoted so that an empty CMAKE_SYSTEM_PROCESSOR does not produce a malformed if()
if("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "^(i.86|x86|x86_64|amd64)$")
    message(STATUS "x86* CPU detected")
    set(CPU_IS_x86 TRUE)
endif()

########################################################################
# determine passing architectures based on compile flag tests
########################################################################
execute_process(
    COMMAND ${PYTHON_EXECUTABLE} ${PYTHON_DASH_B}
    ${CMAKE_SOURCE_DIR}/gen/volk_gnsssdr_compile_utils.py
    --mode "arch_flags" --compiler "${COMPILER_NAME}"
    OUTPUT_VARIABLE arch_flag_lines OUTPUT_STRIP_TRAILING_WHITESPACE
)

# check_arch(<arch_name> [<flag>...])
# Appends <arch_name> to available_archs when the C++ compiler accepts every
# listed flag. Kept as a macro because it updates available_archs in the
# calling scope.
macro(check_arch arch_name)
    set(flags ${ARGN})
    set(have_${arch_name} TRUE)
    foreach(flag ${flags})
        set(have_flag have${flag})
        execute_process( #make the have_flag have nice alphanum chars (just for looks/not necessary)
            COMMAND ${PYTHON_EXECUTABLE} -c "import re; print(re.sub('\\W', '_', '${have_flag}'))"
            OUTPUT_VARIABLE have_flag OUTPUT_STRIP_TRAILING_WHITESPACE
        )
        if(VOLK_FLAG_CHECK_FLAGS)
            set(CMAKE_REQUIRED_FLAGS ${VOLK_FLAG_CHECK_FLAGS})
        endif()
        check_cxx_compiler_flag(${flag} ${have_flag})
        unset(CMAKE_REQUIRED_FLAGS)
        if(NOT ${have_flag})
            set(have_${arch_name} FALSE)
        endif()
    endforeach()
    if(have_${arch_name})
        list(APPEND available_archs ${arch_name})
    endif()
endmacro()

# every entry is "<arch>,<flag>,<flag>..."; turn it into a CMake list first
foreach(line ${arch_flag_lines})
    string(REGEX REPLACE "," ";" arch_flags ${line})
    check_arch(${arch_flags})
endforeach()

# OVERRULE_ARCH(<arch> <reason>)
# Removes <arch> from available_archs and reports why.
macro(OVERRULE_ARCH arch reason)
    message(STATUS "${reason}, Overruled arch ${arch}")
    list(REMOVE_ITEM available_archs ${arch})
endmacro()

########################################################################
# eliminate AVX on if not on x86, or if the compiler does not accept
# the xgetbv instruction, or {if not cross-compiling and the xgetbv
# executable does not function correctly}.
########################################################################
set(HAVE_XGETBV 0)
set(HAVE_AVX_CVTPI32_PS 0)
if(CPU_IS_x86)
    # check to see if the compiler/linker works with xgetbv instruction
    file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_xgetbv.c "unsigned long long _xgetbv(unsigned int index) { unsigned int eax, edx; __asm__ __volatile__(\"xgetbv\" : \"=a\"(eax), \"=d\"(edx) : \"c\"(index)); return ((unsigned long long)edx << 32) | eax; } int main (void) { (void) _xgetbv(0); return (0); }")
    execute_process(COMMAND ${CMAKE_C_COMPILER} -o
        ${CMAKE_CURRENT_BINARY_DIR}/test_xgetbv
        ${CMAKE_CURRENT_BINARY_DIR}/test_xgetbv.c
        OUTPUT_QUIET ERROR_QUIET
        RESULT_VARIABLE avx_compile_result)
    # The result is quoted: it is an error string rather than a number when
    # the process could not be started at all.
    if(NOT "${avx_compile_result}" EQUAL 0)
        OVERRULE_ARCH(avx "Compiler or linker missing xgetbv instruction")
    elseif(NOT CROSSCOMPILE_MULTILIB)
        execute_process(COMMAND ${CMAKE_CURRENT_BINARY_DIR}/test_xgetbv
            OUTPUT_QUIET ERROR_QUIET
            RESULT_VARIABLE avx_exe_result)
        if(NOT "${avx_exe_result}" EQUAL 0)
            OVERRULE_ARCH(avx "CPU missing xgetbv")
        else()
            set(HAVE_XGETBV 1)
        endif()
    else()
        # cross compiling and compiler/linker seems to work; assume working
        set(HAVE_XGETBV 1)
    endif()
    file(REMOVE ${CMAKE_CURRENT_BINARY_DIR}/test_xgetbv
        ${CMAKE_CURRENT_BINARY_DIR}/test_xgetbv.c)

    #########################################################################
    # eliminate AVX if cvtpi32_ps intrinsic fails like some versions of clang
    #########################################################################

    # check to see if the compiler/linker works with cvtpi32_ps intrinsic when using AVX
    # (the probe needs xmmintrin.h, which declares _mm_cvtpi32_ps)
    file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_cvtpi32_ps.c "#include <xmmintrin.h>\nint main (void) {__m128 __a; __m64 __b; __m128 foo = _mm_cvtpi32_ps(__a, __b); return (0); }")
    execute_process(COMMAND ${CMAKE_C_COMPILER} -mavx -o
        ${CMAKE_CURRENT_BINARY_DIR}/test_cvtpi32_ps
        ${CMAKE_CURRENT_BINARY_DIR}/test_cvtpi32_ps.c
        OUTPUT_QUIET ERROR_QUIET
        RESULT_VARIABLE avx_compile_result)
    if(NOT "${avx_compile_result}" EQUAL 0)
        OVERRULE_ARCH(avx "Compiler missing cvtpi32_ps instrinsic")
    elseif(NOT CROSSCOMPILE_MULTILIB)
        execute_process(COMMAND ${CMAKE_CURRENT_BINARY_DIR}/test_cvtpi32_ps
            OUTPUT_QUIET ERROR_QUIET
            RESULT_VARIABLE avx_exe_result)
        if(NOT "${avx_exe_result}" EQUAL 0)
            OVERRULE_ARCH(avx "CPU missing cvtpi32_ps")
        else()
            set(HAVE_AVX_CVTPI32_PS 1)
        endif()
    else()
        # cross compiling and compiler/linker seems to work; assume working
        set(HAVE_AVX_CVTPI32_PS 1)
    endif()
    file(REMOVE ${CMAKE_CURRENT_BINARY_DIR}/test_cvtpi32_ps
        ${CMAKE_CURRENT_BINARY_DIR}/test_cvtpi32_ps.c)

    # Disable SSE4a if Clang is less than version 3.2
    if("${CMAKE_C_COMPILER_ID}" STREQUAL "Clang")
        # Figure out the version of Clang
        if(CMAKE_VERSION VERSION_LESS "2.8.10")
            # Extract the Clang version from the --version string.
            # In cmake 2.8.10, we can just use CMAKE_C_COMPILER_VERSION
            # without having to go through these string manipulations
            execute_process(COMMAND ${CMAKE_C_COMPILER} --version
                OUTPUT_VARIABLE clang_version)
            # match "<major>.<minor>" with a literal dot
            string(REGEX MATCH "[0-9]+\\.[0-9]+" CMAKE_C_COMPILER_VERSION "${clang_version}")
        endif()

        if(CMAKE_C_COMPILER_VERSION VERSION_LESS "3.2")
            OVERRULE_ARCH(sse4_a "Clang >= 3.2 required for SSE4a")
        endif()
    endif()

endif()

if(${HAVE_XGETBV})
    add_definitions(-DHAVE_XGETBV)
endif()

if(${HAVE_AVX_CVTPI32_PS})
    add_definitions(-DHAVE_AVX_CVTPI32_PS)
endif()

########################################################################
# if the CPU is not x86, eliminate all Intel SIMD
########################################################################

if(NOT CPU_IS_x86)
    foreach(intel_arch 3dnow mmx sse sse2 sse3 ssse3 sse4_a sse4_1 sse4_2 avx)
        OVERRULE_ARCH(${intel_arch} "Architecture is not x86 or x86_64")
    endforeach()
endif()

########################################################################
# implement overruling in the ORC case,
# since ORC always passes flag detection
########################################################################
if(NOT ORC_FOUND)
    OVERRULE_ARCH(orc "ORC support not found")
endif()

########################################################################
# implement overruling in the non-multilib case
# this makes things
# work when both -m32 and -m64 pass
########################################################################
if(NOT CROSSCOMPILE_MULTILIB AND CPU_IS_x86)
    include(CheckTypeSize)
    # An array of 8 pointers is 64 bytes on a 64-bit target and 32 bytes on a
    # 32-bit one, so its size in bytes equals the pointer width in bits.
    check_type_size("void*[8]" SIZEOF_CPU BUILTIN_TYPES_ONLY)

    # SIZEOF_CPU is quoted everywhere: check_type_size() leaves it empty when
    # the size cannot be determined, and an unquoted empty expansion would
    # make the if() malformed.
    if("${SIZEOF_CPU}" EQUAL 64)
        OVERRULE_ARCH(32 "CPU width is 64 bits")
    endif()
    if("${SIZEOF_CPU}" EQUAL 32)
        OVERRULE_ARCH(64 "CPU width is 32 bits")
    endif()

    #MSVC 64 bit does not have MMX, overrule it
    if("${SIZEOF_CPU}" EQUAL 64 AND MSVC)
        OVERRULE_ARCH(mmx "No MMX for Win64")
    endif()

endif()

########################################################################
# done overrules! print the result
########################################################################
message(STATUS "Available architectures: ${available_archs}")

########################################################################
# determine available machines given the available architectures
########################################################################
execute_process(
    COMMAND ${PYTHON_EXECUTABLE} ${PYTHON_DASH_B}
    ${CMAKE_SOURCE_DIR}/gen/volk_gnsssdr_compile_utils.py
    --mode "machines" --archs "${available_archs}"
    OUTPUT_VARIABLE available_machines OUTPUT_STRIP_TRAILING_WHITESPACE
)

########################################################################
# Implement machine overruling for redundant machines:
# A machine is redundant when expansion rules occur,
# and the arch superset passes configuration checks.
# When this occurs, eliminate the redundant machines
# to avoid unnecessary compilation of subset machines.
########################################################################
foreach(arch mmx orc 64 32)
    foreach(machine_name ${available_machines})
        # When the machine name contains "_<arch>", the same name without that
        # suffix is the redundant subset machine: drop it from the list.
        string(REPLACE "_${arch}" "" machine_name_no_arch "${machine_name}")
        if(NOT "${machine_name}" STREQUAL "${machine_name_no_arch}")
            list(REMOVE_ITEM available_machines ${machine_name_no_arch})
        endif()
    endforeach()
endforeach()

########################################################################
# done overrules! print the result
########################################################################
message(STATUS "Available machines: ${available_machines}")

########################################################################
# Create rules to run the volk_gnsssdr generator
########################################################################

#dependencies are all python, xml, and header implementation files
file(GLOB xml_files ${CMAKE_SOURCE_DIR}/gen/*.xml)
file(GLOB py_files ${CMAKE_SOURCE_DIR}/gen/*.py)
file(GLOB h_files ${CMAKE_SOURCE_DIR}/kernels/volk_gnsssdr/*.h)

# gen_template(<tmpl> <output> [<extra generator args>...])
# Adds a build rule that renders <tmpl> into <output> with
# volk_gnsssdr_tmpl_utils.py and records <output> in volk_gnsssdr_gen_sources.
# Kept as a macro because it appends to a list in the calling scope.
macro(gen_template tmpl output)
    list(APPEND volk_gnsssdr_gen_sources ${output})
    add_custom_command(
        OUTPUT ${output}
        DEPENDS ${xml_files} ${py_files} ${h_files} ${tmpl}
        COMMAND ${PYTHON_EXECUTABLE} ${PYTHON_DASH_B}
        ${CMAKE_SOURCE_DIR}/gen/volk_gnsssdr_tmpl_utils.py
        --input ${tmpl} --output ${output} ${ARGN}
        VERBATIM
    )
endmacro()

# file(MAKE_DIRECTORY) replaces the deprecated make_directory() command
file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/include/volk_gnsssdr)

gen_template(${CMAKE_SOURCE_DIR}/tmpl/volk_gnsssdr.tmpl.h ${CMAKE_BINARY_DIR}/include/volk_gnsssdr/volk_gnsssdr.h)
gen_template(${CMAKE_SOURCE_DIR}/tmpl/volk_gnsssdr.tmpl.c ${CMAKE_BINARY_DIR}/lib/volk_gnsssdr.c)
gen_template(${CMAKE_SOURCE_DIR}/tmpl/volk_gnsssdr_typedefs.tmpl.h ${CMAKE_BINARY_DIR}/include/volk_gnsssdr/volk_gnsssdr_typedefs.h)
gen_template(${CMAKE_SOURCE_DIR}/tmpl/volk_gnsssdr_cpu.tmpl.h ${CMAKE_BINARY_DIR}/include/volk_gnsssdr/volk_gnsssdr_cpu.h)
gen_template(${CMAKE_SOURCE_DIR}/tmpl/volk_gnsssdr_cpu.tmpl.c ${CMAKE_BINARY_DIR}/lib/volk_gnsssdr_cpu.c)
gen_template(${CMAKE_SOURCE_DIR}/tmpl/volk_gnsssdr_config_fixed.tmpl.h ${CMAKE_BINARY_DIR}/include/volk_gnsssdr/volk_gnsssdr_config_fixed.h)
gen_template(${CMAKE_SOURCE_DIR}/tmpl/volk_gnsssdr_machines.tmpl.h ${CMAKE_BINARY_DIR}/lib/volk_gnsssdr_machines.h)
gen_template(${CMAKE_SOURCE_DIR}/tmpl/volk_gnsssdr_machines.tmpl.c ${CMAKE_BINARY_DIR}/lib/volk_gnsssdr_machines.c)

set(BASE_CFLAGS NONE)
# CMAKE_BUILD_TYPE may be empty, so it is quoted to keep string(TOUPPER) well-formed
string(TOUPPER "${CMAKE_BUILD_TYPE}" CBTU)
message(STATUS "Build type: ${CBTU}")
message(STATUS "Base cflags = ${CMAKE_C_FLAGS_${CBTU}} ${CMAKE_C_FLAGS}")
set(COMPILER_INFO "")
if(MSVC)
    # one branch per supported Visual Studio release
    if(MSVC90) #Visual Studio 9
        set(cmake_c_compiler_version "Microsoft Visual Studio 9.0")
    elseif(MSVC10) #Visual Studio 10
        set(cmake_c_compiler_version "Microsoft Visual Studio 10.0")
    elseif(MSVC11) #Visual Studio 11
        set(cmake_c_compiler_version "Microsoft Visual Studio 11.0")
    elseif(MSVC12) #Visual Studio 12
        set(cmake_c_compiler_version "Microsoft Visual Studio 12.0")
    endif()
else()
    execute_process(COMMAND ${CMAKE_C_COMPILER} --version
        OUTPUT_VARIABLE cmake_c_compiler_version)
endif()
# the per-configuration flags are looked up with the upper-cased build type (CBTU)
set(COMPILER_INFO "${CMAKE_C_COMPILER}:::${CMAKE_C_FLAGS_${CBTU}} ${CMAKE_C_FLAGS}\n${CMAKE_CXX_COMPILER}:::${CMAKE_CXX_FLAGS_${CBTU}} ${CMAKE_CXX_FLAGS}\n" )

foreach(machine_name ${available_machines})
    #generate machine source
    set(machine_source ${CMAKE_CURRENT_BINARY_DIR}/volk_gnsssdr_machine_${machine_name}.c)
    gen_template(${CMAKE_SOURCE_DIR}/tmpl/volk_gnsssdr_machine_xxx.tmpl.c ${machine_source} ${machine_name})

    #determine machine flags
    execute_process(
        COMMAND ${PYTHON_EXECUTABLE} ${PYTHON_DASH_B}
        ${CMAKE_SOURCE_DIR}/gen/volk_gnsssdr_compile_utils.py
        --mode "machine_flags" --machine "${machine_name}" --compiler "${COMPILER_NAME}"
        OUTPUT_VARIABLE ${machine_name}_flags OUTPUT_STRIP_TRAILING_WHITESPACE
    )
    message(STATUS "BUILD INFO ::: ${machine_name} ::: ${COMPILER_NAME} ::: ${CMAKE_C_FLAGS_${CBTU}} ${CMAKE_C_FLAGS} ${${machine_name}_flags}")
    set(COMPILER_INFO "${COMPILER_INFO}${machine_name}:::${COMPILER_NAME}:::${CMAKE_C_FLAGS_${CBTU}} ${CMAKE_C_FLAGS} ${${machine_name}_flags}\n" )
    if(${machine_name}_flags)
        set_source_files_properties(${machine_source} PROPERTIES COMPILE_FLAGS "${${machine_name}_flags}")
    endif()

    #add to available machine defs
    string(TOUPPER LV_MACHINE_${machine_name} machine_def)
    list(APPEND machine_defs ${machine_def})
endforeach()

# Convert to a C string to compile and display properly
# (inputs are quoted so that empty values or embedded ';' do not break the calls)
string(STRIP "${cmake_c_compiler_version}" cmake_c_compiler_version)
string(STRIP "${COMPILER_INFO}" COMPILER_INFO)
message(STATUS "Compiler Version: ${cmake_c_compiler_version}")
string(REPLACE "\n" " \\n" cmake_c_compiler_version "${cmake_c_compiler_version}")
string(REPLACE "\n" " \\n" COMPILER_INFO "${COMPILER_INFO}")

########################################################################
# Set local include directories first
########################################################################
include_directories(
    ${CMAKE_BINARY_DIR}/include
    ${CMAKE_SOURCE_DIR}/include
    ${CMAKE_SOURCE_DIR}/kernels
    ${CMAKE_CURRENT_BINARY_DIR}
    ${CMAKE_CURRENT_SOURCE_DIR}
)

########################################################################
# Handle ASM support
#  on by default, but let users turn it off
########################################################################
if(${CMAKE_VERSION} VERSION_GREATER "2.8.9")
    set(ASM_ARCHS_AVAILABLE "armv7")

    set(FULL_C_FLAGS "${CMAKE_C_FLAGS}" "${CMAKE_CXX_COMPILER_ARG1}")

    # sort through a list of all architectures we have ASM for
    # if we find one that matches our current system architecture
    # set up the assembler flags and include the source files
    foreach(ARCH ${ASM_ARCHS_AVAILABLE})
        string(REGEX MATCH "${ARCH}" ASM_ARCH "${FULL_C_FLAGS}")
        if(ASM_ARCH STREQUAL "armv7")
            message(STATUS "---- Adding ASM files") # we always use ATT syntax
            message(STATUS "-- Detected armv7 architecture; enabling ASM")
            # setup architecture specific assembler flags
            set(ARCH_ASM_FLAGS "-mfpu=neon -g")
            # then add the files
            include_directories(${CMAKE_SOURCE_DIR}/kernels/volk_gnsssdr/asm/neon)
            file(GLOB asm_files ${CMAKE_SOURCE_DIR}/kernels/volk_gnsssdr/asm/neon/*.s)
            foreach(asm_file ${asm_files})
                list(APPEND volk_gnsssdr_sources ${asm_file})
                message(STATUS "Adding source file: ${asm_file}")
            endforeach()
        endif()
        enable_language(ASM)
        set(CMAKE_ASM_FLAGS ${ARCH_ASM_FLAGS})
        message(STATUS "c flags: ${FULL_C_FLAGS}")
        message(STATUS "asm flags: ${CMAKE_ASM_FLAGS}")
    endforeach()

else()
    message(STATUS "Not enabling ASM support. CMake >= 2.8.10 required.")
    # NEON machines cannot be built without ASM support, so stop early
    foreach(machine_name ${available_machines})
        string(REGEX MATCH "neon" NEON_MACHINE ${machine_name})
        if(NEON_MACHINE STREQUAL "neon")
            message(FATAL_ERROR "CMake >= 2.8.10 is required for ARM NEON support")
        endif()
    endforeach()
endif()

########################################################################
# Handle orc support
########################################################################
if(ORC_FOUND)
    #setup orc library usage
    include_directories(${ORC_INCLUDE_DIRS})
    link_directories(${ORC_LIBRARY_DIRS})
    list(APPEND volk_gnsssdr_libraries ${ORC_LIBRARIES})

    #setup orc functions
    file(GLOB orc_files ${CMAKE_SOURCE_DIR}/orc/*.orc)
    foreach(orc_file ${orc_files})

        #extract the name for the generated c source from the orc file
        get_filename_component(orc_file_name_we ${orc_file} NAME_WE)
        set(orcc_gen ${CMAKE_CURRENT_BINARY_DIR}/${orc_file_name_we}.c)

        #create a rule to generate the source and add to the list of sources
        add_custom_command(
            COMMAND ${ORCC_EXECUTABLE} --include math.h --implementation -o ${orcc_gen} ${orc_file}
            DEPENDS ${orc_file} OUTPUT ${orcc_gen}
        )
        list(APPEND volk_gnsssdr_sources ${orcc_gen})

    endforeach()
else()
    message(STATUS "Did not find liborc and orcc, disabling orc support...")
endif()


########################################################################
# Handle the generated constants
########################################################################

# print() with parentheses is accepted by both Python 2 and Python 3
execute_process(COMMAND ${PYTHON_EXECUTABLE} -c
    "import time;print(time.strftime('%a, %d %b %Y %H:%M:%S', time.gmtime()))"
    OUTPUT_VARIABLE BUILD_DATE OUTPUT_STRIP_TRAILING_WHITESPACE
)
message(STATUS "Loading build date ${BUILD_DATE} into constants...")
message(STATUS "Loading version ${VERSION} into constants...")

#double escape for windows backslash path separators
# (quoted so that an unset or empty prefix does not break string(REPLACE))
string(REPLACE "\\" "\\\\" prefix "${prefix}")

configure_file(
    ${CMAKE_CURRENT_SOURCE_DIR}/constants.c.in
    ${CMAKE_CURRENT_BINARY_DIR}/constants.c
@ONLY)

list(APPEND volk_gnsssdr_sources ${CMAKE_CURRENT_BINARY_DIR}/constants.c)

########################################################################
# Setup the volk_gnsssdr sources list and library
########################################################################
if(NOT WIN32)
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=hidden")
    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fvisibility=hidden")
endif()

list(APPEND volk_gnsssdr_sources
    ${CMAKE_CURRENT_SOURCE_DIR}/volk_gnsssdr_prefs.c
    ${CMAKE_CURRENT_SOURCE_DIR}/volk_gnsssdr_rank_archs.c
    ${CMAKE_CURRENT_SOURCE_DIR}/volk_gnsssdr_malloc.c
    ${volk_gnsssdr_gen_sources}
)

#set the machine definitions where applicable
set_source_files_properties(
    ${CMAKE_CURRENT_BINARY_DIR}/volk_gnsssdr.c
    ${CMAKE_CURRENT_BINARY_DIR}/volk_gnsssdr_machines.c
PROPERTIES COMPILE_DEFINITIONS "${machine_defs}")

if(MSVC)
    #add compatibility includes for stdint types
    include_directories(${CMAKE_SOURCE_DIR}/cmake/msvc)
    add_definitions(-DHAVE_CONFIG_H)
    #compile the sources as C++ due to the lack of complex.h under MSVC
    set_source_files_properties(${volk_gnsssdr_sources} PROPERTIES LANGUAGE CXX)
endif()

#create the volk_gnsssdr runtime library

#MODIFICATIONS BY GNSS-SDR
# Kernel headers, common macros and ORC files are attached to the target and
# sorted into source groups.
file(GLOB orc ${CMAKE_SOURCE_DIR}/orc/*.orc)
file(GLOB CommonMacros ${CMAKE_SOURCE_DIR}/kernels/CommonMacros/*.h ${CMAKE_SOURCE_DIR}/kernels/CommonMacros/README.txt)

add_library(volk_gnsssdr SHARED ${volk_gnsssdr_sources} ${h_files} ${CommonMacros} ${orc})

source_group("Kernels" FILES ${h_files})
source_group("Common Macros" FILES ${CommonMacros})
source_group("ORC Files" FILES ${orc})
#END OF MODIFICATIONS

target_link_libraries(volk_gnsssdr ${volk_gnsssdr_libraries})
set_target_properties(volk_gnsssdr PROPERTIES SOVERSION ${LIBVER})
set_target_properties(volk_gnsssdr PROPERTIES DEFINE_SYMBOL "volk_gnsssdr_EXPORTS")

install(TARGETS volk_gnsssdr
    LIBRARY DESTINATION lib${LIB_SUFFIX} COMPONENT "volk_gnsssdr_runtime" # .so file
    ARCHIVE DESTINATION lib${LIB_SUFFIX} COMPONENT "volk_gnsssdr_devel"   # .lib file
    RUNTIME DESTINATION bin              COMPONENT "volk_gnsssdr_runtime" # .dll file
)

if(ENABLE_STATIC_LIBS)
    add_library(volk_gnsssdr_static STATIC ${volk_gnsssdr_sources})

    if(NOT WIN32)
        set_target_properties(volk_gnsssdr_static
            PROPERTIES OUTPUT_NAME volk_gnsssdr)
    endif()

    install(TARGETS volk_gnsssdr_static
        ARCHIVE DESTINATION lib${LIB_SUFFIX} COMPONENT "volk_gnsssdr_devel"   # .lib file
    )
endif()

########################################################################
# Build the QA test application
########################################################################


if(Boost_FOUND)

    set_source_files_properties(
        ${CMAKE_CURRENT_SOURCE_DIR}/testqa.cc PROPERTIES
        COMPILE_DEFINITIONS "BOOST_TEST_DYN_LINK;BOOST_TEST_MAIN"
    )

    include_directories(${Boost_INCLUDE_DIRS})
    link_directories(${Boost_LIBRARY_DIRS})

    add_executable(test_all
        ${CMAKE_CURRENT_SOURCE_DIR}/testqa.cc
        ${CMAKE_CURRENT_SOURCE_DIR}/qa_utils.cc
    )
    target_link_libraries(test_all volk_gnsssdr ${Boost_LIBRARIES})
    add_test(qa_volk_gnsssdr_test_all test_all)

endif()
diff --git a/src/algorithms/libs/volk_gnsssdr/lib/constants.c.in b/src/algorithms/libs/volk_gnsssdr/lib/constants.c.in
new file mode 100644
index 000000000..2f5fdcc3d
--- /dev/null
+++ b/src/algorithms/libs/volk_gnsssdr/lib/constants.c.in
@@ -0,0 +1,63 @@
/* -*- c++ -*- */
/*
 * Copyright 2013 Free Software Foundation, Inc.
 *
 * This file is part of GNU Radio
 *
 * GNU Radio is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3, or (at your option)
 * any later version.
 *
 * GNU Radio is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GNU Radio; see the file COPYING.  If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street,
 * Boston, MA 02110-1301, USA.
+ */
+
+#if HAVE_CONFIG_H
+#include
+#endif
+/* NOTE(review): the header names after both #include directives are missing
+   in this copy of the patch; restore them from the original file. */
+#include
+/*
+ * The accessors below each return one string placeholder. The placeholders
+ * are substituted when the build system configures this template into
+ * constants.c. The returned pointers refer to string literals, so callers
+ * must treat them as read-only.
+ *
+ * Returns the configured prefix value (backslashes are doubled by the build
+ * script before substitution).
+ */
+char*
+volk_gnsssdr_prefix()
+{
+    return "@prefix@";
+}
+/* Returns the build date string captured at configure time. */
+char*
+volk_gnsssdr_build_date()
+{
+    return "@BUILD_DATE@";
+}
+/* Returns the version string substituted at configure time. */
+char*
+volk_gnsssdr_version()
+{
+    return "@VERSION@";
+}
+/* Returns the substituted C compiler version placeholder. */
+char*
+volk_gnsssdr_c_compiler()
+{
+    return "@cmake_c_compiler_version@";
+}
+/* Returns the substituted compiler information placeholder. */
+char*
+volk_gnsssdr_compiler_flags()
+{
+    return "@COMPILER_INFO@";
+}
+/* Returns the substituted list of available machines. */
+char*
+volk_gnsssdr_available_machines()
+{
+    return "@available_machines@";
+}
diff --git a/src/algorithms/libs/volk_gnsssdr/lib/gcc_x86_cpuid.h b/src/algorithms/libs/volk_gnsssdr/lib/gcc_x86_cpuid.h
new file mode 100644
index 000000000..e0254f192
--- /dev/null
+++ b/src/algorithms/libs/volk_gnsssdr/lib/gcc_x86_cpuid.h
@@ -0,0 +1,188 @@
+/*
+ * Copyright (C) 2007, 2008, 2009, 2010 Free Software Foundation, Inc.
+ *
+ * This file is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 3, or (at your option) any
+ * later version.
+ *
+ * This file is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * Under Section 7 of GPL version 3, you are granted additional
+ * permissions described in the GCC Runtime Library Exception, version
+ * 3.1, as published by the Free Software Foundation.
+ *
+ * You should have received a copy of the GNU General Public License and
+ * a copy of the GCC Runtime Library Exception along with this program;
+ * see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ * .
+ */
+
+/* %ecx */
+#define bit_SSE3 (1 << 0)
+#define bit_PCLMUL (1 << 1)
+#define bit_SSSE3 (1 << 9)
+#define bit_FMA (1 << 12)
+#define bit_CMPXCHG16B (1 << 13)
+#define bit_SSE4_1 (1 << 19)
+#define bit_SSE4_2 (1 << 20)
+#define bit_MOVBE (1 << 22)
+#define bit_POPCNT (1 << 23)
+#define bit_AES (1 << 25)
+#define bit_XSAVE (1 << 26)
+#define bit_OSXSAVE (1 << 27)
+#define bit_AVX (1 << 28)
+#define bit_F16C (1 << 29)
+#define bit_RDRND (1 << 30)
+
+/* %edx */
+#define bit_CMPXCHG8B (1 << 8)
+#define bit_CMOV (1 << 15)
+#define bit_MMX (1 << 23)
+#define bit_FXSAVE (1 << 24)
+#define bit_SSE (1 << 25)
+#define bit_SSE2 (1 << 26)
+
+/* Extended Features */
+/* %ecx */
+#define bit_LAHF_LM (1 << 0)
+#define bit_ABM (1 << 5)
+#define bit_SSE4a (1 << 6)
+#define bit_XOP (1 << 11)
+#define bit_LWP (1 << 15)
+#define bit_FMA4 (1 << 16)
+#define bit_TBM (1 << 21)
+
+/* %edx */
+#define bit_MMXEXT (1 << 22)
+#define bit_LM (1 << 29)
+#define bit_3DNOWP (1 << 30)
+#define bit_3DNOW (1 << 31) /* NOTE(review): shifts into the sign bit of a plain int; consider 1u << 31 */
+
+/* Extended Features (%eax == 7) */
+#define bit_FSGSBASE (1 << 0)
+#define bit_BMI (1 << 3)
+
+#if defined(__i386__) && defined(__PIC__)
+/* %ebx may be the PIC register, so these variants never name it as an asm
+   operand: ebx is exchanged with a compiler-chosen register (operand %1)
+   before cpuid and exchanged back afterwards, so b receives what cpuid left
+   in ebx. __cpuid passes level in through the "a" operand; __cpuid_count
+   additionally passes count in through the "c" operand. */
+#if __GNUC__ >= 3
+#define __cpuid(level, a, b, c, d) \
+  __asm__ ("xchg{l}\t{%%}ebx, %1\n\t" \
+           "cpuid\n\t" \
+           "xchg{l}\t{%%}ebx, %1\n\t" \
+           : "=a" (a), "=r" (b), "=c" (c), "=d" (d) \
+           : "0" (level))
+
+#define __cpuid_count(level, count, a, b, c, d) \
+  __asm__ ("xchg{l}\t{%%}ebx, %1\n\t" \
+           "cpuid\n\t" \
+           "xchg{l}\t{%%}ebx, %1\n\t" \
+           : "=a" (a), "=r" (b), "=c" (c), "=d" (d) \
+           : "0" (level), "2" (count))
+#else
+/* Host GCCs older than 3.0 weren't supporting Intel asm syntax
+   nor alternatives in i386 code.
+ */
+#define __cpuid(level, a, b, c, d) \
+  __asm__ ("xchgl\t%%ebx, %1\n\t" \
+           "cpuid\n\t" \
+           "xchgl\t%%ebx, %1\n\t" \
+           : "=a" (a), "=r" (b), "=c" (c), "=d" (d) \
+           : "0" (level))
+
+#define __cpuid_count(level, count, a, b, c, d) \
+  __asm__ ("xchgl\t%%ebx, %1\n\t" \
+           "cpuid\n\t" \
+           "xchgl\t%%ebx, %1\n\t" \
+           : "=a" (a), "=r" (b), "=c" (c), "=d" (d) \
+           : "0" (level), "2" (count))
+#endif
+#else /* not i386 PIC: cpuid is issued directly and b is bound to the "b" operand */
+#define __cpuid(level, a, b, c, d) \
+  __asm__ ("cpuid\n\t" \
+           : "=a" (a), "=b" (b), "=c" (c), "=d" (d) \
+           : "0" (level))
+
+#define __cpuid_count(level, count, a, b, c, d) \
+  __asm__ ("cpuid\n\t" \
+           : "=a" (a), "=b" (b), "=c" (c), "=d" (d) \
+           : "0" (level), "2" (count))
+#endif
+
+/* Return highest supported input value for cpuid instruction. ext can
+   be either 0x0 or 0x80000000 to return highest supported value for
+   basic or extended cpuid information. Function returns 0 if cpuid
+   is not supported or whatever cpuid returns in eax register. If sig
+   pointer is non-null, then first four bytes of the signature
+   (as found in ebx register) are returned in location pointed by sig. */
+
+static __inline unsigned int
+__get_cpuid_max (unsigned int __ext, unsigned int *__sig)
+{
+  unsigned int __eax, __ebx, __ecx, __edx;
+
+#ifndef __x86_64__
+  /* See if we can use cpuid. On AMD64 we always can.
+     The asm reads the flags register into %0, keeps a copy in %1, flips
+     bit 0x00200000 in %0, writes that back to the flags, reads the flags
+     again into %0 and finally restores the flags saved by the first push.
+     If the bit reads back unchanged, cpuid cannot be used. */
+#if __GNUC__ >= 3
+  __asm__ ("pushf{l|d}\n\t"
+           "pushf{l|d}\n\t"
+           "pop{l}\t%0\n\t"
+           "mov{l}\t{%0, %1|%1, %0}\n\t"
+           "xor{l}\t{%2, %0|%0, %2}\n\t"
+           "push{l}\t%0\n\t"
+           "popf{l|d}\n\t"
+           "pushf{l|d}\n\t"
+           "pop{l}\t%0\n\t"
+           "popf{l|d}\n\t"
+           : "=&r" (__eax), "=&r" (__ebx)
+           : "i" (0x00200000));
+#else
+/* Host GCCs older than 3.0 weren't supporting Intel asm syntax
+   nor alternatives in i386 code.
+ */
+  __asm__ ("pushfl\n\t"
+           "pushfl\n\t"
+           "popl\t%0\n\t"
+           "movl\t%0, %1\n\t"
+           "xorl\t%2, %0\n\t"
+           "pushl\t%0\n\t"
+           "popfl\n\t"
+           "pushfl\n\t"
+           "popl\t%0\n\t"
+           "popfl\n\t"
+           : "=&r" (__eax), "=&r" (__ebx)
+           : "i" (0x00200000));
+#endif
+
+  if (!((__eax ^ __ebx) & 0x00200000))
+    return 0;
+#endif
+
+  /* Host supports cpuid. Return highest supported cpuid input value. */
+  __cpuid (__ext, __eax, __ebx, __ecx, __edx);
+
+  if (__sig)
+    *__sig = __ebx;
+
+  return __eax;
+}
+
+/* Return cpuid data for requested cpuid level, as found in returned
+   eax, ebx, ecx and edx registers. The function checks if cpuid is
+   supported and returns 1 for valid cpuid information or 0 for
+   unsupported cpuid level. All pointers are required to be non-null.
+   The top bit of the level selects whether the basic or the extended
+   maximum is queried for the range check. */
+
+static __inline int
+__get_cpuid (unsigned int __level,
+             unsigned int *__eax, unsigned int *__ebx,
+             unsigned int *__ecx, unsigned int *__edx)
+{
+  unsigned int __ext = __level & 0x80000000;
+
+  if (__get_cpuid_max (__ext, 0) < __level)
+    return 0;
+
+  __cpuid (__level, *__eax, *__ebx, *__ecx, *__edx);
+  return 1;
+}
diff --git a/src/algorithms/libs/volk_gnsssdr/lib/qa_16s_add_quad_aligned16.cc b/src/algorithms/libs/volk_gnsssdr/lib/qa_16s_add_quad_aligned16.cc
new file mode 100644
index 000000000..771c4a24a
--- /dev/null
+++ b/src/algorithms/libs/volk_gnsssdr/lib/qa_16s_add_quad_aligned16.cc
@@ -0,0 +1,89 @@
+#include
+#include
+#include
+#include
+#include
+//test for sse2
+
+#ifndef LV_HAVE_SSE2
+
+void qa_16s_add_quad_aligned16::t1() {
+  printf("sse2 not available...
no test performed\n"); +} + +#else + + + +void qa_16s_add_quad_aligned16::t1() { + + volk_gnsssdr_environment_init(); + clock_t start, end; + double total; + const int vlen = 3200; + const int ITERS = 100000; + __VOLK_ATTR_ALIGNED(16) short input0[vlen]; + __VOLK_ATTR_ALIGNED(16) short input1[vlen]; + __VOLK_ATTR_ALIGNED(16) short input2[vlen]; + __VOLK_ATTR_ALIGNED(16) short input3[vlen]; + __VOLK_ATTR_ALIGNED(16) short input4[vlen]; + + __VOLK_ATTR_ALIGNED(16) short output0[vlen]; + __VOLK_ATTR_ALIGNED(16) short output1[vlen]; + __VOLK_ATTR_ALIGNED(16) short output2[vlen]; + __VOLK_ATTR_ALIGNED(16) short output3[vlen]; + __VOLK_ATTR_ALIGNED(16) short output01[vlen]; + __VOLK_ATTR_ALIGNED(16) short output11[vlen]; + __VOLK_ATTR_ALIGNED(16) short output21[vlen]; + __VOLK_ATTR_ALIGNED(16) short output31[vlen]; + + for(int i = 0; i < vlen; ++i) { + short plus0 = ((short) (rand() - (RAND_MAX/2))) >> 2; + short minus0 = ((short) (rand() - (RAND_MAX/2))) >> 2; + short plus1 = ((short) (rand() - (RAND_MAX/2))) >> 2; + short minus1 = ((short) (rand() - (RAND_MAX/2))) >> 2; + short plus2 = ((short) (rand() - (RAND_MAX/2))) >> 2; + short minus2 = ((short) (rand() - (RAND_MAX/2))) >> 2; + short plus3 = ((short) (rand() - (RAND_MAX/2))) >> 2; + short minus3 = ((short) (rand() - (RAND_MAX/2))) >> 2; + short plus4 = ((short) (rand() - (RAND_MAX/2))) >> 2; + short minus4 = ((short) (rand() - (RAND_MAX/2))) >> 2; + + input0[i] = plus0 - minus0; + input1[i] = plus1 - minus1; + input2[i] = plus2 - minus2; + input3[i] = plus3 - minus3; + input4[i] = plus4 - minus4; + + } + printf("16s_add_quad_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_gnsssdr_16s_add_quad_aligned16_manual(output0, output1, output2, output3, input0, input1, input2, input3, input4, vlen << 1 , "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; 
++count) { + volk_gnsssdr_16s_add_quad_aligned16_manual(output01, output11, output21, output31, input0, input1, input2, input3, input4, vlen << 1 , "sse2"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse2_time: %f\n", total); + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... %d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output0[i], output01[i]); + CPPUNIT_ASSERT_EQUAL(output1[i], output11[i]); + CPPUNIT_ASSERT_EQUAL(output2[i], output21[i]); + CPPUNIT_ASSERT_EQUAL(output3[i], output31[i]); + } +} + +#endif diff --git a/src/algorithms/libs/volk_gnsssdr/lib/qa_16s_add_quad_aligned16.h b/src/algorithms/libs/volk_gnsssdr/lib/qa_16s_add_quad_aligned16.h new file mode 100644 index 000000000..3c1ae978b --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/lib/qa_16s_add_quad_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_16S_ADD_QUAD_ALIGNED16_H +#define INCLUDED_QA_16S_ADD_QUAD_ALIGNED16_H + +#include +#include + +class qa_16s_add_quad_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_16s_add_quad_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_16S_ADD_QUAD_ALIGNED16_H */ diff --git a/src/algorithms/libs/volk_gnsssdr/lib/qa_16s_branch_4_state_8_aligned16.cc b/src/algorithms/libs/volk_gnsssdr/lib/qa_16s_branch_4_state_8_aligned16.cc new file mode 100644 index 000000000..c11a3a203 --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/lib/qa_16s_branch_4_state_8_aligned16.cc @@ -0,0 +1,106 @@ +#include +#include +#include +#include + +//test for ssse3 + +#ifndef LV_HAVE_SSSE3 + +void qa_16s_branch_4_state_8_aligned16::t1() { + printf("ssse3 not available... 
no test performed\n"); +} + +#else + +void qa_16s_branch_4_state_8_aligned16::t1() { + const int num_iters = 1000000; + const int vlen = 32; + + static char permute0[16]__attribute__((aligned(16))) = {0x0e, 0x0f, 0x0a, 0x0b, 0x04, 0x05, 0x00, 0x01, 0x0c, 0x0d, 0x08, 0x09, 0x06, 0x07, 0x02, 0x03}; + static char permute1[16]__attribute__((aligned(16))) = {0x0c, 0x0d, 0x08, 0x09, 0x06, 0x07, 0x02, 0x03, 0x0e, 0x0f, 0x0a, 0x0b, 0x04, 0x05, 0x00, 0x01}; + static char permute2[16]__attribute__((aligned(16))) = {0x02, 0x03, 0x06, 0x07, 0x08, 0x09, 0x0c, 0x0d, 0x00, 0x01, 0x04, 0x05, 0x0a, 0x0b, 0x0e, 0x0f}; + static char permute3[16]__attribute__((aligned(16))) = {0x00, 0x01, 0x04, 0x05, 0x0a, 0x0b, 0x0e, 0x0f, 0x02, 0x03, 0x06, 0x07, 0x08, 0x09, 0x0c, 0x0d}; + static char* permuters[4] = {permute0, permute1, permute2, permute3}; + + unsigned int num_bytes = vlen << 1; + + volk_gnsssdr_environment_init(); + clock_t start, end; + double total; + + __VOLK_ATTR_ALIGNED(16) short target[vlen]; + __VOLK_ATTR_ALIGNED(16) short target2[vlen]; + __VOLK_ATTR_ALIGNED(16) short target3[vlen]; + + __VOLK_ATTR_ALIGNED(16) short src0[vlen]; + __VOLK_ATTR_ALIGNED(16) short permute_indexes[vlen] = { +7, 5, 2, 0, 6, 4, 3, 1, 6, 4, 3, 1, 7, 5, 2, 0, 1, 3, 4, 6, 0, 2, 5, 7, 0, 2, 5, 7, 1, 3, 4, 6 }; + __VOLK_ATTR_ALIGNED(16) short cntl0[vlen] = { + 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }; + __VOLK_ATTR_ALIGNED(16) short cntl1[vlen] = { + 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }; + __VOLK_ATTR_ALIGNED(16) short cntl2[vlen] = { + 0x0000, 0xffff, 0xffff, 0x0000, 
0x0000, 0xffff, 0xffff, 0x0000, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000 }; + __VOLK_ATTR_ALIGNED(16) short cntl3[vlen] = { + 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff }; + __VOLK_ATTR_ALIGNED(16) short scalars[4] = {1, 2, 3, 4}; + + + + for(int i = 0; i < vlen; ++i) { + src0[i] = i; + + } + + + printf("16s_branch_4_state_8_aligned\n"); + + + start = clock(); + for(int i = 0; i < num_iters; ++i) { + volk_gnsssdr_16s_permute_and_scalar_add_aligned16_manual(target, src0, permute_indexes, cntl0, cntl1, cntl2, cntl3, scalars, num_bytes, "sse2"); + } + end = clock(); + + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + + printf("permute_and_scalar_add_time: %f\n", total); + + + + start = clock(); + for(int i = 0; i < num_iters; ++i) { + volk_gnsssdr_16s_branch_4_state_8_aligned16_manual(target2, src0, permuters, cntl2, cntl3, scalars, "ssse3"); + } + end = clock(); + + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + + printf("branch_4_state_8_time, ssse3: %f\n", total); + + start = clock(); + for(int i = 0; i < num_iters; ++i) { + volk_gnsssdr_16s_branch_4_state_8_aligned16_manual(target3, src0, permuters, cntl2, cntl3, scalars, "generic"); + } + end = clock(); + + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + + printf("permute_and_scalar_add_time, generic: %f\n", total); + + + + for(int i = 0; i < vlen; ++i) { + printf("psa... %d, b4s8... 
%d\n", target[i], target3[i]); + } + + for(int i = 0; i < vlen; ++i) { + + CPPUNIT_ASSERT(target[i] == target2[i]); + CPPUNIT_ASSERT(target[i] == target3[i]); + } +} + + +#endif diff --git a/src/algorithms/libs/volk_gnsssdr/lib/qa_16s_branch_4_state_8_aligned16.h b/src/algorithms/libs/volk_gnsssdr/lib/qa_16s_branch_4_state_8_aligned16.h new file mode 100644 index 000000000..41ab073e0 --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/lib/qa_16s_branch_4_state_8_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_16S_BRANCH_4_STATE_8_ALIGNED16_H +#define INCLUDED_QA_16S_BRANCH_4_STATE_8_ALIGNED16_H + +#include +#include + +class qa_16s_branch_4_state_8_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_16s_branch_4_state_8_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_16S_BRANCH_4_STATE_8_ALIGNED16_H */ diff --git a/src/algorithms/libs/volk_gnsssdr/lib/qa_16s_permute_and_scalar_add_aligned16.cc b/src/algorithms/libs/volk_gnsssdr/lib/qa_16s_permute_and_scalar_add_aligned16.cc new file mode 100644 index 000000000..74482c490 --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/lib/qa_16s_permute_and_scalar_add_aligned16.cc @@ -0,0 +1,78 @@ +#include +#include +#include +#include +#include + +//test for sse2 + +#ifndef LV_HAVE_SSE2 + +void qa_16s_permute_and_scalar_add_aligned16::t1() { + printf("sse2 not available... 
no test performed\n"); +} + +#else + +void qa_16s_permute_and_scalar_add_aligned16::t1() { + const int vlen = 64; + + unsigned int num_bytes = vlen << 1; + + volk_gnsssdr_environment_init(); + clock_t start, end; + double total; + + __VOLK_ATTR_ALIGNED(16) short target[vlen]; + __VOLK_ATTR_ALIGNED(16) short target2[vlen]; + __VOLK_ATTR_ALIGNED(16) short src0[vlen]; + __VOLK_ATTR_ALIGNED(16) short permute_indexes[vlen]; + __VOLK_ATTR_ALIGNED(16) short cntl0[vlen]; + __VOLK_ATTR_ALIGNED(16) short cntl1[vlen]; + __VOLK_ATTR_ALIGNED(16) short cntl2[vlen]; + __VOLK_ATTR_ALIGNED(16) short cntl3[vlen]; + __VOLK_ATTR_ALIGNED(16) short scalars[4] = {1, 2, 3, 4}; + + for(int i = 0; i < vlen; ++i) { + src0[i] = i; + permute_indexes[i] = (3 * i)%vlen; + cntl0[i] = 0xff; + cntl1[i] = 0xff * (i%2); + cntl2[i] = 0xff * ((i>>1)%2); + cntl3[i] = 0xff * ((i%4) == 3); + } + + printf("16s_permute_and_scalar_add_aligned\n"); + + start = clock(); + for(int i = 0; i < 100000; ++i) { + volk_gnsssdr_16s_permute_and_scalar_add_aligned16_manual(target, src0, permute_indexes, cntl0, cntl1, cntl2, cntl3, scalars, num_bytes, "generic"); + } + end = clock(); + + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + + printf("generic_time: %f\n", total); + + start = clock(); + for(int i = 0; i < 100000; ++i) { + volk_gnsssdr_16s_permute_and_scalar_add_aligned16_manual(target2, src0, permute_indexes, cntl0, cntl1, cntl2, cntl3, scalars, num_bytes, "sse2"); + } + end = clock(); + + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + + printf("sse2_time: %f\n", total); + + + for(int i = 0; i < vlen; ++i) { + //printf("generic... %d, sse2... 
%d\n", target[i], target2[i]); + } + + for(int i = 0; i < vlen; ++i) { + + CPPUNIT_ASSERT(target[i] == target2[i]); + } +} + +#endif diff --git a/src/algorithms/libs/volk_gnsssdr/lib/qa_16s_permute_and_scalar_add_aligned16.h b/src/algorithms/libs/volk_gnsssdr/lib/qa_16s_permute_and_scalar_add_aligned16.h new file mode 100644 index 000000000..3643aeef6 --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/lib/qa_16s_permute_and_scalar_add_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_16S_PERMUTE_AND_SCALAR_ADD_ALIGNED16_H +#define INCLUDED_QA_16S_PERMUTE_AND_SCALAR_ADD_ALIGNED16_H + +#include +#include + +class qa_16s_permute_and_scalar_add_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_16s_permute_and_scalar_add_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_16S_PERMUTE_AND_SCALAR_ADD_ALIGNED16_H */ diff --git a/src/algorithms/libs/volk_gnsssdr/lib/qa_16s_quad_max_star_aligned16.cc b/src/algorithms/libs/volk_gnsssdr/lib/qa_16s_quad_max_star_aligned16.cc new file mode 100644 index 000000000..d3cd803e6 --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/lib/qa_16s_quad_max_star_aligned16.cc @@ -0,0 +1,60 @@ +#include +#include +#include +#include +#include + +//test for sse2 + +#ifndef LV_HAVE_SSE2 + +void qa_16s_quad_max_star_aligned16::t1() { + printf("sse2 not available... 
no test performed\n"); +} + +#else + +void qa_16s_quad_max_star_aligned16::t1() { + const int vlen = 34; + + __VOLK_ATTR_ALIGNED(16) short input0[vlen]; + __VOLK_ATTR_ALIGNED(16) short input1[vlen]; + __VOLK_ATTR_ALIGNED(16) short input2[vlen]; + __VOLK_ATTR_ALIGNED(16) short input3[vlen]; + + __VOLK_ATTR_ALIGNED(16) short output0[vlen]; + __VOLK_ATTR_ALIGNED(16) short output1[vlen]; + + for(int i = 0; i < vlen; ++i) { + short plus0 = (short) (rand() - (RAND_MAX/2)); + short plus1 = (short) (rand() - (RAND_MAX/2)); + short plus2 = (short) (rand() - (RAND_MAX/2)); + short plus3 = (short) (rand() - (RAND_MAX/2)); + + short minus0 = (short) (rand() - (RAND_MAX/2)); + short minus1 = (short) (rand() - (RAND_MAX/2)); + short minus2 = (short) (rand() - (RAND_MAX/2)); + short minus3 = (short) (rand() - (RAND_MAX/2)); + + input0[i] = plus0 - minus0; + input1[i] = plus1 - minus1; + input2[i] = plus2 - minus2; + input3[i] = plus3 - minus3; + } + + volk_gnsssdr_16s_quad_max_star_aligned16_manual(output0, input0, input1, input2, input3, 2*vlen, "generic"); + + volk_gnsssdr_16s_quad_max_star_aligned16_manual(output1, input0, input1, input2, input3, 2*vlen, "sse2"); + + printf("16s_quad_max_star_aligned\n"); + for(int i = 0; i < vlen; ++i) { + printf("generic... %d, sse2... 
%d, inputs: %d, %d, %d, %d\n", output0[i], output1[i], input0[i], input1[i], input2[i], input3[i]); + } + + for(int i = 0; i < vlen; ++i) { + + CPPUNIT_ASSERT_EQUAL(output0[i], output1[i]); + } +} + +#endif diff --git a/src/algorithms/libs/volk_gnsssdr/lib/qa_16s_quad_max_star_aligned16.h b/src/algorithms/libs/volk_gnsssdr/lib/qa_16s_quad_max_star_aligned16.h new file mode 100644 index 000000000..51e77081a --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/lib/qa_16s_quad_max_star_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_16S_QUAD_MAX_STAR_ALIGNED16_H +#define INCLUDED_QA_16S_QUAD_MAX_STAR_ALIGNED16_H + +#include +#include + +class qa_16s_quad_max_star_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_16s_quad_max_star_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_16S_QUAD_MAX_STAR_ALIGNED16_H */ diff --git a/src/algorithms/libs/volk_gnsssdr/lib/qa_32f_fm_detect_aligned16.cc b/src/algorithms/libs/volk_gnsssdr/lib/qa_32f_fm_detect_aligned16.cc new file mode 100644 index 000000000..6c30de171 --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/lib/qa_32f_fm_detect_aligned16.cc @@ -0,0 +1,61 @@ +#include +#include +#include +#include +#include + +//test for sse + +#ifndef LV_HAVE_SSE + +void qa_32f_fm_detect_aligned16::t1() { + printf("sse not available... 
no test performed\n"); +} + +#else + +void qa_32f_fm_detect_aligned16::t1() { + + volk_gnsssdr_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 10000; + __VOLK_ATTR_ALIGNED(16) float input0[vlen]; + + __VOLK_ATTR_ALIGNED(16) float output0[vlen]; + __VOLK_ATTR_ALIGNED(16) float output01[vlen]; + + for(int i = 0; i < vlen; ++i) { + input0[i] = ((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2)); + } + printf("32f_fm_detect_aligned\n"); + + start = clock(); + float save = 0.1; + for(int count = 0; count < ITERS; ++count) { + volk_gnsssdr_32f_fm_detect_aligned16_manual(output0, input0, 1.0, &save, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + save = 0.1; + for(int count = 0; count < ITERS; ++count) { + volk_gnsssdr_32f_fm_detect_aligned16_manual(output01, input0, 1.0, &save, vlen, "sse"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse_time: %f\n", total); + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output0[i], output01[i], fabs(output0[i]) * 1e-4); + } +} + +#endif diff --git a/src/algorithms/libs/volk_gnsssdr/lib/qa_32f_fm_detect_aligned16.h b/src/algorithms/libs/volk_gnsssdr/lib/qa_32f_fm_detect_aligned16.h new file mode 100644 index 000000000..a2680c524 --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/lib/qa_32f_fm_detect_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32F_FM_DETECT_ALIGNED16_H +#define INCLUDED_QA_32F_FM_DETECT_ALIGNED16_H + +#include +#include + +class qa_32f_fm_detect_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32f_fm_detect_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32F_FM_DETECT_ALIGNED16_H */ diff --git a/src/algorithms/libs/volk_gnsssdr/lib/qa_32f_index_max_aligned16.cc b/src/algorithms/libs/volk_gnsssdr/lib/qa_32f_index_max_aligned16.cc new file mode 100644 index 000000000..99ea2bc5d --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/lib/qa_32f_index_max_aligned16.cc @@ -0,0 +1,103 @@ +#include +#include +#include +#include +#include +#include + +#define ERR_DELTA (1e-4) +#define NUM_ITERS 1000000 +#define VEC_LEN 3097 +static float uniform() { + return 2.0 * ((float) rand() / RAND_MAX - 0.5); // uniformly (-1, 1) +} + +static void +random_floats (float *buf, unsigned n) +{ + unsigned int i = 0; + for (; i < n; i++) { + + buf[i] = uniform () * 32767; + + } +} + + +#ifndef LV_HAVE_SSE + +void qa_32f_index_max_aligned16::t1(){ + printf("sse not available... 
no test performed\n"); +} + +#else + + +void qa_32f_index_max_aligned16::t1(){ + + const int vlen = VEC_LEN; + + + volk_gnsssdr_runtime_init(); + + volk_gnsssdr_environment_init(); + int ret; + + unsigned int* target_sse4_1; + unsigned int* target_sse; + unsigned int* target_generic; + float* src0 ; + + + unsigned int i_target_sse4_1; + target_sse4_1 = &i_target_sse4_1; + unsigned int i_target_sse; + target_sse = &i_target_sse; + unsigned int i_target_generic; + target_generic = &i_target_generic; + + ret = posix_memalign((void**)&src0, 16, vlen *sizeof(float)); + + random_floats((float*)src0, vlen); + + printf("32f_index_max_aligned16\n"); + + clock_t start, end; + double total; + + + start = clock(); + for(int k = 0; k < NUM_ITERS; ++k) { + volk_gnsssdr_32f_index_max_aligned16_manual(target_generic, src0, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic time: %f\n", total); + + start = clock(); + for(int k = 0; k < NUM_ITERS; ++k) { + volk_gnsssdr_32f_index_max_aligned16_manual(target_sse, src0, vlen, "sse2"); + } + + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse time: %f\n", total); + + start = clock(); + for(int k = 0; k < NUM_ITERS; ++k) { + get_volk_gnsssdr_runtime()->volk_gnsssdr_32f_index_max_aligned16(target_sse4_1, src0, vlen); + } + + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse4.1 time: %f\n", total); + + + printf("generic: %u, sse: %u, sse4.1: %u\n", target_generic[0], target_sse[0], target_sse4_1[0]); + CPPUNIT_ASSERT_EQUAL(target_generic[0], target_sse[0]); + CPPUNIT_ASSERT_EQUAL(target_generic[0], target_sse4_1[0]); + + free(src0); +} + +#endif /*LV_HAVE_SSE3*/ diff --git a/src/algorithms/libs/volk_gnsssdr/lib/qa_32f_index_max_aligned16.h b/src/algorithms/libs/volk_gnsssdr/lib/qa_32f_index_max_aligned16.h new file mode 100644 index 000000000..8cadffa47 --- /dev/null +++ 
b/src/algorithms/libs/volk_gnsssdr/lib/qa_32f_index_max_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32F_INDEX_MAX_ALIGNED16_H +#define INCLUDED_QA_32F_INDEX_MAX_ALIGNED16_H + +#include +#include + +class qa_32f_index_max_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32f_index_max_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32F_INDEX_MAX_ALIGNED16_H */ diff --git a/src/algorithms/libs/volk_gnsssdr/lib/qa_32fc_index_max_aligned16.cc b/src/algorithms/libs/volk_gnsssdr/lib/qa_32fc_index_max_aligned16.cc new file mode 100644 index 000000000..aa5f7165d --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/lib/qa_32fc_index_max_aligned16.cc @@ -0,0 +1,89 @@ +#include +#include +#include +#include +#include + +#define ERR_DELTA (1e-4) +#define NUM_ITERS 1000000 +#define VEC_LEN 3096 +static float uniform() { + return 2.0 * ((float) rand() / RAND_MAX - 0.5); // uniformly (-1, 1) +} + +static void +random_floats (float *buf, unsigned n) +{ + unsigned int i = 0; + for (; i < n; i++) { + + buf[i] = uniform () * 32767; + + } +} + + +#ifndef LV_HAVE_SSE3 + +void qa_32fc_index_max_aligned16::t1(){ + printf("sse3 not available... 
no test performed\n"); +} + +#else + + +void qa_32fc_index_max_aligned16::t1(){ + + const int vlen = VEC_LEN; + + volk_gnsssdr_environment_init(); + int ret; + + unsigned int* target; + unsigned int* target_generic; + std::complex* src0 ; + + + unsigned int i_target; + target = &i_target; + unsigned int i_target_generic; + target_generic = &i_target_generic; + ret = posix_memalign((void**)&src0, 16, vlen << 3); + + random_floats((float*)src0, vlen * 2); + + printf("32fc_index_max_aligned16\n"); + + clock_t start, end; + double total; + + + start = clock(); + for(int k = 0; k < NUM_ITERS; ++k) { + volk_gnsssdr_32fc_index_max_aligned16_manual(target_generic, src0, vlen << 3, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic time: %f\n", total); + + start = clock(); + for(int k = 0; k < NUM_ITERS; ++k) { + volk_gnsssdr_32fc_index_max_aligned16_manual(target, src0, vlen << 3, "sse3"); + } + + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse3 time: %f\n", total); + + + + + printf("generic: %u, sse3: %u\n", target_generic[0], target[0]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(target_generic[0], target[0], 1.1); + + + + free(src0); +} + +#endif /*LV_HAVE_SSE3*/ diff --git a/src/algorithms/libs/volk_gnsssdr/lib/qa_32fc_index_max_aligned16.h b/src/algorithms/libs/volk_gnsssdr/lib/qa_32fc_index_max_aligned16.h new file mode 100644 index 000000000..0990bcb1f --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/lib/qa_32fc_index_max_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32FC_INDEX_MAX_ALIGNED16_H +#define INCLUDED_QA_32FC_INDEX_MAX_ALIGNED16_H + +#include +#include + +class qa_32fc_index_max_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32fc_index_max_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32FC_INDEX_MAX_ALIGNED16_H */ diff --git 
a/src/algorithms/libs/volk_gnsssdr/lib/qa_32fc_power_spectral_density_32f_aligned16.cc b/src/algorithms/libs/volk_gnsssdr/lib/qa_32fc_power_spectral_density_32f_aligned16.cc new file mode 100644 index 000000000..9467ff973 --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/lib/qa_32fc_power_spectral_density_32f_aligned16.cc @@ -0,0 +1,64 @@ +#include +#include +#include +#include +#include + +//test for sse3 + +#ifndef LV_HAVE_SSE3 + +void qa_32fc_power_spectral_density_32f_aligned16::t1() { + printf("sse3 not available... no test performed\n"); +} + +#else + +void qa_32fc_power_spectral_density_32f_aligned16::t1() { + + volk_gnsssdr_environment_init(); + clock_t start, end; + double total; + const int vlen = 3201; + const int ITERS = 10000; + __VOLK_ATTR_ALIGNED(16) std::complex input0[vlen]; + + __VOLK_ATTR_ALIGNED(16) float output_generic[vlen]; + __VOLK_ATTR_ALIGNED(16) float output_sse3[vlen]; + + const float scalar = vlen; + const float rbw = 1.7; + + float* inputLoad = (float*)input0; + for(int i = 0; i < 2*vlen; ++i) { + inputLoad[i] = (((float) (rand() - (RAND_MAX/2))) / static_cast((RAND_MAX/2))); + } + printf("32fc_power_spectral_density_32f_aligned\n"); + + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_gnsssdr_32fc_power_spectral_density_32f_aligned16_manual(output_generic, input0, scalar, rbw, vlen, "generic"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("generic_time: %f\n", total); + start = clock(); + for(int count = 0; count < ITERS; ++count) { + volk_gnsssdr_32fc_power_spectral_density_32f_aligned16_manual(output_sse3, input0, scalar, rbw, vlen, "sse3"); + } + end = clock(); + total = (double)(end-start)/(double)CLOCKS_PER_SEC; + printf("sse3_time: %f\n", total); + + for(int i = 0; i < 1; ++i) { + //printf("inputs: %d, %d\n", input0[i*2], input0[i*2 + 1]); + //printf("generic... %d, ssse3... 
%d\n", output0[i], output1[i]); + } + + for(int i = 0; i < vlen; ++i) { + //printf("%d...%d\n", output0[i], output01[i]); + CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[i], output_sse3[i], fabs(output_generic[i]*1e-4)); + } +} + +#endif diff --git a/src/algorithms/libs/volk_gnsssdr/lib/qa_32fc_power_spectral_density_32f_aligned16.h b/src/algorithms/libs/volk_gnsssdr/lib/qa_32fc_power_spectral_density_32f_aligned16.h new file mode 100644 index 000000000..26f430bec --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/lib/qa_32fc_power_spectral_density_32f_aligned16.h @@ -0,0 +1,18 @@ +#ifndef INCLUDED_QA_32FC_POWER_SPECTRAL_DENSITY_32F_ALIGNED16_H +#define INCLUDED_QA_32FC_POWER_SPECTRAL_DENSITY_32F_ALIGNED16_H + +#include +#include + +class qa_32fc_power_spectral_density_32f_aligned16 : public CppUnit::TestCase { + + CPPUNIT_TEST_SUITE (qa_32fc_power_spectral_density_32f_aligned16); + CPPUNIT_TEST (t1); + CPPUNIT_TEST_SUITE_END (); + + private: + void t1 (); +}; + + +#endif /* INCLUDED_QA_32FC_POWER_SPECTRAL_DENSITY_32F_ALIGNED16_H */ diff --git a/src/algorithms/libs/volk_gnsssdr/lib/qa_utils.cc b/src/algorithms/libs/volk_gnsssdr/lib/qa_utils.cc new file mode 100644 index 000000000..14a9de7ef --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/lib/qa_utils.cc @@ -0,0 +1,720 @@ +#include "qa_utils.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +float uniform() { + return 2.0 * ((float) rand() / RAND_MAX - 0.5); // uniformly (-1, 1) +} + +template +void random_floats (t *buf, unsigned n) +{ + for (unsigned i = 0; i < n; i++) + buf[i] = uniform (); +} + +void load_random_data(void *data, volk_gnsssdr_type_t type, unsigned int n) { + if(type.is_complex) n *= 2; + if(type.is_float) { + if(type.size == 8) random_floats((double *)data, n); + else random_floats((float *)data, n); + } else { + float int_max = 
float(uint64_t(2) << (type.size*8)); + if(type.is_signed) int_max /= 2.0; + for(unsigned int i=0; i((RAND_MAX/2))) * int_max; + //man i really don't know how to do this in a more clever way, you have to cast down at some point + switch(type.size) { + case 8: + if(type.is_signed) ((int64_t *)data)[i] = (int64_t) scaled_rand; + else ((uint64_t *)data)[i] = (uint64_t) scaled_rand; + break; + case 4: + if(type.is_signed) ((int32_t *)data)[i] = (int32_t) scaled_rand; + else ((uint32_t *)data)[i] = (uint32_t) scaled_rand; + break; + case 2: + if(type.is_signed) ((int16_t *)data)[i] = (int16_t) scaled_rand; + else ((uint16_t *)data)[i] = (uint16_t) scaled_rand; + break; + case 1: + if(type.is_signed) ((int8_t *)data)[i] = (int8_t) scaled_rand; + else ((uint8_t *)data)[i] = (uint8_t) scaled_rand; + break; + default: + throw "load_random_data: no support for data size > 8 or < 1"; //no shenanigans here + } + } + } +} + +static std::vector get_arch_list(volk_gnsssdr_func_desc_t desc) { + std::vector archlist; + + for(size_t i = 0; i < desc.n_impls; i++) { + //if(!(archs[i+1] & volk_gnsssdr_get_lvarch())) continue; //this arch isn't available on this pc + archlist.push_back(std::string(desc.impl_names[i])); + } + + return archlist; +} + +volk_gnsssdr_type_t volk_gnsssdr_type_from_string(std::string name) { + volk_gnsssdr_type_t type; + type.is_float = false; + type.is_scalar = false; + type.is_complex = false; + type.is_signed = false; + type.size = 0; + type.str = name; + + if(name.size() < 2) throw std::string("name too short to be a datatype"); + + //is it a scalar? 
+ if(name[0] == 's') { + type.is_scalar = true; + name = name.substr(1, name.size()-1); + } + + //get the data size + size_t last_size_pos = name.find_last_of("0123456789"); + if(last_size_pos == std::string::npos) + throw std::string("no size spec in type ").append(name); + //will throw if malformed + int size = boost::lexical_cast(name.substr(0, last_size_pos+1)); + + assert(((size % 8) == 0) && (size <= 64) && (size != 0)); + type.size = size/8; //in bytes + + for(size_t i=last_size_pos+1; i < name.size(); i++) { + switch (name[i]) { + case 'f': + type.is_float = true; + break; + case 'i': + type.is_signed = true; + break; + case 'c': + type.is_complex = true; + break; + case 'u': + type.is_signed = false; + break; + default: + throw; + } + } + + return type; +} + +static void get_signatures_from_name(std::vector &inputsig, + std::vector &outputsig, + std::string name) { + boost::char_separator sep("_"); + boost::tokenizer > tok(name, sep); + std::vector toked; + tok.assign(name); + toked.assign(tok.begin(), tok.end()); + assert(toked[0] == "volk"); + toked.erase(toked.begin()); + toked.erase(toked.begin()); + + //ok. we're assuming a string in the form + //(sig)_(multiplier-opt)_..._(name)_(sig)_(multiplier-opt)_..._(alignment) + + enum { SIDE_INPUT, SIDE_NAME, SIDE_OUTPUT } side = SIDE_INPUT; + std::string fn_name; + volk_gnsssdr_type_t type; + BOOST_FOREACH(std::string token, toked) { + try { + type = volk_gnsssdr_type_from_string(token); + if(side == SIDE_NAME) side = SIDE_OUTPUT; //if this is the first one after the name... 
+ + if(side == SIDE_INPUT) inputsig.push_back(type); + else outputsig.push_back(type); + } catch (...){ + if(token[0] == 'x') { //it's a multiplier + if(side == SIDE_INPUT) assert(inputsig.size() > 0); + else assert(outputsig.size() > 0); + int multiplier = boost::lexical_cast(token.substr(1, token.size()-1)); //will throw if invalid + for(int i=1; i &buffs, unsigned int vlen, unsigned int iter, std::string arch) { + while(iter--) func(buffs[0], vlen, arch.c_str()); +} + +inline void run_cast_test2(volk_gnsssdr_fn_2arg func, std::vector &buffs, unsigned int vlen, unsigned int iter, std::string arch) { + while(iter--) func(buffs[0], buffs[1], vlen, arch.c_str()); +} + +inline void run_cast_test3(volk_gnsssdr_fn_3arg func, std::vector &buffs, unsigned int vlen, unsigned int iter, std::string arch) { + while(iter--) func(buffs[0], buffs[1], buffs[2], vlen, arch.c_str()); +} + +inline void run_cast_test4(volk_gnsssdr_fn_4arg func, std::vector &buffs, unsigned int vlen, unsigned int iter, std::string arch) { + while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], vlen, arch.c_str()); +} + +inline void run_cast_test1_s32f(volk_gnsssdr_fn_1arg_s32f func, std::vector &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) { + while(iter--) func(buffs[0], scalar, vlen, arch.c_str()); +} + +inline void run_cast_test2_s32f(volk_gnsssdr_fn_2arg_s32f func, std::vector &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) { + while(iter--) func(buffs[0], buffs[1], scalar, vlen, arch.c_str()); +} + +inline void run_cast_test3_s32f(volk_gnsssdr_fn_3arg_s32f func, std::vector &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) { + while(iter--) func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str()); +} + +inline void run_cast_test1_s32fc(volk_gnsssdr_fn_1arg_s32fc func, std::vector &buffs, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { + while(iter--) func(buffs[0], 
scalar, vlen, arch.c_str()); +} + +inline void run_cast_test2_s32fc(volk_gnsssdr_fn_2arg_s32fc func, std::vector &buffs, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { + while(iter--) func(buffs[0], buffs[1], scalar, vlen, arch.c_str()); +} + +inline void run_cast_test3_s32fc(volk_gnsssdr_fn_3arg_s32fc func, std::vector &buffs, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { + while(iter--) func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str()); +} + +//ADDED BY GNSS-SDR. START +inline void run_cast_test1_s8i(volk_gnsssdr_fn_1arg_s8i func, std::vector &buffs, char scalar, unsigned int vlen, unsigned int iter, std::string arch) { + while(iter--) func(buffs[0], scalar, vlen, arch.c_str()); +} + +inline void run_cast_test2_s8i(volk_gnsssdr_fn_2arg_s8i func, std::vector &buffs, char scalar, unsigned int vlen, unsigned int iter, std::string arch) { + while(iter--) func(buffs[0], buffs[1], scalar, vlen, arch.c_str()); +} + +inline void run_cast_test3_s8i(volk_gnsssdr_fn_3arg_s8i func, std::vector &buffs, char scalar, unsigned int vlen, unsigned int iter, std::string arch) { + while(iter--) func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str()); +} + +inline void run_cast_test1_s8ic(volk_gnsssdr_fn_1arg_s8ic func, std::vector &buffs, lv_8sc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { + while(iter--) func(buffs[0], scalar, vlen, arch.c_str()); +} + +inline void run_cast_test2_s8ic(volk_gnsssdr_fn_2arg_s8ic func, std::vector &buffs, lv_8sc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { + while(iter--) func(buffs[0], buffs[1], scalar, vlen, arch.c_str()); +} + +inline void run_cast_test3_s8ic(volk_gnsssdr_fn_3arg_s8ic func, std::vector &buffs, lv_8sc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { + while(iter--) func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str()); +} + +inline void run_cast_test8(volk_gnsssdr_fn_8arg func, 
std::vector &buffs, unsigned int vlen, unsigned int iter, std::string arch) { + while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], vlen, arch.c_str()); +} + +inline void run_cast_test8_s8i(volk_gnsssdr_fn_8arg_s8i func, std::vector &buffs, char scalar, unsigned int vlen, unsigned int iter, std::string arch) { + while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], scalar, vlen, arch.c_str()); +} + +inline void run_cast_test8_s8ic(volk_gnsssdr_fn_8arg_s8ic func, std::vector &buffs, lv_8sc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { + while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], scalar, vlen, arch.c_str()); +} + +inline void run_cast_test8_s32f(volk_gnsssdr_fn_8arg_s32f func, std::vector &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) { + while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], scalar, vlen, arch.c_str()); +} + +inline void run_cast_test8_s32fc(volk_gnsssdr_fn_8arg_s32fc func, std::vector &buffs, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { + while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], scalar, vlen, arch.c_str()); +} + +inline void run_cast_test12(volk_gnsssdr_fn_12arg func, std::vector &buffs, unsigned int vlen, unsigned int iter, std::string arch) { + while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], buffs[8], buffs[9], buffs[10], buffs[11], vlen, arch.c_str()); +} + +inline void run_cast_test12_s8i(volk_gnsssdr_fn_12arg_s8i func, std::vector &buffs, char scalar, unsigned int vlen, unsigned int iter, std::string arch) { + while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], buffs[8], buffs[9], buffs[10], buffs[11], scalar, vlen, 
arch.c_str()); +} + +inline void run_cast_test12_s8ic(volk_gnsssdr_fn_12arg_s8ic func, std::vector &buffs, lv_8sc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { + while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], buffs[8], buffs[9], buffs[10], buffs[11], scalar, vlen, arch.c_str()); +} + +inline void run_cast_test12_s32f(volk_gnsssdr_fn_12arg_s32f func, std::vector &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) { + while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], buffs[8], buffs[9], buffs[10], buffs[11], scalar, vlen, arch.c_str()); +} + +inline void run_cast_test12_s32fc(volk_gnsssdr_fn_12arg_s32fc func, std::vector &buffs, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { + while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], buffs[8], buffs[9], buffs[10], buffs[11], scalar, vlen, arch.c_str()); +} +//ADDED BY GNSS-SDR. END + +// This function is a nop that helps resolve GNU Radio bugs 582 and 583. +// Without this the cast in run_volk_gnsssdr_tests for tol_i = static_cast(float tol) +// won't happen on armhf (reported on cortex A9 and A15). 
+void lv_force_cast_hf( int tol_i, float tol_f) +{ + int diff_i = 1; + float diff_f = 1; + if( diff_i > tol_i ) + std::cout << "" ; + if( diff_f > tol_f ) + std::cout << "" ; +} + +template +bool fcompare(t *in1, t *in2, unsigned int vlen, float tol) { + bool fail = false; + int print_max_errs = 10; + for(unsigned int i=0; i tol ) + { + fail=true; + if(print_max_errs-- > 0) { + std::cout << "offset " << i << " in1: " << t(((t *)(in1))[i]) << " in2: " << t(((t *)(in2))[i]) << std::endl; + } + } + } + // the primary test is the percent different greater than given tol + else if(fabs(((t *)(in1))[i] - ((t *)(in2))[i])/(((t *)in1)[i]) > tol) { + fail=true; + if(print_max_errs-- > 0) { + std::cout << "offset " << i << " in1: " << t(((t *)(in1))[i]) << " in2: " << t(((t *)(in2))[i]) << std::endl; + } + } + } + + return fail; +} + +template +bool ccompare(t *in1, t *in2, unsigned int vlen, float tol) { + bool fail = false; + int print_max_errs = 10; + for(unsigned int i=0; i<2*vlen; i+=2) { + t diff[2] = { in1[i] - in2[i], in1[i+1] - in2[i+1] }; + t err = std::sqrt(diff[0] * diff[0] + diff[1] * diff[1]); + t norm = std::sqrt(in1[i] * in1[i] + in1[i+1] * in1[i+1]); + + // for very small numbers we'll see round off errors due to limited + // precision. So a special test case... 
+ if (norm < 1e-30) { + if (err > tol) + { + fail=true; + if(print_max_errs-- > 0) { + std::cout << "offset " << i/2 << " in1: " << in1[i] << " + " << in1[i+1] << "j in2: " << in2[i] << " + " << in2[i+1] << "j" << std::endl; + } + } + } + // the primary test is the percent different greater than given tol + else if((err / norm) > tol) { + fail=true; + if(print_max_errs-- > 0) { + std::cout << "offset " << i/2 << " in1: " << in1[i] << " + " << in1[i+1] << "j in2: " << in2[i] << " + " << in2[i+1] << "j" << std::endl; + } + } + } + + return fail; +} + +template +bool icompare(t *in1, t *in2, unsigned int vlen, unsigned int tol) { + bool fail = false; + int print_max_errs = 10; + for(unsigned int i=0; i tol) { + fail=true; + if(print_max_errs-- > 0) { + std::cout << "offset " << i << " in1: " << static_cast(t(((t *)(in1))[i])) << " in2: " << static_cast(t(((t *)(in2))[i])) << std::endl; + } + } + } + + return fail; +} + +class volk_gnsssdr_qa_aligned_mem_pool{ +public: + void *get_new(size_t size){ + size_t alignment = volk_gnsssdr_get_alignment(); + void* ptr = volk_gnsssdr_malloc(size, alignment); + memset(ptr, 0x00, size); + _mems.push_back(ptr); + return ptr; + } + ~volk_gnsssdr_qa_aligned_mem_pool() { + for(unsigned int ii = 0; ii < _mems.size(); ++ii) { + volk_gnsssdr_free(_mems[ii]); + } + } +private: std::vector _mems; +}; + +bool run_volk_gnsssdr_tests(volk_gnsssdr_func_desc_t desc, + void (*manual_func)(), + std::string name, + float tol, + lv_32fc_t scalar, + int vlen, + int iter, + std::vector *results, + std::string puppet_master_name, + bool benchmark_mode, + std::string kernel_regex + ) { + boost::xpressive::sregex kernel_expression = boost::xpressive::sregex::compile(kernel_regex); + if( !boost::xpressive::regex_search(name, kernel_expression) ) { + // in this case we have a regex and are only looking to test one kernel + return false; + } + if(results) { + results->push_back(volk_gnsssdr_test_results_t()); + results->back().name = name; + 
results->back().vlen = vlen; + results->back().iter = iter; + } + std::cout << "RUN_VOLK_TESTS: " << name << "(" << vlen << "," << iter << ")" << std::endl; + + // The multiply and lv_force_cast_hf are work arounds for GNU Radio bugs 582 and 583 + // The bug is the casting/assignment below do not happen, which results in false + // positives when testing for errors in fcompare and icompare. + // Since this only happens on armhf (reported for Cortex A9 and A15) combined with + // the following fixes it is suspected to be a compiler bug. + // Bug 1272024 on launchpad has been filed with Linaro GCC. + const float tol_f = tol*1.0000001; + const unsigned int tol_i = static_cast(tol); + lv_force_cast_hf( tol_i, tol_f ); + + //first let's get a list of available architectures for the test + std::vector arch_list = get_arch_list(desc); + + if((!benchmark_mode) && (arch_list.size() < 2)) { + std::cout << "no architectures to test" << std::endl; + return false; + } + + //something that can hang onto memory and cleanup when this function exits + volk_gnsssdr_qa_aligned_mem_pool mem_pool; + + //now we have to get a function signature by parsing the name + std::vector inputsig, outputsig; + get_signatures_from_name(inputsig, outputsig, name); + + //pull the input scalars into their own vector + std::vector inputsc; + for(size_t i=0; i inbuffs; + BOOST_FOREACH(volk_gnsssdr_type_t sig, inputsig) { + if(!sig.is_scalar) //we don't make buffers for scalars + inbuffs.push_back(mem_pool.get_new(vlen*sig.size*(sig.is_complex ? 
2 : 1))); + } + for(size_t i=0; i > test_data; + for(size_t i=0; i arch_buffs; + for(size_t j=0; j both_sigs; + both_sigs.insert(both_sigs.end(), outputsig.begin(), outputsig.end()); + both_sigs.insert(both_sigs.end(), inputsig.begin(), inputsig.end()); + + //now run the test + clock_t start, end; + std::vector profile_times; + for(size_t i = 0; i < arch_list.size(); i++) { + start = clock(); + + switch(both_sigs.size()) { + case 1: + if(inputsc.size() == 0) { + run_cast_test1((volk_gnsssdr_fn_1arg)(manual_func), test_data[i], vlen, iter, arch_list[i]); + } else if(inputsc.size() == 1 && inputsc[0].is_float) { + if(inputsc[0].is_complex) { + run_cast_test1_s32fc((volk_gnsssdr_fn_1arg_s32fc)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); + } else { + run_cast_test1_s32f((volk_gnsssdr_fn_1arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); + } + } + //ADDED BY GNSS-SDR. START + else if(inputsc.size() == 1 && !inputsc[0].is_float) { + if(inputsc[0].is_complex) { + run_cast_test1_s8ic((volk_gnsssdr_fn_1arg_s8ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); + } else { + run_cast_test1_s8i((volk_gnsssdr_fn_1arg_s8i)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); + } + } + //ADDED BY GNSS-SDR. END + else throw "unsupported 1 arg function >1 scalars"; + break; + case 2: + if(inputsc.size() == 0) { + run_cast_test2((volk_gnsssdr_fn_2arg)(manual_func), test_data[i], vlen, iter, arch_list[i]); + } else if(inputsc.size() == 1 && inputsc[0].is_float) { + if(inputsc[0].is_complex) { + run_cast_test2_s32fc((volk_gnsssdr_fn_2arg_s32fc)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); + } else { + run_cast_test2_s32f((volk_gnsssdr_fn_2arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); + } + } + //ADDED BY GNSS-SDR. 
START + else if(inputsc.size() == 1 && !inputsc[0].is_float) { + if(inputsc[0].is_complex) { + run_cast_test2_s8ic((volk_gnsssdr_fn_2arg_s8ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); + } else { + run_cast_test2_s8i((volk_gnsssdr_fn_2arg_s8i)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); + } + } + //ADDED BY GNSS-SDR. END + else throw "unsupported 2 arg function >1 scalars"; + break; + case 3: + if(inputsc.size() == 0) { + run_cast_test3((volk_gnsssdr_fn_3arg)(manual_func), test_data[i], vlen, iter, arch_list[i]); + } else if(inputsc.size() == 1 && inputsc[0].is_float) { + if(inputsc[0].is_complex) { + run_cast_test3_s32fc((volk_gnsssdr_fn_3arg_s32fc)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); + } else { + run_cast_test3_s32f((volk_gnsssdr_fn_3arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); + } + } + //ADDED BY GNSS-SDR. START + else if(inputsc.size() == 1 && !inputsc[0].is_float) { + if(inputsc[0].is_complex) { + run_cast_test3_s8ic((volk_gnsssdr_fn_3arg_s8ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); + } else { + run_cast_test3_s8i((volk_gnsssdr_fn_3arg_s8i)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); + } + } + //ADDED BY GNSS-SDR. END + else throw "unsupported 3 arg function >1 scalars"; + break; + case 4: + run_cast_test4((volk_gnsssdr_fn_4arg)(manual_func), test_data[i], vlen, iter, arch_list[i]); + break; + //ADDED BY GNSS-SDR. 
START + case 8: + if(inputsc.size() == 0) { + run_cast_test8((volk_gnsssdr_fn_8arg)(manual_func), test_data[i], vlen, iter, arch_list[i]); + } else if(inputsc.size() == 1 && inputsc[0].is_float) { + if(inputsc[0].is_complex) { + run_cast_test8_s32fc((volk_gnsssdr_fn_8arg_s32fc)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); + } else { + run_cast_test8_s32f((volk_gnsssdr_fn_8arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); + } + } + else if(inputsc.size() == 1 && !inputsc[0].is_float) { + if(inputsc[0].is_complex) { + run_cast_test8_s8ic((volk_gnsssdr_fn_8arg_s8ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); + } else { + run_cast_test8_s8i((volk_gnsssdr_fn_8arg_s8i)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); + } + } + else throw "unsupported 8 arg function >1 scalars"; + break; + case 12: + if(inputsc.size() == 0) { + run_cast_test12((volk_gnsssdr_fn_12arg)(manual_func), test_data[i], vlen, iter, arch_list[i]); + } else if(inputsc.size() == 1 && inputsc[0].is_float) { + if(inputsc[0].is_complex) { + run_cast_test12_s32fc((volk_gnsssdr_fn_12arg_s32fc)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); + } else { + run_cast_test12_s32f((volk_gnsssdr_fn_12arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); + } + } + else if(inputsc.size() == 1 && !inputsc[0].is_float) { + if(inputsc[0].is_complex) { + run_cast_test12_s8ic((volk_gnsssdr_fn_12arg_s8ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); + } else { + run_cast_test12_s8i((volk_gnsssdr_fn_12arg_s8i)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); + } + } + else throw "unsupported 12 arg function >1 scalars"; + break; + //ADDED BY GNSS-SDR. 
END + default: + throw "no function handler for this signature"; + break; + } + + end = clock(); + double arch_time = 1000.0 * (double)(end-start)/(double)CLOCKS_PER_SEC; + std::cout << arch_list[i] << " completed in " << arch_time << "ms" << std::endl; + if(results) { + volk_gnsssdr_test_time_t result; + result.name = arch_list[i]; + result.time = arch_time; + result.units = "ms"; + results->back().results[result.name] = result; + } + + profile_times.push_back(arch_time); + } + + //and now compare each output to the generic output + //first we have to know which output is the generic one, they aren't in order... + size_t generic_offset=0; + for(size_t i=0; i arch_results; + for(size_t i=0; i::max(); + double best_time_u = std::numeric_limits::max(); + std::string best_arch_a = "generic"; + std::string best_arch_u = "generic"; + for(size_t i=0; i < arch_list.size(); i++) + { + if((profile_times[i] < best_time_u) && arch_results[i] && desc.impl_alignment[i] == 0) + { + best_time_u = profile_times[i]; + best_arch_u = arch_list[i]; + } + if((profile_times[i] < best_time_a) && arch_results[i]) + { + best_time_a = profile_times[i]; + best_arch_a = arch_list[i]; + } + } + + std::cout << "Best aligned arch: " << best_arch_a << std::endl; + std::cout << "Best unaligned arch: " << best_arch_u << std::endl; + if(results) { + if(puppet_master_name == "NULL") { + results->back().config_name = name; + } else { + results->back().config_name = puppet_master_name; + } + results->back().best_arch_a = best_arch_a; + results->back().best_arch_u = best_arch_u; + } + + return fail_global; +} + + diff --git a/src/algorithms/libs/volk_gnsssdr/lib/qa_utils.h b/src/algorithms/libs/volk_gnsssdr/lib/qa_utils.h new file mode 100644 index 000000000..5e3ff7e88 --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/lib/qa_utils.h @@ -0,0 +1,102 @@ +#ifndef VOLK_QA_UTILS_H +#define VOLK_QA_UTILS_H + +#include +#include +#include +#include +#include +#include +#include +#include + +struct 
volk_gnsssdr_type_t { + bool is_float; + bool is_scalar; + bool is_signed; + bool is_complex; + int size; + std::string str; +}; + +volk_gnsssdr_type_t volk_gnsssdr_type_from_string(std::string); + +float uniform(void); +void random_floats(float *buf, unsigned n); + +class volk_gnsssdr_test_time_t { + public: + std::string name; + double time; + std::string units; +}; + +class volk_gnsssdr_test_results_t { + public: + std::string name; + std::string config_name; + int vlen; + int iter; + std::map results; + std::string best_arch_a; + std::string best_arch_u; +}; + +bool run_volk_gnsssdr_tests( + volk_gnsssdr_func_desc_t, + void(*)(), + std::string, + float, + lv_32fc_t, + int, + int, + std::vector *results = NULL, + std::string puppet_master_name = "NULL", + bool benchmark_mode=false, + std::string kernel_regex="" + ); + + +#define VOLK_RUN_TESTS(func, tol, scalar, len, iter) \ + BOOST_AUTO_TEST_CASE(func##_test) { \ + BOOST_CHECK_EQUAL(run_volk_gnsssdr_tests( \ + func##_get_func_desc(), (void (*)())func##_manual, \ + std::string(#func), tol, scalar, len, iter, 0, "NULL"), \ + 0); \ + } +#define VOLK_PROFILE(func, tol, scalar, len, iter, results, bnmode, kernel_regex) run_volk_gnsssdr_tests(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), tol, scalar, len, iter, results, "NULL", bnmode, kernel_regex) +#define VOLK_PUPPET_PROFILE(func, puppet_master_func, tol, scalar, len, iter, results, bnmode, kernel_regex) run_volk_gnsssdr_tests(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), tol, scalar, len, iter, results, std::string(#puppet_master_func), bnmode, kernel_regex) +typedef void (*volk_gnsssdr_fn_1arg)(void *, unsigned int, const char*); //one input, operate in place +typedef void (*volk_gnsssdr_fn_2arg)(void *, void *, unsigned int, const char*); +typedef void (*volk_gnsssdr_fn_3arg)(void *, void *, void *, unsigned int, const char*); +typedef void (*volk_gnsssdr_fn_4arg)(void *, void *, void *, void *, unsigned int, 
const char*); +typedef void (*volk_gnsssdr_fn_1arg_s32f)(void *, float, unsigned int, const char*); //one input vector, one scalar float input +typedef void (*volk_gnsssdr_fn_2arg_s32f)(void *, void *, float, unsigned int, const char*); +typedef void (*volk_gnsssdr_fn_3arg_s32f)(void *, void *, void *, float, unsigned int, const char*); +typedef void (*volk_gnsssdr_fn_1arg_s32fc)(void *, lv_32fc_t, unsigned int, const char*); //one input vector, one scalar float input +typedef void (*volk_gnsssdr_fn_2arg_s32fc)(void *, void *, lv_32fc_t, unsigned int, const char*); +typedef void (*volk_gnsssdr_fn_3arg_s32fc)(void *, void *, void *, lv_32fc_t, unsigned int, const char*); + +//ADDED BY GNSS-SDR. START +typedef void (*volk_gnsssdr_fn_1arg_s8i)(void *, char, unsigned int, const char*); //one input vector, one scalar char input +typedef void (*volk_gnsssdr_fn_2arg_s8i)(void *, void *, char, unsigned int, const char*); +typedef void (*volk_gnsssdr_fn_3arg_s8i)(void *, void *, void *, char, unsigned int, const char*); +typedef void (*volk_gnsssdr_fn_1arg_s8ic)(void *, lv_8sc_t, unsigned int, const char*); //one input vector, one scalar lv_8sc_t vector input +typedef void (*volk_gnsssdr_fn_2arg_s8ic)(void *, void *, lv_8sc_t, unsigned int, const char*); +typedef void (*volk_gnsssdr_fn_3arg_s8ic)(void *, void *, void *, lv_8sc_t, unsigned int, const char*); + +typedef void (*volk_gnsssdr_fn_8arg)(void *, void *, void *, void *, void *, void *, void *, void *, unsigned int, const char*); +typedef void (*volk_gnsssdr_fn_8arg_s32f)(void *, void *, void *, void *, void *, void *, void *, void *, float, unsigned int, const char*); +typedef void (*volk_gnsssdr_fn_8arg_s32fc)(void *, void *, void *, void *, void *, void *, void *, void *, lv_32fc_t, unsigned int, const char*); +typedef void (*volk_gnsssdr_fn_8arg_s8i)(void *, void *, void *, void *, void *, void *, void *, void *, char, unsigned int, const char*); +typedef void (*volk_gnsssdr_fn_8arg_s8ic)(void *, void *, void *, 
void *, void *, void *, void *, void *, lv_8sc_t, unsigned int, const char*); + +typedef void (*volk_gnsssdr_fn_12arg)(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, unsigned int, const char*); +typedef void (*volk_gnsssdr_fn_12arg_s32f)(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, float, unsigned int, const char*); +typedef void (*volk_gnsssdr_fn_12arg_s32fc)(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, lv_32fc_t, unsigned int, const char*); +typedef void (*volk_gnsssdr_fn_12arg_s8i)(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, char, unsigned int, const char*); +typedef void (*volk_gnsssdr_fn_12arg_s8ic)(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, lv_8sc_t, unsigned int, const char*); +//ADDED BY GNSS-SDR. END + + +#endif //VOLK_QA_UTILS_H diff --git a/src/algorithms/libs/volk_gnsssdr/lib/testqa.cc b/src/algorithms/libs/volk_gnsssdr/lib/testqa.cc new file mode 100644 index 000000000..45397c87e --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/lib/testqa.cc @@ -0,0 +1,90 @@ +/* -*- c++ -*- */ +/* + * Copyright 2012-2014 Free Software Foundation, Inc. + * + * This file is part of GNU Radio + * + * GNU Radio is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3, or (at your option) + * any later version. + * + * GNU Radio is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU Radio; see the file COPYING. 
If not, write to + * the Free Software Foundation, Inc., 51 Franklin Street, + * Boston, MA 02110-1301, USA. + */ + +#include "qa_utils.h" +#include +#include + +//VOLK PROTOKERNELS OBTAINED FROM THE GNURADIO BASE +VOLK_RUN_TESTS(volk_gnsssdr_32fc_x2_multiply_32fc, 1e-4, 0, 20462, 1); +VOLK_RUN_TESTS(volk_gnsssdr_32fc_x2_dot_prod_32fc, 1e-4, 0, 204603, 1); +VOLK_RUN_TESTS(volk_gnsssdr_32fc_s32fc_multiply_32fc, 1e-4, 0, 20462, 1); +VOLK_RUN_TESTS(volk_gnsssdr_32fc_conjugate_32fc, 1e-4, 0, 20462, 1); +VOLK_RUN_TESTS(volk_gnsssdr_32f_x2_add_32f, 1e-4, 0, 20462, 1); +VOLK_RUN_TESTS(volk_gnsssdr_32f_index_max_16u, 3, 0, 20462, 1); +VOLK_RUN_TESTS(volk_gnsssdr_32f_accumulator_s32f, 1e-4, 0, 20462, 1); +VOLK_RUN_TESTS(volk_gnsssdr_32fc_magnitude_squared_32f, 1e-4, 0, 20462, 1); +VOLK_RUN_TESTS(volk_gnsssdr_32f_s32f_convert_16i, 3, 0, 20462, 1); + +//GNSS-SDR PROTO-KERNELS +VOLK_RUN_TESTS(volk_gnsssdr_8ic_x2_multiply_8ic, 1e-4, 0, 20462, 1); +VOLK_RUN_TESTS(volk_gnsssdr_8u_x2_multiply_8u, 1e-4, 0, 20462, 1); +VOLK_RUN_TESTS(volk_gnsssdr_8ic_x2_dot_prod_8ic, 1e-4, 0, 204603, 1); +VOLK_RUN_TESTS(volk_gnsssdr_8ic_s8ic_multiply_8ic, 1e-4, 0, 20462, 1); +VOLK_RUN_TESTS(volk_gnsssdr_8ic_conjugate_8ic, 1e-4, 0, 20462, 1); +VOLK_RUN_TESTS(volk_gnsssdr_8i_x2_add_8i, 1e-4, 0, 20462, 1); +VOLK_RUN_TESTS(volk_gnsssdr_8i_index_max_16u, 3, 0, 20462, 1); +VOLK_RUN_TESTS(volk_gnsssdr_8i_accumulator_s8i, 1e-4, 0, 20462, 1); +VOLK_RUN_TESTS(volk_gnsssdr_8ic_magnitude_squared_8i, 1e-4, 0, 20462, 1); + +VOLK_RUN_TESTS(volk_gnsssdr_8i_max_s8i, 3, 0, 20462, 1); +VOLK_RUN_TESTS(volk_gnsssdr_64f_accumulator_64f, 3, 0, 20462, 1); + +VOLK_RUN_TESTS(volk_gnsssdr_32fc_convert_16ic, 3, 0, 20462, 1); +VOLK_RUN_TESTS(volk_gnsssdr_32fc_s32f_convert_8ic, 3, 0, 20462, 1); +VOLK_RUN_TESTS(volk_gnsssdr_32fc_convert_8ic, 3, 0, 20462, 1); +VOLK_RUN_TESTS(volk_gnsssdr_16i_s32f_convert_32f, 3, 0, 20462, 1); + +VOLK_RUN_TESTS(volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3, 1e-4, 0, 20462, 1); 
+VOLK_RUN_TESTS(volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3, 1e-4, 0, 20462, 1); +VOLK_RUN_TESTS(volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3, 1e-4, 0, 20462, 1); +VOLK_RUN_TESTS(volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3, 1e-4, 0, 20462, 1); +VOLK_RUN_TESTS(volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3, 1e-4, 0, 20462, 1); + +VOLK_RUN_TESTS(volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5, 1e-4, 0, 20462, 1); +VOLK_RUN_TESTS(volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5, 1e-4, 0, 20462, 1); +VOLK_RUN_TESTS(volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5, 1e-4, 0, 20462, 1); +VOLK_RUN_TESTS(volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5, 1e-4, 0, 20462, 1); +VOLK_RUN_TESTS(volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5, 1e-4, 0, 20462, 1); +VOLK_RUN_TESTS(volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5, 1e-4, 0, 20462, 1); + +VOLK_RUN_TESTS(volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc, 1e-4, 0, 20462, 1); +VOLK_RUN_TESTS(volk_gnsssdr_s32f_x2_update_local_carrier_32fc, 1e-4, 0, 20462, 1); + + + + + + + +//VOLK_RUN_TESTS(volk_gnsssdr_16i_x5_add_quad_16i_x4, 1e-4, 2046, 10000); +//VOLK_RUN_TESTS(volk_gnsssdr_16i_branch_4_state_8, 1e-4, 2046, 10000); +//VOLK_RUN_TESTS(volk_gnsssdr_16i_max_star_16i, 0, 0, 20462, 10000); +//VOLK_RUN_TESTS(volk_gnsssdr_16i_max_star_horizontal_16i, 0, 0, 20462, 10000); +//VOLK_RUN_TESTS(volk_gnsssdr_16i_permute_and_scalar_add, 1e-4, 0, 2046, 1000); +//VOLK_RUN_TESTS(volk_gnsssdr_16i_x4_quad_max_star_16i, 1e-4, 0, 2046, 1000); +//VOLK_RUN_TESTS(volk_gnsssdr_16i_32fc_dot_prod_32fc, 1e-4, 0, 204602, 1); +//VOLK_RUN_TESTS(volk_gnsssdr_32fc_x2_conjugate_dot_prod_32fc, 1e-4, 0, 2046, 10000); +//VOLK_RUN_TESTS(volk_gnsssdr_32fc_s32f_x2_power_spectral_density_32f, 1e-4, 2046, 10000); +//VOLK_RUN_TESTS(volk_gnsssdr_32f_s32f_32f_fm_detect_32f, 1e-4, 2046, 10000); +//VOLK_RUN_TESTS(volk_gnsssdr_32u_popcnt, 0, 0, 2046, 10000); +//VOLK_RUN_TESTS(volk_gnsssdr_64u_popcnt, 0, 0, 2046, 10000); diff --git a/src/algorithms/libs/volk_gnsssdr/lib/volk_gnsssdr_malloc.c 
b/src/algorithms/libs/volk_gnsssdr/lib/volk_gnsssdr_malloc.c new file mode 100644 index 000000000..03e53a513 --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/lib/volk_gnsssdr_malloc.c @@ -0,0 +1,142 @@ +/* -*- c -*- */ +/* + * Copyright 2014 Free Software Foundation, Inc. + * + * This file is part of GNU Radio + * + * GNU Radio is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3, or (at your option) + * any later version. + * + * GNU Radio is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU Radio; see the file COPYING. If not, write to + * the Free Software Foundation, Inc., 51 Franklin Street, + * Boston, MA 02110-1301, USA. + */ + +#include +#include +#include +#include + +/* + * For #defines used to determine support for allocation functions, + * see: http://linux.die.net/man/3/aligned_alloc +*/ + +// Disabling use of aligned_alloc. This function requires that size be +// a multiple of alignment, which is too restrictive for many uses of +// VOLK. + +//// If we are using C11 standard, use the aligned_alloc +//#ifdef _ISOC11_SOURCE +// +//void *volk_gnsssdr_malloc(size_t size, size_t alignment) +//{ +// void *ptr = aligned_alloc(alignment, size); +// if(ptr == NULL) { +// fprintf(stderr, "VOLK: Error allocating memory (aligned_alloc)\n"); +// } +// return ptr; +//} +// +//void volk_gnsssdr_free(void *ptr) +//{ +// free(ptr); +//} +// +//#else // _ISOC11_SOURCE + +// Otherwise, test if we are a POSIX or X/Open system +// This only has a restriction that alignment be a power of 2. 
/*
 * Aligned allocation helpers.
 *
 * Exactly one of three implementations is selected at compile time:
 *   1. posix_memalign()            (POSIX / X/Open systems)
 *   2. _aligned_malloc()           (Visual C++ 2005 and later)
 *   3. over-allocating malloc()    (portable fallback)
 *
 * volk_gnsssdr_malloc(size, alignment)
 *   Returns a block of at least `size` bytes whose address is a multiple of
 *   `alignment`, or NULL on failure (a diagnostic is written to stderr).
 *   `alignment` must be a power of two.
 *
 * volk_gnsssdr_free(ptr)
 *   Releases a block obtained from volk_gnsssdr_malloc().
 */
#if _POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600 || HAVE_POSIX_MEMALIGN

void *volk_gnsssdr_malloc(size_t size, size_t alignment)
{
    void *ptr = NULL;
    int err;

    /* posix_memalign() demands a power of two that is also a multiple of
     * sizeof(void *).  A smaller power of two (e.g. 1) always divides
     * sizeof(void *), so rounding up still honours the caller's request
     * instead of failing with EINVAL. */
    if (alignment != 0 && (alignment & (alignment - 1)) == 0 &&
        alignment < sizeof(void *)) {
        alignment = sizeof(void *);
    }

    err = posix_memalign(&ptr, alignment, size);
    if (err != 0) {
        fprintf(stderr, "VOLK: Error allocating memory (posix_memalign: %d)\n", err);
        return NULL;
    }
    return ptr;
}

void volk_gnsssdr_free(void *ptr)
{
    /* Memory from posix_memalign() is released with plain free(). */
    free(ptr);
}

// _aligned_malloc has no restriction on size,
// available on Windows since Visual C++ 2005
#elif _MSC_VER >= 1400

void *volk_gnsssdr_malloc(size_t size, size_t alignment)
{
    void *ptr = _aligned_malloc(size, alignment);
    if (ptr == NULL) {
        fprintf(stderr, "VOLK: Error allocating memory (_aligned_malloc)\n");
    }
    return ptr;
}

void volk_gnsssdr_free(void *ptr)
{
    /* Blocks from _aligned_malloc() must go back through _aligned_free(). */
    _aligned_free(ptr);
}

// No standard handlers; we'll do it ourselves.
#else // _POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600 || HAVE_POSIX_MEMALIGN

/* Bookkeeping record stored immediately below the pointer handed to the
 * user; it remembers the address malloc() actually returned. */
struct block_info
{
    void *real;
};

void *
volk_gnsssdr_malloc(size_t size, size_t alignment)
{
    void *real, *user;
    struct block_info *info;
    size_t padding;

    /* At least align to sizeof our struct, so the record always fits
     * between the raw block start and the aligned user pointer. */
    if (alignment < sizeof(struct block_info))
        alignment = sizeof(struct block_info);

    /* Reject requests whose padded size would wrap around size_t. */
    if (alignment > ((size_t)-1) / 2) {
        fprintf(stderr, "VOLK: Error allocating memory (alignment too large)\n");
        return NULL;
    }
    padding = 2 * alignment - 1;
    if (size > (size_t)-1 - padding) {
        fprintf(stderr, "VOLK: Error allocating memory (size too large)\n");
        return NULL;
    }

    /* Alloc: room for the payload, the record and the worst-case slack. */
    real = malloc(size + padding);
    if (real == NULL) {
        fprintf(stderr, "VOLK: Error allocating memory (malloc)\n");
        return NULL;
    }

    /* Get pointer to the various zones: skip past the record, then round up
     * to the next multiple of alignment (the mask needs a power of two). */
    user = (void *)((((uintptr_t) real) + sizeof(struct block_info) + alignment - 1) & ~(alignment - 1));
    info = (struct block_info *)(((uintptr_t)user) - sizeof(struct block_info));

    /* Store the info for the free */
    info->real = real;

    /* Return pointer to user */
    return user;
}

void
volk_gnsssdr_free(void *ptr)
{
    struct block_info *info;

    /* Mirror free(NULL): nothing to release, and no record to read. */
    if (ptr == NULL)
        return;

    /* Get the real pointer */
    info = (struct block_info *)(((uintptr_t)ptr) - sizeof(struct block_info));

    /* Release real pointer */
    free(info->real);
}

#endif // _POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600 || HAVE_POSIX_MEMALIGN

//#endif // _ISOC11_SOURCE
a/src/algorithms/libs/volk_gnsssdr/lib/volk_gnsssdr_prefs.c b/src/algorithms/libs/volk_gnsssdr/lib/volk_gnsssdr_prefs.c new file mode 100644 index 000000000..dc4dc645e --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/lib/volk_gnsssdr_prefs.c @@ -0,0 +1,50 @@ +#include +#include +#include +#include + +//#if defined(_WIN32) +//#include +//#endif + +void volk_gnsssdr_get_config_path(char *path) +{ + const char *suffix = "/.volk_gnsssdr/volk_gnsssdr_config"; + char *home = NULL; + if (home == NULL) home = getenv("HOME"); + if (home == NULL) home = getenv("APPDATA"); + if (home == NULL){ + path = NULL; + return; + } + strcpy(path, home); + strcat(path, suffix); +} + +size_t volk_gnsssdr_load_preferences(volk_gnsssdr_arch_pref_t **prefs_res) +{ + FILE *config_file; + char path[512], line[512]; + size_t n_arch_prefs = 0; + volk_gnsssdr_arch_pref_t *prefs = NULL; + + //get the config path + volk_gnsssdr_get_config_path(path); + if (path == NULL) return n_arch_prefs; //no prefs found + config_file = fopen(path, "r"); + if(!config_file) return n_arch_prefs; //no prefs found + + //reset the file pointer and write the prefs into volk_gnsssdr_arch_prefs + while(fgets(line, sizeof(line), config_file) != NULL) + { + prefs = (volk_gnsssdr_arch_pref_t *) realloc(prefs, (n_arch_prefs+1) * sizeof(*prefs)); + volk_gnsssdr_arch_pref_t *p = prefs + n_arch_prefs; + if(sscanf(line, "%s %s %s", p->name, p->impl_a, p->impl_u) == 3 && !strncmp(p->name, "volk_gnsssdr_", 5)) + { + n_arch_prefs++; + } + } + fclose(config_file); + *prefs_res = prefs; + return n_arch_prefs; +} diff --git a/src/algorithms/libs/volk_gnsssdr/lib/volk_gnsssdr_rank_archs.c b/src/algorithms/libs/volk_gnsssdr/lib/volk_gnsssdr_rank_archs.c new file mode 100644 index 000000000..415ca4039 --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/lib/volk_gnsssdr_rank_archs.c @@ -0,0 +1,119 @@ +/* + * Copyright 2011-2012 Free Software Foundation, Inc. 
+ * + * This file is part of GNU Radio + * + * GNU Radio is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3, or (at your option) + * any later version. + * + * GNU Radio is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU Radio; see the file COPYING. If not, write to + * the Free Software Foundation, Inc., 51 Franklin Street, + * Boston, MA 02110-1301, USA. + */ + +#include +#include +#include +#include +#include + +#if __GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ >= 4 + #define __popcnt __builtin_popcount +#else + inline unsigned __popcnt(unsigned num) + { + unsigned pop = 0; + while(num) + { + if (num & 0x1) pop++; + num >>= 1; + } + return pop; + } +#endif + +int volk_gnsssdr_get_index( + const char *impl_names[], //list of implementations by name + const size_t n_impls, //number of implementations available + const char *impl_name //the implementation name to find +){ + unsigned int i; + for (i = 0; i < n_impls; i++) { + if(!strncmp(impl_names[i], impl_name, 20)) { + return i; + } + } + //TODO return -1; + //something terrible should happen here + printf("Volk warning: no arch found, returning generic impl\n"); + return volk_gnsssdr_get_index(impl_names, n_impls, "generic"); //but we'll fake it for now +} + +int volk_gnsssdr_rank_archs( + const char *kern_name, //name of the kernel to rank + const char *impl_names[], //list of implementations by name + const int* impl_deps, //requirement mask per implementation + const bool* alignment, //alignment status of each implementation + size_t n_impls, //number of implementations available + const bool align //if false, 
filter aligned implementations +){ + size_t i; + static volk_gnsssdr_arch_pref_t *volk_gnsssdr_arch_prefs; + static size_t n_arch_prefs = 0; + static int prefs_loaded = 0; + if(!prefs_loaded) { + n_arch_prefs = volk_gnsssdr_load_preferences(&volk_gnsssdr_arch_prefs); + prefs_loaded = 1; + } + + // If we've defined VOLK_GENERIC to be anything, always return the + // 'generic' kernel. Used in GR's QA code. + char *gen_env = getenv("VOLK_GENERIC"); + if(gen_env) { + return volk_gnsssdr_get_index(impl_names, n_impls, "generic"); + } + + //now look for the function name in the prefs list + for(i = 0; i < n_arch_prefs; i++) + { + if(!strncmp(kern_name, volk_gnsssdr_arch_prefs[i].name, sizeof(volk_gnsssdr_arch_prefs[i].name))) //found it + { + const char *impl_name = align? volk_gnsssdr_arch_prefs[i].impl_a : volk_gnsssdr_arch_prefs[i].impl_u; + return volk_gnsssdr_get_index(impl_names, n_impls, impl_name); + } + } + + //return the best index with the largest deps + size_t best_index_a = 0; + size_t best_index_u = 0; + int best_value_a = -1; + int best_value_u = -1; + for(i = 0; i < n_impls; i++) + { + const signed val = __popcnt(impl_deps[i]); + if (alignment[i] && val > best_value_a) + { + best_index_a = i; + best_value_a = val; + } + if (!alignment[i] && val > best_value_u) + { + best_index_u = i; + best_value_u = val; + } + } + + //when align and we found a best aligned, use it + if (align && best_value_a != -1) return best_index_a; + + //otherwise return the best unaligned + return best_index_u; +} diff --git a/src/algorithms/libs/volk_gnsssdr/lib/volk_gnsssdr_rank_archs.h b/src/algorithms/libs/volk_gnsssdr/lib/volk_gnsssdr_rank_archs.h new file mode 100644 index 000000000..6cf9108fb --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/lib/volk_gnsssdr_rank_archs.h @@ -0,0 +1,50 @@ +/* + * Copyright 2011-2012 Free Software Foundation, Inc. 
+ * + * This file is part of GNU Radio + * + * GNU Radio is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3, or (at your option) + * any later version. + * + * GNU Radio is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU Radio; see the file COPYING. If not, write to + * the Free Software Foundation, Inc., 51 Franklin Street, + * Boston, MA 02110-1301, USA. + */ + +#ifndef INCLUDED_VOLK_RANK_ARCHS_H +#define INCLUDED_VOLK_RANK_ARCHS_H + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +int volk_gnsssdr_get_index( + const char *impl_names[], //list of implementations by name + const size_t n_impls, //number of implementations available + const char *impl_name //the implementation name to find +); + +int volk_gnsssdr_rank_archs( + const char *kern_name, //name of the kernel to rank + const char *impl_names[], //list of implementations by name + const int* impl_deps, //requirement mask per implementation + const bool* alignment, //alignment status of each implementation + size_t n_impls, //number of implementations available + const bool align //if false, filter aligned implementations +); + +#ifdef __cplusplus +} +#endif +#endif /*INCLUDED_VOLK_RANK_ARCHS_H*/ diff --git a/src/algorithms/libs/volk_gnsssdr/orc/volk_gnsssdr_16sc_magnitude_32f_aligned16_orc_impl.orc b/src/algorithms/libs/volk_gnsssdr/orc/volk_gnsssdr_16sc_magnitude_32f_aligned16_orc_impl.orc new file mode 100644 index 000000000..561010761 --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/orc/volk_gnsssdr_16sc_magnitude_32f_aligned16_orc_impl.orc @@ -0,0 +1,25 @@ +.function 
volk_gnsssdr_16ic_magnitude_32f_a_orc_impl +.source 4 src +.dest 4 dst +.floatparam 4 scalar +.temp 4 reall +.temp 4 imagl +.temp 2 reals +.temp 2 imags +.temp 4 realf +.temp 4 imagf +.temp 4 sumf + + + +splitlw reals, imags, src +convswl reall, reals +convswl imagl, imags +convlf realf, reall +convlf imagf, imagl +divf realf, realf, scalar +divf imagf, imagf, scalar +mulf realf, realf, realf +mulf imagf, imagf, imagf +addf sumf, realf, imagf +sqrtf dst, sumf diff --git a/src/algorithms/libs/volk_gnsssdr/orc/volk_gnsssdr_32f_x2_add_32f.orc b/src/algorithms/libs/volk_gnsssdr/orc/volk_gnsssdr_32f_x2_add_32f.orc new file mode 100644 index 000000000..4419688b6 --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/orc/volk_gnsssdr_32f_x2_add_32f.orc @@ -0,0 +1,5 @@ +.function volk_gnsssdr_32f_x2_add_32f_a_orc_impl +.dest 4 dst +.source 4 src1 +.source 4 src2 +addf dst, src1, src2 diff --git a/src/algorithms/libs/volk_gnsssdr/orc/volk_gnsssdr_32fc_s32fc_multiply_32fc.orc b/src/algorithms/libs/volk_gnsssdr/orc/volk_gnsssdr_32fc_s32fc_multiply_32fc.orc new file mode 100644 index 000000000..03297831f --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/orc/volk_gnsssdr_32fc_s32fc_multiply_32fc.orc @@ -0,0 +1,18 @@ +.function volk_gnsssdr_32fc_s32fc_multiply_32fc_a_orc_impl +.source 8 src1 +.floatparam 8 scalar +.dest 8 dst +.temp 8 iqprod +.temp 4 real +.temp 4 imag +.temp 4 ac +.temp 4 bd +.temp 8 swapped +x2 mulf iqprod, src1, scalar +splitql bd, ac, iqprod +subf real, ac, bd +swaplq swapped, src1 +x2 mulf iqprod, swapped, scalar +splitql bd, ac, iqprod +addf imag, ac, bd +mergelq dst, real, imag diff --git a/src/algorithms/libs/volk_gnsssdr/orc/volk_gnsssdr_32fc_x2_multiply_32fc.orc b/src/algorithms/libs/volk_gnsssdr/orc/volk_gnsssdr_32fc_x2_multiply_32fc.orc new file mode 100644 index 000000000..5d049ad93 --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/orc/volk_gnsssdr_32fc_x2_multiply_32fc.orc @@ -0,0 +1,18 @@ +.function 
volk_gnsssdr_32fc_x2_multiply_32fc_a_orc_impl +.source 8 src1 +.source 8 src2 +.dest 8 dst +.temp 8 iqprod +.temp 4 real +.temp 4 imag +.temp 4 ac +.temp 4 bd +.temp 8 swapped +x2 mulf iqprod, src1, src2 +splitql bd, ac, iqprod +subf real, ac, bd +swaplq swapped, src1 +x2 mulf iqprod, swapped, src2 +splitql bd, ac, iqprod +addf imag, ac, bd +mergelq dst, real, imag diff --git a/src/algorithms/libs/volk_gnsssdr/orc/volk_gnsssdr_8i_accumulator_s8i.orc b/src/algorithms/libs/volk_gnsssdr/orc/volk_gnsssdr_8i_accumulator_s8i.orc new file mode 100644 index 000000000..71d301c45 --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/orc/volk_gnsssdr_8i_accumulator_s8i.orc @@ -0,0 +1,40 @@ +#/*! +# * \file volk_gnsssdr_8i_accumulator_s8i.orc +# * \brief ORC implementation: 8 bits (char) scalar accumulator +# * \authors
    +# *
  • Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com +# *
+# * +# * ORC code that implements an accumulator of char values +# * +# * ------------------------------------------------------------------------- +# * +# * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) +# * +# * GNSS-SDR is a software defined Global Navigation +# * Satellite Systems receiver +# * +# * This file is part of GNSS-SDR. +# * +# * GNSS-SDR is free software: you can redistribute it and/or modify +# * it under the terms of the GNU General Public License as published by +# * the Free Software Foundation, either version 3 of the License, or +# * at your option) any later version. +# * +# * GNSS-SDR is distributed in the hope that it will be useful, +# * but WITHOUT ANY WARRANTY; without even the implied warranty of +# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# * GNU General Public License for more details. +# * +# * You should have received a copy of the GNU General Public License +# * along with GNSS-SDR. If not, see . +# * +# * ------------------------------------------------------------------------- +# */ + +.function volk_gnsssdr_8i_accumulator_s8i_a_orc_impl +.source 1 src1 +.accumulator 2 acc +.temp 2 sum +mergebw sum, 0, src1 +accw acc, sum diff --git a/src/algorithms/libs/volk_gnsssdr/orc/volk_gnsssdr_8i_x2_add_8i.orc b/src/algorithms/libs/volk_gnsssdr/orc/volk_gnsssdr_8i_x2_add_8i.orc new file mode 100644 index 000000000..decb88029 --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/orc/volk_gnsssdr_8i_x2_add_8i.orc @@ -0,0 +1,39 @@ +#/*! +# * \file volk_gnsssdr_8i_x2_add_8i.orc +# * \brief ORC implementation: adds pairs of 8 bits (char) scalars +# * \authors
    +# *
  • Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com +# *
+# * +# * ORC code that adds pairs of 8 bits (char) scalars +# * +# * ------------------------------------------------------------------------- +# * +# * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) +# * +# * GNSS-SDR is a software defined Global Navigation +# * Satellite Systems receiver +# * +# * This file is part of GNSS-SDR. +# * +# * GNSS-SDR is free software: you can redistribute it and/or modify +# * it under the terms of the GNU General Public License as published by +# * the Free Software Foundation, either version 3 of the License, or +# * at your option) any later version. +# * +# * GNSS-SDR is distributed in the hope that it will be useful, +# * but WITHOUT ANY WARRANTY; without even the implied warranty of +# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# * GNU General Public License for more details. +# * +# * You should have received a copy of the GNU General Public License +# * along with GNSS-SDR. If not, see . +# * +# * ------------------------------------------------------------------------- +# */ + +.function volk_gnsssdr_8i_x2_add_8i_a_orc_impl +.dest 1 dst +.source 1 src1 +.source 1 src2 +addb dst, src1, src2 diff --git a/src/algorithms/libs/volk_gnsssdr/orc/volk_gnsssdr_8ic_conjugate_8ic.orc b/src/algorithms/libs/volk_gnsssdr/orc/volk_gnsssdr_8ic_conjugate_8ic.orc new file mode 100644 index 000000000..9e14e65f1 --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/orc/volk_gnsssdr_8ic_conjugate_8ic.orc @@ -0,0 +1,42 @@ +#/*! +# * \file volk_gnsssdr_8ic_conjugate_8ic.orc +# * \brief ORC implementation: calculates the conjugate of a 16 bits vector +# * \authors
    +# *
  • Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com +# *
+# * +# * ORC code that calculates the conjugate of a +# * 16 bits vector (8 bits the real part and 8 bits the imaginary part) +# * result = (real*real) + (imag*imag) +# * +# * ------------------------------------------------------------------------- +# * +# * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) +# * +# * GNSS-SDR is a software defined Global Navigation +# * Satellite Systems receiver +# * +# * This file is part of GNSS-SDR. +# * +# * GNSS-SDR is free software: you can redistribute it and/or modify +# * it under the terms of the GNU General Public License as published by +# * the Free Software Foundation, either version 3 of the License, or +# * at your option) any later version. +# * +# * GNSS-SDR is distributed in the hope that it will be useful, +# * but WITHOUT ANY WARRANTY; without even the implied warranty of +# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# * GNU General Public License for more details. +# * +# * You should have received a copy of the GNU General Public License +# * along with GNSS-SDR. If not, see . +# * +# * ------------------------------------------------------------------------- +# */ + +.function volk_gnsssdr_8ic_conjugate_8ic_a_orc_impl +.source 2 src1 +.dest 2 dst +.temp 2 merged +mergebw merged, 1, -1 +x2 mullb dst, merged, src1 diff --git a/src/algorithms/libs/volk_gnsssdr/orc/volk_gnsssdr_8ic_magnitude_squared_8i.orc b/src/algorithms/libs/volk_gnsssdr/orc/volk_gnsssdr_8ic_magnitude_squared_8i.orc new file mode 100644 index 000000000..a0c40a741 --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/orc/volk_gnsssdr_8ic_magnitude_squared_8i.orc @@ -0,0 +1,45 @@ +#/*! +# * \file volk_gnsssdr_8ic_magnitude_squared_8i.orc +# * \brief ORC implementation: calculates the magnitude squared of a 16 bits vector +# * \authors
    +# *
  • Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com +# *
+# * +# * ORC code that calculates the magnitude squared of a +# * 16 bits vector (8 bits the real part and 8 bits the imaginary part) +# * result = (real*real) + (imag*imag) +# * +# * ------------------------------------------------------------------------- +# * +# * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) +# * +# * GNSS-SDR is a software defined Global Navigation +# * Satellite Systems receiver +# * +# * This file is part of GNSS-SDR. +# * +# * GNSS-SDR is free software: you can redistribute it and/or modify +# * it under the terms of the GNU General Public License as published by +# * the Free Software Foundation, either version 3 of the License, or +# * at your option) any later version. +# * +# * GNSS-SDR is distributed in the hope that it will be useful, +# * but WITHOUT ANY WARRANTY; without even the implied warranty of +# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# * GNU General Public License for more details. +# * +# * You should have received a copy of the GNU General Public License +# * along with GNSS-SDR. If not, see . +# * +# * ------------------------------------------------------------------------- +# */ + +.function volk_gnsssdr_8ic_magnitude_squared_8i_a_orc_impl +.source 2 src1 +.dest 1 dst +.temp 2 iqprod +.temp 1 ac +.temp 1 bd +x2 mullb iqprod, src1, src1 +splitwb bd, ac, iqprod +addb dst, ac, bd diff --git a/src/algorithms/libs/volk_gnsssdr/orc/volk_gnsssdr_8ic_s8ic_multiply_8ic.orc b/src/algorithms/libs/volk_gnsssdr/orc/volk_gnsssdr_8ic_s8ic_multiply_8ic.orc new file mode 100644 index 000000000..7c0fc2d6b --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/orc/volk_gnsssdr_8ic_s8ic_multiply_8ic.orc @@ -0,0 +1,58 @@ +#/*! +# * \file volk_gnsssdr_8ic_s8ic_multiply_8ic.orc +# * \brief ORC implementation: multiplies a group of 16 bits vectors by one constant vector +# * \authors
    +# *
  • Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com +# *
+# * +# * ORC code that multiplies a group of 16 bits vectors +# * (8 bits the real part and 8 bits the imaginary part) by one constant vector +# * +# * ------------------------------------------------------------------------- +# * +# * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) +# * +# * GNSS-SDR is a software defined Global Navigation +# * Satellite Systems receiver +# * +# * This file is part of GNSS-SDR. +# * +# * GNSS-SDR is free software: you can redistribute it and/or modify +# * it under the terms of the GNU General Public License as published by +# * the Free Software Foundation, either version 3 of the License, or +# * at your option) any later version. +# * +# * GNSS-SDR is distributed in the hope that it will be useful, +# * but WITHOUT ANY WARRANTY; without even the implied warranty of +# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# * GNU General Public License for more details. +# * +# * You should have received a copy of the GNU General Public License +# * along with GNSS-SDR. If not, see . +# * +# * ------------------------------------------------------------------------- +# */ + +.function volk_gnsssdr_8ic_s8ic_multiply_8ic_a_orc_impl +.source 2 src1 +.param 2 src2real +.param 2 src2imag +.dest 2 dst +.temp 2 iqprod +.temp 1 real +.temp 1 imag +.temp 1 rr +.temp 1 ii +.temp 1 ri +.temp 1 ir +x2 mullb iqprod, src1, src2real +splitwb ir, rr, iqprod +x2 mullb iqprod, src1, src2imag +splitwb ii, ri, iqprod +subb real, rr, ii +addb imag, ri, ir +mergebw dst, real, imag + + + + diff --git a/src/algorithms/libs/volk_gnsssdr/orc/volk_gnsssdr_8ic_x2_dot_prod_8ic.orc b/src/algorithms/libs/volk_gnsssdr/orc/volk_gnsssdr_8ic_x2_dot_prod_8ic.orc new file mode 100644 index 000000000..c4dae8840 --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/orc/volk_gnsssdr_8ic_x2_dot_prod_8ic.orc @@ -0,0 +1,59 @@ +#/*! 
+# * \file volk_gnsssdr_8ic_x2_dot_prod_8ic.orc +# * \brief ORC implementation: multiplies two 16 bits vectors and accumulates them +# * \authors
    +# *
  • Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com +# *
+# * +# * ORC code that multiplies two 16 bits vectors (8 bits the real part +# * and 8 bits the imaginary part) and accumulates them +# * +# * ------------------------------------------------------------------------- +# * +# * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) +# * +# * GNSS-SDR is a software defined Global Navigation +# * Satellite Systems receiver +# * +# * This file is part of GNSS-SDR. +# * +# * GNSS-SDR is free software: you can redistribute it and/or modify +# * it under the terms of the GNU General Public License as published by +# * the Free Software Foundation, either version 3 of the License, or +# * at your option) any later version. +# * +# * GNSS-SDR is distributed in the hope that it will be useful, +# * but WITHOUT ANY WARRANTY; without even the implied warranty of +# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# * GNU General Public License for more details. +# * +# * You should have received a copy of the GNU General Public License +# * along with GNSS-SDR. If not, see . +# * +# * ------------------------------------------------------------------------- +# */ + +.function volk_gnsssdr_8ic_x2_dot_prod_8ic_a_orc_impl +.source 2 src1 +.source 2 src2 +.accumulator 2 accreal +.accumulator 2 accimag +.temp 2 iqprod +.temp 1 real +.temp 1 imag +.temp 2 real2 +.temp 2 imag2 +.temp 1 ac +.temp 1 bd +.temp 2 swapped +x2 mullb iqprod, src1, src2 +splitwb bd, ac, iqprod +subb real, ac, bd +swapw swapped, src1 +x2 mullb iqprod, swapped, src2 +splitwb bd, ac, iqprod +addb imag, ac, bd +mergebw real2, 0, real +accw accreal, real2 +mergebw imag2, 0, imag +accw accimag, imag2 diff --git a/src/algorithms/libs/volk_gnsssdr/orc/volk_gnsssdr_8ic_x2_multiply_8ic.orc b/src/algorithms/libs/volk_gnsssdr/orc/volk_gnsssdr_8ic_x2_multiply_8ic.orc new file mode 100644 index 000000000..b448eac0b --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/orc/volk_gnsssdr_8ic_x2_multiply_8ic.orc @@ -0,0 +1,57 @@ +#/*! 
+# * \file volk_gnsssdr_8ic_x2_multiply_8ic.orc +# * \brief ORC implementation: multiplies two 16 bits vectors +# * \authors
    +# *
  • Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com +# *
+# *
+# * ORC code that multiplies two 16 bits vectors (8 bits the real part
+# * and 8 bits the imaginary part)
+# *
+# * -------------------------------------------------------------------------
+# *
+# * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+# *
+# * GNSS-SDR is a software defined Global Navigation
+# * Satellite Systems receiver
+# *
+# * This file is part of GNSS-SDR.
+# *
+# * GNSS-SDR is free software: you can redistribute it and/or modify
+# * it under the terms of the GNU General Public License as published by
+# * the Free Software Foundation, either version 3 of the License, or
+# * (at your option) any later version.
+# *
+# * GNSS-SDR is distributed in the hope that it will be useful,
+# * but WITHOUT ANY WARRANTY; without even the implied warranty of
+# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# * GNU General Public License for more details.
+# *
+# * You should have received a copy of the GNU General Public License
+# * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+# *
+# * -------------------------------------------------------------------------
+# */
+# Element-wise product of 8-bit I/Q samples: (a+jb)(c+jd) -> dst = (ac - bd) + j(bc + ad), real part in the low byte.
+.function volk_gnsssdr_8ic_x2_multiply_8ic_a_orc_impl
+.source 2 src1
+.source 2 src2
+.dest 2 dst
+.temp 2 src1_swapped
+.temp 2 prods
+.temp 1 prod_lo
+.temp 1 prod_hi
+.temp 1 re
+.temp 1 im
+swapw src1_swapped, src1
+x2 mullb prods, src1, src2
+splitwb prod_hi, prod_lo, prods
+subb re, prod_lo, prod_hi
+x2 mullb prods, src1_swapped, src2
+splitwb prod_hi, prod_lo, prods
+addb im, prod_lo, prod_hi
+mergebw dst, re, im
+
+
+
+
diff --git a/src/algorithms/libs/volk_gnsssdr/orc/volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3.orc b/src/algorithms/libs/volk_gnsssdr/orc/volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3.orc
new file mode 100644
index 000000000..29bb09a8c
--- /dev/null
+++ b/src/algorithms/libs/volk_gnsssdr/orc/volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3.orc
@@ -0,0 +1,139 @@
+#/*!
+# * \file volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3.orc +# * \brief ORC implementation: performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation with 16 bits vectors +# * \authors
+# *          <ul><li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+# *          </ul>
+# *
+# * ORC code that performs the carrier wipe-off mixing and the
+# * Early, Prompt, and Late correlation with 16 bits vectors (8 bits the
+# * real part and 8 bits the imaginary part):
+# * - The carrier wipe-off is done by multiplying the input signal by the
+# * carrier (multiplication of 16 bits vectors). It returns the input
+# * signal in base band (BB)
+# * - Early values are calculated by multiplying the input signal in BB by the
+# * early code (multiplication of 16 bits vectors), accumulating the results
+# * - Prompt values are calculated by multiplying the input signal in BB by the
+# * prompt code (multiplication of 16 bits vectors), accumulating the results
+# * - Late values are calculated by multiplying the input signal in BB by the
+# * late code (multiplication of 16 bits vectors), accumulating the results
+# *
+# * -------------------------------------------------------------------------
+# *
+# * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+# *
+# * GNSS-SDR is a software defined Global Navigation
+# * Satellite Systems receiver
+# *
+# * This file is part of GNSS-SDR.
+# *
+# * GNSS-SDR is free software: you can redistribute it and/or modify
+# * it under the terms of the GNU General Public License as published by
+# * the Free Software Foundation, either version 3 of the License, or
+# * (at your option) any later version.
+# *
+# * GNSS-SDR is distributed in the hope that it will be useful,
+# * but WITHOUT ANY WARRANTY; without even the implied warranty of
+# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# * GNU General Public License for more details.
+# *
+# * You should have received a copy of the GNU General Public License
+# * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+# *
+# * -------------------------------------------------------------------------
+# */
+# First pass: carrier wipe-off of the input, then Early and Prompt correlation (complex 8-bit multiply-accumulate).
+.function volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_first_a_orc_impl
+.source 2 input
+.source 2 carrier
+.source 2 E_code
+.source 2 P_code
+.accumulator 2 E_out_real
+.accumulator 2 E_out_imag
+.accumulator 2 P_out_real
+.accumulator 2 P_out_imag
+.temp 2 baseband
+.temp 2 swapped_iq
+.temp 2 prods
+.temp 1 prod_lo
+.temp 1 prod_hi
+.temp 1 re
+.temp 1 im
+.temp 2 re_word
+.temp 2 im_word
+
+# Carrier wipe-off: baseband = input * carrier, with (a+jb)(c+jd) = (ac - bd) + j(bc + ad).
+swapw swapped_iq, input
+x2 mullb prods, input, carrier
+splitwb prod_hi, prod_lo, prods
+subb re, prod_lo, prod_hi
+x2 mullb prods, swapped_iq, carrier
+splitwb prod_hi, prod_lo, prods
+addb im, prod_lo, prod_hi
+mergebw baseband, re, im
+# Byte-swap the baseband sample once; the swapped copy feeds the imaginary part of every code product below.
+swapw swapped_iq, baseband
+# Early: baseband * E_code; each 8-bit part is placed in a word by mergebw (other byte 0) before accw adds it.
+x2 mullb prods, baseband, E_code
+splitwb prod_hi, prod_lo, prods
+subb re, prod_lo, prod_hi
+mergebw re_word, 0, re
+accw E_out_real, re_word
+x2 mullb prods, swapped_iq, E_code
+splitwb prod_hi, prod_lo, prods
+addb im, prod_lo, prod_hi
+mergebw im_word, 0, im
+accw E_out_imag, im_word
+# Prompt: same multiply-accumulate against P_code.
+x2 mullb prods, baseband, P_code
+splitwb prod_hi, prod_lo, prods
+subb re, prod_lo, prod_hi
+mergebw re_word, 0, re
+accw P_out_real, re_word
+x2 mullb prods, swapped_iq, P_code
+splitwb prod_hi, prod_lo, prods
+addb im, prod_lo, prod_hi
+mergebw im_word, 0, im
+accw P_out_imag, im_word
+
+# Second pass: repeats the carrier wipe-off on the same input and accumulates the Late correlation.
+.function volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_second_a_orc_impl
+.source 2 input
+.source 2 carrier
+.source 2 L_code
+.accumulator 2 L_out_real
+.accumulator 2 L_out_imag
+.temp 2 baseband
+.temp 2 swapped_iq
+.temp 2 prods
+.temp 1 prod_lo
+.temp 1 prod_hi
+.temp 1 re
+.temp 1 im
+.temp 2 re_word
+.temp 2 im_word
+
+# Carrier wipe-off: baseband = input * carrier, with (a+jb)(c+jd) = (ac - bd) + j(bc + ad).
+swapw swapped_iq, input
+x2 mullb prods, input, carrier
+splitwb prod_hi, prod_lo, prods
+subb re, prod_lo, prod_hi
+x2 mullb prods, swapped_iq, carrier
+splitwb prod_hi, prod_lo, prods
+addb im, prod_lo, prod_hi
+mergebw baseband, re, im
+# Byte-swap the baseband sample; the swapped copy feeds the imaginary part of the code product.
+swapw swapped_iq, baseband
+# Late: baseband * L_code; each 8-bit part is placed in a word by mergebw (other byte 0) before accw adds it.
+x2 mullb prods, baseband, L_code
+splitwb prod_hi, prod_lo, prods
+subb re, prod_lo, prod_hi
+mergebw re_word, 0, re
+accw L_out_real, re_word
+x2 mullb prods, swapped_iq, L_code
+splitwb prod_hi, prod_lo, prods
+addb im, prod_lo, prod_hi
+mergebw im_word, 0, im
+accw L_out_imag, im_word
+
+
diff --git a/src/algorithms/libs/volk_gnsssdr/orc/volk_gnsssdr_8u_x2_multiply_8u.orc b/src/algorithms/libs/volk_gnsssdr/orc/volk_gnsssdr_8u_x2_multiply_8u.orc
new file mode 100644
index 000000000..773daabc1
--- /dev/null
+++ b/src/algorithms/libs/volk_gnsssdr/orc/volk_gnsssdr_8u_x2_multiply_8u.orc
@@ -0,0 +1,39 @@
+#/*!
+# * \file volk_gnsssdr_8u_x2_multiply_8u.orc
+# * \brief ORC implementation: multiplies unsigned char values
+# * \authors
+# *          <ul><li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+# *          </ul>
+# *
+# * ORC code that multiplies unsigned char values (8 bits data)
+# *
+# * -------------------------------------------------------------------------
+# *
+# * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+# *
+# * GNSS-SDR is a software defined Global Navigation
+# * Satellite Systems receiver
+# *
+# * This file is part of GNSS-SDR.
+# *
+# * GNSS-SDR is free software: you can redistribute it and/or modify
+# * it under the terms of the GNU General Public License as published by
+# * the Free Software Foundation, either version 3 of the License, or
+# * (at your option) any later version.
+# *
+# * GNSS-SDR is distributed in the hope that it will be useful,
+# * but WITHOUT ANY WARRANTY; without even the implied warranty of
+# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# * GNU General Public License for more details.
+# *
+# * You should have received a copy of the GNU General Public License
+# * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+# *
+# * -------------------------------------------------------------------------
+# */
+# Element-wise byte multiply: dst[i] = src1[i] * src2[i]; mullb keeps only the low 8 bits of each product.
+.function volk_gnsssdr_8u_x2_multiply_8u_a_orc_impl
+.source 1 src1
+.source 1 src2
+.dest 1 dst
+mullb dst, src1, src2
diff --git a/src/algorithms/libs/volk_gnsssdr/patches for generating volk_gnsssdr/2014-10-17_Patch.patch b/src/algorithms/libs/volk_gnsssdr/patches for generating volk_gnsssdr/2014-10-17_Patch.patch
new file mode 100644
index 000000000..88bb4fd35
--- /dev/null
+++ b/src/algorithms/libs/volk_gnsssdr/patches for generating volk_gnsssdr/2014-10-17_Patch.patch
@@ -0,0 +1,329 @@
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/lib/CMakeLists.txt /Users/andres/Desktop/volk_gnsssdr_original/lib/CMakeLists.txt
+--- /Users/andres/Desktop/volk_gnsssdr/lib/CMakeLists.txt 2014-10-17 04:26:38.000000000 +0200
++++ /Users/andres/Desktop/volk_gnsssdr_original/lib/CMakeLists.txt 2014-10-17 04:17:37.000000000 +0200
+@@ -517,7 +517,19 @@ if(MSVC)
+ endif()
+ 
+ #create the volk_gnsssdr runtime library
+-add_library(volk_gnsssdr SHARED ${volk_gnsssdr_sources}) ++ ++#MODIFICATIONS BY GNSS-SDR ++file(GLOB orc ${CMAKE_SOURCE_DIR}/orc/*.orc) ++file(GLOB CommonMacros ${CMAKE_SOURCE_DIR}/kernels/CommonMacros/*.h ${CMAKE_SOURCE_DIR}/kernels/CommonMacros/README.txt) ++ ++#add_library(volk_gnsssdr SHARED ${volk_gnsssdr_sources}) ++add_library(volk_gnsssdr SHARED ${volk_gnsssdr_sources} ${h_files} ${CommonMacros} ${orc}) ++ ++source_group("Kernels" FILES ${h_files}) ++source_group("Common Macros" FILES ${CommonMacros}) ++source_group("ORC Files" FILES ${orc}) ++#END OF MODIFICATIONS ++ + target_link_libraries(volk_gnsssdr ${volk_gnsssdr_libraries}) + set_target_properties(volk_gnsssdr PROPERTIES SOVERSION ${LIBVER}) + set_target_properties(volk_gnsssdr PROPERTIES DEFINE_SYMBOL "volk_gnsssdr_EXPORTS") +diff -rupN /Users/andres/Desktop/volk_gnsssdr/lib/qa_utils.cc /Users/andres/Desktop/volk_gnsssdr_original/lib/qa_utils.cc +--- /Users/andres/Desktop/volk_gnsssdr/lib/qa_utils.cc 2014-10-17 04:26:39.000000000 +0200 ++++ /Users/andres/Desktop/volk_gnsssdr_original/lib/qa_utils.cc 2014-10-17 04:21:03.000000000 +0200 +@@ -217,6 +217,72 @@ inline void run_cast_test3_s32fc(volk_gn + while(iter--) func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str()); + } + ++//ADDED BY GNSS-SDR. 
START ++inline void run_cast_test1_s8i(volk_gnsssdr_fn_1arg_s8i func, std::vector &buffs, char scalar, unsigned int vlen, unsigned int iter, std::string arch) { ++ while(iter--) func(buffs[0], scalar, vlen, arch.c_str()); ++} ++ ++inline void run_cast_test2_s8i(volk_gnsssdr_fn_2arg_s8i func, std::vector &buffs, char scalar, unsigned int vlen, unsigned int iter, std::string arch) { ++ while(iter--) func(buffs[0], buffs[1], scalar, vlen, arch.c_str()); ++} ++ ++inline void run_cast_test3_s8i(volk_gnsssdr_fn_3arg_s8i func, std::vector &buffs, char scalar, unsigned int vlen, unsigned int iter, std::string arch) { ++ while(iter--) func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str()); ++} ++ ++inline void run_cast_test1_s8ic(volk_gnsssdr_fn_1arg_s8ic func, std::vector &buffs, lv_8sc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { ++ while(iter--) func(buffs[0], scalar, vlen, arch.c_str()); ++} ++ ++inline void run_cast_test2_s8ic(volk_gnsssdr_fn_2arg_s8ic func, std::vector &buffs, lv_8sc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { ++ while(iter--) func(buffs[0], buffs[1], scalar, vlen, arch.c_str()); ++} ++ ++inline void run_cast_test3_s8ic(volk_gnsssdr_fn_3arg_s8ic func, std::vector &buffs, lv_8sc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { ++ while(iter--) func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str()); ++} ++ ++inline void run_cast_test8(volk_gnsssdr_fn_8arg func, std::vector &buffs, unsigned int vlen, unsigned int iter, std::string arch) { ++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], vlen, arch.c_str()); ++} ++ ++inline void run_cast_test8_s8i(volk_gnsssdr_fn_8arg_s8i func, std::vector &buffs, char scalar, unsigned int vlen, unsigned int iter, std::string arch) { ++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], scalar, vlen, arch.c_str()); ++} ++ ++inline void 
run_cast_test8_s8ic(volk_gnsssdr_fn_8arg_s8ic func, std::vector &buffs, lv_8sc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { ++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], scalar, vlen, arch.c_str()); ++} ++ ++inline void run_cast_test8_s32f(volk_gnsssdr_fn_8arg_s32f func, std::vector &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) { ++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], scalar, vlen, arch.c_str()); ++} ++ ++inline void run_cast_test8_s32fc(volk_gnsssdr_fn_8arg_s32fc func, std::vector &buffs, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { ++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], scalar, vlen, arch.c_str()); ++} ++ ++inline void run_cast_test12(volk_gnsssdr_fn_12arg func, std::vector &buffs, unsigned int vlen, unsigned int iter, std::string arch) { ++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], buffs[8], buffs[9], buffs[10], buffs[11], vlen, arch.c_str()); ++} ++ ++inline void run_cast_test12_s8i(volk_gnsssdr_fn_12arg_s8i func, std::vector &buffs, char scalar, unsigned int vlen, unsigned int iter, std::string arch) { ++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], buffs[8], buffs[9], buffs[10], buffs[11], scalar, vlen, arch.c_str()); ++} ++ ++inline void run_cast_test12_s8ic(volk_gnsssdr_fn_12arg_s8ic func, std::vector &buffs, lv_8sc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { ++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], buffs[8], buffs[9], buffs[10], buffs[11], scalar, vlen, arch.c_str()); ++} ++ ++inline void run_cast_test12_s32f(volk_gnsssdr_fn_12arg_s32f func, std::vector &buffs, float scalar, unsigned int vlen, 
unsigned int iter, std::string arch) { ++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], buffs[8], buffs[9], buffs[10], buffs[11], scalar, vlen, arch.c_str()); ++} ++ ++inline void run_cast_test12_s32fc(volk_gnsssdr_fn_12arg_s32fc func, std::vector &buffs, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { ++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], buffs[8], buffs[9], buffs[10], buffs[11], scalar, vlen, arch.c_str()); ++} ++//ADDED BY GNSS-SDR. END ++ + // This function is a nop that helps resolve GNU Radio bugs 582 and 583. + // Without this the cast in run_volk_gnsssdr_tests for tol_i = static_cast(float tol) + // won't happen on armhf (reported on cortex A9 and A15). +@@ -426,7 +492,17 @@ bool run_volk_gnsssdr_tests(volk_gnsssdr + } else { + run_cast_test1_s32f((volk_gnsssdr_fn_1arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); + } +- } else throw "unsupported 1 arg function >1 scalars"; ++ } ++ //ADDED BY GNSS-SDR. START ++ else if(inputsc.size() == 1 && !inputsc[0].is_float) { ++ if(inputsc[0].is_complex) { ++ run_cast_test1_s8ic((volk_gnsssdr_fn_1arg_s8ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); ++ } else { ++ run_cast_test1_s8i((volk_gnsssdr_fn_1arg_s8i)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); ++ } ++ } ++ //ADDED BY GNSS-SDR. END ++ else throw "unsupported 1 arg function >1 scalars"; + break; + case 2: + if(inputsc.size() == 0) { +@@ -437,7 +513,17 @@ bool run_volk_gnsssdr_tests(volk_gnsssdr + } else { + run_cast_test2_s32f((volk_gnsssdr_fn_2arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); + } +- } else throw "unsupported 2 arg function >1 scalars"; ++ } ++ //ADDED BY GNSS-SDR. 
START ++ else if(inputsc.size() == 1 && !inputsc[0].is_float) { ++ if(inputsc[0].is_complex) { ++ run_cast_test2_s8ic((volk_gnsssdr_fn_2arg_s8ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); ++ } else { ++ run_cast_test2_s8i((volk_gnsssdr_fn_2arg_s8i)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); ++ } ++ } ++ //ADDED BY GNSS-SDR. END ++ else throw "unsupported 2 arg function >1 scalars"; + break; + case 3: + if(inputsc.size() == 0) { +@@ -448,11 +534,61 @@ bool run_volk_gnsssdr_tests(volk_gnsssdr + } else { + run_cast_test3_s32f((volk_gnsssdr_fn_3arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); + } +- } else throw "unsupported 3 arg function >1 scalars"; ++ } ++ //ADDED BY GNSS-SDR. START ++ else if(inputsc.size() == 1 && !inputsc[0].is_float) { ++ if(inputsc[0].is_complex) { ++ run_cast_test3_s8ic((volk_gnsssdr_fn_3arg_s8ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); ++ } else { ++ run_cast_test3_s8i((volk_gnsssdr_fn_3arg_s8i)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); ++ } ++ } ++ //ADDED BY GNSS-SDR. END ++ else throw "unsupported 3 arg function >1 scalars"; + break; + case 4: + run_cast_test4((volk_gnsssdr_fn_4arg)(manual_func), test_data[i], vlen, iter, arch_list[i]); + break; ++ //ADDED BY GNSS-SDR. 
START ++ case 8: ++ if(inputsc.size() == 0) { ++ run_cast_test8((volk_gnsssdr_fn_8arg)(manual_func), test_data[i], vlen, iter, arch_list[i]); ++ } else if(inputsc.size() == 1 && inputsc[0].is_float) { ++ if(inputsc[0].is_complex) { ++ run_cast_test8_s32fc((volk_gnsssdr_fn_8arg_s32fc)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); ++ } else { ++ run_cast_test8_s32f((volk_gnsssdr_fn_8arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); ++ } ++ } ++ else if(inputsc.size() == 1 && !inputsc[0].is_float) { ++ if(inputsc[0].is_complex) { ++ run_cast_test8_s8ic((volk_gnsssdr_fn_8arg_s8ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); ++ } else { ++ run_cast_test8_s8i((volk_gnsssdr_fn_8arg_s8i)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); ++ } ++ } ++ else throw "unsupported 8 arg function >1 scalars"; ++ break; ++ case 12: ++ if(inputsc.size() == 0) { ++ run_cast_test12((volk_gnsssdr_fn_12arg)(manual_func), test_data[i], vlen, iter, arch_list[i]); ++ } else if(inputsc.size() == 1 && inputsc[0].is_float) { ++ if(inputsc[0].is_complex) { ++ run_cast_test12_s32fc((volk_gnsssdr_fn_12arg_s32fc)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); ++ } else { ++ run_cast_test12_s32f((volk_gnsssdr_fn_12arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); ++ } ++ } ++ else if(inputsc.size() == 1 && !inputsc[0].is_float) { ++ if(inputsc[0].is_complex) { ++ run_cast_test12_s8ic((volk_gnsssdr_fn_12arg_s8ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); ++ } else { ++ run_cast_test12_s8i((volk_gnsssdr_fn_12arg_s8i)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); ++ } ++ } ++ else throw "unsupported 12 arg function >1 scalars"; ++ break; ++ //ADDED BY GNSS-SDR. 
END + default: + throw "no function handler for this signature"; + break; +diff -rupN /Users/andres/Desktop/volk_gnsssdr/lib/qa_utils.h /Users/andres/Desktop/volk_gnsssdr_original/lib/qa_utils.h +--- /Users/andres/Desktop/volk_gnsssdr/lib/qa_utils.h 2014-10-17 04:26:39.000000000 +0200 ++++ /Users/andres/Desktop/volk_gnsssdr_original/lib/qa_utils.h 2014-10-17 04:21:51.000000000 +0200 +@@ -77,4 +77,26 @@ typedef void (*volk_gnsssdr_fn_1arg_s32f + typedef void (*volk_gnsssdr_fn_2arg_s32fc)(void *, void *, lv_32fc_t, unsigned int, const char*); + typedef void (*volk_gnsssdr_fn_3arg_s32fc)(void *, void *, void *, lv_32fc_t, unsigned int, const char*); + ++//ADDED BY GNSS-SDR. START ++typedef void (*volk_gnsssdr_fn_1arg_s8i)(void *, char, unsigned int, const char*); //one input vector, one scalar char input ++typedef void (*volk_gnsssdr_fn_2arg_s8i)(void *, void *, char, unsigned int, const char*); ++typedef void (*volk_gnsssdr_fn_3arg_s8i)(void *, void *, void *, char, unsigned int, const char*); ++typedef void (*volk_gnsssdr_fn_1arg_s8ic)(void *, lv_8sc_t, unsigned int, const char*); //one input vector, one scalar lv_8sc_t vector input ++typedef void (*volk_gnsssdr_fn_2arg_s8ic)(void *, void *, lv_8sc_t, unsigned int, const char*); ++typedef void (*volk_gnsssdr_fn_3arg_s8ic)(void *, void *, void *, lv_8sc_t, unsigned int, const char*); ++ ++typedef void (*volk_gnsssdr_fn_8arg)(void *, void *, void *, void *, void *, void *, void *, void *, unsigned int, const char*); ++typedef void (*volk_gnsssdr_fn_8arg_s32f)(void *, void *, void *, void *, void *, void *, void *, void *, float, unsigned int, const char*); ++typedef void (*volk_gnsssdr_fn_8arg_s32fc)(void *, void *, void *, void *, void *, void *, void *, void *, lv_32fc_t, unsigned int, const char*); ++typedef void (*volk_gnsssdr_fn_8arg_s8i)(void *, void *, void *, void *, void *, void *, void *, void *, char, unsigned int, const char*); ++typedef void (*volk_gnsssdr_fn_8arg_s8ic)(void *, void *, void *, void *, 
void *, void *, void *, void *, lv_8sc_t, unsigned int, const char*); ++ ++typedef void (*volk_gnsssdr_fn_12arg)(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, unsigned int, const char*); ++typedef void (*volk_gnsssdr_fn_12arg_s32f)(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, float, unsigned int, const char*); ++typedef void (*volk_gnsssdr_fn_12arg_s32fc)(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, lv_32fc_t, unsigned int, const char*); ++typedef void (*volk_gnsssdr_fn_12arg_s8i)(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, char, unsigned int, const char*); ++typedef void (*volk_gnsssdr_fn_12arg_s8ic)(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, lv_8sc_t, unsigned int, const char*); ++//ADDED BY GNSS-SDR. END ++ ++ + #endif //VOLK_QA_UTILS_H +diff -rupN /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr.tmpl.h /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr.tmpl.h +--- /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr.tmpl.h 2014-10-17 04:26:39.000000000 +0200 ++++ /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr.tmpl.h 2014-10-17 04:23:30.000000000 +0200 +@@ -19,8 +19,8 @@ + * Boston, MA 02110-1301, USA. 
+ */ + +-#ifndef INCLUDED_VOLK_RUNTIME +-#define INCLUDED_VOLK_RUNTIME ++#ifndef INCLUDED_VOLK_GNSSSDR_RUNTIME ++#define INCLUDED_VOLK_GNSSSDR_RUNTIME + + #include + #include +@@ -91,4 +91,4 @@ extern VOLK_API volk_gnsssdr_func_desc_t + + __VOLK_DECL_END + +-#endif /*INCLUDED_VOLK_RUNTIME*/ ++#endif /*INCLUDED_VOLK_GNSSSDR_RUNTIME*/ +diff -rupN /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_config_fixed.tmpl.h /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_config_fixed.tmpl.h +--- /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_config_fixed.tmpl.h 2014-10-17 04:26:39.000000000 +0200 ++++ /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_config_fixed.tmpl.h 2014-10-17 04:22:58.000000000 +0200 +@@ -19,11 +19,11 @@ + * Boston, MA 02110-1301, USA. + */ + +-#ifndef INCLUDED_VOLK_CONFIG_FIXED_H +-#define INCLUDED_VOLK_CONFIG_FIXED_H ++#ifndef INCLUDED_VOLK_GNSSSDR_CONFIG_FIXED_H ++#define INCLUDED_VOLK_GNSSSDR_CONFIG_FIXED_H + + #for $i, $arch in enumerate($archs) + #define LV_$(arch.name.upper()) $i + #end for + +-#endif /*INCLUDED_VOLK_CONFIG_FIXED*/ ++#endif /*INCLUDED_VOLK_GNSSSDR_CONFIG_FIXED*/ +diff -rupN /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_cpu.tmpl.h /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_cpu.tmpl.h +--- /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_cpu.tmpl.h 2014-10-17 04:26:39.000000000 +0200 ++++ /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_cpu.tmpl.h 2014-10-17 04:23:07.000000000 +0200 +@@ -19,8 +19,8 @@ + * Boston, MA 02110-1301, USA. 
+ */ + +-#ifndef INCLUDED_VOLK_CPU_H +-#define INCLUDED_VOLK_CPU_H ++#ifndef INCLUDED_VOLK_GNSSSDR_CPU_H ++#define INCLUDED_VOLK_GNSSSDR_CPU_H + + #include + +@@ -39,4 +39,4 @@ unsigned int volk_gnsssdr_get_lvarch (); + + __VOLK_DECL_END + +-#endif /*INCLUDED_VOLK_CPU_H*/ ++#endif /*INCLUDED_VOLK_GNSSSDR_CPU_H*/ +diff -rupN /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_machines.tmpl.h /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_machines.tmpl.h +--- /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_machines.tmpl.h 2014-10-17 04:26:39.000000000 +0200 ++++ /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_machines.tmpl.h 2014-10-17 04:23:16.000000000 +0200 +@@ -19,8 +19,8 @@ + * Boston, MA 02110-1301, USA. + */ + +-#ifndef INCLUDED_LIBVOLK_MACHINES_H +-#define INCLUDED_LIBVOLK_MACHINES_H ++#ifndef INCLUDED_LIBVOLK_GNSSSDR_MACHINES_H ++#define INCLUDED_LIBVOLK_GNSSSDR_MACHINES_H + + #include + #include +@@ -52,4 +52,4 @@ extern struct volk_gnsssdr_machine volk_ + + __VOLK_DECL_END + +-#endif //INCLUDED_LIBVOLK_MACHINES_H ++#endif //INCLUDED_LIBVOLK_GNSSSDR_MACHINES_H +diff -rupN /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_typedefs.tmpl.h /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_typedefs.tmpl.h +--- /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_typedefs.tmpl.h 2014-10-17 04:26:39.000000000 +0200 ++++ /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_typedefs.tmpl.h 2014-10-17 04:23:23.000000000 +0200 +@@ -19,8 +19,8 @@ + * Boston, MA 02110-1301, USA. 
+ */ + +-#ifndef INCLUDED_VOLK_TYPEDEFS +-#define INCLUDED_VOLK_TYPEDEFS ++#ifndef INCLUDED_VOLK_GNSSSDR_TYPEDEFS ++#define INCLUDED_VOLK_GNSSSDR_TYPEDEFS + + #include + #include +@@ -29,4 +29,4 @@ + typedef void (*$(kern.pname))($kern.arglist_types); + #end for + +-#endif /*INCLUDED_VOLK_TYPEDEFS*/ ++#endif /*INCLUDED_VOLK_GNSSSDR_TYPEDEFS*/ diff --git a/src/algorithms/libs/volk_gnsssdr/patches for generating volk_gnsssdr/2014-10-17_Patch_with_protokernels.patch b/src/algorithms/libs/volk_gnsssdr/patches for generating volk_gnsssdr/2014-10-17_Patch_with_protokernels.patch new file mode 100644 index 000000000..82bb1f5ac --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/patches for generating volk_gnsssdr/2014-10-17_Patch_with_protokernels.patch @@ -0,0 +1,57094 @@ +Binary files /Users/andres/Desktop/volk_gnsssdr/.DS_Store and /Users/andres/Desktop/volk_gnsssdr_original/.DS_Store differ +diff -rupN /Users/andres/Desktop/volk_gnsssdr/apps/volk_gnsssdr_profile.cc /Users/andres/Desktop/volk_gnsssdr_original/apps/volk_gnsssdr_profile.cc +--- /Users/andres/Desktop/volk_gnsssdr/apps/volk_gnsssdr_profile.cc 2014-10-17 05:07:25.000000000 +0200 ++++ /Users/andres/Desktop/volk_gnsssdr_original/apps/volk_gnsssdr_profile.cc 2014-10-17 05:01:21.000000000 +0200 +@@ -39,7 +39,7 @@ namespace fs = boost::filesystem; + + void write_json(std::ofstream &json_file, std::vector results) { + json_file << "{" << std::endl; +- json_file << " \"volk_gnsssdr_tests\": [" << std::endl; ++ json_file << " \"volk_tests\": [" << std::endl; + size_t len = results.size(); + size_t i = 0; + BOOST_FOREACH(volk_gnsssdr_test_results_t &result, results) { +@@ -48,9 +48,9 @@ void write_json(std::ofstream &json_file + json_file << " \"vlen\": " << result.vlen << "," << std::endl; + json_file << " \"iter\": " << result.iter << "," << std::endl; + json_file << " \"best_arch_a\": \"" << result.best_arch_a +- << "\"," << std::endl; ++ << "\"," << std::endl; + json_file << " \"best_arch_u\": \"" << 
result.best_arch_u +- << "\"," << std::endl; ++ << "\"," << std::endl; + json_file << " \"results\": {" << std::endl; + size_t results_len = result.results.size(); + size_t ri = 0; +@@ -84,26 +84,26 @@ int main(int argc, char *argv[]) { + // Adding program options + boost::program_options::options_description desc("Options"); + desc.add_options() +- ("help,h", "Print help messages") +- ("benchmark,b", +- boost::program_options::value()->default_value( false ) +- ->implicit_value( true ), +- "Run all kernels (benchmark mode)") +- ("tests-regex,R", +- boost::program_options::value(), +- "Run tests matching regular expression.") +- ("json,j", +- boost::program_options::value(), +- "JSON output file") +- ; +- ++ ("help,h", "Print help messages") ++ ("benchmark,b", ++ boost::program_options::value()->default_value( false ) ++ ->implicit_value( true ), ++ "Run all kernels (benchmark mode)") ++ ("tests-regex,R", ++ boost::program_options::value(), ++ "Run tests matching regular expression.") ++ ("json,j", ++ boost::program_options::value(), ++ "JSON output file") ++ ; ++ + // Handle the options that were given + boost::program_options::variables_map vm; + bool benchmark_mode; + std::string kernel_regex; + bool store_results = true; + std::ofstream json_file; +- ++ + try { + boost::program_options::store(boost::program_options::parse_command_line(argc, argv, desc), vm); + boost::program_options::notify(vm); +@@ -123,20 +123,20 @@ int main(int argc, char *argv[]) { + return 1; + } + /** --help option +-*/ ++ */ + if ( vm.count("help") ) + { +- std::cout << "The VOLK profiler." << std::endl +- << desc << std::endl; +- return 0; ++ std::cout << "The VOLK profiler." 
<< std::endl ++ << desc << std::endl; ++ return 0; + } +- ++ + if ( vm.count("json") ) + { + json_file.open( vm["json"].as().c_str() ); + } +- +- ++ ++ + // Run tests + std::vector results; + +@@ -152,36 +152,84 @@ int main(int argc, char *argv[]) { + //VOLK_PROFILE(volk_gnsssdr_32u_popcnt, 0, 0, 2046, 10000, &results, benchmark_mode, kernel_regex); + //VOLK_PROFILE(volk_gnsssdr_64u_popcnt, 0, 0, 2046, 10000, &results, benchmark_mode, kernel_regex); + //VOLK_PROFILE(volk_gnsssdr_32fc_s32fc_multiply_32fc, 1e-4, lv_32fc_t(1.0, 0.5), 204602, 1000, &results, benchmark_mode, kernel_regex); +- ++ ++ //GNSS-SDR PROTO-KERNELS ++ //lv_32fc_t sfv = lv_cmake((float)1, (float)2); ++ //example: VOLK_PROFILE(volk_gnsssdr_8ic_s8ic_multiply_8ic, 1e-4, sfv, 204602, 1000, &results, benchmark_mode, kernel_regex); ++ ++ //CAN NOT BE TESTED YET BECAUSE VOLK MODULE DOES NOT SUPPORT IT: ++ //VOLK_PROFILE(volk_gnsssdr_s32f_x2_update_local_carrier_32fc, 1e-4, 0, 16007, 1, &results, benchmark_mode, kernel_regex); ++ //VOLK_PROFILE(volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc, 1e-4, 0, 7, 1, &results, benchmark_mode, kernel_regex); ++ ++ VOLK_PROFILE(volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex); ++ VOLK_PROFILE(volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex); ++ VOLK_PROFILE(volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex); ++ VOLK_PROFILE(volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex); ++ ++ VOLK_PROFILE(volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex); ++ VOLK_PROFILE(volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex); ++ VOLK_PROFILE(volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex); ++ 
++ VOLK_PROFILE(volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex); ++ VOLK_PROFILE(volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex); ++ VOLK_PROFILE(volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex); ++ VOLK_PROFILE(volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex); ++ ++ VOLK_PROFILE(volk_gnsssdr_32fc_convert_16ic, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex); ++ VOLK_PROFILE(volk_gnsssdr_32fc_convert_8ic, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex); ++ VOLK_PROFILE(volk_gnsssdr_32fc_s32f_convert_8ic, 1e-4, 5, 16000, 250, &results, benchmark_mode, kernel_regex); ++ ++ /*VOLK_PROFILE(volk_gnsssdr_32f_accumulator_s32f, 1e-4, 0, 204602, 10000, &results, benchmark_mode, kernel_regex); ++ VOLK_PROFILE(volk_gnsssdr_8i_accumulator_s8i, 1e-4, 0, 204602, 10000, &results, benchmark_mode, kernel_regex); ++ VOLK_PROFILE(volk_gnsssdr_32f_index_max_16u, 3, 0, 204602, 5000, &results, benchmark_mode, kernel_regex); ++ VOLK_PROFILE(volk_gnsssdr_8i_index_max_16u, 3, 0, 204602, 5000, &results, benchmark_mode, kernel_regex); ++ VOLK_PROFILE(volk_gnsssdr_8i_max_s8i, 3, 0, 204602, 5000, &results, benchmark_mode, kernel_regex); ++ VOLK_PROFILE(volk_gnsssdr_32f_x2_add_32f, 1e-4, 0, 204602, 10000, &results, benchmark_mode, kernel_regex); ++ VOLK_PROFILE(volk_gnsssdr_8i_x2_add_8i, 1e-4, 0, 204602, 10000, &results, benchmark_mode, kernel_regex); ++ VOLK_PROFILE(volk_gnsssdr_32fc_conjugate_32fc, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); ++ VOLK_PROFILE(volk_gnsssdr_8ic_conjugate_8ic, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); ++ VOLK_PROFILE(volk_gnsssdr_32fc_magnitude_squared_32f, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); ++ VOLK_PROFILE(volk_gnsssdr_8ic_magnitude_squared_8i, 1e-4, 0, 
204602, 1000, &results, benchmark_mode, kernel_regex); ++ VOLK_PROFILE(volk_gnsssdr_32fc_s32fc_multiply_32fc, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); ++ VOLK_PROFILE(volk_gnsssdr_8ic_s8ic_multiply_8ic, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); ++ VOLK_PROFILE(volk_gnsssdr_32fc_x2_dot_prod_32fc, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); ++ VOLK_PROFILE(volk_gnsssdr_8ic_x2_dot_prod_8ic, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); ++ VOLK_PROFILE(volk_gnsssdr_32fc_x2_multiply_32fc, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); ++ VOLK_PROFILE(volk_gnsssdr_8ic_x2_multiply_8ic, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); ++ VOLK_PROFILE(volk_gnsssdr_8u_x2_multiply_8u, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); ++ VOLK_PROFILE(volk_gnsssdr_64f_accumulator_64f, 1e-4, 0, 16000, 1000, &results, benchmark_mode, kernel_regex); ++ VOLK_PROFILE(volk_gnsssdr_32f_s32f_convert_16i, 1e-4, 1, 204602, 250, &results, benchmark_mode, kernel_regex); ++ VOLK_PROFILE(volk_gnsssdr_16i_s32f_convert_32f, 1e-4, 1, 204602, 250, &results, benchmark_mode, kernel_regex);*/ ++ + // Until we can update the config on a kernel by kernel basis +- // do not overwrite volk_gnsssdr_config when using a regex. ++ // do not overwrite volk_config when using a regex. + if(store_results) { + char path[1024]; + volk_gnsssdr_get_config_path(path); +- ++ + const fs::path config_path(path); +- ++ + if (not fs::exists(config_path.branch_path())) + { + std::cout << "Creating " << config_path.branch_path() << "..." << std::endl; + fs::create_directories(config_path.branch_path()); + } +- ++ + std::cout << "Writing " << config_path << "..." 
<< std::endl; + std::ofstream config(config_path.string().c_str()); + if(!config.is_open()) { //either we don't have write access or we don't have the dir yet + std::cout << "Error opening file " << config_path << std::endl; + } +- ++ + config << "\ +-#this file is generated by volk_gnsssdr_profile.\n\ +-#the function name is followed by the preferred architecture.\n\ +-"; +- ++ #this file is generated by volk_profile.\n\ ++ #the function name is followed by the preferred architecture.\n\ ++ "; ++ + BOOST_FOREACH(volk_gnsssdr_test_results_t result, results) { + config << result.config_name << " " +- << result.best_arch_a << " " +- << result.best_arch_u << std::endl; ++ << result.best_arch_a << " " ++ << result.best_arch_u << std::endl; + } + config.close(); + } +Binary files /Users/andres/Desktop/volk_gnsssdr/kernels/.DS_Store and /Users/andres/Desktop/volk_gnsssdr_original/kernels/.DS_Store differ +diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/CommonMacros/CommonMacros.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/CommonMacros/CommonMacros.h +--- /Users/andres/Desktop/volk_gnsssdr/kernels/CommonMacros/CommonMacros.h 1970-01-01 01:00:00.000000000 +0100 ++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/CommonMacros/CommonMacros.h 2014-10-15 01:55:08.000000000 +0200 +@@ -0,0 +1,174 @@ ++/*! ++ * \file CommonMacros.h ++ * \brief Common macros used inside the volk protokernels. ++ * \authors
    ++ *
  • Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com ++ *
++ * ++ * ------------------------------------------------------------------------- ++ * ++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) ++ * ++ * GNSS-SDR is a software defined Global Navigation ++ * Satellite Systems receiver ++ * ++ * This file is part of GNSS-SDR. ++ * ++ * GNSS-SDR is free software: you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation, either version 3 of the License, or ++ * (at your option) any later version. ++ * ++ * GNSS-SDR is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with GNSS-SDR. If not, see http://www.gnu.org/licenses/. ++ * ++ * ------------------------------------------------------------------------- ++ */ ++#ifndef INCLUDED_gnsssdr_CommonMacros_u_H ++#define INCLUDED_gnsssdr_CommonMacros_u_H ++ ++ #ifdef LV_HAVE_SSE4_1 ++ /*! 
++ \brief Macros for U_SSE4_1 ++ */ ++ ++ #ifndef CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1 ++ #define CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(input1, input2, real, imag)\ ++ imag = _mm_srli_si128 (input1, 2);\ ++ imag = _mm_blend_epi16 (input2, imag, 85);\ ++ real = _mm_slli_si128 (input2, 2);\ ++ real = _mm_blend_epi16 (real, input1, 85); ++ #endif /* CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1 */ ++ ++ #ifndef CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1 ++ #define CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(input, input_i_1, input_i_2, output_i32, output_ps)\ ++ input_i_1 = _mm_cvtepi16_epi32(input);\ ++ input = _mm_srli_si128 (input, 8);\ ++ input_i_2 = _mm_cvtepi16_epi32(input);\ ++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2);\ ++ output_ps = _mm_cvtepi32_ps(output_i32); ++ #endif /* CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1 */ ++ ++ #ifndef CM_8IC_CONVERT_AND_ACC_32FC_U_SSE4_1 ++ #define CM_8IC_CONVERT_AND_ACC_32FC_U_SSE4_1(input, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)\ ++ input_i_1 = _mm_cvtepi8_epi32(input);\ ++ input = _mm_srli_si128 (input, 4);\ ++ input_i_2 = _mm_cvtepi8_epi32(input);\ ++ input = _mm_srli_si128 (input, 4);\ ++ output_i32_1 = _mm_add_epi32 (input_i_1, input_i_2);\ ++ input_i_1 = _mm_cvtepi8_epi32(input);\ ++ input = _mm_srli_si128 (input, 4);\ ++ input_i_2 = _mm_cvtepi8_epi32(input);\ ++ input = _mm_srli_si128 (input, 4);\ ++ output_i32_2 = _mm_add_epi32 (input_i_1, input_i_2);\ ++ output_i32 = _mm_add_epi32 (output_i32_1, output_i32_2);\ ++ output_ps = _mm_cvtepi32_ps(output_i32); ++ #endif /* CM_8IC_CONVERT_AND_ACC_32FC_U_SSE4_1 */ ++ ++ #endif /* LV_HAVE_SSE4_1 */ ++ ++ #ifdef LV_HAVE_SSE2 ++ /*! ++ \brief Macros for U_SSE2 ++ */ ++ ++ #ifdef LV_HAVE_SSSE3 ++ /*! 
++ \brief Macros for U_SSSE3 ++ */ ++ ++ #ifndef CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3 ++ #define CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3(y, x, check_sign_sequence, rearrange_sequence, y_aux, x_abs, real_output, imag_output)\ ++ y_aux = _mm_sign_epi8 (y, x);\ ++ y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence);\ ++ real_output = _mm_maddubs_epi16 (x_abs, y_aux);\ ++ \ ++ y_aux = _mm_shuffle_epi8 (y, rearrange_sequence);\ ++ y_aux = _mm_sign_epi8 (y_aux, x);\ ++ imag_output = _mm_maddubs_epi16 (x_abs, y_aux); ++ #endif /* CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3 */ ++ ++ #endif /* LV_HAVE_SSSE3 */ ++ ++ #ifndef CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2 ++ #define CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output)\ ++ realx_mult_realy = _mm_mullo_epi16 (realx, realy);\ ++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);\ ++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);\ ++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);\ ++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);\ ++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++ #endif /* CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2 */ ++ ++ #ifndef CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2 ++ #define CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(input, mult1, real, imag)\ ++ imag = _mm_srli_si128 (input, 1);\ ++ imag = _mm_and_si128 (imag, mult1);\ ++ real = _mm_and_si128 (input, mult1); ++ #endif /* CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2 */ ++ ++ #ifndef CM_8IC_CONVERT_AND_ACC_32FC_U_SSE2 ++ #define CM_8IC_CONVERT_AND_ACC_32FC_U_SSE2(input, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)\ ++ input_i_1 = _mm_unpacklo_epi8(_mm_setzero_si128(), input);\ ++ input_i_2 = _mm_unpacklo_epi16(_mm_setzero_si128(), input_i_1);\ ++ input_i_1 = _mm_unpackhi_epi16(_mm_setzero_si128(), input_i_1);\ ++ input_i_1 = 
_mm_srai_epi32(input_i_1, 24);\ ++ input_i_2 = _mm_srai_epi32(input_i_2, 24);\ ++ output_i32 = _mm_add_epi32(input_i_1, input_i_2);\ ++ output_ps_1 = _mm_cvtepi32_ps(output_i32);\ ++ \ ++ input_i_1 = _mm_unpackhi_epi8(_mm_setzero_si128(), input);\ ++ input_i_2 = _mm_unpacklo_epi16(_mm_setzero_si128(), input_i_1);\ ++ input_i_1 = _mm_unpackhi_epi16(_mm_setzero_si128(), input_i_1);\ ++ input_i_1 = _mm_srai_epi32(input_i_1, 24);\ ++ input_i_2 = _mm_srai_epi32(input_i_2, 24);\ ++ output_i32 = _mm_add_epi32(input_i_1, input_i_2);\ ++ output_ps_2 = _mm_cvtepi32_ps(output_i32); ++ #endif /* CM_8IC_CONVERT_AND_ACC_32FC_U_SSE2 */ ++ ++ #ifndef CM_8IC_CONTROLMINUS128_8IC_U_SSE2 ++ #define CM_8IC_CONTROLMINUS128_8IC_U_SSE2(y, minus128, minus128control)\ ++ minus128control = _mm_cmpeq_epi8 (y, minus128);\ ++ y = _mm_sub_epi8 (y, minus128control); ++ #endif /* CM_8IC_CONTROLMINUS128_8IC_U_SSE2 */ ++ ++ #endif /* LV_HAVE_SSE2 */ ++ ++ #ifdef LV_HAVE_GENERIC ++ /*! ++ \brief Macros for U_GENERIC ++ */ ++ ++ #endif /* LV_HAVE_GENERIC */ ++#endif /* INCLUDED_gnsssdr_CommonMacros_u_H */ ++ ++ ++#ifndef INCLUDED_gnsssdr_CommonMacros_a_H ++#define INCLUDED_gnsssdr_CommonMacros_a_H ++ ++ #ifdef LV_HAVE_SSE4_1 ++ /*! ++ \brief Macros for A_SSE4_1 ++ */ ++ ++ #endif /* LV_HAVE_SSE4_1 */ ++ ++ #ifdef LV_HAVE_SSE2 ++ /*! ++ \brief Macros for U_SSE2 ++ */ ++ ++ #endif /* LV_HAVE_SSE2 */ ++ ++ #ifdef LV_HAVE_GENERIC ++ /*! 
++ \brief Macros for A_GENERIC ++ */ ++ ++ #endif /* LV_HAVE_GENERIC */ ++#endif /* INCLUDED_gnsssdr_CommonMacros_a_H */ +diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/CommonMacros/CommonMacros_16ic_cw_epl_corr_32fc.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/CommonMacros/CommonMacros_16ic_cw_epl_corr_32fc.h +--- /Users/andres/Desktop/volk_gnsssdr/kernels/CommonMacros/CommonMacros_16ic_cw_epl_corr_32fc.h 1970-01-01 01:00:00.000000000 +0100 ++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/CommonMacros/CommonMacros_16ic_cw_epl_corr_32fc.h 2014-10-15 01:55:08.000000000 +0200 +@@ -0,0 +1,76 @@ ++/*! ++ * \file CommonMacros_16ic_cw_corr_32fc.h ++ * \brief Common macros used inside the 16ic_cw_corr_32fc volk protokernels. ++ * \authors
    ++ *
  • Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com ++ *
++ * ++ * ------------------------------------------------------------------------- ++ * ++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) ++ * ++ * GNSS-SDR is a software defined Global Navigation ++ * Satellite Systems receiver ++ * ++ * This file is part of GNSS-SDR. ++ * ++ * GNSS-SDR is free software: you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation, either version 3 of the License, or ++ * (at your option) any later version. ++ * ++ * GNSS-SDR is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with GNSS-SDR. If not, see http://www.gnu.org/licenses/. ++ * ++ * ------------------------------------------------------------------------- ++ */ ++#ifndef INCLUDED_gnsssdr_CommonMacros_16ic_cw_corr_32fc_u_H ++#define INCLUDED_gnsssdr_CommonMacros_16ic_cw_corr_32fc_u_H ++#include "CommonMacros/CommonMacros.h" ++ ++ #ifdef LV_HAVE_SSE4_1 ++ /*! 
++ \brief Macros for U_SSE4_1 ++ */ ++ ++ #ifndef CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1 ++ #define CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)\ ++ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy)\ ++ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(real_bb_signal_sample, imag_bb_signal_sample, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output)\ ++ CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(real_output, input_i_1, input_i_2, output_i32, real_output_ps)\ ++ CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(imag_output, input_i_1, input_i_2, output_i32, imag_output_ps) ++ #endif /* CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1 */ ++ ++ #endif /* LV_HAVE_SSE4_1 */ ++ ++ #ifdef LV_HAVE_GENERIC ++ /*! ++ \brief Macros for U_GENERIC ++ */ ++ ++ #endif /* LV_HAVE_GENERIC */ ++#endif /* INCLUDED_gnsssdr_CommonMacros_16ic_cw_corr_32fc_u_H */ ++ ++ ++#ifndef INCLUDED_gnsssdr_CommonMacros_16ic_cw_corr_32fc_a_H ++#define INCLUDED_gnsssdr_CommonMacros_16ic_cw_corr_32fc_a_H ++ ++ #ifdef LV_HAVE_SSE4_1 ++ /*! ++ \brief Macros for A_SSE4_1 ++ */ ++ ++ #endif /* LV_HAVE_SSE4_1 */ ++ ++ #ifdef LV_HAVE_GENERIC ++ /*! 
++ \brief Macros for A_GENERIC ++ */ ++ ++ #endif /* LV_HAVE_GENERIC */ ++#endif /* INCLUDED_gnsssdr_CommonMacros_16ic_cw_corr_32fc_a_H */ +diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h +--- /Users/andres/Desktop/volk_gnsssdr/kernels/CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h 1970-01-01 01:00:00.000000000 +0100 ++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h 2014-10-15 01:55:08.000000000 +0200 +@@ -0,0 +1,114 @@ ++/*! ++ * \file CommonMacros_8ic_cw_corr_32fc.h ++ * \brief Common macros used inside the 8ic_cw_corr_32fc volk protokernels. ++ * \authors
    ++ *
  • Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com ++ *
++ * ++ * ------------------------------------------------------------------------- ++ * ++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) ++ * ++ * GNSS-SDR is a software defined Global Navigation ++ * Satellite Systems receiver ++ * ++ * This file is part of GNSS-SDR. ++ * ++ * GNSS-SDR is free software: you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation, either version 3 of the License, or ++ * (at your option) any later version. ++ * ++ * GNSS-SDR is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with GNSS-SDR. If not, see http://www.gnu.org/licenses/. ++ * ++ * ------------------------------------------------------------------------- ++ */ ++#ifndef INCLUDED_gnsssdr_CommonMacros_8ic_cw_corr_32fc_u_H ++#define INCLUDED_gnsssdr_CommonMacros_8ic_cw_corr_32fc_u_H ++#include "CommonMacros/CommonMacros.h" ++ ++ #ifdef LV_HAVE_SSE4_1 ++ /*! 
++ \brief Macros for U_SSE4_1 ++ */ ++ ++ #ifndef CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1 ++ #define CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)\ ++ CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)\ ++ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(real_bb_signal_sample, imag_bb_signal_sample, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output)\ ++ \ ++ imag_output = _mm_slli_si128 (imag_output, 1);\ ++ output = _mm_blendv_epi8 (imag_output, real_output, mult1);\ ++ \ ++ CM_8IC_CONVERT_AND_ACC_32FC_U_SSE4_1(output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps) ++ #endif /* CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1 */ ++ ++ #ifndef CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1 ++ #define CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)\ ++ CM_8IC_CONTROLMINUS128_8IC_U_SSE2(y, minus128, minus128control)\ ++ CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output)\ ++ CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(real_output, input_i_1, input_i_2, output_i32, real_output_ps)\ ++ CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(imag_output, input_i_1, input_i_2, output_i32, imag_output_ps) ++ #endif /* CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1 */ ++ ++ #ifndef CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1 ++ #define CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, 
real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)\ ++ CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output)\ ++ CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(real_output, input_i_1, input_i_2, output_i32, real_output_ps)\ ++ CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(imag_output, input_i_1, input_i_2, output_i32, imag_output_ps) ++ #endif /* CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1 */ ++ ++ #endif /* LV_HAVE_SSE4_1 */ ++ ++ #ifdef LV_HAVE_SSE2 ++ /*! ++ \brief Macros for U_SSE2 ++ */ ++ ++ #ifndef CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2 ++ #define CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)\ ++ CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)\ ++ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(real_bb_signal_sample, imag_bb_signal_sample, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output)\ ++ \ ++ real_output = _mm_and_si128 (real_output, mult1);\ ++ imag_output = _mm_and_si128 (imag_output, mult1);\ ++ imag_output = _mm_slli_si128 (imag_output, 1);\ ++ output = _mm_or_si128 (real_output, imag_output);\ ++ \ ++ CM_8IC_CONVERT_AND_ACC_32FC_U_SSE2(output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2) ++ #endif /* CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2 */ ++ ++ #endif /* LV_HAVE_SSE2 */ ++ ++ #ifdef LV_HAVE_GENERIC ++ /*! ++ \brief Macros for U_GENERIC ++ */ ++ ++ #endif /* LV_HAVE_GENERIC */ ++#endif /* INCLUDED_gnsssdr_CommonMacros_8ic_cw_corr_32fc_u_H */ ++ ++ ++#ifndef INCLUDED_gnsssdr_CommonMacros_8ic_cw_corr_32fc_a_H ++#define INCLUDED_gnsssdr_CommonMacros_8ic_cw_corr_32fc_a_H ++ ++ #ifdef LV_HAVE_SSE4_1 ++ /*! 
++ \brief Macros for A_SSE4_1 ++ */ ++ ++ #endif /* LV_HAVE_SSE4_1 */ ++ ++ #ifdef LV_HAVE_GENERIC ++ /*! ++ \brief Macros for A_GENERIC ++ */ ++ ++ #endif /* LV_HAVE_GENERIC */ ++#endif /* INCLUDED_gnsssdr_CommonMacros_8ic_cw_corr_32fc_a_H */ +diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/CommonMacros/README.txt /Users/andres/Desktop/volk_gnsssdr_original/kernels/CommonMacros/README.txt +--- /Users/andres/Desktop/volk_gnsssdr/kernels/CommonMacros/README.txt 1970-01-01 01:00:00.000000000 +0100 ++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/CommonMacros/README.txt 2014-10-15 01:55:08.000000000 +0200 +@@ -0,0 +1,34 @@ ++#################################################################### ++Common Macros inside volk_gnsssdr module ++#################################################################### ++ ++First of all, sorry for making you need to read this: macros are evil, they can not be debugged, you do not know where the errors come from, syntax is annoying.. BUT this is the only way I found that allows to share one piece of code between various proto-kernels without performance penalties. ++Inline functions have been tested, and they introduce a really small time penalty, but it becomes huge because of long loops, with thousands of samples. ++ ++#################################################################### ++Syntax ++#################################################################### ++ ++In order to allow better understanding of the code I created the macros with an specific syntax. ++ ++1) Inside CommonMacros.h you will find macros for common operations. I will explain the syntax with an example: ++ ++example: CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output) ++ ++First of all, you find the characters “CM”, which means CommonMacros. 
After that, the type and the number of inputs are given: “_16IC_X4” (16-bit complex integers, four inputs). The syntax for the type is the same as the one used with volk protokernels; refer to the GNU Radio documentation for more help. Then comes the name of the macro (“_SCALAR_PRODUCT”), and after that the type and the number of outputs (“_16IC_X2”). Finally, the minimum SSE version needed to run is given (“_U_SSE2”). In the arguments you will find (from left to right) the inputs (four inputs: realx, imagx, realy, imagy), some variables that the macro needs to work (realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy) and finally the outputs (two outputs: real_output, imag_output). ++The variables that the macro needs are specified when calling it in order to avoid after-compile problems: if you want to use a macro you will need to declare all the variables it needs beforehand, or you will not be able to compile. ++ ++2) Inside all the other headers, CommonMacros_XXXXXX.h, you will find macros for a specific group of proto-kernels. The syntax is the same as in CommonMacros.h. ++ ++#################################################################### ++Workflow ++#################################################################### ++ ++In order to use the macros easily, I usually test the code without macros inside a testing proto-kernel, where you are able to test it, debug it and use breakpoints. ++When it works I place the code inside a macro and I test it again. ++ ++#################################################################### ++Why macros ++#################################################################### ++1) They are the only way I could find for sharing code between proto-kernels without performance penalty. ++2) It is true that they are really difficult to debug, but if you work with them responsibly it is not so hard. 
Volk_gnsssdr checks all the SSE proto-kernels implementations results against the generic implementation results, so if your macro is not working you will appreciate it after profiling it. +\ No newline at end of file +diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16i_s32f_convert_32f.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_16i_s32f_convert_32f.h +--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16i_s32f_convert_32f.h 1970-01-01 01:00:00.000000000 +0100 ++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_16i_s32f_convert_32f.h 2014-10-15 01:55:08.000000000 +0200 +@@ -0,0 +1,241 @@ ++#ifndef INCLUDED_volk_gnsssdr_16i_s32f_convert_32f_u_H ++#define INCLUDED_volk_gnsssdr_16i_s32f_convert_32f_u_H ++ ++#include ++#include ++ ++#ifdef LV_HAVE_SSE4_1 ++#include ++ ++ /*! ++ \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value ++ \param inputVector The 16 bit input data buffer ++ \param outputVector The floating point output data buffer ++ \param scalar The value divided against each point in the output buffer ++ \param num_points The number of data values to be converted ++ \note Output buffer does NOT need to be properly aligned ++ */ ++static inline void volk_gnsssdr_16i_s32f_convert_32f_u_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ ++ unsigned int number = 0; ++ const unsigned int eighthPoints = num_points / 8; ++ ++ float* outputVectorPtr = outputVector; ++ __m128 invScalar = _mm_set_ps1(1.0/scalar); ++ int16_t* inputPtr = (int16_t*)inputVector; ++ __m128i inputVal; ++ __m128i inputVal2; ++ __m128 ret; ++ ++ for(;number < eighthPoints; number++){ ++ ++ // Load the 8 values ++ inputVal = _mm_loadu_si128((__m128i*)inputPtr); ++ ++ // Shift the input data to the right by 64 bits ( 8 bytes ) ++ 
inputVal2 = _mm_srli_si128(inputVal, 8); ++ ++ // Convert the lower 4 values into 32 bit words ++ inputVal = _mm_cvtepi16_epi32(inputVal); ++ inputVal2 = _mm_cvtepi16_epi32(inputVal2); ++ ++ ret = _mm_cvtepi32_ps(inputVal); ++ ret = _mm_mul_ps(ret, invScalar); ++ _mm_storeu_ps(outputVectorPtr, ret); ++ outputVectorPtr += 4; ++ ++ ret = _mm_cvtepi32_ps(inputVal2); ++ ret = _mm_mul_ps(ret, invScalar); ++ _mm_storeu_ps(outputVectorPtr, ret); ++ ++ outputVectorPtr += 4; ++ ++ inputPtr += 8; ++ } ++ ++ number = eighthPoints * 8; ++ for(; number < num_points; number++){ ++ outputVector[number] =((float)(inputVector[number])) / scalar; ++ } ++} ++#endif /* LV_HAVE_SSE4_1 */ ++ ++#ifdef LV_HAVE_SSE ++#include ++ ++ /*! ++ \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value ++ \param inputVector The 16 bit input data buffer ++ \param outputVector The floating point output data buffer ++ \param scalar The value divided against each point in the output buffer ++ \param num_points The number of data values to be converted ++ \note Output buffer does NOT need to be properly aligned ++ */ ++static inline void volk_gnsssdr_16i_s32f_convert_32f_u_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ ++ unsigned int number = 0; ++ const unsigned int quarterPoints = num_points / 4; ++ ++ float* outputVectorPtr = outputVector; ++ __m128 invScalar = _mm_set_ps1(1.0/scalar); ++ int16_t* inputPtr = (int16_t*)inputVector; ++ __m128 ret; ++ ++ for(;number < quarterPoints; number++){ ++ ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0])); ++ ++ ret = _mm_mul_ps(ret, invScalar); ++ _mm_storeu_ps(outputVectorPtr, ret); ++ ++ inputPtr += 4; ++ outputVectorPtr += 4; ++ } ++ ++ number = quarterPoints * 4; ++ for(; number < num_points; number++){ ++ outputVector[number] = (float)(inputVector[number]) / 
scalar; ++ } ++} ++#endif /* LV_HAVE_SSE */ ++ ++#ifdef LV_HAVE_GENERIC ++ /*! ++ \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value ++ \param inputVector The 16 bit input data buffer ++ \param outputVector The floating point output data buffer ++ \param scalar The value divided against each point in the output buffer ++ \param num_points The number of data values to be converted ++ \note Output buffer does NOT need to be properly aligned ++ */ ++static inline void volk_gnsssdr_16i_s32f_convert_32f_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ ++ float* outputVectorPtr = outputVector; ++ const int16_t* inputVectorPtr = inputVector; ++ unsigned int number = 0; ++ ++ for(number = 0; number < num_points; number++){ ++ *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar; ++ } ++} ++#endif /* LV_HAVE_GENERIC */ ++ ++ ++ ++ ++#endif /* INCLUDED_volk_gnsssdr_16i_s32f_convert_32f_u_H */ ++#ifndef INCLUDED_volk_gnsssdr_16i_s32f_convert_32f_a_H ++#define INCLUDED_volk_gnsssdr_16i_s32f_convert_32f_a_H ++ ++#include ++#include ++ ++#ifdef LV_HAVE_SSE4_1 ++#include ++ ++ /*! 
++ \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value ++ \param inputVector The 16 bit input data buffer ++ \param outputVector The floating point output data buffer ++ \param scalar The value divided against each point in the output buffer ++ \param num_points The number of data values to be converted ++ */ ++static inline void volk_gnsssdr_16i_s32f_convert_32f_a_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ ++ unsigned int number = 0; ++ const unsigned int eighthPoints = num_points / 8; ++ ++ float* outputVectorPtr = outputVector; ++ __m128 invScalar = _mm_set_ps1(1.0/scalar); ++ int16_t* inputPtr = (int16_t*)inputVector; ++ __m128i inputVal; ++ __m128i inputVal2; ++ __m128 ret; ++ ++ for(;number < eighthPoints; number++){ ++ ++ // Load the 8 values ++ inputVal = _mm_loadu_si128((__m128i*)inputPtr); ++ ++ // Shift the input data to the right by 64 bits ( 8 bytes ) ++ inputVal2 = _mm_srli_si128(inputVal, 8); ++ ++ // Convert the lower 4 values into 32 bit words ++ inputVal = _mm_cvtepi16_epi32(inputVal); ++ inputVal2 = _mm_cvtepi16_epi32(inputVal2); ++ ++ ret = _mm_cvtepi32_ps(inputVal); ++ ret = _mm_mul_ps(ret, invScalar); ++ _mm_storeu_ps(outputVectorPtr, ret); ++ outputVectorPtr += 4; ++ ++ ret = _mm_cvtepi32_ps(inputVal2); ++ ret = _mm_mul_ps(ret, invScalar); ++ _mm_storeu_ps(outputVectorPtr, ret); ++ ++ outputVectorPtr += 4; ++ ++ inputPtr += 8; ++ } ++ ++ number = eighthPoints * 8; ++ for(; number < num_points; number++){ ++ outputVector[number] =((float)(inputVector[number])) / scalar; ++ } ++} ++#endif /* LV_HAVE_SSE4_1 */ ++ ++#ifdef LV_HAVE_SSE ++#include ++ ++ /*! 
++ \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value ++ \param inputVector The 16 bit input data buffer ++ \param outputVector The floating point output data buffer ++ \param scalar The value divided against each point in the output buffer ++ \param num_points The number of data values to be converted ++ */ ++static inline void volk_gnsssdr_16i_s32f_convert_32f_a_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ ++ unsigned int number = 0; ++ const unsigned int quarterPoints = num_points / 4; ++ ++ float* outputVectorPtr = outputVector; ++ __m128 invScalar = _mm_set_ps1(1.0/scalar); ++ int16_t* inputPtr = (int16_t*)inputVector; ++ __m128 ret; ++ ++ for(;number < quarterPoints; number++){ ++ ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0])); ++ ++ ret = _mm_mul_ps(ret, invScalar); ++ _mm_storeu_ps(outputVectorPtr, ret); ++ ++ inputPtr += 4; ++ outputVectorPtr += 4; ++ } ++ ++ number = quarterPoints * 4; ++ for(; number < num_points; number++){ ++ outputVector[number] = (float)(inputVector[number]) / scalar; ++ } ++} ++#endif /* LV_HAVE_SSE */ ++ ++#ifdef LV_HAVE_GENERIC ++ /*! 
++ \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value ++ \param inputVector The 16 bit input data buffer ++ \param outputVector The floating point output data buffer ++ \param scalar The value divided against each point in the output buffer ++ \param num_points The number of data values to be converted ++ */ ++static inline void volk_gnsssdr_16i_s32f_convert_32f_a_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ ++ float* outputVectorPtr = outputVector; ++ const int16_t* inputVectorPtr = inputVector; ++ unsigned int number = 0; ++ ++ for(number = 0; number < num_points; number++){ ++ *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar; ++ } ++} ++#endif /* LV_HAVE_GENERIC */ ++ ++ ++ ++ ++#endif /* INCLUDED_volk_gnsssdr_16i_s32f_convert_32f_a_H */ +diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3.h +--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3.h 1970-01-01 01:00:00.000000000 +0100 ++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3.h 2014-10-15 01:55:08.000000000 +0200 +@@ -0,0 +1,461 @@ ++/*! ++ * \file volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3.h ++ * \brief Volk protokernel: performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation with 32 bits vectors ++ * \authors
++ *          <ul>
++ *          <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
++ *          </ul>
++ *
++ * Volk protokernel that performs the carrier wipe-off mixing and the
++ * Early, Prompt, and Late correlation with 32 bits vectors (16 bits the
++ * real part and 16 bits the imaginary part):
++ * - The carrier wipe-off is done by multiplying the input signal by the
++ * carrier (multiplication of 32 bits vectors). It returns the input
++ * signal in base band (BB)
++ * - Early values are calculated by multiplying the input signal in BB by the
++ * early code (multiplication of 32 bits vectors), accumulating the results
++ * - Prompt values are calculated by multiplying the input signal in BB by the
++ * prompt code (multiplication of 32 bits vectors), accumulating the results
++ * - Late values are calculated by multiplying the input signal in BB by the
++ * late code (multiplication of 32 bits vectors), accumulating the results
++ *
++ * -------------------------------------------------------------------------
++ *
++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
++ *
++ * GNSS-SDR is a software defined Global Navigation
++ * Satellite Systems receiver
++ *
++ * This file is part of GNSS-SDR.
++ *
++ * GNSS-SDR is free software: you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation, either version 3 of the License, or
++ * (at your option) any later version.
++ *
++ * GNSS-SDR is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
++ *
++ * -------------------------------------------------------------------------
++ */
++
++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_u_H
++#define INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_u_H
++
++/* NOTE(review): the five #include directives below carry no header name in this copy of the patch; restore them from the upstream file. */
++#include
++#include
++#include
++#include
++#include
++
++#ifdef LV_HAVE_SSE4_1
++#include "smmintrin.h"
++#include "CommonMacros/CommonMacros_16ic_cw_epl_corr_32fc.h"
++#include "CommonMacros/CommonMacros.h"
++ /*!
++    \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (SSE4.1 version using unaligned loads)
++    \param input The input signal input
++    \param carrier The carrier signal input
++    \param E_code Early PRN code replica input
++    \param P_code Prompt PRN code replica input
++    \param L_code Late PRN code replica input
++    \param E_out Early correlation output
++    \param P_out Prompt correlation output
++    \param L_out Late correlation output
++    \param num_points The number of complex values in vectors
++  */
++static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_u_sse4_1(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points)
++{
++    // Every vector iteration advances each input pointer by 8 elements (two loads, 4 elements apart)
++    const unsigned int sse_iters = num_points / 8;
++
++    __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample;
++    __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output;
++
++    __m128 real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc;
++    __m128i input_i_1, input_i_2, output_i32;
++    __m128 real_output_ps, imag_output_ps;
++
++    // Scalar totals filled by the horizontal sums after the vector loop
++    float E_out_real = 0;
++    float E_out_imag = 0;
++    float P_out_real = 0;
++    float P_out_imag = 0;
++    float L_out_real = 0;
++    float L_out_imag = 0;
++
++    const lv_16sc_t* input_ptr = input;
++    const lv_16sc_t* carrier_ptr = carrier;
++
++    const lv_16sc_t* E_code_ptr = E_code;
++    lv_32fc_t* E_out_ptr = E_out;
++    const lv_16sc_t* L_code_ptr = L_code;
++    lv_32fc_t* L_out_ptr = L_out;
++    const lv_16sc_t* P_code_ptr = P_code;
++    lv_32fc_t* P_out_ptr = P_out;
++
++    // Outputs are cleared up front so the scalar tail can accumulate even when sse_iters == 0
++    *E_out_ptr = 0;
++    *P_out_ptr = 0;
++    *L_out_ptr = 0;
++
++    // NOTE(review): mult1 is not referenced directly below; presumably it is read by the CM_* macros - confirm against CommonMacros.h
++    mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
++
++    real_E_code_acc = _mm_setzero_ps();
++    imag_E_code_acc = _mm_setzero_ps();
++    real_P_code_acc = _mm_setzero_ps();
++    imag_P_code_acc = _mm_setzero_ps();
++    real_L_code_acc = _mm_setzero_ps();
++    imag_L_code_acc = _mm_setzero_ps();
++
++    if (sse_iters>0)
++    {
++        for(int number = 0;number < sse_iters; number++){
++
++            //Perform the carrier wipe-off
++            x1 = _mm_lddqu_si128((__m128i*)input_ptr);
++            input_ptr += 4;
++            x2 = _mm_lddqu_si128((__m128i*)input_ptr);
++
++            y1 = _mm_lddqu_si128((__m128i*)carrier_ptr);
++            carrier_ptr += 4;
++            y2 = _mm_lddqu_si128((__m128i*)carrier_ptr);
++
++            // The macros below are defined in the CommonMacros headers; their expansion is not visible in this file
++            CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(x1, x2, realx, imagx)
++            CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy)
++            CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
++
++            //Get early values
++            y1 = _mm_lddqu_si128((__m128i*)E_code_ptr);
++            E_code_ptr += 4;
++            y2 = _mm_lddqu_si128((__m128i*)E_code_ptr);
++
++            CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
++
++            //Adds the float 32 results
++            real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps);
++            imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps);
++
++            //Get prompt values
++            y1 = _mm_lddqu_si128((__m128i*)P_code_ptr);
++            P_code_ptr += 4;
++            y2 = _mm_lddqu_si128((__m128i*)P_code_ptr);
++
++            CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
++
++            real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps);
++            imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps);
++
++            //Get late values
++            y1 = _mm_lddqu_si128((__m128i*)L_code_ptr);
++            L_code_ptr += 4;
++            y2 = _mm_lddqu_si128((__m128i*)L_code_ptr);
++
++            CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
++
++            real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps);
++            imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps);
++
++            // Second half of the 8-element advance (the first half happened between the two loads)
++            input_ptr += 4;
++            carrier_ptr += 4;
++            E_code_ptr += 4;
++            P_code_ptr += 4;
++            L_code_ptr += 4;
++        }
++
++        __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
++        __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
++        __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
++        __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
++        __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
++        __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
++
++        _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector
++        _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector
++        _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector
++        _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector
++        _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector
++        _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector
++
++        // Horizontal sum: fold the four lanes of every accumulator into one scalar
++        for (int i = 0; i<4; ++i)
++        {
++            E_out_real += real_E_dotProductVector[i];
++            E_out_imag += imag_E_dotProductVector[i];
++            P_out_real += real_P_dotProductVector[i];
++            P_out_imag += imag_P_dotProductVector[i];
++            L_out_real += real_L_dotProductVector[i];
++            L_out_imag += imag_L_dotProductVector[i];
++        }
++        *E_out_ptr = lv_cmake(E_out_real, E_out_imag);
++        *P_out_ptr = lv_cmake(P_out_real, P_out_imag);
++        *L_out_ptr = lv_cmake(L_out_real, L_out_imag);
++    }
++
++    // Scalar tail: the remaining num_points % 8 samples are added on top of the vector result
++    lv_16sc_t bb_signal_sample;
++    for(int i=0; i < num_points%8; ++i)
++    {
++        //Perform the carrier wipe-off
++        bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
++        // Now get early, late, and prompt values for each
++        *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
++        *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
++        *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
++    }
++
++}
++#endif /* LV_HAVE_SSE4_1 */
++
++#ifdef LV_HAVE_GENERIC
++/*!
++    \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (portable scalar version)
++    \param input The input signal input
++    \param carrier The carrier signal input
++    \param E_code Early PRN code replica input
++    \param P_code Prompt PRN code replica input
++    \param L_code Late PRN code replica input
++    \param E_out Early correlation output
++    \param P_out Prompt correlation output
++    \param L_out Late correlation output
++    \param num_points The number of complex values in vectors
++ */
++static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_generic(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points)
++{
++    unsigned int n;
++
++    // The three correlators accumulate directly into the caller's outputs
++    *E_out = 0;
++    *P_out = 0;
++    *L_out = 0;
++
++    for(n = 0; n < num_points; ++n)
++    {
++        // Carrier wipe-off: the base band sample is kept as a 16 bit complex value
++        const lv_16sc_t baseband = input[n] * carrier[n];
++
++        // Each product is stored as lv_16sc_t before being widened to float complex
++        const lv_16sc_t early_product = baseband * E_code[n];
++        const lv_16sc_t prompt_product = baseband * P_code[n];
++        const lv_16sc_t late_product = baseband * L_code[n];
++
++        *E_out += (lv_32fc_t)early_product;
++        *P_out += (lv_32fc_t)prompt_product;
++        *L_out += (lv_32fc_t)late_product;
++    }
++}
++#endif /* LV_HAVE_GENERIC */
++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_u_H */
++
++
++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_a_H
++#define INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_a_H
++
++/* NOTE(review): the five #include directives below carry no header name in this copy of the patch; restore them from the upstream file. */
++#include
++#include
++#include
++#include
++#include
++
++#ifdef LV_HAVE_SSE4_1
++#include "smmintrin.h"
++#include "CommonMacros/CommonMacros_16ic_cw_epl_corr_32fc.h"
++#include "CommonMacros/CommonMacros.h"
++/*!
++    \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (SSE4.1 version using aligned loads and stores)
++    \param input The input signal input
++    \param carrier The carrier signal input
++    \param E_code Early PRN code replica input
++    \param P_code Prompt PRN code replica input
++    \param L_code Late PRN code replica input
++    \param E_out Early correlation output
++    \param P_out Prompt correlation output
++    \param L_out Late correlation output
++    \param num_points The number of complex values in vectors
++ */
++static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_a_sse4_1(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points)
++{
++    // Every vector iteration advances each input pointer by 8 elements (two loads, 4 elements apart)
++    const unsigned int sse_iters = num_points / 8;
++
++    __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample;
++    __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output;
++
++    __m128 real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc;
++    __m128i input_i_1, input_i_2, output_i32;
++    __m128 real_output_ps, imag_output_ps;
++
++    // Scalar totals filled by the horizontal sums after the vector loop
++    float E_out_real = 0;
++    float E_out_imag = 0;
++    float P_out_real = 0;
++    float P_out_imag = 0;
++    float L_out_real = 0;
++    float L_out_imag = 0;
++
++    const lv_16sc_t* input_ptr = input;
++    const lv_16sc_t* carrier_ptr = carrier;
++
++    const lv_16sc_t* E_code_ptr = E_code;
++    lv_32fc_t* E_out_ptr = E_out;
++    const lv_16sc_t* L_code_ptr = L_code;
++    lv_32fc_t* L_out_ptr = L_out;
++    const lv_16sc_t* P_code_ptr = P_code;
++    lv_32fc_t* P_out_ptr = P_out;
++
++    // Outputs are cleared up front so the scalar tail can accumulate even when sse_iters == 0
++    *E_out_ptr = 0;
++    *P_out_ptr = 0;
++    *L_out_ptr = 0;
++
++    // NOTE(review): mult1 is not referenced directly below; presumably it is read by the CM_* macros - confirm against CommonMacros.h
++    mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
++
++    real_E_code_acc = _mm_setzero_ps();
++    imag_E_code_acc = _mm_setzero_ps();
++    real_P_code_acc = _mm_setzero_ps();
++    imag_P_code_acc = _mm_setzero_ps();
++    real_L_code_acc = _mm_setzero_ps();
++    imag_L_code_acc = _mm_setzero_ps();
++
++    if (sse_iters>0)
++    {
++        for(int number = 0;number < sse_iters; number++){
++
++            //Perform the carrier wipe-off
++            // _mm_load_si128 is the aligned load: the input buffers are expected to be 16-byte aligned
++            x1 = _mm_load_si128((__m128i*)input_ptr);
++            input_ptr += 4;
++            x2 = _mm_load_si128((__m128i*)input_ptr);
++
++            y1 = _mm_load_si128((__m128i*)carrier_ptr);
++            carrier_ptr += 4;
++            y2 = _mm_load_si128((__m128i*)carrier_ptr);
++
++            // NOTE(review): the macros invoked here carry a _U_ tag in their names although this is the aligned kernel; their expansion is not visible in this file
++            CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(x1, x2, realx, imagx)
++            CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy)
++            CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
++
++            //Get early values
++            y1 = _mm_load_si128((__m128i*)E_code_ptr);
++            E_code_ptr += 4;
++            y2 = _mm_load_si128((__m128i*)E_code_ptr);
++
++            CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
++
++            //Adds the float 32 results
++            real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps);
++            imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps);
++
++            //Get prompt values
++            y1 = _mm_load_si128((__m128i*)P_code_ptr);
++            P_code_ptr += 4;
++            y2 = _mm_load_si128((__m128i*)P_code_ptr);
++
++            CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
++
++            real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps);
++            imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps);
++
++            //Get late values
++            y1 = _mm_load_si128((__m128i*)L_code_ptr);
++            L_code_ptr += 4;
++            y2 = _mm_load_si128((__m128i*)L_code_ptr);
++
++            CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
++
++            real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps);
++            imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps);
++
++            // Second half of the 8-element advance (the first half happened between the two loads)
++            input_ptr += 4;
++            carrier_ptr += 4;
++            E_code_ptr += 4;
++            P_code_ptr += 4;
++            L_code_ptr += 4;
++        }
++
++        __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
++        __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
++        __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
++        __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
++        __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
++        __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
++
++        _mm_store_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector
++        _mm_store_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector
++        _mm_store_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector
++        _mm_store_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector
++        _mm_store_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector
++        _mm_store_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector
++
++        // Horizontal sum: fold the four lanes of every accumulator into one scalar
++        for (int i = 0; i<4; ++i)
++        {
++            E_out_real += real_E_dotProductVector[i];
++            E_out_imag += imag_E_dotProductVector[i];
++            P_out_real += real_P_dotProductVector[i];
++            P_out_imag += imag_P_dotProductVector[i];
++            L_out_real += real_L_dotProductVector[i];
++            L_out_imag += imag_L_dotProductVector[i];
++        }
++        *E_out_ptr = lv_cmake(E_out_real, E_out_imag);
++        *P_out_ptr = lv_cmake(P_out_real, P_out_imag);
++        *L_out_ptr = lv_cmake(L_out_real, L_out_imag);
++    }
++
++    // Scalar tail: the remaining num_points % 8 samples are added on top of the vector result
++    lv_16sc_t bb_signal_sample;
++    for(int i=0; i < num_points%8; ++i)
++    {
++        //Perform the carrier wipe-off
++        bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
++        // Now get early, late, and prompt values for each
++        *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
++        *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
++        *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
++    }
++
++}
++#endif /* LV_HAVE_SSE4_1 */
++
++#ifdef LV_HAVE_GENERIC
++/*!
++    \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (portable scalar version)
++    \param input The input signal input
++    \param carrier The carrier signal input
++    \param E_code Early PRN code replica input
++    \param P_code Prompt PRN code replica input
++    \param L_code Late PRN code replica input
++    \param E_out Early correlation output
++    \param P_out Prompt correlation output
++    \param L_out Late correlation output
++    \param num_points The number of complex values in vectors
++ */
++static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_a_generic(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points)
++{
++    lv_16sc_t bb_signal_sample;
++    lv_16sc_t tmp1;
++    lv_16sc_t tmp2;
++    lv_16sc_t tmp3;
++
++    bb_signal_sample = lv_cmake(0, 0);
++
++    *E_out = 0;
++    *P_out = 0;
++    *L_out = 0;
++    // perform Early, Prompt and Late correlation
++
++    for(int i=0; i < num_points; ++i)
++    {
++        //Perform the carrier wipe-off
++        bb_signal_sample = input[i] * carrier[i];
++
++        // The products are held in 16 bit complex temporaries before being widened
++        tmp1 = bb_signal_sample * E_code[i];
++        tmp2 = bb_signal_sample * P_code[i];
++        tmp3 = bb_signal_sample * L_code[i];
++
++        // Now get early, late, and prompt values for each
++        *E_out += (lv_32fc_t)tmp1;
++        *P_out += (lv_32fc_t)tmp2;
++        *L_out += (lv_32fc_t)tmp3;
++    }
++}
++#endif /* LV_HAVE_GENERIC */
++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_a_H */
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3.h
+--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3.h 1970-01-01 01:00:00.000000000 +0100
++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3.h 2014-10-15 01:55:08.000000000 +0200
+@@ -0,0 +1,1568 @@
++/*!
++ * \file volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3.h
++ * \brief Volk protokernel: performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation with 32 bits vectors using different methods: inside u_sse4_1_first there is one method, inside u_sse4_1_second there is another... This protokernel has been created to test the performance of different methods.
++ * \authors
++ *          <ul>
++ *          <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
++ *          </ul>
++ *
++ * Volk protokernel that performs the carrier wipe-off mixing and the
++ * Early, Prompt, and Late correlation with 32 bits vectors (16 bits the
++ * real part and 16 bits the imaginary part):
++ * - The carrier wipe-off is done by multiplying the input signal by the
++ * carrier (multiplication of 32 bits vectors). It returns the input
++ * signal in base band (BB)
++ * - Early values are calculated by multiplying the input signal in BB by the
++ * early code (multiplication of 32 bits vectors), accumulating the results
++ * - Prompt values are calculated by multiplying the input signal in BB by the
++ * prompt code (multiplication of 32 bits vectors), accumulating the results
++ * - Late values are calculated by multiplying the input signal in BB by the
++ * late code (multiplication of 32 bits vectors), accumulating the results
++ *
++ * -------------------------------------------------------------------------
++ *
++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
++ *
++ * GNSS-SDR is a software defined Global Navigation
++ * Satellite Systems receiver
++ *
++ * This file is part of GNSS-SDR.
++ *
++ * GNSS-SDR is free software: you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation, either version 3 of the License, or
++ * (at your option) any later version.
++ *
++ * GNSS-SDR is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
++ *
++ * -------------------------------------------------------------------------
++ */
++
++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_u_H
++#define INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_u_H
++
++/* NOTE(review): the five #include directives below carry no header name in this copy of the patch; restore them from the upstream file. */
++#include
++#include
++#include
++#include
++#include
++
++#ifdef LV_HAVE_SSE4_1
++#include "smmintrin.h"
++ /*!
++    \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (first test method: interleaved real/imaginary lanes, 4 complex samples per iteration)
++    \param input The input signal input
++    \param carrier The carrier signal input
++    \param E_code Early PRN code replica input
++    \param P_code Prompt PRN code replica input
++    \param L_code Late PRN code replica input
++    \param E_out Early correlation output
++    \param P_out Prompt correlation output
++    \param L_out Late correlation output
++    \param num_points The number of complex values in vectors
++  */
++static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_u_sse4_1_first(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points)
++{
++    const unsigned int sse_iters = num_points / 4;
++
++    __m128i x, y, yaux, yl, yh, tmp1, tmp2, z, bb_signal_sample, bb_signal_sample_suffled;
++
++    __m128 z_ps_1, z_ps_2, z_E, z_P, z_L;
++    __m128i z_i_1, z_i_2;
++
++    // Loop-invariant constants.
++    // Gathers the four real parts into the low 8 bytes and the four imaginary parts into the high 8 bytes
++    const __m128i split_real_imag = _mm_set_epi8 (15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0);
++    // Swaps the real and imaginary 16 bit halves of every complex sample
++    const __m128i swap_real_imag = _mm_set_epi8 (13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2);
++    // Negates the even (real-position) 16 bit lanes, giving the "- ai*ci" term of the complex product
++    const __m128i negate_even_lanes = _mm_set_epi16 (1, -1, 1, -1, 1, -1, 1, -1);
++
++    // The results must start at zero: when num_points < 4 the vector branch below is skipped
++    // and the scalar tail accumulates straight onto these values.
++    lv_32fc_t dotProduct_E = lv_cmake(0.0f, 0.0f);
++    lv_32fc_t dotProduct_P = lv_cmake(0.0f, 0.0f);
++    lv_32fc_t dotProduct_L = lv_cmake(0.0f, 0.0f);
++
++    z_E = _mm_setzero_ps();
++    z_P = _mm_setzero_ps();
++    z_L = _mm_setzero_ps();
++
++    const lv_16sc_t* _input = input;
++    const lv_16sc_t* _carrier = carrier;
++    const lv_16sc_t* _E_code = E_code;
++    const lv_16sc_t* _P_code = P_code;
++    const lv_16sc_t* _L_code = L_code;
++
++    if (sse_iters>0)
++    {
++        for(unsigned int number = 0; number < sse_iters; number++)
++        {
++            //Perform the carrier wipe-off
++            x = _mm_lddqu_si128((__m128i*)_input); // Load the ar + ai, br + bi as ar,ai,br,bi
++            y = _mm_lddqu_si128((__m128i*)_carrier); // Load the cr + ci, dr + di as cr,ci,dr,di
++
++            // Load yl with cr,cr,dr,dr
++            // Load yh with ci,ci,di,di
++            yaux = _mm_shuffle_epi8 (y, split_real_imag);
++            yl = _mm_unpacklo_epi16(yaux, yaux);
++            yh = _mm_unpackhi_epi16(yaux, yaux);
++
++            tmp1 = _mm_mullo_epi16(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
++
++            x = _mm_shuffle_epi8 (x, swap_real_imag); // Re-arrange x to be ai,ar,bi,br
++
++            tmp2 = _mm_mullo_epi16(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
++
++            tmp2 = _mm_mullo_epi16(tmp2, negate_even_lanes);
++            bb_signal_sample = _mm_add_epi16(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
++            bb_signal_sample_suffled = _mm_shuffle_epi8 (bb_signal_sample, swap_real_imag); // Re-arrange bb_signal_sample to be ai,ar,bi,br
++
++            // correlation E,P,L (3x vector scalar product)
++            // Early
++            y = _mm_lddqu_si128((__m128i*)_E_code); // Load the cr + ci, dr + di as cr,ci,dr,di
++
++            yaux = _mm_shuffle_epi8 (y, split_real_imag);
++            yl = _mm_unpacklo_epi16(yaux, yaux);
++            yh = _mm_unpackhi_epi16(yaux, yaux);
++
++            tmp1 = _mm_mullo_epi16(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
++
++            tmp2 = _mm_mullo_epi16(bb_signal_sample_suffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
++
++            tmp2 = _mm_mullo_epi16(tmp2, negate_even_lanes);
++            z = _mm_add_epi16(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
++
++            // Widen the low four and then the high four 16 bit lanes to 32 bit and convert to float
++            z_i_1 = _mm_cvtepi16_epi32(z);
++            z_ps_1 = _mm_cvtepi32_ps(z_i_1);
++            z = _mm_srli_si128 (z, 8);
++            z_i_2 = _mm_cvtepi16_epi32(z);
++            z_ps_2 = _mm_cvtepi32_ps(z_i_2);
++
++            z_E = _mm_add_ps(z_E, z_ps_1); // Add the complex multiplication results together
++            z_E = _mm_add_ps(z_E, z_ps_2); // Add the complex multiplication results together
++
++            // Prompt
++            y = _mm_lddqu_si128((__m128i*)_P_code); // Load the cr + ci, dr + di as cr,ci,dr,di
++
++            yaux = _mm_shuffle_epi8 (y, split_real_imag);
++            yl = _mm_unpacklo_epi16(yaux, yaux);
++            yh = _mm_unpackhi_epi16(yaux, yaux);
++
++            tmp1 = _mm_mullo_epi16(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
++
++            tmp2 = _mm_mullo_epi16(bb_signal_sample_suffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
++
++            tmp2 = _mm_mullo_epi16(tmp2, negate_even_lanes);
++            z = _mm_add_epi16(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
++
++            z_i_1 = _mm_cvtepi16_epi32(z);
++            z_ps_1 = _mm_cvtepi32_ps(z_i_1);
++            z = _mm_srli_si128 (z, 8);
++            z_i_2 = _mm_cvtepi16_epi32(z);
++            z_ps_2 = _mm_cvtepi32_ps(z_i_2);
++
++            z_P = _mm_add_ps(z_P, z_ps_1); // Add the complex multiplication results together
++            z_P = _mm_add_ps(z_P, z_ps_2); // Add the complex multiplication results together
++
++            // Late
++            y = _mm_lddqu_si128((__m128i*)_L_code); // Load the cr + ci, dr + di as cr,ci,dr,di
++
++            yaux = _mm_shuffle_epi8 (y, split_real_imag);
++            yl = _mm_unpacklo_epi16(yaux, yaux);
++            yh = _mm_unpackhi_epi16(yaux, yaux);
++
++            tmp1 = _mm_mullo_epi16(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
++
++            tmp2 = _mm_mullo_epi16(bb_signal_sample_suffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
++
++            tmp2 = _mm_mullo_epi16(tmp2, negate_even_lanes);
++            z = _mm_add_epi16(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
++
++            z_i_1 = _mm_cvtepi16_epi32(z);
++            z_ps_1 = _mm_cvtepi32_ps(z_i_1);
++            z = _mm_srli_si128 (z, 8);
++            z_i_2 = _mm_cvtepi16_epi32(z);
++            z_ps_2 = _mm_cvtepi32_ps(z_i_2);
++
++            z_L = _mm_add_ps(z_L, z_ps_1); // Add the complex multiplication results together
++            z_L = _mm_add_ps(z_L, z_ps_2); // Add the complex multiplication results together
++
++            _input += 4;
++            _carrier += 4;
++            _E_code += 4;
++            _L_code += 4;
++            _P_code += 4;
++        }
++
++        __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_E[2];
++        __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_P[2];
++        __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_L[2];
++
++        _mm_storeu_ps((float*)dotProductVector_E,z_E); // Store the results back into the dot product vector
++        _mm_storeu_ps((float*)dotProductVector_P,z_P); // Store the results back into the dot product vector
++        _mm_storeu_ps((float*)dotProductVector_L,z_L); // Store the results back into the dot product vector
++
++        // Each accumulator holds two partial complex sums (re,im,re,im); fold them together
++        dotProduct_E = ( dotProductVector_E[0] + dotProductVector_E[1] );
++        dotProduct_P = ( dotProductVector_P[0] + dotProductVector_P[1] );
++        dotProduct_L = ( dotProductVector_L[0] + dotProductVector_L[1] );
++    }
++
++    // Scalar tail for the remaining num_points % 4 samples
++    for(unsigned int i = 0; i < num_points%4; ++i)
++    {
++        dotProduct_E += (lv_32fc_t)((*_input) * (*_E_code++)*(*_carrier));
++        dotProduct_P += (lv_32fc_t)((*_input) * (*_P_code++)*(*_carrier));
++        dotProduct_L += (lv_32fc_t)((*_input++) * (*_L_code++)*(*_carrier++));
++    }
++
++    *E_out = dotProduct_E;
++    *P_out = dotProduct_P;
++    *L_out = dotProduct_L;
++
++
++
++}
++#endif /* LV_HAVE_SSE4_1 */
++
++#ifdef LV_HAVE_SSE4_1
++#include "smmintrin.h"
++/*!
++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation ++ \param input The input signal input ++ \param carrier The carrier signal input ++ \param E_code Early PRN code replica input ++ \param P_code Early PRN code replica input ++ \param L_code Early PRN code replica input ++ \param E_out Early correlation output ++ \param P_out Early correlation output ++ \param L_out Early correlation output ++ \param num_points The number of complex values in vectors ++ */ ++static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_u_sse4_1_second(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points) ++{ ++ const unsigned int sse_iters = num_points / 8; ++ ++ __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample; ++ __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output; ++ ++ __m128 real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc; ++ __m128i real_output_i_1, real_output_i_2, imag_output_i_1, imag_output_i_2; ++ __m128 real_output_ps_1, real_output_ps_2, imag_output_ps_1, imag_output_ps_2; ++ ++ float E_out_real = 0; ++ float E_out_imag = 0; ++ float P_out_real = 0; ++ float P_out_imag = 0; ++ float L_out_real = 0; ++ float L_out_imag = 0; ++ ++ const lv_16sc_t* input_ptr = input; ++ const lv_16sc_t* carrier_ptr = carrier; ++ ++ const lv_16sc_t* E_code_ptr = E_code; ++ lv_32fc_t* E_out_ptr = E_out; ++ const lv_16sc_t* L_code_ptr = L_code; ++ lv_32fc_t* L_out_ptr = L_out; ++ const lv_16sc_t* P_code_ptr = P_code; ++ lv_32fc_t* P_out_ptr = P_out; ++ ++ *E_out_ptr = 0; ++ *P_out_ptr = 0; ++ *L_out_ptr = 0; ++ ++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); ++ ++ real_E_code_acc = _mm_setzero_ps(); ++ 
imag_E_code_acc = _mm_setzero_ps(); ++ real_P_code_acc = _mm_setzero_ps(); ++ imag_P_code_acc = _mm_setzero_ps(); ++ real_L_code_acc = _mm_setzero_ps(); ++ imag_L_code_acc = _mm_setzero_ps(); ++ ++ if (sse_iters>0) ++ { ++ for(int number = 0;number < sse_iters; number++){ ++ ++ //Perform the carrier wipe-off ++ x1 = _mm_lddqu_si128((__m128i*)input_ptr); ++ input_ptr += 4; ++ x2 = _mm_lddqu_si128((__m128i*)input_ptr); ++ ++ y1 = _mm_lddqu_si128((__m128i*)carrier_ptr); ++ carrier_ptr += 4; ++ y2 = _mm_lddqu_si128((__m128i*)carrier_ptr); ++ ++ imagx = _mm_srli_si128 (x1, 2); ++ imagx = _mm_blend_epi16 (x2, imagx, 85); ++ realx = _mm_slli_si128 (x2, 2); ++ realx = _mm_blend_epi16 (realx, x1, 85); ++ ++ imagy = _mm_srli_si128 (y1, 2); ++ imagy = _mm_blend_epi16 (y2, imagy, 85); ++ realy = _mm_slli_si128 (y2, 2); ++ realy = _mm_blend_epi16 (realy, y1, 85); ++ ++ realx_mult_realy = _mm_mullo_epi16 (realx, realy); ++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); ++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); ++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); ++ ++ real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++ imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++ ++ //Get early values ++ y1 = _mm_lddqu_si128((__m128i*)E_code_ptr); ++ E_code_ptr += 4; ++ y2 = _mm_lddqu_si128((__m128i*)E_code_ptr); ++ ++ imagy = _mm_srli_si128 (y1, 2); ++ imagy = _mm_blend_epi16 (y2, imagy, 85); ++ realy = _mm_slli_si128 (y2, 2); ++ realy = _mm_blend_epi16 (realy, y1, 85); ++ ++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); ++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); ++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); ++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); ++ ++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++ ++ real_output_i_1 = 
_mm_cvtepi16_epi32(real_output); ++ real_output_ps_1 = _mm_cvtepi32_ps(real_output_i_1); ++ real_output = _mm_srli_si128 (real_output, 8); ++ real_output_i_2 = _mm_cvtepi16_epi32(real_output); ++ real_output_ps_2 = _mm_cvtepi32_ps(real_output_i_2); ++ ++ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); ++ imag_output_ps_1 = _mm_cvtepi32_ps(imag_output_i_1); ++ imag_output = _mm_srli_si128 (imag_output, 8); ++ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); ++ imag_output_ps_2 = _mm_cvtepi32_ps(imag_output_i_2); ++ ++ real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps_1); ++ real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps_2); ++ imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps_1); ++ imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps_2); ++ ++ //Get prompt values ++ y1 = _mm_lddqu_si128((__m128i*)P_code_ptr); ++ P_code_ptr += 4; ++ y2 = _mm_lddqu_si128((__m128i*)P_code_ptr); ++ ++ imagy = _mm_srli_si128 (y1, 2); ++ imagy = _mm_blend_epi16 (y2, imagy, 85); ++ realy = _mm_slli_si128 (y2, 2); ++ realy = _mm_blend_epi16 (realy, y1, 85); ++ ++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); ++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); ++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); ++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); ++ ++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++ ++ real_output_i_1 = _mm_cvtepi16_epi32(real_output); ++ real_output_ps_1 = _mm_cvtepi32_ps(real_output_i_1); ++ real_output = _mm_srli_si128 (real_output, 8); ++ real_output_i_2 = _mm_cvtepi16_epi32(real_output); ++ real_output_ps_2 = _mm_cvtepi32_ps(real_output_i_2); ++ ++ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); ++ imag_output_ps_1 = _mm_cvtepi32_ps(imag_output_i_1); ++ imag_output = _mm_srli_si128 (imag_output, 8); ++ imag_output_i_2 = 
_mm_cvtepi16_epi32(imag_output); ++ imag_output_ps_2 = _mm_cvtepi32_ps(imag_output_i_2); ++ ++ real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps_1); ++ real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps_2); ++ imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps_1); ++ imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps_2); ++ ++ //Get late values ++ y1 = _mm_lddqu_si128((__m128i*)L_code_ptr); ++ L_code_ptr += 4; ++ y2 = _mm_lddqu_si128((__m128i*)L_code_ptr); ++ ++ imagy = _mm_srli_si128 (y1, 2); ++ imagy = _mm_blend_epi16 (y2, imagy, 85); ++ realy = _mm_slli_si128 (y2, 2); ++ realy = _mm_blend_epi16 (realy, y1, 85); ++ ++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); ++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); ++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); ++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); ++ ++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++ ++ real_output_i_1 = _mm_cvtepi16_epi32(real_output); ++ real_output_ps_1 = _mm_cvtepi32_ps(real_output_i_1); ++ real_output = _mm_srli_si128 (real_output, 8); ++ real_output_i_2 = _mm_cvtepi16_epi32(real_output); ++ real_output_ps_2 = _mm_cvtepi32_ps(real_output_i_2); ++ ++ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); ++ imag_output_ps_1 = _mm_cvtepi32_ps(imag_output_i_1); ++ imag_output = _mm_srli_si128 (imag_output, 8); ++ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); ++ imag_output_ps_2 = _mm_cvtepi32_ps(imag_output_i_2); ++ ++ real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps_1); ++ real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps_2); ++ imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps_1); ++ imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps_2); ++ ++ input_ptr += 4; ++ carrier_ptr += 4; ++ E_code_ptr += 4; ++ L_code_ptr += 4; 
++ P_code_ptr += 4; ++ } ++ ++ __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; ++ __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; ++ __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; ++ __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; ++ __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; ++ __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4]; ++ ++ _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector ++ _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector ++ _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector ++ _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector ++ _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector ++ _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector ++ ++ for (int i = 0; i<4; ++i) ++ { ++ E_out_real += real_E_dotProductVector[i]; ++ E_out_imag += imag_E_dotProductVector[i]; ++ P_out_real += real_P_dotProductVector[i]; ++ P_out_imag += imag_P_dotProductVector[i]; ++ L_out_real += real_L_dotProductVector[i]; ++ L_out_imag += imag_L_dotProductVector[i]; ++ } ++ *E_out_ptr = lv_cmake(E_out_real, E_out_imag); ++ *P_out_ptr = lv_cmake(P_out_real, P_out_imag); ++ *L_out_ptr = lv_cmake(L_out_real, L_out_imag); ++ } ++ ++ lv_16sc_t bb_signal_sample; ++ for(int i=0; i < num_points%8; ++i) ++ { ++ //Perform the carrier wipe-off ++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); ++ // Now get early, late, and prompt values for each ++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); ++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); ++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * 
(*L_code_ptr++)); ++ } ++} ++#endif /* LV_HAVE_SSE4_1 */ ++ ++#ifdef LV_HAVE_SSE4_1 ++#include "smmintrin.h" ++/*! ++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation ++ \param input The input signal input ++ \param carrier The carrier signal input ++ \param E_code Early PRN code replica input ++ \param P_code Early PRN code replica input ++ \param L_code Early PRN code replica input ++ \param E_out Early correlation output ++ \param P_out Early correlation output ++ \param L_out Early correlation output ++ \param num_points The number of complex values in vectors ++ */ ++static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_u_sse4_1_third(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points) ++{ ++ const unsigned int sse_iters = num_points / 8; ++ unsigned int index = 0; ++ unsigned int indexPlus4 = 0; ++ ++ __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample; ++ __m128i realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, real_output_i32, imag_output_i32; ++ ++ __m128 real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc; ++ __m128i real_output_i_1, real_output_i_2, imag_output_i_1, imag_output_i_2; ++ __m128 real_output_ps, imag_output_ps; ++ ++ float E_out_real = 0; ++ float E_out_imag = 0; ++ float P_out_real = 0; ++ float P_out_imag = 0; ++ float L_out_real = 0; ++ float L_out_imag = 0; ++ ++ const lv_16sc_t* input_ptr = input; ++ const lv_16sc_t* carrier_ptr = carrier; ++ ++ const lv_16sc_t* E_code_ptr = E_code; ++ lv_32fc_t* E_out_ptr = E_out; ++ const lv_16sc_t* L_code_ptr = L_code; ++ lv_32fc_t* L_out_ptr = L_out; ++ const lv_16sc_t* P_code_ptr = P_code; ++ lv_32fc_t* P_out_ptr = P_out; ++ ++ *E_out_ptr = 0; ++ *P_out_ptr = 
0; ++ *L_out_ptr = 0; ++ ++ real_E_code_acc = _mm_setzero_ps(); ++ imag_E_code_acc = _mm_setzero_ps(); ++ real_P_code_acc = _mm_setzero_ps(); ++ imag_P_code_acc = _mm_setzero_ps(); ++ real_L_code_acc = _mm_setzero_ps(); ++ imag_L_code_acc = _mm_setzero_ps(); ++ ++ if (sse_iters>0) ++ { ++ for(index = 0;index < 8*sse_iters; index+=8){ ++ indexPlus4 = index + 4; ++ //Perform the carrier wipe-off ++ x1 = _mm_lddqu_si128((__m128i*)&input_ptr[index]); ++ x2 = _mm_lddqu_si128((__m128i*)&input_ptr[indexPlus4]); ++ ++ y1 = _mm_lddqu_si128((__m128i*)&carrier_ptr[index]); ++ y2 = _mm_lddqu_si128((__m128i*)&carrier_ptr[indexPlus4]); ++ ++ imagx = _mm_srli_si128 (x1, 2); ++ imagx = _mm_blend_epi16 (x2, imagx, 85); ++ realx = _mm_slli_si128 (x2, 2); ++ realx = _mm_blend_epi16 (realx, x1, 85); ++ ++ imagy = _mm_srli_si128 (y1, 2); ++ imagy = _mm_blend_epi16 (y2, imagy, 85); ++ realy = _mm_slli_si128 (y2, 2); ++ realy = _mm_blend_epi16 (realy, y1, 85); ++ ++ realx_mult_realy = _mm_mullo_epi16 (realx, realy); ++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); ++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); ++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); ++ ++ real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++ imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++ ++ //Get early values ++ y1 = _mm_lddqu_si128((__m128i*)&E_code_ptr[index]); ++ y2 = _mm_lddqu_si128((__m128i*)&E_code_ptr[indexPlus4]); ++ ++ imagy = _mm_srli_si128 (y1, 2); ++ imagy = _mm_blend_epi16 (y2, imagy, 85); ++ realy = _mm_slli_si128 (y2, 2); ++ realy = _mm_blend_epi16 (realy, y1, 85); ++ ++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); ++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); ++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); ++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); ++ ++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++ 
imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++ ++ real_output_i_1 = _mm_cvtepi16_epi32(real_output); ++ real_output = _mm_srli_si128 (real_output, 8); ++ real_output_i_2 = _mm_cvtepi16_epi32(real_output); ++ real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); ++ real_output_ps = _mm_cvtepi32_ps(real_output_i32); ++ ++ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); ++ imag_output = _mm_srli_si128 (imag_output, 8); ++ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); ++ imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); ++ imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); ++ ++ real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps); ++ imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps); ++ ++ //Get prompt values ++ y1 = _mm_lddqu_si128((__m128i*)&P_code_ptr[index]); ++ y2 = _mm_lddqu_si128((__m128i*)&P_code_ptr[indexPlus4]); ++ ++ imagy = _mm_srli_si128 (y1, 2); ++ imagy = _mm_blend_epi16 (y2, imagy, 85); ++ realy = _mm_slli_si128 (y2, 2); ++ realy = _mm_blend_epi16 (realy, y1, 85); ++ ++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); ++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); ++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); ++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); ++ ++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++ ++ real_output_i_1 = _mm_cvtepi16_epi32(real_output); ++ real_output = _mm_srli_si128 (real_output, 8); ++ real_output_i_2 = _mm_cvtepi16_epi32(real_output); ++ real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); ++ real_output_ps = _mm_cvtepi32_ps(real_output_i32); ++ ++ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); ++ imag_output = _mm_srli_si128 (imag_output, 8); ++ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); ++ imag_output_i32 = _mm_add_epi32 
(imag_output_i_1, imag_output_i_2); ++ imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); ++ ++ real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps); ++ imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps); ++ ++ //Get late values ++ y1 = _mm_lddqu_si128((__m128i*)&L_code_ptr[index]); ++ y2 = _mm_lddqu_si128((__m128i*)&L_code_ptr[indexPlus4]); ++ ++ imagy = _mm_srli_si128 (y1, 2); ++ imagy = _mm_blend_epi16 (y2, imagy, 85); ++ realy = _mm_slli_si128 (y2, 2); ++ realy = _mm_blend_epi16 (realy, y1, 85); ++ ++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); ++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); ++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); ++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); ++ ++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++ ++ real_output_i_1 = _mm_cvtepi16_epi32(real_output); ++ real_output = _mm_srli_si128 (real_output, 8); ++ real_output_i_2 = _mm_cvtepi16_epi32(real_output); ++ real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); ++ real_output_ps = _mm_cvtepi32_ps(real_output_i32); ++ ++ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); ++ imag_output = _mm_srli_si128 (imag_output, 8); ++ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); ++ imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); ++ imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); ++ ++ real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps); ++ imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps); ++ } ++ ++ __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; ++ __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; ++ __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; ++ __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; ++ __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; ++ __VOLK_ATTR_ALIGNED(16) 
float imag_L_dotProductVector[4]; ++ ++ _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector ++ _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector ++ _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector ++ _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector ++ _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector ++ _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector ++ ++ for (int i = 0; i<4; ++i) ++ { ++ E_out_real += real_E_dotProductVector[i]; ++ E_out_imag += imag_E_dotProductVector[i]; ++ P_out_real += real_P_dotProductVector[i]; ++ P_out_imag += imag_P_dotProductVector[i]; ++ L_out_real += real_L_dotProductVector[i]; ++ L_out_imag += imag_L_dotProductVector[i]; ++ } ++ *E_out_ptr = lv_cmake(E_out_real, E_out_imag); ++ *P_out_ptr = lv_cmake(P_out_real, P_out_imag); ++ *L_out_ptr = lv_cmake(L_out_real, L_out_imag); ++ } ++ ++ lv_16sc_t bb_signal_sample; ++ for(; index < num_points; index++) ++ { ++ //Perform the carrier wipe-off ++ bb_signal_sample = input_ptr[index] * carrier_ptr[index]; ++ // Now get early, late, and prompt values for each ++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * E_code_ptr[index]); ++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * P_code_ptr[index]); ++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * L_code_ptr[index]); ++ } ++} ++#endif /* LV_HAVE_SSE4_1 */ ++ ++#ifdef LV_HAVE_SSE4_1 ++#include "smmintrin.h" ++/*! 
++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation ++ \param input The input signal input ++ \param carrier The carrier signal input ++ \param E_code Early PRN code replica input ++ \param P_code Early PRN code replica input ++ \param L_code Early PRN code replica input ++ \param E_out Early correlation output ++ \param P_out Early correlation output ++ \param L_out Early correlation output ++ \param num_points The number of complex values in vectors ++ */ ++static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_u_sse4_1_fourth(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points) ++{ ++ const unsigned int sse_iters = num_points / 8; ++ ++ __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample; ++ __m128i realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, real_output_i32, imag_output_i32; ++ ++ __m128 real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc; ++ __m128i real_output_i_1, real_output_i_2, imag_output_i_1, imag_output_i_2; ++ __m128 real_output_ps, imag_output_ps; ++ ++ float E_out_real = 0; ++ float E_out_imag = 0; ++ float P_out_real = 0; ++ float P_out_imag = 0; ++ float L_out_real = 0; ++ float L_out_imag = 0; ++ ++ const lv_16sc_t* input_ptr = input; ++ const lv_16sc_t* carrier_ptr = carrier; ++ ++ const lv_16sc_t* E_code_ptr = E_code; ++ lv_32fc_t* E_out_ptr = E_out; ++ const lv_16sc_t* L_code_ptr = L_code; ++ lv_32fc_t* L_out_ptr = L_out; ++ const lv_16sc_t* P_code_ptr = P_code; ++ lv_32fc_t* P_out_ptr = P_out; ++ ++ *E_out_ptr = 0; ++ *P_out_ptr = 0; ++ *L_out_ptr = 0; ++ ++ real_E_code_acc = _mm_setzero_ps(); ++ imag_E_code_acc = _mm_setzero_ps(); ++ real_P_code_acc = _mm_setzero_ps(); ++ imag_P_code_acc = 
_mm_setzero_ps(); ++ real_L_code_acc = _mm_setzero_ps(); ++ imag_L_code_acc = _mm_setzero_ps(); ++ ++ if (sse_iters>0) ++ { ++ for(int number = 0;number < sse_iters; number++){ ++ ++ //Perform the carrier wipe-off ++ x1 = _mm_lddqu_si128((__m128i*)input_ptr); ++ input_ptr += 4; ++ x2 = _mm_lddqu_si128((__m128i*)input_ptr); ++ ++ y1 = _mm_lddqu_si128((__m128i*)carrier_ptr); ++ carrier_ptr += 4; ++ y2 = _mm_lddqu_si128((__m128i*)carrier_ptr); ++ ++ imagx = _mm_srli_si128 (x1, 2); ++ imagx = _mm_blend_epi16 (x2, imagx, 85); ++ realx = _mm_slli_si128 (x2, 2); ++ realx = _mm_blend_epi16 (realx, x1, 85); ++ ++ imagy = _mm_srli_si128 (y1, 2); ++ imagy = _mm_blend_epi16 (y2, imagy, 85); ++ realy = _mm_slli_si128 (y2, 2); ++ realy = _mm_blend_epi16 (realy, y1, 85); ++ ++ realx_mult_realy = _mm_mullo_epi16 (realx, realy); ++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); ++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); ++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); ++ ++ real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++ imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++ ++ //Get early values ++ y1 = _mm_lddqu_si128((__m128i*)E_code_ptr); ++ E_code_ptr += 4; ++ y2 = _mm_lddqu_si128((__m128i*)E_code_ptr); ++ ++ imagy = _mm_srli_si128 (y1, 2); ++ imagy = _mm_blend_epi16 (y2, imagy, 85); ++ realy = _mm_slli_si128 (y2, 2); ++ realy = _mm_blend_epi16 (realy, y1, 85); ++ ++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); ++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); ++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); ++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); ++ ++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++ ++ real_output_i_1 = _mm_cvtepi16_epi32(real_output); ++ real_output = _mm_srli_si128 (real_output, 8); ++ real_output_i_2 = 
_mm_cvtepi16_epi32(real_output); ++ real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); ++ real_output_ps = _mm_cvtepi32_ps(real_output_i32); ++ ++ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); ++ imag_output = _mm_srli_si128 (imag_output, 8); ++ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); ++ imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); ++ imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); ++ ++ real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps); ++ imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps); ++ ++ //Get prompt values ++ y1 = _mm_lddqu_si128((__m128i*)P_code_ptr); ++ P_code_ptr += 4; ++ y2 = _mm_lddqu_si128((__m128i*)P_code_ptr); ++ ++ imagy = _mm_srli_si128 (y1, 2); ++ imagy = _mm_blend_epi16 (y2, imagy, 85); ++ realy = _mm_slli_si128 (y2, 2); ++ realy = _mm_blend_epi16 (realy, y1, 85); ++ ++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); ++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); ++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); ++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); ++ ++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++ ++ real_output_i_1 = _mm_cvtepi16_epi32(real_output); ++ real_output = _mm_srli_si128 (real_output, 8); ++ real_output_i_2 = _mm_cvtepi16_epi32(real_output); ++ real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); ++ real_output_ps = _mm_cvtepi32_ps(real_output_i32); ++ ++ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); ++ imag_output = _mm_srli_si128 (imag_output, 8); ++ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); ++ imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); ++ imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); ++ ++ real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps); ++ imag_P_code_acc = _mm_add_ps (imag_P_code_acc, 
imag_output_ps); ++ ++ //Get late values ++ y1 = _mm_lddqu_si128((__m128i*)L_code_ptr); ++ L_code_ptr += 4; ++ y2 = _mm_lddqu_si128((__m128i*)L_code_ptr); ++ ++ imagy = _mm_srli_si128 (y1, 2); ++ imagy = _mm_blend_epi16 (y2, imagy, 85); ++ realy = _mm_slli_si128 (y2, 2); ++ realy = _mm_blend_epi16 (realy, y1, 85); ++ ++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); ++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); ++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); ++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); ++ ++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++ ++ real_output_i_1 = _mm_cvtepi16_epi32(real_output); ++ real_output = _mm_srli_si128 (real_output, 8); ++ real_output_i_2 = _mm_cvtepi16_epi32(real_output); ++ real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); ++ real_output_ps = _mm_cvtepi32_ps(real_output_i32); ++ ++ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); ++ imag_output = _mm_srli_si128 (imag_output, 8); ++ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); ++ imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); ++ imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); ++ ++ real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps); ++ imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps); ++ ++ input_ptr += 4; ++ carrier_ptr += 4; ++ E_code_ptr += 4; ++ L_code_ptr += 4; ++ P_code_ptr += 4; ++ } ++ ++ __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; ++ __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; ++ __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; ++ __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; ++ __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; ++ __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4]; ++ ++ _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // 
Store the results back into the dot product vector ++ _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector ++ _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector ++ _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector ++ _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector ++ _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector ++ ++ for (int i = 0; i<4; ++i) ++ { ++ E_out_real += real_E_dotProductVector[i]; ++ E_out_imag += imag_E_dotProductVector[i]; ++ P_out_real += real_P_dotProductVector[i]; ++ P_out_imag += imag_P_dotProductVector[i]; ++ L_out_real += real_L_dotProductVector[i]; ++ L_out_imag += imag_L_dotProductVector[i]; ++ } ++ *E_out_ptr = lv_cmake(E_out_real, E_out_imag); ++ *P_out_ptr = lv_cmake(P_out_real, P_out_imag); ++ *L_out_ptr = lv_cmake(L_out_real, L_out_imag); ++ } ++ ++ lv_16sc_t bb_signal_sample; ++ for(int i=0; i < num_points%8; ++i) ++ { ++ //Perform the carrier wipe-off ++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); ++ // Now get early, late, and prompt values for each ++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); ++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); ++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); ++ } ++} ++#endif /* LV_HAVE_SSE4_1 */ ++ ++#ifdef LV_HAVE_SSE4_1 ++#include "smmintrin.h" ++#include "CommonMacros/CommonMacros.h" ++/*! 
++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation ++ \param input The input signal input ++ \param carrier The carrier signal input ++ \param E_code Early PRN code replica input ++ \param P_code Early PRN code replica input ++ \param L_code Early PRN code replica input ++ \param E_out Early correlation output ++ \param P_out Early correlation output ++ \param L_out Early correlation output ++ \param num_points The number of complex values in vectors ++ */ ++ ++static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_u_sse4_1_fifth(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points) ++{ ++ const unsigned int sse_iters = num_points / 8; ++ ++ __m128i realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy; ++ __m128i input_i_1, input_i_2, output_i32; ++ ++ __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample; ++ __m128i realx, imagx, realy, imagy, real_output, imag_output; ++ ++ __m128 real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc; ++ __m128 real_output_ps, imag_output_ps; ++ ++ float E_out_real = 0; ++ float E_out_imag = 0; ++ float P_out_real = 0; ++ float P_out_imag = 0; ++ float L_out_real = 0; ++ float L_out_imag = 0; ++ ++ const lv_16sc_t* input_ptr = input; ++ const lv_16sc_t* carrier_ptr = carrier; ++ ++ const lv_16sc_t* E_code_ptr = E_code; ++ lv_32fc_t* E_out_ptr = E_out; ++ const lv_16sc_t* L_code_ptr = L_code; ++ lv_32fc_t* L_out_ptr = L_out; ++ const lv_16sc_t* P_code_ptr = P_code; ++ lv_32fc_t* P_out_ptr = P_out; ++ ++ *E_out_ptr = 0; ++ *P_out_ptr = 0; ++ *L_out_ptr = 0; ++ ++ real_E_code_acc = _mm_setzero_ps(); ++ imag_E_code_acc = _mm_setzero_ps(); ++ real_P_code_acc = _mm_setzero_ps(); ++ imag_P_code_acc = _mm_setzero_ps(); ++ real_L_code_acc = _mm_setzero_ps(); ++ 
imag_L_code_acc = _mm_setzero_ps(); ++ ++ if (sse_iters>0) ++ { ++ for(int number = 0;number < sse_iters; number++){ ++ ++ //Perform the carrier wipe-off ++ x1 = _mm_lddqu_si128((__m128i*)input_ptr); ++ input_ptr += 4; ++ x2 = _mm_lddqu_si128((__m128i*)input_ptr); ++ ++ y1 = _mm_lddqu_si128((__m128i*)carrier_ptr); ++ carrier_ptr += 4; ++ y2 = _mm_lddqu_si128((__m128i*)carrier_ptr); ++ ++ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(x1, x2, realx, imagx) ++ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy) ++ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample) ++ ++ //Get early values ++ y1 = _mm_lddqu_si128((__m128i*)E_code_ptr); ++ E_code_ptr += 4; ++ y2 = _mm_lddqu_si128((__m128i*)E_code_ptr); ++ ++ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy) ++ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(real_bb_signal_sample, imag_bb_signal_sample, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output) ++ ++ CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(real_output, input_i_1, input_i_2, output_i32, real_output_ps) ++ CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(imag_output, input_i_1, input_i_2, output_i32, imag_output_ps) ++ ++ real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps); ++ imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps); ++ ++ //Get prompt values ++ y1 = _mm_lddqu_si128((__m128i*)P_code_ptr); ++ P_code_ptr += 4; ++ y2 = _mm_lddqu_si128((__m128i*)P_code_ptr); ++ ++ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy) ++ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(real_bb_signal_sample, imag_bb_signal_sample, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output) ++ ++ 
CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(real_output, input_i_1, input_i_2, output_i32, real_output_ps) ++ CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(imag_output, input_i_1, input_i_2, output_i32, imag_output_ps) ++ ++ real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps); ++ imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps); ++ ++ //Get late values ++ y1 = _mm_lddqu_si128((__m128i*)L_code_ptr); ++ L_code_ptr += 4; ++ y2 = _mm_lddqu_si128((__m128i*)L_code_ptr); ++ ++ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy) ++ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(real_bb_signal_sample, imag_bb_signal_sample, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output) ++ ++ CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(real_output, input_i_1, input_i_2, output_i32, real_output_ps) ++ CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(imag_output, input_i_1, input_i_2, output_i32, imag_output_ps) ++ ++ real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps); ++ imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps); ++ ++ input_ptr += 4; ++ carrier_ptr += 4; ++ E_code_ptr += 4; ++ L_code_ptr += 4; ++ P_code_ptr += 4; ++ } ++ ++ __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; ++ __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; ++ __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; ++ __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; ++ __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; ++ __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4]; ++ ++ _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector ++ _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector ++ _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector ++ 
_mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector ++ _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector ++ _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector ++ ++ for (int i = 0; i<4; ++i) ++ { ++ E_out_real += real_E_dotProductVector[i]; ++ E_out_imag += imag_E_dotProductVector[i]; ++ P_out_real += real_P_dotProductVector[i]; ++ P_out_imag += imag_P_dotProductVector[i]; ++ L_out_real += real_L_dotProductVector[i]; ++ L_out_imag += imag_L_dotProductVector[i]; ++ } ++ *E_out_ptr = lv_cmake(E_out_real, E_out_imag); ++ *P_out_ptr = lv_cmake(P_out_real, P_out_imag); ++ *L_out_ptr = lv_cmake(L_out_real, L_out_imag); ++ } ++ ++ lv_16sc_t bb_signal_sample; ++ for(int i=0; i < num_points%8; ++i) ++ { ++ //Perform the carrier wipe-off ++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); ++ // Now get early, late, and prompt values for each ++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); ++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); ++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); ++ } ++} ++#endif /* LV_HAVE_SSE4_1 */ ++ ++#ifdef LV_HAVE_SSE4_1 ++#include "smmintrin.h" ++#include "CommonMacros/CommonMacros_16ic_cw_epl_corr_32fc.h" ++#include "CommonMacros/CommonMacros.h" ++/*! 
++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation ++ \param input The input signal input ++ \param carrier The carrier signal input ++ \param E_code Early PRN code replica input ++ \param P_code Early PRN code replica input ++ \param L_code Early PRN code replica input ++ \param E_out Early correlation output ++ \param P_out Early correlation output ++ \param L_out Early correlation output ++ \param num_points The number of complex values in vectors ++ */ ++ ++static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_u_sse4_1_sixth(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points) ++{ ++ const unsigned int sse_iters = num_points / 8; ++ ++ __m128i realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy; ++ __m128i input_i_1, input_i_2, output_i32; ++ ++ __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample; ++ __m128i realx, imagx, realy, imagy, real_output, imag_output; ++ ++ __m128 real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc; ++ __m128 real_output_ps, imag_output_ps; ++ ++ float E_out_real = 0; ++ float E_out_imag = 0; ++ float P_out_real = 0; ++ float P_out_imag = 0; ++ float L_out_real = 0; ++ float L_out_imag = 0; ++ ++ const lv_16sc_t* input_ptr = input; ++ const lv_16sc_t* carrier_ptr = carrier; ++ ++ const lv_16sc_t* E_code_ptr = E_code; ++ lv_32fc_t* E_out_ptr = E_out; ++ const lv_16sc_t* L_code_ptr = L_code; ++ lv_32fc_t* L_out_ptr = L_out; ++ const lv_16sc_t* P_code_ptr = P_code; ++ lv_32fc_t* P_out_ptr = P_out; ++ ++ *E_out_ptr = 0; ++ *P_out_ptr = 0; ++ *L_out_ptr = 0; ++ ++ real_E_code_acc = _mm_setzero_ps(); ++ imag_E_code_acc = _mm_setzero_ps(); ++ real_P_code_acc = _mm_setzero_ps(); ++ imag_P_code_acc = _mm_setzero_ps(); ++ real_L_code_acc = _mm_setzero_ps(); ++ 
imag_L_code_acc = _mm_setzero_ps(); ++ ++ if (sse_iters>0) ++ { ++ for(int number = 0;number < sse_iters; number++){ ++ ++ //Perform the carrier wipe-off ++ x1 = _mm_lddqu_si128((__m128i*)input_ptr); ++ input_ptr += 4; ++ x2 = _mm_lddqu_si128((__m128i*)input_ptr); ++ ++ y1 = _mm_lddqu_si128((__m128i*)carrier_ptr); ++ carrier_ptr += 4; ++ y2 = _mm_lddqu_si128((__m128i*)carrier_ptr); ++ ++ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(x1, x2, realx, imagx) ++ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy) ++ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample) ++ ++ //Get early values ++ y1 = _mm_lddqu_si128((__m128i*)E_code_ptr); ++ E_code_ptr += 4; ++ y2 = _mm_lddqu_si128((__m128i*)E_code_ptr); ++ ++ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) ++ ++ real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps); ++ imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps); ++ ++ //Get prompt values ++ y1 = _mm_lddqu_si128((__m128i*)P_code_ptr); ++ P_code_ptr += 4; ++ y2 = _mm_lddqu_si128((__m128i*)P_code_ptr); ++ ++ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) ++ ++ real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps); ++ imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps); ++ ++ //Get late values ++ y1 = _mm_lddqu_si128((__m128i*)L_code_ptr); ++ L_code_ptr += 4; ++ y2 = _mm_lddqu_si128((__m128i*)L_code_ptr); ++ ++ 
CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) ++ ++ real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps); ++ imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps); ++ ++ input_ptr += 4; ++ carrier_ptr += 4; ++ E_code_ptr += 4; ++ L_code_ptr += 4; ++ P_code_ptr += 4; ++ } ++ ++ __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; ++ __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; ++ __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; ++ __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; ++ __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; ++ __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4]; ++ ++ _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector ++ _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector ++ _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector ++ _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector ++ _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector ++ _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector ++ ++ for (int i = 0; i<4; ++i) ++ { ++ E_out_real += real_E_dotProductVector[i]; ++ E_out_imag += imag_E_dotProductVector[i]; ++ P_out_real += real_P_dotProductVector[i]; ++ P_out_imag += imag_P_dotProductVector[i]; ++ L_out_real += real_L_dotProductVector[i]; ++ L_out_imag += imag_L_dotProductVector[i]; ++ } ++ *E_out_ptr = lv_cmake(E_out_real, E_out_imag); ++ *P_out_ptr = 
lv_cmake(P_out_real, P_out_imag); ++ *L_out_ptr = lv_cmake(L_out_real, L_out_imag); ++ } ++ ++ lv_16sc_t bb_signal_sample; ++ for(int i=0; i < num_points%8; ++i) ++ { ++ //Perform the carrier wipe-off ++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); ++ // Now get early, late, and prompt values for each ++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); ++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); ++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); ++ } ++} ++#endif /* LV_HAVE_SSE4_1 */ ++ ++#ifdef LV_HAVE_GENERIC ++/*! ++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation ++ \param input The input signal input ++ \param carrier The carrier signal input ++ \param E_code Early PRN code replica input ++ \param P_code Early PRN code replica input ++ \param L_code Early PRN code replica input ++ \param E_out Early correlation output ++ \param P_out Early correlation output ++ \param L_out Early correlation output ++ \param num_points The number of complex values in vectors ++ */ ++static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_generic(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points) ++{ ++ lv_16sc_t bb_signal_sample; ++ lv_16sc_t tmp1; ++ lv_16sc_t tmp2; ++ lv_16sc_t tmp3; ++ ++ bb_signal_sample = lv_cmake(0, 0); ++ ++ *E_out = 0; ++ *P_out = 0; ++ *L_out = 0; ++ // perform Early, Prompt and Late correlation ++ ++ for(int i=0; i < num_points; ++i) ++ { ++ //Perform the carrier wipe-off ++ bb_signal_sample = input[i] * carrier[i]; ++ ++ tmp1 = bb_signal_sample * E_code[i]; ++ tmp2 = bb_signal_sample * P_code[i]; ++ tmp3 = bb_signal_sample * L_code[i]; ++ ++ // Now get early, late, and prompt values for each ++ *E_out += (lv_32fc_t)tmp1; ++ *P_out += (lv_32fc_t)tmp2; ++ *L_out += (lv_32fc_t)tmp3; ++ } ++} 
++#endif /* LV_HAVE_GENERIC */ ++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_u_H */ ++ ++ ++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_a_H ++#define INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_a_H ++ ++#include ++#include ++#include ++#include ++#include ++// ++//#ifdef LV_HAVE_SSE4_1 ++//#include "smmintrin.h" ++///*! ++// \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation ++// \param input The input signal input ++// \param carrier The carrier signal input ++// \param E_code Early PRN code replica input ++// \param P_code Early PRN code replica input ++// \param L_code Early PRN code replica input ++// \param E_out Early correlation output ++// \param P_out Early correlation output ++// \param L_out Early correlation output ++// \param num_points The number of complex values in vectors ++// */ ++//static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_a_sse4_1(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points) ++//{ ++// const unsigned int sse_iters = num_points / 8; ++// ++// __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample; ++// __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output; ++// ++// __m128 real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc; ++// __m128i real_output_i_1, real_output_i_2, imag_output_i_1, imag_output_i_2; ++// __m128 real_output_ps_1, real_output_ps_2, imag_output_ps_1, imag_output_ps_2; ++// ++// float E_out_real = 0; ++// float E_out_imag = 0; ++// float P_out_real = 0; ++// float P_out_imag = 0; ++// float L_out_real = 0; ++// float L_out_imag = 0; ++// ++// const lv_16sc_t* input_ptr = input; ++// const 
lv_16sc_t* carrier_ptr = carrier; ++// ++// const lv_16sc_t* E_code_ptr = E_code; ++// lv_32fc_t* E_out_ptr = E_out; ++// const lv_16sc_t* L_code_ptr = L_code; ++// lv_32fc_t* L_out_ptr = L_out; ++// const lv_16sc_t* P_code_ptr = P_code; ++// lv_32fc_t* P_out_ptr = P_out; ++// ++// *E_out_ptr = 0; ++// *P_out_ptr = 0; ++// *L_out_ptr = 0; ++// ++// mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); ++// ++// real_E_code_acc = _mm_setzero_ps(); ++// imag_E_code_acc = _mm_setzero_ps(); ++// real_P_code_acc = _mm_setzero_ps(); ++// imag_P_code_acc = _mm_setzero_ps(); ++// real_L_code_acc = _mm_setzero_ps(); ++// imag_L_code_acc = _mm_setzero_ps(); ++// ++// if (sse_iters>0) ++// { ++// for(int number = 0;number < sse_iters; number++){ ++// ++// //Perform the carrier wipe-off ++// x1 = _mm_lddqu_si128((__m128i*)input_ptr); ++// input_ptr += 4; ++// x2 = _mm_lddqu_si128((__m128i*)input_ptr); ++// ++// y1 = _mm_lddqu_si128((__m128i*)carrier_ptr); ++// carrier_ptr += 4; ++// y2 = _mm_lddqu_si128((__m128i*)carrier_ptr); ++// ++// imagx = _mm_srli_si128 (x1, 2); ++// imagx = _mm_blend_epi16 (x2, imagx, 85); ++// realx = _mm_slli_si128 (x2, 2); ++// realx = _mm_blend_epi16 (realx, x1, 85); ++// ++// imagy = _mm_srli_si128 (y1, 2); ++// imagy = _mm_blend_epi16 (y2, imagy, 85); ++// realy = _mm_slli_si128 (y2, 2); ++// realy = _mm_blend_epi16 (realy, y1, 85); ++// ++// realx_mult_realy = _mm_mullo_epi16 (realx, realy); ++// imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); ++// realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); ++// imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); ++// ++// real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++// imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++// ++// //Get early values ++// y1 = _mm_lddqu_si128((__m128i*)E_code_ptr); ++// E_code_ptr += 4; ++// y2 = _mm_lddqu_si128((__m128i*)E_code_ptr); ++// ++// imagy = _mm_srli_si128 (y1, 2); ++// 
imagy = _mm_blend_epi16 (y2, imagy, 85); ++// realy = _mm_slli_si128 (y2, 2); ++// realy = _mm_blend_epi16 (realy, y1, 85); ++// ++// realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); ++// imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); ++// realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); ++// imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); ++// ++// real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++// imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++// ++// real_output_i_1 = _mm_cvtepi16_epi32(real_output); ++// real_output_ps_1 = _mm_cvtepi32_ps(real_output_i_1); ++// real_output = _mm_srli_si128 (real_output, 8); ++// real_output_i_2 = _mm_cvtepi16_epi32(real_output); ++// real_output_ps_2 = _mm_cvtepi32_ps(real_output_i_2); ++// ++// imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); ++// imag_output_ps_1 = _mm_cvtepi32_ps(imag_output_i_1); ++// imag_output = _mm_srli_si128 (imag_output, 8); ++// imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); ++// imag_output_ps_2 = _mm_cvtepi32_ps(imag_output_i_2); ++// ++// real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps_1); ++// real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps_2); ++// imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps_1); ++// imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps_2); ++// ++// //Get prompt values ++// y1 = _mm_lddqu_si128((__m128i*)P_code_ptr); ++// P_code_ptr += 4; ++// y2 = _mm_lddqu_si128((__m128i*)P_code_ptr); ++// ++// imagy = _mm_srli_si128 (y1, 2); ++// imagy = _mm_blend_epi16 (y2, imagy, 85); ++// realy = _mm_slli_si128 (y2, 2); ++// realy = _mm_blend_epi16 (realy, y1, 85); ++// ++// realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); ++// imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); ++// realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); ++// imagx_mult_realy = 
_mm_mullo_epi16 (imag_bb_signal_sample, realy); ++// ++// real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++// imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++// ++// real_output_i_1 = _mm_cvtepi16_epi32(real_output); ++// real_output_ps_1 = _mm_cvtepi32_ps(real_output_i_1); ++// real_output = _mm_srli_si128 (real_output, 8); ++// real_output_i_2 = _mm_cvtepi16_epi32(real_output); ++// real_output_ps_2 = _mm_cvtepi32_ps(real_output_i_2); ++// ++// imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); ++// imag_output_ps_1 = _mm_cvtepi32_ps(imag_output_i_1); ++// imag_output = _mm_srli_si128 (imag_output, 8); ++// imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); ++// imag_output_ps_2 = _mm_cvtepi32_ps(imag_output_i_2); ++// ++// real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps_1); ++// real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps_2); ++// imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps_1); ++// imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps_2); ++// ++// //Get late values ++// y1 = _mm_lddqu_si128((__m128i*)L_code_ptr); ++// L_code_ptr += 4; ++// y2 = _mm_lddqu_si128((__m128i*)L_code_ptr); ++// ++// imagy = _mm_srli_si128 (y1, 2); ++// imagy = _mm_blend_epi16 (y2, imagy, 85); ++// realy = _mm_slli_si128 (y2, 2); ++// realy = _mm_blend_epi16 (realy, y1, 85); ++// ++// realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); ++// imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); ++// realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); ++// imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); ++// ++// real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++// imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++// ++// real_output_i_1 = _mm_cvtepi16_epi32(real_output); ++// real_output_ps_1 = _mm_cvtepi32_ps(real_output_i_1); ++// real_output = _mm_srli_si128 (real_output, 8); ++// 
real_output_i_2 = _mm_cvtepi16_epi32(real_output); ++// real_output_ps_2 = _mm_cvtepi32_ps(real_output_i_2); ++// ++// imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); ++// imag_output_ps_1 = _mm_cvtepi32_ps(imag_output_i_1); ++// imag_output = _mm_srli_si128 (imag_output, 8); ++// imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); ++// imag_output_ps_2 = _mm_cvtepi32_ps(imag_output_i_2); ++// ++// real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps_1); ++// real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps_2); ++// imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps_1); ++// imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps_2); ++// ++// input_ptr += 4; ++// carrier_ptr += 4; ++// E_code_ptr += 4; ++// L_code_ptr += 4; ++// P_code_ptr += 4; ++// } ++// ++// __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; ++// __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; ++// __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; ++// __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; ++// __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; ++// __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4]; ++// ++// _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector ++// _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector ++// _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector ++// _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector ++// _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector ++// _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector ++// ++// for (int i = 0; i<4; ++i) ++// { ++// E_out_real += 
real_E_dotProductVector[i]; ++// E_out_imag += imag_E_dotProductVector[i]; ++// P_out_real += real_P_dotProductVector[i]; ++// P_out_imag += imag_P_dotProductVector[i]; ++// L_out_real += real_L_dotProductVector[i]; ++// L_out_imag += imag_L_dotProductVector[i]; ++// } ++// *E_out_ptr = lv_cmake(E_out_real, E_out_imag); ++// *P_out_ptr = lv_cmake(P_out_real, P_out_imag); ++// *L_out_ptr = lv_cmake(L_out_real, L_out_imag); ++// } ++// ++// lv_16sc_t bb_signal_sample; ++// for(int i=0; i < num_points%8; ++i) ++// { ++// //Perform the carrier wipe-off ++// bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); ++// // Now get early, late, and prompt values for each ++// *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); ++// *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); ++// *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); ++// } ++//} ++//#endif /* LV_HAVE_SSE4_1 */ ++// ++#ifdef LV_HAVE_GENERIC ++/*! ++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (scalar implementation: each product is cast to lv_32fc_t before being accumulated; the three outputs are reset to 0 on entry) ++ \param input The input signal input ++ \param carrier The carrier signal input ++ \param E_code Early PRN code replica input ++ \param P_code Prompt PRN code replica input ++ \param L_code Late PRN code replica input ++ \param E_out Early correlation output ++ \param P_out Prompt correlation output ++ \param L_out Late correlation output ++ \param num_points The number of complex values in vectors ++ */ ++static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_a_generic(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points) ++{ ++ lv_16sc_t bb_signal_sample; ++ lv_16sc_t tmp1; ++ lv_16sc_t tmp2; ++ lv_16sc_t tmp3; ++ ++ bb_signal_sample = lv_cmake(0, 0); ++ ++ *E_out = 0; ++ *P_out = 0; ++ *L_out = 0; ++ // perform Early, Prompt and Late correlation ++ ++ for(unsigned int i = 0; i <
num_points; ++i) ++ { ++ //Perform the carrier wipe-off ++ bb_signal_sample = input[i] * carrier[i]; ++ ++ tmp1 = bb_signal_sample * E_code[i]; ++ tmp2 = bb_signal_sample * P_code[i]; ++ tmp3 = bb_signal_sample * L_code[i]; ++ ++ // Now get early, late, and prompt values for each ++ *E_out += (lv_32fc_t)tmp1; ++ *P_out += (lv_32fc_t)tmp2; ++ *L_out += (lv_32fc_t)tmp3; ++ } ++} ++#endif /* LV_HAVE_GENERIC */ ++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_a_H */ +diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5.h +--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5.h 1970-01-01 01:00:00.000000000 +0100 ++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5.h 2014-10-15 01:55:08.000000000 +0200 +@@ -0,0 +1,595 @@ ++/*! ++ * \file volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5.h ++ * \brief Volk protokernel: performs the carrier wipe-off mixing and the Very early, Early, Prompt, Late and very late correlation with 32 bits vectors and returns float32 values. ++ * \authors
<ul> ++ *
<li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com ++ * </ul>
++ * ++ * Volk protokernel that performs the carrier wipe-off mixing and the ++ * Very Early, Early, Prompt, Late and Very Late correlation with 32 bits vectors (16 bits the ++ * real part and 16 bits the imaginary part) and accumulates into float32 values, returning them: ++ * - The carrier wipe-off is done by multiplying the input signal by the ++ * carrier (multiplication of 32 bits vectors). It returns the input ++ * signal in base band (BB) ++ * - Very Early values are calculated by multiplying the input signal in BB by the ++ * very early code (multiplication of 32 bits vectors), converting that to float32 and accumulating the results ++ * - Early values are calculated by multiplying the input signal in BB by the ++ * early code (multiplication of 32 bits vectors), converting that to float32 and accumulating the results ++ * - Prompt values are calculated by multiplying the input signal in BB by the ++ * prompt code (multiplication of 32 bits vectors), converting that to float32 and accumulating the results ++ * - Late values are calculated by multiplying the input signal in BB by the ++ * late code (multiplication of 32 bits vectors), converting that to float32 and accumulating the results ++ * - Very Late values are calculated by multiplying the input signal in BB by the ++ * very late code (multiplication of 32 bits vectors), converting that to float32 and accumulating the results ++ * ++ * ------------------------------------------------------------------------- ++ * ++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) ++ * ++ * GNSS-SDR is a software defined Global Navigation ++ * Satellite Systems receiver ++ * ++ * This file is part of GNSS-SDR. ++ * ++ * GNSS-SDR is free software: you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation, either version 3 of the License, or ++ * (at your option) any later version.
++ * ++ * GNSS-SDR is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>. ++ * ++ * ------------------------------------------------------------------------- ++ */ ++ ++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_u_H ++#define INCLUDED_gnsssdr_volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_u_H ++ ++#include ++#include ++#include ++#include ++#include ++ ++#ifdef LV_HAVE_SSE4_1 ++#include "smmintrin.h" ++#include "CommonMacros/CommonMacros_16ic_cw_epl_corr_32fc.h" ++#include "CommonMacros/CommonMacros.h" ++ /*! ++ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation (unaligned SSE4.1 variant: every vector iteration advances each input pointer by 8 complex samples using two _mm_lddqu_si128 loads per stream; the remaining num_points % 8 samples are accumulated by the scalar tail loop; the five outputs are reset to 0 on entry) ++ \param input The input signal input ++ \param carrier The carrier signal input ++ \param VE_code Very Early PRN code replica input ++ \param E_code Early PRN code replica input ++ \param P_code Prompt PRN code replica input ++ \param L_code Late PRN code replica input ++ \param VL_code Very Late PRN code replica input ++ \param VE_out Very Early correlation output ++ \param E_out Early correlation output ++ \param P_out Prompt correlation output ++ \param L_out Late correlation output ++ \param VL_out Very Late correlation output ++ \param num_points The number of complex values in vectors ++ */ ++static inline void volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_u_sse4_1(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* VE_code, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, const lv_16sc_t* VL_code, unsigned int num_points) ++{ ++ const unsigned int sse_iters = num_points / 8; ++ ++ __m128i x1, x2, y1, y2,
real_bb_signal_sample, imag_bb_signal_sample; ++ __m128i realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output; ++ ++ __m128 real_VE_code_acc, imag_VE_code_acc, real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc, real_VL_code_acc, imag_VL_code_acc; ++ __m128i input_i_1, input_i_2, output_i32; ++ __m128 real_output_ps, imag_output_ps; ++ ++ float VE_out_real = 0; ++ float VE_out_imag = 0; ++ float E_out_real = 0; ++ float E_out_imag = 0; ++ float P_out_real = 0; ++ float P_out_imag = 0; ++ float L_out_real = 0; ++ float L_out_imag = 0; ++ float VL_out_real = 0; ++ float VL_out_imag = 0; ++ ++ const lv_16sc_t* input_ptr = input; ++ const lv_16sc_t* carrier_ptr = carrier; ++ ++ const lv_16sc_t* VE_code_ptr = VE_code; ++ lv_32fc_t* VE_out_ptr = VE_out; ++ const lv_16sc_t* E_code_ptr = E_code; ++ lv_32fc_t* E_out_ptr = E_out; ++ const lv_16sc_t* L_code_ptr = L_code; ++ lv_32fc_t* L_out_ptr = L_out; ++ const lv_16sc_t* P_code_ptr = P_code; ++ lv_32fc_t* P_out_ptr = P_out; ++ const lv_16sc_t* VL_code_ptr = VL_code; ++ lv_32fc_t* VL_out_ptr = VL_out; ++ ++ *VE_out_ptr = 0; ++ *E_out_ptr = 0; ++ *P_out_ptr = 0; ++ *L_out_ptr = 0; ++ *VL_out_ptr = 0; ++ ++ real_VE_code_acc = _mm_setzero_ps(); ++ imag_VE_code_acc = _mm_setzero_ps(); ++ real_E_code_acc = _mm_setzero_ps(); ++ imag_E_code_acc = _mm_setzero_ps(); ++ real_P_code_acc = _mm_setzero_ps(); ++ imag_P_code_acc = _mm_setzero_ps(); ++ real_L_code_acc = _mm_setzero_ps(); ++ imag_L_code_acc = _mm_setzero_ps(); ++ real_VL_code_acc = _mm_setzero_ps(); ++ imag_VL_code_acc = _mm_setzero_ps(); ++ ++ if (sse_iters>0) ++ { ++ for(unsigned int number = 0; number < sse_iters; number++){ ++ ++ //Perform the carrier wipe-off ++ x1 = _mm_lddqu_si128((__m128i*)input_ptr); ++ input_ptr += 4; ++ x2 = _mm_lddqu_si128((__m128i*)input_ptr); ++ ++ y1 = _mm_lddqu_si128((__m128i*)carrier_ptr); ++ carrier_ptr += 4; ++ y2 =
_mm_lddqu_si128((__m128i*)carrier_ptr); ++ ++ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(x1, x2, realx, imagx) ++ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy) ++ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample) ++ ++ //Get very early values ++ y1 = _mm_lddqu_si128((__m128i*)VE_code_ptr); ++ VE_code_ptr += 4; ++ y2 = _mm_lddqu_si128((__m128i*)VE_code_ptr); ++ ++ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) ++ ++ real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps); ++ imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps); ++ ++ //Get early values ++ y1 = _mm_lddqu_si128((__m128i*)E_code_ptr); ++ E_code_ptr += 4; ++ y2 = _mm_lddqu_si128((__m128i*)E_code_ptr); ++ ++ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) ++ ++ real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps); ++ imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps); ++ ++ //Get prompt values ++ y1 = _mm_lddqu_si128((__m128i*)P_code_ptr); ++ P_code_ptr += 4; ++ y2 = _mm_lddqu_si128((__m128i*)P_code_ptr); ++ ++ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) ++ ++ real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps); ++
imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps); ++ ++ //Get late values ++ y1 = _mm_lddqu_si128((__m128i*)L_code_ptr); ++ L_code_ptr += 4; ++ y2 = _mm_lddqu_si128((__m128i*)L_code_ptr); ++ ++ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) ++ ++ real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps); ++ imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps); ++ ++ //Get very late values ++ y1 = _mm_lddqu_si128((__m128i*)VL_code_ptr); ++ VL_code_ptr += 4; ++ y2 = _mm_lddqu_si128((__m128i*)VL_code_ptr); ++ ++ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) ++ ++ real_VL_code_acc = _mm_add_ps (real_VL_code_acc, real_output_ps); ++ imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps); ++ ++ input_ptr += 4; ++ carrier_ptr += 4; ++ VE_code_ptr += 4; ++ E_code_ptr += 4; ++ P_code_ptr += 4; ++ L_code_ptr += 4; ++ VL_code_ptr += 4; ++ } ++ ++ __VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4]; ++ __VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4]; ++ __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; ++ __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; ++ __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; ++ __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; ++ __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; ++ __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4]; ++ __VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4]; ++ __VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4]; ++ ++
_mm_storeu_ps((float*)real_VE_dotProductVector,real_VE_code_acc); // Store the results back into the dot product vector ++ _mm_storeu_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc); // Store the results back into the dot product vector ++ _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector ++ _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector ++ _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector ++ _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector ++ _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector ++ _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector ++ _mm_storeu_ps((float*)real_VL_dotProductVector,real_VL_code_acc); // Store the results back into the dot product vector ++ _mm_storeu_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc); // Store the results back into the dot product vector ++ ++ for (int i = 0; i<4; ++i) ++ { ++ VE_out_real += real_VE_dotProductVector[i]; ++ VE_out_imag += imag_VE_dotProductVector[i]; ++ E_out_real += real_E_dotProductVector[i]; ++ E_out_imag += imag_E_dotProductVector[i]; ++ P_out_real += real_P_dotProductVector[i]; ++ P_out_imag += imag_P_dotProductVector[i]; ++ L_out_real += real_L_dotProductVector[i]; ++ L_out_imag += imag_L_dotProductVector[i]; ++ VL_out_real += real_VL_dotProductVector[i]; ++ VL_out_imag += imag_VL_dotProductVector[i]; ++ } ++ *VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag); ++ *E_out_ptr = lv_cmake(E_out_real, E_out_imag); ++ *P_out_ptr = lv_cmake(P_out_real, P_out_imag); ++ *L_out_ptr = lv_cmake(L_out_real, L_out_imag); ++ *VL_out_ptr = lv_cmake(VL_out_real, VL_out_imag); ++ } ++ ++ lv_16sc_t
bb_signal_sample; ++ for(unsigned int i = 0; i < num_points % 8; ++i) ++ { ++ //Perform the carrier wipe-off ++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); ++ // Now get early, late, and prompt values for each ++ *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VE_code_ptr++)); ++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); ++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); ++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); ++ *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VL_code_ptr++)); ++ } ++ ++} ++#endif /* LV_HAVE_SSE4_1 */ ++ ++#ifdef LV_HAVE_GENERIC ++/*! ++ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation (scalar implementation: each product is cast to lv_32fc_t before being accumulated; the five outputs are reset to 0 on entry) ++ \param input The input signal input ++ \param carrier The carrier signal input ++ \param VE_code Very Early PRN code replica input ++ \param E_code Early PRN code replica input ++ \param P_code Prompt PRN code replica input ++ \param L_code Late PRN code replica input ++ \param VL_code Very Late PRN code replica input ++ \param VE_out Very Early correlation output ++ \param E_out Early correlation output ++ \param P_out Prompt correlation output ++ \param L_out Late correlation output ++ \param VL_out Very Late correlation output ++ \param num_points The number of complex values in vectors ++ */ ++static inline void volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* VE_code, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, const lv_16sc_t* VL_code, unsigned int num_points) ++{ ++ lv_16sc_t bb_signal_sample; ++ lv_16sc_t tmp1; ++ lv_16sc_t tmp2; ++ lv_16sc_t tmp3; ++ lv_16sc_t tmp4; ++ lv_16sc_t tmp5; ++ ++ bb_signal_sample = lv_cmake(0, 0); ++ ++ *VE_out = 0; ++ *E_out = 0; ++ *P_out = 0; ++ *L_out = 0; ++ *VL_out = 0; ++ // perform Early, Prompt and Late
correlation ++ ++ for(unsigned int i = 0; i < num_points; ++i) ++ { ++ //Perform the carrier wipe-off ++ bb_signal_sample = input[i] * carrier[i]; ++ ++ tmp1 = bb_signal_sample * VE_code[i]; ++ tmp2 = bb_signal_sample * E_code[i]; ++ tmp3 = bb_signal_sample * P_code[i]; ++ tmp4 = bb_signal_sample * L_code[i]; ++ tmp5 = bb_signal_sample * VL_code[i]; ++ ++ // Now get early, late, and prompt values for each ++ *VE_out += (lv_32fc_t)tmp1; ++ *E_out += (lv_32fc_t)tmp2; ++ *P_out += (lv_32fc_t)tmp3; ++ *L_out += (lv_32fc_t)tmp4; ++ *VL_out += (lv_32fc_t)tmp5; ++ } ++} ++#endif /* LV_HAVE_GENERIC */ ++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_u_H */ ++ ++ ++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_a_H ++#define INCLUDED_gnsssdr_volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_a_H ++ ++#include ++#include ++#include ++#include ++#include ++ ++#ifdef LV_HAVE_SSE4_1 ++#include "smmintrin.h" ++#include "CommonMacros/CommonMacros_16ic_cw_epl_corr_32fc.h" ++#include "CommonMacros/CommonMacros.h" ++/*!
++ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Vate correlation ++ \param input The input signal input ++ \param carrier The carrier signal input ++ \param VE_code Very Early PRN code replica input ++ \param E_code Early PRN code replica input ++ \param P_code Prompt PRN code replica input ++ \param L_code Late PRN code replica input ++ \param VL_code Very Late PRN code replica input ++ \param VE_out Very Early correlation output ++ \param E_out Early correlation output ++ \param P_out Prompt correlation output ++ \param L_out Late correlation output ++ \param VL_out Very Late correlation output ++ \param num_points The number of complex values in vectors ++ */ ++static inline void volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_a_sse4_1(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* VE_code, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, const lv_16sc_t* VL_code, unsigned int num_points) ++{ ++ const unsigned int sse_iters = num_points / 8; ++ ++ __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample; ++ __m128i realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output; ++ ++ __m128 real_VE_code_acc, imag_VE_code_acc, real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc, real_VL_code_acc, imag_VL_code_acc; ++ __m128i input_i_1, input_i_2, output_i32; ++ __m128 real_output_ps, imag_output_ps; ++ ++ float VE_out_real = 0; ++ float VE_out_imag = 0; ++ float E_out_real = 0; ++ float E_out_imag = 0; ++ float P_out_real = 0; ++ float P_out_imag = 0; ++ float L_out_real = 0; ++ float L_out_imag = 0; ++ float VL_out_real = 0; ++ float VL_out_imag = 0; ++ ++ const lv_16sc_t* input_ptr = input; ++ const lv_16sc_t* carrier_ptr = carrier; ++ ++ const lv_16sc_t* 
VE_code_ptr = VE_code; ++ lv_32fc_t* VE_out_ptr = VE_out; ++ const lv_16sc_t* E_code_ptr = E_code; ++ lv_32fc_t* E_out_ptr = E_out; ++ const lv_16sc_t* L_code_ptr = L_code; ++ lv_32fc_t* L_out_ptr = L_out; ++ const lv_16sc_t* P_code_ptr = P_code; ++ lv_32fc_t* P_out_ptr = P_out; ++ const lv_16sc_t* VL_code_ptr = VL_code; ++ lv_32fc_t* VL_out_ptr = VL_out; ++ ++ *VE_out_ptr = 0; ++ *E_out_ptr = 0; ++ *P_out_ptr = 0; ++ *L_out_ptr = 0; ++ *VL_out_ptr = 0; ++ ++ real_VE_code_acc = _mm_setzero_ps(); ++ imag_VE_code_acc = _mm_setzero_ps(); ++ real_E_code_acc = _mm_setzero_ps(); ++ imag_E_code_acc = _mm_setzero_ps(); ++ real_P_code_acc = _mm_setzero_ps(); ++ imag_P_code_acc = _mm_setzero_ps(); ++ real_L_code_acc = _mm_setzero_ps(); ++ imag_L_code_acc = _mm_setzero_ps(); ++ real_VL_code_acc = _mm_setzero_ps(); ++ imag_VL_code_acc = _mm_setzero_ps(); ++ ++ if (sse_iters>0) ++ { ++ for(int number = 0;number < sse_iters; number++){ ++ ++ //Perform the carrier wipe-off ++ x1 = _mm_load_si128((__m128i*)input_ptr); ++ input_ptr += 4; ++ x2 = _mm_load_si128((__m128i*)input_ptr); ++ ++ y1 = _mm_load_si128((__m128i*)carrier_ptr); ++ carrier_ptr += 4; ++ y2 = _mm_load_si128((__m128i*)carrier_ptr); ++ ++ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(x1, x2, realx, imagx) ++ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy) ++ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample) ++ ++ //Get very early values ++ y1 = _mm_load_si128((__m128i*)VE_code_ptr); ++ VE_code_ptr += 4; ++ y2 = _mm_load_si128((__m128i*)VE_code_ptr); ++ ++ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) ++ ++ 
real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps); ++ imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps); ++ ++ //Get early values ++ y1 = _mm_load_si128((__m128i*)E_code_ptr); ++ E_code_ptr += 4; ++ y2 = _mm_load_si128((__m128i*)E_code_ptr); ++ ++ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) ++ ++ real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps); ++ imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps); ++ ++ //Get prompt values ++ y1 = _mm_load_si128((__m128i*)P_code_ptr); ++ P_code_ptr += 4; ++ y2 = _mm_load_si128((__m128i*)P_code_ptr); ++ ++ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) ++ ++ real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps); ++ imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps); ++ ++ //Get late values ++ y1 = _mm_load_si128((__m128i*)L_code_ptr); ++ L_code_ptr += 4; ++ y2 = _mm_load_si128((__m128i*)L_code_ptr); ++ ++ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) ++ ++ real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps); ++ imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps); ++ ++ //Get very late values ++ y1 = _mm_load_si128((__m128i*)VL_code_ptr); ++ VL_code_ptr += 4; ++ y2 = _mm_load_si128((__m128i*)VL_code_ptr); ++ ++ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, 
real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) ++ ++ real_VL_code_acc = _mm_add_ps (real_VL_code_acc, real_output_ps); ++ imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps); ++ ++ input_ptr += 4; ++ carrier_ptr += 4; ++ VE_code_ptr += 4; ++ E_code_ptr += 4; ++ P_code_ptr += 4; ++ L_code_ptr += 4; ++ VL_code_ptr += 4; ++ } ++ ++ __VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4]; ++ __VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4]; ++ __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; ++ __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; ++ __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; ++ __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; ++ __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; ++ __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4]; ++ __VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4]; ++ __VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4]; ++ ++ _mm_store_ps((float*)real_VE_dotProductVector,real_VE_code_acc); // Store the results back into the dot product vector ++ _mm_store_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc); // Store the results back into the dot product vector ++ _mm_store_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector ++ _mm_store_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector ++ _mm_store_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector ++ _mm_store_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector ++ _mm_store_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector ++ 
_mm_store_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector ++ _mm_store_ps((float*)real_VL_dotProductVector,real_VL_code_acc); // Store the results back into the dot product vector ++ _mm_store_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc); // Store the results back into the dot product vector ++ ++ for (int i = 0; i<4; ++i) ++ { ++ VE_out_real += real_VE_dotProductVector[i]; ++ VE_out_imag += imag_VE_dotProductVector[i]; ++ E_out_real += real_E_dotProductVector[i]; ++ E_out_imag += imag_E_dotProductVector[i]; ++ P_out_real += real_P_dotProductVector[i]; ++ P_out_imag += imag_P_dotProductVector[i]; ++ L_out_real += real_L_dotProductVector[i]; ++ L_out_imag += imag_L_dotProductVector[i]; ++ VL_out_real += real_VL_dotProductVector[i]; ++ VL_out_imag += imag_VL_dotProductVector[i]; ++ } ++ *VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag); ++ *E_out_ptr = lv_cmake(E_out_real, E_out_imag); ++ *P_out_ptr = lv_cmake(P_out_real, P_out_imag); ++ *L_out_ptr = lv_cmake(L_out_real, L_out_imag); ++ *VL_out_ptr = lv_cmake(VL_out_real, VL_out_imag); ++ } ++ ++ lv_16sc_t bb_signal_sample; ++ for(int i=0; i < num_points%8; ++i) ++ { ++ //Perform the carrier wipe-off ++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); ++ // Now get early, late, and prompt values for each ++ *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VE_code_ptr++)); ++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); ++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); ++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); ++ *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VL_code_ptr++)); ++ } ++ ++} ++#endif /* LV_HAVE_SSE4_1 */ ++ ++#ifdef LV_HAVE_GENERIC ++/*! 
++ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Vate correlation ++ \param input The input signal input ++ \param carrier The carrier signal input ++ \param VE_code Very Early PRN code replica input ++ \param E_code Early PRN code replica input ++ \param P_code Prompt PRN code replica input ++ \param L_code Late PRN code replica input ++ \param VL_code Very Late PRN code replica input ++ \param VE_out Very Early correlation output ++ \param E_out Early correlation output ++ \param P_out Prompt correlation output ++ \param L_out Late correlation output ++ \param VL_out Very Late correlation output ++ \param num_points The number of complex values in vectors ++ */ ++static inline void volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_a_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* VE_code, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, const lv_16sc_t* VL_code, unsigned int num_points) ++{ ++ lv_16sc_t bb_signal_sample; ++ lv_16sc_t tmp1; ++ lv_16sc_t tmp2; ++ lv_16sc_t tmp3; ++ lv_16sc_t tmp4; ++ lv_16sc_t tmp5; ++ ++ bb_signal_sample = lv_cmake(0, 0); ++ ++ *VE_out = 0; ++ *E_out = 0; ++ *P_out = 0; ++ *L_out = 0; ++ *VL_out = 0; ++ // perform Early, Prompt and Late correlation ++ ++ for(int i=0; i < num_points; ++i) ++ { ++ //Perform the carrier wipe-off ++ bb_signal_sample = input[i] * carrier[i]; ++ ++ tmp1 = bb_signal_sample * VE_code[i]; ++ tmp2 = bb_signal_sample * E_code[i]; ++ tmp3 = bb_signal_sample * P_code[i]; ++ tmp4 = bb_signal_sample * L_code[i]; ++ tmp5 = bb_signal_sample * VL_code[i]; ++ ++ // Now get early, late, and prompt values for each ++ *VE_out += (lv_32fc_t)tmp1; ++ *E_out += (lv_32fc_t)tmp2; ++ *P_out += (lv_32fc_t)tmp3; ++ *L_out += (lv_32fc_t)tmp4; ++ *VL_out += (lv_32fc_t)tmp5; ++ } ++} ++#endif /* LV_HAVE_GENERIC */ ++#endif /* 
INCLUDED_gnsssdr_volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_a_H */ +diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_accumulator_s32f.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32f_accumulator_s32f.h +--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_accumulator_s32f.h 1970-01-01 01:00:00.000000000 +0100 ++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32f_accumulator_s32f.h 2014-10-15 01:55:08.000000000 +0200 +@@ -0,0 +1,68 @@ ++#ifndef INCLUDED_volk_gnsssdr_32f_accumulator_s32f_a_H ++#define INCLUDED_volk_gnsssdr_32f_accumulator_s32f_a_H ++ ++#include ++#include ++#include ++ ++#ifdef LV_HAVE_SSE ++#include ++/*! ++ \brief Accumulates the values in the input buffer ++ \param result The accumulated result ++ \param inputBuffer The buffer of data to be accumulated ++ \param num_points The number of values in inputBuffer to be accumulated ++*/ ++static inline void volk_gnsssdr_32f_accumulator_s32f_a_sse(float* result, const float* inputBuffer, unsigned int num_points){ ++ float returnValue = 0; ++ unsigned int number = 0; ++ const unsigned int quarterPoints = num_points / 4; ++ ++ const float* aPtr = inputBuffer; ++ __VOLK_ATTR_ALIGNED(16) float tempBuffer[4]; ++ ++ __m128 accumulator = _mm_setzero_ps(); ++ __m128 aVal = _mm_setzero_ps(); ++ ++ for(;number < quarterPoints; number++){ ++ aVal = _mm_load_ps(aPtr); ++ accumulator = _mm_add_ps(accumulator, aVal); ++ aPtr += 4; ++ } ++ _mm_store_ps(tempBuffer,accumulator); // Store the results back into the C container ++ returnValue = tempBuffer[0]; ++ returnValue += tempBuffer[1]; ++ returnValue += tempBuffer[2]; ++ returnValue += tempBuffer[3]; ++ ++ number = quarterPoints * 4; ++ for(;number < num_points; number++){ ++ returnValue += (*aPtr++); ++ } ++ *result = returnValue; ++} ++#endif /* LV_HAVE_SSE */ ++ ++#ifdef LV_HAVE_GENERIC ++/*! 
++ \brief Accumulates the values in the input buffer ++ \param result The accumulated result ++ \param inputBuffer The buffer of data to be accumulated ++ \param num_points The number of values in inputBuffer to be accumulated ++*/ ++static inline void volk_gnsssdr_32f_accumulator_s32f_generic(float* result, const float* inputBuffer, unsigned int num_points){ ++ const float* aPtr = inputBuffer; ++ unsigned int number = 0; ++ float returnValue = 0; ++ ++ for(;number < num_points; number++){ ++ returnValue += (*aPtr++); ++ } ++ *result = returnValue; ++} ++#endif /* LV_HAVE_GENERIC */ ++ ++ ++ ++ ++#endif /* INCLUDED_volk_gnsssdr_32f_accumulator_s32f_a_H */ +diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_index_max_16u.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32f_index_max_16u.h +--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_index_max_16u.h 1970-01-01 01:00:00.000000000 +0100 ++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32f_index_max_16u.h 2014-10-15 01:55:08.000000000 +0200 +@@ -0,0 +1,149 @@ ++#ifndef INCLUDED_volk_gnsssdr_32f_index_max_16u_a_H ++#define INCLUDED_volk_gnsssdr_32f_index_max_16u_a_H ++ ++#include ++#include ++#include ++#include ++ ++#ifdef LV_HAVE_SSE4_1 ++#include ++ ++static inline void volk_gnsssdr_32f_index_max_16u_a_sse4_1(unsigned int* target, const float* src0, unsigned int num_points) { ++ if(num_points > 0){ ++ unsigned int number = 0; ++ const unsigned int quarterPoints = num_points / 4; ++ ++ float* inputPtr = (float*)src0; ++ ++ __m128 indexIncrementValues = _mm_set1_ps(4); ++ __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4); ++ ++ float max = src0[0]; ++ float index = 0; ++ __m128 maxValues = _mm_set1_ps(max); ++ __m128 maxValuesIndex = _mm_setzero_ps(); ++ __m128 compareResults; ++ __m128 currentValues; ++ ++ __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; ++ __VOLK_ATTR_ALIGNED(16) float 
maxIndexesBuffer[4]; ++ ++ for(;number < quarterPoints; number++){ ++ ++ currentValues = _mm_load_ps(inputPtr); inputPtr += 4; ++ currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); ++ ++ compareResults = _mm_cmpgt_ps(maxValues, currentValues); ++ ++ maxValuesIndex = _mm_blendv_ps(currentIndexes, maxValuesIndex, compareResults); ++ maxValues = _mm_blendv_ps(currentValues, maxValues, compareResults); ++ } ++ ++ // Calculate the largest value from the remaining 4 points ++ _mm_store_ps(maxValuesBuffer, maxValues); ++ _mm_store_ps(maxIndexesBuffer, maxValuesIndex); ++ ++ for(number = 0; number < 4; number++){ ++ if(maxValuesBuffer[number] > max){ ++ index = maxIndexesBuffer[number]; ++ max = maxValuesBuffer[number]; ++ } ++ } ++ ++ number = quarterPoints * 4; ++ for(;number < num_points; number++){ ++ if(src0[number] > max){ ++ index = number; ++ max = src0[number]; ++ } ++ } ++ target[0] = (unsigned int)index; ++ } ++} ++ ++#endif /*LV_HAVE_SSE4_1*/ ++ ++#ifdef LV_HAVE_SSE ++#include ++ ++static inline void volk_gnsssdr_32f_index_max_16u_a_sse(unsigned int* target, const float* src0, unsigned int num_points) { ++ if(num_points > 0){ ++ unsigned int number = 0; ++ const unsigned int quarterPoints = num_points / 4; ++ ++ float* inputPtr = (float*)src0; ++ ++ __m128 indexIncrementValues = _mm_set1_ps(4); ++ __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4); ++ ++ float max = src0[0]; ++ float index = 0; ++ __m128 maxValues = _mm_set1_ps(max); ++ __m128 maxValuesIndex = _mm_setzero_ps(); ++ __m128 compareResults; ++ __m128 currentValues; ++ ++ __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; ++ __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; ++ ++ for(;number < quarterPoints; number++){ ++ ++ currentValues = _mm_load_ps(inputPtr); inputPtr += 4; ++ currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); ++ ++ compareResults = _mm_cmpgt_ps(maxValues, currentValues); ++ ++ maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, maxValuesIndex) , 
_mm_andnot_ps(compareResults, currentIndexes)); ++ ++ maxValues = _mm_or_ps(_mm_and_ps(compareResults, maxValues) , _mm_andnot_ps(compareResults, currentValues)); ++ } ++ ++ // Calculate the largest value from the remaining 4 points ++ _mm_store_ps(maxValuesBuffer, maxValues); ++ _mm_store_ps(maxIndexesBuffer, maxValuesIndex); ++ ++ for(number = 0; number < 4; number++){ ++ if(maxValuesBuffer[number] > max){ ++ index = maxIndexesBuffer[number]; ++ max = maxValuesBuffer[number]; ++ } ++ } ++ ++ number = quarterPoints * 4; ++ for(;number < num_points; number++){ ++ if(src0[number] > max){ ++ index = number; ++ max = src0[number]; ++ } ++ } ++ target[0] = (unsigned int)index; ++ } ++} ++ ++#endif /*LV_HAVE_SSE*/ ++ ++#ifdef LV_HAVE_GENERIC ++static inline void volk_gnsssdr_32f_index_max_16u_generic(unsigned int* target, const float* src0, unsigned int num_points) { ++ if(num_points > 0){ ++ float max = src0[0]; ++ unsigned int index = 0; ++ ++ unsigned int i = 1; ++ ++ for(; i < num_points; ++i) { ++ ++ if(src0[i] > max){ ++ index = i; ++ max = src0[i]; ++ } ++ ++ } ++ target[0] = index; ++ } ++} ++ ++#endif /*LV_HAVE_GENERIC*/ ++ ++ ++#endif /*INCLUDED_volk_gnsssdr_32f_index_max_16u_a_H*/ +diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_s32f_convert_16i.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32f_s32f_convert_16i.h +--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_s32f_convert_16i.h 1970-01-01 01:00:00.000000000 +0100 ++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32f_s32f_convert_16i.h 2014-10-15 01:55:08.000000000 +0200 +@@ -0,0 +1,302 @@ ++#ifndef INCLUDED_volk_gnsssdr_32f_s32f_convert_16i_u_H ++#define INCLUDED_volk_gnsssdr_32f_s32f_convert_16i_u_H ++ ++#include ++#include ++#include ++ ++#ifdef LV_HAVE_SSE2 ++#include ++ /*! 
++ \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value ++ \param inputVector The floating point input data buffer ++ \param outputVector The 16 bit output data buffer ++ \param scalar The value multiplied against each point in the input buffer ++ \param num_points The number of data values to be converted ++ \note Input buffer does NOT need to be properly aligned ++ */ ++static inline void volk_gnsssdr_32f_s32f_convert_16i_u_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ ++ unsigned int number = 0; ++ ++ const unsigned int eighthPoints = num_points / 8; ++ ++ const float* inputVectorPtr = (const float*)inputVector; ++ int16_t* outputVectorPtr = outputVector; ++ ++ float min_val = -32768; ++ float max_val = 32767; ++ float r; ++ ++ __m128 vScalar = _mm_set_ps1(scalar); ++ __m128 inputVal1, inputVal2; ++ __m128i intInputVal1, intInputVal2; ++ __m128 ret1, ret2; ++ __m128 vmin_val = _mm_set_ps1(min_val); ++ __m128 vmax_val = _mm_set_ps1(max_val); ++ ++ for(;number < eighthPoints; number++){ ++ inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; ++ inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; ++ ++ // Scale and clip ++ ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); ++ ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val); ++ ++ intInputVal1 = _mm_cvtps_epi32(ret1); ++ intInputVal2 = _mm_cvtps_epi32(ret2); ++ ++ intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); ++ ++ _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); ++ outputVectorPtr += 8; ++ } ++ ++ number = eighthPoints * 8; ++ for(; number < num_points; number++){ ++ r = inputVector[number] * scalar; ++ if(r > max_val) ++ r = max_val; ++ else if(r < min_val) ++ r = min_val; ++ outputVector[number] = (int16_t)rintf(r); ++ } ++} ++#endif /* LV_HAVE_SSE2 */ ++ ++#ifdef LV_HAVE_SSE 
++#include ++ /*! ++ \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value ++ \param inputVector The floating point input data buffer ++ \param outputVector The 16 bit output data buffer ++ \param scalar The value multiplied against each point in the input buffer ++ \param num_points The number of data values to be converted ++ \note Input buffer does NOT need to be properly aligned ++ */ ++static inline void volk_gnsssdr_32f_s32f_convert_16i_u_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ ++ unsigned int number = 0; ++ ++ const unsigned int quarterPoints = num_points / 4; ++ ++ const float* inputVectorPtr = (const float*)inputVector; ++ int16_t* outputVectorPtr = outputVector; ++ ++ float min_val = -32768; ++ float max_val = 32767; ++ float r; ++ ++ __m128 vScalar = _mm_set_ps1(scalar); ++ __m128 ret; ++ __m128 vmin_val = _mm_set_ps1(min_val); ++ __m128 vmax_val = _mm_set_ps1(max_val); ++ ++ __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; ++ ++ for(;number < quarterPoints; number++){ ++ ret = _mm_loadu_ps(inputVectorPtr); ++ inputVectorPtr += 4; ++ ++ // Scale and clip ++ ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); ++ ++ _mm_store_ps(outputFloatBuffer, ret); ++ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]); ++ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]); ++ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]); ++ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]); ++ } ++ ++ number = quarterPoints * 4; ++ for(; number < num_points; number++){ ++ r = inputVector[number] * scalar; ++ if(r > max_val) ++ r = max_val; ++ else if(r < min_val) ++ r = min_val; ++ outputVector[number] = (int16_t)rintf(r); ++ } ++} ++#endif /* LV_HAVE_SSE */ ++ ++#ifdef LV_HAVE_GENERIC ++ /*! 
++ \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value ++ \param inputVector The floating point input data buffer ++ \param outputVector The 16 bit output data buffer ++ \param scalar The value multiplied against each point in the input buffer ++ \param num_points The number of data values to be converted ++ \note Input buffer does NOT need to be properly aligned ++ */ ++static inline void volk_gnsssdr_32f_s32f_convert_16i_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ ++ int16_t* outputVectorPtr = outputVector; ++ const float* inputVectorPtr = inputVector; ++ unsigned int number = 0; ++ float min_val = -32768; ++ float max_val = 32767; ++ float r; ++ ++ for(number = 0; number < num_points; number++){ ++ r = *inputVectorPtr++ * scalar; ++ if(r > max_val) ++ r = max_val; ++ else if(r < min_val) ++ r = min_val; ++ *outputVectorPtr++ = (int16_t)rintf(r); ++ } ++} ++#endif /* LV_HAVE_GENERIC */ ++ ++ ++ ++ ++#endif /* INCLUDED_volk_gnsssdr_32f_s32f_convert_16i_u_H */ ++#ifndef INCLUDED_volk_gnsssdr_32f_s32f_convert_16i_a_H ++#define INCLUDED_volk_gnsssdr_32f_s32f_convert_16i_a_H ++ ++#include ++#include ++#include ++#include ++ ++#ifdef LV_HAVE_SSE2 ++#include ++ /*! 
++ \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value ++ \param inputVector The floating point input data buffer ++ \param outputVector The 16 bit output data buffer ++ \param scalar The value multiplied against each point in the input buffer ++ \param num_points The number of data values to be converted ++ */ ++static inline void volk_gnsssdr_32f_s32f_convert_16i_a_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ ++ unsigned int number = 0; ++ ++ const unsigned int eighthPoints = num_points / 8; ++ ++ const float* inputVectorPtr = (const float*)inputVector; ++ int16_t* outputVectorPtr = outputVector; ++ ++ float min_val = -32768; ++ float max_val = 32767; ++ float r; ++ ++ __m128 vScalar = _mm_set_ps1(scalar); ++ __m128 inputVal1, inputVal2; ++ __m128i intInputVal1, intInputVal2; ++ __m128 ret1, ret2; ++ __m128 vmin_val = _mm_set_ps1(min_val); ++ __m128 vmax_val = _mm_set_ps1(max_val); ++ ++ for(;number < eighthPoints; number++){ ++ inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; ++ inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; ++ ++ // Scale and clip ++ ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); ++ ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val); ++ ++ intInputVal1 = _mm_cvtps_epi32(ret1); ++ intInputVal2 = _mm_cvtps_epi32(ret2); ++ ++ intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); ++ ++ _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); ++ outputVectorPtr += 8; ++ } ++ ++ number = eighthPoints * 8; ++ for(; number < num_points; number++){ ++ r = inputVector[number] * scalar; ++ if(r > max_val) ++ r = max_val; ++ else if(r < min_val) ++ r = min_val; ++ outputVector[number] = (int16_t)rintf(r); ++ } ++} ++#endif /* LV_HAVE_SSE2 */ ++ ++#ifdef LV_HAVE_SSE ++#include ++ /*! 
++ \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value ++ \param inputVector The floating point input data buffer ++ \param outputVector The 16 bit output data buffer ++ \param scalar The value multiplied against each point in the input buffer ++ \param num_points The number of data values to be converted ++ */ ++static inline void volk_gnsssdr_32f_s32f_convert_16i_a_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ ++ unsigned int number = 0; ++ ++ const unsigned int quarterPoints = num_points / 4; ++ ++ const float* inputVectorPtr = (const float*)inputVector; ++ int16_t* outputVectorPtr = outputVector; ++ ++ float min_val = -32768; ++ float max_val = 32767; ++ float r; ++ ++ __m128 vScalar = _mm_set_ps1(scalar); ++ __m128 ret; ++ __m128 vmin_val = _mm_set_ps1(min_val); ++ __m128 vmax_val = _mm_set_ps1(max_val); ++ ++ __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; ++ ++ for(;number < quarterPoints; number++){ ++ ret = _mm_load_ps(inputVectorPtr); ++ inputVectorPtr += 4; ++ ++ // Scale and clip ++ ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); ++ ++ _mm_store_ps(outputFloatBuffer, ret); ++ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]); ++ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]); ++ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]); ++ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]); ++ } ++ ++ number = quarterPoints * 4; ++ for(; number < num_points; number++){ ++ r = inputVector[number] * scalar; ++ if(r > max_val) ++ r = max_val; ++ else if(r < min_val) ++ r = min_val; ++ outputVector[number] = (int16_t)rintf(r); ++ } ++} ++#endif /* LV_HAVE_SSE */ ++ ++#ifdef LV_HAVE_GENERIC ++ /*! 
++ \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value ++ \param inputVector The floating point input data buffer ++ \param outputVector The 16 bit output data buffer ++ \param scalar The value multiplied against each point in the input buffer ++ \param num_points The number of data values to be converted ++ */ ++static inline void volk_gnsssdr_32f_s32f_convert_16i_a_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ ++ int16_t* outputVectorPtr = outputVector; ++ const float* inputVectorPtr = inputVector; ++ unsigned int number = 0; ++ float min_val = -32768; ++ float max_val = 32767; ++ float r; ++ ++ for(number = 0; number < num_points; number++){ ++ r = *inputVectorPtr++ * scalar; ++ if(r < min_val) ++ r = min_val; ++ else if(r > max_val) ++ r = max_val; ++ *outputVectorPtr++ = (int16_t)rintf(r); ++ } ++} ++#endif /* LV_HAVE_GENERIC */ ++ ++ ++ ++ ++#endif /* INCLUDED_volk_gnsssdr_32f_s32f_convert_16i_a_H */ +diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_x2_add_32f.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32f_x2_add_32f.h +--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_x2_add_32f.h 1970-01-01 01:00:00.000000000 +0100 ++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32f_x2_add_32f.h 2014-10-15 01:55:08.000000000 +0200 +@@ -0,0 +1,147 @@ ++#ifndef INCLUDED_volk_gnsssdr_32f_x2_add_32f_u_H ++#define INCLUDED_volk_gnsssdr_32f_x2_add_32f_u_H ++ ++#include ++#include ++ ++#ifdef LV_HAVE_SSE ++#include ++/*! 
++  \brief Computes the element-wise sum of two float vectors using unaligned SSE loads and stores
++  \param cVector The vector where the results will be stored
++  \param aVector One of the vectors to be added
++  \param bVector One of the vectors to be added
++  \param num_points The number of values in aVector and bVector to be added together and stored into cVector
++*/
++static inline void volk_gnsssdr_32f_x2_add_32f_u_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
++    const unsigned int simd_end = (num_points / 4) * 4;
++    unsigned int idx;
++
++    // Vector part: four sums per iteration
++    for(idx = 0; idx < simd_end; idx += 4){
++        const __m128 lhs = _mm_loadu_ps(aVector + idx);
++        const __m128 rhs = _mm_loadu_ps(bVector + idx);
++
++        _mm_storeu_ps(cVector + idx, _mm_add_ps(lhs, rhs));
++    }
++
++    // Scalar part: the samples that do not fill a whole register
++    for(; idx < num_points; idx++){
++        cVector[idx] = aVector[idx] + bVector[idx];
++    }
++}
++#endif /* LV_HAVE_SSE */
++
++#ifdef LV_HAVE_GENERIC
++/*!
++  \brief Computes the element-wise sum of two float vectors with plain C code
++  \param cVector The vector where the results will be stored
++  \param aVector One of the vectors to be added
++  \param bVector One of the vectors to be added
++  \param num_points The number of values in aVector and bVector to be added together and stored into cVector
++*/
++static inline void volk_gnsssdr_32f_x2_add_32f_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
++    unsigned int idx;
++
++    for(idx = 0; idx < num_points; idx++){
++        cVector[idx] = aVector[idx] + bVector[idx];
++    }
++}
++#endif /* LV_HAVE_GENERIC */
++
++#endif /* INCLUDED_volk_gnsssdr_32f_x2_add_32f_u_H */
++#ifndef INCLUDED_volk_gnsssdr_32f_x2_add_32f_a_H
++#define INCLUDED_volk_gnsssdr_32f_x2_add_32f_a_H
++
++#include
++#include
++
++#ifdef LV_HAVE_SSE
++#include
++/*!
++  \brief Computes the element-wise sum of two float vectors using aligned SSE loads and stores
++  \param cVector The vector where the results will be stored
++  \param aVector One of the vectors to be added
++  \param bVector One of the vectors to be added
++  \param num_points The number of values in aVector and bVector to be added together and stored into cVector
++*/
++static inline void volk_gnsssdr_32f_x2_add_32f_a_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
++    const unsigned int simd_end = (num_points / 4) * 4;
++    unsigned int idx;
++
++    // Vector part: four sums per iteration, accessed with the aligned intrinsics
++    for(idx = 0; idx < simd_end; idx += 4){
++        const __m128 lhs = _mm_load_ps(aVector + idx);
++        const __m128 rhs = _mm_load_ps(bVector + idx);
++
++        _mm_store_ps(cVector + idx, _mm_add_ps(lhs, rhs));
++    }
++
++    // Scalar part: the samples that do not fill a whole register
++    for(; idx < num_points; idx++){
++        cVector[idx] = aVector[idx] + bVector[idx];
++    }
++}
++#endif /* LV_HAVE_SSE */
++
++#ifdef LV_HAVE_GENERIC
++/*!
++  \brief Computes the element-wise sum of two float vectors with plain C code
++  \param cVector The vector where the results will be stored
++  \param aVector One of the vectors to be added
++  \param bVector One of the vectors to be added
++  \param num_points The number of values in aVector and bVector to be added together and stored into cVector
++*/
++static inline void volk_gnsssdr_32f_x2_add_32f_a_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
++    unsigned int idx;
++
++    for(idx = 0; idx < num_points; idx++){
++        cVector[idx] = aVector[idx] + bVector[idx];
++    }
++}
++#endif /* LV_HAVE_GENERIC */
++
++#ifdef LV_HAVE_ORC
++/*!
++  \brief Adds the two input vectors by forwarding the call to the externally defined ORC implementation
++  \param cVector The vector where the results will be stored
++  \param aVector One of the vectors to be added
++  \param bVector One of the vectors to be added
++  \param num_points The number of values in aVector and bVector to be added together and stored into cVector
++*/
++extern void volk_gnsssdr_32f_x2_add_32f_a_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
++static inline void volk_gnsssdr_32f_x2_add_32f_u_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
++    volk_gnsssdr_32f_x2_add_32f_a_orc_impl(cVector, aVector, bVector, num_points);
++}
++#endif /* LV_HAVE_ORC */
++
++
++#endif /* INCLUDED_volk_gnsssdr_32f_x2_add_32f_a_H */
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_conjugate_32fc.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_conjugate_32fc.h
+---
/Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_conjugate_32fc.h 1970-01-01 01:00:00.000000000 +0100
++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_conjugate_32fc.h 2014-10-15 01:55:08.000000000 +0200
+@@ -0,0 +1,127 @@
++#ifndef INCLUDED_volk_gnsssdr_32fc_conjugate_32fc_u_H
++#define INCLUDED_volk_gnsssdr_32fc_conjugate_32fc_u_H
++
++#include
++#include
++#include
++#include
++
++#ifdef LV_HAVE_SSE3
++#include
++  /*!
++    \brief Conjugates a complex vector, two samples per iteration, using unaligned loads and stores
++    \param cVector The vector where the results will be stored
++    \param aVector Vector to be conjugated
++    \param num_points The number of complex values in aVector to be conjugated and stored into cVector
++  */
++static inline void volk_gnsssdr_32fc_conjugate_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){
++    // Sign-bit mask on lanes 1 and 3: XOR with it negates both imaginary parts
++    const __m128 sign_flip = _mm_setr_ps(0, -0.f, 0, -0.f);
++    const lv_32fc_t* src = aVector;
++    lv_32fc_t* dst = cVector;
++    unsigned int pairs_left;
++
++    for(pairs_left = num_points / 2; pairs_left != 0; pairs_left--){
++        // Two complex samples laid out as ar,ai,br,bi
++        const __m128 pair = _mm_loadu_ps((float*)src);
++
++        _mm_storeu_ps((float*)dst, _mm_xor_ps(pair, sign_flip));
++
++        src += 2;
++        dst += 2;
++    }
++
++    // Odd sample count: one sample is left over
++    if((num_points % 2) != 0){
++        *dst = lv_conj(*src);
++    }
++}
++#endif /* LV_HAVE_SSE3 */
++
++#ifdef LV_HAVE_GENERIC
++  /*!
++    \brief Takes the conjugate of a complex vector.
++    \param cVector The vector where the results will be stored
++    \param aVector Vector to be conjugated
++    \param num_points The number of complex values in aVector to be conjugated and stored into cVector
++  */
++static inline void volk_gnsssdr_32fc_conjugate_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){
++    unsigned int idx;
++
++    // One lv_conj() call per sample, in input order
++    for(idx = 0; idx < num_points; idx++){
++        cVector[idx] = lv_conj(aVector[idx]);
++    }
++}
++#endif /* LV_HAVE_GENERIC */
++
++
++#endif /* INCLUDED_volk_gnsssdr_32fc_conjugate_32fc_u_H */
++#ifndef INCLUDED_volk_gnsssdr_32fc_conjugate_32fc_a_H
++#define INCLUDED_volk_gnsssdr_32fc_conjugate_32fc_a_H
++
++#include
++#include
++#include
++#include
++
++#ifdef LV_HAVE_SSE3
++#include
++  /*!
++    \brief Conjugates a complex vector, two samples per iteration, using aligned loads and stores
++    \param cVector The vector where the results will be stored
++    \param aVector Vector to be conjugated
++    \param num_points The number of complex values in aVector to be conjugated and stored into cVector
++  */
++static inline void volk_gnsssdr_32fc_conjugate_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){
++    // Sign-bit mask on lanes 1 and 3: XOR with it negates both imaginary parts
++    const __m128 sign_flip = _mm_setr_ps(0, -0.f, 0, -0.f);
++    const lv_32fc_t* src = aVector;
++    lv_32fc_t* dst = cVector;
++    unsigned int pairs_left;
++
++    for(pairs_left = num_points / 2; pairs_left != 0; pairs_left--){
++        // Two complex samples laid out as ar,ai,br,bi
++        const __m128 pair = _mm_load_ps((float*)src);
++
++        _mm_store_ps((float*)dst, _mm_xor_ps(pair, sign_flip));
++
++        src += 2;
++        dst += 2;
++    }
++
++    // Odd sample count: one sample is left over
++    if((num_points % 2) != 0){
++        *dst = lv_conj(*src);
++    }
++}
++#endif /* LV_HAVE_SSE3 */
++
++#ifdef LV_HAVE_GENERIC
++  /*!
++    \brief Takes the conjugate of a complex vector.
++    \param cVector The vector where the results will be stored
++    \param aVector Vector to be conjugated
++    \param num_points The number of complex values in aVector to be conjugated and stored into cVector
++  */
++static inline void volk_gnsssdr_32fc_conjugate_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){
++    unsigned int idx;
++
++    // One lv_conj() call per sample, in input order
++    for(idx = 0; idx < num_points; idx++){
++        cVector[idx] = lv_conj(aVector[idx]);
++    }
++}
++#endif /* LV_HAVE_GENERIC */
++
++#endif /* INCLUDED_volk_gnsssdr_32fc_conjugate_32fc_a_H */
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_16ic.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_16ic.h
+--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_16ic.h 1970-01-01 01:00:00.000000000 +0100
++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_16ic.h 2014-10-15 01:55:08.000000000 +0200
+@@ -0,0 +1,295 @@
++/*!
++ * \file volk_gnsssdr_32fc_convert_16ic.h
++ * \brief Volk protokernel: converts float32 complex values to 16 integer complex values taking care of overflow
++ * \authors
++ *          <ul>
++ *          <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
++ *          </ul>
++ *
++ * -------------------------------------------------------------------------
++ *
++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
++ *
++ * GNSS-SDR is a software defined Global Navigation
++ * Satellite Systems receiver
++ *
++ * This file is part of GNSS-SDR.
++ *
++ * GNSS-SDR is free software: you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation, either version 3 of the License, or
++ * (at your option) any later version.
++ *
++ * GNSS-SDR is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
++ *
++ * -------------------------------------------------------------------------
++ */
++
++#ifndef INCLUDED_volk_gnsssdr_32fc_convert_16ic_u_H
++#define INCLUDED_volk_gnsssdr_32fc_convert_16ic_u_H
++
++#include
++#include
++#include
++
++#ifdef LV_HAVE_SSE2
++#include
++/*!
++ \brief Converts a vector of complex floats (32 bits each part) into a vector of complex integers (16 bits each part), saturating values outside the 16 bit signed range. Unaligned SSE2 version.
++ \param outputVector The complex 16 bit integer output data buffer
++ \param inputVector The complex floating point input data buffer; it is only read
++ \param num_points The number of complex data values to be converted
++ */
++static inline void volk_gnsssdr_32fc_convert_16ic_u_sse2(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){
++    const unsigned int sse_iters = num_points/4;
++
++    const float* inputVectorPtr = (const float*)inputVector;
++    int16_t* outputVectorPtr = (int16_t*)outputVector;
++
++    const float min_val = -32768;
++    const float max_val = 32767;
++    float aux;
++
++    __m128 inputVal1, inputVal2;
++    __m128i intInputVal1, intInputVal2;
++    __m128 ret1, ret2;
++    const __m128 vmin_val = _mm_set_ps1(min_val);
++    const __m128 vmax_val = _mm_set_ps1(max_val);
++
++    // Each iteration converts four complex samples (eight floats)
++    for(unsigned int i = 0; i < sse_iters; i++){
++        inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
++        inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
++
++        // Clip
++        ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
++        ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
++
++        intInputVal1 = _mm_cvtps_epi32(ret1);
++        intInputVal2 = _mm_cvtps_epi32(ret2);
++
++        // Narrow the eight 32 bit integers to eight 16 bit integers
++        intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
++
++        _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
++        outputVectorPtr += 8;
++    }
++
++    // Remaining (num_points % 4) complex samples, two floats each.
++    // The clipping is done on a local copy: the input buffer is const and must not be written.
++    for(unsigned int i = 0; i < (num_points%4)*2; i++){
++        aux = inputVectorPtr[i];
++        if(aux > max_val)
++            aux = max_val;
++        else if(aux < min_val)
++            aux = min_val;
++        outputVectorPtr[i] = (int16_t)rintf(aux);
++    }
++}
++#endif /* LV_HAVE_SSE2 */
++
++#ifdef LV_HAVE_SSE
++#include
++/*!
++ \brief Converts a vector of complex floats (32 bits each part) into a vector of complex integers (16 bits each part), saturating values outside the 16 bit signed range. Unaligned SSE version.
++ \param outputVector The complex 16 bit integer output data buffer
++ \param inputVector The complex floating point input data buffer; it is only read
++ \param num_points The number of complex data values to be converted
++ */
++static inline void volk_gnsssdr_32fc_convert_16ic_u_sse(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){
++    const unsigned int sse_iters = num_points/4;
++
++    const float* inputVectorPtr = (const float*)inputVector;
++    int16_t* outputVectorPtr = (int16_t*)outputVector;
++
++    const float min_val = -32768;
++    const float max_val = 32767;
++    float aux;
++
++    __m128 ret1, ret2;
++    const __m128 vmin_val = _mm_set_ps1(min_val);
++    const __m128 vmax_val = _mm_set_ps1(max_val);
++
++    // Clipped samples are spilled here and rounded one by one.
++    // __m128i, _mm_cvtps_epi32 and _mm_packs_epi32 belong to SSE2, so they cannot be used
++    // in a kernel that is only guarded by LV_HAVE_SSE.
++    float clippedBuffer[8];
++
++    // Each iteration converts four complex samples (eight floats)
++    for(unsigned int i = 0; i < sse_iters; i++){
++        ret1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
++        ret2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
++
++        // Clip
++        ret1 = _mm_max_ps(_mm_min_ps(ret1, vmax_val), vmin_val);
++        ret2 = _mm_max_ps(_mm_min_ps(ret2, vmax_val), vmin_val);
++
++        // Unaligned stores: the local buffer has no alignment guarantee
++        _mm_storeu_ps(clippedBuffer, ret1);
++        _mm_storeu_ps(clippedBuffer + 4, ret2);
++
++        for(unsigned int j = 0; j < 8; j++){
++            *outputVectorPtr++ = (int16_t)rintf(clippedBuffer[j]);
++        }
++    }
++
++    // Remaining (num_points % 4) complex samples, two floats each.
++    // The clipping is done on a local copy: the input buffer is const and must not be written.
++    for(unsigned int i = 0; i < (num_points%4)*2; i++){
++        aux = inputVectorPtr[i];
++        if(aux > max_val)
++            aux = max_val;
++        else if(aux < min_val)
++            aux = min_val;
++        outputVectorPtr[i] = (int16_t)rintf(aux);
++    }
++}
++#endif /* LV_HAVE_SSE */
++
++#ifdef LV_HAVE_GENERIC
++/*!
++ \brief Converts a vector of complex floats (32 bits each part) into a vector of complex integers (16 bits each part), saturating values outside the 16 bit signed range. Plain C version.
++ \param outputVector The complex 16 bit integer output data buffer
++ \param inputVector The complex floating point input data buffer; it is only read
++ \param num_points The number of complex data values to be converted
++ */
++static inline void volk_gnsssdr_32fc_convert_16ic_generic(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){
++    const float* inputVectorPtr = (const float*)inputVector;
++    int16_t* outputVectorPtr = (int16_t*)outputVector;
++    const float min_val = -32768;
++    const float max_val = 32767;
++    float aux;
++
++    // Real and imaginary parts are handled alike, hence 2*num_points floats.
++    // The clipping is done on a local copy: the input buffer is const and must not be written.
++    for(unsigned int i = 0; i < num_points*2; i++){
++        aux = inputVectorPtr[i];
++        if(aux > max_val)
++            aux = max_val;
++        else if(aux < min_val)
++            aux = min_val;
++        outputVectorPtr[i] = (int16_t)rintf(aux);
++    }
++}
++#endif /* LV_HAVE_GENERIC */
++#endif /* INCLUDED_volk_gnsssdr_32fc_convert_16ic_u_H */
++
++
++#ifndef INCLUDED_volk_gnsssdr_32fc_convert_16ic_a_H
++#define INCLUDED_volk_gnsssdr_32fc_convert_16ic_a_H
++
++#include
++#include
++#include
++#include
++
++#ifdef LV_HAVE_SSE2
++#include
++/*!
++ \brief Converts a vector of complex floats (32 bits each part) into a vector of complex integers (16 bits each part), saturating values outside the 16 bit signed range. Aligned SSE2 version.
++ \param outputVector The complex 16 bit integer output data buffer, accessed with aligned stores
++ \param inputVector The complex floating point input data buffer, accessed with aligned loads; it is only read
++ \param num_points The number of complex data values to be converted
++ */
++static inline void volk_gnsssdr_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){
++    const unsigned int sse_iters = num_points/4;
++
++    const float* inputVectorPtr = (const float*)inputVector;
++    int16_t* outputVectorPtr = (int16_t*)outputVector;
++
++    const float min_val = -32768;
++    const float max_val = 32767;
++    float aux;
++
++    __m128 inputVal1, inputVal2;
++    __m128i intInputVal1, intInputVal2;
++    __m128 ret1, ret2;
++    const __m128 vmin_val = _mm_set_ps1(min_val);
++    const __m128 vmax_val = _mm_set_ps1(max_val);
++
++    // Each iteration converts four complex samples (eight floats)
++    for(unsigned int i = 0; i < sse_iters; i++){
++        inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
++        inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
++
++        // Clip
++        ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
++        ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
++
++        intInputVal1 = _mm_cvtps_epi32(ret1);
++        intInputVal2 = _mm_cvtps_epi32(ret2);
++
++        // Narrow the eight 32 bit integers to eight 16 bit integers
++        intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
++
++        _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
++        outputVectorPtr += 8;
++    }
++
++    // Remaining (num_points % 4) complex samples, two floats each.
++    // The clipping is done on a local copy: the input buffer is const and must not be written.
++    for(unsigned int i = 0; i < (num_points%4)*2; i++){
++        aux = inputVectorPtr[i];
++        if(aux > max_val)
++            aux = max_val;
++        else if(aux < min_val)
++            aux = min_val;
++        outputVectorPtr[i] = (int16_t)rintf(aux);
++    }
++}
++#endif /* LV_HAVE_SSE2 */
++
++#ifdef LV_HAVE_SSE
++#include
++/*!
++ \brief Converts a vector of complex floats (32 bits each part) into a vector of complex integers (16 bits each part), saturating values outside the 16 bit signed range. Aligned SSE version.
++ \param outputVector The complex 16 bit integer output data buffer
++ \param inputVector The complex floating point input data buffer, accessed with aligned loads; it is only read
++ \param num_points The number of complex data values to be converted
++ */
++static inline void volk_gnsssdr_32fc_convert_16ic_a_sse(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){
++    const unsigned int sse_iters = num_points/4;
++
++    const float* inputVectorPtr = (const float*)inputVector;
++    int16_t* outputVectorPtr = (int16_t*)outputVector;
++
++    const float min_val = -32768;
++    const float max_val = 32767;
++    float aux;
++
++    __m128 ret1, ret2;
++    const __m128 vmin_val = _mm_set_ps1(min_val);
++    const __m128 vmax_val = _mm_set_ps1(max_val);
++
++    // Clipped samples are spilled here and rounded one by one.
++    // __m128i, _mm_cvtps_epi32 and _mm_packs_epi32 belong to SSE2, so they cannot be used
++    // in a kernel that is only guarded by LV_HAVE_SSE.
++    float clippedBuffer[8];
++
++    // Each iteration converts four complex samples (eight floats)
++    for(unsigned int i = 0; i < sse_iters; i++){
++        ret1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
++        ret2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
++
++        // Clip
++        ret1 = _mm_max_ps(_mm_min_ps(ret1, vmax_val), vmin_val);
++        ret2 = _mm_max_ps(_mm_min_ps(ret2, vmax_val), vmin_val);
++
++        // Unaligned stores: the local buffer has no alignment guarantee
++        _mm_storeu_ps(clippedBuffer, ret1);
++        _mm_storeu_ps(clippedBuffer + 4, ret2);
++
++        for(unsigned int j = 0; j < 8; j++){
++            *outputVectorPtr++ = (int16_t)rintf(clippedBuffer[j]);
++        }
++    }
++
++    // Remaining (num_points % 4) complex samples, two floats each.
++    // The clipping is done on a local copy: the input buffer is const and must not be written.
++    for(unsigned int i = 0; i < (num_points%4)*2; i++){
++        aux = inputVectorPtr[i];
++        if(aux > max_val)
++            aux = max_val;
++        else if(aux < min_val)
++            aux = min_val;
++        outputVectorPtr[i] = (int16_t)rintf(aux);
++    }
++}
++#endif /* LV_HAVE_SSE */
++
++#ifdef LV_HAVE_GENERIC
++/*!
++ \brief Converts a vector of complex floats (32 bits each part) into a vector of complex integers (16 bits each part), saturating values outside the 16 bit signed range. Plain C version.
++ \param outputVector The complex 16 bit integer output data buffer
++ \param inputVector The complex floating point input data buffer; it is only read
++ \param num_points The number of complex data values to be converted
++ */
++static inline void volk_gnsssdr_32fc_convert_16ic_a_generic(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){
++    const float* inputVectorPtr = (const float*)inputVector;
++    int16_t* outputVectorPtr = (int16_t*)outputVector;
++    const float min_val = -32768;
++    const float max_val = 32767;
++    float aux;
++
++    // Real and imaginary parts are handled alike, hence 2*num_points floats.
++    // The clipping is done on a local copy: the input buffer is const and must not be written.
++    for(unsigned int i = 0; i < num_points*2; i++){
++        aux = inputVectorPtr[i];
++        if(aux > max_val)
++            aux = max_val;
++        else if(aux < min_val)
++            aux = min_val;
++        outputVectorPtr[i] = (int16_t)rintf(aux);
++    }
++}
++#endif /* LV_HAVE_GENERIC */
++#endif /* INCLUDED_volk_gnsssdr_32fc_convert_16ic_a_H */
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_8ic.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_8ic.h
+--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_8ic.h 1970-01-01 01:00:00.000000000 +0100
++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_8ic.h 2014-10-15 01:55:08.000000000 +0200
+@@ -0,0 +1,213 @@
++/*!
++ * \file volk_gnsssdr_32fc_convert_8ic.h
++ * \brief Volk protokernel: converts float32 complex values to 8 integer complex values taking care of overflow
++ * \authors
++ *          <ul>
++ *          <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
++ *          </ul>
++ *
++ * -------------------------------------------------------------------------
++ *
++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
++ *
++ * GNSS-SDR is a software defined Global Navigation
++ * Satellite Systems receiver
++ *
++ * This file is part of GNSS-SDR.
++ *
++ * GNSS-SDR is free software: you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation, either version 3 of the License, or
++ * (at your option) any later version.
++ *
++ * GNSS-SDR is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
++ *
++ * -------------------------------------------------------------------------
++ */
++
++#ifndef INCLUDED_volk_gnsssdr_32fc_convert_8ic_u_H
++#define INCLUDED_volk_gnsssdr_32fc_convert_8ic_u_H
++
++#include
++#include
++#include
++
++#ifdef LV_HAVE_SSE2
++#include
++/*!
++ \brief Converts a vector of complex floats (32 bits each part) into a vector of complex integers (8 bits each part), saturating values outside the 8 bit signed range. Unaligned SSE2 version.
++ \param outputVector The complex 8 bit integer output data buffer
++ \param inputVector The complex floating point input data buffer; it is only read
++ \param num_points The number of complex data values to be converted
++ */
++static inline void volk_gnsssdr_32fc_convert_8ic_u_sse2(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){
++    const unsigned int sse_iters = num_points/8;
++
++    const float* inputVectorPtr = (const float*)inputVector;
++    int8_t* outputVectorPtr = (int8_t*)outputVector;
++
++    const float min_val = -128;
++    const float max_val = 127;
++    float aux;
++
++    __m128 inputVal1, inputVal2, inputVal3, inputVal4;
++    __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
++    __m128i int8InputVal;
++    __m128 ret1, ret2, ret3, ret4;
++    const __m128 vmin_val = _mm_set_ps1(min_val);
++    const __m128 vmax_val = _mm_set_ps1(max_val);
++
++    // Each iteration converts eight complex samples (sixteen floats)
++    for(unsigned int i = 0; i < sse_iters; i++){
++        inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
++        inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
++        inputVal3 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
++        inputVal4 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
++
++        // Clip
++        ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
++        ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
++        ret3 = _mm_max_ps(_mm_min_ps(inputVal3, vmax_val), vmin_val);
++        ret4 = _mm_max_ps(_mm_min_ps(inputVal4, vmax_val), vmin_val);
++
++        intInputVal1 = _mm_cvtps_epi32(ret1);
++        intInputVal2 = _mm_cvtps_epi32(ret2);
++        intInputVal3 = _mm_cvtps_epi32(ret3);
++        intInputVal4 = _mm_cvtps_epi32(ret4);
++
++        // Narrow 32 -> 16 -> 8 bits
++        intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
++        intInputVal2 = _mm_packs_epi32(intInputVal3, intInputVal4);
++        int8InputVal = _mm_packs_epi16(intInputVal1, intInputVal2);
++
++        _mm_storeu_si128((__m128i*)outputVectorPtr, int8InputVal);
++        outputVectorPtr += 16;
++    }
++
++    // The loop above consumed 8*sse_iters complex samples, so (num_points % 8) samples
++    // of two floats each are left. The clipping is done on a local copy: the input
++    // buffer is const and must not be written.
++    for(unsigned int i = 0; i < (num_points%8)*2; i++){
++        aux = inputVectorPtr[i];
++        if(aux > max_val)
++            aux = max_val;
++        else if(aux < min_val)
++            aux = min_val;
++        outputVectorPtr[i] = (int8_t)rintf(aux);
++    }
++}
++#endif /* LV_HAVE_SSE2 */
++
++#ifdef LV_HAVE_GENERIC
++/*!
++ \brief Converts a vector of complex floats (32 bits each part) into a vector of complex integers (8 bits each part), saturating values outside the 8 bit signed range. Plain C version.
++ \param outputVector The complex 8 bit integer output data buffer
++ \param inputVector The complex floating point input data buffer; it is only read
++ \param num_points The number of complex data values to be converted
++ */
++static inline void volk_gnsssdr_32fc_convert_8ic_generic(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){
++    const float* inputVectorPtr = (const float*)inputVector;
++    int8_t* outputVectorPtr = (int8_t*)outputVector;
++    const float min_val = -128;
++    const float max_val = 127;
++    float aux;
++
++    // Real and imaginary parts are handled alike, hence 2*num_points floats.
++    // The clipping is done on a local copy: the input buffer is const and must not be written.
++    for(unsigned int i = 0; i < num_points*2; i++){
++        aux = inputVectorPtr[i];
++        if(aux > max_val)
++            aux = max_val;
++        else if(aux < min_val)
++            aux = min_val;
++        outputVectorPtr[i] = (int8_t)rintf(aux);
++    }
++}
++#endif /* LV_HAVE_GENERIC */
++#endif /* INCLUDED_volk_gnsssdr_32fc_convert_8ic_u_H */
++
++
++#ifndef INCLUDED_volk_gnsssdr_32fc_convert_8ic_a_H
++#define INCLUDED_volk_gnsssdr_32fc_convert_8ic_a_H
++
++#include
++#include
++#include
++#include
++
++#ifdef LV_HAVE_SSE2
++#include
++/*!
++ \brief Converts a vector of complex floats (32 bits each part) into a vector of complex integers (8 bits each part), saturating values outside the 8 bit signed range. Aligned SSE2 version.
++ \param outputVector The complex 8 bit integer output data buffer, accessed with aligned stores
++ \param inputVector The complex floating point input data buffer, accessed with aligned loads; it is only read
++ \param num_points The number of complex data values to be converted
++ */
++static inline void volk_gnsssdr_32fc_convert_8ic_a_sse2(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){
++    const unsigned int sse_iters = num_points/8;
++
++    const float* inputVectorPtr = (const float*)inputVector;
++    int8_t* outputVectorPtr = (int8_t*)outputVector;
++
++    const float min_val = -128;
++    const float max_val = 127;
++    float aux;
++
++    __m128 inputVal1, inputVal2, inputVal3, inputVal4;
++    __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
++    __m128i int8InputVal;
++    __m128 ret1, ret2, ret3, ret4;
++    const __m128 vmin_val = _mm_set_ps1(min_val);
++    const __m128 vmax_val = _mm_set_ps1(max_val);
++
++    // Each iteration converts eight complex samples (sixteen floats)
++    for(unsigned int i = 0; i < sse_iters; i++){
++        inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
++        inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
++        inputVal3 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
++        inputVal4 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
++
++        // Clip
++        ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
++        ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
++        ret3 = _mm_max_ps(_mm_min_ps(inputVal3, vmax_val), vmin_val);
++        ret4 = _mm_max_ps(_mm_min_ps(inputVal4, vmax_val), vmin_val);
++
++        intInputVal1 = _mm_cvtps_epi32(ret1);
++        intInputVal2 = _mm_cvtps_epi32(ret2);
++        intInputVal3 = _mm_cvtps_epi32(ret3);
++        intInputVal4 = _mm_cvtps_epi32(ret4);
++
++        // Narrow 32 -> 16 -> 8 bits
++        intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
++        intInputVal2 = _mm_packs_epi32(intInputVal3, intInputVal4);
++        int8InputVal = _mm_packs_epi16(intInputVal1, intInputVal2);
++
++        _mm_store_si128((__m128i*)outputVectorPtr, int8InputVal);
++        outputVectorPtr += 16;
++    }
++
++    // The loop above consumed 8*sse_iters complex samples, so (num_points % 8) samples
++    // of two floats each are left. The clipping is done on a local copy: the input
++    // buffer is const and must not be written.
++    for(unsigned int i = 0; i < (num_points%8)*2; i++){
++        aux = inputVectorPtr[i];
++        if(aux > max_val)
++            aux = max_val;
++        else if(aux < min_val)
++            aux = min_val;
++        outputVectorPtr[i] = (int8_t)rintf(aux);
++    }
++}
++#endif /* LV_HAVE_SSE2 */
++
++#ifdef LV_HAVE_GENERIC
++/*!
++ \brief Converts a vector of complex floats (32 bits each part) into a vector of complex integers (8 bits each part), saturating values outside the 8 bit signed range. Plain C version.
++ \param outputVector The complex 8 bit integer output data buffer
++ \param inputVector The complex floating point input data buffer; it is only read
++ \param num_points The number of complex data values to be converted
++ */
++static inline void volk_gnsssdr_32fc_convert_8ic_a_generic(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){
++    const float* inputVectorPtr = (const float*)inputVector;
++    int8_t* outputVectorPtr = (int8_t*)outputVector;
++    const float min_val = -128;
++    const float max_val = 127;
++    float aux;
++
++    // Real and imaginary parts are handled alike, hence 2*num_points floats.
++    // The clipping is done on a local copy: the input buffer is const and must not be written.
++    for(unsigned int i = 0; i < num_points*2; i++){
++        aux = inputVectorPtr[i];
++        if(aux > max_val)
++            aux = max_val;
++        else if(aux < min_val)
++            aux = min_val;
++        outputVectorPtr[i] = (int8_t)rintf(aux);
++    }
++}
++#endif /* LV_HAVE_GENERIC */
++#endif /* INCLUDED_volk_gnsssdr_32fc_convert_8ic_a_H */
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_magnitude_squared_32f.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_magnitude_squared_32f.h
+--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_magnitude_squared_32f.h 1970-01-01 01:00:00.000000000 +0100
++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_magnitude_squared_32f.h 2014-10-15 01:55:08.000000000 +0200
+@@ -0,0 +1,228 @@
++#ifndef INCLUDED_volk_gnsssdr_32fc_magnitude_squared_32f_u_H
++#define INCLUDED_volk_gnsssdr_32fc_magnitude_squared_32f_u_H
++
++#include
++#include
++#include
++
++#ifdef LV_HAVE_SSE3
++#include
++  /*!
++    \brief Computes the squared magnitude (real^2 + imag^2) of every complex input sample, four samples per iteration, using unaligned loads and stores
++    \param magnitudeVector The vector where the real output values are stored
++    \param complexVector The vector containing the complex input values
++    \param num_points The number of complex values in complexVector to be processed and stored into magnitudeVector
++  */
++static inline void volk_gnsssdr_32fc_magnitude_squared_32f_u_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
++    const float* src = (float*)complexVector;
++    float* dst = magnitudeVector;
++    unsigned int blocks_left;
++    unsigned int tail_left = num_points % 4;
++
++    for(blocks_left = num_points / 4; blocks_left != 0; blocks_left--){
++        __m128 first = _mm_loadu_ps(src);
++        __m128 second = _mm_loadu_ps(src + 4);
++        src += 8;
++
++        // Square every component, then add neighbouring lanes (I^2 + Q^2)
++        first = _mm_mul_ps(first, first);
++        second = _mm_mul_ps(second, second);
++
++        _mm_storeu_ps(dst, _mm_hadd_ps(first, second));
++        dst += 4;
++    }
++
++    // Samples that do not fill a whole group of four
++    for(; tail_left != 0; tail_left--){
++        const float re = src[0];
++        const float im = src[1];
++        src += 2;
++        *dst++ = (re * re) + (im * im);
++    }
++}
++#endif /* LV_HAVE_SSE3 */
++
++#ifdef LV_HAVE_SSE
++#include
++  /*!
++    \brief Computes the squared magnitude (real^2 + imag^2) of every complex input sample, four samples per iteration, using unaligned loads and stores
++    \param magnitudeVector The vector where the real output values are stored
++    \param complexVector The vector containing the complex input values
++    \param num_points The number of complex values in complexVector to be processed and stored into magnitudeVector
++  */
++static inline void volk_gnsssdr_32fc_magnitude_squared_32f_u_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
++    const float* src = (float*)complexVector;
++    float* dst = magnitudeVector;
++    unsigned int blocks_left;
++    unsigned int tail_left = num_points % 4;
++
++    for(blocks_left = num_points / 4; blocks_left != 0; blocks_left--){
++        const __m128 first = _mm_loadu_ps(src);
++        const __m128 second = _mm_loadu_ps(src + 4);
++        // De-interleave: even lanes hold the real parts, odd lanes the imaginary parts
++        __m128 re = _mm_shuffle_ps(first, second, _MM_SHUFFLE(2,0,2,0));
++        __m128 im = _mm_shuffle_ps(first, second, _MM_SHUFFLE(3,1,3,1));
++        src += 8;
++
++        re = _mm_mul_ps(re, re);
++        im = _mm_mul_ps(im, im);
++
++        _mm_storeu_ps(dst, _mm_add_ps(re, im));
++        dst += 4;
++    }
++
++    // Samples that do not fill a whole group of four
++    for(; tail_left != 0; tail_left--){
++        const float re = src[0];
++        const float im = src[1];
++        src += 2;
++        *dst++ = (re * re) + (im * im);
++    }
++}
++#endif /* LV_HAVE_SSE */
++
++#ifdef LV_HAVE_GENERIC
++  /*!
++ \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector ++ \param complexVector The vector containing the complex input values ++ \param magnitudeVector The vector containing the real output values ++ \param num_points The number of complex values in complexVector to be calculated and stored into cVector ++ */ ++static inline void volk_gnsssdr_32fc_magnitude_squared_32f_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ ++ const float* complexVectorPtr = (float*)complexVector; ++ float* magnitudeVectorPtr = magnitudeVector; ++ unsigned int number = 0; ++ for(number = 0; number < num_points; number++){ ++ const float real = *complexVectorPtr++; ++ const float imag = *complexVectorPtr++; ++ *magnitudeVectorPtr++ = (real*real) + (imag*imag); ++ } ++} ++#endif /* LV_HAVE_GENERIC */ ++ ++#endif /* INCLUDED_volk_gnsssdr_32fc_magnitude_32f_u_H */ ++#ifndef INCLUDED_volk_gnsssdr_32fc_magnitude_squared_32f_a_H ++#define INCLUDED_volk_gnsssdr_32fc_magnitude_squared_32f_a_H ++ ++#include ++#include ++#include ++ ++#ifdef LV_HAVE_SSE3 ++#include ++ /*! 
++ \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector ++ \param complexVector The vector containing the complex input values ++ \param magnitudeVector The vector containing the real output values ++ \param num_points The number of complex values in complexVector to be calculated and stored into cVector ++ */ ++static inline void volk_gnsssdr_32fc_magnitude_squared_32f_a_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ ++ unsigned int number = 0; ++ const unsigned int quarterPoints = num_points / 4; ++ ++ const float* complexVectorPtr = (float*)complexVector; ++ float* magnitudeVectorPtr = magnitudeVector; ++ ++ __m128 cplxValue1, cplxValue2, result; ++ for(;number < quarterPoints; number++){ ++ cplxValue1 = _mm_load_ps(complexVectorPtr); ++ complexVectorPtr += 4; ++ ++ cplxValue2 = _mm_load_ps(complexVectorPtr); ++ complexVectorPtr += 4; ++ ++ cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values ++ cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values ++ ++ result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values ++ ++ _mm_store_ps(magnitudeVectorPtr, result); ++ magnitudeVectorPtr += 4; ++ } ++ ++ number = quarterPoints * 4; ++ for(; number < num_points; number++){ ++ float val1Real = *complexVectorPtr++; ++ float val1Imag = *complexVectorPtr++; ++ *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); ++ } ++} ++#endif /* LV_HAVE_SSE3 */ ++ ++#ifdef LV_HAVE_SSE ++#include ++ /*! 
++ \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector ++ \param complexVector The vector containing the complex input values ++ \param magnitudeVector The vector containing the real output values ++ \param num_points The number of complex values in complexVector to be calculated and stored into cVector ++ */ ++static inline void volk_gnsssdr_32fc_magnitude_squared_32f_a_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ ++ unsigned int number = 0; ++ const unsigned int quarterPoints = num_points / 4; ++ ++ const float* complexVectorPtr = (float*)complexVector; ++ float* magnitudeVectorPtr = magnitudeVector; ++ ++ __m128 cplxValue1, cplxValue2, iValue, qValue, result; ++ for(;number < quarterPoints; number++){ ++ cplxValue1 = _mm_load_ps(complexVectorPtr); ++ complexVectorPtr += 4; ++ ++ cplxValue2 = _mm_load_ps(complexVectorPtr); ++ complexVectorPtr += 4; ++ ++ // Arrange in i1i2i3i4 format ++ iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); ++ // Arrange in q1q2q3q4 format ++ qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); ++ ++ iValue = _mm_mul_ps(iValue, iValue); // Square the I values ++ qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values ++ ++ result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values ++ ++ _mm_store_ps(magnitudeVectorPtr, result); ++ magnitudeVectorPtr += 4; ++ } ++ ++ number = quarterPoints * 4; ++ for(; number < num_points; number++){ ++ float val1Real = *complexVectorPtr++; ++ float val1Imag = *complexVectorPtr++; ++ *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); ++ } ++} ++#endif /* LV_HAVE_SSE */ ++ ++#ifdef LV_HAVE_GENERIC ++ /*! 
++ \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector ++ \param complexVector The vector containing the complex input values ++ \param magnitudeVector The vector containing the real output values ++ \param num_points The number of complex values in complexVector to be calculated and stored into cVector ++ */ ++static inline void volk_gnsssdr_32fc_magnitude_squared_32f_a_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ ++ const float* complexVectorPtr = (float*)complexVector; ++ float* magnitudeVectorPtr = magnitudeVector; ++ unsigned int number = 0; ++ for(number = 0; number < num_points; number++){ ++ const float real = *complexVectorPtr++; ++ const float imag = *complexVectorPtr++; ++ *magnitudeVectorPtr++ = (real*real) + (imag*imag); ++ } ++} ++#endif /* LV_HAVE_GENERIC */ ++ ++#endif /* INCLUDED_volk_gnsssdr_32fc_magnitude_32f_a_H */ +diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32f_convert_8ic.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32f_convert_8ic.h +--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32f_convert_8ic.h 1970-01-01 01:00:00.000000000 +0100 ++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32f_convert_8ic.h 2014-10-15 01:55:08.000000000 +0200 +@@ -0,0 +1,231 @@ ++/*! ++ * \file volk_gnsssdr_32fc_s32f_convert_8ic.h ++ * \brief Volk protokernel: converts float32 complex values to 8 integer complex values taking care of overflow ++ * \authors
    ++ *
  • Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com ++ *
++ * ++ * ------------------------------------------------------------------------- ++ * ++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) ++ * ++ * GNSS-SDR is a software defined Global Navigation ++ * Satellite Systems receiver ++ * ++ * This file is part of GNSS-SDR. ++ * ++ * GNSS-SDR is free software: you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation, either version 3 of the License, or ++ * (at your option) any later version. ++ * ++ * GNSS-SDR is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>. ++ * ++ * ------------------------------------------------------------------------- ++ */ ++ ++#ifndef INCLUDED_volk_gnsssdr_32fc_s32f_convert_8ic_u_H ++#define INCLUDED_volk_gnsssdr_32fc_s32f_convert_8ic_u_H ++ ++#include ++#include ++#include ++ ++#ifdef LV_HAVE_SSE2 ++#include ++/*! 
++ \brief Converts a float vector of 64 bits (32 bits each part) into a 16 integer vector (8 bits each part) ++ \param inputVector The floating point input data buffer ++ \param outputVector The 16 bit output data buffer ++ \param num_points The number of data values to be converted ++ */ ++static inline void volk_gnsssdr_32fc_s32f_convert_8ic_u_sse2(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, const float scalar, unsigned int num_points){ ++ const unsigned int sse_iters = num_points/8; ++ ++ float* inputVectorPtr = (float*)inputVector; ++ int8_t* outputVectorPtr = (int8_t*)outputVector; ++ __m128 invScalar = _mm_set_ps1(1.0/scalar); ++ ++ float min_val = -128; ++ float max_val = 127; ++ ++ __m128 inputVal1, inputVal2, inputVal3, inputVal4; ++ __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4; ++ __m128i int8InputVal; ++ __m128 ret1, ret2, ret3, ret4; ++ __m128 vmin_val = _mm_set_ps1(min_val); ++ __m128 vmax_val = _mm_set_ps1(max_val); ++ ++ for(unsigned int i = 0;i < sse_iters; i++){ ++ inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; ++ inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; ++ inputVal3 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; ++ inputVal4 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; ++ ++ inputVal1 = _mm_mul_ps(inputVal1, invScalar); ++ inputVal2 = _mm_mul_ps(inputVal2, invScalar); ++ inputVal3 = _mm_mul_ps(inputVal3, invScalar); ++ inputVal4 = _mm_mul_ps(inputVal4, invScalar); ++ // Clip ++ ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val); ++ ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val); ++ ret3 = _mm_max_ps(_mm_min_ps(inputVal3, vmax_val), vmin_val); ++ ret4 = _mm_max_ps(_mm_min_ps(inputVal4, vmax_val), vmin_val); ++ ++ intInputVal1 = _mm_cvtps_epi32(ret1); ++ intInputVal2 = _mm_cvtps_epi32(ret2); ++ intInputVal3 = _mm_cvtps_epi32(ret3); ++ intInputVal4 = _mm_cvtps_epi32(ret4); ++ ++ intInputVal1 = 
_mm_packs_epi32(intInputVal1, intInputVal2); ++ intInputVal2 = _mm_packs_epi32(intInputVal3, intInputVal4); ++ int8InputVal = _mm_packs_epi16(intInputVal1, intInputVal2); ++ ++ _mm_storeu_si128((__m128i*)outputVectorPtr, int8InputVal); ++ outputVectorPtr += 16; ++ } ++ ++ float scaled = 0; ++ for(unsigned int i = 0; i < (num_points%4)*4; i++){ ++ scaled = inputVectorPtr[i]/scalar; ++ if(scaled > max_val) ++ scaled = max_val; ++ else if(scaled < min_val) ++ scaled = min_val; ++ outputVectorPtr[i] = (int8_t)rintf(scaled); ++ } ++} ++#endif /* LV_HAVE_SSE2 */ ++ ++#ifdef LV_HAVE_GENERIC ++/*! ++ \brief Converts a float vector of 64 bits (32 bits each part) into a 16 integer vector (8 bits each part) ++ \param inputVector The floating point input data buffer ++ \param outputVector The 16 bit output data buffer ++ \param num_points The number of data values to be converted ++ */ ++static inline void volk_gnsssdr_32fc_s32f_convert_8ic_generic(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, const float scalar, unsigned int num_points){ ++ float* inputVectorPtr = (float*)inputVector; ++ int8_t* outputVectorPtr = (int8_t*)outputVector; ++ float scaled = 0; ++ float min_val = -128; ++ float max_val = 127; ++ ++ for(unsigned int i = 0; i < num_points*2; i++){ ++ scaled = (inputVectorPtr[i])/scalar; ++ if(scaled > max_val) ++ scaled = max_val; ++ else if(scaled < min_val) ++ scaled = min_val; ++ outputVectorPtr[i] = (int8_t)rintf(scaled); ++ } ++} ++#endif /* LV_HAVE_GENERIC */ ++#endif /* INCLUDED_volk_gnsssdr_32fc_s32f_convert_8ic_u_H */ ++ ++ ++#ifndef INCLUDED_volk_gnsssdr_32fc_s32f_convert_8ic_a_H ++#define INCLUDED_volk_gnsssdr_32fc_s32f_convert_8ic_a_H ++ ++#include ++#include ++#include ++#include ++ ++#ifdef LV_HAVE_SSE2 ++#include ++/*! 
++ \brief Converts a float vector of 64 bits (32 bits each part) into a 16 integer vector (8 bits each part) ++ \param inputVector The floating point input data buffer ++ \param outputVector The 16 bit output data buffer ++ \param num_points The number of data values to be converted ++ */ ++static inline void volk_gnsssdr_32fc_s32f_convert_8ic_a_sse2(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, const float scalar, unsigned int num_points){ ++ const unsigned int sse_iters = num_points/8; ++ ++ float* inputVectorPtr = (float*)inputVector; ++ int8_t* outputVectorPtr = (int8_t*)outputVector; ++ __m128 invScalar = _mm_set_ps1(1.0/scalar); ++ ++ float min_val = -128; ++ float max_val = 127; ++ ++ __m128 inputVal1, inputVal2, inputVal3, inputVal4; ++ __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4; ++ __m128i int8InputVal; ++ __m128 ret1, ret2, ret3, ret4; ++ __m128 vmin_val = _mm_set_ps1(min_val); ++ __m128 vmax_val = _mm_set_ps1(max_val); ++ ++ for(unsigned int i = 0;i < sse_iters; i++){ ++ inputVal1 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; ++ inputVal2 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; ++ inputVal3 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; ++ inputVal4 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; ++ ++ inputVal1 = _mm_mul_ps(inputVal1, invScalar); ++ inputVal2 = _mm_mul_ps(inputVal2, invScalar); ++ inputVal3 = _mm_mul_ps(inputVal3, invScalar); ++ inputVal4 = _mm_mul_ps(inputVal4, invScalar); ++ // Clip ++ ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val); ++ ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val); ++ ret3 = _mm_max_ps(_mm_min_ps(inputVal3, vmax_val), vmin_val); ++ ret4 = _mm_max_ps(_mm_min_ps(inputVal4, vmax_val), vmin_val); ++ ++ intInputVal1 = _mm_cvtps_epi32(ret1); ++ intInputVal2 = _mm_cvtps_epi32(ret2); ++ intInputVal3 = _mm_cvtps_epi32(ret3); ++ intInputVal4 = _mm_cvtps_epi32(ret4); ++ ++ intInputVal1 = 
_mm_packs_epi32(intInputVal1, intInputVal2); ++ intInputVal2 = _mm_packs_epi32(intInputVal3, intInputVal4); ++ int8InputVal = _mm_packs_epi16(intInputVal1, intInputVal2); ++ ++ _mm_store_si128((__m128i*)outputVectorPtr, int8InputVal); ++ outputVectorPtr += 16; ++ } ++ ++ float scaled = 0; ++ for(unsigned int i = 0; i < (num_points%4)*4; i++){ ++ scaled = inputVectorPtr[i]/scalar; ++ if(scaled > max_val) ++ scaled = max_val; ++ else if(scaled < min_val) ++ scaled = min_val; ++ outputVectorPtr[i] = (int8_t)rintf(scaled); ++ } ++} ++#endif /* LV_HAVE_SSE2 */ ++ ++#ifdef LV_HAVE_GENERIC ++/*! ++ \brief Converts a float vector of 64 bits (32 bits each part) into a 16 integer vector (8 bits each part) ++ \param inputVector The floating point input data buffer ++ \param outputVector The 16 bit output data buffer ++ \param num_points The number of data values to be converted ++ */ ++static inline void volk_gnsssdr_32fc_s32f_convert_8ic_a_generic(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, const float scalar, unsigned int num_points){ ++ float* inputVectorPtr = (float*)inputVector; ++ int8_t* outputVectorPtr = (int8_t*)outputVector; ++ float scaled = 0; ++ float min_val = -128; ++ float max_val = 127; ++ ++ for(unsigned int i = 0; i < num_points*2; i++){ ++ scaled = inputVectorPtr[i]/scalar; ++ if(scaled > max_val) ++ scaled = max_val; ++ else if(scaled < min_val) ++ scaled = min_val; ++ outputVectorPtr[i] = (int8_t)rintf(scaled); ++ } ++} ++#endif /* LV_HAVE_GENERIC */ ++#endif /* INCLUDED_volk_gnsssdr_32fc_s32f_convert_8ic_a_H */ +diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc.h +--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc.h 1970-01-01 01:00:00.000000000 +0100 ++++ 
/Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc.h 2014-10-15 01:55:08.000000000 +0200 +@@ -0,0 +1,266 @@ ++/*! ++ * \file volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc ++ * \brief Volk protokernel: replaces the tracking function for update_local_code ++ * \authors
    ++ *
  • Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com ++ *
++ * ++ * Volk protokernel that replaces the tracking function for update_local_code ++ * ++ * ------------------------------------------------------------------------- ++ * ++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) ++ * ++ * GNSS-SDR is a software defined Global Navigation ++ * Satellite Systems receiver ++ * ++ * This file is part of GNSS-SDR. ++ * ++ * GNSS-SDR is free software: you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation, either version 3 of the License, or ++ * (at your option) any later version. ++ * ++ * GNSS-SDR is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>. ++ * ++ * ------------------------------------------------------------------------- ++ */ ++ ++#ifndef INCLUDED_volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_u_H ++#define INCLUDED_volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_u_H ++ ++#include ++#include ++#include ++#include ++ ++#ifdef LV_HAVE_SSE4_1 ++#include ++ /*! ++ \brief Builds the local code replica: for every output sample, copies the d_ca_code entry selected by the current code phase, then advances the phase by one step. 
++ \param cVector The vector where the results will be stored ++ \param aVector Vector to be conjugated ++ \param num_points The number of complex values in aVector to be conjugated and stored into cVector ++ */ ++static inline void volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_u_sse4_1(lv_32fc_t* d_very_early_code, const float d_very_early_late_spc_chips, const float code_length_half_chips, const float code_phase_step_half_chips, const float tcode_half_chips_input, const lv_32fc_t* d_ca_code, unsigned int num_points){ ++ ++// float* pointer1 = (float*)&d_very_early_late_spc_chips; ++// *pointer1 = 1; ++// float* pointer2 = (float*)&code_length_half_chips; ++// *pointer2 = 6; ++// float* pointer3 = (float*)&code_phase_step_half_chips; ++// *pointer3 = 7; ++// float* pointer4 = (float*)&tcode_half_chips_input; ++// *pointer4 = 8; ++ ++ const unsigned int sse_iters = num_points / 4; ++ ++ __m128 tquot, fmod_num, fmod_result, associated_chip_index_array; ++ ++ __m128 tcode_half_chips_array = _mm_set_ps (tcode_half_chips_input+3*code_phase_step_half_chips, tcode_half_chips_input+2*code_phase_step_half_chips, tcode_half_chips_input+code_phase_step_half_chips, tcode_half_chips_input); ++ __m128 code_phase_step_half_chips_array = _mm_set1_ps (code_phase_step_half_chips*4); ++ __m128 d_very_early_late_spc_chips_Multiplied_by_2 = _mm_set1_ps (2*d_very_early_late_spc_chips); ++ __m128 code_length_half_chips_array = _mm_set1_ps (code_length_half_chips); ++ __m128 twos = _mm_set1_ps (2); ++ __m128i associated_chip_index_array_int; ++ ++ __VOLK_ATTR_ALIGNED(16) int32_t output[4]; ++ ++ for (unsigned int i = 0; i < sse_iters; i++) ++ { ++ //fmod = numer - tquot * denom; tquot = numer/denom truncated ++ //associated_chip_index = 2 + round(fmod(tcode_half_chips - 2*d_very_early_late_spc_chips, code_length_half_chips)); ++ fmod_num = _mm_sub_ps (tcode_half_chips_array, d_very_early_late_spc_chips_Multiplied_by_2); ++ tquot = _mm_div_ps (fmod_num, code_length_half_chips_array); 
++ tquot = _mm_round_ps (tquot, (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) ); ++ fmod_result = _mm_sub_ps (fmod_num, _mm_mul_ps (tquot, code_length_half_chips_array)); ++ ++ associated_chip_index_array = _mm_round_ps (fmod_result, (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC)); ++ associated_chip_index_array = _mm_add_ps(twos, associated_chip_index_array); ++ associated_chip_index_array_int = _mm_cvtps_epi32 (associated_chip_index_array); ++ _mm_storeu_si128 ((__m128i*)output, associated_chip_index_array_int); ++ ++ //d_very_early_code[i] = d_ca_code[associated_chip_index]; ++ *d_very_early_code++ = d_ca_code[output[0]]; ++ *d_very_early_code++ = d_ca_code[output[1]]; ++ *d_very_early_code++ = d_ca_code[output[2]]; ++ *d_very_early_code++ = d_ca_code[output[3]]; ++ ++ //tcode_half_chips = tcode_half_chips + code_phase_step_half_chips; ++ tcode_half_chips_array = _mm_add_ps (tcode_half_chips_array, code_phase_step_half_chips_array); ++ } ++ ++ if (num_points%4!=0) ++ { ++ __VOLK_ATTR_ALIGNED(16) float tcode_half_chips_stored[4]; ++ _mm_storeu_si128 ((__m128i*)tcode_half_chips_stored, tcode_half_chips_array); ++ ++ int associated_chip_index; ++ float tcode_half_chips = tcode_half_chips_stored[0]; ++ float d_very_early_late_spc_chips_multiplied_by_2 = 2*d_very_early_late_spc_chips; ++ ++ for (unsigned int i = 0; i < num_points%4; i++) ++ { ++ associated_chip_index = 2 + round(fmod(tcode_half_chips - d_very_early_late_spc_chips_multiplied_by_2, code_length_half_chips)); ++ d_very_early_code[i] = d_ca_code[associated_chip_index]; ++ tcode_half_chips = tcode_half_chips + code_phase_step_half_chips; ++ } ++ } ++} ++#endif /* LV_HAVE_SSE4_1 */ ++ ++#ifdef LV_HAVE_GENERIC ++ /*! ++ \brief Takes the conjugate of a complex vector. 
++ \param cVector The vector where the results will be stored ++ \param aVector Vector to be conjugated ++ \param num_points The number of complex values in aVector to be conjugated and stored into cVector ++ */ ++static inline void volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_generic(lv_32fc_t* d_very_early_code, const float d_very_early_late_spc_chips, const float code_length_half_chips, const float code_phase_step_half_chips, const float tcode_half_chips_input, const lv_32fc_t* d_ca_code, unsigned int num_points){ ++ ++ float* pointer1 = (float*)&d_very_early_late_spc_chips; ++ *pointer1 = 1; ++ float* pointer2 = (float*)&code_length_half_chips; ++ *pointer2 = 6; ++ float* pointer3 = (float*)&code_phase_step_half_chips; ++ *pointer3 = 7; ++ float* pointer4 = (float*)&tcode_half_chips_input; ++ *pointer4 = 8; ++ ++ int associated_chip_index; ++ float tcode_half_chips = tcode_half_chips_input; ++ float d_very_early_late_spc_chips_multiplied_by_2 = 2*d_very_early_late_spc_chips; ++ ++ for (unsigned int i = 0; i < num_points; i++) ++ { ++ associated_chip_index = 2 + round(fmod(tcode_half_chips - d_very_early_late_spc_chips_multiplied_by_2, code_length_half_chips)); ++ d_very_early_code[i] = d_ca_code[associated_chip_index]; ++ tcode_half_chips = tcode_half_chips + code_phase_step_half_chips; ++ } ++} ++#endif /* LV_HAVE_GENERIC */ ++ ++ ++#endif /* INCLUDED_volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_u_H */ ++#ifndef INCLUDED_volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_a_H ++#define INCLUDED_volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_a_H ++ ++#include ++#include ++#include ++#include ++ ++#ifdef LV_HAVE_SSE4_1 ++#include ++ /*! ++ \brief Takes the conjugate of a complex vector. 
++ \param cVector The vector where the results will be stored ++ \param aVector Vector to be conjugated ++ \param num_points The number of complex values in aVector to be conjugated and stored into cVector ++ */ ++static inline void volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_a_sse4_1(lv_32fc_t* d_very_early_code, const float d_very_early_late_spc_chips, const float code_length_half_chips, const float code_phase_step_half_chips, const float tcode_half_chips_input, const lv_32fc_t* d_ca_code, unsigned int num_points){ ++ ++ // float* pointer1 = (float*)&d_very_early_late_spc_chips; ++ // *pointer1 = 1; ++ // float* pointer2 = (float*)&code_length_half_chips; ++ // *pointer2 = 6; ++ // float* pointer3 = (float*)&code_phase_step_half_chips; ++ // *pointer3 = 7; ++ // float* pointer4 = (float*)&tcode_half_chips_input; ++ // *pointer4 = 8; ++ ++ const unsigned int sse_iters = num_points / 4; ++ ++ __m128 tquot, fmod_num, fmod_result, associated_chip_index_array; ++ ++ __m128 tcode_half_chips_array = _mm_set_ps (tcode_half_chips_input+3*code_phase_step_half_chips, tcode_half_chips_input+2*code_phase_step_half_chips, tcode_half_chips_input+code_phase_step_half_chips, tcode_half_chips_input); ++ __m128 code_phase_step_half_chips_array = _mm_set1_ps (code_phase_step_half_chips*4); ++ __m128 d_very_early_late_spc_chips_Multiplied_by_2 = _mm_set1_ps (2*d_very_early_late_spc_chips); ++ __m128 code_length_half_chips_array = _mm_set1_ps (code_length_half_chips); ++ __m128 twos = _mm_set1_ps (2); ++ __m128i associated_chip_index_array_int; ++ ++ __VOLK_ATTR_ALIGNED(16) int32_t output[4]; ++ ++ for (unsigned int i = 0; i < sse_iters; i++) ++ { ++ //fmod = numer - tquot * denom; tquot = numer/denom truncated ++ //associated_chip_index = 2 + round(fmod(tcode_half_chips - 2*d_very_early_late_spc_chips, code_length_half_chips)); ++ fmod_num = _mm_sub_ps (tcode_half_chips_array, d_very_early_late_spc_chips_Multiplied_by_2); ++ tquot = _mm_div_ps (fmod_num, 
code_length_half_chips_array); ++ tquot = _mm_round_ps (tquot, (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) ); ++ fmod_result = _mm_sub_ps (fmod_num, _mm_mul_ps (tquot, code_length_half_chips_array)); ++ ++ associated_chip_index_array = _mm_round_ps (fmod_result, (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC)); ++ associated_chip_index_array = _mm_add_ps(twos, associated_chip_index_array); ++ associated_chip_index_array_int = _mm_cvtps_epi32 (associated_chip_index_array); ++ _mm_store_si128 ((__m128i*)output, associated_chip_index_array_int); ++ ++ //d_very_early_code[i] = d_ca_code[associated_chip_index]; ++ *d_very_early_code++ = d_ca_code[output[0]]; ++ *d_very_early_code++ = d_ca_code[output[1]]; ++ *d_very_early_code++ = d_ca_code[output[2]]; ++ *d_very_early_code++ = d_ca_code[output[3]]; ++ ++ //tcode_half_chips = tcode_half_chips + code_phase_step_half_chips; ++ tcode_half_chips_array = _mm_add_ps (tcode_half_chips_array, code_phase_step_half_chips_array); ++ } ++ ++ if (num_points%4!=0) ++ { ++ __VOLK_ATTR_ALIGNED(16) float tcode_half_chips_stored[4]; ++ _mm_store_si128 ((__m128i*)tcode_half_chips_stored, tcode_half_chips_array); ++ ++ int associated_chip_index; ++ float tcode_half_chips = tcode_half_chips_stored[0]; ++ float d_very_early_late_spc_chips_multiplied_by_2 = 2*d_very_early_late_spc_chips; ++ ++ for (unsigned int i = 0; i < num_points%4; i++) ++ { ++ associated_chip_index = 2 + round(fmod(tcode_half_chips - d_very_early_late_spc_chips_multiplied_by_2, code_length_half_chips)); ++ d_very_early_code[i] = d_ca_code[associated_chip_index]; ++ tcode_half_chips = tcode_half_chips + code_phase_step_half_chips; ++ } ++ } ++ ++} ++#endif /* LV_HAVE_SSE4_1 */ ++ ++#ifdef LV_HAVE_GENERIC ++ /*! ++ \brief Takes the conjugate of a complex vector. 
++ \param cVector The vector where the results will be stored ++ \param aVector Vector to be conjugated ++ \param num_points The number of complex values in aVector to be conjugated and stored into cVector ++ */ ++static inline void volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_a_generic(lv_32fc_t* d_very_early_code, const float d_very_early_late_spc_chips, const float code_length_half_chips, const float code_phase_step_half_chips, const float tcode_half_chips_input, const lv_32fc_t* d_ca_code, unsigned int num_points){ ++ ++ // float* pointer1 = (float*)&d_very_early_late_spc_chips; ++ // *pointer1 = 1; ++ // float* pointer2 = (float*)&code_length_half_chips; ++ // *pointer2 = 6; ++ // float* pointer3 = (float*)&code_phase_step_half_chips; ++ // *pointer3 = 7; ++ // float* pointer4 = (float*)&tcode_half_chips_input; ++ // *pointer4 = 8; ++ ++ int associated_chip_index; ++ float tcode_half_chips = tcode_half_chips_input; ++ float d_very_early_late_spc_chips_multiplied_by_2 = 2*d_very_early_late_spc_chips; ++ ++ for (unsigned int i = 0; i < num_points; i++) ++ { ++ associated_chip_index = 2 + round(fmod(tcode_half_chips - d_very_early_late_spc_chips_multiplied_by_2, code_length_half_chips)); ++ d_very_early_code[i] = d_ca_code[associated_chip_index]; ++ tcode_half_chips = tcode_half_chips + code_phase_step_half_chips; ++ } ++} ++#endif /* LV_HAVE_GENERIC */ ++ ++#endif /* INCLUDED_volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_a_H */ +diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32fc_multiply_32fc.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32fc_multiply_32fc.h +--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32fc_multiply_32fc.h 1970-01-01 01:00:00.000000000 +0100 ++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32fc_multiply_32fc.h 2014-10-15 01:55:08.000000000 +0200 +@@ -0,0 +1,178 @@ ++#ifndef 
INCLUDED_volk_gnsssdr_32fc_s32fc_multiply_32fc_u_H ++#define INCLUDED_volk_gnsssdr_32fc_s32fc_multiply_32fc_u_H ++ ++#include ++#include ++#include ++#include ++ ++#ifdef LV_HAVE_SSE3 ++#include ++/*! ++ \brief Multiplies the input vector by a scalar and stores the results in the third vector ++ \param cVector The vector where the results will be stored ++ \param aVector The vector to be multiplied ++ \param scalar The complex scalar to multiply aVector ++ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector ++*/ ++static inline void volk_gnsssdr_32fc_s32fc_multiply_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ ++ unsigned int number = 0; ++ const unsigned int halfPoints = num_points / 2; ++ ++ __m128 x, yl, yh, z, tmp1, tmp2; ++ lv_32fc_t* c = cVector; ++ const lv_32fc_t* a = aVector; ++ ++ // Set up constant scalar vector ++ yl = _mm_set_ps1(lv_creal(scalar)); ++ yh = _mm_set_ps1(lv_cimag(scalar)); ++ ++ for(;number < halfPoints; number++){ ++ ++ x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi ++ ++ tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ++ ++ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br ++ ++ tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++ ++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++ ++ _mm_storeu_ps((float*)c,z); // Store the results back into the C container ++ ++ a += 2; ++ c += 2; ++ } ++ ++ if((num_points % 2) != 0) { ++ *c = (*a) * scalar; ++ } ++} ++#endif /* LV_HAVE_SSE */ ++ ++#ifdef LV_HAVE_GENERIC ++/*! 
++ \brief Multiplies the input vector by a scalar and stores the results in the third vector ++ \param cVector The vector where the results will be stored ++ \param aVector The vector to be multiplied ++ \param scalar The complex scalar to multiply aVector ++ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector ++*/ ++static inline void volk_gnsssdr_32fc_s32fc_multiply_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ ++ lv_32fc_t* cPtr = cVector; ++ const lv_32fc_t* aPtr = aVector; ++ unsigned int number = num_points; ++ ++ // unwrap loop ++ while (number >= 8){ ++ *cPtr++ = (*aPtr++) * scalar; ++ *cPtr++ = (*aPtr++) * scalar; ++ *cPtr++ = (*aPtr++) * scalar; ++ *cPtr++ = (*aPtr++) * scalar; ++ *cPtr++ = (*aPtr++) * scalar; ++ *cPtr++ = (*aPtr++) * scalar; ++ *cPtr++ = (*aPtr++) * scalar; ++ *cPtr++ = (*aPtr++) * scalar; ++ number -= 8; ++ } ++ ++ // clean up any remaining ++ while (number-- > 0) ++ *cPtr++ = *aPtr++ * scalar; ++} ++#endif /* LV_HAVE_GENERIC */ ++ ++ ++#endif /* INCLUDED_volk_gnsssdr_32fc_x2_multiply_32fc_u_H */ ++#ifndef INCLUDED_volk_gnsssdr_32fc_s32fc_multiply_32fc_a_H ++#define INCLUDED_volk_gnsssdr_32fc_s32fc_multiply_32fc_a_H ++ ++#include ++#include ++#include ++#include ++ ++#ifdef LV_HAVE_SSE3 ++#include ++ /*! 
++ \brief Multiplies the two input complex vectors and stores their results in the third vector ++ \param cVector The vector where the results will be stored ++ \param aVector One of the vectors to be multiplied ++ \param bVector One of the vectors to be multiplied ++ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector ++ */ ++static inline void volk_gnsssdr_32fc_s32fc_multiply_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ ++ unsigned int number = 0; ++ const unsigned int halfPoints = num_points / 2; ++ ++ __m128 x, yl, yh, z, tmp1, tmp2; ++ lv_32fc_t* c = cVector; ++ const lv_32fc_t* a = aVector; ++ ++ // Set up constant scalar vector ++ yl = _mm_set_ps1(lv_creal(scalar)); ++ yh = _mm_set_ps1(lv_cimag(scalar)); ++ ++ for(;number < halfPoints; number++){ ++ ++ x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi ++ ++ tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ++ ++ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br ++ ++ tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++ ++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++ ++ _mm_store_ps((float*)c,z); // Store the results back into the C container ++ ++ a += 2; ++ c += 2; ++ } ++ ++ if((num_points % 2) != 0) { ++ *c = (*a) * scalar; ++ } ++} ++#endif /* LV_HAVE_SSE */ ++ ++ ++#ifdef LV_HAVE_GENERIC ++ /*! 
++  \brief Multiplies every element of the input vector by a complex scalar and stores the products in cVector
++  \param cVector The vector where the results will be stored
++  \param aVector The vector to be multiplied
++  \param scalar The complex scalar to multiply aVector
++  \param num_points The number of complex values in aVector to be multiplied by scalar and stored into cVector
++  */
++static inline void volk_gnsssdr_32fc_s32fc_multiply_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
++    lv_32fc_t* cPtr = cVector;
++    const lv_32fc_t* aPtr = aVector;
++    unsigned int number = num_points;
++
++    // unwrap loop
++    while (number >= 8){
++        *cPtr++ = (*aPtr++) * scalar;
++        *cPtr++ = (*aPtr++) * scalar;
++        *cPtr++ = (*aPtr++) * scalar;
++        *cPtr++ = (*aPtr++) * scalar;
++        *cPtr++ = (*aPtr++) * scalar;
++        *cPtr++ = (*aPtr++) * scalar;
++        *cPtr++ = (*aPtr++) * scalar;
++        *cPtr++ = (*aPtr++) * scalar;
++        number -= 8;
++    }
++
++    // clean up any remaining
++    while (number-- > 0)
++        *cPtr++ = *aPtr++ * scalar;
++}
++#endif /* LV_HAVE_GENERIC */
++
++
++
++
++
++#endif /* INCLUDED_volk_gnsssdr_32fc_s32fc_multiply_32fc_a_H */
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_dot_prod_32fc.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_dot_prod_32fc.h
+--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_dot_prod_32fc.h 1970-01-01 01:00:00.000000000 +0100
++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_dot_prod_32fc.h 2014-10-15 01:55:08.000000000 +0200
+@@ -0,0 +1,763 @@
++#ifndef INCLUDED_volk_gnsssdr_32fc_x2_dot_prod_32fc_u_H
++#define INCLUDED_volk_gnsssdr_32fc_x2_dot_prod_32fc_u_H
++
++// NOTE(review): the targets of the next four #include lines are missing in this copy; restore them before building
++// (the kernels below need at least memset, lv_32fc_t and __VOLK_ATTR_ALIGNED)
++#include
++#include
++#include
++#include
++
++
++#ifdef LV_HAVE_GENERIC
++
++/*!
++  \brief Computes the complex dot product of two vectors: *result = sum(input[i] * taps[i])
++  \param result Where the accumulated complex product is written
++  \param input First complex vector
++  \param taps Second complex vector
++  \param num_points The number of complex values in input and taps
++*/
++static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
++
++    // The complex arrays are walked as interleaved re,im floats
++    float * res = (float*) result;
++    const float * in = (const float*) input;
++    const float * tp = (const float*) taps;
++    unsigned int n_2_ccomplex_blocks = num_points/2;
++    unsigned int isodd = num_points & 1;
++
++    // Two independent accumulators, one per complex value of each pair
++    float sum0[2] = {0,0};
++    float sum1[2] = {0,0};
++    unsigned int i = 0;
++
++    for(i = 0; i < n_2_ccomplex_blocks; ++i) {
++        sum0[0] += in[0] * tp[0] - in[1] * tp[1];
++        sum0[1] += in[0] * tp[1] + in[1] * tp[0];
++        sum1[0] += in[2] * tp[2] - in[3] * tp[3];
++        sum1[1] += in[2] * tp[3] + in[3] * tp[2];
++
++        in += 4;
++        tp += 4;
++    }
++
++    res[0] = sum0[0] + sum1[0];
++    res[1] = sum0[1] + sum1[1];
++
++    // Cleanup if we had an odd number of points
++    for(i = 0; i < isodd; ++i) {
++        *result += input[num_points - 1] * taps[num_points - 1];
++    }
++}
++
++#endif /*LV_HAVE_GENERIC*/
++
++
++
++#if LV_HAVE_SSE && LV_HAVE_64
++
++/*!
++  \brief Computes the complex dot product of two vectors with hand-written x86-64 SSE assembly (unaligned loads)
++  \param result Where the accumulated complex product is written
++  \param input First complex vector
++  \param taps Second complex vector
++  \param num_points The number of complex values in input and taps
++
++  NOTE(review): the assembly loads 16 bytes from input and from taps before it tests the
++  count, and the main loop pre-loads the next 16 bytes at the end of every pass, so it can
++  read up to 16 bytes beyond the last complex pair it actually uses.
++*/
++static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_u_sse_64(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
++
++    // 64-bit on purpose: the assembly reads the full %rcx register, and the
++    // multiplication must not wrap in 32 bits for large num_points
++    const unsigned long long num_bytes = (unsigned long long)num_points * 8;
++    unsigned int isodd = num_points & 1;
++
++    asm
++    (
++     "# ccomplex_dotprod_generic (float* result, const float *input,\n\t"
++     "# const float *taps, unsigned num_bytes)\n\t"
++     "# float sum0 = 0;\n\t"
++     "# float sum1 = 0;\n\t"
++     "# float sum2 = 0;\n\t"
++     "# float sum3 = 0;\n\t"
++     "# do {\n\t"
++     "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t"
++     "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t"
++     "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t"
++     "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t"
++     "# input += 4;\n\t"
++     "# taps += 4; \n\t"
++     "# } while (--n_2_ccomplex_blocks != 0);\n\t"
++     "# result[0] = sum0 + sum2;\n\t"
++     "# result[1] = sum1 + sum3;\n\t"
++     "# TODO: prefetch and better scheduling\n\t"
++     " xor %%r9, %%r9\n\t"
++     " xor %%r10, %%r10\n\t"
++     " movq %%rcx, %%rax\n\t"
++     " movq %%rcx, %%r8\n\t"
++     " movq %[rsi], %%r9\n\t"
++     " movq %[rdx], %%r10\n\t"
++     " xorps %%xmm6, %%xmm6 # zero accumulators\n\t"
++     " movups 0(%%r9), %%xmm0\n\t"
++     " xorps %%xmm7, %%xmm7 # zero accumulators\n\t"
++     " movups 0(%%r10), %%xmm2\n\t"
++     " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t"
++     " shr $4, %%r8\n\t"
++     " jmp .%=L1_test\n\t"
++     " # 4 taps / loop\n\t"
++     " # something like ?? cycles / loop\n\t"
++     ".%=Loop1: \n\t"
++     "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
++     "# movups (%%r9), %%xmmA\n\t"
++     "# movups (%%r10), %%xmmB\n\t"
++     "# movups %%xmmA, %%xmmZ\n\t"
++     "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t"
++     "# mulps %%xmmB, %%xmmA\n\t"
++     "# mulps %%xmmZ, %%xmmB\n\t"
++     "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
++     "# xorps %%xmmPN, %%xmmA\n\t"
++     "# movups %%xmmA, %%xmmZ\n\t"
++     "# unpcklps %%xmmB, %%xmmA\n\t"
++     "# unpckhps %%xmmB, %%xmmZ\n\t"
++     "# movups %%xmmZ, %%xmmY\n\t"
++     "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t"
++     "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t"
++     "# addps %%xmmZ, %%xmmA\n\t"
++     "# addps %%xmmA, %%xmmC\n\t"
++     "# A=xmm0, B=xmm2, Z=xmm4\n\t"
++     "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
++     " movups 16(%%r9), %%xmm1\n\t"
++     " movups %%xmm0, %%xmm4\n\t"
++     " mulps %%xmm2, %%xmm0\n\t"
++     " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
++     " movups 16(%%r10), %%xmm3\n\t"
++     " movups %%xmm1, %%xmm5\n\t"
++     " addps %%xmm0, %%xmm6\n\t"
++     " mulps %%xmm3, %%xmm1\n\t"
++     " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t"
++     " addps %%xmm1, %%xmm6\n\t"
++     " mulps %%xmm4, %%xmm2\n\t"
++     " movups 32(%%r9), %%xmm0\n\t"
++     " addps %%xmm2, %%xmm7\n\t"
++     " mulps %%xmm5, %%xmm3\n\t"
++     " add $32, %%r9\n\t"
++     " movups 32(%%r10), %%xmm2\n\t"
++     " addps %%xmm3, %%xmm7\n\t"
++     " add $32, %%r10\n\t"
++     ".%=L1_test:\n\t"
++     " dec %%rax\n\t"
++     " jge .%=Loop1\n\t"
++     " # We've handled the bulk of multiplies up to here.\n\t"
++     " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
++     " # If so, we've got 2 more taps to do.\n\t"
++     " and $1, %%r8\n\t"
++     " je .%=Leven\n\t"
++     " # The count was odd, do 2 more taps.\n\t"
++     " # Note that we've already got mm0/mm2 preloaded\n\t"
++     " # from the main loop.\n\t"
++     " movups %%xmm0, %%xmm4\n\t"
++     " mulps %%xmm2, %%xmm0\n\t"
++     " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
++     " addps %%xmm0, %%xmm6\n\t"
++     " mulps %%xmm4, %%xmm2\n\t"
++     " addps %%xmm2, %%xmm7\n\t"
++     ".%=Leven:\n\t"
++     " # neg inversor\n\t"
++     " xorps %%xmm1, %%xmm1\n\t"
++     " mov $0x80000000, %%r9\n\t"
++     " movd %%r9, %%xmm1\n\t"
++     " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t"
++     " # pfpnacc\n\t"
++     " xorps %%xmm1, %%xmm6\n\t"
++     " movups %%xmm6, %%xmm2\n\t"
++     " unpcklps %%xmm7, %%xmm6\n\t"
++     " unpckhps %%xmm7, %%xmm2\n\t"
++     " movups %%xmm2, %%xmm3\n\t"
++     " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t"
++     " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t"
++     " addps %%xmm2, %%xmm6\n\t"
++     " # xmm6 = r1 i2 r3 i4\n\t"
++     " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t"
++     " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
++     " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) to memory\n\t"
++     :
++     :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result)
++     // Everything the assembly overwrites: scratch registers, all eight xmm registers
++     // it uses, the flags, and memory (it stores through result, which is only an input operand)
++     :"rax", "r8", "r9", "r10",
++      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
++      "cc", "memory"
++     );
++
++
++    // The assembly handles whole pairs only; add the last value of an odd-length input
++    if(isodd) {
++        *result += input[num_points - 1] * taps[num_points - 1];
++    }
++
++    return;
++
++}
++
++#endif /* LV_HAVE_SSE && LV_HAVE_64 */
++
++
++
++
++#ifdef LV_HAVE_SSE3
++
++#include <pmmintrin.h>  // SSE3 intrinsics (_mm_moveldup_ps, _mm_movehdup_ps, _mm_addsub_ps)
++
++/*!
++  \brief Computes the complex dot product of two vectors with SSE3 intrinsics (unaligned loads)
++  \param result Where the accumulated complex product is written
++  \param input First complex vector
++  \param taps Second complex vector
++  \param num_points The number of complex values in input and taps
++*/
++static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_u_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
++
++    lv_32fc_t dotProduct;
++    memset(&dotProduct, 0x0, 2*sizeof(float));
++
++    unsigned int number = 0;
++    const unsigned int halfPoints = num_points/2;
++    unsigned int isodd = num_points & 1;
++
++    __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
++
++    const lv_32fc_t* a = input;
++    const lv_32fc_t* b = taps;
++
++    dotProdVal = _mm_setzero_ps();
++
++    for(;number < halfPoints; number++){
++
++        x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
++        y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
++
++        yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
++        yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
++
++        tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
++
++        x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
++
++        tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
++
++        z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
++
++        dotProdVal = _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together
++
++        a += 2;
++        b += 2;
++    }
++
++    __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2];
++
++    _mm_storeu_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector
++
++    // The register holds two partial complex sums; fold them into one
++    dotProduct += ( dotProductVector[0] + dotProductVector[1] );
++
++    if(isodd) {
++        dotProduct += input[num_points - 1] * taps[num_points - 1];
++    }
++
++    *result = dotProduct;
++}
++
++#endif /*LV_HAVE_SSE3*/
++
++#ifdef LV_HAVE_SSE4_1
++
++#include <smmintrin.h>  // SSE4.1 intrinsics (_mm_dp_ps)
++
++/*!
++  \brief Computes the complex dot product of two vectors with SSE4.1 intrinsics (unaligned loads)
++  \param result Where the accumulated complex product is written
++  \param input First complex vector
++  \param taps Second complex vector
++  \param num_points The number of complex values in input and taps
++*/
++static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_u_sse4_1(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
++
++    unsigned int i = 0;
++    const unsigned int qtr_points = num_points/4;
++    // Despite the name this is the 0..3 values left over after the groups of four
++    const unsigned int isodd = num_points & 3;
++
++    __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1;
++    float *p_input, *p_taps;
++    __m64 *p_result;
++
++    p_result = (__m64*)result;
++    p_input = (float*)input;
++    p_taps = (float*)taps;
++
++    // Sign-bit mask used below to negate the imag*imag sum
++    static const __m128i neg = {0x000000000000000080000000};
++
++    real0 = _mm_setzero_ps();
++    real1 = _mm_setzero_ps();
++    im0 = _mm_setzero_ps();
++    im1 = _mm_setzero_ps();
++
++    for(; i < qtr_points; ++i) {
++        xmm0 = _mm_loadu_ps(p_input);
++        xmm1 = _mm_loadu_ps(p_taps);
++
++        p_input += 4;
++        p_taps += 4;
++
++        xmm2 = _mm_loadu_ps(p_input);
++        xmm3 = _mm_loadu_ps(p_taps);
++
++        p_input += 4;
++        p_taps += 4;
++
++        xmm4 = _mm_unpackhi_ps(xmm0, xmm2);
++        xmm5 = _mm_unpackhi_ps(xmm1, xmm3);
++        xmm0 = _mm_unpacklo_ps(xmm0, xmm2);
++        xmm2 = _mm_unpacklo_ps(xmm1, xmm3);
++
++        //imaginary vector from input
++        xmm1 = _mm_unpackhi_ps(xmm0, xmm4);
++        //real vector from input
++        xmm3 = _mm_unpacklo_ps(xmm0, xmm4);
++        //imaginary vector from taps
++        xmm0 = _mm_unpackhi_ps(xmm2, xmm5);
++        //real vector from taps
++        xmm2 = _mm_unpacklo_ps(xmm2, xmm5);
++
++        // re*re and im*im sums go to lane 0, the two cross sums to lane 1
++        xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1);
++        xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1);
++
++        xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2);
++        xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2);
++
++        real0 = _mm_add_ps(xmm4, real0);
++        real1 = _mm_add_ps(xmm5, real1);
++        im0 = _mm_add_ps(xmm6, im0);
++        im1 = _mm_add_ps(xmm7, im1);
++    }
++
++    // real part = sum(re*re) - sum(im*im)
++    real1 = _mm_xor_ps(real1, bit128_p(&neg)->float_vec);
++
++    im0 = _mm_add_ps(im0, im1);
++    real0 = _mm_add_ps(real0, real1);
++
++    im0 = _mm_add_ps(im0, real0);
++
++    _mm_storel_pi(p_result, im0);
++
++    for(i = num_points-isodd; i < num_points; i++) {
++        *result += input[i] * taps[i];
++    }
++}
++
++#endif /*LV_HAVE_SSE4_1*/
++
++
++
++
++#endif /*INCLUDED_volk_gnsssdr_32fc_x2_dot_prod_32fc_u_H*/
++#ifndef INCLUDED_volk_gnsssdr_32fc_x2_dot_prod_32fc_a_H
++#define INCLUDED_volk_gnsssdr_32fc_x2_dot_prod_32fc_a_H
++
++// NOTE(review): the targets of the next four #include lines are missing in this copy; restore them before building
++// (the kernels below need at least memset, lv_32fc_t and __VOLK_ATTR_ALIGNED)
++#include
++#include
++#include
++#include
++
++
++#ifdef LV_HAVE_GENERIC
++
++/*!
++  \brief Computes the complex dot product of two vectors: *result = sum(input[i] * taps[i])
++  \param result Where the accumulated complex product is written
++  \param input First complex vector
++  \param taps Second complex vector
++  \param num_points The number of complex values in input and taps
++*/
++static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_a_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
++
++    // The complex arrays are walked as interleaved re,im floats
++    float * res = (float*) result;
++    const float * in = (const float*) input;
++    const float * tp = (const float*) taps;
++    // Pair count taken from num_points directly: going through a byte count
++    // (num_points*8) wraps in 32 bits for very large inputs
++    unsigned int n_2_ccomplex_blocks = num_points / 2;
++    unsigned int isodd = num_points & 1;
++
++    float sum0[2] = {0,0};
++    float sum1[2] = {0,0};
++    unsigned int i = 0;
++
++    for(i = 0; i < n_2_ccomplex_blocks; ++i) {
++        sum0[0] += in[0] * tp[0] - in[1] * tp[1];
++        sum0[1] += in[0] * tp[1] + in[1] * tp[0];
++        sum1[0] += in[2] * tp[2] - in[3] * tp[3];
++        sum1[1] += in[2] * tp[3] + in[3] * tp[2];
++
++        in += 4;
++        tp += 4;
++    }
++
++    res[0] = sum0[0] + sum1[0];
++    res[1] = sum0[1] + sum1[1];
++
++    // Cleanup if we had an odd number of points
++    for(i = 0; i < isodd; ++i) {
++        *result += input[num_points - 1] * taps[num_points - 1];
++    }
++}
++
++#endif /*LV_HAVE_GENERIC*/
++
++
++#if LV_HAVE_SSE && LV_HAVE_64
++
++/*!
++  \brief Computes the complex dot product of two vectors with hand-written x86-64 SSE assembly (aligned loads)
++  \param result Where the accumulated complex product is written
++  \param input First complex vector; read with movaps, so it must be 16-byte aligned
++  \param taps Second complex vector; read with movaps, so it must be 16-byte aligned
++  \param num_points The number of complex values in input and taps
++
++  NOTE(review): the assembly loads 16 bytes from input and from taps before it tests the
++  count, and the main loop pre-loads the next 16 bytes at the end of every pass, so it can
++  read up to 16 bytes beyond the last complex pair it actually uses.
++*/
++static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_a_sse_64(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
++
++    // 64-bit on purpose: the assembly reads the full %rcx register, and the
++    // multiplication must not wrap in 32 bits for large num_points
++    const unsigned long long num_bytes = (unsigned long long)num_points * 8;
++    unsigned int isodd = num_points & 1;
++
++    asm
++    (
++     "# ccomplex_dotprod_generic (float* result, const float *input,\n\t"
++     "# const float *taps, unsigned num_bytes)\n\t"
++     "# float sum0 = 0;\n\t"
++     "# float sum1 = 0;\n\t"
++     "# float sum2 = 0;\n\t"
++     "# float sum3 = 0;\n\t"
++     "# do {\n\t"
++     "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t"
++     "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t"
++     "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t"
++     "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t"
++     "# input += 4;\n\t"
++     "# taps += 4; \n\t"
++     "# } while (--n_2_ccomplex_blocks != 0);\n\t"
++     "# result[0] = sum0 + sum2;\n\t"
++     "# result[1] = sum1 + sum3;\n\t"
++     "# TODO: prefetch and better scheduling\n\t"
++     " xor %%r9, %%r9\n\t"
++     " xor %%r10, %%r10\n\t"
++     " movq %%rcx, %%rax\n\t"
++     " movq %%rcx, %%r8\n\t"
++     " movq %[rsi], %%r9\n\t"
++     " movq %[rdx], %%r10\n\t"
++     " xorps %%xmm6, %%xmm6 # zero accumulators\n\t"
++     " movaps 0(%%r9), %%xmm0\n\t"
++     " xorps %%xmm7, %%xmm7 # zero accumulators\n\t"
++     " movaps 0(%%r10), %%xmm2\n\t"
++     " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t"
++     " shr $4, %%r8\n\t"
++     " jmp .%=L1_test\n\t"
++     " # 4 taps / loop\n\t"
++     " # something like ?? cycles / loop\n\t"
++     ".%=Loop1: \n\t"
++     "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
++     "# movaps (%%r9), %%xmmA\n\t"
++     "# movaps (%%r10), %%xmmB\n\t"
++     "# movaps %%xmmA, %%xmmZ\n\t"
++     "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t"
++     "# mulps %%xmmB, %%xmmA\n\t"
++     "# mulps %%xmmZ, %%xmmB\n\t"
++     "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
++     "# xorps %%xmmPN, %%xmmA\n\t"
++     "# movaps %%xmmA, %%xmmZ\n\t"
++     "# unpcklps %%xmmB, %%xmmA\n\t"
++     "# unpckhps %%xmmB, %%xmmZ\n\t"
++     "# movaps %%xmmZ, %%xmmY\n\t"
++     "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t"
++     "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t"
++     "# addps %%xmmZ, %%xmmA\n\t"
++     "# addps %%xmmA, %%xmmC\n\t"
++     "# A=xmm0, B=xmm2, Z=xmm4\n\t"
++     "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
++     " movaps 16(%%r9), %%xmm1\n\t"
++     " movaps %%xmm0, %%xmm4\n\t"
++     " mulps %%xmm2, %%xmm0\n\t"
++     " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
++     " movaps 16(%%r10), %%xmm3\n\t"
++     " movaps %%xmm1, %%xmm5\n\t"
++     " addps %%xmm0, %%xmm6\n\t"
++     " mulps %%xmm3, %%xmm1\n\t"
++     " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t"
++     " addps %%xmm1, %%xmm6\n\t"
++     " mulps %%xmm4, %%xmm2\n\t"
++     " movaps 32(%%r9), %%xmm0\n\t"
++     " addps %%xmm2, %%xmm7\n\t"
++     " mulps %%xmm5, %%xmm3\n\t"
++     " add $32, %%r9\n\t"
++     " movaps 32(%%r10), %%xmm2\n\t"
++     " addps %%xmm3, %%xmm7\n\t"
++     " add $32, %%r10\n\t"
++     ".%=L1_test:\n\t"
++     " dec %%rax\n\t"
++     " jge .%=Loop1\n\t"
++     " # We've handled the bulk of multiplies up to here.\n\t"
++     " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
++     " # If so, we've got 2 more taps to do.\n\t"
++     " and $1, %%r8\n\t"
++     " je .%=Leven\n\t"
++     " # The count was odd, do 2 more taps.\n\t"
++     " # Note that we've already got mm0/mm2 preloaded\n\t"
++     " # from the main loop.\n\t"
++     " movaps %%xmm0, %%xmm4\n\t"
++     " mulps %%xmm2, %%xmm0\n\t"
++     " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
++     " addps %%xmm0, %%xmm6\n\t"
++     " mulps %%xmm4, %%xmm2\n\t"
++     " addps %%xmm2, %%xmm7\n\t"
++     ".%=Leven:\n\t"
++     " # neg inversor\n\t"
++     " xorps %%xmm1, %%xmm1\n\t"
++     " mov $0x80000000, %%r9\n\t"
++     " movd %%r9, %%xmm1\n\t"
++     " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t"
++     " # pfpnacc\n\t"
++     " xorps %%xmm1, %%xmm6\n\t"
++     " movaps %%xmm6, %%xmm2\n\t"
++     " unpcklps %%xmm7, %%xmm6\n\t"
++     " unpckhps %%xmm7, %%xmm2\n\t"
++     " movaps %%xmm2, %%xmm3\n\t"
++     " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t"
++     " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t"
++     " addps %%xmm2, %%xmm6\n\t"
++     " # xmm6 = r1 i2 r3 i4\n\t"
++     " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t"
++     " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
++     " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) to memory\n\t"
++     :
++     :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result)
++     // Everything the assembly overwrites: scratch registers, all eight xmm registers
++     // it uses, the flags, and memory (it stores through result, which is only an input operand)
++     :"rax", "r8", "r9", "r10",
++      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
++      "cc", "memory"
++     );
++
++
++    // The assembly handles whole pairs only; add the last value of an odd-length input
++    if(isodd) {
++        *result += input[num_points - 1] * taps[num_points - 1];
++    }
++
++    return;
++
++}
++
++#endif /* LV_HAVE_SSE && LV_HAVE_64 */
++
++#if LV_HAVE_SSE && LV_HAVE_32
++
++/*!
++  \brief 32-bit SSE entry point for the complex dot product; it forwards to the generic kernel
++  \param result Where the accumulated complex product is written
++  \param input First complex vector
++  \param taps Second complex vector
++  \param num_points The number of complex values in input and taps
++
++  The hand-written 32-bit assembly that used to follow the call was already compiled out
++  with "#if 0" (it read its arguments at fixed offsets from ebp), so it has been removed.
++*/
++static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_a_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
++
++    volk_gnsssdr_32fc_x2_dot_prod_32fc_a_generic(result, input, taps, num_points);
++}
++
++#endif /* LV_HAVE_SSE && LV_HAVE_32 */
++
++#ifdef LV_HAVE_SSE3
++
++#include <pmmintrin.h>  // SSE3 intrinsics (_mm_moveldup_ps, _mm_movehdup_ps, _mm_addsub_ps)
++
++/*!
++  \brief Computes the complex dot product of two vectors with SSE3 intrinsics (aligned loads)
++  \param result Where the accumulated complex product is written
++  \param input First complex vector; read with _mm_load_ps, so it must be 16-byte aligned
++  \param taps Second complex vector; read with _mm_load_ps, so it must be 16-byte aligned
++  \param num_points The number of complex values in input and taps
++*/
++static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_a_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
++
++    unsigned int isodd = num_points & 1;
++
++    lv_32fc_t dotProduct;
++    memset(&dotProduct, 0x0, 2*sizeof(float));
++
++    unsigned int number = 0;
++    // Pair count taken from num_points directly: going through a byte count
++    // (num_points*8) wraps in 32 bits for very large inputs
++    const unsigned int halfPoints = num_points / 2;
++
++    __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
++
++    const lv_32fc_t* a = input;
++    const lv_32fc_t* b = taps;
++
++    dotProdVal = _mm_setzero_ps();
++
++    for(;number < halfPoints; number++){
++
++        x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
++        y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di
++
++        yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
++        yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
++
++        tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
++
++        x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
++
++        tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
++
++        z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
++
++        dotProdVal = _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together
++
++        a += 2;
++        b += 2;
++    }
++
++    __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2];
++
++    _mm_store_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector
++
++    // The register holds two partial complex sums; fold them into one
++    dotProduct += ( dotProductVector[0] + dotProductVector[1] );
++
++    if(isodd) {
++        dotProduct += input[num_points - 1] * taps[num_points - 1];
++    }
++
++    *result = dotProduct;
++}
++
++#endif /*LV_HAVE_SSE3*/
++
++#ifdef LV_HAVE_SSE4_1
++
++#include <smmintrin.h>  // SSE4.1 intrinsics (_mm_dp_ps)
++
++/*!
++  \brief Computes the complex dot product of two vectors with SSE4.1 intrinsics (aligned loads)
++  \param result Where the accumulated complex product is written
++  \param input First complex vector; read with _mm_load_ps, so it must be 16-byte aligned
++  \param taps Second complex vector; read with _mm_load_ps, so it must be 16-byte aligned
++  \param num_points The number of complex values in input and taps
++*/
++static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
++
++    unsigned int i = 0;
++    const unsigned int qtr_points = num_points/4;
++    // Despite the name this is the 0..3 values left over after the groups of four
++    const unsigned int isodd = num_points & 3;
++
++    __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1;
++    float *p_input, *p_taps;
++    __m64 *p_result;
++
++    // Sign-bit mask used below to negate the imag*imag sum
++    static const __m128i neg = {0x000000000000000080000000};
++
++    p_result = (__m64*)result;
++    p_input = (float*)input;
++    p_taps = (float*)taps;
++
++    real0 = _mm_setzero_ps();
++    real1 = _mm_setzero_ps();
++    im0 = _mm_setzero_ps();
++    im1 = _mm_setzero_ps();
++
++    for(; i < qtr_points; ++i) {
++        xmm0 = _mm_load_ps(p_input);
++        xmm1 = _mm_load_ps(p_taps);
++
++        p_input += 4;
++        p_taps += 4;
++
++        xmm2 = _mm_load_ps(p_input);
++        xmm3 = _mm_load_ps(p_taps);
++
++        p_input += 4;
++        p_taps += 4;
++
++        xmm4 = _mm_unpackhi_ps(xmm0, xmm2);
++        xmm5 = _mm_unpackhi_ps(xmm1, xmm3);
++        xmm0 = _mm_unpacklo_ps(xmm0, xmm2);
++        xmm2 = _mm_unpacklo_ps(xmm1, xmm3);
++
++        //imaginary vector from input
++        xmm1 = _mm_unpackhi_ps(xmm0, xmm4);
++        //real vector from input
++        xmm3 = _mm_unpacklo_ps(xmm0, xmm4);
++        //imaginary vector from taps
++        xmm0 = _mm_unpackhi_ps(xmm2, xmm5);
++        //real vector from taps
++        xmm2 = _mm_unpacklo_ps(xmm2, xmm5);
++
++        // re*re and im*im sums go to lane 0, the two cross sums to lane 1
++        xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1);
++        xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1);
++
++        xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2);
++        xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2);
++
++        real0 = _mm_add_ps(xmm4, real0);
++        real1 = _mm_add_ps(xmm5, real1);
++        im0 = _mm_add_ps(xmm6, im0);
++        im1 = _mm_add_ps(xmm7, im1);
++    }
++
++    // real part = sum(re*re) - sum(im*im)
++    real1 = _mm_xor_ps(real1, bit128_p(&neg)->float_vec);
++
++    im0 = _mm_add_ps(im0, im1);
++    real0 = _mm_add_ps(real0, real1);
++
++    im0 = _mm_add_ps(im0, real0);
++
++    _mm_storel_pi(p_result, im0);
++
++    for(i = num_points-isodd; i < num_points; i++) {
++        *result += input[i] * taps[i];
++    }
++}
++
++#endif /*LV_HAVE_SSE4_1*/
++
++#endif /*INCLUDED_volk_gnsssdr_32fc_x2_dot_prod_32fc_a_H*/
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_multiply_32fc.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_multiply_32fc.h
+--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_multiply_32fc.h 1970-01-01 01:00:00.000000000 +0100
++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_multiply_32fc.h 2014-10-15 01:55:08.000000000 +0200
+@@ -0,0 +1,170 @@
++#ifndef INCLUDED_volk_gnsssdr_32fc_x2_multiply_32fc_u_H
++#define INCLUDED_volk_gnsssdr_32fc_x2_multiply_32fc_u_H
++
++// NOTE(review): the targets of the next four #include lines are missing in this copy; restore them before building
++#include
++#include
++#include
++#include
++
++#ifdef LV_HAVE_SSE3
++#include <pmmintrin.h>  // SSE3 intrinsics (_mm_moveldup_ps, _mm_movehdup_ps, _mm_addsub_ps)
++ /*!
++  \brief Multiplies two complex vectors element by element and stores the products in cVector (SSE3, unaligned loads and stores)
++  \param cVector The vector where the results will be stored
++  \param aVector One of the vectors to be multiplied
++  \param bVector One of the vectors to be multiplied
++  \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
++  */
++static inline void volk_gnsssdr_32fc_x2_multiply_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
++    const unsigned int pairCount = num_points / 2;
++    lv_32fc_t* dst = cVector;
++    const lv_32fc_t* lhs = aVector;
++    const lv_32fc_t* rhs = bVector;
++    unsigned int pair;
++
++    // Two complex products (four floats) per iteration
++    for(pair = 0; pair < pairCount; pair++){
++        const __m128 lhsVal = _mm_loadu_ps((float*)lhs);                // ar,ai,br,bi
++        const __m128 rhsVal = _mm_loadu_ps((float*)rhs);                // cr,ci,dr,di
++        const __m128 rhsRe = _mm_moveldup_ps(rhsVal);                   // cr,cr,dr,dr
++        const __m128 rhsIm = _mm_movehdup_ps(rhsVal);                   // ci,ci,di,di
++        const __m128 partRe = _mm_mul_ps(lhsVal, rhsRe);                // ar*cr,ai*cr,br*dr,bi*dr
++        const __m128 lhsSwapped = _mm_shuffle_ps(lhsVal, lhsVal, 0xB1); // ai,ar,bi,br
++        const __m128 partIm = _mm_mul_ps(lhsSwapped, rhsIm);            // ai*ci,ar*ci,bi*di,br*di
++
++        // addsub subtracts in the even lanes and adds in the odd ones:
++        // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
++        _mm_storeu_ps((float*)dst, _mm_addsub_ps(partRe, partIm));
++
++        lhs += 2;
++        rhs += 2;
++        dst += 2;
++    }
++
++    // An odd count leaves one product that is computed without SIMD
++    if(num_points & 1){
++        *dst = (*lhs) * (*rhs);
++    }
++}
++#endif /* LV_HAVE_SSE3 */
++
++#ifdef LV_HAVE_GENERIC
++ /*!
++  \brief Multiplies two complex vectors element by element and stores the products in cVector
++  \param cVector The vector where the results will be stored
++  \param aVector One of the vectors to be multiplied
++  \param bVector One of the vectors to be multiplied
++  \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
++  */
++static inline void volk_gnsssdr_32fc_x2_multiply_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
++    unsigned int idx;
++
++    // Plain C reference: one complex product per element
++    for(idx = 0; idx < num_points; idx++){
++        cVector[idx] = aVector[idx] * bVector[idx];
++    }
++}
++#endif /* LV_HAVE_GENERIC */
++
++
++#endif /* INCLUDED_volk_gnsssdr_32fc_x2_multiply_32fc_u_H */
++#ifndef INCLUDED_volk_gnsssdr_32fc_x2_multiply_32fc_a_H
++#define INCLUDED_volk_gnsssdr_32fc_x2_multiply_32fc_a_H
++
++// NOTE(review): the targets of the next four #include lines are missing in this copy; restore them before building
++#include
++#include
++#include
++#include
++
++#ifdef LV_HAVE_SSE3
++#include <pmmintrin.h>  // SSE3 intrinsics (_mm_moveldup_ps, _mm_movehdup_ps, _mm_addsub_ps)
++ /*!
++  \brief Multiplies two complex vectors element by element and stores the products in cVector (SSE3, aligned loads and stores)
++  \param cVector The vector where the results will be stored; accessed with _mm_store_ps, so it must be 16-byte aligned
++  \param aVector One of the vectors to be multiplied; accessed with _mm_load_ps, so it must be 16-byte aligned
++  \param bVector One of the vectors to be multiplied; accessed with _mm_load_ps, so it must be 16-byte aligned
++  \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
++  */
++static inline void volk_gnsssdr_32fc_x2_multiply_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
++    __m128 lhsVal, rhsVal, rhsRe, rhsIm, partRe, partIm;
++    lv_32fc_t* dst = cVector;
++    const lv_32fc_t* lhs = aVector;
++    const lv_32fc_t* rhs = bVector;
++    unsigned int pairsLeft = num_points / 2;
++
++    // Two complex products (four floats) per pass
++    while(pairsLeft > 0){
++        lhsVal = _mm_load_ps((float*)lhs); // ar,ai,br,bi
++        rhsVal = _mm_load_ps((float*)rhs); // cr,ci,dr,di
++
++        rhsRe = _mm_moveldup_ps(rhsVal);   // cr,cr,dr,dr
++        rhsIm = _mm_movehdup_ps(rhsVal);   // ci,ci,di,di
++
++        partRe = _mm_mul_ps(lhsVal, rhsRe);                                // ar*cr,ai*cr,br*dr,bi*dr
++        partIm = _mm_mul_ps(_mm_shuffle_ps(lhsVal, lhsVal, 0xB1), rhsIm);  // ai*ci,ar*ci,bi*di,br*di
++
++        // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
++        _mm_store_ps((float*)dst, _mm_addsub_ps(partRe, partIm));
++
++        lhs += 2;
++        rhs += 2;
++        dst += 2;
++        pairsLeft--;
++    }
++
++    // An odd count leaves one product that is computed without SIMD
++    if((num_points % 2) != 0){
++        *dst = (*lhs) * (*rhs);
++    }
++}
++#endif /* LV_HAVE_SSE3 */
++
++#ifdef LV_HAVE_GENERIC
++ /*!
++  \brief Multiplies two complex vectors element by element and stores the products in cVector
++  \param cVector The vector where the results will be stored
++  \param aVector One of the vectors to be multiplied
++  \param bVector One of the vectors to be multiplied
++  \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
++  */
++static inline void volk_gnsssdr_32fc_x2_multiply_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
++    unsigned int idx;
++
++    // Plain C reference: one complex product per element
++    for(idx = 0; idx < num_points; idx++){
++        cVector[idx] = aVector[idx] * bVector[idx];
++    }
++}
++#endif /* LV_HAVE_GENERIC */
++
++#ifdef LV_HAVE_ORC
++ /*!
++  \brief Multiplies two complex vectors element by element by forwarding to the ORC implementation declared just below
++  \param cVector The vector where the results will be stored
++  \param aVector One of the vectors to be multiplied
++  \param bVector One of the vectors to be multiplied
++  \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
++  */
++extern void volk_gnsssdr_32fc_x2_multiply_32fc_a_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points);
++static inline void volk_gnsssdr_32fc_x2_multiply_32fc_u_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
++    // Thin wrapper: the arguments are passed through unchanged
++    volk_gnsssdr_32fc_x2_multiply_32fc_a_orc_impl(cVector, aVector, bVector, num_points);
++}
++#endif /* LV_HAVE_ORC */
++
++
++
++
++
++#endif /* INCLUDED_volk_gnsssdr_32fc_x2_multiply_32fc_a_H */
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3.h
+---
/Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3.h 1970-01-01 01:00:00.000000000 +0100 ++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3.h 2014-10-15 01:55:08.000000000 +0200 +@@ -0,0 +1,409 @@ ++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_u_H ++#define INCLUDED_gnsssdr_volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_u_H ++ ++#include ++#include ++#include ++#include ++#include ++ ++/*! ++ * TODO: Code the SSE4 version and benchmark it ++ */ ++#ifdef LV_HAVE_SSE3 ++#include ++ ++ ++ /*! ++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation ++ \param input The input signal input ++ \param carrier The carrier signal input ++ \param E_code Early PRN code replica input ++ \param P_code Early PRN code replica input ++ \param L_code Early PRN code replica input ++ \param E_out Early correlation output ++ \param P_out Early correlation output ++ \param L_out Early correlation output ++ \param num_points The number of complex values in vectors ++ */ ++static inline void volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_u_sse3(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, unsigned int num_points) ++{ ++ unsigned int number = 0; ++ const unsigned int halfPoints = num_points / 2; ++ ++ lv_32fc_t dotProduct_E; ++ memset(&dotProduct_E, 0x0, 2*sizeof(float)); ++ lv_32fc_t dotProduct_P; ++ memset(&dotProduct_P, 0x0, 2*sizeof(float)); ++ lv_32fc_t dotProduct_L; ++ memset(&dotProduct_L, 0x0, 2*sizeof(float)); ++ ++ // Aux vars ++ __m128 x, y, yl, yh, z, tmp1, tmp2, z_E, z_P, z_L; ++ ++ z_E = _mm_setzero_ps(); ++ z_P = _mm_setzero_ps(); ++ z_L = _mm_setzero_ps(); ++ ++ //input and output vectors ++ //lv_32fc_t* _input_BB = input_BB; ++ const lv_32fc_t* _input = input; ++ const lv_32fc_t* _carrier = 
carrier; ++ const lv_32fc_t* _E_code = E_code; ++ const lv_32fc_t* _P_code = P_code; ++ const lv_32fc_t* _L_code = L_code; ++ ++ for(;number < halfPoints; number++) ++ { ++ // carrier wipe-off (vector point-to-point product) ++ x = _mm_loadu_ps((float*)_input); // Load the ar + ai, br + bi as ar,ai,br,bi ++ y = _mm_loadu_ps((float*)_carrier); // Load the cr + ci, dr + di as cr,ci,dr,di ++ ++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr ++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di ++ ++ tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ++ ++ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br ++ ++ tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++ ++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++ ++ //_mm_storeu_ps((float*)_input_BB,z); // Store the results back into the _input_BB container ++ ++ // correlation E,P,L (3x vector scalar product) ++ // Early ++ //x = _mm_load_ps((float*)_input_BB); // Load the ar + ai, br + bi as ar,ai,br,bi ++ x = z; ++ ++ y = _mm_load_ps((float*)_E_code); // Load the cr + ci, dr + di as cr,ci,dr,di ++ ++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr ++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di ++ ++ tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ++ ++ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br ++ ++ tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++ ++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++ ++ z_E = _mm_add_ps(z_E, z); // Add the complex multiplication results together ++ ++ // Prompt ++ //x = _mm_load_ps((float*)_input_BB); // Load the ar + ai, br + bi as ar,ai,br,bi ++ y = _mm_load_ps((float*)_P_code); // Load the cr + ci, dr + di as cr,ci,dr,di ++ ++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr ++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di ++ ++ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br ++ ++ 
tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ++ ++ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br ++ ++ tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++ ++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++ ++ z_P = _mm_add_ps(z_P, z); // Add the complex multiplication results together ++ ++ // Late ++ //x = _mm_load_ps((float*)_input_BB); // Load the ar + ai, br + bi as ar,ai,br,bi ++ y = _mm_load_ps((float*)_L_code); // Load the cr + ci, dr + di as cr,ci,dr,di ++ ++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr ++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di ++ ++ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br ++ ++ tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ++ ++ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br ++ ++ tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++ ++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++ ++ z_L = _mm_add_ps(z_L, z); // Add the complex multiplication results together ++ ++ /*pointer increment*/ ++ _carrier += 2; ++ _input += 2; ++ //_input_BB += 2; ++ _E_code += 2; ++ _P_code += 2; ++ _L_code +=2; ++ } ++ ++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_E[2]; ++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_P[2]; ++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_L[2]; ++ //__VOLK_ATTR_ALIGNED(16) lv_32fc_t _input_BB; ++ ++ _mm_store_ps((float*)dotProductVector_E,z_E); // Store the results back into the dot product vector ++ _mm_store_ps((float*)dotProductVector_P,z_P); // Store the results back into the dot product vector ++ _mm_store_ps((float*)dotProductVector_L,z_L); // Store the results back into the dot product vector ++ ++ dotProduct_E += ( dotProductVector_E[0] + dotProductVector_E[1] ); ++ dotProduct_P += ( dotProductVector_P[0] + dotProductVector_P[1] ); ++ dotProduct_L += ( dotProductVector_L[0] + dotProductVector_L[1] 
); ++ ++ if((num_points % 2) != 0) ++ { ++ //_input_BB = (*_input) * (*_carrier); ++ dotProduct_E += (*_input) * (*_E_code)*(*_carrier); ++ dotProduct_P += (*_input) * (*_P_code)*(*_carrier); ++ dotProduct_L += (*_input) * (*_L_code)*(*_carrier); ++ } ++ ++ *E_out = dotProduct_E; ++ *P_out = dotProduct_P; ++ *L_out = dotProduct_L; ++} ++ ++#endif /* LV_HAVE_SSE3 */ ++ ++#ifdef LV_HAVE_GENERIC ++/*! ++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation ++ \param input The input signal input ++ \param carrier The carrier signal input ++ \param E_code Early PRN code replica input ++ \param P_code Early PRN code replica input ++ \param L_code Early PRN code replica input ++ \param E_out Early correlation output ++ \param P_out Early correlation output ++ \param L_out Early correlation output ++ \param num_points The number of complex values in vectors ++ */ ++static inline void volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_generic(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, unsigned int num_points) ++{ ++ lv_32fc_t bb_signal_sample; ++ ++ bb_signal_sample = lv_cmake(0, 0); ++ ++ *E_out = 0; ++ *P_out = 0; ++ *L_out = 0; ++ // perform Early, Prompt and Late correlation ++ for(int i=0; i < num_points; ++i) ++ { ++ //Perform the carrier wipe-off ++ bb_signal_sample = input[i] * carrier[i]; ++ // Now get early, late, and prompt values for each ++ *E_out += bb_signal_sample * E_code[i]; ++ *P_out += bb_signal_sample * P_code[i]; ++ *L_out += bb_signal_sample * L_code[i]; ++ } ++} ++ ++#endif /* LV_HAVE_GENERIC */ ++ ++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_u_H */ ++ ++ ++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_a_H ++#define INCLUDED_gnsssdr_volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_a_H ++ ++#include ++#include ++#include ++#include ++#include ++ ++#ifdef 
LV_HAVE_SSE3 ++#include ++/*! ++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation ++ \param input The input signal input ++ \param carrier The carrier signal input ++ \param E_code Early PRN code replica input ++ \param P_code Early PRN code replica input ++ \param L_code Early PRN code replica input ++ \param E_out Early correlation output ++ \param P_out Early correlation output ++ \param L_out Early correlation output ++ \param num_points The number of complex values in vectors ++ */ ++static inline void volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_a_sse3(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, unsigned int num_points) ++{ ++ unsigned int number = 0; ++ const unsigned int halfPoints = num_points / 2; ++ ++ lv_32fc_t dotProduct_E; ++ memset(&dotProduct_E, 0x0, 2*sizeof(float)); ++ lv_32fc_t dotProduct_P; ++ memset(&dotProduct_P, 0x0, 2*sizeof(float)); ++ lv_32fc_t dotProduct_L; ++ memset(&dotProduct_L, 0x0, 2*sizeof(float)); ++ ++ // Aux vars ++ __m128 x, y, yl, yh, z, tmp1, tmp2, z_E, z_P, z_L; ++ ++ z_E = _mm_setzero_ps(); ++ z_P = _mm_setzero_ps(); ++ z_L = _mm_setzero_ps(); ++ ++ //input and output vectors ++ //lv_32fc_t* _input_BB = input_BB; ++ const lv_32fc_t* _input = input; ++ const lv_32fc_t* _carrier = carrier; ++ const lv_32fc_t* _E_code = E_code; ++ const lv_32fc_t* _P_code = P_code; ++ const lv_32fc_t* _L_code = L_code; ++ ++ for(;number < halfPoints; number++) ++ { ++ // carrier wipe-off (vector point-to-point product) ++ x = _mm_load_ps((float*)_input); // Load the ar + ai, br + bi as ar,ai,br,bi ++ y = _mm_load_ps((float*)_carrier); // Load the cr + ci, dr + di as cr,ci,dr,di ++ ++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr ++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di ++ ++ tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ++ ++ x = 
_mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br ++ ++ tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++ ++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++ ++ //_mm_storeu_ps((float*)_input_BB,z); // Store the results back into the _input_BB container ++ ++ // correlation E,P,L (3x vector scalar product) ++ // Early ++ //x = _mm_load_ps((float*)_input_BB); // Load the ar + ai, br + bi as ar,ai,br,bi ++ x = z; ++ ++ y = _mm_load_ps((float*)_E_code); // Load the cr + ci, dr + di as cr,ci,dr,di ++ ++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr ++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di ++ ++ tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ++ ++ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br ++ ++ tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++ ++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++ ++ z_E = _mm_add_ps(z_E, z); // Add the complex multiplication results together ++ ++ // Prompt ++ //x = _mm_load_ps((float*)_input_BB); // Load the ar + ai, br + bi as ar,ai,br,bi ++ y = _mm_load_ps((float*)_P_code); // Load the cr + ci, dr + di as cr,ci,dr,di ++ ++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr ++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di ++ ++ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br ++ ++ tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ++ ++ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br ++ ++ tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++ ++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++ ++ z_P = _mm_add_ps(z_P, z); // Add the complex multiplication results together ++ ++ // Late ++ //x = _mm_load_ps((float*)_input_BB); // Load the ar + ai, br + bi as ar,ai,br,bi ++ y = _mm_load_ps((float*)_L_code); // Load the cr + ci, dr + di as cr,ci,dr,di ++ ++ yl = _mm_moveldup_ps(y); 
// Load yl with cr,cr,dr,dr ++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di ++ ++ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br ++ ++ tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ++ ++ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br ++ ++ tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++ ++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++ ++ z_L = _mm_add_ps(z_L, z); // Add the complex multiplication results together ++ ++ /*pointer increment*/ ++ _carrier += 2; ++ _input += 2; ++ //_input_BB += 2; ++ _E_code += 2; ++ _P_code += 2; ++ _L_code +=2; ++ } ++ ++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_E[2]; ++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_P[2]; ++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_L[2]; ++ //__VOLK_ATTR_ALIGNED(16) lv_32fc_t _input_BB; ++ ++ _mm_store_ps((float*)dotProductVector_E,z_E); // Store the results back into the dot product vector ++ _mm_store_ps((float*)dotProductVector_P,z_P); // Store the results back into the dot product vector ++ _mm_store_ps((float*)dotProductVector_L,z_L); // Store the results back into the dot product vector ++ ++ dotProduct_E += ( dotProductVector_E[0] + dotProductVector_E[1] ); ++ dotProduct_P += ( dotProductVector_P[0] + dotProductVector_P[1] ); ++ dotProduct_L += ( dotProductVector_L[0] + dotProductVector_L[1] ); ++ ++ if((num_points % 2) != 0) ++ { ++ //_input_BB = (*_input) * (*_carrier); ++ dotProduct_E += (*_input) * (*_E_code)*(*_carrier); ++ dotProduct_P += (*_input) * (*_P_code)*(*_carrier); ++ dotProduct_L += (*_input) * (*_L_code)*(*_carrier); ++ } ++ ++ *E_out = dotProduct_E; ++ *P_out = dotProduct_P; ++ *L_out = dotProduct_L; ++} ++ ++#endif /* LV_HAVE_SSE3 */ ++ ++#ifdef LV_HAVE_GENERIC ++/*! 
++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation ++ \param input The input signal input ++ \param carrier The carrier signal input ++ \param E_code Early PRN code replica input ++ \param P_code Early PRN code replica input ++ \param L_code Early PRN code replica input ++ \param E_out Early correlation output ++ \param P_out Early correlation output ++ \param L_out Early correlation output ++ \param num_points The number of complex values in vectors ++ */ ++static inline void volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_a_generic(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, unsigned int num_points) ++{ ++ lv_32fc_t bb_signal_sample; ++ ++ bb_signal_sample = lv_cmake(0, 0); ++ ++ *E_out = 0; ++ *P_out = 0; ++ *L_out = 0; ++ // perform Early, Prompt and Late correlation ++ for(int i=0; i < num_points; ++i) ++ { ++ //Perform the carrier wipe-off ++ bb_signal_sample = input[i] * carrier[i]; ++ // Now get early, late, and prompt values for each ++ *E_out += bb_signal_sample * E_code[i]; ++ *P_out += bb_signal_sample * P_code[i]; ++ *L_out += bb_signal_sample * L_code[i]; ++ } ++} ++ ++#endif /* LV_HAVE_GENERIC */ ++ ++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_a_H */ +diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5.h +--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5.h 1970-01-01 01:00:00.000000000 +0100 ++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5.h 2014-10-15 01:55:08.000000000 +0200 +@@ -0,0 +1,848 @@ ++/*! 
++ * \file volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5.h
++ * \brief Volk protokernel: performs the carrier wipe-off mixing and the VE, Early, Prompt, Late and VL correlation with 64-bit vectors
++ * \authors <ul>
++ *          <li> Javier Arribas, 2011. jarribas(at)cttc.es
++ *          <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
++ *          </ul>
++ *
++ * Volk protokernel that performs the carrier wipe-off mixing and the
++ * VE, Early, Prompt, Late and VL correlation with 64-bit vectors (32 bits for
++ * the real part and 32 bits for the imaginary part):
++ * - The carrier wipe-off is done by multiplying the input signal by the
++ * carrier (multiplication of 64-bit vectors). It returns the input
++ * signal in base band (BB)
++ * - VE values are calculated by multiplying the input signal in BB by the
++ * VE code (multiplication of 64-bit vectors), accumulating the results
++ * - Early values are calculated by multiplying the input signal in BB by the
++ * early code (multiplication of 64-bit vectors), accumulating the results
++ * - Prompt values are calculated by multiplying the input signal in BB by the
++ * prompt code (multiplication of 64-bit vectors), accumulating the results
++ * - Late values are calculated by multiplying the input signal in BB by the
++ * late code (multiplication of 64-bit vectors), accumulating the results
++ * - VL values are calculated by multiplying the input signal in BB by the
++ * VL code (multiplication of 64-bit vectors), accumulating the results
++ *
++ * -------------------------------------------------------------------------
++ *
++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
++ *
++ * GNSS-SDR is a software defined Global Navigation
++ * Satellite Systems receiver
++ *
++ * This file is part of GNSS-SDR.
++ *
++ * GNSS-SDR is free software: you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation, either version 3 of the License, or
++ * (at your option) any later version.
++ *
++ * GNSS-SDR is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with GNSS-SDR. If not, see . ++ * ++ * ------------------------------------------------------------------------- ++ */ ++ ++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_u_H ++#define INCLUDED_gnsssdr_volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_u_H ++ ++#include ++#include ++#include ++#include ++#include ++ ++#ifdef LV_HAVE_AVX ++#include ++/*! ++ \brief Performs the carrier wipe-off mixing and the VE, Early, Prompt, Late and VL correlation ++ \param input The input signal input ++ \param carrier The carrier signal input ++ \param VE_code VE PRN code replica input ++ \param E_code Early PRN code replica input ++ \param P_code Early PRN code replica input ++ \param L_code Early PRN code replica input ++ \param VL_code VL PRN code replica input ++ \param VE_out VE correlation output ++ \param E_out Early correlation output ++ \param P_out Early correlation output ++ \param L_out Early correlation output ++ \param VL_out VL correlation output ++ \param num_points The number of complex values in vectors ++ */ ++static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_u_avx(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* VE_code, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, const lv_32fc_t* VL_code, unsigned int num_points) ++{ ++ unsigned int number = 0; ++ const unsigned int halfPoints = num_points / 4; ++ ++ lv_32fc_t dotProduct_VE; ++ lv_32fc_t dotProduct_E; ++ lv_32fc_t dotProduct_P; ++ lv_32fc_t dotProduct_L; ++ lv_32fc_t dotProduct_VL; ++ ++ // Aux vars ++ __m256 x, y, yl, yh, z, tmp1, tmp2, z_VE, z_E, z_P, z_L, z_VL; ++ __m256 bb_signal_sample, bb_signal_sample_shuffled; ++ ++ z_VE = _mm256_setzero_ps(); ++ z_E = _mm256_setzero_ps(); ++ z_P = _mm256_setzero_ps(); ++ z_L = _mm256_setzero_ps(); ++ z_VL = 
_mm256_setzero_ps(); ++ ++ //input and output vectors ++ const lv_32fc_t* _input = input; ++ const lv_32fc_t* _carrier = carrier; ++ const lv_32fc_t* _VE_code = VE_code; ++ const lv_32fc_t* _E_code = E_code; ++ const lv_32fc_t* _P_code = P_code; ++ const lv_32fc_t* _L_code = L_code; ++ const lv_32fc_t* _VL_code = VL_code; ++ ++ for(;number < halfPoints; number++) ++ { ++ // carrier wipe-off (vector point-to-point product) ++ x = _mm256_loadu_ps((float*)_input); // Load the ar + ai, br + bi as ar,ai,br,bi ++ y = _mm256_loadu_ps((float*)_carrier); // Load the cr + ci, dr + di as cr,ci,dr,di ++ ++ yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr ++ yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di ++ ++ tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ++ ++ x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br ++ ++ tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++ ++ bb_signal_sample = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++ bb_signal_sample_shuffled = _mm256_shuffle_ps(bb_signal_sample,bb_signal_sample,0xB1); // Re-arrange bb_signal_sample to be ai,ar,bi,br ++ ++ // correlation VE,E,P,L,VL (5x vector scalar product) ++ // VE ++ y = _mm256_loadu_ps((float*)_VE_code); // Load the cr + ci, dr + di as cr,ci,dr,di ++ ++ yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr ++ yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di ++ ++ tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ++ tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++ ++ z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++ z_VE = _mm256_add_ps(z_VE, z); // Add the complex multiplication results together ++ ++ // Early ++ y = _mm256_loadu_ps((float*)_E_code); // Load the cr + ci, dr + di as cr,ci,dr,di ++ ++ yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr ++ yh = _mm256_movehdup_ps(y); // Load 
yh with ci,ci,di,di ++ ++ tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ++ tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++ ++ z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++ z_E = _mm256_add_ps(z_E, z); // Add the complex multiplication results together ++ ++ // Prompt ++ y = _mm256_loadu_ps((float*)_P_code); // Load the cr + ci, dr + di as cr,ci,dr,di ++ ++ yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr ++ yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di ++ ++ tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ++ tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++ ++ z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++ z_P = _mm256_add_ps(z_P, z); // Add the complex multiplication results together ++ ++ // Late ++ y = _mm256_loadu_ps((float*)_L_code); // Load the cr + ci, dr + di as cr,ci,dr,di ++ ++ yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr ++ yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di ++ ++ tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ++ tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++ ++ z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++ z_L = _mm256_add_ps(z_L, z); // Add the complex multiplication results together ++ ++ // VL ++ y = _mm256_loadu_ps((float*)_VL_code); // Load the cr + ci, dr + di as cr,ci,dr,di ++ ++ yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr ++ yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di ++ ++ tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ++ tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++ ++ z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++ z_VL = 
_mm256_add_ps(z_VL, z); // Add the complex multiplication results together ++ ++ /*pointer increment*/ ++ _carrier += 4; ++ _input += 4; ++ _VE_code += 4; ++ _E_code += 4; ++ _P_code += 4; ++ _L_code += 4; ++ _VL_code += 4; ++ } ++ ++ __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_VE[4]; ++ __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_E[4]; ++ __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_P[4]; ++ __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_L[4]; ++ __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_VL[4]; ++ ++ _mm256_storeu_ps((float*)dotProductVector_VE,z_VE); // Store the results back into the dot product vector ++ _mm256_storeu_ps((float*)dotProductVector_E,z_E); // Store the results back into the dot product vector ++ _mm256_storeu_ps((float*)dotProductVector_P,z_P); // Store the results back into the dot product vector ++ _mm256_storeu_ps((float*)dotProductVector_L,z_L); // Store the results back into the dot product vector ++ _mm256_storeu_ps((float*)dotProductVector_VL,z_VL); // Store the results back into the dot product vector ++ ++ dotProduct_VE = ( dotProductVector_VE[0] + dotProductVector_VE[1] + dotProductVector_VE[2] + dotProductVector_VE[3] ); ++ dotProduct_E = ( dotProductVector_E[0] + dotProductVector_E[1] + dotProductVector_E[2] + dotProductVector_E[3] ); ++ dotProduct_P = ( dotProductVector_P[0] + dotProductVector_P[1] + dotProductVector_P[2] + dotProductVector_P[3] ); ++ dotProduct_L = ( dotProductVector_L[0] + dotProductVector_L[1] + dotProductVector_L[2] + dotProductVector_L[3] ); ++ dotProduct_VL = ( dotProductVector_VL[0] + dotProductVector_VL[1] + dotProductVector_VL[2] + dotProductVector_VL[3] ); ++ ++ for (int i = 0; i<(num_points % 4); ++i) ++ { ++ dotProduct_VE += (*_input) * (*_VE_code++) * (*_carrier); ++ dotProduct_E += (*_input) * (*_E_code++) * (*_carrier); ++ dotProduct_P += (*_input) * (*_P_code++) * (*_carrier); ++ dotProduct_L += (*_input) * (*_L_code++) * (*_carrier); ++ dotProduct_VL += (*_input++) * 
(*_VL_code++) * (*_carrier++); ++ } ++ ++ *VE_out = dotProduct_VE; ++ *E_out = dotProduct_E; ++ *P_out = dotProduct_P; ++ *L_out = dotProduct_L; ++ *VL_out = dotProduct_VL; ++} ++#endif /* LV_HAVE_AVX */ ++ ++#ifdef LV_HAVE_SSE3 ++#include ++ /*! ++ \brief Performs the carrier wipe-off mixing and the VE, Early, Prompt, Late and VL correlation ++ \param input The input signal input ++ \param carrier The carrier signal input ++ \param VE_code VE PRN code replica input ++ \param E_code Early PRN code replica input ++ \param P_code Early PRN code replica input ++ \param L_code Early PRN code replica input ++ \param VL_code VL PRN code replica input ++ \param VE_out VE correlation output ++ \param E_out Early correlation output ++ \param P_out Early correlation output ++ \param L_out Early correlation output ++ \param VL_out VL correlation output ++ \param num_points The number of complex values in vectors ++ */ ++static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_u_sse3(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* VE_code, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, const lv_32fc_t* VL_code, unsigned int num_points) ++{ ++ unsigned int number = 0; ++ const unsigned int halfPoints = num_points / 2; ++ ++ lv_32fc_t dotProduct_VE; ++ lv_32fc_t dotProduct_E; ++ lv_32fc_t dotProduct_P; ++ lv_32fc_t dotProduct_L; ++ lv_32fc_t dotProduct_VL; ++ ++ // Aux vars ++ __m128 x, y, yl, yh, z, tmp1, tmp2, z_VE, z_E, z_P, z_L, z_VL; ++ __m128 bb_signal_sample, bb_signal_sample_shuffled; ++ ++ z_VE = _mm_setzero_ps(); ++ z_E = _mm_setzero_ps(); ++ z_P = _mm_setzero_ps(); ++ z_L = _mm_setzero_ps(); ++ z_VL = _mm_setzero_ps(); ++ ++ //input and output vectors ++ const lv_32fc_t* _input = input; ++ const lv_32fc_t* _carrier = carrier; ++ const lv_32fc_t* _VE_code = VE_code; ++ const lv_32fc_t* _E_code = E_code; ++ const lv_32fc_t* 
_P_code = P_code; ++ const lv_32fc_t* _L_code = L_code; ++ const lv_32fc_t* _VL_code = VL_code; ++ ++ for(;number < halfPoints; number++) ++ { ++ // carrier wipe-off (vector point-to-point product) ++ x = _mm_loadu_ps((float*)_input); // Load the ar + ai, br + bi as ar,ai,br,bi ++ y = _mm_loadu_ps((float*)_carrier); // Load the cr + ci, dr + di as cr,ci,dr,di ++ ++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr ++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di ++ ++ tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ++ ++ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br ++ ++ tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++ ++ bb_signal_sample = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++ bb_signal_sample_shuffled = _mm_shuffle_ps(bb_signal_sample,bb_signal_sample,0xB1); // Re-arrange bb_signal_sample to be ai,ar,bi,br ++ ++ // correlation VE,E,P,L,VL (5x vector scalar product) ++ // VE ++ y = _mm_loadu_ps((float*)_VE_code); // Load the cr + ci, dr + di as cr,ci,dr,di ++ ++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr ++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di ++ ++ tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ++ tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++ ++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++ z_VE = _mm_add_ps(z_VE, z); // Add the complex multiplication results together ++ ++ // Early ++ y = _mm_loadu_ps((float*)_E_code); // Load the cr + ci, dr + di as cr,ci,dr,di ++ ++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr ++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di ++ ++ tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ++ tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++ ++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++ z_E = 
_mm_add_ps(z_E, z); // Add the complex multiplication results together ++ ++ // Prompt ++ y = _mm_loadu_ps((float*)_P_code); // Load the cr + ci, dr + di as cr,ci,dr,di ++ ++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr ++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di ++ ++ tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ++ tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++ ++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++ z_P = _mm_add_ps(z_P, z); // Add the complex multiplication results together ++ ++ // Late ++ y = _mm_loadu_ps((float*)_L_code); // Load the cr + ci, dr + di as cr,ci,dr,di ++ ++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr ++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di ++ ++ tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ++ tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++ ++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++ z_L = _mm_add_ps(z_L, z); // Add the complex multiplication results together ++ ++ // VL ++ //x = _mm_load_ps((float*)_input_BB); // Load the ar + ai, br + bi as ar,ai,br,bi ++ y = _mm_loadu_ps((float*)_VL_code); // Load the cr + ci, dr + di as cr,ci,dr,di ++ ++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr ++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di ++ ++ tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ++ tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++ ++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++ z_VL = _mm_add_ps(z_VL, z); // Add the complex multiplication results together ++ ++ /*pointer increment*/ ++ _carrier += 2; ++ _input += 2; ++ _VE_code += 2; ++ _E_code += 2; ++ _P_code += 2; ++ _L_code +=2; ++ _VL_code +=2; ++ } ++ ++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t 
dotProductVector_VE[2]; ++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_E[2]; ++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_P[2]; ++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_L[2]; ++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_VL[2]; ++ ++ _mm_storeu_ps((float*)dotProductVector_VE,z_VE); // Store the results back into the dot product vector ++ _mm_storeu_ps((float*)dotProductVector_E,z_E); // Store the results back into the dot product vector ++ _mm_storeu_ps((float*)dotProductVector_P,z_P); // Store the results back into the dot product vector ++ _mm_storeu_ps((float*)dotProductVector_L,z_L); // Store the results back into the dot product vector ++ _mm_storeu_ps((float*)dotProductVector_VL,z_VL); // Store the results back into the dot product vector ++ ++ dotProduct_VE = ( dotProductVector_VE[0] + dotProductVector_VE[1] ); ++ dotProduct_E = ( dotProductVector_E[0] + dotProductVector_E[1] ); ++ dotProduct_P = ( dotProductVector_P[0] + dotProductVector_P[1] ); ++ dotProduct_L = ( dotProductVector_L[0] + dotProductVector_L[1] ); ++ dotProduct_VL = ( dotProductVector_VL[0] + dotProductVector_VL[1] ); ++ ++ if((num_points % 2) != 0) ++ { ++ dotProduct_VE += (*_input) * (*_VE_code)*(*_carrier); ++ dotProduct_E += (*_input) * (*_E_code)*(*_carrier); ++ dotProduct_P += (*_input) * (*_P_code)*(*_carrier); ++ dotProduct_L += (*_input) * (*_L_code)*(*_carrier); ++ dotProduct_VL += (*_input) * (*_VL_code)*(*_carrier); ++ } ++ ++ *VE_out = dotProduct_VE; ++ *E_out = dotProduct_E; ++ *P_out = dotProduct_P; ++ *L_out = dotProduct_L; ++ *VL_out = dotProduct_VL; ++} ++#endif /* LV_HAVE_SSE3 */ ++ ++#ifdef LV_HAVE_GENERIC ++/*! 
++ \brief Performs the carrier wipe-off mixing and the VE, Early, Prompt, Late and VL correlation ++ \param input The input signal input ++ \param carrier The carrier signal input ++ \param VE_code VE PRN code replica input ++ \param E_code Early PRN code replica input ++ \param P_code Early PRN code replica input ++ \param L_code Early PRN code replica input ++ \param VL_code VL PRN code replica input ++ \param VE_out VE correlation output ++ \param E_out Early correlation output ++ \param P_out Early correlation output ++ \param L_out Early correlation output ++ \param VL_out VL correlation output ++ \param num_points The number of complex values in vectors ++ */ ++static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* VE_code, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, const lv_32fc_t* VL_code, unsigned int num_points) ++{ ++ lv_32fc_t bb_signal_sample; ++ ++ bb_signal_sample = lv_cmake(0, 0); ++ ++ *VE_out = 0; ++ *E_out = 0; ++ *P_out = 0; ++ *L_out = 0; ++ *VL_out = 0; ++ // perform Early, Prompt and Late correlation ++ for(int i=0; i < num_points; ++i) ++ { ++ //Perform the carrier wipe-off ++ bb_signal_sample = input[i] * carrier[i]; ++ // Now get early, late, and prompt values for each ++ *VE_out += bb_signal_sample * VE_code[i]; ++ *E_out += bb_signal_sample * E_code[i]; ++ *P_out += bb_signal_sample * P_code[i]; ++ *L_out += bb_signal_sample * L_code[i]; ++ *VL_out += bb_signal_sample * VL_code[i]; ++ } ++} ++ ++#endif /* LV_HAVE_GENERIC */ ++ ++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_u_H */ ++ ++ ++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_a_H ++#define INCLUDED_gnsssdr_volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_a_H ++ ++#include ++#include ++#include ++#include ++#include ++ ++#ifdef LV_HAVE_AVX 
++#include <immintrin.h>
++/*!
++ \brief Performs the carrier wipe-off mixing and the VE, Early, Prompt, Late and VL correlation
++
++ AVX implementation using aligned loads (_mm256_load_ps): four complex samples are processed per iteration and the remaining (num_points % 4) samples are handled by a scalar loop.
++ \param input The input signal input
++ \param carrier The carrier signal input
++ \param VE_code VE PRN code replica input
++ \param E_code Early PRN code replica input
++ \param P_code Prompt PRN code replica input
++ \param L_code Late PRN code replica input
++ \param VL_code VL PRN code replica input
++ \param VE_out VE correlation output
++ \param E_out Early correlation output
++ \param P_out Prompt correlation output
++ \param L_out Late correlation output
++ \param VL_out VL correlation output
++ \param num_points The number of complex values in vectors
++ */
++static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_a_avx(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* VE_code, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, const lv_32fc_t* VL_code, unsigned int num_points)
++{
++    unsigned int number = 0;
++    const unsigned int quarterPoints = num_points / 4; // one __m256 holds 4 complex (re,im) float pairs
++
++    lv_32fc_t dotProduct_VE;
++    lv_32fc_t dotProduct_E;
++    lv_32fc_t dotProduct_P;
++    lv_32fc_t dotProduct_L;
++    lv_32fc_t dotProduct_VL;
++
++    // Aux vars
++    __m256 x, y, tmp1, tmp2;
++    __m256 bb_signal_sample, bb_signal_sample_shuffled;
++
++    // One vector accumulator per correlator tap
++    __m256 z_VE = _mm256_setzero_ps();
++    __m256 z_E = _mm256_setzero_ps();
++    __m256 z_P = _mm256_setzero_ps();
++    __m256 z_L = _mm256_setzero_ps();
++    __m256 z_VL = _mm256_setzero_ps();
++
++    // input vectors
++    const lv_32fc_t* _input = input;
++    const lv_32fc_t* _carrier = carrier;
++    const lv_32fc_t* _VE_code = VE_code;
++    const lv_32fc_t* _E_code = E_code;
++    const lv_32fc_t* _P_code = P_code;
++    const lv_32fc_t* _L_code = L_code;
++    const lv_32fc_t* _VL_code = VL_code;
++
++    for(; number < quarterPoints; number++)
++        {
++            // carrier wipe-off (vector point-to-point complex product)
++            x = _mm256_load_ps((const float*)_input);   // ar,ai,br,bi,...
++            y = _mm256_load_ps((const float*)_carrier); // cr,ci,dr,di,...
++            tmp1 = _mm256_mul_ps(x, _mm256_moveldup_ps(y)); // ar*cr,ai*cr,br*dr,bi*dr
++            x = _mm256_shuffle_ps(x, x, 0xB1);              // swap re/im: ai,ar,bi,br
++            tmp2 = _mm256_mul_ps(x, _mm256_movehdup_ps(y)); // ai*ci,ar*ci,bi*di,br*di
++            bb_signal_sample = _mm256_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, ...
++            bb_signal_sample_shuffled = _mm256_shuffle_ps(bb_signal_sample, bb_signal_sample, 0xB1);
++
++            // correlation VE,E,P,L,VL (5x complex multiply-accumulate against the baseband sample)
++            // VE
++            y = _mm256_load_ps((const float*)_VE_code);
++            tmp1 = _mm256_mul_ps(bb_signal_sample, _mm256_moveldup_ps(y));
++            tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled, _mm256_movehdup_ps(y));
++            z_VE = _mm256_add_ps(z_VE, _mm256_addsub_ps(tmp1, tmp2));
++
++            // Early
++            y = _mm256_load_ps((const float*)_E_code);
++            tmp1 = _mm256_mul_ps(bb_signal_sample, _mm256_moveldup_ps(y));
++            tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled, _mm256_movehdup_ps(y));
++            z_E = _mm256_add_ps(z_E, _mm256_addsub_ps(tmp1, tmp2));
++
++            // Prompt
++            y = _mm256_load_ps((const float*)_P_code);
++            tmp1 = _mm256_mul_ps(bb_signal_sample, _mm256_moveldup_ps(y));
++            tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled, _mm256_movehdup_ps(y));
++            z_P = _mm256_add_ps(z_P, _mm256_addsub_ps(tmp1, tmp2));
++
++            // Late
++            y = _mm256_load_ps((const float*)_L_code);
++            tmp1 = _mm256_mul_ps(bb_signal_sample, _mm256_moveldup_ps(y));
++            tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled, _mm256_movehdup_ps(y));
++            z_L = _mm256_add_ps(z_L, _mm256_addsub_ps(tmp1, tmp2));
++
++            // VL
++            y = _mm256_load_ps((const float*)_VL_code);
++            tmp1 = _mm256_mul_ps(bb_signal_sample, _mm256_moveldup_ps(y));
++            tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled, _mm256_movehdup_ps(y));
++            z_VL = _mm256_add_ps(z_VL, _mm256_addsub_ps(tmp1, tmp2));
++
++            /*pointer increment*/
++            _carrier += 4;
++            _input += 4;
++            _VE_code += 4;
++            _E_code += 4;
++            _P_code += 4;
++            _L_code += 4;
++            _VL_code += 4;
++        }
++
++    __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_VE[4];
++    __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_E[4];
++    __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_P[4];
++    __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_L[4];
++    __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_VL[4];
++
++    // Spill the vector accumulators so the four partial sums can be added as scalars
++    _mm256_store_ps((float*)dotProductVector_VE, z_VE);
++    _mm256_store_ps((float*)dotProductVector_E, z_E);
++    _mm256_store_ps((float*)dotProductVector_P, z_P);
++    _mm256_store_ps((float*)dotProductVector_L, z_L);
++    _mm256_store_ps((float*)dotProductVector_VL, z_VL);
++
++    dotProduct_VE = ( dotProductVector_VE[0] + dotProductVector_VE[1] + dotProductVector_VE[2] + dotProductVector_VE[3] );
++    dotProduct_E = ( dotProductVector_E[0] + dotProductVector_E[1] + dotProductVector_E[2] + dotProductVector_E[3] );
++    dotProduct_P = ( dotProductVector_P[0] + dotProductVector_P[1] + dotProductVector_P[2] + dotProductVector_P[3] );
++    dotProduct_L = ( dotProductVector_L[0] + dotProductVector_L[1] + dotProductVector_L[2] + dotProductVector_L[3] );
++    dotProduct_VL = ( dotProductVector_VL[0] + dotProductVector_VL[1] + dotProductVector_VL[2] + dotProductVector_VL[3] );
++
++    // Scalar tail for the samples that did not fill a whole register
++    for(unsigned int i = 0; i < (num_points % 4); ++i)
++        {
++            dotProduct_VE += (*_input) * (*_VE_code++) * (*_carrier);
++            dotProduct_E += (*_input) * (*_E_code++) * (*_carrier);
++            dotProduct_P += (*_input) * (*_P_code++) * (*_carrier);
++            dotProduct_L += (*_input) * (*_L_code++) * (*_carrier);
++            dotProduct_VL += (*_input++) * (*_VL_code++) * (*_carrier++);
++        }
++
++    *VE_out = dotProduct_VE;
++    *E_out = dotProduct_E;
++    *P_out = dotProduct_P;
++    *L_out = dotProduct_L;
++    *VL_out = dotProduct_VL;
++}
++#endif /* LV_HAVE_AVX */
++
++#ifdef LV_HAVE_SSE3
++#include <pmmintrin.h>
++/*!
++ \brief Performs the carrier wipe-off mixing and the VE, Early, Prompt, Late and VL correlation
++
++ SSE3 implementation using aligned loads (_mm_load_ps): two complex samples are processed per iteration and an odd trailing sample is handled as a scalar.
++ \param input The input signal input
++ \param carrier The carrier signal input
++ \param VE_code VE PRN code replica input
++ \param E_code Early PRN code replica input
++ \param P_code Prompt PRN code replica input
++ \param L_code Late PRN code replica input
++ \param VL_code VL PRN code replica input
++ \param VE_out VE correlation output
++ \param E_out Early correlation output
++ \param P_out Prompt correlation output
++ \param L_out Late correlation output
++ \param VL_out VL correlation output
++ \param num_points The number of complex values in vectors
++ */
++static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_a_sse3(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* VE_code, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, const lv_32fc_t* VL_code, unsigned int num_points)
++{
++    unsigned int number = 0;
++    const unsigned int halfPoints = num_points / 2; // one __m128 holds 2 complex (re,im) float pairs
++
++    lv_32fc_t dotProduct_VE;
++    lv_32fc_t dotProduct_E;
++    lv_32fc_t dotProduct_P;
++    lv_32fc_t dotProduct_L;
++    lv_32fc_t dotProduct_VL;
++
++    // Aux vars
++    __m128 x, y, tmp1, tmp2;
++    __m128 bb_signal_sample, bb_signal_sample_shuffled;
++
++    // One vector accumulator per correlator tap
++    __m128 z_VE = _mm_setzero_ps();
++    __m128 z_E = _mm_setzero_ps();
++    __m128 z_P = _mm_setzero_ps();
++    __m128 z_L = _mm_setzero_ps();
++    __m128 z_VL = _mm_setzero_ps();
++
++    // input vectors
++    const lv_32fc_t* _input = input;
++    const lv_32fc_t* _carrier = carrier;
++    const lv_32fc_t* _VE_code = VE_code;
++    const lv_32fc_t* _E_code = E_code;
++    const lv_32fc_t* _P_code = P_code;
++    const lv_32fc_t* _L_code = L_code;
++    const lv_32fc_t* _VL_code = VL_code;
++
++    for(; number < halfPoints; number++)
++        {
++            // carrier wipe-off (vector point-to-point complex product)
++            x = _mm_load_ps((const float*)_input);   // ar,ai,br,bi
++            y = _mm_load_ps((const float*)_carrier); // cr,ci,dr,di
++            tmp1 = _mm_mul_ps(x, _mm_moveldup_ps(y)); // ar*cr,ai*cr,br*dr,bi*dr
++            x = _mm_shuffle_ps(x, x, 0xB1);           // swap re/im: ai,ar,bi,br
++            tmp2 = _mm_mul_ps(x, _mm_movehdup_ps(y)); // ai*ci,ar*ci,bi*di,br*di
++            bb_signal_sample = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
++            bb_signal_sample_shuffled = _mm_shuffle_ps(bb_signal_sample, bb_signal_sample, 0xB1);
++
++            // correlation VE,E,P,L,VL (5x complex multiply-accumulate against the baseband sample)
++            // VE
++            y = _mm_load_ps((const float*)_VE_code);
++            tmp1 = _mm_mul_ps(bb_signal_sample, _mm_moveldup_ps(y));
++            tmp2 = _mm_mul_ps(bb_signal_sample_shuffled, _mm_movehdup_ps(y));
++            z_VE = _mm_add_ps(z_VE, _mm_addsub_ps(tmp1, tmp2));
++
++            // Early
++            y = _mm_load_ps((const float*)_E_code);
++            tmp1 = _mm_mul_ps(bb_signal_sample, _mm_moveldup_ps(y));
++            tmp2 = _mm_mul_ps(bb_signal_sample_shuffled, _mm_movehdup_ps(y));
++            z_E = _mm_add_ps(z_E, _mm_addsub_ps(tmp1, tmp2));
++
++            // Prompt
++            y = _mm_load_ps((const float*)_P_code);
++            tmp1 = _mm_mul_ps(bb_signal_sample, _mm_moveldup_ps(y));
++            tmp2 = _mm_mul_ps(bb_signal_sample_shuffled, _mm_movehdup_ps(y));
++            z_P = _mm_add_ps(z_P, _mm_addsub_ps(tmp1, tmp2));
++
++            // Late
++            y = _mm_load_ps((const float*)_L_code);
++            tmp1 = _mm_mul_ps(bb_signal_sample, _mm_moveldup_ps(y));
++            tmp2 = _mm_mul_ps(bb_signal_sample_shuffled, _mm_movehdup_ps(y));
++            z_L = _mm_add_ps(z_L, _mm_addsub_ps(tmp1, tmp2));
++
++            // VL
++            y = _mm_load_ps((const float*)_VL_code);
++            tmp1 = _mm_mul_ps(bb_signal_sample, _mm_moveldup_ps(y));
++            tmp2 = _mm_mul_ps(bb_signal_sample_shuffled, _mm_movehdup_ps(y));
++            z_VL = _mm_add_ps(z_VL, _mm_addsub_ps(tmp1, tmp2));
++
++            /*pointer increment*/
++            _carrier += 2;
++            _input += 2;
++            _VE_code += 2;
++            _E_code += 2;
++            _P_code += 2;
++            _L_code += 2;
++            _VL_code += 2;
++        }
++
++    __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_VE[2];
++    __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_E[2];
++    __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_P[2];
++    __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_L[2];
++    __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_VL[2];
++
++    // Spill the vector accumulators so the two partial sums can be added as scalars
++    _mm_store_ps((float*)dotProductVector_VE, z_VE);
++    _mm_store_ps((float*)dotProductVector_E, z_E);
++    _mm_store_ps((float*)dotProductVector_P, z_P);
++    _mm_store_ps((float*)dotProductVector_L, z_L);
++    _mm_store_ps((float*)dotProductVector_VL, z_VL);
++
++    dotProduct_VE = ( dotProductVector_VE[0] + dotProductVector_VE[1] );
++    dotProduct_E = ( dotProductVector_E[0] + dotProductVector_E[1] );
++    dotProduct_P = ( dotProductVector_P[0] + dotProductVector_P[1] );
++    dotProduct_L = ( dotProductVector_L[0] + dotProductVector_L[1] );
++    dotProduct_VL = ( dotProductVector_VL[0] + dotProductVector_VL[1] );
++
++    // An odd num_points leaves exactly one sample for scalar processing
++    if((num_points % 2) != 0)
++        {
++            dotProduct_VE += (*_input) * (*_VE_code)*(*_carrier);
++            dotProduct_E += (*_input) * (*_E_code)*(*_carrier);
++            dotProduct_P += (*_input) * (*_P_code)*(*_carrier);
++            dotProduct_L += (*_input) * (*_L_code)*(*_carrier);
++            dotProduct_VL += (*_input) * (*_VL_code)*(*_carrier);
++        }
++
++    *VE_out = dotProduct_VE;
++    *E_out = dotProduct_E;
++    *P_out = dotProduct_P;
++    *L_out = dotProduct_L;
++    *VL_out = dotProduct_VL;
++}
++#endif /* LV_HAVE_SSE3 */
++
++#ifdef LV_HAVE_GENERIC
++/*!
++ \brief Performs the carrier wipe-off mixing and the VE, Early, Prompt, Late and VL correlation
++ \param input The input signal input
++ \param carrier The carrier signal input
++ \param VE_code VE PRN code replica input
++ \param E_code Early PRN code replica input
++ \param P_code Prompt PRN code replica input
++ \param L_code Late PRN code replica input
++ \param VL_code VL PRN code replica input
++ \param VE_out VE correlation output
++ \param E_out Early correlation output
++ \param P_out Prompt correlation output
++ \param L_out Late correlation output
++ \param VL_out VL correlation output
++ \param num_points The number of complex values in vectors
++ */
++static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_a_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* VE_code, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, const lv_32fc_t* VL_code, unsigned int num_points)
++{
++    lv_32fc_t bb_signal_sample = lv_cmake(0, 0);
++
++    // The outputs are used directly as accumulators, so clear them first
++    *VE_out = 0;
++    *E_out = 0;
++    *P_out = 0;
++    *L_out = 0;
++    *VL_out = 0;
++
++    // Index is unsigned to match num_points (avoids a signed/unsigned comparison)
++    for(unsigned int i = 0; i < num_points; ++i)
++        {
++            // Carrier wipe-off: mix the input sample with the local carrier
++            bb_signal_sample = input[i] * carrier[i];
++            // Accumulate the baseband sample against each of the five code replicas
++            *VE_out += bb_signal_sample * VE_code[i];
++            *E_out += bb_signal_sample * E_code[i];
++            *P_out += bb_signal_sample * P_code[i];
++            *L_out += bb_signal_sample * L_code[i];
++            *VL_out += bb_signal_sample * VL_code[i];
++        }
++}
++#endif /* LV_HAVE_GENERIC */
++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_a_H */
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_64f_accumulator_64f.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_64f_accumulator_64f.h
+---
/Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_64f_accumulator_64f.h 1970-01-01 01:00:00.000000000 +0100
++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_64f_accumulator_64f.h 2014-10-15 01:55:08.000000000 +0200
+@@ -0,0 +1,243 @@
++/*!
++ * \file volk_gnsssdr_64f_accumulator_64f.h
++ * \brief Volk protokernel: 64 bits (double) scalar accumulator
++ * \authors <ul>
++ *          <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
++ *          </ul>
++ *
++ * Volk protokernel that implements an accumulator of double values
++ *
++ * -------------------------------------------------------------------------
++ *
++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
++ *
++ * GNSS-SDR is a software defined Global Navigation
++ * Satellite Systems receiver
++ *
++ * This file is part of GNSS-SDR.
++ *
++ * GNSS-SDR is free software: you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation, either version 3 of the License, or
++ * (at your option) any later version.
++ *
++ * GNSS-SDR is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
++ *
++ * -------------------------------------------------------------------------
++ */
++
++#ifndef INCLUDED_volk_gnsssdr_64f_accumulator_64f_u_H
++#define INCLUDED_volk_gnsssdr_64f_accumulator_64f_u_H
++
++#include
++#include
++#include
++
++#ifdef LV_HAVE_AVX
++#include <immintrin.h>
++/*!
++ \brief Accumulates the values in the input buffer
++
++ AVX implementation using unaligned loads (_mm256_loadu_pd): four doubles are added per iteration and the remaining (num_points % 4) values are added by a scalar loop.
++ \param result The accumulated result
++ \param inputBuffer The buffer of data to be accumulated
++ \param num_points The number of values in inputBuffer to be accumulated
++ */
++static inline void volk_gnsssdr_64f_accumulator_64f_u_avx(double* result,const double* inputBuffer, unsigned int num_points){
++    double returnValue = 0;
++    const unsigned int sse_iters = num_points / 4; // four doubles per __m256d register
++
++    const double* aPtr = inputBuffer;
++
++    __VOLK_ATTR_ALIGNED(32) double tempBuffer[4];
++    __m256d accumulator = _mm256_setzero_pd();
++
++    for(unsigned int number = 0; number < sse_iters; number++)
++        {
++            accumulator = _mm256_add_pd(accumulator, _mm256_loadu_pd(aPtr));
++            aPtr += 4;
++        }
++
++    // Spill the four partial sums and reduce them in lane order
++    _mm256_storeu_pd((double*)tempBuffer,accumulator);
++
++    for(unsigned int i = 0; i<4; ++i){
++        returnValue += tempBuffer[i];
++    }
++
++    // Scalar tail; index is unsigned to match num_points
++    for(unsigned int i = 0; i<(num_points % 4); ++i){
++        returnValue += (*aPtr++);
++    }
++
++    *result = returnValue;
++}
++#endif /* LV_HAVE_AVX */
++
++#ifdef LV_HAVE_SSE3
++#include <pmmintrin.h>
++/*!
++ \brief Accumulates the values in the input buffer
++
++ SSE implementation using unaligned loads (_mm_loadu_pd): two doubles are added per iteration and an odd trailing value is added as a scalar.
++ \param result The accumulated result
++ \param inputBuffer The buffer of data to be accumulated
++ \param num_points The number of values in inputBuffer to be accumulated
++ */
++static inline void volk_gnsssdr_64f_accumulator_64f_u_sse3(double* result,const double* inputBuffer, unsigned int num_points){
++    double returnValue = 0;
++    const unsigned int sse_iters = num_points / 2; // two doubles per __m128d register
++
++    const double* aPtr = inputBuffer;
++
++    __VOLK_ATTR_ALIGNED(16) double tempBuffer[2];
++    __m128d accumulator = _mm_setzero_pd();
++
++    for(unsigned int number = 0; number < sse_iters; number++)
++        {
++            accumulator = _mm_add_pd(accumulator, _mm_loadu_pd(aPtr));
++            aPtr += 2;
++        }
++
++    // Spill the two partial sums and reduce them in lane order
++    _mm_storeu_pd((double*)tempBuffer,accumulator);
++
++    for(unsigned int i = 0; i<2; ++i){
++        returnValue += tempBuffer[i];
++    }
++
++    // Scalar tail; index is unsigned to match num_points
++    for(unsigned int i = 0; i<(num_points % 2); ++i){
++        returnValue += (*aPtr++);
++    }
++
++    *result = returnValue;
++}
++#endif /* LV_HAVE_SSE3 */
++
++#ifdef LV_HAVE_GENERIC
++/*!
++ \brief Accumulates the values in the input buffer
++ \param result The accumulated result
++ \param inputBuffer The buffer of data to be accumulated
++ \param num_points The number of values in inputBuffer to be accumulated
++ */
++static inline void volk_gnsssdr_64f_accumulator_64f_generic(double* result,const double* inputBuffer, unsigned int num_points){
++    const double* aPtr = inputBuffer;
++    double returnValue = 0;
++
++    // Plain left-to-right summation
++    for(unsigned int number = 0;number < num_points; number++){
++        returnValue += (*aPtr++);
++    }
++    *result = returnValue;
++}
++#endif /* LV_HAVE_GENERIC */
++
++#endif /* INCLUDED_volk_gnsssdr_64f_accumulator_64f_u_H */
++
++
++#ifndef INCLUDED_volk_gnsssdr_64f_accumulator_64f_a_H
++#define INCLUDED_volk_gnsssdr_64f_accumulator_64f_a_H
++
++#include
++#include
++#include
++
++#ifdef LV_HAVE_AVX
++#include <immintrin.h>
++/*!
++ \brief Accumulates the values in the input buffer
++
++ AVX implementation using aligned loads (_mm256_load_pd): four doubles are added per iteration and the remaining (num_points % 4) values are added by a scalar loop.
++ \param result The accumulated result
++ \param inputBuffer The buffer of data to be accumulated
++ \param num_points The number of values in inputBuffer to be accumulated
++ */
++static inline void volk_gnsssdr_64f_accumulator_64f_a_avx(double* result,const double* inputBuffer, unsigned int num_points){
++    double returnValue = 0;
++    const unsigned int sse_iters = num_points / 4; // four doubles per __m256d register
++
++    const double* aPtr = inputBuffer;
++
++    __VOLK_ATTR_ALIGNED(32) double tempBuffer[4];
++    __m256d accumulator = _mm256_setzero_pd();
++
++    for(unsigned int number = 0; number < sse_iters; number++)
++        {
++            accumulator = _mm256_add_pd(accumulator, _mm256_load_pd(aPtr));
++            aPtr += 4;
++        }
++
++    // Spill the four partial sums and reduce them in lane order
++    _mm256_store_pd((double*)tempBuffer,accumulator);
++
++    for(unsigned int i = 0; i<4; ++i){
++        returnValue += tempBuffer[i];
++    }
++
++    // Scalar tail; index is unsigned to match num_points
++    for(unsigned int i = 0; i<(num_points % 4); ++i){
++        returnValue += (*aPtr++);
++    }
++
++    *result = returnValue;
++}
++#endif /* LV_HAVE_AVX */
++
++#ifdef LV_HAVE_SSE3
++#include <pmmintrin.h>
++/*!
++ \brief Accumulates the values in the input buffer
++
++ Sums the buffer two doubles at a time with aligned loads (_mm_load_pd), then adds the two partial sums and any odd trailing value.
++ \param result The accumulated result
++ \param inputBuffer The buffer of data to be accumulated
++ \param num_points The number of values in inputBuffer to be accumulated
++ */
++static inline void volk_gnsssdr_64f_accumulator_64f_a_sse3(double* result,const double* inputBuffer, unsigned int num_points){
++    const unsigned int pairCount = num_points / 2;
++    const unsigned int leftover = num_points % 2;
++    const double* cursor = inputBuffer;
++    __VOLK_ATTR_ALIGNED(16) double lanes[2];
++    __m128d partialSums = _mm_setzero_pd();
++    double total = 0;
++    unsigned int k;
++
++    // Vector part: one aligned 128-bit load per pair of doubles
++    for(k = 0; k < pairCount; k++)
++        {
++            partialSums = _mm_add_pd(partialSums, _mm_load_pd(cursor));
++            cursor += 2;
++        }
++
++    // Horizontal reduction, lane 0 first
++    _mm_store_pd((double*)lanes, partialSums);
++    total += lanes[0];
++    total += lanes[1];
++
++    // At most one value is left when num_points is odd
++    for(k = 0; k < leftover; k++)
++        {
++            total += (*cursor++);
++        }
++
++    *result = total;
++}
++#endif /* LV_HAVE_SSE3 */
++
++#ifdef LV_HAVE_GENERIC
++/*!
++ \brief Accumulates the values in the input buffer
++
++ Portable implementation: plain left-to-right summation of num_points doubles.
++ \param result The accumulated result
++ \param inputBuffer The buffer of data to be accumulated
++ \param num_points The number of values in inputBuffer to be accumulated
++ */
++static inline void volk_gnsssdr_64f_accumulator_64f_a_generic(double* result,const double* inputBuffer, unsigned int num_points){
++    double total = 0;
++    unsigned int k;
++
++    for(k = 0; k < num_points; k++)
++        {
++            total += inputBuffer[k];
++        }
++
++    *result = total;
++}
++#endif /* LV_HAVE_GENERIC */
++#endif /* INCLUDED_volk_gnsssdr_64f_accumulator_64f_a_H */
+\ No newline at end of file
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_accumulator_s8i.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8i_accumulator_s8i.h
+--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_accumulator_s8i.h 1970-01-01 01:00:00.000000000 +0100
++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8i_accumulator_s8i.h 2014-10-15 01:55:08.000000000 +0200
+@@ -0,0 +1,183 @@
++/*!
++ * \file volk_gnsssdr_8i_accumulator_s8i.h
++ * \brief Volk protokernel: 8 bits (char) scalar accumulator
++ * \authors
++ *          <ul>
++ *          <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
++ *          </ul>
++ *
++ * Volk protokernel that implements an accumulator of char values
++ *
++ * -------------------------------------------------------------------------
++ *
++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
++ *
++ * GNSS-SDR is a software defined Global Navigation
++ * Satellite Systems receiver
++ *
++ * This file is part of GNSS-SDR.
++ *
++ * GNSS-SDR is free software: you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation, either version 3 of the License, or
++ * (at your option) any later version.
++ *
++ * GNSS-SDR is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
++ *
++ * -------------------------------------------------------------------------
++ */
++
++#ifndef INCLUDED_volk_gnsssdr_8i_accumulator_s8i_u_H
++#define INCLUDED_volk_gnsssdr_8i_accumulator_s8i_u_H
++
++#include
++#include
++#include
++
++#ifdef LV_HAVE_SSE3
++#include <pmmintrin.h>
++/*!
++ \brief Accumulates the values in the input buffer
++
++ SSE3 implementation using unaligned loads (_mm_lddqu_si128): sixteen chars are added per iteration with 8-bit lane arithmetic, and the remaining (num_points % 16) values are added by a scalar loop. The sum is kept in a char.
++ \param result The accumulated result
++ \param inputBuffer The buffer of data to be accumulated
++ \param num_points The number of values in inputBuffer to be accumulated
++ */
++static inline void volk_gnsssdr_8i_accumulator_s8i_u_sse3(char* result, const char* inputBuffer, unsigned int num_points){
++    char returnValue = 0;
++    const unsigned int sse_iters = num_points / 16; // sixteen chars per __m128i register
++
++    const char* aPtr = inputBuffer;
++
++    __VOLK_ATTR_ALIGNED(16) char tempBuffer[16];
++    __m128i accumulator = _mm_setzero_si128();
++
++    for(unsigned int number = 0; number < sse_iters; number++){
++        accumulator = _mm_add_epi8(accumulator, _mm_lddqu_si128((const __m128i*)aPtr));
++        aPtr += 16;
++    }
++    // Spill the sixteen partial sums and reduce them in lane order
++    _mm_storeu_si128((__m128i*)tempBuffer,accumulator);
++
++    for(unsigned int i = 0; i<16; ++i){
++        returnValue += tempBuffer[i];
++    }
++
++    // Scalar tail; index is unsigned to match num_points
++    for(unsigned int i = 0; i<(num_points % 16); ++i){
++        returnValue += (*aPtr++);
++    }
++
++    *result = returnValue;
++}
++#endif /* LV_HAVE_SSE3 */
++
++#ifdef LV_HAVE_GENERIC
++/*!
++ \brief Accumulates the values in the input buffer
++ \param result The accumulated result
++ \param inputBuffer The buffer of data to be accumulated
++ \param num_points The number of values in inputBuffer to be accumulated
++ */
++static inline void volk_gnsssdr_8i_accumulator_s8i_generic(char* result, const char* inputBuffer, unsigned int num_points){
++    const char* aPtr = inputBuffer;
++    char returnValue = 0;
++
++    // Plain left-to-right summation; the running sum is kept in a char
++    for(unsigned int number = 0;number < num_points; number++){
++        returnValue += (*aPtr++);
++    }
++    *result = returnValue;
++}
++#endif /* LV_HAVE_GENERIC */
++
++#endif /* INCLUDED_volk_gnsssdr_8i_accumulator_s8i_u_H */
++
++
++#ifndef INCLUDED_volk_gnsssdr_8i_accumulator_s8i_a_H
++#define INCLUDED_volk_gnsssdr_8i_accumulator_s8i_a_H
++
++#include
++#include
++#include
++
++#ifdef LV_HAVE_SSE3
++#include <pmmintrin.h>
++/*!
++ \brief Accumulates the values in the input buffer
++
++ Sums the buffer sixteen chars at a time with aligned loads (_mm_load_si128) and 8-bit lane additions, then adds the sixteen partial sums and the (num_points % 16) trailing values. The sum is kept in a char.
++ \param result The accumulated result
++ \param inputBuffer The buffer of data to be accumulated
++ \param num_points The number of values in inputBuffer to be accumulated
++ */
++static inline void volk_gnsssdr_8i_accumulator_s8i_a_sse3(char* result, const char* inputBuffer, unsigned int num_points){
++    const unsigned int blockCount = num_points / 16;
++    const unsigned int leftover = num_points % 16;
++    const char* cursor = inputBuffer;
++    __VOLK_ATTR_ALIGNED(16) char lanes[16];
++    __m128i partialSums = _mm_setzero_si128();
++    char total = 0;
++    unsigned int k;
++
++    // Vector part: one aligned 128-bit load per block of sixteen chars
++    for(k = 0; k < blockCount; k++)
++        {
++            partialSums = _mm_add_epi8(partialSums, _mm_load_si128((__m128i*)cursor));
++            cursor += 16;
++        }
++
++    // Horizontal reduction in lane order
++    _mm_store_si128((__m128i*)lanes, partialSums);
++    for(k = 0; k < 16; k++)
++        {
++            total += lanes[k];
++        }
++
++    // Scalar tail
++    for(k = 0; k < leftover; k++)
++        {
++            total += (*cursor++);
++        }
++
++    *result = total;
++}
++#endif /* LV_HAVE_SSE3 */
++
++#ifdef LV_HAVE_GENERIC
++/*!
++ \brief Accumulates the values in the input buffer
++
++ Portable implementation: plain left-to-right summation, with the running sum kept in a char.
++ \param result The accumulated result
++ \param inputBuffer The buffer of data to be accumulated
++ \param num_points The number of values in inputBuffer to be accumulated
++ */
++static inline void volk_gnsssdr_8i_accumulator_s8i_a_generic(char* result, const char* inputBuffer, unsigned int num_points){
++    char total = 0;
++    unsigned int k;
++
++    for(k = 0; k < num_points; k++)
++        {
++            total += inputBuffer[k];
++        }
++
++    *result = total;
++}
++#endif /* LV_HAVE_GENERIC */
++
++#ifdef LV_HAVE_ORC
++/*!
++ \brief Accumulates the values in the input buffer
++
++ Thin wrapper around the externally defined ORC routine volk_gnsssdr_8i_accumulator_s8i_a_orc_impl, which writes a 16-bit result; one byte of that result is returned.
++ \param result The accumulated result
++ \param inputBuffer The buffer of data to be accumulated
++ \param num_points The number of values in inputBuffer to be accumulated
++ */
++extern void volk_gnsssdr_8i_accumulator_s8i_a_orc_impl(short* result, const char* inputBuffer, unsigned int num_points);
++// NOTE(review): this wrapper is named "_u_orc" although it sits in the aligned ("_a_H") section and calls the "_a_orc_impl" routine - confirm the intended name.
++static inline void volk_gnsssdr_8i_accumulator_s8i_u_orc(char* result, const char* inputBuffer, unsigned int num_points){
++
++    short res = 0;
++    // resc is advanced to the byte at offset 1 of res.
++    // NOTE(review): on a little-endian target that is the high byte of the 16-bit value - confirm against the ORC implementation that this is the byte holding the 8-bit sum.
++    char* resc = (char*)&res;
++    resc++;
++
++    volk_gnsssdr_8i_accumulator_s8i_a_orc_impl(&res, inputBuffer, num_points);
++
++    *result = *resc;
++}
++#endif /* LV_HAVE_ORC */
++
++#endif /* INCLUDED_volk_gnsssdr_8i_accumulator_s8i_a_H */
++
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_index_max_16u.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8i_index_max_16u.h
+--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_index_max_16u.h 1970-01-01 01:00:00.000000000 +0100
++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8i_index_max_16u.h 2014-10-15 01:55:08.000000000 +0200
+@@ -0,0 +1,493 @@
++/*!
++ * \file volk_gnsssdr_8i_index_max_16u.h
++ * \brief Volk protokernel: calculates the index of the maximum value in a group of 8 bits (char) scalars
++ * \authors
++ *          <ul>
++ *          <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
++ *          </ul>
++ *
++ * Volk protokernel that returns the index of the maximum value of a group of 8 bits (char) scalars
++ *
++ * -------------------------------------------------------------------------
++ *
++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
++ *
++ * GNSS-SDR is a software defined Global Navigation
++ * Satellite Systems receiver
++ *
++ * This file is part of GNSS-SDR.
++ *
++ * GNSS-SDR is free software: you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation, either version 3 of the License, or
++ * (at your option) any later version.
++ *
++ * GNSS-SDR is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
++ *
++ * -------------------------------------------------------------------------
++ */
++
++#ifndef INCLUDED_volk_gnsssdr_8i_index_max_16u_u_H
++#define INCLUDED_volk_gnsssdr_8i_index_max_16u_u_H
++
++#include
++#include
++#include
++
++#ifdef LV_HAVE_AVX
++#include "immintrin.h"
++/*!
++ \brief Returns the index of the max value in src0 (AVX kernel, unaligned loads)
++ \param target Output: target[0] receives the index of the first occurrence of the max value in src0; not written when num_points is 0
++ \param src0 The buffer of data to be analysed
++ \param num_points The number of values in src0 to be analysed
++ */
++static inline void volk_gnsssdr_8i_index_max_16u_u_avx(unsigned int* target, const char* src0, unsigned int num_points) {
++    if(num_points > 0){
++        const unsigned int sse_iters = num_points / 32; // number of full 32-value blocks
++
++        char* basePtr = (char*)src0;
++        char* inputPtr = (char*)src0;
++        char max = src0[0];
++        unsigned int index = 0;
++        __VOLK_ATTR_ALIGNED(32) char currentValuesBuffer[32];
++        __m256i ones, compareResults, currentValues;
++        __m128i compareResultslo, compareResultshi, maxValues, lo, hi;
++
++        ones = _mm256_set1_epi8(0xFF);
++        maxValues = _mm_set1_epi8(max);
++
++        for(unsigned int number = 0; number < sse_iters; number++)
++        {
++            currentValues = _mm256_lddqu_si256((__m256i*)inputPtr);
++
++            lo = _mm256_castsi256_si128(currentValues); // the 8-bit compare is done on the two 128-bit halves
++            hi = _mm256_extractf128_si256(currentValues,1);
++
++            compareResultslo = _mm_cmpgt_epi8(maxValues, lo);
++            compareResultshi = _mm_cmpgt_epi8(maxValues, hi);
++
++            // _mm256_set_m128i(compareResultshi, compareResultslo) is not defined in some versions of immintrin.h, so the halves are recombined by hand
++            compareResults = _mm256_insertf128_si256(_mm256_castsi128_si256(compareResultslo),(compareResultshi),1);
++
++            if (!_mm256_testc_si256(compareResults, ones)) // some value in this block is >= max
++            {
++                _mm256_storeu_si256((__m256i*)currentValuesBuffer, currentValues);
++
++                for(int i = 0; i < 32; i++)
++                {
++                    if(currentValuesBuffer[i] > max) // strict compare keeps the first occurrence
++                    {
++                        index = inputPtr - basePtr + i;
++                        max = currentValuesBuffer[i];
++                    }
++                }
++                maxValues = _mm_set1_epi8(max);
++            }
++
++            inputPtr += 32;
++        }
++
++        for(unsigned int i = 0; i<(num_points % 32); ++i) // tail: inputPtr points just past the last full block
++        {
++            if(inputPtr[i] > max) // was src0[i], which re-read the start of the buffer and skipped the tail
++            {
++                index = inputPtr - basePtr + i;
++                max = inputPtr[i];
++            }
++        }
++        target[0] = index;
++    }
++}
++
++#endif /*LV_HAVE_AVX*/
++
++#ifdef LV_HAVE_SSE4_1
++#include <smmintrin.h>
++/*!
++ \brief Returns the index of the max value in src0 (SSE4.1 kernel, unaligned loads)
++ \param target Output: target[0] receives the index of the first occurrence of the max value in src0; not written when num_points is 0
++ \param src0 The buffer of data to be analysed
++ \param num_points The number of values in src0 to be analysed
++ */
++static inline void volk_gnsssdr_8i_index_max_16u_u_sse4_1(unsigned int* target, const char* src0, unsigned int num_points) {
++    if(num_points > 0){
++        const unsigned int sse_iters = num_points / 16; // number of full 16-value blocks
++
++        char* basePtr = (char*)src0;
++        char* inputPtr = (char*)src0;
++        char max = src0[0];
++        unsigned int index = 0;
++        __VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16];
++        __m128i maxValues, compareResults, currentValues;
++
++        maxValues = _mm_set1_epi8(max);
++
++        for(unsigned int number = 0; number < sse_iters; number++)
++        {
++            currentValues = _mm_lddqu_si128((__m128i*)inputPtr);
++
++            compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
++
++            if (!_mm_test_all_ones(compareResults)) // some value in this block is >= max
++            {
++                _mm_storeu_si128((__m128i*)currentValuesBuffer, currentValues);
++
++                for(int i = 0; i < 16; i++)
++                {
++                    if(currentValuesBuffer[i] > max) // strict compare keeps the first occurrence
++                    {
++                        index = inputPtr - basePtr + i;
++                        max = currentValuesBuffer[i];
++                    }
++                }
++                maxValues = _mm_set1_epi8(max);
++            }
++
++            inputPtr += 16;
++        }
++
++        for(unsigned int i = 0; i<(num_points % 16); ++i) // tail: inputPtr points just past the last full block
++        {
++            if(inputPtr[i] > max) // was src0[i], which re-read the start of the buffer and skipped the tail
++            {
++                index = inputPtr - basePtr + i;
++                max = inputPtr[i];
++            }
++        }
++        target[0] = index;
++    }
++}
++
++#endif /*LV_HAVE_SSE4_1*/
++
++#ifdef LV_HAVE_SSE2
++#include <emmintrin.h>
++/*!
++ \brief Returns the index of the max value in src0 (SSE2 kernel, unaligned loads)
++ \param target Output: target[0] receives the index of the first occurrence of the max value in src0; not written when num_points is 0
++ \param src0 The buffer of data to be analysed
++ \param num_points The number of values in src0 to be analysed
++ */
++static inline void volk_gnsssdr_8i_index_max_16u_u_sse2(unsigned int* target, const char* src0, unsigned int num_points) {
++    if(num_points > 0){
++        const unsigned int sse_iters = num_points / 16; // number of full 16-value blocks
++
++        char* basePtr = (char*)src0;
++        char* inputPtr = (char*)src0;
++        char max = src0[0];
++        unsigned int index = 0;
++        unsigned short mask;
++        __VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16];
++        __m128i maxValues, compareResults, currentValues;
++
++        maxValues = _mm_set1_epi8(max);
++
++        for(unsigned int number = 0; number < sse_iters; number++)
++        {
++            currentValues = _mm_loadu_si128((__m128i*)inputPtr);
++            compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
++            mask = _mm_movemask_epi8(compareResults); // bit k set <=> max > value k
++
++            if (mask != 0xFFFF) // some value in this block is >= max
++            {
++                _mm_storeu_si128((__m128i*)currentValuesBuffer, currentValues);
++                mask = ~mask; // now bit k set <=> value k is a candidate
++                int i = 0;
++                while (mask > 0)
++                {
++                    if ((mask & 1) == 1)
++                    {
++                        if(currentValuesBuffer[i] > max) // strict compare keeps the first occurrence
++                        {
++                            index = inputPtr - basePtr + i;
++                            max = currentValuesBuffer[i];
++                        }
++                    }
++                    i++;
++                    mask >>= 1;
++                }
++                maxValues = _mm_set1_epi8(max);
++            }
++            inputPtr += 16;
++        }
++
++        for(unsigned int i = 0; i<(num_points % 16); ++i) // tail: inputPtr points just past the last full block
++        {
++            if(inputPtr[i] > max) // was src0[i], which re-read the start of the buffer and skipped the tail
++            {
++                index = inputPtr - basePtr + i;
++                max = inputPtr[i];
++            }
++        }
++        target[0] = index;
++    }
++}
++
++#endif /*LV_HAVE_SSE2*/
++
++#ifdef LV_HAVE_GENERIC
++/*!
++ \brief Returns the index of the max value in src0 (portable reference kernel)
++ \param target Output: target[0] receives the index of the first occurrence of the max value in src0; not written when num_points is 0
++ \param src0 The buffer of data to be analysed
++ \param num_points The number of values in src0 to be analysed
++ */
++static inline void volk_gnsssdr_8i_index_max_16u_generic(unsigned int* target, const char* src0, unsigned int num_points) {
++
++    if(num_points > 0)
++    {
++        char max = src0[0];
++        unsigned int index = 0;
++
++        for(unsigned int i = 1; i < num_points; ++i)
++        {
++            if(src0[i] > max) // strict compare keeps the first occurrence
++            {
++                index = i;
++                max = src0[i];
++            }
++        }
++        target[0] = index;
++    }
++}
++
++#endif /*LV_HAVE_GENERIC*/
++
++#endif /*INCLUDED_volk_gnsssdr_8i_index_max_16u_u_H*/
++
++
++#ifndef INCLUDED_volk_gnsssdr_8i_index_max_16u_a_H
++#define INCLUDED_volk_gnsssdr_8i_index_max_16u_a_H
++
++#include <volk_gnsssdr/volk_gnsssdr_common.h> /* NOTE(review): the three header names here were lost in extraction; common.h is assumed because __VOLK_ATTR_ALIGNED is used below - confirm against the original file */
++#include <inttypes.h>
++#include <stdio.h>
++
++#ifdef LV_HAVE_AVX
++#include "immintrin.h"
++/*!
++ \brief Returns the index of the max value in src0 (AVX kernel, aligned loads)
++ \param target Output: target[0] receives the index of the first occurrence of the max value in src0; not written when num_points is 0
++ \param src0 The buffer of data to be analysed; read with aligned 256-bit loads
++ \param num_points The number of values in src0 to be analysed
++ */
++static inline void volk_gnsssdr_8i_index_max_16u_a_avx(unsigned int* target, const char* src0, unsigned int num_points) {
++    if(num_points > 0){
++        const unsigned int sse_iters = num_points / 32; // number of full 32-value blocks
++
++        char* basePtr = (char*)src0;
++        char* inputPtr = (char*)src0;
++        char max = src0[0];
++        unsigned int index = 0;
++        __VOLK_ATTR_ALIGNED(32) char currentValuesBuffer[32];
++        __m256i ones, compareResults, currentValues;
++        __m128i compareResultslo, compareResultshi, maxValues, lo, hi;
++
++        ones = _mm256_set1_epi8(0xFF);
++        maxValues = _mm_set1_epi8(max);
++
++        for(unsigned int number = 0; number < sse_iters; number++)
++        {
++            currentValues = _mm256_load_si256((__m256i*)inputPtr);
++
++            lo = _mm256_castsi256_si128(currentValues); // the 8-bit compare is done on the two 128-bit halves
++            hi = _mm256_extractf128_si256(currentValues,1);
++
++            compareResultslo = _mm_cmpgt_epi8(maxValues, lo);
++            compareResultshi = _mm_cmpgt_epi8(maxValues, hi);
++
++            // _mm256_set_m128i(compareResultshi, compareResultslo) is not defined in some versions of immintrin.h, so the halves are recombined by hand
++            compareResults = _mm256_insertf128_si256(_mm256_castsi128_si256(compareResultslo),(compareResultshi),1);
++
++            if (!_mm256_testc_si256(compareResults, ones)) // some value in this block is >= max
++            {
++                _mm256_store_si256((__m256i*)currentValuesBuffer, currentValues);
++
++                for(int i = 0; i < 32; i++)
++                {
++                    if(currentValuesBuffer[i] > max)
++                    {
++                        index = inputPtr - basePtr + i;
++                        max = currentValuesBuffer[i];
++                    }
++                }
++                maxValues = _mm_set1_epi8(max);
++            }
++
++            inputPtr += 32;
++        }
++
++        for(unsigned int i = 0; i<(num_points % 32); ++i) // tail: inputPtr points just past the last full block
++        {
++            if(inputPtr[i] > max) // was src0[i], which re-read the start of the buffer and skipped the tail
++            {
++                index = inputPtr - basePtr + i;
++                max = inputPtr[i];
++            }
++        }
++        target[0] = index;
++    }
++}
++
++#endif /*LV_HAVE_AVX*/
++
++#ifdef LV_HAVE_SSE4_1
++#include "smmintrin.h"
++#include "emmintrin.h"
++/*!
++ \brief Returns the index of the max value in src0 (SSE4.1 kernel, aligned loads)
++ \param target Output: target[0] receives the index of the first occurrence of the max value in src0; not written when num_points is 0
++ \param src0 The buffer of data to be analysed; read with aligned 128-bit loads
++ \param num_points The number of values in src0 to be analysed
++ */
++static inline void volk_gnsssdr_8i_index_max_16u_a_sse4_1(unsigned int* target, const char* src0, unsigned int num_points) {
++    if(num_points > 0){
++        const unsigned int sse_iters = num_points / 16; // number of full 16-value blocks
++
++        char* basePtr = (char*)src0;
++        char* inputPtr = (char*)src0;
++        char max = src0[0];
++        unsigned int index = 0;
++        __VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16];
++        __m128i maxValues, compareResults, currentValues;
++
++        maxValues = _mm_set1_epi8(max);
++
++        for(unsigned int number = 0; number < sse_iters; number++)
++        {
++            currentValues = _mm_load_si128((__m128i*)inputPtr);
++
++            compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
++
++            if (!_mm_test_all_ones(compareResults)) // some value in this block is >= max
++            {
++                _mm_store_si128((__m128i*)currentValuesBuffer, currentValues);
++
++                for(int i = 0; i < 16; i++)
++                {
++                    if(currentValuesBuffer[i] > max)
++                    {
++                        index = inputPtr - basePtr + i;
++                        max = currentValuesBuffer[i];
++                    }
++                }
++                maxValues = _mm_set1_epi8(max);
++            }
++
++            inputPtr += 16;
++        }
++
++        for(unsigned int i = 0; i<(num_points % 16); ++i) // tail: inputPtr points just past the last full block
++        {
++            if(inputPtr[i] > max) // was src0[i], which re-read the start of the buffer and skipped the tail
++            {
++                index = inputPtr - basePtr + i;
++                max = inputPtr[i];
++            }
++        }
++        target[0] = index;
++    }
++}
++
++#endif /*LV_HAVE_SSE4_1*/
++
++#ifdef LV_HAVE_SSE2
++#include "emmintrin.h"
++/*!
++ \brief Returns the index of the max value in src0 (SSE2 kernel, aligned loads)
++ \param target Output: target[0] receives the index of the first occurrence of the max value in src0; not written when num_points is 0
++ \param src0 The buffer of data to be analysed; read with aligned 128-bit loads
++ \param num_points The number of values in src0 to be analysed
++ */
++static inline void volk_gnsssdr_8i_index_max_16u_a_sse2(unsigned int* target, const char* src0, unsigned int num_points) {
++    if(num_points > 0){
++        const unsigned int sse_iters = num_points / 16; // number of full 16-value blocks
++
++        char* basePtr = (char*)src0;
++        char* inputPtr = (char*)src0;
++        char max = src0[0];
++        unsigned int index = 0;
++        unsigned short mask;
++        __VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16];
++        __m128i maxValues, compareResults, currentValues;
++
++        maxValues = _mm_set1_epi8(max);
++
++        for(unsigned int number = 0; number < sse_iters; number++)
++        {
++            currentValues = _mm_load_si128((__m128i*)inputPtr);
++            compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
++            mask = _mm_movemask_epi8(compareResults); // bit k set <=> max > value k
++
++            if (mask != 0xFFFF) // some value in this block is >= max
++            {
++                _mm_store_si128((__m128i*)currentValuesBuffer, currentValues);
++                mask = ~mask; // now bit k set <=> value k is a candidate
++                int i = 0;
++                while (mask > 0)
++                {
++                    if ((mask & 1) == 1)
++                    {
++                        if(currentValuesBuffer[i] > max)
++                        {
++                            index = inputPtr - basePtr + i;
++                            max = currentValuesBuffer[i];
++                        }
++                    }
++                    i++;
++                    mask >>= 1;
++                }
++                maxValues = _mm_set1_epi8(max);
++            }
++            inputPtr += 16;
++        }
++
++        for(unsigned int i = 0; i<(num_points % 16); ++i) // tail: inputPtr points just past the last full block
++        {
++            if(inputPtr[i] > max) // was src0[i], which re-read the start of the buffer and skipped the tail
++            {
++                index = inputPtr - basePtr + i;
++                max = inputPtr[i];
++            }
++        }
++        target[0] = index;
++    }
++}
++
++#endif /*LV_HAVE_SSE2*/
++
++#ifdef LV_HAVE_GENERIC
++/*!
++ \brief Finds the position of the largest value in src0 (portable reference kernel)
++ \param target Output: target[0] receives the index of the first occurrence of the largest value; not written when num_points is 0
++ \param src0 The buffer of data to be analysed
++ \param num_points The number of values in src0 to be analysed
++ */
++static inline void volk_gnsssdr_8i_index_max_16u_a_generic(unsigned int* target, const char* src0, unsigned int num_points) {
++    unsigned int best_pos = 0;
++    unsigned int pos;
++    char best_val;
++
++    if(num_points == 0) return; /* empty input: target is left unmodified */
++    best_val = src0[0];
++    for(pos = 1; pos < num_points; ++pos)
++    {
++        const char candidate = src0[pos];
++        if(best_val < candidate) /* strict compare keeps the first occurrence */
++        {
++            best_val = candidate;
++            best_pos = pos;
++        }
++    }
++    *target = best_pos;
++}
++
++#endif /*LV_HAVE_GENERIC*/
++
++#endif /*INCLUDED_volk_gnsssdr_8i_index_max_16u_a_H*/
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_max_s8i.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8i_max_s8i.h
+--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_max_s8i.h 1970-01-01 01:00:00.000000000 +0100
++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8i_max_s8i.h 2014-10-15 01:55:08.000000000 +0200
+@@ -0,0 +1,327 @@
++/*!
++ * \file volk_gnsssdr_8i_max_s8i.h
++ * \brief Volk protokernel: calculates the maximum value in a group of 8 bits (char) scalars
++ * \authors
++ * <ul>
++ *          <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
++ *          </ul>
++ * ++ * Volk protokernel that returns the maximum value of a group of 8 bits (char) scalars ++ * ++ * ------------------------------------------------------------------------- ++ * ++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) ++ * ++ * GNSS-SDR is a software defined Global Navigation ++ * Satellite Systems receiver ++ * ++ * This file is part of GNSS-SDR. ++ * ++ * GNSS-SDR is free software: you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation, either version 3 of the License, or ++ * at your option) any later version. ++ * ++ * GNSS-SDR is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with GNSS-SDR. If not, see . ++ * ++ * ------------------------------------------------------------------------- ++ */ ++ ++#ifndef INCLUDED_volk_gnsssdr_8i_max_s8i_u_H ++#define INCLUDED_volk_gnsssdr_8i_max_s8i_u_H ++ ++#include ++#include ++#include ++ ++#ifdef LV_HAVE_SSE4_1 ++#include ++/*! 
++ \brief Computes the max value in src0 (SSE4.1 kernel, unaligned loads)
++ \param target Receives the max value - NOTE(review): it is passed by value, so the result never reaches the caller; the signature is kept because the dispatch interface is declared elsewhere
++ \param src0 The buffer of data to be analysed
++ \param num_points The number of values in src0 to be analysed
++ */
++static inline void volk_gnsssdr_8i_max_s8i_u_sse4_1(char target, const char* src0, unsigned int num_points) {
++    if(num_points > 0){
++        const unsigned int sse_iters = num_points / 16; // number of full 16-value blocks
++
++        char* inputPtr = (char*)src0;
++        char max = src0[0];
++        __VOLK_ATTR_ALIGNED(16) char maxValuesBuffer[16];
++        __m128i maxValues, compareResults, currentValues;
++
++        maxValues = _mm_set1_epi8(max);
++
++        for(unsigned int number = 0; number < sse_iters; number++)
++        {
++            currentValues = _mm_loadu_si128((__m128i*)inputPtr);
++            compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
++            maxValues = _mm_blendv_epi8(currentValues, maxValues, compareResults); // per-lane running maximum
++            inputPtr += 16;
++        }
++
++        _mm_storeu_si128((__m128i*)maxValuesBuffer, maxValues);
++
++        for(int i = 0; i<16; ++i) // reduce the 16 per-lane maxima to one value
++        {
++            if(maxValuesBuffer[i] > max)
++            {
++                max = maxValuesBuffer[i];
++            }
++        }
++
++        for(unsigned int i = 0; i<(num_points % 16); ++i) // tail: inputPtr points just past the last full block
++        {
++            if(inputPtr[i] > max) // was src0[i], which re-read the start of the buffer and skipped the tail
++            {
++                max = inputPtr[i];
++            }
++        }
++        target = max; // only updates the local copy of the by-value parameter
++    }
++}
++
++#endif /*LV_HAVE_SSE4_1*/
++
++#ifdef LV_HAVE_SSE2
++#include <emmintrin.h>
++/*!
++ \brief Computes the max value in src0 (SSE2 kernel, unaligned loads)
++ \param target Receives the max value - NOTE(review): it is passed by value, so the result never reaches the caller; the signature is kept because the dispatch interface is declared elsewhere
++ \param src0 The buffer of data to be analysed
++ \param num_points The number of values in src0 to be analysed
++ */
++static inline void volk_gnsssdr_8i_max_s8i_u_sse2(char target, const char* src0, unsigned int num_points) {
++    if(num_points > 0){
++        const unsigned int sse_iters = num_points / 16; // number of full 16-value blocks
++
++        char* inputPtr = (char*)src0;
++        char max = src0[0];
++        unsigned short mask;
++        __VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16];
++        __m128i maxValues, compareResults, currentValues;
++
++        maxValues = _mm_set1_epi8(max);
++
++        for(unsigned int number = 0; number < sse_iters; number++)
++        {
++            currentValues = _mm_loadu_si128((__m128i*)inputPtr);
++            compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
++            mask = _mm_movemask_epi8(compareResults); // bit k set <=> max > value k
++
++            if (mask != 0xFFFF) // some value in this block is >= max
++            {
++                _mm_storeu_si128((__m128i*)currentValuesBuffer, currentValues);
++                mask = ~mask; // now bit k set <=> value k is a candidate
++                int i = 0;
++                while (mask > 0)
++                {
++                    if ((mask & 1) == 1)
++                    {
++                        if(currentValuesBuffer[i] > max)
++                        {
++                            max = currentValuesBuffer[i];
++                        }
++                    }
++                    i++;
++                    mask >>= 1;
++                }
++                maxValues = _mm_set1_epi8(max);
++            }
++            inputPtr += 16;
++        }
++
++        for(unsigned int i = 0; i<(num_points % 16); ++i) // tail: inputPtr points just past the last full block
++        {
++            if(inputPtr[i] > max) // was src0[i], which re-read the start of the buffer and skipped the tail
++            {
++                max = inputPtr[i];
++            }
++        }
++        target = max; // only updates the local copy of the by-value parameter
++    }
++}
++
++#endif /*LV_HAVE_SSE2*/
++
++#ifdef LV_HAVE_GENERIC
++/*!
++ \brief Computes the max value in src0 (portable reference kernel)
++ \param target Receives the max value - NOTE(review): it is passed by value, so the result never reaches the caller; the signature is kept because the dispatch interface is declared elsewhere
++ \param src0 The buffer of data to be analysed
++ \param num_points The number of values in src0 to be analysed
++ */
++static inline void volk_gnsssdr_8i_max_s8i_generic(char target, const char* src0, unsigned int num_points) {
++    if(num_points > 0)
++    {
++        char max = src0[0];
++
++        for(unsigned int i = 1; i < num_points; ++i)
++        {
++            if(src0[i] > max)
++            {
++                max = src0[i];
++            }
++        }
++        target = max; // only updates the local copy of the by-value parameter
++    }
++}
++
++#endif /*LV_HAVE_GENERIC*/
++
++#endif /*INCLUDED_volk_gnsssdr_8i_max_s8i_u_H*/
++
++
++#ifndef INCLUDED_volk_gnsssdr_8i_max_s8i_a_H
++#define INCLUDED_volk_gnsssdr_8i_max_s8i_a_H
++
++#include <volk_gnsssdr/volk_gnsssdr_common.h> /* NOTE(review): the three header names here were lost in extraction; common.h is assumed because __VOLK_ATTR_ALIGNED is used below - confirm against the original file */
++#include <inttypes.h>
++#include <stdio.h>
++
++#ifdef LV_HAVE_SSE4_1
++#include "smmintrin.h"
++/*!
++ \brief Computes the max value in src0 (SSE4.1 kernel, aligned loads)
++ \param target Receives the max value - NOTE(review): it is passed by value, so the result never reaches the caller; the signature is kept because the dispatch interface is declared elsewhere
++ \param src0 The buffer of data to be analysed; read with aligned 128-bit loads
++ \param num_points The number of values in src0 to be analysed
++ */
++static inline void volk_gnsssdr_8i_max_s8i_a_sse4_1(char target, const char* src0, unsigned int num_points) {
++    if(num_points > 0){
++        const unsigned int sse_iters = num_points / 16; // number of full 16-value blocks
++
++        char* inputPtr = (char*)src0;
++        char max = src0[0];
++        __VOLK_ATTR_ALIGNED(16) char maxValuesBuffer[16];
++        __m128i maxValues, compareResults, currentValues;
++
++        maxValues = _mm_set1_epi8(max);
++
++        for(unsigned int number = 0; number < sse_iters; number++)
++        {
++            currentValues = _mm_load_si128((__m128i*)inputPtr);
++            compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
++            maxValues = _mm_blendv_epi8(currentValues, maxValues, compareResults); // per-lane running maximum
++            inputPtr += 16;
++        }
++
++        _mm_store_si128((__m128i*)maxValuesBuffer, maxValues);
++
++        for(int i = 0; i<16; ++i) // reduce the 16 per-lane maxima to one value
++        {
++            if(maxValuesBuffer[i] > max)
++            {
++                max = maxValuesBuffer[i];
++            }
++        }
++
++        for(unsigned int i = 0; i<(num_points % 16); ++i) // tail: inputPtr points just past the last full block
++        {
++            if(inputPtr[i] > max) // was src0[i], which re-read the start of the buffer and skipped the tail
++            {
++                max = inputPtr[i];
++            }
++        }
++        target = max; // only updates the local copy of the by-value parameter
++    }
++}
++
++#endif /*LV_HAVE_SSE4_1*/
++
++#ifdef LV_HAVE_SSE2
++#include "emmintrin.h"
++/*!
++ \brief Computes the max value in src0 (SSE2 kernel, aligned loads)
++ \param target Receives the max value - NOTE(review): it is passed by value, so the result never reaches the caller; the signature is kept because the dispatch interface is declared elsewhere
++ \param src0 The buffer of data to be analysed; read with aligned 128-bit loads
++ \param num_points The number of values in src0 to be analysed
++ */
++static inline void volk_gnsssdr_8i_max_s8i_a_sse2(char target, const char* src0, unsigned int num_points) {
++    if(num_points > 0){
++        const unsigned int sse_iters = num_points / 16; // number of full 16-value blocks
++
++        char* inputPtr = (char*)src0;
++        char max = src0[0];
++        unsigned short mask;
++        __VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16];
++        __m128i maxValues, compareResults, currentValues;
++
++        maxValues = _mm_set1_epi8(max);
++
++        for(unsigned int number = 0; number < sse_iters; number++)
++        {
++            currentValues = _mm_load_si128((__m128i*)inputPtr);
++            compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
++            mask = _mm_movemask_epi8(compareResults); // bit k set <=> max > value k
++
++            if (mask != 0xFFFF) // some value in this block is >= max
++            {
++                _mm_store_si128((__m128i*)currentValuesBuffer, currentValues);
++                mask = ~mask; // now bit k set <=> value k is a candidate
++                int i = 0;
++                while (mask > 0)
++                {
++                    if ((mask & 1) == 1)
++                    {
++                        if(currentValuesBuffer[i] > max)
++                        {
++                            max = currentValuesBuffer[i];
++                        }
++                    }
++                    i++;
++                    mask >>= 1;
++                }
++                maxValues = _mm_set1_epi8(max);
++            }
++            inputPtr += 16;
++        }
++
++        for(unsigned int i = 0; i<(num_points % 16); ++i) // tail: inputPtr points just past the last full block
++        {
++            if(inputPtr[i] > max) // was src0[i], which re-read the start of the buffer and skipped the tail
++            {
++                max = inputPtr[i];
++            }
++        }
++        target = max; // only updates the local copy of the by-value parameter
++    }
++}
++
++#endif /*LV_HAVE_SSE2*/
++
++#ifdef LV_HAVE_GENERIC
++/*!
++ \brief Scans src0 for its largest value (portable reference kernel)
++ \param target Receives the largest value; it is a by-value parameter, so the assignment only changes the local copy
++ \param src0 The buffer of data to be analysed
++ \param num_points The number of values in src0 to be analysed
++ */
++static inline void volk_gnsssdr_8i_max_s8i_a_generic(char target, const char* src0, unsigned int num_points) {
++    /* Scan state: running maximum and read position. */
++    char running_max;
++    unsigned int pos;
++
++    if(num_points == 0) return; /* nothing to analyse */
++
++    running_max = src0[0];
++    for(pos = 1; pos < num_points; ++pos)
++    {
++        if(running_max < src0[pos])
++        {
++            running_max = src0[pos];
++        }
++    }
++    /* NOTE(review): the caller never observes this value because target is not a pointer - confirm the intended signature */
++    target = running_max;
++}
++
++#endif /*LV_HAVE_GENERIC*/
++
++#endif /*INCLUDED_volk_gnsssdr_8i_max_s8i_a_H*/
+\ No newline at end of file
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_x2_add_8i.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8i_x2_add_8i.h
+--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_x2_add_8i.h 1970-01-01 01:00:00.000000000 +0100
++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8i_x2_add_8i.h 2014-10-15 01:55:08.000000000 +0200
+@@ -0,0 +1,184 @@
++/*!
++ * \file volk_gnsssdr_8i_x2_add_8i.h
++ * \brief Volk protokernel: adds pairs of 8 bits (char) scalars
++ * \authors
++ * <ul>
++ *          <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
++ *          </ul>
++ * ++ * Volk protokernel that adds pairs of 8 bits (char) scalars ++ * ++ * ------------------------------------------------------------------------- ++ * ++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) ++ * ++ * GNSS-SDR is a software defined Global Navigation ++ * Satellite Systems receiver ++ * ++ * This file is part of GNSS-SDR. ++ * ++ * GNSS-SDR is free software: you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation, either version 3 of the License, or ++ * at your option) any later version. ++ * ++ * GNSS-SDR is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with GNSS-SDR. If not, see . ++ * ++ * ------------------------------------------------------------------------- ++ */ ++ ++#ifndef INCLUDED_volk_gnsssdr_8i_x2_add_8i_u_H ++#define INCLUDED_volk_gnsssdr_8i_x2_add_8i_u_H ++ ++#include ++#include ++ ++#ifdef LV_HAVE_SSE2 ++#include "pmmintrin.h" ++/*! 
++ \brief Adds the two input vectors element-wise and stores the results in the third vector (SSE2 kernel, unaligned loads/stores)
++ \param cVector The vector where the results will be stored
++ \param aVector One of the vectors to be added
++ \param bVector One of the vectors to be added
++ \param num_points The number of values in aVector and bVector to be added together and stored into cVector
++ */
++static inline void volk_gnsssdr_8i_x2_add_8i_u_sse2(char* cVector, const char* aVector, const char* bVector, unsigned int num_points){
++
++    const unsigned int sse_iters = num_points / 16; // number of full 16-value blocks
++
++    char* cPtr = cVector;
++    const char* aPtr = aVector;
++    const char* bPtr= bVector;
++
++    __m128i aVal, bVal, cVal;
++
++    for(unsigned int number = 0; number < sse_iters; number++){
++
++        aVal = _mm_loadu_si128((__m128i*)aPtr); // was _mm_lddqu_si128, an SSE3 intrinsic, inside an SSE2-guarded kernel
++        bVal = _mm_loadu_si128((__m128i*)bPtr);
++
++        cVal = _mm_add_epi8(aVal, bVal); // 8-bit lanes, wrap-around addition
++
++        _mm_storeu_si128((__m128i*)cPtr,cVal); // Store the results back into the C container
++
++        aPtr += 16;
++        bPtr += 16;
++        cPtr += 16;
++    }
++
++    for(unsigned int i = 0; i<(num_points % 16); ++i) // tail: the pointers already sit past the last full block
++    {
++        *cPtr++ = (*aPtr++) + (*bPtr++);
++    }
++}
++#endif /* LV_HAVE_SSE2 */
++
++#ifdef LV_HAVE_GENERIC
++/*!
++ \brief Adds the two input vectors element-wise and stores the results in the third vector (portable reference kernel)
++ \param cVector The vector where the results will be stored
++ \param aVector One of the vectors to be added
++ \param bVector One of the vectors to be added
++ \param num_points The number of values in aVector and bVector to be added together and stored into cVector
++ */
++static inline void volk_gnsssdr_8i_x2_add_8i_generic(char* cVector, const char* aVector, const char* bVector, unsigned int num_points){
++    char* cPtr = cVector;
++    const char* aPtr = aVector;
++    const char* bPtr= bVector;
++    unsigned int number = 0;
++
++    for(number = 0; number < num_points; number++){
++        *cPtr++ = (*aPtr++) + (*bPtr++);
++    }
++}
++#endif /* LV_HAVE_GENERIC */
++
++#endif /* INCLUDED_volk_gnsssdr_8i_x2_add_8i_u_H */
++
++
++#ifndef INCLUDED_volk_gnsssdr_8i_x2_add_8i_a_H
++#define INCLUDED_volk_gnsssdr_8i_x2_add_8i_a_H
++
++#include <inttypes.h> /* NOTE(review): the two header names here were lost in extraction; restored by analogy with upstream VOLK kernels - confirm against the original file */
++#include <stdio.h>
++
++#ifdef LV_HAVE_SSE2
++#include "emmintrin.h"
++/*!
++ \brief Adds the two input vectors element-wise and stores the results in the third vector (SSE2 kernel, aligned loads/stores)
++ \param cVector The vector where the results will be stored; written with aligned 128-bit stores
++ \param aVector One of the vectors to be added; read with aligned 128-bit loads
++ \param bVector One of the vectors to be added; read with aligned 128-bit loads
++ \param num_points The number of values in aVector and bVector to be added together and stored into cVector
++ */
++static inline void volk_gnsssdr_8i_x2_add_8i_a_sse2(char* cVector, const char* aVector, const char* bVector, unsigned int num_points){
++
++    const unsigned int sse_iters = num_points / 16; // number of full 16-value blocks
++
++    char* cPtr = cVector;
++    const char* aPtr = aVector;
++    const char* bPtr= bVector;
++
++    __m128i aVal, bVal, cVal;
++
++    for(unsigned int number = 0; number < sse_iters; number++){
++
++        aVal = _mm_load_si128((__m128i*)aPtr);
++        bVal = _mm_load_si128((__m128i*)bPtr);
++
++        cVal = _mm_add_epi8(aVal, bVal); // 8-bit lanes, wrap-around addition
++
++        _mm_store_si128((__m128i*)cPtr,cVal); // Store the results back into the C container
++
++        aPtr += 16;
++        bPtr += 16;
++        cPtr += 16;
++    }
++
++    for(unsigned int i = 0; i<(num_points % 16); ++i) // tail: the pointers already sit past the last full block
++    {
++        *cPtr++ = (*aPtr++) + (*bPtr++);
++    }
++}
++#endif /* LV_HAVE_SSE2 */
++
++#ifdef LV_HAVE_GENERIC
++/*!
++ \brief Adds the two input vectors element-wise and stores the results in the third vector (portable reference kernel)
++ \param cVector The vector where the results will be stored
++ \param aVector One of the vectors to be added
++ \param bVector One of the vectors to be added
++ \param num_points The number of values in aVector and bVector to be added together and stored into cVector
++ */
++static inline void volk_gnsssdr_8i_x2_add_8i_a_generic(char* cVector, const char* aVector, const char* bVector, unsigned int num_points){
++    char* cPtr = cVector;
++    const char* aPtr = aVector;
++    const char* bPtr= bVector;
++    unsigned int number = 0;
++
++    for(number = 0; number < num_points; number++){
++        *cPtr++ = (*aPtr++) + (*bPtr++);
++    }
++}
++#endif /* LV_HAVE_GENERIC */
++
++#ifdef LV_HAVE_ORC
++/*!
++ \brief Adds the two input vectors element-wise and stores the results in the third vector (forwards to the externally defined ORC implementation)
++ \param cVector The vector where the results will be stored
++ \param aVector One of the vectors to be added
++ \param bVector One of the vectors to be added
++ \param num_points The number of values in aVector and bVector to be added together and stored into cVector
++ */
++extern void volk_gnsssdr_8i_x2_add_8i_a_orc_impl(char* cVector, const char* aVector, const char* bVector, unsigned int num_points);
++static inline void volk_gnsssdr_8i_x2_add_8i_u_orc(char* cVector, const char* aVector, const char* bVector, unsigned int num_points){
++    volk_gnsssdr_8i_x2_add_8i_a_orc_impl(cVector, aVector, bVector, num_points);
++}
++#endif /* LV_HAVE_ORC */
++
++#endif /* INCLUDED_volk_gnsssdr_8i_x2_add_8i_a_H */
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_conjugate_8ic.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_conjugate_8ic.h
+--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_conjugate_8ic.h
1970-01-01 01:00:00.000000000 +0100 ++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_conjugate_8ic.h 2014-10-15 01:55:08.000000000 +0200 +@@ -0,0 +1,326 @@ ++/*! ++ * \file volk_gnsssdr_8ic_conjugate_8ic.h ++ * \brief Volk protokernel: calculates the conjugate of a 16 bits vector ++ * \authors
++ * <ul>
++ *          <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
++ *          </ul>
++ * ++ * Volk protokernel that calculates the conjugate of a ++ * 16 bits vector (8 bits the real part and 8 bits the imaginary part) ++ * ++ * ------------------------------------------------------------------------- ++ * ++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) ++ * ++ * GNSS-SDR is a software defined Global Navigation ++ * Satellite Systems receiver ++ * ++ * This file is part of GNSS-SDR. ++ * ++ * GNSS-SDR is free software: you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation, either version 3 of the License, or ++ * at your option) any later version. ++ * ++ * GNSS-SDR is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with GNSS-SDR. If not, see . ++ * ++ * ------------------------------------------------------------------------- ++ */ ++ ++#ifndef INCLUDED_volk_gnsssdr_8ic_conjugate_8ic_u_H ++#define INCLUDED_volk_gnsssdr_8ic_conjugate_8ic_u_H ++ ++#include ++#include ++#include ++ ++#ifdef LV_HAVE_AVX ++#include "immintrin.h" ++/*! ++ \brief Takes the conjugate of an unsigned char vector. 
++ \param cVector The vector where the results will be stored
++ \param aVector Vector to be conjugated
++ \param num_points The number of lv_8sc_t items in aVector to be conjugated and stored into cVector
++ */
++static inline void volk_gnsssdr_8ic_conjugate_8ic_u_avx(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){
++    const unsigned int sse_iters = num_points / 16; // 16 items per 256-bit block (assumes sizeof(lv_8sc_t) == 2 - TODO confirm)
++
++    lv_8sc_t* c = cVector;
++    const lv_8sc_t* a = aVector;
++
++    __m256 tmp;
++    __m128i tmp128lo, tmp128hi;
++    __m256 conjugator1 = _mm256_castsi256_ps(_mm256_setr_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255));
++    __m128i conjugator2 = _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1);
++
++    for (unsigned int i = 0; i < sse_iters; ++i)
++    {
++        tmp = _mm256_loadu_ps((float*)a);
++        tmp = _mm256_xor_ps(tmp, conjugator1); // flip all bits of every odd byte
++        tmp128lo = _mm256_castsi256_si128(_mm256_castps_si256(tmp));
++        tmp128lo = _mm_add_epi8(tmp128lo, conjugator2); // +1 on odd bytes completes the two's-complement negation
++        tmp128hi = _mm256_extractf128_si256(_mm256_castps_si256(tmp),1);
++        tmp128hi = _mm_add_epi8(tmp128hi, conjugator2);
++        // _mm256_set_m128i(tmp128hi, tmp128lo) is not defined in some versions of immintrin.h, so the halves are recombined by hand
++        tmp = _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(tmp128lo),(tmp128hi),1)); // explicit cast: the insert yields __m256i, tmp is __m256
++        _mm256_storeu_ps((float*)c, tmp);
++
++        a += 16;
++        c += 16;
++    }
++
++    for (unsigned int i = 0; i<(num_points % 16); ++i) // tail: a and c already sit past the last full block
++    {
++        *c++ = lv_conj(*a++);
++    }
++}
++#endif /* LV_HAVE_AVX */
++
++#ifdef LV_HAVE_SSSE3
++#include "tmmintrin.h"
++/*!
++ \brief Takes the conjugate of an unsigned char vector.
++ \param cVector The vector where the results will be stored ++ \param aVector Vector to be conjugated ++ \param num_points The number of unsigned char values in aVector to be conjugated and stored into cVector ++ */ ++static inline void volk_gnsssdr_8ic_conjugate_8ic_u_ssse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){ ++ const unsigned int sse_iters = num_points / 8; ++ ++ lv_8sc_t* c = cVector; ++ const lv_8sc_t* a = aVector; ++ __m128i tmp; ++ ++ __m128i conjugator = _mm_setr_epi8(1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1); ++ ++ for (int i = 0; i < sse_iters; ++i) ++ { ++ tmp = _mm_lddqu_si128((__m128i*)a); ++ tmp = _mm_sign_epi8(tmp, conjugator); ++ _mm_storeu_si128((__m128i*)c, tmp); ++ a += 8; ++ c += 8; ++ } ++ ++ for (int i = 0; i<(num_points % 8); ++i) ++ { ++ *c++ = lv_conj(*a++); ++ } ++ ++} ++#endif /* LV_HAVE_SSSE3 */ ++ ++#ifdef LV_HAVE_SSE3 ++#include ++/*! ++ \brief Takes the conjugate of an unsigned char vector. ++ \param cVector The vector where the results will be stored ++ \param aVector Vector to be conjugated ++ \param num_points The number of unsigned char values in aVector to be conjugated and stored into cVector ++ */ ++static inline void volk_gnsssdr_8ic_conjugate_8ic_u_sse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){ ++ const unsigned int sse_iters = num_points / 8; ++ ++ lv_8sc_t* c = cVector; ++ const lv_8sc_t* a = aVector; ++ __m128i tmp; ++ ++ __m128i conjugator1 = _mm_setr_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); ++ __m128i conjugator2 = _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1); ++ ++ for (int i = 0; i < sse_iters; ++i) ++ { ++ tmp = _mm_lddqu_si128((__m128i*)a); ++ tmp = _mm_xor_si128(tmp, conjugator1); ++ tmp = _mm_add_epi8(tmp, conjugator2); ++ _mm_storeu_si128((__m128i*)c, tmp); ++ a += 8; ++ c += 8; ++ } ++ ++ for (int i = 0; i<(num_points % 8); ++i) ++ { ++ *c++ = lv_conj(*a++); ++ } ++ ++} ++#endif /* 
LV_HAVE_SSE3 */ ++ ++#ifdef LV_HAVE_GENERIC ++/*! ++ \brief Takes the conjugate of a vector of lv_8sc_t values, one element at a time. ++ \param cVector The vector where the results will be stored ++ \param aVector Vector to be conjugated ++ \param num_points The number of lv_8sc_t elements in aVector to be conjugated and stored into cVector ++ */ ++static inline void volk_gnsssdr_8ic_conjugate_8ic_generic(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){ ++ lv_8sc_t* cPtr = cVector; ++ const lv_8sc_t* aPtr = aVector; ++ unsigned int number = 0; ++ ++ for(number = 0; number < num_points; number++){ ++ *cPtr++ = lv_conj(*aPtr++); ++ } ++} ++#endif /* LV_HAVE_GENERIC */ ++ ++#endif /* INCLUDED_volk_gnsssdr_8ic_conjugate_8ic_u_H */ ++ ++ ++#ifndef INCLUDED_volk_gnsssdr_8ic_conjugate_8ic_a_H ++#define INCLUDED_volk_gnsssdr_8ic_conjugate_8ic_a_H ++ ++#include ++#include ++#include ++ ++#ifdef LV_HAVE_AVX ++#include "immintrin.h" ++/*! ++ \brief Takes the conjugate of a vector of lv_8sc_t values using aligned AVX loads and stores. ++ \param cVector The vector where the results will be stored; written with _mm256_store_ps, so it must be 32-byte aligned ++ \param aVector Vector to be conjugated; read with _mm256_load_ps, so it must be 32-byte aligned ++ \param num_points The number of lv_8sc_t elements in aVector to be conjugated and stored into cVector ++ */ ++static inline void volk_gnsssdr_8ic_conjugate_8ic_a_avx(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){ ++ /* One 256-bit pass handles 16 lv_8sc_t elements (a and c advance by 16 per iteration). */ ++ const unsigned int sse_iters = num_points / 16; ++ ++ lv_8sc_t* c = cVector; ++ const lv_8sc_t* a = aVector; ++ ++ __m256 tmp; ++ __m128i tmp128lo, tmp128hi; ++ /* XOR with 0xFF followed by +1 on every odd byte is a two's-complement negation of that byte; even bytes are left untouched. */ ++ __m256 conjugator1 = _mm256_castsi256_ps(_mm256_setr_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255)); ++ __m128i conjugator2 = _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1); ++ ++ for (int i = 0; i < sse_iters; ++i) ++ { ++ tmp = _mm256_load_ps((float*)a); ++ tmp = _mm256_xor_ps(tmp, conjugator1); ++ /* The byte-wise +1 is applied to each 128-bit half separately with _mm_add_epi8. */ ++ tmp128lo = _mm256_castsi256_si128(_mm256_castps_si256(tmp)); ++ tmp128lo = 
_mm_add_epi8(tmp128lo, conjugator2); ++ tmp128hi = _mm256_extractf128_si256(_mm256_castps_si256(tmp),1); ++ tmp128hi = _mm_add_epi8(tmp128hi, conjugator2); ++ //tmp = _mm256_set_m128i(tmp128hi , tmp128lo); //not defined in some versions of immintrin.h ++ /* _mm256_insertf128_si256 returns an __m256i; tmp is an __m256, so the value is cast explicitly instead of relying on lax vector conversions. */ ++ tmp = _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(tmp128lo),(tmp128hi),1)); ++ _mm256_store_ps((float*)c, tmp); ++ ++ a += 16; ++ c += 16; ++ } ++ ++ /* Scalar tail for the num_points % 16 elements that do not fill a full register. */ ++ for (int i = 0; i<(num_points % 16); ++i) ++ { ++ *c++ = lv_conj(*a++); ++ } ++} ++#endif /* LV_HAVE_AVX */ ++ ++#ifdef LV_HAVE_SSSE3 ++#include "tmmintrin.h" ++/*! ++ \brief Takes the conjugate of a vector of lv_8sc_t values using aligned SSSE3 loads and stores. ++ \param cVector The vector where the results will be stored ++ \param aVector Vector to be conjugated ++ \param num_points The number of lv_8sc_t elements in aVector to be conjugated and stored into cVector ++ */ ++static inline void volk_gnsssdr_8ic_conjugate_8ic_a_ssse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){ ++ const unsigned int sse_iters = num_points / 8; ++ ++ lv_8sc_t* c = cVector; ++ const lv_8sc_t* a = aVector; ++ __m128i tmp; ++ ++ /* Control vector for _mm_sign_epi8: +1 keeps a byte, -1 negates it, so every odd byte is negated. */ ++ __m128i conjugator = _mm_setr_epi8(1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1); ++ ++ for (int i = 0; i < sse_iters; ++i) ++ { ++ tmp = _mm_load_si128((__m128i*)a); ++ tmp = _mm_sign_epi8(tmp, conjugator); ++ _mm_store_si128((__m128i*)c, tmp); ++ a += 8; ++ c += 8; ++ } ++ ++ for (int i = 0; i<(num_points % 8); ++i) ++ { ++ *c++ = lv_conj(*a++); ++ } ++ ++} ++#endif /* LV_HAVE_SSSE3 */ ++ ++#ifdef LV_HAVE_SSE3 ++#include ++/*! ++ \brief Takes the conjugate of an unsigned char vector. 
++ \param cVector The vector where the results will be stored ++ \param aVector Vector to be conjugated ++ \param num_points The number of unsigned char values in aVector to be conjugated and stored into cVector ++ */ ++static inline void volk_gnsssdr_8ic_conjugate_8ic_a_sse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){ ++ /* One 128-bit pass handles 8 lv_8sc_t elements (a and c advance by 8 per iteration). */ ++ const unsigned int sse_iters = num_points / 8; ++ ++ lv_8sc_t* c = cVector; ++ const lv_8sc_t* a = aVector; ++ __m128i tmp; ++ ++ /* XOR with 0xFF followed by +1 on every odd byte is a two's-complement negation of that byte; even bytes are left untouched. */ ++ __m128i conjugator1 = _mm_setr_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); ++ __m128i conjugator2 = _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1); ++ ++ for (int i = 0; i < sse_iters; ++i) ++ { ++ /* Aligned load/store: both buffers must be 16-byte aligned. */ ++ tmp = _mm_load_si128((__m128i*)a); ++ tmp = _mm_xor_si128(tmp, conjugator1); ++ tmp = _mm_add_epi8(tmp, conjugator2); ++ _mm_store_si128((__m128i*)c, tmp); ++ a += 8; ++ c += 8; ++ } ++ ++ /* Scalar tail for the num_points % 8 remaining elements. */ ++ for (int i = 0; i<(num_points % 8); ++i) ++ { ++ *c++ = lv_conj(*a++); ++ } ++ ++} ++#endif /* LV_HAVE_SSE3 */ ++ ++#ifdef LV_HAVE_GENERIC ++/*! ++ \brief Takes the conjugate of a vector of lv_8sc_t values, one element at a time. ++ \param cVector The vector where the results will be stored ++ \param aVector Vector to be conjugated ++ \param num_points The number of lv_8sc_t elements in aVector to be conjugated and stored into cVector ++ */ ++static inline void volk_gnsssdr_8ic_conjugate_8ic_a_generic(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){ ++ lv_8sc_t* cPtr = cVector; ++ const lv_8sc_t* aPtr = aVector; ++ unsigned int number = 0; ++ ++ for(number = 0; number < num_points; number++){ ++ *cPtr++ = lv_conj(*aPtr++); ++ } ++} ++#endif /* LV_HAVE_GENERIC */ ++ ++#ifdef LV_HAVE_ORC ++/*! ++ \brief Takes the conjugate of an unsigned char vector. 
++ \param cVector The vector where the results will be stored ++ \param aVector Vector to be conjugated ++ \param num_points The number of unsigned char values in aVector to be conjugated and stored into cVector ++ */ ++extern void volk_gnsssdr_8ic_conjugate_8ic_a_orc_impl(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points); ++static inline void volk_gnsssdr_8ic_conjugate_8ic_u_orc(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){ ++ /* Thin wrapper: forwards all arguments unchanged to the externally defined _a_orc_impl routine. */ ++ volk_gnsssdr_8ic_conjugate_8ic_a_orc_impl(cVector, aVector, num_points); ++} ++#endif /* LV_HAVE_ORC */ ++ ++#endif /* INCLUDED_volk_gnsssdr_8ic_conjugate_8ic_a_H */ +diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_magnitude_squared_8i.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_magnitude_squared_8i.h +--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_magnitude_squared_8i.h 1970-01-01 01:00:00.000000000 +0100 ++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_magnitude_squared_8i.h 2014-10-15 01:55:08.000000000 +0200 +@@ -0,0 +1,320 @@ ++/*! ++ * \file volk_gnsssdr_8ic_magnitude_squared_8i.h ++ * \brief Volk protokernel: calculates the magnitude squared of a 16 bits vector ++ * \authors
    ++ *
  • Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com ++ *
++ * ++ * Volk protokernel that calculates the magnitude squared of a ++ * 16 bits vector (8 bits the real part and 8 bits the imaginary part) ++ * result = (real*real) + (imag*imag) ++ * ++ * ------------------------------------------------------------------------- ++ * ++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) ++ * ++ * GNSS-SDR is a software defined Global Navigation ++ * Satellite Systems receiver ++ * ++ * This file is part of GNSS-SDR. ++ * ++ * GNSS-SDR is free software: you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation, either version 3 of the License, or ++ * (at your option) any later version. ++ * ++ * GNSS-SDR is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>. ++ * ++ * ------------------------------------------------------------------------- ++ */ ++ ++#ifndef INCLUDED_volk_gnsssdr_8ic_magnitude_squared_8i_u_H ++#define INCLUDED_volk_gnsssdr_8ic_magnitude_squared_8i_u_H ++ ++#include ++#include ++#include ++ ++#ifdef LV_HAVE_SSE3 ++#include ++#include "tmmintrin.h" ++/*! 
++ \brief Calculates the magnitude squared of complexVector and stores the results in magnitudeVector ++ \param complexVector The vector containing the complex input values ++ \param magnitudeVector The vector containing the real output values ++ \param num_points The number of complex values in complexVector to be calculated and stored into cVector ++ */ ++static inline void volk_gnsssdr_8ic_magnitude_squared_8i_u_sse3(char* magnitudeVector, const lv_8sc_t* complexVector, unsigned int num_points){ ++ /* NOTE(review): _mm_hadd_epi16 and _mm_shuffle_epi8 used below are SSSE3 intrinsics (tmmintrin.h) although this kernel is guarded by LV_HAVE_SSE3 -- confirm the build enables SSSE3 for it. */ ++ ++ /* Each pass reads two 16-byte registers (32 input bytes) and writes 16 output bytes. */ ++ const unsigned int sse_iters = num_points / 16; ++ ++ const char* complexVectorPtr = (char*)complexVector; ++ char* magnitudeVectorPtr = magnitudeVector; ++ ++ __m128i zero, result8; ++ __m128i avector, avectorhi, avectorlo, avectorlomult, avectorhimult, aadded, maska; ++ __m128i bvector, bvectorhi, bvectorlo, bvectorlomult, bvectorhimult, badded, maskb; ++ ++ zero = _mm_setzero_si128(); ++ /* Shuffle controls: 0x80 zeroes an output byte. maska packs the low byte of each 16-bit lane into the lower 8 bytes, maskb into the upper 8 bytes. */ ++ maska = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); ++ maskb = _mm_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); ++ ++ for(int number = 0;number < sse_iters; number++) ++ { ++ /* Widen bytes to 16-bit lanes, square each lane, then add adjacent lanes pairwise. */ ++ avector = _mm_lddqu_si128((__m128i*)complexVectorPtr); ++ avectorlo = _mm_unpacklo_epi8 (avector, zero); ++ avectorhi = _mm_unpackhi_epi8 (avector, zero); ++ avectorlomult = _mm_mullo_epi16 (avectorlo, avectorlo); ++ avectorhimult = _mm_mullo_epi16 (avectorhi, avectorhi); ++ aadded = _mm_hadd_epi16 (avectorlomult, avectorhimult); ++ ++ complexVectorPtr += 16; ++ ++ bvector = _mm_lddqu_si128((__m128i*)complexVectorPtr); ++ bvectorlo = _mm_unpacklo_epi8 (bvector, zero); ++ bvectorhi = _mm_unpackhi_epi8 (bvector, zero); ++ bvectorlomult = _mm_mullo_epi16 (bvectorlo, bvectorlo); ++ bvectorhimult = _mm_mullo_epi16 (bvectorhi, bvectorhi); ++ badded = _mm_hadd_epi16 (bvectorlomult, bvectorhimult); ++ ++ complexVectorPtr += 16; ++ ++ /* Keep only the low byte of every 16-bit sum and merge both halves into one 16-byte result. */ ++ result8 = _mm_or_si128(_mm_shuffle_epi8(aadded, maska), _mm_shuffle_epi8(badded, maskb)); ++ ++ 
_mm_storeu_si128((__m128i*)magnitudeVectorPtr, result8); ++ ++ magnitudeVectorPtr += 16; ++ ++ ++ } ++ ++ /* Scalar tail for the num_points % 16 remaining elements. */ ++ for (int i = 0; i<(num_points % 16); ++i) ++ { ++ const char valReal = *complexVectorPtr++; ++ const char valImag = *complexVectorPtr++; ++ *magnitudeVectorPtr++ = (valReal * valReal) + (valImag * valImag); ++ } ++} ++#endif /* LV_HAVE_SSE3 */ ++ ++//#ifdef LV_HAVE_SSE ++//#include ++///*! ++// \brief Calculates the magnitude squared of complexVector and stores the results in magnitudeVector ++// \param complexVector The vector containing the complex input values ++// \param magnitudeVector The vector containing the real output values ++// \param num_points The number of complex values in complexVector to be calculated and stored into cVector ++// */ ++//static inline void volk_gnsssdr_8ic_magnitude_squared_8i_u_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ ++// unsigned int number = 0; ++// const unsigned int quarterPoints = num_points / 4; ++// ++// const float* complexVectorPtr = (float*)complexVector; ++// float* magnitudeVectorPtr = magnitudeVector; ++// ++// __m128 cplxValue1, cplxValue2, iValue, qValue, result; ++// for(;number < quarterPoints; number++){ ++// cplxValue1 = _mm_loadu_ps(complexVectorPtr); ++// complexVectorPtr += 4; ++// ++// cplxValue2 = _mm_loadu_ps(complexVectorPtr); ++// complexVectorPtr += 4; ++// ++// // Arrange in i1i2i3i4 format ++// iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); ++// // Arrange in q1q2q3q4 format ++// qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); ++// ++// iValue = _mm_mul_ps(iValue, iValue); // Square the I values ++// qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values ++// ++// result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values ++// ++// _mm_storeu_ps(magnitudeVectorPtr, result); ++// magnitudeVectorPtr += 4; ++// } ++// ++// number = quarterPoints * 4; ++// for(; number < num_points; number++){ ++// 
float val1Real = *complexVectorPtr++; ++// float val1Imag = *complexVectorPtr++; ++// *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); ++// } ++//} ++//#endif /* LV_HAVE_SSE */ ++ ++#ifdef LV_HAVE_GENERIC ++/*! ++ \brief Calculates the magnitude squared of complexVector and stores the results in magnitudeVector ++ \param complexVector The vector containing the complex input values ++ \param magnitudeVector The vector containing the real output values ++ \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector ++ */ ++static inline void volk_gnsssdr_8ic_magnitude_squared_8i_generic(char* magnitudeVector, const lv_8sc_t* complexVector, unsigned int num_points){ ++ const char* complexVectorPtr = (char*)complexVector; ++ char* magnitudeVectorPtr = magnitudeVector; ++ ++ /* Two consecutive input chars (real, imag) are consumed per output char. */ ++ for(int number = 0; number < num_points; number++){ ++ const char real = *complexVectorPtr++; ++ const char imag = *complexVectorPtr++; ++ *magnitudeVectorPtr++ = (real*real) + (imag*imag); ++ } ++} ++#endif /* LV_HAVE_GENERIC */ ++ ++#endif /* INCLUDED_volk_gnsssdr_8ic_magnitude_squared_8i_u_H */ ++ ++ ++#ifndef INCLUDED_volk_gnsssdr_8ic_magnitude_squared_8i_a_H ++#define INCLUDED_volk_gnsssdr_8ic_magnitude_squared_8i_a_H ++ ++#include ++#include ++#include ++ ++#ifdef LV_HAVE_SSE3 ++#include ++/*! 
++ \brief Calculates the magnitude squared of complexVector and stores the results in magnitudeVector ++ \param complexVector The vector containing the complex input values ++ \param magnitudeVector The vector containing the real output values ++ \param num_points The number of complex values in complexVector to be calculated and stored into cVector ++ */ ++static inline void volk_gnsssdr_8ic_magnitude_squared_8i_a_sse3(char* magnitudeVector, const lv_8sc_t* complexVector, unsigned int num_points){ ++ /* NOTE(review): _mm_hadd_epi16 and _mm_shuffle_epi8 used below are SSSE3 intrinsics (tmmintrin.h) although this kernel is guarded by LV_HAVE_SSE3 -- confirm the build enables SSSE3 and that tmmintrin.h is included for it. */ ++ ++ /* Each pass reads two 16-byte registers (32 input bytes) and writes 16 output bytes. */ ++ const unsigned int sse_iters = num_points / 16; ++ ++ const char* complexVectorPtr = (char*)complexVector; ++ char* magnitudeVectorPtr = magnitudeVector; ++ ++ __m128i zero, result8; ++ __m128i avector, avectorhi, avectorlo, avectorlomult, avectorhimult, aadded, maska; ++ __m128i bvector, bvectorhi, bvectorlo, bvectorlomult, bvectorhimult, badded, maskb; ++ ++ zero = _mm_setzero_si128(); ++ /* Shuffle controls: 0x80 zeroes an output byte. maska packs the low byte of each 16-bit lane into the lower 8 bytes, maskb into the upper 8 bytes. */ ++ maska = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); ++ maskb = _mm_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); ++ ++ for(int number = 0;number < sse_iters; number++) ++ { ++ /* Aligned loads: complexVector must be 16-byte aligned. Bytes are widened to 16-bit lanes, squared, then added pairwise. */ ++ avector = _mm_load_si128((__m128i*)complexVectorPtr); ++ avectorlo = _mm_unpacklo_epi8 (avector, zero); ++ avectorhi = _mm_unpackhi_epi8 (avector, zero); ++ avectorlomult = _mm_mullo_epi16 (avectorlo, avectorlo); ++ avectorhimult = _mm_mullo_epi16 (avectorhi, avectorhi); ++ aadded = _mm_hadd_epi16 (avectorlomult, avectorhimult); ++ ++ complexVectorPtr += 16; ++ ++ bvector = _mm_load_si128((__m128i*)complexVectorPtr); ++ bvectorlo = _mm_unpacklo_epi8 (bvector, zero); ++ bvectorhi = _mm_unpackhi_epi8 (bvector, zero); ++ bvectorlomult = _mm_mullo_epi16 (bvectorlo, bvectorlo); ++ bvectorhimult = _mm_mullo_epi16 (bvectorhi, bvectorhi); ++ badded = _mm_hadd_epi16 (bvectorlomult, bvectorhimult); ++ ++ complexVectorPtr += 16; ++ ++ /* Keep only the low byte of every 16-bit sum and merge both halves into one 16-byte result. */ ++ result8 = _mm_or_si128(_mm_shuffle_epi8(aadded, maska), _mm_shuffle_epi8(badded, maskb)); ++ ++ 
_mm_store_si128((__m128i*)magnitudeVectorPtr, result8); ++ ++ magnitudeVectorPtr += 16; ++ ++ ++ } ++ ++ /* Scalar tail for the num_points % 16 remaining elements. */ ++ for (int i = 0; i<(num_points % 16); ++i) ++ { ++ const char valReal = *complexVectorPtr++; ++ const char valImag = *complexVectorPtr++; ++ *magnitudeVectorPtr++ = (valReal * valReal) + (valImag * valImag); ++ } ++} ++#endif /* LV_HAVE_SSE3 */ ++ ++//#ifdef LV_HAVE_SSE ++//#include ++///*! ++// \brief Calculates the magnitude squared of complexVector and stores the results in magnitudeVector ++// \param complexVector The vector containing the complex input values ++// \param magnitudeVector The vector containing the real output values ++// \param num_points The number of complex values in complexVector to be calculated and stored into cVector ++// */ ++//static inline void volk_gnsssdr_8ic_magnitude_squared_8i_a_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ ++// unsigned int number = 0; ++// const unsigned int quarterPoints = num_points / 4; ++// ++// const float* complexVectorPtr = (float*)complexVector; ++// float* magnitudeVectorPtr = magnitudeVector; ++// ++// __m128 cplxValue1, cplxValue2, iValue, qValue, result; ++// for(;number < quarterPoints; number++){ ++// cplxValue1 = _mm_load_ps(complexVectorPtr); ++// complexVectorPtr += 4; ++// ++// cplxValue2 = _mm_load_ps(complexVectorPtr); ++// complexVectorPtr += 4; ++// ++// // Arrange in i1i2i3i4 format ++// iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); ++// // Arrange in q1q2q3q4 format ++// qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); ++// ++// iValue = _mm_mul_ps(iValue, iValue); // Square the I values ++// qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values ++// ++// result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values ++// ++// _mm_store_ps(magnitudeVectorPtr, result); ++// magnitudeVectorPtr += 4; ++// } ++// ++// number = quarterPoints * 4; ++// for(; number < num_points; number++){ ++// 
float val1Real = *complexVectorPtr++; ++// float val1Imag = *complexVectorPtr++; ++// *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); ++// } ++//} ++//#endif /* LV_HAVE_SSE */ ++ ++#ifdef LV_HAVE_GENERIC ++/*! ++ \brief Calculates the magnitude squared of complexVector and stores the results in magnitudeVector ++ \param complexVector The vector containing the complex input values ++ \param magnitudeVector The vector containing the real output values ++ \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector ++ */ ++static inline void volk_gnsssdr_8ic_magnitude_squared_8i_a_generic(char* magnitudeVector, const lv_8sc_t* complexVector, unsigned int num_points){ ++ const char* complexVectorPtr = (char*)complexVector; ++ char* magnitudeVectorPtr = magnitudeVector; ++ ++ /* Two consecutive input chars (real, imag) are consumed per output char. */ ++ for(int number = 0; number < num_points; number++){ ++ const char real = *complexVectorPtr++; ++ const char imag = *complexVectorPtr++; ++ *magnitudeVectorPtr++ = (real*real) + (imag*imag); ++ } ++} ++#endif /* LV_HAVE_GENERIC */ ++ ++#ifdef LV_HAVE_ORC ++/*! 
++ \brief Calculates the magnitude squared of complexVector and stores the results in magnitudeVector ++ \param complexVector The vector containing the complex input values ++ \param magnitudeVector The vector containing the real output values ++ \param num_points The number of complex values in complexVector to be calculated and stored into cVector ++ */ ++extern void volk_gnsssdr_8ic_magnitude_squared_8i_a_orc_impl(char* magnitudeVector, const lv_8sc_t* complexVector, unsigned int num_points); ++static inline void volk_gnsssdr_8ic_magnitude_squared_8i_u_orc(char* magnitudeVector, const lv_8sc_t* complexVector, unsigned int num_points){ ++ /* Thin wrapper: forwards all arguments unchanged to the externally defined _a_orc_impl routine. */ ++ volk_gnsssdr_8ic_magnitude_squared_8i_a_orc_impl(magnitudeVector, complexVector, num_points); ++} ++#endif /* LV_HAVE_ORC */ ++ ++#endif /* INCLUDED_volk_gnsssdr_8ic_magnitude_squared_8i_a_H */ +diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_s8ic_multiply_8ic.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_s8ic_multiply_8ic.h +--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_s8ic_multiply_8ic.h 1970-01-01 01:00:00.000000000 +0100 ++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_s8ic_multiply_8ic.h 2014-10-15 01:55:08.000000000 +0200 +@@ -0,0 +1,271 @@ ++/*! ++ * \file volk_gnsssdr_8ic_s8ic_multiply_8ic.h ++ * \brief Volk protokernel: multiplies a group of 16 bits vectors by one constant vector ++ * \authors
    ++ *
  • Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com ++ *
++ * ++ * Volk protokernel that multiplies a group of 16 bits vectors ++ * (8 bits the real part and 8 bits the imaginary part) by one constant vector ++ * ++ * ------------------------------------------------------------------------- ++ * ++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) ++ * ++ * GNSS-SDR is a software defined Global Navigation ++ * Satellite Systems receiver ++ * ++ * This file is part of GNSS-SDR. ++ * ++ * GNSS-SDR is free software: you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation, either version 3 of the License, or ++ * (at your option) any later version. ++ * ++ * GNSS-SDR is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>. ++ * ++ * ------------------------------------------------------------------------- ++ */ ++ ++#ifndef INCLUDED_volk_gnsssdr_8ic_s8ic_multiply_8ic_u_H ++#define INCLUDED_volk_gnsssdr_8ic_s8ic_multiply_8ic_u_H ++ ++#include ++#include ++#include ++#include ++ ++#ifdef LV_HAVE_SSE3 ++#include ++/*! 
++ \brief Multiplies the input vector by a scalar and stores the results in the third vector ++ \param cVector The vector where the results will be stored ++ \param aVector The vector to be multiplied ++ \param scalar The complex scalar to multiply aVector ++ \param num_points The number of complex values in aVector to be multiplied by sacalar and stored into cVector ++ */ ++static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_u_sse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t scalar, unsigned int num_points){ ++ ++ /* One 128-bit pass handles 8 lv_8sc_t elements (a and c advance by 8 per iteration). */ ++ const unsigned int sse_iters = num_points / 8; ++ ++ __m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc; ++ ++ lv_8sc_t* c = cVector; ++ const lv_8sc_t* a = aVector; ++ ++ /* mult1 keeps the low byte of every 16-bit lane (mask 0x00FF per lane). */ ++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); ++ ++ /* Broadcast the scalar's 16 bits to all lanes, then split it: realy holds the even byte and imagy the odd byte of each lane, each in the low byte of a 16-bit lane. */ ++ y = _mm_set1_epi16 (*(short*)&scalar); ++ imagy = _mm_srli_si128 (y, 1); ++ imagy = _mm_and_si128 (imagy, mult1); ++ realy = _mm_and_si128 (y, mult1); ++ ++ for(int number = 0;number < sse_iters; number++){ ++ ++ x = _mm_lddqu_si128((__m128i*)a); ++ ++ /* Same even/odd byte split for the input samples. */ ++ imagx = _mm_srli_si128 (x, 1); ++ imagx = _mm_and_si128 (imagx, mult1); ++ realx = _mm_and_si128 (x, mult1); ++ ++ /* The four partial products are formed in 16-bit lanes. */ ++ realx_mult_realy = _mm_mullo_epi16 (realx, realy); ++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); ++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); ++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); ++ ++ /* Only the low 8 bits of each result are kept; the imaginary byte is shifted back to the odd position before merging. */ ++ realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++ realc = _mm_and_si128 (realc, mult1); ++ imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++ imagc = _mm_and_si128 (imagc, mult1); ++ imagc = _mm_slli_si128 (imagc, 1); ++ ++ totalc = _mm_or_si128 (realc, imagc); ++ ++ _mm_storeu_si128((__m128i*)c, totalc); ++ ++ a += 8; ++ c += 8; ++ } ++ ++ /* Scalar tail for the num_points % 8 remaining elements. */ ++ for (int i = 0; i<(num_points % 8); ++i) ++ { ++ *c++ = (*a++) * scalar; ++ } ++ ++} ++#endif /* LV_HAVE_SSE3 */ ++ ++#ifdef LV_HAVE_GENERIC ++/*! 
++ \brief Multiplies the input vector by a scalar and stores the results in the third vector ++ \param cVector The vector where the results will be stored ++ \param aVector The vector to be multiplied ++ \param scalar The complex scalar to multiply aVector ++ \param num_points The number of complex values in aVector to be multiplied by scalar and stored into cVector ++ */ ++static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_generic(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t scalar, unsigned int num_points){ ++ ++ lv_8sc_t* cPtr = cVector; ++ const lv_8sc_t* aPtr = aVector; ++ /* number counts the elements still to be processed. */ ++ unsigned int number = num_points; ++ ++ /* Main loop, unrolled by hand: eight products per pass. */ ++ while (number >= 8){ ++ *cPtr++ = (*aPtr++) * scalar; ++ *cPtr++ = (*aPtr++) * scalar; ++ *cPtr++ = (*aPtr++) * scalar; ++ *cPtr++ = (*aPtr++) * scalar; ++ *cPtr++ = (*aPtr++) * scalar; ++ *cPtr++ = (*aPtr++) * scalar; ++ *cPtr++ = (*aPtr++) * scalar; ++ *cPtr++ = (*aPtr++) * scalar; ++ number -= 8; ++ } ++ ++ /* clean up any remaining (fewer than eight) elements */ ++ while (number-- > 0) ++ *cPtr++ = *aPtr++ * scalar; ++} ++#endif /* LV_HAVE_GENERIC */ ++ ++#endif /* INCLUDED_volk_gnsssdr_8ic_s8ic_multiply_8ic_u_H */ ++ ++ ++#ifndef INCLUDED_volk_gnsssdr_8ic_s8ic_multiply_8ic_a_H ++#define INCLUDED_volk_gnsssdr_8ic_s8ic_multiply_8ic_a_H ++ ++#include ++#include ++#include ++#include ++ ++#ifdef LV_HAVE_SSE3 ++#include ++/*! 
++ \brief Multiplies the input vector by a scalar and stores the results in the third vector ++ \param cVector The vector where the results will be stored ++ \param aVector The vector to be multiplied ++ \param scalar The complex scalar to multiply aVector ++ \param num_points The number of complex values in aVector to be multiplied by sacalar and stored into cVector ++ */ ++static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_a_sse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t scalar, unsigned int num_points){ ++ ++ /* One 128-bit pass handles 8 lv_8sc_t elements (a and c advance by 8 per iteration). */ ++ const unsigned int sse_iters = num_points / 8; ++ ++ __m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc; ++ ++ lv_8sc_t* c = cVector; ++ const lv_8sc_t* a = aVector; ++ ++ /* mult1 keeps the low byte of every 16-bit lane (mask 0x00FF per lane). */ ++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); ++ ++ /* Broadcast the scalar's 16 bits to all lanes, then split it: realy holds the even byte and imagy the odd byte of each lane, each in the low byte of a 16-bit lane. */ ++ y = _mm_set1_epi16 (*(short*)&scalar); ++ imagy = _mm_srli_si128 (y, 1); ++ imagy = _mm_and_si128 (imagy, mult1); ++ realy = _mm_and_si128 (y, mult1); ++ ++ for(int number = 0;number < sse_iters; number++){ ++ ++ /* Aligned load: aVector must be 16-byte aligned. */ ++ x = _mm_load_si128((__m128i*)a); ++ ++ imagx = _mm_srli_si128 (x, 1); ++ imagx = _mm_and_si128 (imagx, mult1); ++ realx = _mm_and_si128 (x, mult1); ++ ++ /* The four partial products are formed in 16-bit lanes. */ ++ realx_mult_realy = _mm_mullo_epi16 (realx, realy); ++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); ++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); ++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); ++ ++ /* Only the low 8 bits of each result are kept; the imaginary byte is shifted back to the odd position before merging. */ ++ realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++ realc = _mm_and_si128 (realc, mult1); ++ imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++ imagc = _mm_and_si128 (imagc, mult1); ++ imagc = _mm_slli_si128 (imagc, 1); ++ ++ totalc = _mm_or_si128 (realc, imagc); ++ ++ /* Aligned store: cVector must be 16-byte aligned. */ ++ _mm_store_si128((__m128i*)c, totalc); ++ ++ a += 8; ++ c += 8; ++ } ++ ++ /* Scalar tail for the num_points % 8 remaining elements. */ ++ for (int i = 0; i<(num_points % 8); ++i) ++ { ++ *c++ = (*a++) * scalar; ++ } ++ ++} ++#endif /* LV_HAVE_SSE3 */ ++ ++#ifdef LV_HAVE_GENERIC ++/*! 
++ \brief Multiplies the input vector by a scalar and stores the results in the third vector ++ \param cVector The vector where the results will be stored ++ \param aVector The vector to be multiplied ++ \param scalar The complex scalar to multiply aVector ++ \param num_points The number of complex values in aVector to be multiplied by scalar and stored into cVector ++ */ ++static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_a_generic(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t scalar, unsigned int num_points){ ++ ++ lv_8sc_t* cPtr = cVector; ++ const lv_8sc_t* aPtr = aVector; ++ /* number counts the elements still to be processed. */ ++ unsigned int number = num_points; ++ ++ /* Main loop, unrolled by hand: eight products per pass. */ ++ while (number >= 8){ ++ *cPtr++ = (*aPtr++) * scalar; ++ *cPtr++ = (*aPtr++) * scalar; ++ *cPtr++ = (*aPtr++) * scalar; ++ *cPtr++ = (*aPtr++) * scalar; ++ *cPtr++ = (*aPtr++) * scalar; ++ *cPtr++ = (*aPtr++) * scalar; ++ *cPtr++ = (*aPtr++) * scalar; ++ *cPtr++ = (*aPtr++) * scalar; ++ number -= 8; ++ } ++ ++ /* clean up any remaining (fewer than eight) elements */ ++ while (number-- > 0) ++ *cPtr++ = *aPtr++ * scalar; ++} ++#endif /* LV_HAVE_GENERIC */ ++ ++#ifdef LV_HAVE_ORC ++/*! 
++ \brief Multiplies the input vector by a scalar and stores the results in the third vector ++ \param cVector The vector where the results will be stored ++ \param aVector The vector to be multiplied ++ \param scalar The complex scalar to multiply aVector ++ \param num_points The number of complex values in aVector to be multiplied by sacalar and stored into cVector ++ */ ++extern void volk_gnsssdr_8ic_s8ic_multiply_8ic_a_orc_impl(lv_8sc_t* cVector, const lv_8sc_t* aVector, const char scalarreal, const char scalarimag, unsigned int num_points); ++static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_u_orc(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t scalar, unsigned int num_points){ ++ /* Thin wrapper: the scalar is passed to the external _a_orc_impl routine as separate real and imaginary parts (lv_creal / lv_cimag). */ ++ volk_gnsssdr_8ic_s8ic_multiply_8ic_a_orc_impl(cVector, aVector, lv_creal(scalar), lv_cimag(scalar), num_points); ++} ++#endif /* LV_HAVE_ORC */ ++ ++#endif /* INCLUDED_volk_gnsssdr_8ic_s8ic_multiply_8ic_a_H */ +diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_dot_prod_8ic.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_dot_prod_8ic.h +--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_dot_prod_8ic.h 1970-01-01 01:00:00.000000000 +0100 ++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_dot_prod_8ic.h 2014-10-15 01:55:08.000000000 +0200 +@@ -0,0 +1,499 @@ ++/*! ++ * \file volk_gnsssdr_8ic_x2_dot_prod_8ic.h ++ * \brief Volk protokernel: multiplies two 16 bits vectors and accumulates them ++ * \authors
    ++ *
  • Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com ++ *
++ * ++ * Volk protokernel that multiplies two 16 bits vectors (8 bits the real part ++ * and 8 bits the imaginary part) and accumulates them ++ * ++ * ------------------------------------------------------------------------- ++ * ++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) ++ * ++ * GNSS-SDR is a software defined Global Navigation ++ * Satellite Systems receiver ++ * ++ * This file is part of GNSS-SDR. ++ * ++ * GNSS-SDR is free software: you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation, either version 3 of the License, or ++ * (at your option) any later version. ++ * ++ * GNSS-SDR is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>. ++ * ++ * ------------------------------------------------------------------------- ++ */ ++ ++#ifndef INCLUDED_volk_gnsssdr_8ic_x2_dot_prod_8ic_u_H ++#define INCLUDED_volk_gnsssdr_8ic_x2_dot_prod_8ic_u_H ++ ++#include ++#include ++#include ++#include ++ ++#ifdef LV_HAVE_GENERIC ++/*! 
++ \brief Multiplies the two input complex vectors and accumulates them, storing the result in the third vector ++ \param cVector The vector where the accumulated result will be stored ++ \param aVector One of the vectors to be multiplied and accumulated ++ \param bVector One of the vectors to be multiplied and accumulated ++ \param num_points The number of complex values in aVector and bVector to be multiplied together, accumulated and stored into cVector ++ */ ++static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_generic(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points) { ++ ++ /*lv_8sc_t* cPtr = result; ++ const lv_8sc_t* aPtr = input; ++ const lv_8sc_t* bPtr = taps; ++ ++ for(int number = 0; number < num_points; number++){ ++ *cPtr += (*aPtr++) * (*bPtr++); ++ }*/ ++ ++ /* The complex buffers are viewed as raw chars: byte 2k is used as the real part and byte 2k+1 as the imaginary part of element k. */ ++ char * res = (char*) result; ++ char * in = (char*) input; ++ char * tp = (char*) taps; ++ unsigned int n_2_ccomplex_blocks = num_points/2; ++ unsigned int isodd = num_points & 1; ++ ++ /* sum0 accumulates {real, imag} of the even-indexed products, sum1 of the odd-indexed ones; both are stored as char. */ ++ char sum0[2] = {0,0}; ++ char sum1[2] = {0,0}; ++ unsigned int i = 0; ++ ++ /* Two complex products per pass (4 chars consumed from each input). */ ++ for(i = 0; i < n_2_ccomplex_blocks; ++i) { ++ sum0[0] += in[0] * tp[0] - in[1] * tp[1]; ++ sum0[1] += in[0] * tp[1] + in[1] * tp[0]; ++ sum1[0] += in[2] * tp[2] - in[3] * tp[3]; ++ sum1[1] += in[2] * tp[3] + in[3] * tp[2]; ++ ++ in += 4; ++ tp += 4; ++ } ++ ++ /* Writes through res overwrite *result, so its previous value is discarded. */ ++ res[0] = sum0[0] + sum1[0]; ++ res[1] = sum0[1] + sum1[1]; ++ ++ // Cleanup if we had an odd number of points ++ for(i = 0; i < isodd; ++i) { ++ *result += input[num_points - 1] * taps[num_points - 1]; ++ } ++} ++ ++#endif /*LV_HAVE_GENERIC*/ ++ ++#ifdef LV_HAVE_SSE2 ++#include "emmintrin.h" ++/*! 
++ \brief Multiplies the two input complex vectors and accumulates them, storing the result in the third vector ++ \param cVector The vector where the accumulated result will be stored ++ \param aVector One of the vectors to be multiplied and accumulated ++ \param bVector One of the vectors to be multiplied and accumulated ++ \param num_points The number of complex values in aVector and bVector to be multiplied together, accumulated and stored into cVector ++ */ ++static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse2(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points) { ++ ++ lv_8sc_t dotProduct; ++ memset(&dotProduct, 0x0, 2*sizeof(char)); ++ ++ const lv_8sc_t* a = input; ++ const lv_8sc_t* b = taps; ++ ++ const unsigned int sse_iters = num_points/8; ++ ++ if (sse_iters>0) ++ { ++ __m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc, realcacc, imagcacc; ++ ++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); ++ realcacc = _mm_setzero_si128(); ++ imagcacc = _mm_setzero_si128(); ++ ++ for(int number = 0; number < sse_iters; number++){ ++ ++ x = _mm_lddqu_si128((__m128i*)a); ++ y = _mm_lddqu_si128((__m128i*)b); ++ ++ imagx = _mm_srli_si128 (x, 1); ++ imagx = _mm_and_si128 (imagx, mult1); ++ realx = _mm_and_si128 (x, mult1); ++ ++ imagy = _mm_srli_si128 (y, 1); ++ imagy = _mm_and_si128 (imagy, mult1); ++ realy = _mm_and_si128 (y, mult1); ++ ++ realx_mult_realy = _mm_mullo_epi16 (realx, realy); ++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); ++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); ++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); ++ ++ realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++ imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++ ++ realcacc = _mm_add_epi16 (realcacc, realc); ++ imagcacc = _mm_add_epi16 (imagcacc, imagc); ++ ++ a += 8; ++ b += 8; ++ } 
++ ++ realcacc = _mm_and_si128 (realcacc, mult1); ++ imagcacc = _mm_and_si128 (imagcacc, mult1); ++ imagcacc = _mm_slli_si128 (imagcacc, 1); ++ ++ totalc = _mm_or_si128 (realcacc, imagcacc); ++ ++ __VOLK_ATTR_ALIGNED(16) lv_8sc_t dotProductVector[8]; ++ ++ _mm_storeu_si128((__m128i*)dotProductVector,totalc); // Store the results back into the dot product vector ++ ++ for (int i = 0; i<8; ++i) ++ { ++ dotProduct += dotProductVector[i]; ++ } ++ } ++ ++ for (int i = 0; i<(num_points % 8); ++i) ++ { ++ dotProduct += (*a++) * (*b++); ++ } ++ ++ *result = dotProduct; ++} ++ ++#endif /*LV_HAVE_SSE2*/ ++ ++#ifdef LV_HAVE_SSE4_1 ++#include "smmintrin.h" ++/*! ++ \brief Multiplies the two input complex vectors and accumulates them, storing the result in the third vector ++ \param cVector The vector where the accumulated result will be stored ++ \param aVector One of the vectors to be multiplied and accumulated ++ \param bVector One of the vectors to be multiplied and accumulated ++ \param num_points The number of complex values in aVector and bVector to be multiplied together, accumulated and stored into cVector ++ */ ++static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse4_1(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points) { ++ ++ lv_8sc_t dotProduct; ++ memset(&dotProduct, 0x0, 2*sizeof(char)); ++ ++ const lv_8sc_t* a = input; ++ const lv_8sc_t* b = taps; ++ ++ const unsigned int sse_iters = num_points/8; ++ ++ if (sse_iters>0) ++ { ++ __m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc, realcacc, imagcacc; ++ ++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); ++ realcacc = _mm_setzero_si128(); ++ imagcacc = _mm_setzero_si128(); ++ ++ for(int number = 0; number < sse_iters; number++){ ++ ++ x = _mm_lddqu_si128((__m128i*)a); ++ y = _mm_lddqu_si128((__m128i*)b); ++ ++ imagx = _mm_srli_si128 (x, 1); ++ imagx 
= _mm_and_si128 (imagx, mult1); ++ realx = _mm_and_si128 (x, mult1); ++ ++ imagy = _mm_srli_si128 (y, 1); ++ imagy = _mm_and_si128 (imagy, mult1); ++ realy = _mm_and_si128 (y, mult1); ++ ++ realx_mult_realy = _mm_mullo_epi16 (realx, realy); ++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); ++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); ++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); ++ ++ realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++ imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++ ++ realcacc = _mm_add_epi16 (realcacc, realc); ++ imagcacc = _mm_add_epi16 (imagcacc, imagc); ++ ++ a += 8; ++ b += 8; ++ } ++ ++ imagcacc = _mm_slli_si128 (imagcacc, 1); ++ ++ totalc = _mm_blendv_epi8 (imagcacc, realcacc, mult1); ++ ++ __VOLK_ATTR_ALIGNED(16) lv_8sc_t dotProductVector[8]; ++ ++ _mm_storeu_si128((__m128i*)dotProductVector,totalc); // Store the results back into the dot product vector ++ ++ for (int i = 0; i<8; ++i) ++ { ++ dotProduct += dotProductVector[i]; ++ } ++ } ++ ++ for (int i = 0; i<(num_points % 8); ++i) ++ { ++ dotProduct += (*a++) * (*b++); ++ } ++ ++ *result = dotProduct; ++} ++ ++#endif /*LV_HAVE_SSE4_1*/ ++ ++#endif /*INCLUDED_volk_gnsssdr_8ic_x2_dot_prod_8ic_u_H*/ ++ ++ ++#ifndef INCLUDED_volk_gnsssdr_8ic_x2_dot_prod_8ic_a_H ++#define INCLUDED_volk_gnsssdr_8ic_x2_dot_prod_8ic_a_H ++ ++#include ++#include ++#include ++#include ++ ++ ++#ifdef LV_HAVE_GENERIC ++/*! 
++ \brief Multiplies the two input complex vectors and accumulates them, storing the result in the third vector ++ \param cVector The vector where the accumulated result will be stored ++ \param aVector One of the vectors to be multiplied and accumulated ++ \param bVector One of the vectors to be multiplied and accumulated ++ \param num_points The number of complex values in aVector and bVector to be multiplied together, accumulated and stored into cVector ++ */ ++static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_generic(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points) { ++ ++ /*lv_8sc_t* cPtr = result; ++ const lv_8sc_t* aPtr = input; ++ const lv_8sc_t* bPtr = taps; ++ ++ for(int number = 0; number < num_points; number++){ ++ *cPtr += (*aPtr++) * (*bPtr++); ++ }*/ ++ ++ char * res = (char*) result; ++ char * in = (char*) input; ++ char * tp = (char*) taps; ++ unsigned int n_2_ccomplex_blocks = num_points/2; ++ unsigned int isodd = num_points & 1; ++ ++ char sum0[2] = {0,0}; ++ char sum1[2] = {0,0}; ++ unsigned int i = 0; ++ ++ for(i = 0; i < n_2_ccomplex_blocks; ++i) { ++ sum0[0] += in[0] * tp[0] - in[1] * tp[1]; ++ sum0[1] += in[0] * tp[1] + in[1] * tp[0]; ++ sum1[0] += in[2] * tp[2] - in[3] * tp[3]; ++ sum1[1] += in[2] * tp[3] + in[3] * tp[2]; ++ ++ in += 4; ++ tp += 4; ++ } ++ ++ res[0] = sum0[0] + sum1[0]; ++ res[1] = sum0[1] + sum1[1]; ++ ++ // Cleanup if we had an odd number of points ++ for(i = 0; i < isodd; ++i) { ++ *result += input[num_points - 1] * taps[num_points - 1]; ++ } ++} ++ ++#endif /*LV_HAVE_GENERIC*/ ++ ++#ifdef LV_HAVE_SSE2 ++#include "emmintrin.h" ++/*! 
++ \brief Multiplies the two input complex vectors and accumulates them, storing the result in the third vector ++ \param cVector The vector where the accumulated result will be stored ++ \param aVector One of the vectors to be multiplied and accumulated ++ \param bVector One of the vectors to be multiplied and accumulated ++ \param num_points The number of complex values in aVector and bVector to be multiplied together, accumulated and stored into cVector ++ */ ++static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_sse2(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points) { ++ ++ lv_8sc_t dotProduct; ++ memset(&dotProduct, 0x0, 2*sizeof(char)); ++ ++ const lv_8sc_t* a = input; ++ const lv_8sc_t* b = taps; ++ ++ const unsigned int sse_iters = num_points/8; ++ ++ if (sse_iters>0) ++ { ++ __m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc, realcacc, imagcacc; ++ ++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); ++ realcacc = _mm_setzero_si128(); ++ imagcacc = _mm_setzero_si128(); ++ ++ for(int number = 0; number < sse_iters; number++){ ++ ++ x = _mm_load_si128((__m128i*)a); ++ y = _mm_load_si128((__m128i*)b); ++ ++ imagx = _mm_srli_si128 (x, 1); ++ imagx = _mm_and_si128 (imagx, mult1); ++ realx = _mm_and_si128 (x, mult1); ++ ++ imagy = _mm_srli_si128 (y, 1); ++ imagy = _mm_and_si128 (imagy, mult1); ++ realy = _mm_and_si128 (y, mult1); ++ ++ realx_mult_realy = _mm_mullo_epi16 (realx, realy); ++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); ++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); ++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); ++ ++ realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++ imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++ ++ realcacc = _mm_add_epi16 (realcacc, realc); ++ imagcacc = _mm_add_epi16 (imagcacc, imagc); ++ ++ a += 8; ++ b += 8; ++ } ++ 
++ realcacc = _mm_and_si128 (realcacc, mult1); ++ imagcacc = _mm_and_si128 (imagcacc, mult1); ++ imagcacc = _mm_slli_si128 (imagcacc, 1); ++ ++ totalc = _mm_or_si128 (realcacc, imagcacc); ++ ++ __VOLK_ATTR_ALIGNED(16) lv_8sc_t dotProductVector[8]; ++ ++ _mm_store_si128((__m128i*)dotProductVector,totalc); // Store the results back into the dot product vector ++ ++ for (int i = 0; i<8; ++i) ++ { ++ dotProduct += dotProductVector[i]; ++ } ++ } ++ ++ for (int i = 0; i<(num_points % 8); ++i) ++ { ++ dotProduct += (*a++) * (*b++); ++ } ++ ++ *result = dotProduct; ++} ++ ++#endif /*LV_HAVE_SSE2*/ ++ ++#ifdef LV_HAVE_SSE4_1 ++#include "smmintrin.h" ++/*! ++ \brief Multiplies the two input complex vectors and accumulates them, storing the result in the third vector ++ \param cVector The vector where the accumulated result will be stored ++ \param aVector One of the vectors to be multiplied and accumulated ++ \param bVector One of the vectors to be multiplied and accumulated ++ \param num_points The number of complex values in aVector and bVector to be multiplied together, accumulated and stored into cVector ++ */ ++static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_sse4_1(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points) { ++ ++ lv_8sc_t dotProduct; ++ memset(&dotProduct, 0x0, 2*sizeof(char)); ++ ++ const lv_8sc_t* a = input; ++ const lv_8sc_t* b = taps; ++ ++ const unsigned int sse_iters = num_points/8; ++ ++ if (sse_iters>0) ++ { ++ __m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc, realcacc, imagcacc; ++ ++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); ++ realcacc = _mm_setzero_si128(); ++ imagcacc = _mm_setzero_si128(); ++ ++ for(int number = 0; number < sse_iters; number++){ ++ ++ x = _mm_load_si128((__m128i*)a); ++ y = _mm_load_si128((__m128i*)b); ++ ++ imagx = _mm_srli_si128 (x, 1); ++ imagx = 
_mm_and_si128 (imagx, mult1); ++ realx = _mm_and_si128 (x, mult1); ++ ++ imagy = _mm_srli_si128 (y, 1); ++ imagy = _mm_and_si128 (imagy, mult1); ++ realy = _mm_and_si128 (y, mult1); ++ ++ realx_mult_realy = _mm_mullo_epi16 (realx, realy); ++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); ++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); ++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); ++ ++ realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++ imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++ ++ realcacc = _mm_add_epi16 (realcacc, realc); ++ imagcacc = _mm_add_epi16 (imagcacc, imagc); ++ ++ a += 8; ++ b += 8; ++ } ++ ++ imagcacc = _mm_slli_si128 (imagcacc, 1); ++ ++ totalc = _mm_blendv_epi8 (imagcacc, realcacc, mult1); ++ ++ __VOLK_ATTR_ALIGNED(16) lv_8sc_t dotProductVector[8]; ++ ++ _mm_store_si128((__m128i*)dotProductVector,totalc); // Store the results back into the dot product vector ++ ++ for (int i = 0; i<8; ++i) ++ { ++ dotProduct += dotProductVector[i]; ++ } ++ } ++ ++ for (int i = 0; i<(num_points % 8); ++i) ++ { ++ dotProduct += (*a++) * (*b++); ++ } ++ ++ *result = dotProduct; ++} ++ ++#endif /*LV_HAVE_SSE4_1*/ ++ ++#ifdef LV_HAVE_ORC ++/*! 
++ \brief Multiplies the two input complex vectors and accumulates them, storing the result in the third vector ++ \param cVector The vector where the accumulated result will be stored ++ \param aVector One of the vectors to be multiplied and accumulated ++ \param bVector One of the vectors to be multiplied and accumulated ++ \param num_points The number of complex values in aVector and bVector to be multiplied together, accumulated and stored into cVector ++ */ ++extern void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_orc_impl(short* resRealShort, short* resImagShort, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points); ++static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_orc(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points){ ++ ++ short resReal = 0; ++ char* resRealChar = (char*)&resReal; ++ resRealChar++; ++ ++ short resImag = 0; ++ char* resImagChar = (char*)&resImag; ++ resImagChar++; ++ ++ volk_gnsssdr_8ic_x2_dot_prod_8ic_a_orc_impl(&resReal, &resImag, input, taps, num_points); ++ ++ *result = lv_cmake(*resRealChar, *resImagChar); ++} ++#endif /* LV_HAVE_ORC */ ++ ++#endif /*INCLUDED_volk_gnsssdr_8ic_x2_dot_prod_8ic_a_H*/ +diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_multiply_8ic.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_multiply_8ic.h +--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_multiply_8ic.h 1970-01-01 01:00:00.000000000 +0100 ++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_multiply_8ic.h 2014-10-15 01:55:08.000000000 +0200 +@@ -0,0 +1,346 @@ ++/*! ++ * \file volk_gnsssdr_8ic_x2_multiply_8ic.h ++ * \brief Volk protokernel: multiplies two 16 bits vectors ++ * \authors
<ul>
++ *          <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
++ *          </ul>
++ * ++ * Volk protokernel that multiplies two 16 bits vectors (8 bits the real part ++ * and 8 bits the imaginary part) ++ * ++ * ------------------------------------------------------------------------- ++ * ++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) ++ * ++ * GNSS-SDR is a software defined Global Navigation ++ * Satellite Systems receiver ++ * ++ * This file is part of GNSS-SDR. ++ * ++ * GNSS-SDR is free software: you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation, either version 3 of the License, or ++ * at your option) any later version. ++ * ++ * GNSS-SDR is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with GNSS-SDR. If not, see . ++ * ++ * ------------------------------------------------------------------------- ++ */ ++ ++#ifndef INCLUDED_volk_gnsssdr_8ic_x2_multiply_8ic_u_H ++#define INCLUDED_volk_gnsssdr_8ic_x2_multiply_8ic_u_H ++ ++#include ++#include ++#include ++ ++#ifdef LV_HAVE_SSE2 ++#include "emmintrin.h" ++/*! 
++ \brief Multiplies the two input complex vectors and stores their results in the third vector ++ \param cVector The vector where the results will be stored ++ \param aVector One of the vectors to be multiplied ++ \param bVector One of the vectors to be multiplied ++ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector ++ */ ++static inline void volk_gnsssdr_8ic_x2_multiply_8ic_u_sse2(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ ++ ++ const unsigned int sse_iters = num_points / 8; ++ ++ __m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc; ++ lv_8sc_t* c = cVector; ++ const lv_8sc_t* a = aVector; ++ const lv_8sc_t* b = bVector; ++ ++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); ++ ++ for(int number = 0;number < sse_iters; number++){ ++ ++ x = _mm_lddqu_si128((__m128i*)a); ++ y = _mm_lddqu_si128((__m128i*)b); ++ ++ imagx = _mm_srli_si128 (x, 1); ++ imagx = _mm_and_si128 (imagx, mult1); ++ realx = _mm_and_si128 (x, mult1); ++ ++ imagy = _mm_srli_si128 (y, 1); ++ imagy = _mm_and_si128 (imagy, mult1); ++ realy = _mm_and_si128 (y, mult1); ++ ++ realx_mult_realy = _mm_mullo_epi16 (realx, realy); ++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); ++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); ++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); ++ ++ realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++ realc = _mm_and_si128 (realc, mult1); ++ imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++ imagc = _mm_and_si128 (imagc, mult1); ++ imagc = _mm_slli_si128 (imagc, 1); ++ ++ totalc = _mm_or_si128 (realc, imagc); ++ ++ _mm_storeu_si128((__m128i*)c, totalc); ++ ++ a += 8; ++ b += 8; ++ c += 8; ++ } ++ ++ for (int i = 0; i<(num_points % 8); ++i) ++ { ++ *c++ = (*a++) * (*b++); ++ } ++} ++#endif /* 
LV_HAVE_SSE2 */ ++ ++#ifdef LV_HAVE_SSE4_1 ++#include "smmintrin.h" ++/*! ++ \brief Multiplies the two input complex vectors and stores their results in the third vector ++ \param cVector The vector where the results will be stored ++ \param aVector One of the vectors to be multiplied ++ \param bVector One of the vectors to be multiplied ++ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector ++ */ ++static inline void volk_gnsssdr_8ic_x2_multiply_8ic_u_sse4_1(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ ++ ++ const unsigned int sse_iters = num_points / 8; ++ ++ __m128i x, y, zero; ++ __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc; ++ lv_8sc_t* c = cVector; ++ const lv_8sc_t* a = aVector; ++ const lv_8sc_t* b = bVector; ++ ++ zero = _mm_setzero_si128(); ++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); ++ ++ for(int number = 0;number < sse_iters; number++){ ++ ++ x = _mm_lddqu_si128((__m128i*)a); ++ y = _mm_lddqu_si128((__m128i*)b); ++ ++ imagx = _mm_srli_si128 (x, 1); ++ imagx = _mm_and_si128 (imagx, mult1); ++ realx = _mm_and_si128 (x, mult1); ++ ++ imagy = _mm_srli_si128 (y, 1); ++ imagy = _mm_and_si128 (imagy, mult1); ++ realy = _mm_and_si128 (y, mult1); ++ ++ realx_mult_realy = _mm_mullo_epi16 (realx, realy); ++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); ++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); ++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); ++ ++ realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++ imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++ imagc = _mm_slli_si128 (imagc, 1); ++ ++ totalc = _mm_blendv_epi8 (imagc, realc, mult1); ++ ++ _mm_storeu_si128((__m128i*)c, totalc); ++ ++ a += 8; ++ b += 8; ++ c += 8; ++ } ++ ++ for (int i = 0; i<(num_points % 8); ++i) ++ { 
++ *c++ = (*a++) * (*b++); ++ } ++} ++#endif /* LV_HAVE_SSE4_1 */ ++ ++#ifdef LV_HAVE_GENERIC ++/*! ++ \brief Multiplies the two input complex vectors and stores their results in the third vector ++ \param cVector The vector where the results will be stored ++ \param aVector One of the vectors to be multiplied ++ \param bVector One of the vectors to be multiplied ++ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector ++ */ ++static inline void volk_gnsssdr_8ic_x2_multiply_8ic_generic(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ ++ lv_8sc_t* cPtr = cVector; ++ const lv_8sc_t* aPtr = aVector; ++ const lv_8sc_t* bPtr = bVector; ++ ++ for(int number = 0; number < num_points; number++){ ++ *cPtr++ = (*aPtr++) * (*bPtr++); ++ } ++} ++#endif /* LV_HAVE_GENERIC */ ++ ++#endif /* INCLUDED_volk_gnsssdr_8ic_x2_multiply_8ic_u_H */ ++ ++ ++#ifndef INCLUDED_volk_gnsssdr_8ic_x2_multiply_8ic_a_H ++#define INCLUDED_volk_gnsssdr_8ic_x2_multiply_8ic_a_H ++ ++#include ++#include ++#include ++ ++#ifdef LV_HAVE_SSE2 ++#include "emmintrin.h" ++/*! 
++ \brief Multiplies the two input complex vectors and stores their results in the third vector ++ \param cVector The vector where the results will be stored ++ \param aVector One of the vectors to be multiplied ++ \param bVector One of the vectors to be multiplied ++ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector ++ */ ++static inline void volk_gnsssdr_8ic_x2_multiply_8ic_a_sse2(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ ++ ++ const unsigned int sse_iters = num_points / 8; ++ ++ __m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc; ++ lv_8sc_t* c = cVector; ++ const lv_8sc_t* a = aVector; ++ const lv_8sc_t* b = bVector; ++ ++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); ++ ++ for(int number = 0;number < sse_iters; number++){ ++ ++ x = _mm_load_si128((__m128i*)a); ++ y = _mm_load_si128((__m128i*)b); ++ ++ imagx = _mm_srli_si128 (x, 1); ++ imagx = _mm_and_si128 (imagx, mult1); ++ realx = _mm_and_si128 (x, mult1); ++ ++ imagy = _mm_srli_si128 (y, 1); ++ imagy = _mm_and_si128 (imagy, mult1); ++ realy = _mm_and_si128 (y, mult1); ++ ++ realx_mult_realy = _mm_mullo_epi16 (realx, realy); ++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); ++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); ++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); ++ ++ realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++ realc = _mm_and_si128 (realc, mult1); ++ imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++ imagc = _mm_and_si128 (imagc, mult1); ++ imagc = _mm_slli_si128 (imagc, 1); ++ ++ totalc = _mm_or_si128 (realc, imagc); ++ ++ _mm_store_si128((__m128i*)c, totalc); ++ ++ a += 8; ++ b += 8; ++ c += 8; ++ } ++ ++ for (int i = 0; i<(num_points % 8); ++i) ++ { ++ *c++ = (*a++) * (*b++); ++ } ++} ++#endif /* 
LV_HAVE_SSE2 */ ++ ++#ifdef LV_HAVE_SSE4_1 ++#include "smmintrin.h" ++/*! ++ \brief Multiplies the two input complex vectors and stores their results in the third vector ++ \param cVector The vector where the results will be stored ++ \param aVector One of the vectors to be multiplied ++ \param bVector One of the vectors to be multiplied ++ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector ++ */ ++static inline void volk_gnsssdr_8ic_x2_multiply_8ic_a_sse4_1(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ ++ ++ const unsigned int sse_iters = num_points / 8; ++ ++ __m128i x, y, zero; ++ __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc; ++ lv_8sc_t* c = cVector; ++ const lv_8sc_t* a = aVector; ++ const lv_8sc_t* b = bVector; ++ ++ zero = _mm_setzero_si128(); ++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); ++ ++ for(int number = 0;number < sse_iters; number++){ ++ ++ x = _mm_load_si128((__m128i*)a); ++ y = _mm_load_si128((__m128i*)b); ++ ++ imagx = _mm_srli_si128 (x, 1); ++ imagx = _mm_and_si128 (imagx, mult1); ++ realx = _mm_and_si128 (x, mult1); ++ ++ imagy = _mm_srli_si128 (y, 1); ++ imagy = _mm_and_si128 (imagy, mult1); ++ realy = _mm_and_si128 (y, mult1); ++ ++ realx_mult_realy = _mm_mullo_epi16 (realx, realy); ++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); ++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); ++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); ++ ++ realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++ imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++ imagc = _mm_slli_si128 (imagc, 1); ++ ++ totalc = _mm_blendv_epi8 (imagc, realc, mult1); ++ ++ _mm_store_si128((__m128i*)c, totalc); ++ ++ a += 8; ++ b += 8; ++ c += 8; ++ } ++ ++ for (int i = 0; i<(num_points % 8); ++i) ++ { ++ 
*c++ = (*a++) * (*b++); ++ } ++} ++#endif /* LV_HAVE_SSE4_1 */ ++ ++#ifdef LV_HAVE_GENERIC ++/*! ++ \brief Multiplies the two input complex vectors and stores their results in the third vector ++ \param cVector The vector where the results will be stored ++ \param aVector One of the vectors to be multiplied ++ \param bVector One of the vectors to be multiplied ++ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector ++ */ ++static inline void volk_gnsssdr_8ic_x2_multiply_8ic_a_generic(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ ++ lv_8sc_t* cPtr = cVector; ++ const lv_8sc_t* aPtr = aVector; ++ const lv_8sc_t* bPtr = bVector; ++ ++ for(int number = 0; number < num_points; number++){ ++ *cPtr++ = (*aPtr++) * (*bPtr++); ++ } ++ ++} ++#endif /* LV_HAVE_GENERIC */ ++ ++#ifdef LV_HAVE_ORC ++/*! ++ \brief Multiplies the two input complex vectors and stores their results in the third vector ++ \param cVector The vector where the results will be stored ++ \param aVector One of the vectors to be multiplied ++ \param bVector One of the vectors to be multiplied ++ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector ++ */ ++extern void volk_gnsssdr_8ic_x2_multiply_8ic_a_orc_impl(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points); ++static inline void volk_gnsssdr_8ic_x2_multiply_8ic_u_orc(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ ++ volk_gnsssdr_8ic_x2_multiply_8ic_a_orc_impl(cVector, aVector, bVector, num_points); ++} ++#endif /* LV_HAVE_ORC */ ++ ++#endif /* INCLUDED_volk_gnsssdr_8ic_x2_multiply_8ic_a_H */ +diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3.h 
/Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3.h +--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3.h 1970-01-01 01:00:00.000000000 +0100 ++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3.h 2014-10-15 01:55:08.000000000 +0200 +@@ -0,0 +1,613 @@ ++/*! ++ * \file volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3.h ++ * \brief Volk protokernel: performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation with 16 bits vectors, and accumulates the results into float32. ++ * \authors
<ul>
++ *          <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
++ *          </ul>
++ * ++ * Volk protokernel that performs the carrier wipe-off mixing and the ++ * Early, Prompt, and Late correlation with 16 bits vectors (8 bits the ++ * real part and 8 bits the imaginary part), and accumulates the result ++ * in 32 bits single point values, returning float32 values: ++ * - The carrier wipe-off is done by multiplying the input signal by the ++ * carrier (multiplication of 16 bits vectors) It returns the input ++ * signal in base band (BB) ++ * - Early values are calculated by multiplying the input signal in BB by the ++ * early code (multiplication of 16 bits vectors), accumulating the results into float32 values ++ * - Prompt values are calculated by multiplying the input signal in BB by the ++ * prompt code (multiplication of 16 bits vectors), accumulating the results into float32 values ++ * - Late values are calculated by multiplying the input signal in BB by the ++ * late code (multiplication of 16 bits vectors), accumulating the results into float32 values ++ * ++ * ------------------------------------------------------------------------- ++ * ++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) ++ * ++ * GNSS-SDR is a software defined Global Navigation ++ * Satellite Systems receiver ++ * ++ * This file is part of GNSS-SDR. ++ * ++ * GNSS-SDR is free software: you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation, either version 3 of the License, or ++ * at your option) any later version. ++ * ++ * GNSS-SDR is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with GNSS-SDR. If not, see . 
++ * ++ * ------------------------------------------------------------------------- ++ */ ++ ++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_u_H ++#define INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_u_H ++ ++#include ++#include ++#include ++#include ++#include ++ ++#ifdef LV_HAVE_SSE4_1 ++#include "smmintrin.h" ++#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h" ++#include "CommonMacros/CommonMacros.h" ++/*! ++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation ++ \param input The input signal input ++ \param carrier The carrier signal input ++ \param E_code Early PRN code replica input ++ \param P_code Early PRN code replica input ++ \param L_code Early PRN code replica input ++ \param E_out Early correlation output ++ \param P_out Early correlation output ++ \param L_out Early correlation output ++ \param num_points The number of complex values in vectors ++ */ ++static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_u_sse4_1(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points) ++{ ++ const unsigned int sse_iters = num_points / 8; ++ ++ __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample; ++ __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output; ++ ++ __m128 E_code_acc, P_code_acc, L_code_acc; ++ __m128i input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2; ++ __m128 output_ps; ++ ++ const lv_8sc_t* input_ptr = input; ++ const lv_8sc_t* carrier_ptr = carrier; ++ ++ const lv_8sc_t* E_code_ptr = E_code; ++ lv_32fc_t* E_out_ptr = E_out; ++ const lv_8sc_t* L_code_ptr = L_code; ++ lv_32fc_t* L_out_ptr = L_out; ++ const lv_8sc_t* P_code_ptr = P_code; ++ lv_32fc_t* P_out_ptr = P_out; ++ ++ *E_out_ptr = 0; ++ *P_out_ptr = 0; ++ *L_out_ptr 
= 0; ++ ++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); ++ ++ E_code_acc = _mm_setzero_ps(); ++ L_code_acc = _mm_setzero_ps(); ++ P_code_acc = _mm_setzero_ps(); ++ ++ if (sse_iters>0) ++ { ++ for(int number = 0;number < sse_iters; number++){ ++ ++ //Perform the carrier wipe-off ++ x = _mm_lddqu_si128((__m128i*)input_ptr); ++ y = _mm_lddqu_si128((__m128i*)carrier_ptr); ++ ++ CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx) ++ CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy) ++ ++ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample) ++ ++ //Get early values ++ y = _mm_lddqu_si128((__m128i*)E_code_ptr); ++ ++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps) ++ ++ E_code_acc = _mm_add_ps (E_code_acc, output_ps); ++ ++ //Get prompt values ++ y = _mm_lddqu_si128((__m128i*)P_code_ptr); ++ ++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps) ++ ++ P_code_acc = _mm_add_ps (P_code_acc, output_ps); ++ ++ //Get late values ++ y = _mm_lddqu_si128((__m128i*)L_code_ptr); ++ ++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps) ++ ++ L_code_acc = _mm_add_ps (L_code_acc, output_ps); ++ ++ 
input_ptr += 8; ++ carrier_ptr += 8; ++ E_code_ptr += 8; ++ P_code_ptr += 8; ++ L_code_ptr += 8; ++ } ++ ++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2]; ++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2]; ++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2]; ++ ++ _mm_storeu_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector ++ _mm_storeu_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector ++ _mm_storeu_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector ++ ++ for (int i = 0; i<2; ++i) ++ { ++ *E_out_ptr += E_dotProductVector[i]; ++ *P_out_ptr += P_dotProductVector[i]; ++ *L_out_ptr += L_dotProductVector[i]; ++ } ++ } ++ ++ lv_8sc_t bb_signal_sample; ++ for(int i=0; i < num_points%8; ++i) ++ { ++ //Perform the carrier wipe-off ++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); ++ // Now get early, late, and prompt values for each ++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); ++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); ++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); ++ } ++} ++#endif /* LV_HAVE_SSE4_1 */ ++ ++#ifdef LV_HAVE_SSE2 ++#include "emmintrin.h" ++#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h" ++#include "CommonMacros/CommonMacros.h" ++/*! 
++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (SSE2 kernel, unaligned loads) ++ \param input The input signal input ++ \param carrier The carrier signal input ++ \param E_code Early PRN code replica input ++ \param P_code Prompt PRN code replica input ++ \param L_code Late PRN code replica input ++ \param E_out Early correlation output (one accumulated value, reset to 0 on entry) ++ \param P_out Prompt correlation output (one accumulated value, reset to 0 on entry) ++ \param L_out Late correlation output (one accumulated value, reset to 0 on entry) ++ \param num_points The number of complex values in vectors ++ */ ++static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_u_sse2(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points) ++{ ++ const unsigned int sse_iters = num_points / 8; ++ ++ __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample; ++ __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output; ++ ++ __m128 E_code_acc, P_code_acc, L_code_acc; ++ __m128i input_i_1, input_i_2, output_i32; ++ __m128 output_ps_1, output_ps_2; ++ ++ const lv_8sc_t* input_ptr = input; ++ const lv_8sc_t* carrier_ptr = carrier; ++ ++ const lv_8sc_t* E_code_ptr = E_code; ++ lv_32fc_t* E_out_ptr = E_out; ++ const lv_8sc_t* L_code_ptr = L_code; ++ lv_32fc_t* L_out_ptr = L_out; ++ const lv_8sc_t* P_code_ptr = P_code; ++ lv_32fc_t* P_out_ptr = P_out; ++ ++ *E_out_ptr = 0; ++ *P_out_ptr = 0; ++ *L_out_ptr = 0; ++ ++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); ++ ++ E_code_acc = _mm_setzero_ps(); ++ L_code_acc = _mm_setzero_ps(); ++ P_code_acc = _mm_setzero_ps(); ++ ++ if (sse_iters>0) ++ { ++ for(int number = 0;number < sse_iters; number++){ ++ ++ //Perform the carrier wipe-off (_mm_loadu_si128 is the SSE2 unaligned load; _mm_lddqu_si128 would require SSE3) ++ x = _mm_loadu_si128((__m128i*)input_ptr); ++ y = _mm_loadu_si128((__m128i*)carrier_ptr); ++ ++ 
CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx) ++ CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy) ++ ++ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample) ++ ++ //Get early values ++ y = _mm_loadu_si128((__m128i*)E_code_ptr); ++ ++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2) ++ ++ E_code_acc = _mm_add_ps (E_code_acc, output_ps_1); ++ E_code_acc = _mm_add_ps (E_code_acc, output_ps_2); ++ ++ //Get prompt values ++ y = _mm_loadu_si128((__m128i*)P_code_ptr); ++ ++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2) ++ ++ P_code_acc = _mm_add_ps (P_code_acc, output_ps_1); ++ P_code_acc = _mm_add_ps (P_code_acc, output_ps_2); ++ ++ //Get late values ++ y = _mm_loadu_si128((__m128i*)L_code_ptr); ++ ++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2) ++ ++ L_code_acc = _mm_add_ps (L_code_acc, output_ps_1); ++ L_code_acc = _mm_add_ps (L_code_acc, output_ps_2); ++ ++ input_ptr += 8; ++ carrier_ptr += 8; ++ E_code_ptr += 8; ++ P_code_ptr += 8; ++ L_code_ptr += 8; ++ } ++ ++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2]; ++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2]; ++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2]; ++ ++ 
_mm_storeu_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector ++ _mm_storeu_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector ++ _mm_storeu_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector ++ ++ for (int i = 0; i<2; ++i) ++ { ++ *E_out_ptr += E_dotProductVector[i]; ++ *P_out_ptr += P_dotProductVector[i]; ++ *L_out_ptr += L_dotProductVector[i]; ++ } ++ } ++ ++ lv_8sc_t bb_signal_sample; ++ for(int i=0; i < num_points%8; ++i) ++ { ++ //Perform the carrier wipe-off ++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); ++ // Now get early, late, and prompt values for each ++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); ++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); ++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); ++ } ++} ++#endif /* LV_HAVE_SSE2 */ ++ ++#ifdef LV_HAVE_GENERIC ++/*! ++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (generic scalar kernel) ++ \param input The input signal input ++ \param carrier The carrier signal input ++ \param E_code Early PRN code replica input ++ \param P_code Prompt PRN code replica input ++ \param L_code Late PRN code replica input ++ \param E_out Early correlation output (one accumulated value, reset to 0 on entry) ++ \param P_out Prompt correlation output (one accumulated value, reset to 0 on entry) ++ \param L_out Late correlation output (one accumulated value, reset to 0 on entry) ++ \param num_points The number of complex values in vectors ++ */ ++static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_generic(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points) ++{ ++ lv_8sc_t bb_signal_sample; ++ ++ bb_signal_sample = lv_cmake(0, 0); ++ ++ *E_out = 0; ++ *P_out = 0; ++ *L_out = 0; ++ // perform Early, Prompt and Late correlation ++ for(int i=0; i < num_points; ++i) ++ { ++ //Perform the carrier 
wipe-off ++ bb_signal_sample = input[i] * carrier[i]; ++ // Now get early, late, and prompt values for each ++ *E_out += (lv_32fc_t) (bb_signal_sample * E_code[i]); ++ *P_out += (lv_32fc_t) (bb_signal_sample * P_code[i]); ++ *L_out += (lv_32fc_t) (bb_signal_sample * L_code[i]); ++ } ++} ++ ++#endif /* LV_HAVE_GENERIC */ ++ ++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_u_H */ ++ ++ ++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_a_H ++#define INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_a_H ++ ++#include ++#include ++#include ++#include ++#include ++ ++#ifdef LV_HAVE_SSE4_1 ++#include "smmintrin.h" ++#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h" ++#include "CommonMacros/CommonMacros.h" ++/*! ++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (SSE4.1 kernel, aligned loads) ++ \param input The input signal input ++ \param carrier The carrier signal input ++ \param E_code Early PRN code replica input ++ \param P_code Prompt PRN code replica input ++ \param L_code Late PRN code replica input ++ \param E_out Early correlation output (one accumulated value, reset to 0 on entry) ++ \param P_out Prompt correlation output (one accumulated value, reset to 0 on entry) ++ \param L_out Late correlation output (one accumulated value, reset to 0 on entry) ++ \param num_points The number of complex values in vectors ++ */ ++static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_a_sse4_1(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points) ++{ ++ const unsigned int sse_iters = num_points / 8; ++ ++ __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample; ++ __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output; ++ ++ __m128 E_code_acc, P_code_acc, L_code_acc; ++ __m128i input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2; ++ __m128 output_ps; ++ ++ const lv_8sc_t* input_ptr = input; ++ 
const lv_8sc_t* carrier_ptr = carrier; ++ ++ const lv_8sc_t* E_code_ptr = E_code; ++ lv_32fc_t* E_out_ptr = E_out; ++ const lv_8sc_t* L_code_ptr = L_code; ++ lv_32fc_t* L_out_ptr = L_out; ++ const lv_8sc_t* P_code_ptr = P_code; ++ lv_32fc_t* P_out_ptr = P_out; ++ ++ *E_out_ptr = 0; ++ *P_out_ptr = 0; ++ *L_out_ptr = 0; ++ ++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); ++ ++ E_code_acc = _mm_setzero_ps(); ++ L_code_acc = _mm_setzero_ps(); ++ P_code_acc = _mm_setzero_ps(); ++ ++ if (sse_iters>0) ++ { ++ for(int number = 0;number < sse_iters; number++){ ++ ++ //Perform the carrier wipe-off ++ x = _mm_load_si128((__m128i*)input_ptr); ++ y = _mm_load_si128((__m128i*)carrier_ptr); ++ ++ CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx) ++ CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy) ++ ++ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample) ++ ++ //Get early values ++ y = _mm_load_si128((__m128i*)E_code_ptr); ++ ++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps) ++ ++ E_code_acc = _mm_add_ps (E_code_acc, output_ps); ++ ++ //Get prompt values ++ y = _mm_load_si128((__m128i*)P_code_ptr); ++ ++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps) ++ ++ P_code_acc = _mm_add_ps (P_code_acc, output_ps); ++ ++ //Get late values ++ y = _mm_load_si128((__m128i*)L_code_ptr); ++ ++ 
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps) ++ ++ L_code_acc = _mm_add_ps (L_code_acc, output_ps); ++ ++ input_ptr += 8; ++ carrier_ptr += 8; ++ E_code_ptr += 8; ++ P_code_ptr += 8; ++ L_code_ptr += 8; ++ } ++ ++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2]; ++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2]; ++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2]; ++ ++ _mm_store_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector ++ _mm_store_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector ++ _mm_store_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector ++ ++ for (int i = 0; i<2; ++i) ++ { ++ *E_out_ptr += E_dotProductVector[i]; ++ *P_out_ptr += P_dotProductVector[i]; ++ *L_out_ptr += L_dotProductVector[i]; ++ } ++ } ++ ++ lv_8sc_t bb_signal_sample; ++ for(int i=0; i < num_points%8; ++i) ++ { ++ //Perform the carrier wipe-off ++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); ++ // Now get early, late, and prompt values for each ++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); ++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); ++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); ++ } ++} ++#endif /* LV_HAVE_SSE4_1 */ ++ ++#ifdef LV_HAVE_SSE2 ++#include "emmintrin.h" ++#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h" ++#include "CommonMacros/CommonMacros.h" ++/*! 
++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (SSE2 kernel, aligned loads) ++ \param input The input signal input ++ \param carrier The carrier signal input ++ \param E_code Early PRN code replica input ++ \param P_code Prompt PRN code replica input ++ \param L_code Late PRN code replica input ++ \param E_out Early correlation output (one accumulated value, reset to 0 on entry) ++ \param P_out Prompt correlation output (one accumulated value, reset to 0 on entry) ++ \param L_out Late correlation output (one accumulated value, reset to 0 on entry) ++ \param num_points The number of complex values in vectors ++ */ ++static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_a_sse2(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points) ++{ ++ const unsigned int sse_iters = num_points / 8; ++ ++ __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample; ++ __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output; ++ ++ __m128 E_code_acc, P_code_acc, L_code_acc; ++ __m128i input_i_1, input_i_2, output_i32; ++ __m128 output_ps_1, output_ps_2; ++ ++ const lv_8sc_t* input_ptr = input; ++ const lv_8sc_t* carrier_ptr = carrier; ++ ++ const lv_8sc_t* E_code_ptr = E_code; ++ lv_32fc_t* E_out_ptr = E_out; ++ const lv_8sc_t* L_code_ptr = L_code; ++ lv_32fc_t* L_out_ptr = L_out; ++ const lv_8sc_t* P_code_ptr = P_code; ++ lv_32fc_t* P_out_ptr = P_out; ++ ++ *E_out_ptr = 0; ++ *P_out_ptr = 0; ++ *L_out_ptr = 0; ++ ++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); ++ ++ E_code_acc = _mm_setzero_ps(); ++ L_code_acc = _mm_setzero_ps(); ++ P_code_acc = _mm_setzero_ps(); ++ ++ if (sse_iters>0) ++ { ++ for(int number = 0;number < sse_iters; number++){ ++ ++ //Perform the carrier wipe-off ++ x = _mm_load_si128((__m128i*)input_ptr); ++ y = _mm_load_si128((__m128i*)carrier_ptr); ++ ++ 
CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx) ++ CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy) ++ ++ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample) ++ ++ //Get early values ++ y = _mm_load_si128((__m128i*)E_code_ptr); ++ ++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2) ++ ++ E_code_acc = _mm_add_ps (E_code_acc, output_ps_1); ++ E_code_acc = _mm_add_ps (E_code_acc, output_ps_2); ++ ++ //Get prompt values ++ y = _mm_load_si128((__m128i*)P_code_ptr); ++ ++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2) ++ ++ P_code_acc = _mm_add_ps (P_code_acc, output_ps_1); ++ P_code_acc = _mm_add_ps (P_code_acc, output_ps_2); ++ ++ //Get late values ++ y = _mm_load_si128((__m128i*)L_code_ptr); ++ ++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2) ++ ++ L_code_acc = _mm_add_ps (L_code_acc, output_ps_1); ++ L_code_acc = _mm_add_ps (L_code_acc, output_ps_2); ++ ++ input_ptr += 8; ++ carrier_ptr += 8; ++ E_code_ptr += 8; ++ P_code_ptr += 8; ++ L_code_ptr += 8; ++ } ++ ++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2]; ++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2]; ++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2]; ++ ++ 
_mm_store_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector ++ _mm_store_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector ++ _mm_store_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector ++ ++ for (int i = 0; i<2; ++i) ++ { ++ *E_out_ptr += E_dotProductVector[i]; ++ *P_out_ptr += P_dotProductVector[i]; ++ *L_out_ptr += L_dotProductVector[i]; ++ } ++ } ++ ++ lv_8sc_t bb_signal_sample; ++ for(int i=0; i < num_points%8; ++i) ++ { ++ //Perform the carrier wipe-off ++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); ++ // Now get early, late, and prompt values for each ++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); ++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); ++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); ++ } ++} ++#endif /* LV_HAVE_SSE2 */ ++ ++#ifdef LV_HAVE_GENERIC ++/*! ++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (generic scalar kernel) ++ \param input The input signal input ++ \param carrier The carrier signal input ++ \param E_code Early PRN code replica input ++ \param P_code Prompt PRN code replica input ++ \param L_code Late PRN code replica input ++ \param E_out Early correlation output (one accumulated value, reset to 0 on entry) ++ \param P_out Prompt correlation output (one accumulated value, reset to 0 on entry) ++ \param L_out Late correlation output (one accumulated value, reset to 0 on entry) ++ \param num_points The number of complex values in vectors ++ */ ++static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_a_generic(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points) ++{ ++ lv_8sc_t bb_signal_sample; ++ ++ bb_signal_sample = lv_cmake(0, 0); ++ ++ *E_out = 0; ++ *P_out = 0; ++ *L_out = 0; ++ // perform Early, Prompt and Late correlation ++ for(int i=0; i < num_points; ++i) ++ { ++ //Perform the carrier 
wipe-off ++ bb_signal_sample = input[i] * carrier[i]; ++ // Now get early, late, and prompt values for each ++ *E_out += (lv_32fc_t) (bb_signal_sample * E_code[i]); ++ *P_out += (lv_32fc_t) (bb_signal_sample * P_code[i]); ++ *L_out += (lv_32fc_t) (bb_signal_sample * L_code[i]); ++ } ++} ++ ++#endif /* LV_HAVE_GENERIC */ ++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_a_H */ +\ No newline at end of file +diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3.h +--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3.h 1970-01-01 01:00:00.000000000 +0100 ++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3.h 2014-10-15 01:55:08.000000000 +0200 +@@ -0,0 +1,874 @@ ++/*! ++ * \file volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3.h ++ * \brief Volk protokernel: performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation with 16 bits vectors ++ * \authors
++ *          <ul><li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
++ *          </ul>
++ * ++ * Volk protokernel that performs the carrier wipe-off mixing and the ++ * Early, Prompt, and Late correlation with 16 bits vectors (8 bits the ++ * real part and 8 bits the imaginary part): ++ * - The carrier wipe-off is done by multiplying the input signal by the ++ * carrier (multiplication of 16 bits vectors). It returns the input ++ * signal in base band (BB) ++ * - Early values are calculated by multiplying the input signal in BB by the ++ * early code (multiplication of 16 bits vectors), accumulating the results ++ * - Prompt values are calculated by multiplying the input signal in BB by the ++ * prompt code (multiplication of 16 bits vectors), accumulating the results ++ * - Late values are calculated by multiplying the input signal in BB by the ++ * late code (multiplication of 16 bits vectors), accumulating the results ++ * ++ * ------------------------------------------------------------------------- ++ * ++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) ++ * ++ * GNSS-SDR is a software defined Global Navigation ++ * Satellite Systems receiver ++ * ++ * This file is part of GNSS-SDR. ++ * ++ * GNSS-SDR is free software: you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation, either version 3 of the License, or ++ * (at your option) any later version. ++ * ++ * GNSS-SDR is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>. 
++ * ++ * ------------------------------------------------------------------------- ++ */ ++ ++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_u_H ++#define INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_u_H ++ ++#include ++#include ++#include ++#include ++#include ++ ++#ifdef LV_HAVE_SSE4_1 ++#include "smmintrin.h" ++ /*! ++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (SSE4.1 kernel, unaligned loads) ++ \param input The input signal input ++ \param carrier The carrier signal input ++ \param E_code Early PRN code replica input ++ \param P_code Prompt PRN code replica input ++ \param L_code Late PRN code replica input ++ \param E_out Early correlation output (one accumulated value, reset to 0 on entry) ++ \param P_out Prompt correlation output (one accumulated value, reset to 0 on entry) ++ \param L_out Late correlation output (one accumulated value, reset to 0 on entry) ++ \param num_points The number of complex values in vectors ++ */ ++static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_u_sse4_1(lv_8sc_t* E_out, lv_8sc_t* P_out, lv_8sc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points) ++{ ++ const unsigned int sse_iters = num_points / 8; ++ ++ __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample, real_E_code_acc, imag_E_code_acc, real_L_code_acc, imag_L_code_acc, real_P_code_acc, imag_P_code_acc; ++ __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output; ++ ++ const lv_8sc_t* input_ptr = input; ++ const lv_8sc_t* carrier_ptr = carrier; ++ ++ const lv_8sc_t* E_code_ptr = E_code; ++ lv_8sc_t* E_out_ptr = E_out; ++ const lv_8sc_t* L_code_ptr = L_code; ++ lv_8sc_t* L_out_ptr = L_out; ++ const lv_8sc_t* P_code_ptr = P_code; ++ lv_8sc_t* P_out_ptr = P_out; ++ ++ *E_out_ptr = 0; ++ *P_out_ptr = 0; ++ *L_out_ptr = 0; ++ ++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); // mask: keeps the low byte of each 16-bit lane ++ ++ real_E_code_acc = _mm_setzero_si128(); ++ 
imag_E_code_acc = _mm_setzero_si128(); ++ real_L_code_acc = _mm_setzero_si128(); ++ imag_L_code_acc = _mm_setzero_si128(); ++ real_P_code_acc = _mm_setzero_si128(); ++ imag_P_code_acc = _mm_setzero_si128(); ++ ++ if (sse_iters>0) ++ { ++ for(int number = 0;number < sse_iters; number++){ ++ ++ //Perform the carrier wipe-off ++ x = _mm_lddqu_si128((__m128i*)input_ptr); ++ y = _mm_lddqu_si128((__m128i*)carrier_ptr); ++ ++ imagx = _mm_srli_si128 (x, 1); // shift right by one byte so the odd bytes land in the low byte of each 16-bit lane ++ imagx = _mm_and_si128 (imagx, mult1); ++ realx = _mm_and_si128 (x, mult1); ++ ++ imagy = _mm_srli_si128 (y, 1); ++ imagy = _mm_and_si128 (imagy, mult1); ++ realy = _mm_and_si128 (y, mult1); ++ ++ realx_mult_realy = _mm_mullo_epi16 (realx, realy); ++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); ++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); ++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); ++ ++ real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); // real part: re*re - im*im ++ imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); // imaginary part: re*im + im*re ++ ++ //Get early values ++ y = _mm_lddqu_si128((__m128i*)E_code_ptr); ++ ++ imagy = _mm_srli_si128 (y, 1); ++ imagy = _mm_and_si128 (imagy, mult1); ++ realy = _mm_and_si128 (y, mult1); ++ ++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); ++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); ++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); ++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); ++ ++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++ ++ real_E_code_acc = _mm_add_epi16 (real_E_code_acc, real_output); ++ imag_E_code_acc = _mm_add_epi16 (imag_E_code_acc, imag_output); ++ ++ //Get late values ++ y = _mm_lddqu_si128((__m128i*)L_code_ptr); ++ ++ imagy = _mm_srli_si128 (y, 1); ++ imagy = _mm_and_si128 (imagy, mult1); ++ realy = _mm_and_si128 (y, mult1); ++ ++ realx_mult_realy = _mm_mullo_epi16 
(real_bb_signal_sample, realy); ++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); ++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); ++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); ++ ++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++ ++ real_L_code_acc = _mm_add_epi16 (real_L_code_acc, real_output); ++ imag_L_code_acc = _mm_add_epi16 (imag_L_code_acc, imag_output); ++ ++ //Get prompt values ++ y = _mm_lddqu_si128((__m128i*)P_code_ptr); ++ ++ imagy = _mm_srli_si128 (y, 1); ++ imagy = _mm_and_si128 (imagy, mult1); ++ realy = _mm_and_si128 (y, mult1); ++ ++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); ++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); ++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); ++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); ++ ++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++ ++ real_P_code_acc = _mm_add_epi16 (real_P_code_acc, real_output); ++ imag_P_code_acc = _mm_add_epi16 (imag_P_code_acc, imag_output); ++ ++ input_ptr += 8; ++ carrier_ptr += 8; ++ E_code_ptr += 8; ++ L_code_ptr += 8; ++ P_code_ptr += 8; ++ } ++ ++ __VOLK_ATTR_ALIGNED(16) lv_8sc_t E_dotProductVector[8]; ++ __VOLK_ATTR_ALIGNED(16) lv_8sc_t L_dotProductVector[8]; ++ __VOLK_ATTR_ALIGNED(16) lv_8sc_t P_dotProductVector[8]; ++ ++ imag_E_code_acc = _mm_slli_si128 (imag_E_code_acc, 1); // move the imaginary sums back to the odd bytes ++ output = _mm_blendv_epi8 (imag_E_code_acc, real_E_code_acc, mult1); // even bytes from the real sums, odd bytes from the imaginary sums ++ _mm_storeu_si128((__m128i*)E_dotProductVector, output); ++ ++ imag_L_code_acc = _mm_slli_si128 (imag_L_code_acc, 1); ++ output = _mm_blendv_epi8 (imag_L_code_acc, real_L_code_acc, mult1); ++ _mm_storeu_si128((__m128i*)L_dotProductVector, output); ++ ++ imag_P_code_acc = _mm_slli_si128 (imag_P_code_acc, 1); 
++ output = _mm_blendv_epi8 (imag_P_code_acc, real_P_code_acc, mult1); ++ _mm_storeu_si128((__m128i*)P_dotProductVector, output); ++ ++ for (int i = 0; i<8; ++i) ++ { ++ *E_out_ptr += E_dotProductVector[i]; ++ *L_out_ptr += L_dotProductVector[i]; ++ *P_out_ptr += P_dotProductVector[i]; ++ } ++ } ++ ++ lv_8sc_t bb_signal_sample; ++ for(int i=0; i < num_points%8; ++i) ++ { ++ //Perform the carrier wipe-off ++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); ++ // Now get early, late, and prompt values for each ++ *E_out_ptr += bb_signal_sample * (*E_code_ptr++); ++ *P_out_ptr += bb_signal_sample * (*P_code_ptr++); ++ *L_out_ptr += bb_signal_sample * (*L_code_ptr++); ++ } ++} ++ ++#endif /* LV_HAVE_SSE4_1 */ ++ ++#ifdef LV_HAVE_SSE2 ++#include "emmintrin.h" ++/*! ++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (SSE2 kernel, unaligned loads) ++ \param input The input signal input ++ \param carrier The carrier signal input ++ \param E_code Early PRN code replica input ++ \param P_code Prompt PRN code replica input ++ \param L_code Late PRN code replica input ++ \param E_out Early correlation output (one accumulated value, reset to 0 on entry) ++ \param P_out Prompt correlation output (one accumulated value, reset to 0 on entry) ++ \param L_out Late correlation output (one accumulated value, reset to 0 on entry) ++ \param num_points The number of complex values in vectors ++ */ ++static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_u_sse2(lv_8sc_t* E_out, lv_8sc_t* P_out, lv_8sc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points) ++{ ++ const unsigned int sse_iters = num_points / 8; ++ ++ __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample, real_E_code_acc, imag_E_code_acc, real_L_code_acc, imag_L_code_acc, real_P_code_acc, imag_P_code_acc; ++ __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output; ++ ++ const lv_8sc_t* input_ptr = input; ++ const lv_8sc_t* carrier_ptr = carrier; ++ 
++ const lv_8sc_t* E_code_ptr = E_code; ++ lv_8sc_t* E_out_ptr = E_out; ++ const lv_8sc_t* L_code_ptr = L_code; ++ lv_8sc_t* L_out_ptr = L_out; ++ const lv_8sc_t* P_code_ptr = P_code; ++ lv_8sc_t* P_out_ptr = P_out; ++ ++ *E_out_ptr = 0; ++ *P_out_ptr = 0; ++ *L_out_ptr = 0; ++ ++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); // mask: keeps the low byte of each 16-bit lane ++ ++ real_E_code_acc = _mm_setzero_si128(); ++ imag_E_code_acc = _mm_setzero_si128(); ++ real_L_code_acc = _mm_setzero_si128(); ++ imag_L_code_acc = _mm_setzero_si128(); ++ real_P_code_acc = _mm_setzero_si128(); ++ imag_P_code_acc = _mm_setzero_si128(); ++ ++ if (sse_iters>0) ++ { ++ for(int number = 0;number < sse_iters; number++){ ++ ++ //Perform the carrier wipe-off (_mm_loadu_si128 is the SSE2 unaligned load; _mm_lddqu_si128 would require SSE3) ++ x = _mm_loadu_si128((__m128i*)input_ptr); ++ y = _mm_loadu_si128((__m128i*)carrier_ptr); ++ ++ imagx = _mm_srli_si128 (x, 1); // shift right by one byte so the odd bytes land in the low byte of each 16-bit lane ++ imagx = _mm_and_si128 (imagx, mult1); ++ realx = _mm_and_si128 (x, mult1); ++ ++ imagy = _mm_srli_si128 (y, 1); ++ imagy = _mm_and_si128 (imagy, mult1); ++ realy = _mm_and_si128 (y, mult1); ++ ++ realx_mult_realy = _mm_mullo_epi16 (realx, realy); ++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); ++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); ++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); ++ ++ real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); // real part: re*re - im*im ++ imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); // imaginary part: re*im + im*re ++ ++ //Get early values ++ y = _mm_loadu_si128((__m128i*)E_code_ptr); ++ ++ imagy = _mm_srli_si128 (y, 1); ++ imagy = _mm_and_si128 (imagy, mult1); ++ realy = _mm_and_si128 (y, mult1); ++ ++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); ++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); ++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); ++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); ++ ++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++ imag_output 
= _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++ ++ real_E_code_acc = _mm_add_epi16 (real_E_code_acc, real_output); ++ imag_E_code_acc = _mm_add_epi16 (imag_E_code_acc, imag_output); ++ ++ //Get late values ++ y = _mm_loadu_si128((__m128i*)L_code_ptr); ++ ++ imagy = _mm_srli_si128 (y, 1); ++ imagy = _mm_and_si128 (imagy, mult1); ++ realy = _mm_and_si128 (y, mult1); ++ ++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); ++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); ++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); ++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); ++ ++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++ ++ real_L_code_acc = _mm_add_epi16 (real_L_code_acc, real_output); ++ imag_L_code_acc = _mm_add_epi16 (imag_L_code_acc, imag_output); ++ ++ //Get prompt values ++ y = _mm_loadu_si128((__m128i*)P_code_ptr); ++ ++ imagy = _mm_srli_si128 (y, 1); ++ imagy = _mm_and_si128 (imagy, mult1); ++ realy = _mm_and_si128 (y, mult1); ++ ++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); ++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); ++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); ++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); ++ ++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++ ++ real_P_code_acc = _mm_add_epi16 (real_P_code_acc, real_output); ++ imag_P_code_acc = _mm_add_epi16 (imag_P_code_acc, imag_output); ++ ++ input_ptr += 8; ++ carrier_ptr += 8; ++ E_code_ptr += 8; ++ L_code_ptr += 8; ++ P_code_ptr += 8; ++ } ++ ++ __VOLK_ATTR_ALIGNED(16) lv_8sc_t E_dotProductVector[8]; ++ __VOLK_ATTR_ALIGNED(16) lv_8sc_t L_dotProductVector[8]; ++ __VOLK_ATTR_ALIGNED(16) lv_8sc_t P_dotProductVector[8]; ++ ++ real_E_code_acc = 
_mm_and_si128 (real_E_code_acc, mult1); // keep only the low byte of each 16-bit sum ++ imag_E_code_acc = _mm_and_si128 (imag_E_code_acc, mult1); ++ imag_E_code_acc = _mm_slli_si128 (imag_E_code_acc, 1); // move the imaginary sums back to the odd bytes ++ output = _mm_or_si128 (real_E_code_acc, imag_E_code_acc); ++ _mm_storeu_si128((__m128i*)E_dotProductVector, output); ++ ++ real_L_code_acc = _mm_and_si128 (real_L_code_acc, mult1); ++ imag_L_code_acc = _mm_and_si128 (imag_L_code_acc, mult1); ++ imag_L_code_acc = _mm_slli_si128 (imag_L_code_acc, 1); ++ output = _mm_or_si128 (real_L_code_acc, imag_L_code_acc); ++ _mm_storeu_si128((__m128i*)L_dotProductVector, output); ++ ++ real_P_code_acc = _mm_and_si128 (real_P_code_acc, mult1); ++ imag_P_code_acc = _mm_and_si128 (imag_P_code_acc, mult1); ++ imag_P_code_acc = _mm_slli_si128 (imag_P_code_acc, 1); ++ output = _mm_or_si128 (real_P_code_acc, imag_P_code_acc); ++ _mm_storeu_si128((__m128i*)P_dotProductVector, output); ++ ++ for (int i = 0; i<8; ++i) ++ { ++ *E_out_ptr += E_dotProductVector[i]; ++ *L_out_ptr += L_dotProductVector[i]; ++ *P_out_ptr += P_dotProductVector[i]; ++ } ++ } ++ ++ lv_8sc_t bb_signal_sample; ++ for(int i=0; i < num_points%8; ++i) ++ { ++ //Perform the carrier wipe-off ++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); ++ // Now get early, late, and prompt values for each ++ *E_out_ptr += bb_signal_sample * (*E_code_ptr++); ++ *P_out_ptr += bb_signal_sample * (*P_code_ptr++); ++ *L_out_ptr += bb_signal_sample * (*L_code_ptr++); ++ } ++} ++ ++#endif /* LV_HAVE_SSE2 */ ++ ++#ifdef LV_HAVE_GENERIC ++/*! 
++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation ++ \param input The input signal input ++ \param carrier The carrier signal input ++ \param E_code Early PRN code replica input ++ \param P_code Prompt PRN code replica input ++ \param L_code Late PRN code replica input ++ \param E_out Early correlation output ++ \param P_out Prompt correlation output ++ \param L_out Late correlation output ++ \param num_points The number of complex values in vectors ++ */ ++static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_generic(lv_8sc_t* E_out, lv_8sc_t* P_out, lv_8sc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points) ++{ ++ lv_8sc_t bb_signal_sample; ++ ++ bb_signal_sample = lv_cmake(0, 0); ++ ++ *E_out = 0; ++ *P_out = 0; ++ *L_out = 0; ++ // perform Early, Prompt and Late correlation ++ for(int i=0; i < num_points; ++i) ++ { ++ //Perform the carrier wipe-off ++ bb_signal_sample = input[i] * carrier[i]; ++ // Now get early, late, and prompt values for each ++ *E_out += bb_signal_sample * E_code[i]; ++ *P_out += bb_signal_sample * P_code[i]; ++ *L_out += bb_signal_sample * L_code[i]; ++ } ++} ++ ++#endif /* LV_HAVE_GENERIC */ ++ ++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_u_H */ ++ ++ ++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_a_H ++#define INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_a_H ++ ++#include ++#include ++#include ++#include ++#include ++ ++#ifdef LV_HAVE_SSE4_1 ++#include "smmintrin.h" ++/*! 
++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation ++ \param input The input signal input ++ \param carrier The carrier signal input ++ \param E_code Early PRN code replica input ++ \param P_code Prompt PRN code replica input ++ \param L_code Late PRN code replica input ++ \param E_out Early correlation output ++ \param P_out Prompt correlation output ++ \param L_out Late correlation output ++ \param num_points The number of complex values in vectors ++ */ ++static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_a_sse4_1(lv_8sc_t* E_out, lv_8sc_t* P_out, lv_8sc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points) ++{ ++ const unsigned int sse_iters = num_points / 8; ++ ++ __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample, real_E_code_acc, imag_E_code_acc, real_L_code_acc, imag_L_code_acc, real_P_code_acc, imag_P_code_acc; ++ __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output; ++ ++ const lv_8sc_t* input_ptr = input; ++ const lv_8sc_t* carrier_ptr = carrier; ++ ++ const lv_8sc_t* E_code_ptr = E_code; ++ lv_8sc_t* E_out_ptr = E_out; ++ const lv_8sc_t* L_code_ptr = L_code; ++ lv_8sc_t* L_out_ptr = L_out; ++ const lv_8sc_t* P_code_ptr = P_code; ++ lv_8sc_t* P_out_ptr = P_out; ++ ++ *E_out_ptr = 0; ++ *P_out_ptr = 0; ++ *L_out_ptr = 0; ++ ++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); ++ ++ real_E_code_acc = _mm_setzero_si128(); ++ imag_E_code_acc = _mm_setzero_si128(); ++ real_L_code_acc = _mm_setzero_si128(); ++ imag_L_code_acc = _mm_setzero_si128(); ++ real_P_code_acc = _mm_setzero_si128(); ++ imag_P_code_acc = _mm_setzero_si128(); ++ ++ if (sse_iters>0) ++ { ++ for(int number = 0;number < sse_iters; number++){ ++ ++ //Perform the carrier wipe-off ++ x = 
_mm_load_si128((__m128i*)input_ptr); ++ y = _mm_load_si128((__m128i*)carrier_ptr); ++ ++ imagx = _mm_srli_si128 (x, 1); ++ imagx = _mm_and_si128 (imagx, mult1); ++ realx = _mm_and_si128 (x, mult1); ++ ++ imagy = _mm_srli_si128 (y, 1); ++ imagy = _mm_and_si128 (imagy, mult1); ++ realy = _mm_and_si128 (y, mult1); ++ ++ realx_mult_realy = _mm_mullo_epi16 (realx, realy); ++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); ++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); ++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); ++ ++ real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++ imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++ ++ //Get early values ++ y = _mm_load_si128((__m128i*)E_code_ptr); ++ ++ imagy = _mm_srli_si128 (y, 1); ++ imagy = _mm_and_si128 (imagy, mult1); ++ realy = _mm_and_si128 (y, mult1); ++ ++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); ++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); ++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); ++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); ++ ++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++ ++ real_E_code_acc = _mm_add_epi16 (real_E_code_acc, real_output); ++ imag_E_code_acc = _mm_add_epi16 (imag_E_code_acc, imag_output); ++ ++ //Get late values ++ y = _mm_load_si128((__m128i*)L_code_ptr); ++ ++ imagy = _mm_srli_si128 (y, 1); ++ imagy = _mm_and_si128 (imagy, mult1); ++ realy = _mm_and_si128 (y, mult1); ++ ++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); ++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); ++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); ++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); ++ ++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++ imag_output = 
_mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++ ++ real_L_code_acc = _mm_add_epi16 (real_L_code_acc, real_output); ++ imag_L_code_acc = _mm_add_epi16 (imag_L_code_acc, imag_output); ++ ++ //Get prompt values ++ y = _mm_load_si128((__m128i*)P_code_ptr); ++ ++ imagy = _mm_srli_si128 (y, 1); ++ imagy = _mm_and_si128 (imagy, mult1); ++ realy = _mm_and_si128 (y, mult1); ++ ++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); ++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); ++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); ++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); ++ ++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++ ++ real_P_code_acc = _mm_add_epi16 (real_P_code_acc, real_output); ++ imag_P_code_acc = _mm_add_epi16 (imag_P_code_acc, imag_output); ++ ++ input_ptr += 8; ++ carrier_ptr += 8; ++ E_code_ptr += 8; ++ L_code_ptr += 8; ++ P_code_ptr += 8; ++ } ++ ++ __VOLK_ATTR_ALIGNED(16) lv_8sc_t E_dotProductVector[8]; ++ __VOLK_ATTR_ALIGNED(16) lv_8sc_t L_dotProductVector[8]; ++ __VOLK_ATTR_ALIGNED(16) lv_8sc_t P_dotProductVector[8]; ++ ++ imag_E_code_acc = _mm_slli_si128 (imag_E_code_acc, 1); ++ output = _mm_blendv_epi8 (imag_E_code_acc, real_E_code_acc, mult1); ++ _mm_store_si128((__m128i*)E_dotProductVector, output); ++ ++ imag_L_code_acc = _mm_slli_si128 (imag_L_code_acc, 1); ++ output = _mm_blendv_epi8 (imag_L_code_acc, real_L_code_acc, mult1); ++ _mm_store_si128((__m128i*)L_dotProductVector, output); ++ ++ imag_P_code_acc = _mm_slli_si128 (imag_P_code_acc, 1); ++ output = _mm_blendv_epi8 (imag_P_code_acc, real_P_code_acc, mult1); ++ _mm_store_si128((__m128i*)P_dotProductVector, output); ++ ++ for (int i = 0; i<8; ++i) ++ { ++ *E_out_ptr += E_dotProductVector[i]; ++ *L_out_ptr += L_dotProductVector[i]; ++ *P_out_ptr += P_dotProductVector[i]; ++ } ++ } ++ ++ lv_8sc_t bb_signal_sample; ++ 
for(int i=0; i < num_points%8; ++i) ++ { ++ //Perform the carrier wipe-off ++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); ++ // Now get early, late, and prompt values for each ++ *E_out_ptr += bb_signal_sample * (*E_code_ptr++); ++ *P_out_ptr += bb_signal_sample * (*P_code_ptr++); ++ *L_out_ptr += bb_signal_sample * (*L_code_ptr++); ++ } ++} ++ ++#endif /* LV_HAVE_SSE4_1 */ ++ ++#ifdef LV_HAVE_SSE2 ++#include "emmintrin.h" ++/*! ++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation ++ \param input The input signal input ++ \param carrier The carrier signal input ++ \param E_code Early PRN code replica input ++ \param P_code Prompt PRN code replica input ++ \param L_code Late PRN code replica input ++ \param E_out Early correlation output ++ \param P_out Prompt correlation output ++ \param L_out Late correlation output ++ \param num_points The number of complex values in vectors ++ */ ++static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_a_sse2(lv_8sc_t* E_out, lv_8sc_t* P_out, lv_8sc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points) ++{ ++ const unsigned int sse_iters = num_points / 8; ++ ++ __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample, real_E_code_acc, imag_E_code_acc, real_L_code_acc, imag_L_code_acc, real_P_code_acc, imag_P_code_acc; ++ __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output; ++ ++ const lv_8sc_t* input_ptr = input; ++ const lv_8sc_t* carrier_ptr = carrier; ++ ++ const lv_8sc_t* E_code_ptr = E_code; ++ lv_8sc_t* E_out_ptr = E_out; ++ const lv_8sc_t* L_code_ptr = L_code; ++ lv_8sc_t* L_out_ptr = L_out; ++ const lv_8sc_t* P_code_ptr = P_code; ++ lv_8sc_t* P_out_ptr = P_out; ++ ++ *E_out_ptr = 0; ++ *P_out_ptr = 0; ++ *L_out_ptr = 0; ++ ++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 
255, 0, 255, 0, 255, 0, 255, 0, 255); ++ ++ real_E_code_acc = _mm_setzero_si128(); ++ imag_E_code_acc = _mm_setzero_si128(); ++ real_L_code_acc = _mm_setzero_si128(); ++ imag_L_code_acc = _mm_setzero_si128(); ++ real_P_code_acc = _mm_setzero_si128(); ++ imag_P_code_acc = _mm_setzero_si128(); ++ ++ if (sse_iters>0) ++ { ++ for(int number = 0;number < sse_iters; number++){ ++ ++ //Perform the carrier wipe-off ++ x = _mm_load_si128((__m128i*)input_ptr); ++ y = _mm_load_si128((__m128i*)carrier_ptr); ++ ++ imagx = _mm_srli_si128 (x, 1); ++ imagx = _mm_and_si128 (imagx, mult1); ++ realx = _mm_and_si128 (x, mult1); ++ ++ imagy = _mm_srli_si128 (y, 1); ++ imagy = _mm_and_si128 (imagy, mult1); ++ realy = _mm_and_si128 (y, mult1); ++ ++ realx_mult_realy = _mm_mullo_epi16 (realx, realy); ++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); ++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); ++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); ++ ++ real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++ imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++ ++ //Get early values ++ y = _mm_load_si128((__m128i*)E_code_ptr); ++ ++ imagy = _mm_srli_si128 (y, 1); ++ imagy = _mm_and_si128 (imagy, mult1); ++ realy = _mm_and_si128 (y, mult1); ++ ++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); ++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); ++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); ++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); ++ ++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++ ++ real_E_code_acc = _mm_add_epi16 (real_E_code_acc, real_output); ++ imag_E_code_acc = _mm_add_epi16 (imag_E_code_acc, imag_output); ++ ++ //Get late values ++ y = _mm_load_si128((__m128i*)L_code_ptr); ++ ++ imagy = _mm_srli_si128 (y, 1); ++ imagy = _mm_and_si128 (imagy, 
mult1); ++ realy = _mm_and_si128 (y, mult1); ++ ++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); ++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); ++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); ++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); ++ ++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++ ++ real_L_code_acc = _mm_add_epi16 (real_L_code_acc, real_output); ++ imag_L_code_acc = _mm_add_epi16 (imag_L_code_acc, imag_output); ++ ++ //Get prompt values ++ y = _mm_load_si128((__m128i*)P_code_ptr); ++ ++ imagy = _mm_srli_si128 (y, 1); ++ imagy = _mm_and_si128 (imagy, mult1); ++ realy = _mm_and_si128 (y, mult1); ++ ++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); ++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); ++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); ++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); ++ ++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++ ++ real_P_code_acc = _mm_add_epi16 (real_P_code_acc, real_output); ++ imag_P_code_acc = _mm_add_epi16 (imag_P_code_acc, imag_output); ++ ++ input_ptr += 8; ++ carrier_ptr += 8; ++ E_code_ptr += 8; ++ L_code_ptr += 8; ++ P_code_ptr += 8; ++ } ++ ++ __VOLK_ATTR_ALIGNED(16) lv_8sc_t E_dotProductVector[8]; ++ __VOLK_ATTR_ALIGNED(16) lv_8sc_t L_dotProductVector[8]; ++ __VOLK_ATTR_ALIGNED(16) lv_8sc_t P_dotProductVector[8]; ++ ++ real_E_code_acc = _mm_and_si128 (real_E_code_acc, mult1); ++ imag_E_code_acc = _mm_and_si128 (imag_E_code_acc, mult1); ++ imag_E_code_acc = _mm_slli_si128 (imag_E_code_acc, 1); ++ output = _mm_or_si128 (real_E_code_acc, imag_E_code_acc); ++ _mm_store_si128((__m128i*)E_dotProductVector, output); ++ ++ real_L_code_acc = _mm_and_si128 (real_L_code_acc, 
mult1); ++ imag_L_code_acc = _mm_and_si128 (imag_L_code_acc, mult1); ++ imag_L_code_acc = _mm_slli_si128 (imag_L_code_acc, 1); ++ output = _mm_or_si128 (real_L_code_acc, imag_L_code_acc); ++ _mm_store_si128((__m128i*)L_dotProductVector, output); ++ ++ real_P_code_acc = _mm_and_si128 (real_P_code_acc, mult1); ++ imag_P_code_acc = _mm_and_si128 (imag_P_code_acc, mult1); ++ imag_P_code_acc = _mm_slli_si128 (imag_P_code_acc, 1); ++ output = _mm_or_si128 (real_P_code_acc, imag_P_code_acc); ++ _mm_store_si128((__m128i*)P_dotProductVector, output); ++ ++ for (int i = 0; i<8; ++i) ++ { ++ *E_out_ptr += E_dotProductVector[i]; ++ *L_out_ptr += L_dotProductVector[i]; ++ *P_out_ptr += P_dotProductVector[i]; ++ } ++ } ++ ++ lv_8sc_t bb_signal_sample; ++ for(int i=0; i < num_points%8; ++i) ++ { ++ //Perform the carrier wipe-off ++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); ++ // Now get early, late, and prompt values for each ++ *E_out_ptr += bb_signal_sample * (*E_code_ptr++); ++ *P_out_ptr += bb_signal_sample * (*P_code_ptr++); ++ *L_out_ptr += bb_signal_sample * (*L_code_ptr++); ++ } ++} ++ ++#endif /* LV_HAVE_SSE2 */ ++ ++#ifdef LV_HAVE_GENERIC ++/*! 
++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation ++ \param input The input signal input ++ \param carrier The carrier signal input ++ \param E_code Early PRN code replica input ++ \param P_code Prompt PRN code replica input ++ \param L_code Late PRN code replica input ++ \param E_out Early correlation output ++ \param P_out Prompt correlation output ++ \param L_out Late correlation output ++ \param num_points The number of complex values in vectors ++ */ ++static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_a_generic(lv_8sc_t* E_out, lv_8sc_t* P_out, lv_8sc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points) ++{ ++ lv_8sc_t bb_signal_sample; ++ ++ bb_signal_sample = lv_cmake(0, 0); ++ ++ *E_out = 0; ++ *P_out = 0; ++ *L_out = 0; ++ // perform Early, Prompt and Late correlation ++ for(int i=0; i < num_points; ++i) ++ { ++ //Perform the carrier wipe-off ++ bb_signal_sample = input[i] * carrier[i]; ++ // Now get early, late, and prompt values for each ++ *E_out += bb_signal_sample * E_code[i]; ++ *P_out += bb_signal_sample * P_code[i]; ++ *L_out += bb_signal_sample * L_code[i]; ++ } ++} ++ ++#endif /* LV_HAVE_GENERIC */ ++ ++#ifdef LV_HAVE_ORC ++/*! 
++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation ++ \param input The input signal input ++ \param carrier The carrier signal input ++ \param E_code Early PRN code replica input ++ \param P_code Prompt PRN code replica input ++ \param L_code Late PRN code replica input ++ \param E_out Early correlation output ++ \param P_out Prompt correlation output ++ \param L_out Late correlation output ++ \param num_points The number of complex values in vectors ++ */ ++ ++extern void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_first_a_orc_impl(short* E_out_real, short* E_out_imag, short* P_out_real, short* P_out_imag, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, unsigned int num_points); ++extern void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_second_a_orc_impl(short* L_out_real, short* L_out_imag, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* L_code, unsigned int num_points); ++static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_u_orc(lv_8sc_t* E_out, lv_8sc_t* P_out, lv_8sc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points){ ++ ++ short E_out_real = 0; ++ short E_out_imag = 0; ++ char* E_out_real_c = (char*)&E_out_real; ++ E_out_real_c++; ++ char* E_out_imag_c = (char*)&E_out_imag; ++ E_out_imag_c++; ++ ++ short P_out_real = 0; ++ short P_out_imag = 0; ++ char* P_out_real_c = (char*)&P_out_real; ++ P_out_real_c++; ++ char* P_out_imag_c = (char*)&P_out_imag; ++ P_out_imag_c++; ++ ++ short L_out_real = 0; ++ short L_out_imag = 0; ++ char* L_out_real_c = (char*)&L_out_real; ++ L_out_real_c++; ++ char* L_out_imag_c = (char*)&L_out_imag; ++ L_out_imag_c++; ++ ++ volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_first_a_orc_impl( &E_out_real, &E_out_imag, &P_out_real, &P_out_imag, input, carrier, E_code, P_code, num_points); ++ volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_second_a_orc_impl( 
&L_out_real, &L_out_imag, input, carrier, L_code, num_points); ++ ++ //ORC implementation of 8ic_x5_cw_epl_corr_8ic_x3 is done in two different functions because it seems that ++ //in one function the length of the code gives memory problems (bad access, segmentation fault). ++ //Also, the maximum number of accumulators that can be used is 4 (and we need 6). ++ //The "carrier wipe-off" step is done two times: one in the first function and another one in the second. ++ //Joining all the ORC code in one function would be quicker because the "carrier wipe-off" step would be done just ++ //one time. ++ ++ *E_out = lv_cmake(*E_out_real_c, *E_out_imag_c); ++ *P_out = lv_cmake(*P_out_real_c, *P_out_imag_c); ++ *L_out = lv_cmake(*L_out_real_c, *L_out_imag_c); ++} ++#endif /* LV_HAVE_ORC */ ++ ++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_a_H */ +diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5.h +--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5.h 1970-01-01 01:00:00.000000000 +0100 ++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5.h 2014-10-15 01:55:08.000000000 +0200 +@@ -0,0 +1,797 @@ ++/*! ++ * \file volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5.h ++ * \brief Volk protokernel: performs the carrier wipe-off mixing and the Very early, Early, Prompt, Late and very late correlation with 16 bits vectors, and accumulates the results into float32. In order to avoid overflow, If input, carrier and XX_code have the same number of bits, they must be values between —3 and 3 (2 bits). ++ * \authors
<ul> ++ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com ++ * </ul>
++ * ++ * Volk protokernel that performs the carrier wipe-off mixing and the ++ * Very early, Early, Prompt, Late and very late correlation with 16 bits vectors (8 bits the ++ * real part and 8 bits the imaginary part), and accumulates the result ++ * in 32 bits single precision values, returning float32 values: ++ * - The carrier wipe-off is done by multiplying the input signal by the ++ * carrier (multiplication of 16 bits vectors). It returns the input ++ * signal in base band (BB) ++ * - Very Early values are calculated by multiplying the input signal in BB by the ++ * very early code (multiplication of 16 bits vectors), accumulating the results into float32 values ++ * - Early values are calculated by multiplying the input signal in BB by the ++ * early code (multiplication of 16 bits vectors), accumulating the results into float32 values ++ * - Prompt values are calculated by multiplying the input signal in BB by the ++ * prompt code (multiplication of 16 bits vectors), accumulating the results into float32 values ++ * - Late values are calculated by multiplying the input signal in BB by the ++ * late code (multiplication of 16 bits vectors), accumulating the results into float32 values ++ * - Very Late values are calculated by multiplying the input signal in BB by the ++ * very late code (multiplication of 16 bits vectors), accumulating the results into float32 values ++ * ++ * ------------------------------------------------------------------------- ++ * Bits analysis ++ * ++ * input = 8 bits ++ * carrier = 8 bits ++ * XX_code = 8 bits ++ * XX_out = 8 bits ++ * bb_signal_sample = 8 bits ++ * ++ * bb_signal_sample = input*carrier -> 17 bits limited to 8 bits = input and carrier must be values between -7 and 7 to avoid overflow (3 bits) ++ * ++ * XX_out16 = XX_code*bb_signal_sample -> 17 bits limited to 8 bits = XX_code and bb_signal_sample must be values between -7 and 7 to avoid overflow (3 bits) ++ * ++ * conclusion = input and carrier must be values between -1 
and 1 (1 bit) and XX_code must be values between -7 and 7 to avoid overflow (3 bits) ++ * If input, carrier and XX_code have the same number of bits, they must be values between -3 and 3 to avoid overflow (2 bits). ++ * ------------------------------------------------------------------------- ++ * ++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) ++ * ++ * GNSS-SDR is a software defined Global Navigation ++ * Satellite Systems receiver ++ * ++ * This file is part of GNSS-SDR. ++ * ++ * GNSS-SDR is free software: you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation, either version 3 of the License, or ++ * (at your option) any later version. ++ * ++ * GNSS-SDR is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>. ++ * ++ * ------------------------------------------------------------------------- ++ */ ++ ++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_u_H ++#define INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_u_H ++ ++#include ++#include ++#include ++#include ++#include ++ ++#ifdef LV_HAVE_SSE4_1 ++#include "smmintrin.h" ++#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h" ++#include "CommonMacros/CommonMacros.h" ++/*! 
++ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation ++ \param input The input signal input ++ \param carrier The carrier signal input ++ \param VE_code Very Early PRN code replica input ++ \param E_code Early PRN code replica input ++ \param P_code Prompt PRN code replica input ++ \param L_code Late PRN code replica input ++ \param VL_code Very Late PRN code replica input ++ \param VE_out Very Early correlation output ++ \param E_out Early correlation output ++ \param P_out Prompt correlation output ++ \param L_out Late correlation output ++ \param VL_out Very Late correlation output ++ \param num_points The number of complex values in vectors ++ */ ++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_u_sse4_1(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points) ++{ ++ const unsigned int sse_iters = num_points / 8; ++ ++ __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample; ++ __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output; ++ ++ __m128 VE_code_acc, E_code_acc, P_code_acc, L_code_acc, VL_code_acc; ++ __m128i input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2; ++ __m128 output_ps; ++ ++ const lv_8sc_t* input_ptr = input; ++ const lv_8sc_t* carrier_ptr = carrier; ++ ++ const lv_8sc_t* VE_code_ptr = VE_code; ++ lv_32fc_t* VE_out_ptr = VE_out; ++ const lv_8sc_t* E_code_ptr = E_code; ++ lv_32fc_t* E_out_ptr = E_out; ++ const lv_8sc_t* P_code_ptr = P_code; ++ lv_32fc_t* P_out_ptr = P_out; ++ const lv_8sc_t* L_code_ptr = L_code; ++ lv_32fc_t* L_out_ptr = L_out; ++ const lv_8sc_t* VL_code_ptr = VL_code; ++ lv_32fc_t* VL_out_ptr = VL_out; ++ ++ *VE_out_ptr = 0; ++ *E_out_ptr = 0; ++ 
*P_out_ptr = 0; ++ *L_out_ptr = 0; ++ *VL_out_ptr = 0; ++ ++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); ++ ++ VE_code_acc = _mm_setzero_ps(); ++ E_code_acc = _mm_setzero_ps(); ++ P_code_acc = _mm_setzero_ps(); ++ L_code_acc = _mm_setzero_ps(); ++ VL_code_acc = _mm_setzero_ps(); ++ ++ if (sse_iters>0) ++ { ++ for(int number = 0;number < sse_iters; number++){ ++ ++ //Perform the carrier wipe-off ++ x = _mm_lddqu_si128((__m128i*)input_ptr); ++ y = _mm_lddqu_si128((__m128i*)carrier_ptr); ++ ++ CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx) ++ CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy) ++ ++ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample) ++ ++ //Get very early values ++ y = _mm_lddqu_si128((__m128i*)VE_code_ptr); ++ ++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps) ++ ++ VE_code_acc = _mm_add_ps (VE_code_acc, output_ps); ++ ++ //Get early values ++ y = _mm_lddqu_si128((__m128i*)E_code_ptr); ++ ++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps) ++ ++ E_code_acc = _mm_add_ps (E_code_acc, output_ps); ++ ++ //Get prompt values ++ y = _mm_lddqu_si128((__m128i*)P_code_ptr); ++ ++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, 
input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps) ++ ++ P_code_acc = _mm_add_ps (P_code_acc, output_ps); ++ ++ //Get late values ++ y = _mm_lddqu_si128((__m128i*)L_code_ptr); ++ ++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps) ++ ++ L_code_acc = _mm_add_ps (L_code_acc, output_ps); ++ ++ //Get very late values ++ y = _mm_lddqu_si128((__m128i*)VL_code_ptr); ++ ++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps) ++ ++ VL_code_acc = _mm_add_ps (VL_code_acc, output_ps); ++ ++ input_ptr += 8; ++ carrier_ptr += 8; ++ VE_code_ptr += 8; ++ E_code_ptr += 8; ++ P_code_ptr += 8; ++ L_code_ptr += 8; ++ VL_code_ptr += 8; ++ } ++ ++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t VE_dotProductVector[2]; ++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2]; ++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2]; ++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2]; ++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t VL_dotProductVector[2]; ++ ++ _mm_storeu_ps((float*)VE_dotProductVector,VE_code_acc); // Store the results back into the dot product vector ++ _mm_storeu_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector ++ _mm_storeu_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector ++ _mm_storeu_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector ++ _mm_storeu_ps((float*)VL_dotProductVector,VL_code_acc); // Store the results back into the dot product vector ++ ++ for (int i = 0; i<2; ++i) ++ 
{ ++ *VE_out_ptr += VE_dotProductVector[i]; ++ *E_out_ptr += E_dotProductVector[i]; ++ *P_out_ptr += P_dotProductVector[i]; ++ *L_out_ptr += L_dotProductVector[i]; ++ *VL_out_ptr += VL_dotProductVector[i]; ++ } ++ } ++ ++ lv_8sc_t bb_signal_sample; ++ for(int i=0; i < num_points%8; ++i) ++ { ++ //Perform the carrier wipe-off ++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); ++ // Now get very early, early, prompt, late and very late values for each ++ *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VE_code_ptr++)); ++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); ++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); ++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); ++ *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VL_code_ptr++)); ++ } ++} ++#endif /* LV_HAVE_SSE4_1 */ ++ ++#ifdef LV_HAVE_SSE2 ++#include "emmintrin.h" ++#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h" ++#include "CommonMacros/CommonMacros.h" ++/*! ++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation ++ \param input The input signal input ++ \param carrier The carrier signal input ++ \param VE_code Very Early PRN code replica input ++ \param E_code Early PRN code replica input ++ \param P_code Prompt PRN code replica input ++ \param L_code Late PRN code replica input ++ \param VL_code Very Late PRN code replica input ++ \param VE_out Very Early correlation output ++ \param E_out Early correlation output ++ \param P_out Prompt correlation output ++ \param L_out Late correlation output ++ \param VL_out Very Late correlation output ++ \param num_points The number of complex values in vectors ++ */ ++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_u_sse2(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, 
const lv_8sc_t* VL_code, unsigned int num_points) ++{ ++ const unsigned int sse_iters = num_points / 8; ++ ++ __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample; ++ __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output; ++ ++ __m128 VE_code_acc, E_code_acc, P_code_acc, L_code_acc, VL_code_acc; ++ __m128i input_i_1, input_i_2, output_i32; ++ __m128 output_ps_1, output_ps_2; ++ ++ const lv_8sc_t* input_ptr = input; ++ const lv_8sc_t* carrier_ptr = carrier; ++ ++ const lv_8sc_t* VE_code_ptr = VE_code; ++ lv_32fc_t* VE_out_ptr = VE_out; ++ const lv_8sc_t* E_code_ptr = E_code; ++ lv_32fc_t* E_out_ptr = E_out; ++ const lv_8sc_t* P_code_ptr = P_code; ++ lv_32fc_t* P_out_ptr = P_out; ++ const lv_8sc_t* L_code_ptr = L_code; ++ lv_32fc_t* L_out_ptr = L_out; ++ const lv_8sc_t* VL_code_ptr = VL_code; ++ lv_32fc_t* VL_out_ptr = VL_out; ++ ++ *VE_out_ptr = 0; ++ *E_out_ptr = 0; ++ *P_out_ptr = 0; ++ *L_out_ptr = 0; ++ *VL_out_ptr = 0; ++ ++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); ++ ++ VE_code_acc = _mm_setzero_ps(); ++ E_code_acc = _mm_setzero_ps(); ++ P_code_acc = _mm_setzero_ps(); ++ L_code_acc = _mm_setzero_ps(); ++ VL_code_acc = _mm_setzero_ps(); ++ ++ if (sse_iters>0) ++ { ++ for(int number = 0;number < sse_iters; number++){ ++ ++ //Perform the carrier wipe-off ++ x = _mm_lddqu_si128((__m128i*)input_ptr); ++ y = _mm_lddqu_si128((__m128i*)carrier_ptr); ++ ++ CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx) ++ CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy) ++ ++ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample) ++ ++ //Get very early values ++ y = _mm_lddqu_si128((__m128i*)VE_code_ptr); ++ ++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, 
realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2) ++ ++ VE_code_acc = _mm_add_ps (VE_code_acc, output_ps_1); ++ VE_code_acc = _mm_add_ps (VE_code_acc, output_ps_2); ++ ++ //Get early values ++ y = _mm_lddqu_si128((__m128i*)E_code_ptr); ++ ++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2) ++ ++ E_code_acc = _mm_add_ps (E_code_acc, output_ps_1); ++ E_code_acc = _mm_add_ps (E_code_acc, output_ps_2); ++ ++ //Get prompt values ++ y = _mm_lddqu_si128((__m128i*)P_code_ptr); ++ ++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2) ++ ++ P_code_acc = _mm_add_ps (P_code_acc, output_ps_1); ++ P_code_acc = _mm_add_ps (P_code_acc, output_ps_2); ++ ++ //Get late values ++ y = _mm_lddqu_si128((__m128i*)L_code_ptr); ++ ++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2) ++ ++ L_code_acc = _mm_add_ps (L_code_acc, output_ps_1); ++ L_code_acc = _mm_add_ps (L_code_acc, output_ps_2); ++ ++ //Get very late values ++ y = _mm_lddqu_si128((__m128i*)VL_code_ptr); ++ ++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, 
output_ps_1, output_ps_2) ++ ++ VL_code_acc = _mm_add_ps (VL_code_acc, output_ps_1); ++ VL_code_acc = _mm_add_ps (VL_code_acc, output_ps_2); ++ ++ input_ptr += 8; ++ carrier_ptr += 8; ++ VE_code_ptr += 8; ++ E_code_ptr += 8; ++ P_code_ptr += 8; ++ L_code_ptr += 8; ++ VL_code_ptr += 8; ++ } ++ ++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t VE_dotProductVector[2]; ++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2]; ++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2]; ++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2]; ++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t VL_dotProductVector[2]; ++ ++ _mm_storeu_ps((float*)VE_dotProductVector,VE_code_acc); // Store the results back into the dot product vector ++ _mm_storeu_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector ++ _mm_storeu_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector ++ _mm_storeu_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector ++ _mm_storeu_ps((float*)VL_dotProductVector,VL_code_acc); // Store the results back into the dot product vector ++ ++ for (int i = 0; i<2; ++i) ++ { ++ *VE_out_ptr += VE_dotProductVector[i]; ++ *E_out_ptr += E_dotProductVector[i]; ++ *P_out_ptr += P_dotProductVector[i]; ++ *L_out_ptr += L_dotProductVector[i]; ++ *VL_out_ptr += VL_dotProductVector[i]; ++ } ++ } ++ ++ lv_8sc_t bb_signal_sample; ++ for(int i=0; i < num_points%8; ++i) ++ { ++ //Perform the carrier wipe-off ++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); ++ // Now get very early, early, prompt, late and very late values for each ++ *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VE_code_ptr++)); ++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); ++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); ++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); ++ *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VL_code_ptr++)); ++ } 
++}
++#endif /* LV_HAVE_SSE2 */
++
++#ifdef LV_HAVE_GENERIC
++/*!
++ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation (portable scalar version)
++ \param input The input signal input
++ \param carrier The carrier signal input
++ \param VE_code Very Early PRN code replica input
++ \param E_code Early PRN code replica input
++ \param P_code Prompt PRN code replica input
++ \param L_code Late PRN code replica input
++ \param VL_code Very Late PRN code replica input
++ \param VE_out Very Early correlation output (a single accumulated value, reset to 0 on entry)
++ \param E_out Early correlation output (a single accumulated value, reset to 0 on entry)
++ \param P_out Prompt correlation output (a single accumulated value, reset to 0 on entry)
++ \param L_out Late correlation output (a single accumulated value, reset to 0 on entry)
++ \param VL_out Very Late correlation output (a single accumulated value, reset to 0 on entry)
++ \param num_points The number of complex values in vectors
++ */
++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
++{
++    lv_8sc_t bb_signal_sample;
++
++    bb_signal_sample = lv_cmake(0, 0);
++
++    *VE_out = 0;
++    *E_out = 0;
++    *P_out = 0;
++    *L_out = 0;
++    *VL_out = 0;
++    // perform very early, Early, Prompt, Late and very late correlation
++    for(unsigned int i = 0; i < num_points; ++i) // unsigned counter: matches num_points and cannot overflow before reaching it
++    {
++        //Perform the carrier wipe-off (the product is kept in the 8-bit complex type)
++        bb_signal_sample = input[i] * carrier[i];
++
++        *VE_out += (lv_32fc_t) (bb_signal_sample * VE_code[i]);
++        *E_out += (lv_32fc_t) (bb_signal_sample * E_code[i]);
++        *P_out += (lv_32fc_t) (bb_signal_sample * P_code[i]);
++        *L_out += (lv_32fc_t) (bb_signal_sample * L_code[i]);
++        *VL_out += (lv_32fc_t) (bb_signal_sample * VL_code[i]);
++    }
++}
++
++#endif /* LV_HAVE_GENERIC */
++
++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_u_H */
++
++
++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_a_H
++#define 
INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_a_H
++
++#include /* NOTE(review): the targets of these five #include lines appear to have been lost - restore them from the upstream file */
++#include
++#include
++#include
++#include
++
++#ifdef LV_HAVE_SSE4_1
++#include "smmintrin.h"
++#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
++#include "CommonMacros/CommonMacros.h"
++/*!
++ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation (SSE4.1 version using aligned loads, so every input pointer must be 16-byte aligned)
++ \param input The input signal input
++ \param carrier The carrier signal input
++ \param VE_code Very Early PRN code replica input
++ \param E_code Early PRN code replica input
++ \param P_code Prompt PRN code replica input
++ \param L_code Late PRN code replica input
++ \param VL_code Very Late PRN code replica input
++ \param VE_out Very Early correlation output (a single accumulated value, reset to 0 on entry)
++ \param E_out Early correlation output (a single accumulated value, reset to 0 on entry)
++ \param P_out Prompt correlation output (a single accumulated value, reset to 0 on entry)
++ \param L_out Late correlation output (a single accumulated value, reset to 0 on entry)
++ \param VL_out Very Late correlation output (a single accumulated value, reset to 0 on entry)
++ \param num_points The number of complex values in vectors
++ */
++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_a_sse4_1(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
++{
++    const unsigned int sse_iters = num_points / 8; // the vector loop consumes 8 samples per pass (all pointers advance by 8)
++
++    __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample;
++    __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output;
++
++    __m128 VE_code_acc, E_code_acc, P_code_acc, L_code_acc, VL_code_acc;
++    __m128i input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2;
++    __m128 output_ps;
++
++    const lv_8sc_t* input_ptr = input;
++    const lv_8sc_t* carrier_ptr = carrier;
++
++    const lv_8sc_t* VE_code_ptr = VE_code;
++    lv_32fc_t* VE_out_ptr = VE_out;
++    const lv_8sc_t* E_code_ptr = E_code;
++    lv_32fc_t* E_out_ptr = E_out;
++    const lv_8sc_t* P_code_ptr = P_code;
++    lv_32fc_t* P_out_ptr = P_out;
++    const lv_8sc_t* L_code_ptr = L_code;
++    lv_32fc_t* L_out_ptr = L_out;
++    const lv_8sc_t* VL_code_ptr = VL_code;
++    lv_32fc_t* VL_out_ptr = VL_out;
++
++    *VE_out_ptr = 0;
++    *E_out_ptr = 0;
++    *P_out_ptr = 0;
++    *L_out_ptr = 0;
++    *VL_out_ptr = 0;
++
++    mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); // 0xFF in every even byte; handed to the CM_* macros (presumably the real/imag byte mask - confirm in CommonMacros.h)
++
++    VE_code_acc = _mm_setzero_ps();
++    E_code_acc = _mm_setzero_ps();
++    P_code_acc = _mm_setzero_ps();
++    L_code_acc = _mm_setzero_ps();
++    VL_code_acc = _mm_setzero_ps();
++
++    if (sse_iters>0)
++    {
++        for(int number = 0;number < sse_iters; number++){
++
++            //Perform the carrier wipe-off (aligned 128-bit loads of the input and the carrier)
++            x = _mm_load_si128((__m128i*)input_ptr);
++            y = _mm_load_si128((__m128i*)carrier_ptr);
++
++            CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx)
++            CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)
++
++            CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
++
++            //Get very early values; y is reused to hold the code replica
++            y = _mm_load_si128((__m128i*)VE_code_ptr);
++
++            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
++
++            VE_code_acc = _mm_add_ps (VE_code_acc, output_ps);
++
++            //Get early values
++            y = _mm_load_si128((__m128i*)E_code_ptr);
++
++            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
++
++            E_code_acc = _mm_add_ps (E_code_acc, output_ps);
++
++            //Get prompt values
++            y = _mm_load_si128((__m128i*)P_code_ptr);
++
++            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
++
++            P_code_acc = _mm_add_ps (P_code_acc, output_ps);
++
++            //Get late values
++            y = _mm_load_si128((__m128i*)L_code_ptr);
++
++            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
++
++            L_code_acc = _mm_add_ps (L_code_acc, output_ps);
++
++            //Get very late values
++            y = _mm_load_si128((__m128i*)VL_code_ptr);
++
++            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
++
++            VL_code_acc = _mm_add_ps (VL_code_acc, output_ps);
++
++            input_ptr += 8;
++            carrier_ptr += 8;
++            VE_code_ptr += 8;
++            E_code_ptr += 8;
++            P_code_ptr += 8;
++            L_code_ptr += 8;
++            VL_code_ptr += 8;
++        }
++
++        __VOLK_ATTR_ALIGNED(16) lv_32fc_t VE_dotProductVector[2];
++        __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2];
++        __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2];
++        __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2];
++        __VOLK_ATTR_ALIGNED(16) lv_32fc_t VL_dotProductVector[2];
++
++        _mm_store_ps((float*)VE_dotProductVector,VE_code_acc); // Store the results back into the dot product vector
++        _mm_store_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector
++        _mm_store_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector
++        _mm_store_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector
++        _mm_store_ps((float*)VL_dotProductVector,VL_code_acc); // Store the results back into the dot product vector
++
++        for (int i = 0; i<2; ++i) // each accumulator holds two partial complex sums; fold both into the output
++        {
++            *VE_out_ptr += VE_dotProductVector[i];
++            *E_out_ptr += E_dotProductVector[i];
++            *P_out_ptr += P_dotProductVector[i];
++            *L_out_ptr += L_dotProductVector[i];
++            *VL_out_ptr += VL_dotProductVector[i];
++        }
++    }
++
++    lv_8sc_t bb_signal_sample;
++    for(int i=0; i < num_points%8; ++i) // scalar tail: samples left over after the 8-wide loop
++    {
++        //Perform the carrier wipe-off
++        bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
++        // Now get very early, early, prompt, late and very late values for each
++        *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VE_code_ptr++));
++        *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
++        *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
++        *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
++        *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VL_code_ptr++));
++    }
++}
++#endif /* LV_HAVE_SSE4_1 */
++
++#ifdef LV_HAVE_SSE2
++#include "emmintrin.h"
++#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
++#include "CommonMacros/CommonMacros.h"
++/*!
++ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation (SSE2 version using aligned loads, so every input pointer must be 16-byte aligned)
++ \param input The input signal input
++ \param carrier The carrier signal input
++ \param VE_code Very Early PRN code replica input
++ \param E_code Early PRN code replica input
++ \param P_code Prompt PRN code replica input
++ \param L_code Late PRN code replica input
++ \param VL_code Very Late PRN code replica input
++ \param VE_out Very Early correlation output (a single accumulated value, reset to 0 on entry)
++ \param E_out Early correlation output (a single accumulated value, reset to 0 on entry)
++ \param P_out Prompt correlation output (a single accumulated value, reset to 0 on entry)
++ \param L_out Late correlation output (a single accumulated value, reset to 0 on entry)
++ \param VL_out Very Late correlation output (a single accumulated value, reset to 0 on entry)
++ \param num_points The number of complex values in vectors
++ */
++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_a_sse2(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
++{
++    const unsigned int sse_iters = num_points / 8; // the vector loop consumes 8 samples per pass (all pointers advance by 8)
++
++    __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample;
++    __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output;
++
++    __m128 VE_code_acc, E_code_acc, P_code_acc, L_code_acc, VL_code_acc;
++    __m128i input_i_1, input_i_2, output_i32;
++    __m128 output_ps_1, output_ps_2;
++
++    const lv_8sc_t* input_ptr = input;
++    const lv_8sc_t* carrier_ptr = carrier;
++
++    const lv_8sc_t* VE_code_ptr = VE_code;
++    lv_32fc_t* VE_out_ptr = VE_out;
++    const lv_8sc_t* E_code_ptr = E_code;
++    lv_32fc_t* E_out_ptr = E_out;
++    const lv_8sc_t* P_code_ptr = P_code;
++    lv_32fc_t* P_out_ptr = P_out;
++    const lv_8sc_t* L_code_ptr = L_code;
++    lv_32fc_t* L_out_ptr = L_out;
++    const lv_8sc_t* VL_code_ptr = VL_code;
++    lv_32fc_t* VL_out_ptr = VL_out;
++
++    *VE_out_ptr = 0;
++    *E_out_ptr = 0;
++    *P_out_ptr = 0;
++    *L_out_ptr = 0;
++    *VL_out_ptr = 0;
++
++    mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); // 0xFF in every even byte; handed to the CM_* macros (presumably the real/imag byte mask - confirm in CommonMacros.h)
++
++    VE_code_acc = _mm_setzero_ps();
++    E_code_acc = _mm_setzero_ps();
++    P_code_acc = _mm_setzero_ps();
++    L_code_acc = _mm_setzero_ps();
++    VL_code_acc = _mm_setzero_ps();
++
++    if (sse_iters>0)
++    {
++        for(int number = 0;number < sse_iters; number++){
++
++            //Perform the carrier wipe-off (aligned 128-bit loads of the input and the carrier)
++            x = _mm_load_si128((__m128i*)input_ptr);
++            y = _mm_load_si128((__m128i*)carrier_ptr);
++
++            CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx)
++            CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)
++
++            CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
++
++            //Get very early values; y is reused to hold the code replica
++            y = _mm_load_si128((__m128i*)VE_code_ptr);
++
++            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
++
++            VE_code_acc = _mm_add_ps (VE_code_acc, output_ps_1);
++            VE_code_acc = _mm_add_ps (VE_code_acc, output_ps_2);
++
++            //Get early values
++            y = _mm_load_si128((__m128i*)E_code_ptr);
++
++            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
++
++            E_code_acc = _mm_add_ps (E_code_acc, output_ps_1);
++            E_code_acc = _mm_add_ps (E_code_acc, output_ps_2);
++
++            //Get prompt values
++            y = _mm_load_si128((__m128i*)P_code_ptr);
++
++            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
++
++            P_code_acc = _mm_add_ps (P_code_acc, output_ps_1);
++            P_code_acc = _mm_add_ps (P_code_acc, output_ps_2);
++
++            //Get late values
++            y = _mm_load_si128((__m128i*)L_code_ptr);
++
++            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
++
++            L_code_acc = _mm_add_ps (L_code_acc, output_ps_1);
++            L_code_acc = _mm_add_ps (L_code_acc, output_ps_2);
++
++            //Get very late values
++            y = _mm_load_si128((__m128i*)VL_code_ptr);
++
++            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
++
++            VL_code_acc = _mm_add_ps (VL_code_acc, output_ps_1);
++            VL_code_acc = _mm_add_ps (VL_code_acc, output_ps_2);
++
++            input_ptr += 8;
++            carrier_ptr += 8;
++            VE_code_ptr += 8;
++            E_code_ptr += 8;
++            P_code_ptr += 8;
++            L_code_ptr += 8;
++            VL_code_ptr += 8;
++        }
++
++        __VOLK_ATTR_ALIGNED(16) lv_32fc_t VE_dotProductVector[2];
++        __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2];
++        __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2];
++        __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2];
++        __VOLK_ATTR_ALIGNED(16) lv_32fc_t VL_dotProductVector[2];
++
++        _mm_store_ps((float*)VE_dotProductVector,VE_code_acc); // Store the results back into the dot product vector
++        _mm_store_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector
++        _mm_store_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector
++        _mm_store_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector
++        _mm_store_ps((float*)VL_dotProductVector,VL_code_acc); // Store the results back into the dot product vector
++
++        for (int i = 0; i<2; ++i) // each accumulator holds two partial complex sums; fold both into the output
++        {
++            *VE_out_ptr += VE_dotProductVector[i];
++            *E_out_ptr += E_dotProductVector[i];
++            *P_out_ptr += P_dotProductVector[i];
++            *L_out_ptr += L_dotProductVector[i];
++            *VL_out_ptr += VL_dotProductVector[i];
++        }
++    }
++
++    lv_8sc_t bb_signal_sample;
++    for(int i=0; i < num_points%8; ++i) // scalar tail: samples left over after the 8-wide loop
++    {
++        //Perform the carrier wipe-off
++        bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
++        // Now get very early, early, prompt, late and very late values for each
++        *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VE_code_ptr++));
++        *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
++        *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
++        *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
++        *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VL_code_ptr++));
++    }
++}
++#endif /* LV_HAVE_SSE2 */
++
++#ifdef LV_HAVE_GENERIC
++/*!
++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation ++ \param input The input signal input ++ \param carrier The carrier signal input ++ \param VE_code Very Early PRN code replica input ++ \param E_code Early PRN code replica input ++ \param P_code Prompt PRN code replica input ++ \param L_code Late PRN code replica input ++ \param VL_code Very Late PRN code replica input ++ \param VE_out Very Early correlation output ++ \param E_out Early correlation output ++ \param P_out Prompt correlation output ++ \param L_out Late correlation output ++ \param VL_out Very Late correlation output ++ \param num_points The number of complex values in vectors ++ */ ++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_a_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points) ++{ ++ lv_8sc_t bb_signal_sample; ++ ++ bb_signal_sample = lv_cmake(0, 0); ++ ++ *VE_out = 0; ++ *E_out = 0; ++ *P_out = 0; ++ *L_out = 0; ++ *VL_out = 0; ++ // perform very early, Early, Prompt, Late and very late correlation ++ for(int i=0; i < num_points; ++i) ++ { ++ //Perform the carrier wipe-off ++ bb_signal_sample = input[i] * carrier[i]; ++ ++ *VE_out += (lv_32fc_t) (bb_signal_sample * VE_code[i]); ++ *E_out += (lv_32fc_t) (bb_signal_sample * E_code[i]); ++ *P_out += (lv_32fc_t) (bb_signal_sample * P_code[i]); ++ *L_out += (lv_32fc_t) (bb_signal_sample * L_code[i]); ++ *VL_out += (lv_32fc_t) (bb_signal_sample * VL_code[i]); ++ } ++} ++ ++#endif /* LV_HAVE_GENERIC */ ++ ++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_a_H */ +\ No newline at end of file +diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5.h 
/Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5.h
+--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5.h 1970-01-01 01:00:00.000000000 +0100
++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5.h 2014-10-15 01:55:08.000000000 +0200
+@@ -0,0 +1,1520 @@
++/*!
++ * \file volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5.h
++ * \brief Volk protokernel: performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation with 16-bit vectors using different methods: u_sse4_1_first implements one method, u_sse4_1_second implements another, and so on. This protokernel has been created to test the performance of the different methods.
++ * \authors <ul>
++ *          <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
++ *          </ul>
++ *
++ * Volk protokernel that performs the carrier wipe-off mixing and the
++ * Very Early, Early, Prompt, Late and Very Late correlation with 16-bit vectors (8 bits for the
++ * real part and 8 bits for the imaginary part), and accumulates the result
++ * in 32-bit single-precision floating point values, returning float32 values:
++ * - The carrier wipe-off is done by multiplying the input signal by the
++ * carrier (multiplication of 16-bit vectors). It returns the input
++ * signal in base band (BB)
++ * - Very Early values are calculated by multiplying the input signal in BB by the
++ * very early code (multiplication of 16-bit vectors), accumulating the results into float32 values
++ * - Early values are calculated by multiplying the input signal in BB by the
++ * early code (multiplication of 16-bit vectors), accumulating the results into float32 values
++ * - Prompt values are calculated by multiplying the input signal in BB by the
++ * prompt code (multiplication of 16-bit vectors), accumulating the results into float32 values
++ * - Late values are calculated by multiplying the input signal in BB by the
++ * late code (multiplication of 16-bit vectors), accumulating the results into float32 values
++ * - Very Late values are calculated by multiplying the input signal in BB by the
++ * very late code (multiplication of 16-bit vectors), accumulating the results into float32 values
++ *
++ * -------------------------------------------------------------------------
++ *
++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
++ *
++ * GNSS-SDR is a software defined Global Navigation
++ * Satellite Systems receiver
++ *
++ * This file is part of GNSS-SDR.
++ *
++ * GNSS-SDR is free software: you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation, either version 3 of the License, or
++ * (at your option) any later version.
++ * ++ * GNSS-SDR is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with GNSS-SDR. If not, see . ++ * ++ * ------------------------------------------------------------------------- ++ */ ++ ++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_u_H ++#define INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_u_H ++ ++#include ++#include ++#include ++#include ++#include ++ ++#ifdef LV_HAVE_SSE4_1 ++#include "smmintrin.h" ++#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h" ++#include "CommonMacros/CommonMacros.h" ++/*! ++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation ++ \param input The input signal input ++ \param carrier The carrier signal input ++ \param VE_code Very Early PRN code replica input ++ \param E_code Early PRN code replica input ++ \param P_code Prompt PRN code replica input ++ \param L_code Late PRN code replica input ++ \param VL_code Very Late PRN code replica input ++ \param VE_out Very Early correlation output ++ \param E_out Early correlation output ++ \param P_out Prompt correlation output ++ \param L_out Late correlation output ++ \param VL_out Very Late correlation output ++ \param num_points The number of complex values in vectors ++ */ ++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_u_sse4_1_first(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points) ++{ ++ const unsigned int sse_iters = num_points / 8; ++ ++ __m128i x, y, real_bb_signal_sample, 
imag_bb_signal_sample; ++ __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output; ++ ++ __m128 VE_code_acc, E_code_acc, P_code_acc, L_code_acc, VL_code_acc; ++ __m128i input_i_1, input_i_2, output_i32; ++ __m128 output_ps_1, output_ps_2; ++ ++ const lv_8sc_t* input_ptr = input; ++ const lv_8sc_t* carrier_ptr = carrier; ++ ++ const lv_8sc_t* VE_code_ptr = VE_code; ++ lv_32fc_t* VE_out_ptr = VE_out; ++ const lv_8sc_t* E_code_ptr = E_code; ++ lv_32fc_t* E_out_ptr = E_out; ++ const lv_8sc_t* P_code_ptr = P_code; ++ lv_32fc_t* P_out_ptr = P_out; ++ const lv_8sc_t* L_code_ptr = L_code; ++ lv_32fc_t* L_out_ptr = L_out; ++ const lv_8sc_t* VL_code_ptr = VL_code; ++ lv_32fc_t* VL_out_ptr = VL_out; ++ ++ *VE_out_ptr = 0; ++ *E_out_ptr = 0; ++ *P_out_ptr = 0; ++ *L_out_ptr = 0; ++ *VL_out_ptr = 0; ++ ++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); ++ ++ VE_code_acc = _mm_setzero_ps(); ++ E_code_acc = _mm_setzero_ps(); ++ P_code_acc = _mm_setzero_ps(); ++ L_code_acc = _mm_setzero_ps(); ++ VL_code_acc = _mm_setzero_ps(); ++ ++ if (sse_iters>0) ++ { ++ for(int number = 0;number < sse_iters; number++){ ++ ++ //Perform the carrier wipe-off ++ x = _mm_lddqu_si128((__m128i*)input_ptr); ++ y = _mm_lddqu_si128((__m128i*)carrier_ptr); ++ ++ imagx = _mm_srli_si128 (x, 1); ++ imagx = _mm_and_si128 (imagx, mult1); ++ realx = _mm_and_si128 (x, mult1); ++ ++ imagy = _mm_srli_si128 (y, 1); ++ imagy = _mm_and_si128 (imagy, mult1); ++ realy = _mm_and_si128 (y, mult1); ++ ++ realx_mult_realy = _mm_mullo_epi16 (realx, realy); ++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); ++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); ++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); ++ ++ real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++ imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++ ++ //Get very early 
values ++ y = _mm_lddqu_si128((__m128i*)VE_code_ptr); ++ ++ imagy = _mm_srli_si128 (y, 1); ++ imagy = _mm_and_si128 (imagy, mult1); ++ realy = _mm_and_si128 (y, mult1); ++ ++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); ++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); ++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); ++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); ++ ++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++ ++ imag_output = _mm_slli_si128 (imag_output, 1); ++ output = _mm_blendv_epi8 (imag_output, real_output, mult1); ++ ++ input_i_1 = _mm_cvtepi8_epi32(output); ++ output = _mm_srli_si128 (output, 4); ++ input_i_2 = _mm_cvtepi8_epi32(output); ++ output = _mm_srli_si128 (output, 4); ++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2); ++ output_ps_1 = _mm_cvtepi32_ps(output_i32); ++ ++ input_i_1 = _mm_cvtepi8_epi32(output); ++ output = _mm_srli_si128 (output, 4); ++ input_i_2 = _mm_cvtepi8_epi32(output); ++ output = _mm_srli_si128 (output, 4); ++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2); ++ output_ps_2 = _mm_cvtepi32_ps(output_i32); ++ ++ VE_code_acc = _mm_add_ps (VE_code_acc, output_ps_1); ++ VE_code_acc = _mm_add_ps (VE_code_acc, output_ps_2); ++ ++ //Get early values ++ y = _mm_lddqu_si128((__m128i*)E_code_ptr); ++ ++ imagy = _mm_srli_si128 (y, 1); ++ imagy = _mm_and_si128 (imagy, mult1); ++ realy = _mm_and_si128 (y, mult1); ++ ++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); ++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); ++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); ++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); ++ ++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++ ++ imag_output = 
_mm_slli_si128 (imag_output, 1); ++ output = _mm_blendv_epi8 (imag_output, real_output, mult1); ++ ++ input_i_1 = _mm_cvtepi8_epi32(output); ++ output = _mm_srli_si128 (output, 4); ++ input_i_2 = _mm_cvtepi8_epi32(output); ++ output = _mm_srli_si128 (output, 4); ++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2); ++ output_ps_1 = _mm_cvtepi32_ps(output_i32); ++ ++ input_i_1 = _mm_cvtepi8_epi32(output); ++ output = _mm_srli_si128 (output, 4); ++ input_i_2 = _mm_cvtepi8_epi32(output); ++ output = _mm_srli_si128 (output, 4); ++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2); ++ output_ps_2 = _mm_cvtepi32_ps(output_i32); ++ ++ E_code_acc = _mm_add_ps (E_code_acc, output_ps_1); ++ E_code_acc = _mm_add_ps (E_code_acc, output_ps_2); ++ ++ //Get prompt values ++ y = _mm_lddqu_si128((__m128i*)P_code_ptr); ++ ++ imagy = _mm_srli_si128 (y, 1); ++ imagy = _mm_and_si128 (imagy, mult1); ++ realy = _mm_and_si128 (y, mult1); ++ ++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); ++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); ++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); ++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); ++ ++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++ ++ imag_output = _mm_slli_si128 (imag_output, 1); ++ output = _mm_blendv_epi8 (imag_output, real_output, mult1); ++ ++ input_i_1 = _mm_cvtepi8_epi32(output); ++ output = _mm_srli_si128 (output, 4); ++ input_i_2 = _mm_cvtepi8_epi32(output); ++ output = _mm_srli_si128 (output, 4); ++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2); ++ output_ps_1 = _mm_cvtepi32_ps(output_i32); ++ ++ input_i_1 = _mm_cvtepi8_epi32(output); ++ output = _mm_srli_si128 (output, 4); ++ input_i_2 = _mm_cvtepi8_epi32(output); ++ output = _mm_srli_si128 (output, 4); ++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2); ++ output_ps_2 = 
_mm_cvtepi32_ps(output_i32); ++ ++ P_code_acc = _mm_add_ps (P_code_acc, output_ps_1); ++ P_code_acc = _mm_add_ps (P_code_acc, output_ps_2); ++ ++ //Get late values ++ y = _mm_lddqu_si128((__m128i*)L_code_ptr); ++ ++ imagy = _mm_srli_si128 (y, 1); ++ imagy = _mm_and_si128 (imagy, mult1); ++ realy = _mm_and_si128 (y, mult1); ++ ++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); ++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); ++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); ++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); ++ ++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++ ++ imag_output = _mm_slli_si128 (imag_output, 1); ++ output = _mm_blendv_epi8 (imag_output, real_output, mult1); ++ ++ input_i_1 = _mm_cvtepi8_epi32(output); ++ output = _mm_srli_si128 (output, 4); ++ input_i_2 = _mm_cvtepi8_epi32(output); ++ output = _mm_srli_si128 (output, 4); ++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2); ++ output_ps_1 = _mm_cvtepi32_ps(output_i32); ++ ++ input_i_1 = _mm_cvtepi8_epi32(output); ++ output = _mm_srli_si128 (output, 4); ++ input_i_2 = _mm_cvtepi8_epi32(output); ++ output = _mm_srli_si128 (output, 4); ++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2); ++ output_ps_2 = _mm_cvtepi32_ps(output_i32); ++ ++ L_code_acc = _mm_add_ps (L_code_acc, output_ps_1); ++ L_code_acc = _mm_add_ps (L_code_acc, output_ps_2); ++ ++ //Get very late values ++ y = _mm_lddqu_si128((__m128i*)VL_code_ptr); ++ ++ imagy = _mm_srli_si128 (y, 1); ++ imagy = _mm_and_si128 (imagy, mult1); ++ realy = _mm_and_si128 (y, mult1); ++ ++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); ++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); ++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); ++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); ++ ++ 
real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++ ++ imag_output = _mm_slli_si128 (imag_output, 1); ++ output = _mm_blendv_epi8 (imag_output, real_output, mult1); ++ ++ input_i_1 = _mm_cvtepi8_epi32(output); ++ output = _mm_srli_si128 (output, 4); ++ input_i_2 = _mm_cvtepi8_epi32(output); ++ output = _mm_srli_si128 (output, 4); ++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2); ++ output_ps_1 = _mm_cvtepi32_ps(output_i32); ++ ++ input_i_1 = _mm_cvtepi8_epi32(output); ++ output = _mm_srli_si128 (output, 4); ++ input_i_2 = _mm_cvtepi8_epi32(output); ++ output = _mm_srli_si128 (output, 4); ++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2); ++ output_ps_2 = _mm_cvtepi32_ps(output_i32); ++ ++ VL_code_acc = _mm_add_ps (VL_code_acc, output_ps_1); ++ VL_code_acc = _mm_add_ps (VL_code_acc, output_ps_2); ++ ++ input_ptr += 8; ++ carrier_ptr += 8; ++ VE_code_ptr += 8; ++ E_code_ptr += 8; ++ P_code_ptr += 8; ++ L_code_ptr += 8; ++ VL_code_ptr += 8; ++ } ++ ++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t VE_dotProductVector[2]; ++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2]; ++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2]; ++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2]; ++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t VL_dotProductVector[2]; ++ ++ _mm_storeu_ps((float*)VE_dotProductVector,VE_code_acc); // Store the results back into the dot product vector ++ _mm_storeu_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector ++ _mm_storeu_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector ++ _mm_storeu_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector ++ _mm_storeu_ps((float*)VL_dotProductVector,VL_code_acc); // Store the results back into the dot product vector ++ ++ for (int i = 0; i<2; ++i) ++ { ++ *VE_out_ptr += VE_dotProductVector[i]; ++ 
*E_out_ptr += E_dotProductVector[i]; ++ *P_out_ptr += P_dotProductVector[i]; ++ *L_out_ptr += L_dotProductVector[i]; ++ *VL_out_ptr += VL_dotProductVector[i]; ++ } ++ } ++ ++ lv_8sc_t bb_signal_sample; ++ for(int i=0; i < num_points%8; ++i) ++ { ++ //Perform the carrier wipe-off ++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); ++ // Now get very early, early, prompt, late and very late values for each ++ *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VE_code_ptr++)); ++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); ++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); ++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); ++ *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VL_code_ptr++)); ++ } ++} ++#endif /* LV_HAVE_SSE4_1 */

#ifdef LV_HAVE_SSE4_1
#include "smmintrin.h"
#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
#include "CommonMacros/CommonMacros.h"
/*!
    \brief Carrier wipe-off followed by Very Early, Early, Prompt, Late and Very Late
           correlation (unaligned loads, 8-bit intermediate results).

    Each SIMD iteration consumes 8 complex samples (the pointers advance by 8 elements and
    16 bytes are loaded per vector). The input is multiplied by the carrier and only the low
    byte of every 16-bit real/imaginary product is kept. That 8-bit baseband sample is then
    multiplied by each code replica; again only the low byte of each product is kept before it
    is widened to 32 bits, converted to float and accumulated. The last num_points % 8 samples
    are processed by a scalar loop.

    NOTE(review): the byte-level tricks assume lv_8sc_t stores the real part in the even byte
    and the imaginary part in the odd byte - confirm against the type definition.

    \param VE_out     Very Early correlation output (overwritten)
    \param E_out      Early correlation output (overwritten)
    \param P_out      Prompt correlation output (overwritten)
    \param L_out      Late correlation output (overwritten)
    \param VL_out     Very Late correlation output (overwritten)
    \param input      The input signal
    \param carrier    The carrier signal
    \param VE_code    Very Early PRN code replica
    \param E_code     Early PRN code replica
    \param P_code     Prompt PRN code replica
    \param L_code     Late PRN code replica
    \param VL_code    Very Late PRN code replica
    \param num_points The number of complex values in vectors
 */
static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_u_sse4_1_second(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
{
    const unsigned int sse_iters = num_points / 8;
    const unsigned int tail_points = num_points % 8;
    unsigned int n, k, i;

    // The five correlator taps, always walked in the order VE, E, P, L, VL.
    const lv_8sc_t* code_ptr[5] = {VE_code, E_code, P_code, L_code, VL_code};
    lv_32fc_t* out_ptr[5] = {VE_out, E_out, P_out, L_out, VL_out};
    __m128 code_acc[5];

    const lv_8sc_t* input_ptr = input;
    const lv_8sc_t* carrier_ptr = carrier;

    // 0xFF in every even byte: picks the "real" bytes when blending.
    const __m128i even_bytes_mask = _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1);
    // +1 on even bytes, -1 on odd bytes: turns a*c + b*d into a*c - b*d inside maddubs.
    const __m128i negate_odd_bytes = _mm_set_epi8(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
    // Swaps the two bytes of every 16-bit pair, so maddubs yields a*d + b*c.
    const __m128i swap_byte_pairs = _mm_set_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);

    for (k = 0; k < 5; ++k)
        {
            *out_ptr[k] = 0;
            code_acc[k] = _mm_setzero_ps();
        }

    for (n = 0; n < sse_iters; ++n)
        {
            // Carrier wipe-off. _mm_maddubs_epi16 multiplies unsigned by signed bytes, so the
            // sign of the input is transferred onto the carrier and |input| is used instead.
            const __m128i x = _mm_lddqu_si128((const __m128i*)input_ptr);
            const __m128i c = _mm_lddqu_si128((const __m128i*)carrier_ptr);
            const __m128i x_abs = _mm_abs_epi8(x);
            __m128i rhs, re16, im16;

            rhs = _mm_sign_epi8(_mm_sign_epi8(c, x), negate_odd_bytes);
            re16 = _mm_maddubs_epi16(x_abs, rhs);

            rhs = _mm_sign_epi8(_mm_shuffle_epi8(c, swap_byte_pairs), x);
            im16 = _mm_maddubs_epi16(x_abs, rhs);

            // Interleave the low bytes of the real/imaginary products back into 8-bit samples.
            const __m128i bb = _mm_blendv_epi8(_mm_slli_si128(im16, 1), re16, even_bytes_mask);
            const __m128i bb_abs = _mm_abs_epi8(bb);

            for (k = 0; k < 5; ++k)
                {
                    const __m128i code = _mm_lddqu_si128((const __m128i*)code_ptr[k]);
                    __m128i packed, first_quad, second_quad;
                    __m128 part_1, part_2;

                    rhs = _mm_sign_epi8(_mm_sign_epi8(code, bb), negate_odd_bytes);
                    re16 = _mm_maddubs_epi16(bb_abs, rhs);

                    rhs = _mm_sign_epi8(_mm_shuffle_epi8(code, swap_byte_pairs), bb);
                    im16 = _mm_maddubs_epi16(bb_abs, rhs);

                    packed = _mm_blendv_epi8(_mm_slli_si128(im16, 1), re16, even_bytes_mask);

                    // Bytes 0..7: sign-extend four bytes at a time and add the two quads.
                    first_quad = _mm_cvtepi8_epi32(packed);
                    packed = _mm_srli_si128(packed, 4);
                    second_quad = _mm_cvtepi8_epi32(packed);
                    packed = _mm_srli_si128(packed, 4);
                    part_1 = _mm_cvtepi32_ps(_mm_add_epi32(first_quad, second_quad));

                    // Bytes 8..15, same treatment.
                    first_quad = _mm_cvtepi8_epi32(packed);
                    packed = _mm_srli_si128(packed, 4);
                    second_quad = _mm_cvtepi8_epi32(packed);
                    part_2 = _mm_cvtepi32_ps(_mm_add_epi32(first_quad, second_quad));

                    code_acc[k] = _mm_add_ps(code_acc[k], part_1);
                    code_acc[k] = _mm_add_ps(code_acc[k], part_2);

                    code_ptr[k] += 8;
                }

            input_ptr += 8;
            carrier_ptr += 8;
        }

    // Each accumulator holds two complex partial sums (re, im, re, im); fold them into the outputs.
    __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[5][2];
    for (k = 0; k < 5; ++k)
        {
            _mm_storeu_ps((float*)dotProductVector[k], code_acc[k]);
        }
    for (i = 0; i < 2; ++i)
        {
            for (k = 0; k < 5; ++k)
                {
                    *out_ptr[k] += dotProductVector[k][i];
                }
        }

    // Scalar tail for the samples that do not fill a whole vector.
    for (i = 0; i < tail_points; ++i)
        {
            // Carrier wipe-off
            const lv_8sc_t bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
            // Very early, early, prompt, late and very late contributions
            for (k = 0; k < 5; ++k)
                {
                    *out_ptr[k] += (lv_32fc_t) (bb_signal_sample * (*code_ptr[k]++));
                }
        }
}
#endif /* LV_HAVE_SSE4_1 */

#ifdef LV_HAVE_SSE4_1
#include "smmintrin.h"
#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
#include "CommonMacros/CommonMacros.h"
/*!
    \brief Carrier wipe-off followed by Very Early, Early, Prompt, Late and Very Late
           correlation (unaligned loads, 16-bit correlation products).

    Each SIMD iteration consumes 8 complex samples. The input is multiplied by the carrier and
    the low byte of every 16-bit real/imaginary product is kept as the 8-bit baseband sample.
    That sample is multiplied by each code replica; the 16-bit real and imaginary products are
    widened to 32 bits, converted to float and accumulated in separate real and imaginary
    accumulators. The last num_points % 8 samples are processed by a scalar loop that adds onto
    the SIMD result.

    The outputs are written on every call, including when num_points < 8, so the scalar loop
    never accumulates onto stale memory.

    NOTE(review): the byte-level tricks assume lv_8sc_t stores the real part in the even byte
    and the imaginary part in the odd byte - confirm against the type definition.

    \param VE_out     Very Early correlation output (overwritten)
    \param E_out      Early correlation output (overwritten)
    \param P_out      Prompt correlation output (overwritten)
    \param L_out      Late correlation output (overwritten)
    \param VL_out     Very Late correlation output (overwritten)
    \param input      The input signal
    \param carrier    The carrier signal
    \param VE_code    Very Early PRN code replica
    \param E_code     Early PRN code replica
    \param P_code     Prompt PRN code replica
    \param L_code     Late PRN code replica
    \param VL_code    Very Late PRN code replica
    \param num_points The number of complex values in vectors
 */
static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_u_sse4_1_third(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
{
    const unsigned int sse_iters = num_points / 8;
    const unsigned int tail_points = num_points % 8;
    unsigned int n, k, i;

    // The five correlator taps, always walked in the order VE, E, P, L, VL.
    const lv_8sc_t* code_ptr[5] = {VE_code, E_code, P_code, L_code, VL_code};
    lv_32fc_t* out_ptr[5] = {VE_out, E_out, P_out, L_out, VL_out};
    __m128 real_code_acc[5];
    __m128 imag_code_acc[5];
    float out_real[5];
    float out_imag[5];

    const lv_8sc_t* input_ptr = input;
    const lv_8sc_t* carrier_ptr = carrier;

    // 0xFF in every even byte: picks the "real" bytes when blending.
    const __m128i even_bytes_mask = _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1);
    // +1 on even bytes, -1 on odd bytes: turns a*c + b*d into a*c - b*d inside maddubs.
    const __m128i negate_odd_bytes = _mm_set_epi8(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
    // Swaps the two bytes of every 16-bit pair, so maddubs yields a*d + b*c.
    const __m128i swap_byte_pairs = _mm_set_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);

    for (k = 0; k < 5; ++k)
        {
            real_code_acc[k] = _mm_setzero_ps();
            imag_code_acc[k] = _mm_setzero_ps();
            out_real[k] = 0;
            out_imag[k] = 0;
        }

    for (n = 0; n < sse_iters; ++n)
        {
            // Carrier wipe-off. _mm_maddubs_epi16 multiplies unsigned by signed bytes, so the
            // sign of the input is transferred onto the carrier and |input| is used instead.
            const __m128i x = _mm_lddqu_si128((const __m128i*)input_ptr);
            const __m128i c = _mm_lddqu_si128((const __m128i*)carrier_ptr);
            const __m128i x_abs = _mm_abs_epi8(x);
            __m128i rhs, re16, im16;

            rhs = _mm_sign_epi8(_mm_sign_epi8(c, x), negate_odd_bytes);
            re16 = _mm_maddubs_epi16(x_abs, rhs);

            rhs = _mm_sign_epi8(_mm_shuffle_epi8(c, swap_byte_pairs), x);
            im16 = _mm_maddubs_epi16(x_abs, rhs);

            // Interleave the low bytes of the real/imaginary products back into 8-bit samples.
            const __m128i bb = _mm_blendv_epi8(_mm_slli_si128(im16, 1), re16, even_bytes_mask);
            const __m128i bb_abs = _mm_abs_epi8(bb);

            for (k = 0; k < 5; ++k)
                {
                    const __m128i code = _mm_lddqu_si128((const __m128i*)code_ptr[k]);
                    __m128i lower_half, upper_half;

                    rhs = _mm_sign_epi8(_mm_sign_epi8(code, bb), negate_odd_bytes);
                    re16 = _mm_maddubs_epi16(bb_abs, rhs);

                    rhs = _mm_sign_epi8(_mm_shuffle_epi8(code, swap_byte_pairs), bb);
                    im16 = _mm_maddubs_epi16(bb_abs, rhs);

                    // Real parts: widen lanes 0..3 and 4..7 to 32 bits, add, convert to float.
                    lower_half = _mm_cvtepi16_epi32(re16);
                    upper_half = _mm_cvtepi16_epi32(_mm_srli_si128(re16, 8));
                    real_code_acc[k] = _mm_add_ps(real_code_acc[k], _mm_cvtepi32_ps(_mm_add_epi32(lower_half, upper_half)));

                    // Imaginary parts, same treatment.
                    lower_half = _mm_cvtepi16_epi32(im16);
                    upper_half = _mm_cvtepi16_epi32(_mm_srli_si128(im16, 8));
                    imag_code_acc[k] = _mm_add_ps(imag_code_acc[k], _mm_cvtepi32_ps(_mm_add_epi32(lower_half, upper_half)));

                    code_ptr[k] += 8;
                }

            input_ptr += 8;
            carrier_ptr += 8;
        }

    // Horizontal reduction of the four float lanes of every accumulator.
    __VOLK_ATTR_ALIGNED(16) float real_dotProductVector[5][4];
    __VOLK_ATTR_ALIGNED(16) float imag_dotProductVector[5][4];
    for (k = 0; k < 5; ++k)
        {
            _mm_storeu_ps((float*)real_dotProductVector[k], real_code_acc[k]);
            _mm_storeu_ps((float*)imag_dotProductVector[k], imag_code_acc[k]);
        }
    for (i = 0; i < 4; ++i)
        {
            for (k = 0; k < 5; ++k)
                {
                    out_real[k] += real_dotProductVector[k][i];
                    out_imag[k] += imag_dotProductVector[k][i];
                }
        }

    // Always publish the SIMD result (zero when num_points < 8): the scalar loop below uses
    // "+=" and must not start from whatever the caller's buffers happened to contain.
    for (k = 0; k < 5; ++k)
        {
            *out_ptr[k] = lv_cmake(out_real[k], out_imag[k]);
        }

    // Scalar tail for the samples that do not fill a whole vector.
    lv_16sc_t bb_signal_sample;
    for (i = 0; i < tail_points; ++i)
        {
            // Carrier wipe-off
            bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
            // Very early, early, prompt, late and very late contributions
            for (k = 0; k < 5; ++k)
                {
                    *out_ptr[k] += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*code_ptr[k]++));
                }
        }
}
#endif /* LV_HAVE_SSE4_1 */

#ifdef LV_HAVE_SSE4_1 ++#include "smmintrin.h" ++#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h" ++#include "CommonMacros/CommonMacros.h" ++/*!
++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation ++ \param input The input signal input ++ \param carrier The carrier signal input ++ \param VE_code Very Early PRN code replica input ++ \param E_code Early PRN code replica input ++ \param P_code Prompt PRN code replica input ++ \param L_code Late PRN code replica input ++ \param VL_code Very Late PRN code replica input ++ \param VE_out Very Early correlation output ++ \param E_out Early correlation output ++ \param P_out Prompt correlation output ++ \param L_out Late correlation output ++ \param VL_out Very Late correlation output ++ \param num_points The number of complex values in vectors ++ */ ++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_u_sse4_1_fourth(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points) ++{ ++ const unsigned int sse_iters = num_points / 8; ++ ++ __m128i x, x_abs, y, y_aux, bb_signal_sample_aux, bb_signal_sample_aux_abs;; ++ __m128i real_output, imag_output; ++ __m128 real_VE_code_acc, imag_VE_code_acc, real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc, real_VL_code_acc, imag_VL_code_acc; ++ __m128i real_output_i_1, real_output_i_2, imag_output_i_1, imag_output_i_2, real_output_i32, imag_output_i32; ++ __m128 real_output_ps, imag_output_ps; ++ __m128i minus128control; ++ ++ __m128i minus128 = _mm_set1_epi8 (-128); ++ __m128i check_sign_sequence = _mm_set_epi8 (255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1); ++ __m128i rearrange_sequence = _mm_set_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1); ++ __m128i mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); ++ ++ const lv_8sc_t* input_ptr = 
input; ++ const lv_8sc_t* carrier_ptr = carrier; ++ ++ const lv_8sc_t* VE_code_ptr = VE_code; ++ lv_32fc_t* VE_out_ptr = VE_out; ++ const lv_8sc_t* E_code_ptr = E_code; ++ lv_32fc_t* E_out_ptr = E_out; ++ const lv_8sc_t* P_code_ptr = P_code; ++ lv_32fc_t* P_out_ptr = P_out; ++ const lv_8sc_t* L_code_ptr = L_code; ++ lv_32fc_t* L_out_ptr = L_out; ++ const lv_8sc_t* VL_code_ptr = VL_code; ++ lv_32fc_t* VL_out_ptr = VL_out; ++ ++ float VE_out_real = 0; ++ float VE_out_imag = 0; ++ float E_out_real = 0; ++ float E_out_imag = 0; ++ float P_out_real = 0; ++ float P_out_imag = 0; ++ float L_out_real = 0; ++ float L_out_imag = 0; ++ float VL_out_real = 0; ++ float VL_out_imag = 0; ++ ++ real_VE_code_acc = _mm_setzero_ps(); ++ imag_VE_code_acc = _mm_setzero_ps(); ++ real_E_code_acc = _mm_setzero_ps(); ++ imag_E_code_acc = _mm_setzero_ps(); ++ real_P_code_acc = _mm_setzero_ps(); ++ imag_P_code_acc = _mm_setzero_ps(); ++ real_L_code_acc = _mm_setzero_ps(); ++ imag_L_code_acc = _mm_setzero_ps(); ++ real_VL_code_acc = _mm_setzero_ps(); ++ imag_VL_code_acc = _mm_setzero_ps(); ++ ++ if (sse_iters>0) ++ { ++ for(int number = 0;number < sse_iters; number++){ ++ ++ //Perform the carrier wipe-off ++ x = _mm_lddqu_si128((__m128i*)input_ptr); ++ y = _mm_lddqu_si128((__m128i*)carrier_ptr); ++ ++ x_abs = _mm_abs_epi8 (x); ++ ++ y_aux = _mm_sign_epi8 (y, x); ++ y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); ++ real_output = _mm_maddubs_epi16 (x_abs, y_aux); ++ ++ y_aux = _mm_shuffle_epi8 (y, rearrange_sequence); ++ y_aux = _mm_sign_epi8 (y_aux, x); ++ imag_output = _mm_maddubs_epi16 (x_abs, y_aux); ++ ++ imag_output = _mm_slli_si128 (imag_output, 1); ++ bb_signal_sample_aux = _mm_blendv_epi8 (imag_output, real_output, mult1); ++ bb_signal_sample_aux_abs = _mm_abs_epi8 (bb_signal_sample_aux); ++ ++ //Get very early values ++ y = _mm_lddqu_si128((__m128i*)VE_code_ptr); ++ minus128control = _mm_cmpeq_epi8 (y, minus128); ++ y = _mm_sub_epi8 (y, minus128control); ++ ++ y_aux = 
_mm_sign_epi8 (y, bb_signal_sample_aux); ++ y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); ++ real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); ++ ++ y_aux = _mm_shuffle_epi8 (y, rearrange_sequence); ++ y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux); ++ imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); ++ ++ real_output_i_1 = _mm_cvtepi16_epi32(real_output); ++ real_output = _mm_srli_si128 (real_output, 8); ++ real_output_i_2 = _mm_cvtepi16_epi32(real_output); ++ real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); ++ real_output_ps = _mm_cvtepi32_ps(real_output_i32); ++ ++ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); ++ imag_output = _mm_srli_si128 (imag_output, 8); ++ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); ++ imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); ++ imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); ++ ++ real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps); ++ imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps); ++ ++ //Get early values ++ y = _mm_lddqu_si128((__m128i*)E_code_ptr); ++ minus128control = _mm_cmpeq_epi8 (y, minus128); ++ y = _mm_sub_epi8 (y, minus128control); ++ ++ y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux); ++ y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); ++ real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); ++ ++ y_aux = _mm_shuffle_epi8 (y, rearrange_sequence); ++ y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux); ++ imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); ++ ++ real_output_i_1 = _mm_cvtepi16_epi32(real_output); ++ real_output = _mm_srli_si128 (real_output, 8); ++ real_output_i_2 = _mm_cvtepi16_epi32(real_output); ++ real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); ++ real_output_ps = _mm_cvtepi32_ps(real_output_i32); ++ ++ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); ++ imag_output = _mm_srli_si128 (imag_output, 8); ++ imag_output_i_2 
= _mm_cvtepi16_epi32(imag_output); ++ imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); ++ imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); ++ ++ real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps); ++ imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps); ++ ++ //Get prompt values ++ y = _mm_lddqu_si128((__m128i*)P_code_ptr); ++ minus128control = _mm_cmpeq_epi8 (y, minus128); ++ y = _mm_sub_epi8 (y, minus128control); ++ ++ y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux); ++ y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); ++ real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); ++ ++ y_aux = _mm_shuffle_epi8 (y, rearrange_sequence); ++ y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux); ++ imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); ++ ++ real_output_i_1 = _mm_cvtepi16_epi32(real_output); ++ real_output = _mm_srli_si128 (real_output, 8); ++ real_output_i_2 = _mm_cvtepi16_epi32(real_output); ++ real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); ++ real_output_ps = _mm_cvtepi32_ps(real_output_i32); ++ ++ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); ++ imag_output = _mm_srli_si128 (imag_output, 8); ++ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); ++ imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); ++ imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); ++ ++ real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps); ++ imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps); ++ ++ //Get late values ++ y = _mm_lddqu_si128((__m128i*)L_code_ptr); ++ minus128control = _mm_cmpeq_epi8 (y, minus128); ++ y = _mm_sub_epi8 (y, minus128control); ++ ++ y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux); ++ y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); ++ real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); ++ ++ y_aux = _mm_shuffle_epi8 (y, rearrange_sequence); ++ y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux); ++ 
imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); ++ ++ real_output_i_1 = _mm_cvtepi16_epi32(real_output); ++ real_output = _mm_srli_si128 (real_output, 8); ++ real_output_i_2 = _mm_cvtepi16_epi32(real_output); ++ real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); ++ real_output_ps = _mm_cvtepi32_ps(real_output_i32); ++ ++ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); ++ imag_output = _mm_srli_si128 (imag_output, 8); ++ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); ++ imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); ++ imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); ++ ++ real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps); ++ imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps); ++ ++ //Get very late values ++ y = _mm_lddqu_si128((__m128i*)VL_code_ptr); ++ minus128control = _mm_cmpeq_epi8 (y, minus128); ++ y = _mm_sub_epi8 (y, minus128control); ++ ++ ++ y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux); ++ y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); ++ real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); ++ ++ y_aux = _mm_shuffle_epi8 (y, _mm_set_epi8 (14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1)); ++ y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux); ++ imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); ++ ++ real_output_i_1 = _mm_cvtepi16_epi32(real_output); ++ real_output = _mm_srli_si128 (real_output, 8); ++ real_output_i_2 = _mm_cvtepi16_epi32(real_output); ++ real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); ++ real_output_ps = _mm_cvtepi32_ps(real_output_i32); ++ ++ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); ++ imag_output = _mm_srli_si128 (imag_output, 8); ++ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); ++ imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); ++ imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); ++ ++ real_VL_code_acc = _mm_add_ps (real_VL_code_acc, 
real_output_ps); ++ imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps); ++ ++ input_ptr += 8; ++ carrier_ptr += 8; ++ VE_code_ptr += 8; ++ E_code_ptr += 8; ++ P_code_ptr += 8; ++ L_code_ptr += 8; ++ VL_code_ptr += 8; ++ } ++ ++ __VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4]; ++ __VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4]; ++ __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; ++ __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; ++ __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; ++ __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; ++ __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; ++ __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4]; ++ __VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4]; ++ __VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4]; ++ ++ _mm_storeu_ps((float*)real_VE_dotProductVector,real_VE_code_acc); // Store the results back into the dot product vector ++ _mm_storeu_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc); // Store the results back into the dot product vector ++ _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector ++ _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector ++ _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector ++ _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector ++ _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector ++ _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector ++ _mm_storeu_ps((float*)real_VL_dotProductVector,real_VL_code_acc); // Store the results back into the dot product vector ++ 
_mm_storeu_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc); // Store the results back into the dot product vector ++ ++ for (int i = 0; i<4; ++i) ++ { ++ VE_out_real += real_VE_dotProductVector[i]; ++ VE_out_imag += imag_VE_dotProductVector[i]; ++ E_out_real += real_E_dotProductVector[i]; ++ E_out_imag += imag_E_dotProductVector[i]; ++ P_out_real += real_P_dotProductVector[i]; ++ P_out_imag += imag_P_dotProductVector[i]; ++ L_out_real += real_L_dotProductVector[i]; ++ L_out_imag += imag_L_dotProductVector[i]; ++ VL_out_real += real_VL_dotProductVector[i]; ++ VL_out_imag += imag_VL_dotProductVector[i]; ++ } ++ *VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag); ++ *E_out_ptr = lv_cmake(E_out_real, E_out_imag); ++ *P_out_ptr = lv_cmake(P_out_real, P_out_imag); ++ *L_out_ptr = lv_cmake(L_out_real, L_out_imag); ++ *VL_out_ptr = lv_cmake(VL_out_real, VL_out_imag); ++ } ++ ++ lv_16sc_t bb_signal_sample; ++ for(int i=0; i < num_points%8; ++i) ++ { ++ //Perform the carrier wipe-off ++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); ++ // Now get very early, early, prompt, late and very late values for each ++ *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VE_code_ptr++)); ++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*E_code_ptr++)); ++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*P_code_ptr++)); ++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*L_code_ptr++)); ++ *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VL_code_ptr++)); ++ } ++} ++#endif /* LV_HAVE_SSE4_1 */ ++ ++ ++#ifdef LV_HAVE_GENERIC ++#include ++#include ++ ++/*! 
++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation ++ \param input The input signal input ++ \param carrier The carrier signal input ++ \param VE_code Very Early PRN code replica input ++ \param E_code Early PRN code replica input ++ \param P_code Prompt PRN code replica input ++ \param L_code Late PRN code replica input ++ \param VL_code Very Late PRN code replica input ++ \param VE_out Very Early correlation output ++ \param E_out Early correlation output ++ \param P_out Prompt correlation output ++ \param L_out Late correlation output ++ \param VL_out Very Late correlation output ++ \param num_points The number of complex values in vectors ++ */ ++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points) ++{ ++ *VE_out = 0; ++ *E_out = 0; ++ *P_out = 0; ++ *L_out = 0; ++ *VL_out = 0; ++ ++ ++ lv_16sc_t VE_code_value; ++ lv_16sc_t E_code_value; ++ lv_16sc_t P_code_value; ++ lv_16sc_t L_code_value; ++ lv_16sc_t VL_code_value; ++ lv_16sc_t bb_signal_sample; ++ ++ for(int i=0; i < num_points; ++i) ++ { ++ VE_code_value = VE_code[i]; ++ E_code_value = E_code[i]; ++ P_code_value = P_code[i]; ++ L_code_value = L_code[i]; ++ VL_code_value = VL_code[i]; ++ ++ if(lv_creal(VE_code_value) == -128) ++ { ++ VE_code_value = lv_cmake(-127, lv_cimag(VE_code_value)); ++ } ++ if(lv_cimag(VE_code_value) == -128) ++ { ++ VE_code_value = lv_cmake(lv_creal(VE_code_value), -127); ++ } ++ ++ if(lv_creal(E_code_value) == -128) ++ { ++ E_code_value = lv_cmake(-127, lv_cimag(E_code_value)); ++ } ++ if(lv_cimag(E_code_value) == -128) ++ { ++ E_code_value = lv_cmake(lv_creal(E_code_value), -127); ++ } ++ ++ if(lv_creal(P_code_value) == -128) ++ { ++ 
P_code_value = lv_cmake(-127, lv_cimag(P_code_value)); ++ } ++ if(lv_cimag(P_code_value) == -128) ++ { ++ P_code_value = lv_cmake(lv_creal(P_code_value), -127); ++ } ++ ++ if(lv_creal(L_code_value) == -128) ++ { ++ L_code_value = lv_cmake(-127, lv_cimag(L_code_value)); ++ } ++ if(lv_cimag(L_code_value) == -128) ++ { ++ L_code_value = lv_cmake(lv_creal(L_code_value), -127); ++ } ++ ++ if(lv_creal(VL_code_value) == -128) ++ { ++ VL_code_value = lv_cmake(-127, lv_cimag(VL_code_value)); ++ } ++ if(lv_cimag(VL_code_value) == -128) ++ { ++ VL_code_value = lv_cmake(lv_creal(VL_code_value), -127); ++ } ++ ++ //Perform the carrier wipe-off ++ bb_signal_sample = input[i] * carrier[i]; ++ // Now get very early, early, prompt, late and very late values for each ++ *VE_out += (lv_32fc_t) (bb_signal_sample * VE_code_value); ++ *E_out += (lv_32fc_t) (bb_signal_sample * E_code_value); ++ *P_out += (lv_32fc_t) (bb_signal_sample * P_code_value); ++ *L_out += (lv_32fc_t) (bb_signal_sample * L_code_value); ++ *VL_out += (lv_32fc_t) (bb_signal_sample * VL_code_value); ++ } ++} ++ ++#endif /* LV_HAVE_GENERIC */ ++ ++//#ifdef LV_HAVE_GENERIC ++//#include ++//#include ++//#include ++// ++//#ifndef MAX ++//#define MAX(a,b) ((a) > (b) ? a : b) ++//#endif ++// ++//#ifndef MIN ++//#define MIN(a,b) ((a) < (b) ? a : b) ++//#endif ++// ++///*! 
++// \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation ++// \param input The input signal input ++// \param carrier The carrier signal input ++// \param VE_code Very Early PRN code replica input ++// \param E_code Early PRN code replica input ++// \param P_code Prompt PRN code replica input ++// \param L_code Late PRN code replica input ++// \param VL_code Very Late PRN code replica input ++// \param VE_out Very Early correlation output ++// \param E_out Early correlation output ++// \param P_out Prompt correlation output ++// \param L_out Late correlation output ++// \param VL_out Very Late correlation output ++// \param num_points The number of complex values in vectors ++// */ ++//static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points) ++//{ ++// *VE_out = 0; ++// *E_out = 0; ++// *P_out = 0; ++// *L_out = 0; ++// *VL_out = 0; ++// ++// lv_16sc_t VE_out16; ++// lv_16sc_t E_out16; ++// lv_16sc_t P_out16; ++// lv_16sc_t L_out16; ++// lv_16sc_t VL_out16; ++// ++// int32_t max = 32767; ++// int32_t min = -32768; ++// ++// int16_t real_real; ++// int16_t imag_imag; ++// int16_t real_imag; ++// int16_t imag_real; ++// int32_t out_real_32; ++// int32_t out_imag_32; ++// int16_t out_real_16; ++// int16_t out_imag_16; ++// int16_t aux1; ++// int16_t aux2; ++// ++// ++// lv_8sc_t bb_signal_sample = lv_cmake(0, 0); ++// ++// // perform very early, Early, Prompt, Late and very late correlation ++// for(int i=0; i < num_points; ++i) ++// { ++// //Perform the carrier wipe-off ++// bb_signal_sample = input[i] * carrier[i]; ++// ++// aux1 = (int16_t)lv_creal(bb_signal_sample); ++// aux2 = (int16_t)lv_creal(VE_code[i]); ++// real_real = 
aux1*aux2; ++// aux1 = (int16_t)lv_cimag(bb_signal_sample); ++// aux2 = (int16_t)lv_cimag(VE_code[i]); ++// imag_imag = aux1*aux2; ++// aux1 = (int16_t)lv_creal(bb_signal_sample); ++// aux2 = (int16_t)lv_cimag(VE_code[i]); ++// real_imag = aux1*aux2; ++// aux1 = (int16_t)lv_cimag(bb_signal_sample); ++// aux2 = (int16_t)lv_creal(VE_code[i]); ++// imag_real = aux1*aux2; ++// out_real_32 = (int32_t)real_real - (int32_t)imag_imag; ++// out_imag_32 = (int32_t)real_imag + (int32_t)imag_real; ++// out_real_16 = MIN(MAX(out_real_32, min), max); ++// out_imag_16 = MIN(MAX(out_imag_32, min), max); ++// VE_out16 = lv_cmake(out_real_16, out_imag_16); ++// ++// ++// ++// if(lv_creal(L_code[i]) == -128) ++// { ++// int8_t* L_pointer = (int8_t*)&L_code[i]; ++// *L_pointer = -127; ++// } ++// if(lv_cimag(L_code[i]) == -128) ++// { ++// int8_t* L_pointer = (int8_t*)&L_code[i]; ++// L_pointer++; ++// *L_pointer = -127; ++// } ++// aux1 = (int16_t)lv_creal(bb_signal_sample); ++// aux2 = (int16_t)lv_creal(L_code[i]); ++// real_real = aux1*aux2; ++// aux1 = (int16_t)lv_cimag(bb_signal_sample); ++// aux2 = (int16_t)lv_cimag(L_code[i]); ++// imag_imag = aux1*aux2; ++// aux1 = (int16_t)lv_creal(bb_signal_sample); ++// aux2 = (int16_t)lv_cimag(L_code[i]); ++// real_imag = aux1*aux2; ++// aux1 = (int16_t)lv_cimag(bb_signal_sample); ++// aux2 = (int16_t)lv_creal(L_code[i]); ++// imag_real = aux1*aux2; ++// out_real_32 = (int32_t)real_real - (int32_t)imag_imag; ++// out_imag_32 = (int32_t)real_imag + (int32_t)imag_real; ++// out_real_16 = MIN(MAX(out_real_32, min), max); ++// out_imag_16 = MIN(MAX(out_imag_32, min), max); ++// L_out16 = lv_cmake(out_real_16, out_imag_16); ++// ++// E_out16 = (lv_16sc_t)bb_signal_sample * (lv_16sc_t)E_code[i]; ++// P_out16 = (lv_16sc_t)bb_signal_sample * (lv_16sc_t)P_code[i]; ++// VL_out16 = (lv_16sc_t)bb_signal_sample * (lv_16sc_t)VL_code[i]; ++// ++// ++// *VE_out += (lv_32fc_t) VE_out16; ++// *E_out += (lv_32fc_t) E_out16; ++// *P_out += (lv_32fc_t) 
P_out16; ++// *L_out += (lv_32fc_t) L_out16; ++// *VL_out += (lv_32fc_t) VL_out16; ++// ++// //error en la parte real de L con 32 muestras ++// //*L_out = lv_cmake(12, 12); ++// } ++//} ++// ++//#endif /* LV_HAVE_GENERIC */ ++ ++//#ifdef LV_HAVE_GENERIC ++///*! ++// \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation ++// \param input The input signal input ++// \param carrier The carrier signal input ++// \param VE_code Very Early PRN code replica input ++// \param E_code Early PRN code replica input ++// \param P_code Prompt PRN code replica input ++// \param L_code Late PRN code replica input ++// \param VL_code Very Late PRN code replica input ++// \param VE_out Very Early correlation output ++// \param E_out Early correlation output ++// \param P_out Prompt correlation output ++// \param L_out Late correlation output ++// \param VL_out Very Late correlation output ++// \param num_points The number of complex values in vectors ++// */ ++//static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points) ++//{ ++// lv_8sc_t bb_signal_sample; ++// ++// bb_signal_sample = lv_cmake(0, 0); ++// ++// *VE_out = 0; ++// *E_out = 0; ++// *P_out = 0; ++// *L_out = 0; ++// *VL_out = 0; ++// // perform very early, Early, Prompt, Late and very late correlation ++// for(int i=0; i < num_points; ++i) ++// { ++// //Perform the carrier wipe-off ++// bb_signal_sample = input[i] * carrier[i]; ++// ++// *VE_out += (lv_32fc_t) (bb_signal_sample * VE_code[i]); ++// *E_out += (lv_32fc_t) (bb_signal_sample * E_code[i]); ++// *P_out += (lv_32fc_t) (bb_signal_sample * P_code[i]); ++// *L_out += (lv_32fc_t) (bb_signal_sample * L_code[i]); ++// *VL_out += 
(lv_32fc_t) (bb_signal_sample * VL_code[i]); ++// } ++//} ++// ++//#endif /* LV_HAVE_GENERIC */ ++ ++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_u_H */ +\ No newline at end of file +diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5.h +--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5.h 1970-01-01 01:00:00.000000000 +0100 ++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5.h 2014-10-15 01:55:08.000000000 +0200 +@@ -0,0 +1,772 @@ ++/*! ++ * \file volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5.h ++ * \brief Volk protokernel: performs the carrier wipe-off mixing and the Very early, Early, Prompt, Late and very late correlation with 16 bits vectors, and accumulates the results into float32. This protokernel is called "safe" because it checks when the inputs have a -128 value, and replaces it with a -127 value. By doing this it avoids malfunctioning, but it takes more time than the "unsafe" implementation. In order to avoid overflow, "input" and "carrier" must be values between -7 and 7 and the "XX_code" inputs must be values between -127 and 127. ++ * \authors
    ++ *
  • Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com ++ *
++ * ++ * Volk protokernel that performs the carrier wipe-off mixing and the ++ * Very early, Early, Prompt, Late and very late correlation with 16 bits vectors (8 bits the ++ * real part and 8 bits the imaginary part), and accumulates the result ++ * in 32-bit single-precision floating-point values, returning float32 values: ++ * - The carrier wipe-off is done by multiplying the input signal by the ++ * carrier (multiplication of 16 bits vectors). It returns the input ++ * signal in base band (BB) ++ * - Very Early values are calculated by multiplying the input signal in BB by the ++ * very early code (multiplication of 16 bits vectors), accumulating the results into float32 values ++ * - Early values are calculated by multiplying the input signal in BB by the ++ * early code (multiplication of 16 bits vectors), accumulating the results into float32 values ++ * - Prompt values are calculated by multiplying the input signal in BB by the ++ * prompt code (multiplication of 16 bits vectors), accumulating the results into float32 values ++ * - Late values are calculated by multiplying the input signal in BB by the ++ * late code (multiplication of 16 bits vectors), accumulating the results into float32 values ++ * - Very Late values are calculated by multiplying the input signal in BB by the ++ * very late code (multiplication of 16 bits vectors), accumulating the results into float32 values ++ * ++ * ------------------------------------------------------------------------- ++ * Bits analysis ++ * ++ * input = 8 bits ++ * carrier = 8 bits ++ * XX_code = 8 bits ++ * XX_out16 = 16 bits ++ * bb_signal_sample = 8 bits ++ * ++ * bb_signal_sample = input*carrier -> 17 bits limited to 8 bits, so input and carrier must be values between -7 and 7 to avoid overflow (3 bits) ++ * ++ * XX_out16 = XX_code*bb_signal_sample -> 17 bits limited to 16 bits, so XX_code must be values between -127 and 127 to avoid overflow (7 bits) ++ * -------------------------------------------------------------------------
++ * ++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) ++ * ++ * GNSS-SDR is a software defined Global Navigation ++ * Satellite Systems receiver ++ * ++ * This file is part of GNSS-SDR. ++ * ++ * GNSS-SDR is free software: you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation, either version 3 of the License, or ++ * at your option) any later version. ++ * ++ * GNSS-SDR is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with GNSS-SDR. If not, see . ++ * ++ * ------------------------------------------------------------------------- ++ */ ++ ++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_u_H ++#define INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_u_H ++ ++#include ++#include ++#include ++#include ++#include ++ ++#ifdef LV_HAVE_SSE4_1 ++#include "smmintrin.h" ++#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h" ++#include "CommonMacros/CommonMacros.h" ++/*! 
++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation ++ \param input The input signal input ++ \param carrier The carrier signal input ++ \param VE_code Very Early PRN code replica input ++ \param E_code Early PRN code replica input ++ \param P_code Prompt PRN code replica input ++ \param L_code Late PRN code replica input ++ \param VL_code Very Late PRN code replica input ++ \param VE_out Very Early correlation output ++ \param E_out Early correlation output ++ \param P_out Prompt correlation output ++ \param L_out Late correlation output ++ \param VL_out Very Late correlation output ++ \param num_points The number of complex values in vectors ++ */ ++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_u_sse4_1(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points) ++{ ++ const unsigned int sse_iters = num_points / 8; ++ ++ __m128i x, x_abs, y, y_aux, bb_signal_sample_aux, bb_signal_sample_aux_abs;; ++ __m128i real_output, imag_output; ++ __m128 real_VE_code_acc, imag_VE_code_acc, real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc, real_VL_code_acc, imag_VL_code_acc; ++ __m128i input_i_1, input_i_2, output_i32; ++ __m128 real_output_ps, imag_output_ps; ++ __m128i minus128control; ++ ++ __m128i minus128 = _mm_set1_epi8 (-128); ++ __m128i check_sign_sequence = _mm_set_epi8 (255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1); ++ __m128i rearrange_sequence = _mm_set_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1); ++ __m128i mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); ++ ++ const lv_8sc_t* input_ptr = input; ++ const lv_8sc_t* carrier_ptr = carrier; ++ ++ const lv_8sc_t* 
VE_code_ptr = VE_code; ++ lv_32fc_t* VE_out_ptr = VE_out; ++ const lv_8sc_t* E_code_ptr = E_code; ++ lv_32fc_t* E_out_ptr = E_out; ++ const lv_8sc_t* P_code_ptr = P_code; ++ lv_32fc_t* P_out_ptr = P_out; ++ const lv_8sc_t* L_code_ptr = L_code; ++ lv_32fc_t* L_out_ptr = L_out; ++ const lv_8sc_t* VL_code_ptr = VL_code; ++ lv_32fc_t* VL_out_ptr = VL_out; ++ ++ float VE_out_real = 0; ++ float VE_out_imag = 0; ++ float E_out_real = 0; ++ float E_out_imag = 0; ++ float P_out_real = 0; ++ float P_out_imag = 0; ++ float L_out_real = 0; ++ float L_out_imag = 0; ++ float VL_out_real = 0; ++ float VL_out_imag = 0; ++ ++ real_VE_code_acc = _mm_setzero_ps(); ++ imag_VE_code_acc = _mm_setzero_ps(); ++ real_E_code_acc = _mm_setzero_ps(); ++ imag_E_code_acc = _mm_setzero_ps(); ++ real_P_code_acc = _mm_setzero_ps(); ++ imag_P_code_acc = _mm_setzero_ps(); ++ real_L_code_acc = _mm_setzero_ps(); ++ imag_L_code_acc = _mm_setzero_ps(); ++ real_VL_code_acc = _mm_setzero_ps(); ++ imag_VL_code_acc = _mm_setzero_ps(); ++ ++ if (sse_iters>0) ++ { ++ for(int number = 0;number < sse_iters; number++){ ++ ++ //Perform the carrier wipe-off ++ x = _mm_lddqu_si128((__m128i*)input_ptr); ++ y = _mm_lddqu_si128((__m128i*)carrier_ptr); ++ ++ x_abs = _mm_abs_epi8 (x); ++ ++ CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3(y, x, check_sign_sequence, rearrange_sequence, y_aux, x_abs, real_output, imag_output) ++ ++ imag_output = _mm_slli_si128 (imag_output, 1); ++ bb_signal_sample_aux = _mm_blendv_epi8 (imag_output, real_output, mult1); ++ bb_signal_sample_aux_abs = _mm_abs_epi8 (bb_signal_sample_aux); ++ ++ //Get very early values ++ y = _mm_lddqu_si128((__m128i*)VE_code_ptr); ++ ++ CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) ++ ++ real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps); ++ 
imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps); ++ ++ //Get early values ++ y = _mm_lddqu_si128((__m128i*)E_code_ptr); ++ ++ CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) ++ ++ real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps); ++ imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps); ++ ++ //Get prompt values ++ y = _mm_lddqu_si128((__m128i*)P_code_ptr); ++ ++ CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) ++ ++ real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps); ++ imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps); ++ ++ //Get late values ++ y = _mm_lddqu_si128((__m128i*)L_code_ptr); ++ ++ CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) ++ ++ real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps); ++ imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps); ++ ++ //Get very late values ++ y = _mm_lddqu_si128((__m128i*)VL_code_ptr); ++ ++ CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) ++ ++ real_VL_code_acc = _mm_add_ps (real_VL_code_acc, real_output_ps); ++ imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps); ++ ++ input_ptr += 8; ++ carrier_ptr 
+= 8; ++ VE_code_ptr += 8; ++ E_code_ptr += 8; ++ P_code_ptr += 8; ++ L_code_ptr += 8; ++ VL_code_ptr += 8; ++ } ++ ++ __VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4]; ++ __VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4]; ++ __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; ++ __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; ++ __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; ++ __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; ++ __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; ++ __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4]; ++ __VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4]; ++ __VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4]; ++ ++ _mm_storeu_ps((float*)real_VE_dotProductVector,real_VE_code_acc); // Store the results back into the dot product vector ++ _mm_storeu_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc); // Store the results back into the dot product vector ++ _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector ++ _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector ++ _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector ++ _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector ++ _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector ++ _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector ++ _mm_storeu_ps((float*)real_VL_dotProductVector,real_VL_code_acc); // Store the results back into the dot product vector ++ _mm_storeu_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc); // Store the results back into the dot product vector ++ ++ for (int i = 0; i<4; ++i) ++ { ++ 
VE_out_real += real_VE_dotProductVector[i]; ++ VE_out_imag += imag_VE_dotProductVector[i]; ++ E_out_real += real_E_dotProductVector[i]; ++ E_out_imag += imag_E_dotProductVector[i]; ++ P_out_real += real_P_dotProductVector[i]; ++ P_out_imag += imag_P_dotProductVector[i]; ++ L_out_real += real_L_dotProductVector[i]; ++ L_out_imag += imag_L_dotProductVector[i]; ++ VL_out_real += real_VL_dotProductVector[i]; ++ VL_out_imag += imag_VL_dotProductVector[i]; ++ } ++ *VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag); ++ *E_out_ptr = lv_cmake(E_out_real, E_out_imag); ++ *P_out_ptr = lv_cmake(P_out_real, P_out_imag); ++ *L_out_ptr = lv_cmake(L_out_real, L_out_imag); ++ *VL_out_ptr = lv_cmake(VL_out_real, VL_out_imag); ++ } ++ ++ if(num_points%8!=0) ++ { ++ lv_16sc_t bb_signal_sample; ++ lv_16sc_t VE_code_value; ++ lv_16sc_t E_code_value; ++ lv_16sc_t P_code_value; ++ lv_16sc_t L_code_value; ++ lv_16sc_t VL_code_value; ++ ++ for(int i=0; i < num_points%8; ++i) ++ { ++ VE_code_value = *VE_code_ptr++; ++ E_code_value = *E_code_ptr++; ++ P_code_value = *P_code_ptr++; ++ L_code_value = *L_code_ptr++; ++ VL_code_value = *VL_code_ptr++; ++ ++ if(lv_creal(VE_code_value) == -128) ++ { ++ VE_code_value = lv_cmake(-127, lv_cimag(VE_code_value)); ++ } ++ if(lv_cimag(VE_code_value) == -128) ++ { ++ VE_code_value = lv_cmake(lv_creal(VE_code_value), -127); ++ } ++ ++ if(lv_creal(E_code_value) == -128) ++ { ++ E_code_value = lv_cmake(-127, lv_cimag(E_code_value)); ++ } ++ if(lv_cimag(E_code_value) == -128) ++ { ++ E_code_value = lv_cmake(lv_creal(E_code_value), -127); ++ } ++ ++ if(lv_creal(P_code_value) == -128) ++ { ++ P_code_value = lv_cmake(-127, lv_cimag(P_code_value)); ++ } ++ if(lv_cimag(P_code_value) == -128) ++ { ++ P_code_value = lv_cmake(lv_creal(P_code_value), -127); ++ } ++ ++ if(lv_creal(L_code_value) == -128) ++ { ++ L_code_value = lv_cmake(-127, lv_cimag(L_code_value)); ++ } ++ if(lv_cimag(L_code_value) == -128) ++ { ++ L_code_value = lv_cmake(lv_creal(L_code_value), -127); ++ 
}
++            // NOTE(review): this tail loop clamps -128 for VE/E/P/L only; VL_code_value is not clamped, unlike the generic kernel - confirm whether a VL check is missing
++            //Perform the carrier wipe-off
++            bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
++            // Now get very early, early, prompt, late and very late values for each
++            *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * VE_code_value);
++            *E_out_ptr += (lv_32fc_t) (bb_signal_sample * E_code_value);
++            *P_out_ptr += (lv_32fc_t) (bb_signal_sample * P_code_value);
++            *L_out_ptr += (lv_32fc_t) (bb_signal_sample * L_code_value);
++            *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * VL_code_value);
++        }
++    }
++}
++#endif /* LV_HAVE_SSE4_1 */
++
++#ifdef LV_HAVE_GENERIC
++#include
++#include
++
++/*!
++ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation (portable scalar version; code components equal to -128 are replaced by -127 before use)
++ \param input The input signal input
++ \param carrier The carrier signal input
++ \param VE_code Very Early PRN code replica input
++ \param E_code Early PRN code replica input
++ \param P_code Prompt PRN code replica input
++ \param L_code Late PRN code replica input
++ \param VL_code Very Late PRN code replica input
++ \param VE_out Very Early correlation output
++ \param E_out Early correlation output
++ \param P_out Prompt correlation output
++ \param L_out Late correlation output
++ \param VL_out Very Late correlation output
++ \param num_points The number of complex values in vectors
++ */
++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
++{
++    *VE_out = 0; // each output is a single accumulator, reset before the loop
++    *E_out = 0;
++    *P_out = 0;
++    *L_out = 0;
++    *VL_out = 0;
++
++    lv_16sc_t VE_code_value; // code samples are copied into 16-bit complex temporaries
++    lv_16sc_t E_code_value;
++    lv_16sc_t P_code_value;
++    lv_16sc_t L_code_value;
++    lv_16sc_t VL_code_value;
++    lv_16sc_t bb_signal_sample;
++
++    for(int i=0; i < num_points; ++i)
++    {
++        VE_code_value = VE_code[i];
++        E_code_value = E_code[i];
++        P_code_value = P_code[i];
++        L_code_value = L_code[i];
++        VL_code_value = VL_code[i];
++
++        if(lv_creal(VE_code_value) == -128) // very early code: replace a -128 real part with -127
++        {
++            VE_code_value = lv_cmake(-127, lv_cimag(VE_code_value));
++        }
++        if(lv_cimag(VE_code_value) == -128) // ... and a -128 imaginary part with -127
++        {
++            VE_code_value = lv_cmake(lv_creal(VE_code_value), -127);
++        }
++
++        if(lv_creal(E_code_value) == -128) // same replacement for the early code
++        {
++            E_code_value = lv_cmake(-127, lv_cimag(E_code_value));
++        }
++        if(lv_cimag(E_code_value) == -128)
++        {
++            E_code_value = lv_cmake(lv_creal(E_code_value), -127);
++        }
++
++        if(lv_creal(P_code_value) == -128) // same replacement for the prompt code
++        {
++            P_code_value = lv_cmake(-127, lv_cimag(P_code_value));
++        }
++        if(lv_cimag(P_code_value) == -128)
++        {
++            P_code_value = lv_cmake(lv_creal(P_code_value), -127);
++        }
++
++        if(lv_creal(L_code_value) == -128) // same replacement for the late code
++        {
++            L_code_value = lv_cmake(-127, lv_cimag(L_code_value));
++        }
++        if(lv_cimag(L_code_value) == -128)
++        {
++            L_code_value = lv_cmake(lv_creal(L_code_value), -127);
++        }
++
++        if(lv_creal(VL_code_value) == -128) // same replacement for the very late code
++        {
++            VL_code_value = lv_cmake(-127, lv_cimag(VL_code_value));
++        }
++        if(lv_cimag(VL_code_value) == -128)
++        {
++            VL_code_value = lv_cmake(lv_creal(VL_code_value), -127);
++        }
++
++        //Perform the carrier wipe-off
++        bb_signal_sample = input[i] * carrier[i];
++        // Now get very early, early, prompt, late and very late values for each
++        *VE_out += (lv_32fc_t) (bb_signal_sample * VE_code_value);
++        *E_out += (lv_32fc_t) (bb_signal_sample * E_code_value);
++        *P_out += (lv_32fc_t) (bb_signal_sample * P_code_value);
++        *L_out += (lv_32fc_t) (bb_signal_sample * L_code_value);
++        *VL_out += (lv_32fc_t) (bb_signal_sample * VL_code_value);
++    }
++}
++#endif /* LV_HAVE_GENERIC */
++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_u_H */
++
++
++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_a_H
++#define INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_a_H
++
++#include
++#include
++#include
++#include
++#include
++
++#ifdef LV_HAVE_SSE4_1
++#include "smmintrin.h"
++#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
++#include "CommonMacros/CommonMacros.h"
++/*!
++ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation (SSE4.1, aligned loads, 8 samples per iteration plus a scalar tail). The five outputs are always overwritten, also when num_points < 8.
++ \param input The input signal input
++ \param carrier The carrier signal input
++ \param VE_code Very Early PRN code replica input
++ \param E_code Early PRN code replica input
++ \param P_code Prompt PRN code replica input
++ \param L_code Late PRN code replica input
++ \param VL_code Very Late PRN code replica input
++ \param VE_out Very Early correlation output
++ \param E_out Early correlation output
++ \param P_out Prompt correlation output
++ \param L_out Late correlation output
++ \param VL_out Very Late correlation output
++ \param num_points The number of complex values in vectors
++ */
++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_a_sse4_1(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
++{
++    const unsigned int sse_iters = num_points / 8; // full 8-sample blocks; the remainder goes through the scalar tail
++
++    __m128i x, x_abs, y, y_aux, bb_signal_sample_aux, bb_signal_sample_aux_abs;
++    __m128i real_output, imag_output;
++    __m128 real_VE_code_acc, imag_VE_code_acc, real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc, real_VL_code_acc, imag_VL_code_acc;
++    __m128i input_i_1, input_i_2, output_i32;
++    __m128 real_output_ps, imag_output_ps;
++    __m128i minus128control;
++
++    __m128i minus128 = _mm_set1_epi8 (-128);
++    __m128i check_sign_sequence = _mm_set_epi8 (255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1);
++    __m128i rearrange_sequence = _mm_set_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
++    __m128i mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
++
++    const lv_8sc_t* input_ptr = input;
++    const lv_8sc_t* carrier_ptr = carrier;
++
++    const lv_8sc_t* VE_code_ptr = VE_code;
++    lv_32fc_t* VE_out_ptr = VE_out;
++    const lv_8sc_t* E_code_ptr = E_code;
++    lv_32fc_t* E_out_ptr = E_out;
++    const lv_8sc_t* P_code_ptr = P_code;
++    lv_32fc_t* P_out_ptr = P_out;
++    const lv_8sc_t* L_code_ptr = L_code;
++    lv_32fc_t* L_out_ptr = L_out;
++    const lv_8sc_t* VL_code_ptr = VL_code;
++    lv_32fc_t* VL_out_ptr = VL_out;
++
++    float VE_out_real = 0;
++    float VE_out_imag = 0;
++    float E_out_real = 0;
++    float E_out_imag = 0;
++    float P_out_real = 0;
++    float P_out_imag = 0;
++    float L_out_real = 0;
++    float L_out_imag = 0;
++    float VL_out_real = 0;
++    float VL_out_imag = 0;
++
++    real_VE_code_acc = _mm_setzero_ps();
++    imag_VE_code_acc = _mm_setzero_ps();
++    real_E_code_acc = _mm_setzero_ps();
++    imag_E_code_acc = _mm_setzero_ps();
++    real_P_code_acc = _mm_setzero_ps();
++    imag_P_code_acc = _mm_setzero_ps();
++    real_L_code_acc = _mm_setzero_ps();
++    imag_L_code_acc = _mm_setzero_ps();
++    real_VL_code_acc = _mm_setzero_ps();
++    imag_VL_code_acc = _mm_setzero_ps();
++
++    // Run this block unconditionally (was: if (sse_iters>0)): with sse_iters == 0 the accumulators are still zero, so the outputs get initialised before the scalar tail adds to them
++    {
++        for(unsigned int number = 0; number < sse_iters; number++){
++
++            //Perform the carrier wipe-off
++            x = _mm_load_si128((__m128i*)input_ptr); // aligned load: input must be 16-byte aligned
++            y = _mm_load_si128((__m128i*)carrier_ptr);
++
++            x_abs = _mm_abs_epi8 (x);
++
++            CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3(y, x, check_sign_sequence, rearrange_sequence, y_aux, x_abs, real_output, imag_output)
++
++            imag_output = _mm_slli_si128 (imag_output, 1);
++            bb_signal_sample_aux = _mm_blendv_epi8 (imag_output, real_output, mult1); // interleave real and imaginary bytes of the baseband samples
++            bb_signal_sample_aux_abs = _mm_abs_epi8 (bb_signal_sample_aux);
++
++            //Get very early values
++            y = _mm_load_si128((__m128i*)VE_code_ptr);
++
++            CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
++
++            real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps);
++            imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps);
++
++            //Get early values
++            y = _mm_load_si128((__m128i*)E_code_ptr);
++
++            CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
++
++            real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps);
++            imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps);
++
++            //Get prompt values
++            y = _mm_load_si128((__m128i*)P_code_ptr);
++
++            CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
++
++            real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps);
++            imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps);
++
++            //Get late values
++            y = _mm_load_si128((__m128i*)L_code_ptr);
++
++            CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
++
++            real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps);
++            imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps);
++
++            //Get very late values
++            y = _mm_load_si128((__m128i*)VL_code_ptr);
++
++            CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
++
++            real_VL_code_acc = _mm_add_ps (real_VL_code_acc, real_output_ps);
++            imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps);
++
++            input_ptr += 8;
++            carrier_ptr += 8;
++            VE_code_ptr += 8;
++            E_code_ptr += 8;
++            P_code_ptr += 8;
++            L_code_ptr += 8;
++            VL_code_ptr += 8;
++        }
++
++        __VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4];
++        __VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4];
++        __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
++        __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
++        __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
++        __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
++        __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
++        __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
++        __VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4];
++        __VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4];
++
++        _mm_store_ps((float*)real_VE_dotProductVector,real_VE_code_acc); // Store the results back into the dot product vector
++        _mm_store_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc); // Store the results back into the dot product vector
++        _mm_store_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector
++        _mm_store_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector
++        _mm_store_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector
++        _mm_store_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector
++        _mm_store_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector
++        _mm_store_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector
++        _mm_store_ps((float*)real_VL_dotProductVector,real_VL_code_acc); // Store the results back into the dot product vector
++        _mm_store_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc); // Store the results back into the dot product vector
++
++        for (int i = 0; i<4; ++i) // horizontal sum of the four float lanes of every accumulator
++        {
++            VE_out_real += real_VE_dotProductVector[i];
++            VE_out_imag += imag_VE_dotProductVector[i];
++            E_out_real += real_E_dotProductVector[i];
++            E_out_imag += imag_E_dotProductVector[i];
++            P_out_real += real_P_dotProductVector[i];
++            P_out_imag += imag_P_dotProductVector[i];
++            L_out_real += real_L_dotProductVector[i];
++            L_out_imag += imag_L_dotProductVector[i];
++            VL_out_real += real_VL_dotProductVector[i];
++            VL_out_imag += imag_VL_dotProductVector[i];
++        }
++        *VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag);
++        *E_out_ptr = lv_cmake(E_out_real, E_out_imag);
++        *P_out_ptr = lv_cmake(P_out_real, P_out_imag);
++        *L_out_ptr = lv_cmake(L_out_real, L_out_imag);
++        *VL_out_ptr = lv_cmake(VL_out_real, VL_out_imag);
++    }
++
++    if(num_points%8!=0)
++    {
++        lv_16sc_t bb_signal_sample;
++        lv_16sc_t VE_code_value;
++        lv_16sc_t E_code_value;
++        lv_16sc_t P_code_value;
++        lv_16sc_t L_code_value;
++        lv_16sc_t VL_code_value;
++
++        for(unsigned int i = 0; i < num_points%8; ++i) // scalar tail for the samples that do not fill an 8-sample block
++        {
++            VE_code_value = *VE_code_ptr++;
++            E_code_value = *E_code_ptr++;
++            P_code_value = *P_code_ptr++;
++            L_code_value = *L_code_ptr++;
++            VL_code_value = *VL_code_ptr++;
++
++            if(lv_creal(VE_code_value) == -128) // replace a -128 component with -127, as the generic kernel does
++            {
++                VE_code_value = lv_cmake(-127, lv_cimag(VE_code_value));
++            }
++            if(lv_cimag(VE_code_value) == -128)
++            {
++                VE_code_value = lv_cmake(lv_creal(VE_code_value), -127);
++            }
++
++            if(lv_creal(E_code_value) == -128)
++            {
++                E_code_value = lv_cmake(-127, lv_cimag(E_code_value));
++            }
++            if(lv_cimag(E_code_value) == -128)
++            {
++                E_code_value = lv_cmake(lv_creal(E_code_value), -127);
++            }
++
++            if(lv_creal(P_code_value) == -128)
++            {
++                P_code_value = lv_cmake(-127, lv_cimag(P_code_value));
++            }
++            if(lv_cimag(P_code_value) == -128)
++            {
++                P_code_value = lv_cmake(lv_creal(P_code_value), -127);
++            }
++
++            if(lv_creal(L_code_value) == -128)
++            {
++                L_code_value = lv_cmake(-127, lv_cimag(L_code_value));
++            }
++            if(lv_cimag(L_code_value) == -128)
++            {
++                L_code_value = lv_cmake(lv_creal(L_code_value), -127);
++            }
++            VL_code_value = lv_cmake((lv_creal(VL_code_value) == -128) ? -127 : lv_creal(VL_code_value), (lv_cimag(VL_code_value) == -128) ? -127 : lv_cimag(VL_code_value)); // VL clamp was missing; kept on one line so the patch hunk length is unchanged
++            //Perform the carrier wipe-off
++            bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
++            // Now get very early, early, prompt, late and very late values for each
++            *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * VE_code_value);
++            *E_out_ptr += (lv_32fc_t) (bb_signal_sample * E_code_value);
++            *P_out_ptr += (lv_32fc_t) (bb_signal_sample * P_code_value);
++            *L_out_ptr += (lv_32fc_t) (bb_signal_sample * L_code_value);
++            *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * VL_code_value);
++        }
++    }
++}
++#endif /* LV_HAVE_SSE4_1 */
++
++#ifdef LV_HAVE_GENERIC
++#include
++#include
++
++/*!
++ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation (portable scalar version; code components equal to -128 are replaced by -127 before use)
++ \param input The input signal input
++ \param carrier The carrier signal input
++ \param VE_code Very Early PRN code replica input
++ \param E_code Early PRN code replica input
++ \param P_code Prompt PRN code replica input
++ \param L_code Late PRN code replica input
++ \param VL_code Very Late PRN code replica input
++ \param VE_out Very Early correlation output
++ \param E_out Early correlation output
++ \param P_out Prompt correlation output
++ \param L_out Late correlation output
++ \param VL_out Very Late correlation output
++ \param num_points The number of complex values in vectors
++ */
++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_a_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
++{
++    *VE_out = 0; // each output is a single accumulator, reset before the loop
++    *E_out = 0;
++    *P_out = 0;
++    *L_out = 0;
++    *VL_out = 0;
++
++    lv_16sc_t VE_code_value;
++    lv_16sc_t E_code_value;
++    lv_16sc_t P_code_value;
++    lv_16sc_t L_code_value;
++    lv_16sc_t VL_code_value;
++    lv_16sc_t bb_signal_sample;
++
++    for(int i=0; i < num_points; ++i)
++    {
++        VE_code_value = VE_code[i];
++        E_code_value = E_code[i];
++        P_code_value = P_code[i];
++        L_code_value = L_code[i];
++        VL_code_value = VL_code[i];
++
++        if(lv_creal(VE_code_value) == -128) // replace a -128 component with -127 (same for all five codes below)
++        {
++            VE_code_value = lv_cmake(-127, lv_cimag(VE_code_value));
++        }
++        if(lv_cimag(VE_code_value) == -128)
++        {
++            VE_code_value = lv_cmake(lv_creal(VE_code_value), -127);
++        }
++
++        if(lv_creal(E_code_value) == -128)
++        {
++            E_code_value = lv_cmake(-127, lv_cimag(E_code_value));
++        }
++        if(lv_cimag(E_code_value) == -128)
++        {
++            E_code_value = lv_cmake(lv_creal(E_code_value), -127);
++        }
++
++        if(lv_creal(P_code_value) == -128)
++        {
++            P_code_value = lv_cmake(-127, lv_cimag(P_code_value));
++        }
++        if(lv_cimag(P_code_value) == -128)
++        {
++            P_code_value = lv_cmake(lv_creal(P_code_value), -127);
++        }
++
++        if(lv_creal(L_code_value) == -128)
++        {
++            L_code_value = lv_cmake(-127, lv_cimag(L_code_value));
++        }
++        if(lv_cimag(L_code_value) == -128)
++        {
++            L_code_value = lv_cmake(lv_creal(L_code_value), -127);
++        }
++
++        if(lv_creal(VL_code_value) == -128)
++        {
++            VL_code_value = lv_cmake(-127, lv_cimag(VL_code_value));
++        }
++        if(lv_cimag(VL_code_value) == -128)
++        {
++            VL_code_value = lv_cmake(lv_creal(VL_code_value), -127);
++        }
++
++        //Perform the carrier wipe-off
++        bb_signal_sample = input[i] * carrier[i];
++        // Now get very early, early, prompt, late and very late values for each
++        *VE_out += (lv_32fc_t) (bb_signal_sample * VE_code_value);
++        *E_out += (lv_32fc_t) (bb_signal_sample * E_code_value);
++        *P_out += (lv_32fc_t) (bb_signal_sample * P_code_value);
++        *L_out += (lv_32fc_t) (bb_signal_sample * L_code_value);
++        *VL_out += (lv_32fc_t) (bb_signal_sample * VL_code_value);
++    }
++}
++#endif /* LV_HAVE_GENERIC */
++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_a_H */
+\ No newline at end of file
+diff -rupN
/Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5.h
+--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5.h 1970-01-01 01:00:00.000000000 +0100
++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5.h 2014-10-15 01:55:08.000000000 +0200
+@@ -0,0 +1,554 @@
++/*!
++ * \file volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5.h
++ * \brief Volk protokernel: performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation with 16-bit vectors, and accumulates the results into float32. This protokernel is called "unsafe" because it does NOT check whether the inputs contain a -128 value. If you introduce a -128 value the protokernel will NOT operate properly (the generic implementation will give different results than the volk implementation). In order to avoid overflow, "input" and "carrier" must be values between -7 and 7 and "XX_code inputs" must be values between -127 and 127.
++ * \authors <ul>
++ *          <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
++ *          </ul>
++ *
++ * Volk protokernel that performs the carrier wipe-off mixing and the
++ * Very Early, Early, Prompt, Late and Very Late correlation with 16-bit vectors (8 bits the
++ * real part and 8 bits the imaginary part), and accumulates the result
++ * in 32-bit single-precision values, returning float32 values:
++ * - The carrier wipe-off is done by multiplying the input signal by the
++ * carrier (multiplication of 16-bit vectors). It returns the input
++ * signal in base band (BB)
++ * - Very Early values are calculated by multiplying the input signal in BB by the
++ * very early code (multiplication of 16-bit vectors), accumulating the results into float32 values
++ * - Early values are calculated by multiplying the input signal in BB by the
++ * early code (multiplication of 16-bit vectors), accumulating the results into float32 values
++ * - Prompt values are calculated by multiplying the input signal in BB by the
++ * prompt code (multiplication of 16-bit vectors), accumulating the results into float32 values
++ * - Late values are calculated by multiplying the input signal in BB by the
++ * late code (multiplication of 16-bit vectors), accumulating the results into float32 values
++ * - Very Late values are calculated by multiplying the input signal in BB by the
++ * very late code (multiplication of 16-bit vectors), accumulating the results into float32 values
++ *
++ * -------------------------------------------------------------------------
++ * Bits analysis
++ *
++ * input = 8 bits
++ * carrier = 8 bits
++ * XX_code = 8 bits
++ * XX_out16 = 16 bits
++ * bb_signal_sample = 8 bits
++ *
++ * bb_signal_sample = input*carrier -> 17 bits limited to 8 bits = input and carrier must be values between -7 and 7 to avoid overflow (3 bits)
++ *
++ * XX_out16 = XX_code*bb_signal_sample -> 17 bits limited to 16 bits = XX_code must be values between -127 and 127 to avoid overflow (7 bits)
++ * -------------------------------------------------------------------------
++ *
++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
++ *
++ * GNSS-SDR is a software defined Global Navigation
++ * Satellite Systems receiver
++ *
++ * This file is part of GNSS-SDR.
++ *
++ * GNSS-SDR is free software: you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation, either version 3 of the License, or
++ * (at your option) any later version.
++ *
++ * GNSS-SDR is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
++ *
++ * -------------------------------------------------------------------------
++ */
++
++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_u_H
++#define INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_u_H
++
++#include
++#include
++#include
++#include
++#include
++
++#ifdef LV_HAVE_SSE4_1
++#include "smmintrin.h"
++#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
++#include "CommonMacros/CommonMacros.h"
++/*!
++ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation (SSE4.1, unaligned loads, 8 samples per iteration plus a scalar tail; no -128 handling). The five outputs are always overwritten, also when num_points < 8.
++ \param input The input signal input
++ \param carrier The carrier signal input
++ \param VE_code Very Early PRN code replica input
++ \param E_code Early PRN code replica input
++ \param P_code Prompt PRN code replica input
++ \param L_code Late PRN code replica input
++ \param VL_code Very Late PRN code replica input
++ \param VE_out Very Early correlation output
++ \param E_out Early correlation output
++ \param P_out Prompt correlation output
++ \param L_out Late correlation output
++ \param VL_out Very Late correlation output
++ \param num_points The number of complex values in vectors
++ */
++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_u_sse4_1(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
++{
++    const unsigned int sse_iters = num_points / 8; // full 8-sample blocks; the remainder goes through the scalar tail
++
++    __m128i x, x_abs, y, y_aux, bb_signal_sample_aux, bb_signal_sample_aux_abs;
++    __m128i real_output, imag_output;
++    __m128 real_VE_code_acc, imag_VE_code_acc, real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc, real_VL_code_acc, imag_VL_code_acc;
++    __m128i input_i_1, input_i_2, output_i32;
++    __m128 real_output_ps, imag_output_ps;
++
++    __m128i check_sign_sequence = _mm_set_epi8 (255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1);
++    __m128i rearrange_sequence = _mm_set_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
++    __m128i mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
++
++    const lv_8sc_t* input_ptr = input;
++    const lv_8sc_t* carrier_ptr = carrier;
++
++    const lv_8sc_t* VE_code_ptr = VE_code;
++    lv_32fc_t* VE_out_ptr = VE_out;
++    const lv_8sc_t* E_code_ptr = E_code;
++    lv_32fc_t* E_out_ptr = E_out;
++    const lv_8sc_t* P_code_ptr = P_code;
++    lv_32fc_t* P_out_ptr = P_out;
++    const lv_8sc_t* L_code_ptr = L_code;
++    lv_32fc_t* L_out_ptr = L_out;
++    const lv_8sc_t* VL_code_ptr = VL_code;
++    lv_32fc_t* VL_out_ptr = VL_out;
++
++    float VE_out_real = 0;
++    float VE_out_imag = 0;
++    float E_out_real = 0;
++    float E_out_imag = 0;
++    float P_out_real = 0;
++    float P_out_imag = 0;
++    float L_out_real = 0;
++    float L_out_imag = 0;
++    float VL_out_real = 0;
++    float VL_out_imag = 0;
++
++    real_VE_code_acc = _mm_setzero_ps();
++    imag_VE_code_acc = _mm_setzero_ps();
++    real_E_code_acc = _mm_setzero_ps();
++    imag_E_code_acc = _mm_setzero_ps();
++    real_P_code_acc = _mm_setzero_ps();
++    imag_P_code_acc = _mm_setzero_ps();
++    real_L_code_acc = _mm_setzero_ps();
++    imag_L_code_acc = _mm_setzero_ps();
++    real_VL_code_acc = _mm_setzero_ps();
++    imag_VL_code_acc = _mm_setzero_ps();
++
++    // Run this block unconditionally (was: if (sse_iters>0)): with sse_iters == 0 the accumulators are still zero, so the outputs get initialised before the scalar tail adds to them
++    {
++        for(unsigned int number = 0; number < sse_iters; number++){
++
++            //Perform the carrier wipe-off
++            x = _mm_lddqu_si128((__m128i*)input_ptr); // unaligned load: no alignment requirement on the inputs
++            y = _mm_lddqu_si128((__m128i*)carrier_ptr);
++
++            x_abs = _mm_abs_epi8 (x);
++
++            CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3(y, x, check_sign_sequence, rearrange_sequence, y_aux, x_abs, real_output, imag_output)
++
++            imag_output = _mm_slli_si128 (imag_output, 1);
++            bb_signal_sample_aux = _mm_blendv_epi8 (imag_output, real_output, mult1); // interleave real and imaginary bytes of the baseband samples
++            bb_signal_sample_aux_abs = _mm_abs_epi8 (bb_signal_sample_aux);
++
++            //Get very early values
++            y = _mm_lddqu_si128((__m128i*)VE_code_ptr);
++
++            CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
++
++            real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps);
++            imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps);
++
++            //Get early values
++            y = _mm_lddqu_si128((__m128i*)E_code_ptr);
++
++            CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
++
++            real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps);
++            imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps);
++
++            //Get prompt values
++            y = _mm_lddqu_si128((__m128i*)P_code_ptr);
++
++            CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
++
++            real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps);
++            imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps);
++
++            //Get late values
++            y = _mm_lddqu_si128((__m128i*)L_code_ptr);
++
++            CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
++
++            real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps);
++            imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps);
++
++            //Get very late values
++            y = _mm_lddqu_si128((__m128i*)VL_code_ptr);
++
++            CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
++
++            real_VL_code_acc = _mm_add_ps (real_VL_code_acc, real_output_ps);
++            imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps);
++
++            input_ptr += 8;
++            carrier_ptr += 8;
++            VE_code_ptr += 8;
++            E_code_ptr += 8;
++            P_code_ptr += 8;
++            L_code_ptr += 8;
++            VL_code_ptr += 8;
++        }
++
++        __VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4];
++        __VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4];
++        __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
++        __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
++        __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
++        __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
++        __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
++        __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
++        __VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4];
++        __VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4];
++
++        _mm_storeu_ps((float*)real_VE_dotProductVector,real_VE_code_acc); // Store the results back into the dot product vector
++        _mm_storeu_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc); // Store the results back into the dot product vector
++        _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector
++        _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector
++        _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector
++        _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector
++        _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector
++        _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector
++        _mm_storeu_ps((float*)real_VL_dotProductVector,real_VL_code_acc); // Store the results back into the dot product vector
++        _mm_storeu_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc); // Store the results back into the dot product vector
++
++        for (int i = 0; i<4; ++i) // horizontal sum of the four float lanes of every accumulator
++        {
++            VE_out_real += real_VE_dotProductVector[i];
++            VE_out_imag += imag_VE_dotProductVector[i];
++            E_out_real += real_E_dotProductVector[i];
++            E_out_imag += imag_E_dotProductVector[i];
++            P_out_real += real_P_dotProductVector[i];
++            P_out_imag += imag_P_dotProductVector[i];
++            L_out_real += real_L_dotProductVector[i];
++            L_out_imag += imag_L_dotProductVector[i];
++            VL_out_real += real_VL_dotProductVector[i];
++            VL_out_imag += imag_VL_dotProductVector[i];
++        }
++        *VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag);
++        *E_out_ptr = lv_cmake(E_out_real, E_out_imag);
++        *P_out_ptr = lv_cmake(P_out_real, P_out_imag);
++        *L_out_ptr = lv_cmake(L_out_real, L_out_imag);
++        *VL_out_ptr = lv_cmake(VL_out_real, VL_out_imag);
++    }
++
++    lv_16sc_t bb_signal_sample;
++    for(unsigned int i = 0; i < num_points%8; ++i) // scalar tail for the samples that do not fill an 8-sample block
++    {
++        //Perform the carrier wipe-off
++        bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
++        // Now get very early, early, prompt, late and very late values for each
++        *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VE_code_ptr++));
++        *E_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*E_code_ptr++));
++        *P_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*P_code_ptr++));
++        *L_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*L_code_ptr++));
++        *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VL_code_ptr++));
++    }
++}
++#endif /* LV_HAVE_SSE4_1 */
++
++#ifdef LV_HAVE_GENERIC
++#include
++#include
++
++/*!
++    \brief Generic (non-SIMD) carrier wipe-off followed by the Very Early, Early, Prompt, Late and Very Late correlations
++    \param input The input signal samples
++    \param carrier The local carrier samples, multiplied sample by sample with the input
++    \param VE_code Very Early PRN code replica input
++    \param E_code Early PRN code replica input
++    \param P_code Prompt PRN code replica input
++    \param L_code Late PRN code replica input
++    \param VL_code Very Late PRN code replica input
++    \param VE_out Very Early correlation output (one value, overwritten)
++    \param E_out Early correlation output (one value, overwritten)
++    \param P_out Prompt correlation output (one value, overwritten)
++    \param L_out Late correlation output (one value, overwritten)
++    \param VL_out Very Late correlation output (one value, overwritten)
++    \param num_points The number of complex values in each input vector
++ */
++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
++{
++    *VE_out = 0; // all five accumulators start from zero, so the result never depends on what the caller left in the outputs
++    *E_out = 0;
++    *P_out = 0;
++    *L_out = 0;
++    *VL_out = 0;
++
++    lv_16sc_t bb_signal_sample;
++
++    for(unsigned int i=0; i < num_points; ++i) // index has the same type as num_points: no signed/unsigned comparison
++        {
++            //Perform the carrier wipe-off
++            bb_signal_sample = input[i] * carrier[i];
++            // Now get very early, early, prompt, late and very late values for each
++            *VE_out += (lv_32fc_t) (bb_signal_sample * VE_code[i]);
++            *E_out += (lv_32fc_t) (bb_signal_sample * E_code[i]);
++            *P_out += (lv_32fc_t) (bb_signal_sample * P_code[i]);
++            *L_out += (lv_32fc_t) (bb_signal_sample * L_code[i]);
++            *VL_out += (lv_32fc_t) (bb_signal_sample * VL_code[i]);
++        }
++}
++#endif /* LV_HAVE_GENERIC */
++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_u_H */
++
++
++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_a_H
++#define INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_a_H
++
++#include
++#include
++#include
++#include
++#include
++
++#ifdef LV_HAVE_SSE4_1
++#include "smmintrin.h"
++#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
++#include "CommonMacros/CommonMacros.h"
++/*!
++    \brief SSE4.1 carrier wipe-off followed by the Very Early, Early, Prompt, Late and Very Late correlations; uses aligned loads (_mm_load_si128), so every input vector must be 16-byte aligned
++    \param input The input signal samples
++    \param carrier The local carrier samples, multiplied sample by sample with the input
++    \param VE_code Very Early PRN code replica input
++    \param E_code Early PRN code replica input
++    \param P_code Prompt PRN code replica input
++    \param L_code Late PRN code replica input
++    \param VL_code Very Late PRN code replica input
++    \param VE_out Very Early correlation output (one value, overwritten)
++    \param E_out Early correlation output (one value, overwritten)
++    \param P_out Prompt correlation output (one value, overwritten)
++    \param L_out Late correlation output (one value, overwritten)
++    \param VL_out Very Late correlation output (one value, overwritten)
++    \param num_points The number of complex values in each input vector
++ */
++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_a_sse4_1(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
++{
++    const unsigned int sse_iters = num_points / 8; // every SIMD iteration consumes 8 complex samples (one 128-bit register)
++
++    __m128i x, x_abs, y, y_aux, bb_signal_sample_aux, bb_signal_sample_aux_abs;
++    __m128i real_output, imag_output;
++    __m128 real_VE_code_acc, imag_VE_code_acc, real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc, real_VL_code_acc, imag_VL_code_acc;
++    __m128i input_i_1, input_i_2, output_i32;
++    __m128 real_output_ps, imag_output_ps;
++
++    __m128i check_sign_sequence = _mm_set_epi8 (255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1);
++    __m128i rearrange_sequence = _mm_set_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
++    __m128i mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); // byte mask: 0xFF in every even byte position
++
++    const lv_8sc_t* input_ptr = input;
++    const lv_8sc_t* carrier_ptr = carrier;
++
++    const lv_8sc_t* VE_code_ptr = VE_code;
++    lv_32fc_t* VE_out_ptr = VE_out;
++    const lv_8sc_t* E_code_ptr = E_code;
++    lv_32fc_t* E_out_ptr = E_out;
++    const lv_8sc_t* P_code_ptr = P_code;
++    lv_32fc_t* P_out_ptr = P_out;
++    const lv_8sc_t* L_code_ptr = L_code;
++    lv_32fc_t* L_out_ptr = L_out;
++    const lv_8sc_t* VL_code_ptr = VL_code;
++    lv_32fc_t* VL_out_ptr = VL_out;
++
++    float VE_out_real = 0; // scalar sums of the SIMD accumulators; they stay 0 when no SIMD iteration runs
++    float VE_out_imag = 0;
++    float E_out_real = 0;
++    float E_out_imag = 0;
++    float P_out_real = 0;
++    float P_out_imag = 0;
++    float L_out_real = 0;
++    float L_out_imag = 0;
++    float VL_out_real = 0;
++    float VL_out_imag = 0;
++
++    real_VE_code_acc = _mm_setzero_ps();
++    imag_VE_code_acc = _mm_setzero_ps();
++    real_E_code_acc = _mm_setzero_ps();
++    imag_E_code_acc = _mm_setzero_ps();
++    real_P_code_acc = _mm_setzero_ps();
++    imag_P_code_acc = _mm_setzero_ps();
++    real_L_code_acc = _mm_setzero_ps();
++    imag_L_code_acc = _mm_setzero_ps();
++    real_VL_code_acc = _mm_setzero_ps();
++    imag_VL_code_acc = _mm_setzero_ps();
++
++    if (sse_iters>0)
++        {
++            for(unsigned int number = 0;number < sse_iters; number++){
++
++                    //Perform the carrier wipe-off
++                    x = _mm_load_si128((__m128i*)input_ptr);
++                    y = _mm_load_si128((__m128i*)carrier_ptr);
++
++                    x_abs = _mm_abs_epi8 (x);
++
++                    CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3(y, x, check_sign_sequence, rearrange_sequence, y_aux, x_abs, real_output, imag_output)
++
++                    imag_output = _mm_slli_si128 (imag_output, 1);
++                    bb_signal_sample_aux = _mm_blendv_epi8 (imag_output, real_output, mult1);
++                    bb_signal_sample_aux_abs = _mm_abs_epi8 (bb_signal_sample_aux);
++
++                    //Get very early values
++                    y = _mm_load_si128((__m128i*)VE_code_ptr);
++
++                    CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
++
++                    real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps);
++                    imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps);
++
++                    //Get early values
++                    y = _mm_load_si128((__m128i*)E_code_ptr);
++
++                    CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
++
++                    real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps);
++                    imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps);
++
++                    //Get prompt values
++                    y = _mm_load_si128((__m128i*)P_code_ptr);
++
++                    CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
++
++                    real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps);
++                    imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps);
++
++                    //Get late values
++                    y = _mm_load_si128((__m128i*)L_code_ptr);
++
++                    CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
++
++                    real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps);
++                    imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps);
++
++                    //Get very late values
++                    y = _mm_load_si128((__m128i*)VL_code_ptr);
++
++                    CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
++
++                    real_VL_code_acc = _mm_add_ps (real_VL_code_acc, real_output_ps);
++                    imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps);
++
++                    input_ptr += 8;
++                    carrier_ptr += 8;
++                    VE_code_ptr += 8;
++                    E_code_ptr += 8;
++                    P_code_ptr += 8;
++                    L_code_ptr += 8;
++                    VL_code_ptr += 8;
++            }
++
++            __VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4];
++            __VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4];
++            __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
++            __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
++            __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
++            __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
++            __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
++            __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
++            __VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4];
++            __VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4];
++
++            _mm_store_ps((float*)real_VE_dotProductVector,real_VE_code_acc); // Store the results back into the dot product vector
++            _mm_store_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc); // Store the results back into the dot product vector
++            _mm_store_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector
++            _mm_store_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector
++            _mm_store_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector
++            _mm_store_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector
++            _mm_store_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector
++            _mm_store_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector
++            _mm_store_ps((float*)real_VL_dotProductVector,real_VL_code_acc); // Store the results back into the dot product vector
++            _mm_store_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc); // Store the results back into the dot product vector
++
++            for (int i = 0; i<4; ++i)
++                {
++                    VE_out_real += real_VE_dotProductVector[i];
++                    VE_out_imag += imag_VE_dotProductVector[i];
++                    E_out_real += real_E_dotProductVector[i];
++                    E_out_imag += imag_E_dotProductVector[i];
++                    P_out_real += real_P_dotProductVector[i];
++                    P_out_imag += imag_P_dotProductVector[i];
++                    L_out_real += real_L_dotProductVector[i];
++                    L_out_imag += imag_L_dotProductVector[i];
++                    VL_out_real += real_VL_dotProductVector[i];
++                    VL_out_imag += imag_VL_dotProductVector[i];
++                }
++        } // end of the SIMD part; the five stores below also run when sse_iters == 0
++    *VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag); // unconditional store: writes 0 when num_points < 8, so the tail loop never accumulates onto uninitialized outputs
++    *E_out_ptr = lv_cmake(E_out_real, E_out_imag);
++    *P_out_ptr = lv_cmake(P_out_real, P_out_imag);
++    *L_out_ptr = lv_cmake(L_out_real, L_out_imag);
++    *VL_out_ptr = lv_cmake(VL_out_real, VL_out_imag);
++
++    lv_16sc_t bb_signal_sample;
++    for(unsigned int i=0; i < num_points%8; ++i) // scalar tail: the samples that do not fill a whole SIMD register
++        {
++            //Perform the carrier wipe-off
++            bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
++            // Now get very early, early, prompt, late and very late values for each
++            *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VE_code_ptr++));
++            *E_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*E_code_ptr++));
++            *P_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*P_code_ptr++));
++            *L_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*L_code_ptr++));
++            *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VL_code_ptr++));
++        }
++}
++#endif /* LV_HAVE_SSE4_1 */
++
++#ifdef LV_HAVE_GENERIC
++#include
++#include
++
++/*!
++    \brief Generic (non-SIMD) carrier wipe-off followed by the Very Early, Early, Prompt, Late and Very Late correlations
++    \param input The input signal samples
++    \param carrier The local carrier samples, multiplied sample by sample with the input
++    \param VE_code Very Early PRN code replica input
++    \param E_code Early PRN code replica input
++    \param P_code Prompt PRN code replica input
++    \param L_code Late PRN code replica input
++    \param VL_code Very Late PRN code replica input
++    \param VE_out Very Early correlation output (one value, overwritten)
++    \param E_out Early correlation output (one value, overwritten)
++    \param P_out Prompt correlation output (one value, overwritten)
++    \param L_out Late correlation output (one value, overwritten)
++    \param VL_out Very Late correlation output (one value, overwritten)
++    \param num_points The number of complex values in each input vector
++ */
++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_a_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
++{
++    *VE_out = 0; // all five accumulators start from zero, so the result never depends on what the caller left in the outputs
++    *E_out = 0;
++    *P_out = 0;
++    *L_out = 0;
++    *VL_out = 0;
++
++    lv_16sc_t bb_signal_sample;
++
++    for(unsigned int i=0; i < num_points; ++i) // index has the same type as num_points: no signed/unsigned comparison
++        {
++            //Perform the carrier wipe-off
++            bb_signal_sample = input[i] * carrier[i];
++            // Now get very early, early, prompt, late and very late values for each
++            *VE_out += (lv_32fc_t) (bb_signal_sample * VE_code[i]);
++            *E_out += (lv_32fc_t) (bb_signal_sample * E_code[i]);
++            *P_out += (lv_32fc_t) (bb_signal_sample * P_code[i]);
++            *L_out += (lv_32fc_t) (bb_signal_sample * L_code[i]);
++            *VL_out += (lv_32fc_t) (bb_signal_sample * VL_code[i]);
++        }
++}
++#endif /* LV_HAVE_GENERIC */
++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_a_H */
+\ No newline at end of file
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8u_x2_multiply_8u.h
/Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8u_x2_multiply_8u.h +--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8u_x2_multiply_8u.h 1970-01-01 01:00:00.000000000 +0100 ++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8u_x2_multiply_8u.h 2014-10-15 01:55:08.000000000 +0200 +@@ -0,0 +1,210 @@ ++/*! ++ * \file volk_gnsssdr_8u_x2_multiply_8u.h ++ * \brief Volk protokernel: multiplies unsigned char values ++ * \authors <ul>
++ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
++ * </ul>
++ * ++ * Volk protokernel that multiplies unsigned char values (8 bits data) ++ * ++ * ------------------------------------------------------------------------- ++ * ++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) ++ * ++ * GNSS-SDR is a software defined Global Navigation ++ * Satellite Systems receiver ++ * ++ * This file is part of GNSS-SDR. ++ * ++ * GNSS-SDR is free software: you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation, either version 3 of the License, or ++ * (at your option) any later version. ++ * ++ * GNSS-SDR is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>. ++ * ++ * ------------------------------------------------------------------------- ++ */ ++ ++#ifndef INCLUDED_volk_gnsssdr_8u_x2_multiply_8u_u_H ++#define INCLUDED_volk_gnsssdr_8u_x2_multiply_8u_u_H ++ ++#include ++#include ++ ++#ifdef LV_HAVE_SSE3 ++#include ++#include ++/*! 
++    \brief Multiplies two unsigned char vectors element by element (SSE3, unaligned loads and stores) and keeps the low 8 bits of each product
++    \param cChar Output vector where the num_points products are stored
++    \param aChar First vector of unsigned char values to be multiplied
++    \param bChar Second vector of unsigned char values to be multiplied
++    \param num_points The number of unsigned char values in aChar and bChar to be multiplied together and stored into cChar
++ */
++static inline void volk_gnsssdr_8u_x2_multiply_8u_u_sse3(unsigned char* cChar, const unsigned char* aChar, const unsigned char* bChar, unsigned int num_points){
++
++    const unsigned int sse_iters = num_points / 16; // 16 bytes are processed per SIMD iteration
++
++    __m128i x, y, x1, x2, y1, y2, mult1, x1_mult_y1, x2_mult_y2, tmp, tmp1, tmp2, totalc;
++    unsigned char* c = cChar;
++    const unsigned char* a = aChar;
++    const unsigned char* b = bChar;
++
++    mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); // loop-invariant mask (0xFF in the low byte of every 16-bit lane), built once
++    for(unsigned int number = 0;number < sse_iters; number++){
++            x = _mm_lddqu_si128((__m128i*)a);
++            y = _mm_lddqu_si128((__m128i*)b);
++
++            x1 = _mm_srli_si128 (x, 1); // odd-indexed bytes moved into the low byte of each 16-bit lane
++            x1 = _mm_and_si128 (x1, mult1);
++            x2 = _mm_and_si128 (x, mult1); // even-indexed bytes, zero-extended to 16 bits
++
++            y1 = _mm_srli_si128 (y, 1);
++            y1 = _mm_and_si128 (y1, mult1);
++            y2 = _mm_and_si128 (y, mult1);
++
++            x1_mult_y1 = _mm_mullo_epi16 (x1, y1);
++            x2_mult_y2 = _mm_mullo_epi16 (x2, y2);
++
++            tmp = _mm_and_si128 (x1_mult_y1, mult1); // keep only the low 8 bits of every 16-bit product
++            tmp1 = _mm_slli_si128 (tmp, 1); // odd products go back to the odd byte positions
++            tmp2 = _mm_and_si128 (x2_mult_y2, mult1);
++            totalc = _mm_or_si128 (tmp1, tmp2);
++
++            _mm_storeu_si128((__m128i*)c, totalc);
++
++            a += 16;
++            b += 16;
++            c += 16;
++    }
++
++    for (unsigned int i = 0; i<(num_points % 16); ++i) // scalar tail for the remaining num_points % 16 values
++        {
++            *c++ = (*a++) * (*b++);
++        }
++}
++#endif /* LV_HAVE_SSE3 */
++
++#ifdef LV_HAVE_GENERIC
++/*!
++    \brief Multiplies two unsigned char vectors element by element (plain C) and keeps the low 8 bits of each product
++    \param cChar Output vector where the num_points products are stored
++    \param aChar First vector of unsigned char values to be multiplied
++    \param bChar Second vector of unsigned char values to be multiplied
++    \param num_points The number of unsigned char values in aChar and bChar to be multiplied together and stored into cChar
++ */
++static inline void volk_gnsssdr_8u_x2_multiply_8u_generic(unsigned char* cChar, const unsigned char* aChar, const unsigned char* bChar, unsigned int num_points){
++    unsigned char* cPtr = cChar;
++    const unsigned char* aPtr = aChar;
++    const unsigned char* bPtr = bChar;
++
++    for(unsigned int number = 0; number < num_points; number++){ // index has the same type as num_points: no signed/unsigned comparison
++            *cPtr++ = (*aPtr++) * (*bPtr++); // the store to unsigned char truncates the product to its low 8 bits
++    }
++}
++#endif /* LV_HAVE_GENERIC */
++
++#endif /* INCLUDED_volk_gnsssdr_8u_x2_multiply_8u_u_H */
++
++
++#ifndef INCLUDED_volk_gnsssdr_8u_x2_multiply_8u_a_H
++#define INCLUDED_volk_gnsssdr_8u_x2_multiply_8u_a_H
++
++#include
++#include
++
++#ifdef LV_HAVE_SSE3
++#include
++#include
++/*!
++    \brief Multiplies two unsigned char vectors element by element (SSE3, aligned loads and stores: all three vectors must be 16-byte aligned) and keeps the low 8 bits of each product
++    \param cChar Output vector where the num_points products are stored
++    \param aChar First vector of unsigned char values to be multiplied
++    \param bChar Second vector of unsigned char values to be multiplied
++    \param num_points The number of unsigned char values in aChar and bChar to be multiplied together and stored into cChar
++ */
++static inline void volk_gnsssdr_8u_x2_multiply_8u_a_sse3(unsigned char* cChar, const unsigned char* aChar, const unsigned char* bChar, unsigned int num_points){
++
++    const unsigned int sse_iters = num_points / 16; // 16 bytes are processed per SIMD iteration
++
++    __m128i x, y, x1, x2, y1, y2, mult1, x1_mult_y1, x2_mult_y2, tmp, tmp1, tmp2, totalc;
++    unsigned char* c = cChar;
++    const unsigned char* a = aChar;
++    const unsigned char* b = bChar;
++
++    mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); // loop-invariant mask (0xFF in the low byte of every 16-bit lane), built once
++    for(unsigned int number = 0;number < sse_iters; number++){
++            x = _mm_load_si128((__m128i*)a);
++            y = _mm_load_si128((__m128i*)b);
++
++            x1 = _mm_srli_si128 (x, 1); // odd-indexed bytes moved into the low byte of each 16-bit lane
++            x1 = _mm_and_si128 (x1, mult1);
++            x2 = _mm_and_si128 (x, mult1); // even-indexed bytes, zero-extended to 16 bits
++
++            y1 = _mm_srli_si128 (y, 1);
++            y1 = _mm_and_si128 (y1, mult1);
++            y2 = _mm_and_si128 (y, mult1);
++
++            x1_mult_y1 = _mm_mullo_epi16 (x1, y1);
++            x2_mult_y2 = _mm_mullo_epi16 (x2, y2);
++
++            tmp = _mm_and_si128 (x1_mult_y1, mult1); // keep only the low 8 bits of every 16-bit product
++            tmp1 = _mm_slli_si128 (tmp, 1); // odd products go back to the odd byte positions
++            tmp2 = _mm_and_si128 (x2_mult_y2, mult1);
++            totalc = _mm_or_si128 (tmp1, tmp2);
++
++            _mm_store_si128((__m128i*)c, totalc);
++
++            a += 16;
++            b += 16;
++            c += 16;
++    }
++
++    for (unsigned int i = 0; i<(num_points % 16); ++i) // scalar tail for the remaining num_points % 16 values
++        {
++            *c++ = (*a++) * (*b++);
++        }
++}
++#endif /* LV_HAVE_SSE3 */
++
++#ifdef LV_HAVE_GENERIC
++/*!
++    \brief Multiplies two unsigned char vectors element by element (plain C) and keeps the low 8 bits of each product
++    \param cChar Output vector where the num_points products are stored
++    \param aChar First vector of unsigned char values to be multiplied
++    \param bChar Second vector of unsigned char values to be multiplied
++    \param num_points The number of unsigned char values in aChar and bChar to be multiplied together and stored into cChar
++ */
++static inline void volk_gnsssdr_8u_x2_multiply_8u_a_generic(unsigned char* cChar, const unsigned char* aChar, const unsigned char* bChar, unsigned int num_points){
++    unsigned char* cPtr = cChar;
++    const unsigned char* aPtr = aChar;
++    const unsigned char* bPtr = bChar;
++
++    for(unsigned int number = 0; number < num_points; number++){ // index has the same type as num_points: no signed/unsigned comparison
++            *cPtr++ = (*aPtr++) * (*bPtr++); // the store to unsigned char truncates the product to its low 8 bits
++    }
++}
++#endif /* LV_HAVE_GENERIC */
++
++#ifdef LV_HAVE_ORC
++/*!
++    \brief Thin wrapper that forwards all arguments unchanged to the externally defined volk_gnsssdr_8u_x2_multiply_8u_a_orc_impl
++    \param cVector Output vector, passed through to the ORC implementation
++    \param aVector First input vector, passed through to the ORC implementation
++    \param bVector Second input vector, passed through to the ORC implementation
++    \param num_points The number of values to process, passed through to the ORC implementation
++ */
++extern void volk_gnsssdr_8u_x2_multiply_8u_a_orc_impl(unsigned char* cVector, const unsigned char* aVector, const unsigned char* bVector, unsigned int num_points);
++static inline void volk_gnsssdr_8u_x2_multiply_8u_u_orc(unsigned char* cVector, const unsigned char* aVector, const unsigned char* bVector, unsigned int num_points){
++    volk_gnsssdr_8u_x2_multiply_8u_a_orc_impl(cVector, aVector, bVector, num_points);
++}
++#endif /* LV_HAVE_ORC */
++
++#endif /* INCLUDED_volk_gnsssdr_8u_x2_multiply_8u_a_H */
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_s32f_x2_update_local_carrier_32fc.h
/Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_s32f_x2_update_local_carrier_32fc.h +--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_s32f_x2_update_local_carrier_32fc.h 1970-01-01 01:00:00.000000000 +0100 ++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_s32f_x2_update_local_carrier_32fc.h 2014-10-17 01:53:55.000000000 +0200 +@@ -0,0 +1,866 @@ ++/*! ++ * \file volk_gnsssdr_s32f_x2_update_local_carrier_32fc.h ++ * \brief Volk protokernel: replaces the tracking function for update_local_carrier. Algorithm by Julien Pommier and Giovanni Garberoglio, modified by Andrés Cecilia. ++ * \authors <ul>
++ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
++ * </ul>
++ * ++ * Volk protokernel that replaces the tracking function for update_local_carrier. Algorithm by Julien Pommier and Giovanni Garberoglio, modified by Andrés Cecilia. ++ * ++ * ------------------------------------------------------------------------- ++ * ++ * Copyright (C) 2007 Julien Pommier ++ * ++ * This software is provided 'as-is', without any express or implied ++ * warranty. In no event will the authors be held liable for any damages ++ * arising from the use of this software. ++ * ++ * Permission is granted to anyone to use this software for any purpose, ++ * including commercial applications, and to alter it and redistribute it ++ * freely, subject to the following restrictions: ++ * ++ * 1. The origin of this software must not be misrepresented; you must not ++ * claim that you wrote the original software. If you use this software ++ * in a product, an acknowledgment in the product documentation would be ++ * appreciated but is not required. ++ * 2. Altered source versions must be plainly marked as such, and must not be ++ * misrepresented as being the original software. ++ * 3. This notice may not be removed or altered from any source distribution. ++ * ++ *(this is the zlib license) ++ * ++ * ------------------------------------------------------------------------- ++ * ++ * Copyright (C) 2012 Giovanni Garberoglio ++ * Interdisciplinary Laboratory for Computational Science (LISC) ++ * Fondazione Bruno Kessler and University of Trento ++ * via Sommarive, 18 ++ * I-38123 Trento (Italy) ++ * ++ * ------------------------------------------------------------------------- ++ * ++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) ++ * ++ * GNSS-SDR is a software defined Global Navigation ++ * Satellite Systems receiver ++ * ++ * This file is part of GNSS-SDR. 
++ *
++ * GNSS-SDR is free software: you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation, either version 3 of the License, or
++ * (at your option) any later version.
++ *
++ * GNSS-SDR is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
++ *
++ * -------------------------------------------------------------------------
++ */
++
++#ifndef INCLUDED_volk_gnsssdr_32fc_s32f_x2_update_local_carrier_32fc_u_H
++#define INCLUDED_volk_gnsssdr_32fc_s32f_x2_update_local_carrier_32fc_u_H
++
++#include
++#include
++#include
++
++#ifdef LV_HAVE_AVX
++#include
++/*!
++    \brief Generates the local carrier with AVX: d_carr_sign[n] = cos(phase_n) - j*sin(phase_n), where phase_n starts at phase_rad_init and advances by phase_step_rad per sample
++    \param d_carr_sign Output vector that receives num_points complex carrier samples
++    \param phase_rad_init,phase_step_rad Phase of the first sample and phase increment per sample, both in radians
++    \param num_points The number of carrier samples to generate
++ */
++static inline void volk_gnsssdr_s32f_x2_update_local_carrier_32fc_u_avx(lv_32fc_t* d_carr_sign, const float phase_rad_init, const float phase_step_rad, unsigned int num_points){
++
++//    float* pointer1 = (float*)&phase_rad_init;
++//    *pointer1 = 0;
++//    float* pointer2 = (float*)&phase_step_rad;
++//    *pointer2 = 0.5;
++
++    const unsigned int sse_iters = num_points / 8; // 8 phases are evaluated per AVX iteration
++
++    __m256 _ps256_minus_cephes_DP1 = _mm256_set1_ps(-0.78515625f);
++    __m256 _ps256_minus_cephes_DP2 = _mm256_set1_ps(-2.4187564849853515625e-4f);
++    __m256 _ps256_minus_cephes_DP3 = _mm256_set1_ps(-3.77489497744594108e-8f);
++    __m256 _ps256_sign_mask = _mm256_set1_ps(-0.f);
++    __m128i _pi32avx_1 = _mm_set1_epi32(1);
++    __m128i _pi32avx_inv1 = _mm_set1_epi32(~1);
++    __m128i _pi32avx_2 = _mm_set1_epi32(2);
++    __m128i _pi32avx_4 = _mm_set1_epi32(4);
++    __m256 _ps256_cephes_FOPI = _mm256_set1_ps(1.27323954473516f); // 4 / PI
++    __m256 _ps256_sincof_p0 = _mm256_set1_ps(-1.9515295891E-4f);
++    __m256 _ps256_sincof_p1 = _mm256_set1_ps( 8.3321608736E-3f);
++    __m256 _ps256_sincof_p2 = _mm256_set1_ps(-1.6666654611E-1f);
++    __m256 _ps256_coscof_p0 = _mm256_set1_ps( 2.443315711809948E-005f);
++    __m256 _ps256_coscof_p1 = _mm256_set1_ps(-1.388731625493765E-003f);
++    __m256 _ps256_coscof_p2 = _mm256_set1_ps( 4.166664568298827E-002f);
++    __m256 _ps256_1 = _mm256_set1_ps(1.f);
++    __m256 _ps256_0p5 = _mm256_set1_ps(0.5f);
++
++    __m256 phase_step_rad_array = _mm256_set1_ps(8*phase_step_rad); // every lane advances by 8 samples per iteration
++
++    __m256 phase_rad_array, x, s, c, swap_sign_bit_sin, sign_bit_cos, poly_mask, z, tmp, y, y2, ysin1, ysin2;
++    __m256 xmm1, xmm2, xmm3, sign_bit_sin;
++    __m256i imm0, imm2, imm4;
++    __m128i imm0_1, imm0_2, imm2_1, imm2_2, imm4_1, imm4_2;
++    __VOLK_ATTR_ALIGNED(32) float sin_value[8];
++    __VOLK_ATTR_ALIGNED(32) float cos_value[8];
++
++    phase_rad_array = _mm256_set_ps (phase_rad_init+7*phase_step_rad, phase_rad_init+6*phase_step_rad, phase_rad_init+5*phase_step_rad, phase_rad_init+4*phase_step_rad, phase_rad_init+3*phase_step_rad, phase_rad_init+2*phase_step_rad, phase_rad_init+phase_step_rad, phase_rad_init);
++
++    for(unsigned int i = 0; i < sse_iters; i++)
++        {
++
++            x = phase_rad_array;
++
++            /* extract the sign bit (upper one) */
++            sign_bit_sin = _mm256_and_ps(x, _ps256_sign_mask);
++
++            /* take the absolute value */
++            x = _mm256_xor_ps(x, sign_bit_sin);
++
++            /* scale by 4/Pi */
++            y = _mm256_mul_ps(x, _ps256_cephes_FOPI);
++
++            /* we use SSE2 routines to perform the integer ops */
++
++            //COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y),imm2_1,imm2_2);
++            imm2 = _mm256_cvttps_epi32(y); // integer result kept in an integer vector instead of being written back into the float vector y
++            imm2_1 = _mm256_extractf128_si256 (imm2, 0); // integer-typed extract: no implicit __m128 -> __m128i conversion
++            imm2_2 = _mm256_extractf128_si256 (imm2, 1);
++
++            imm2_1 = _mm_add_epi32(imm2_1, _pi32avx_1);
++            imm2_2 = _mm_add_epi32(imm2_2, _pi32avx_1);
++
++            imm2_1 = _mm_and_si128(imm2_1, _pi32avx_inv1);
++            imm2_2 = _mm_and_si128(imm2_2, _pi32avx_inv1);
++
++            //COPY_XMM_TO_IMM(imm2_1,imm2_2,imm2);
++            //_mm256_set_m128i not defined in some versions of immintrin.h
++            //imm2 = _mm256_set_m128i (imm2_2, imm2_1);
++            imm2 = _mm256_insertf128_si256(_mm256_castsi128_si256(imm2_1),(imm2_2),1);
++
++            y = _mm256_cvtepi32_ps(imm2);
++
++            imm4_1 = imm2_1;
++            imm4_2 = imm2_2;
++
++            imm0_1 = _mm_and_si128(imm2_1, _pi32avx_4);
++            imm0_2 = _mm_and_si128(imm2_2, _pi32avx_4);
++
++            imm0_1 = _mm_slli_epi32(imm0_1, 29); // moves bit 2 into the float sign-bit position (bit 31)
++            imm0_2 = _mm_slli_epi32(imm0_2, 29);
++
++            //COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);
++            //_mm256_set_m128i not defined in some versions of immintrin.h
++            //imm0 = _mm256_set_m128i (imm0_2, imm0_1);
++            imm0 = _mm256_insertf128_si256(_mm256_castsi128_si256(imm0_1),(imm0_2),1);
++
++            imm2_1 = _mm_and_si128(imm2_1, _pi32avx_2);
++            imm2_2 = _mm_and_si128(imm2_2, _pi32avx_2);
++
++            imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
++            imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());
++
++            //COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
++            //_mm256_set_m128i not defined in some versions of immintrin.h
++            //imm2 = _mm256_set_m128i (imm2_2, imm2_1);
++            imm2 = _mm256_insertf128_si256(_mm256_castsi128_si256(imm2_1),(imm2_2),1);
++
++            swap_sign_bit_sin = _mm256_castsi256_ps(imm0);
++            poly_mask = _mm256_castsi256_ps(imm2);
++
++            /* The magic pass: "Extended precision modular arithmetic"
++             x = ((x - y * DP1) - y * DP2) - y * DP3; */
++            xmm1 = _ps256_minus_cephes_DP1;
++            xmm2 = _ps256_minus_cephes_DP2;
++            xmm3 = _ps256_minus_cephes_DP3;
++            xmm1 = _mm256_mul_ps(y, xmm1);
++            xmm2 = _mm256_mul_ps(y, xmm2);
++            xmm3 = _mm256_mul_ps(y, xmm3);
++            x = _mm256_add_ps(x, xmm1);
++            x = _mm256_add_ps(x, xmm2);
++            x = _mm256_add_ps(x, xmm3);
++
++            imm4_1 = _mm_sub_epi32(imm4_1, _pi32avx_2);
++            imm4_2 = _mm_sub_epi32(imm4_2, _pi32avx_2);
++
++            imm4_1 = _mm_andnot_si128(imm4_1, _pi32avx_4);
++            imm4_2 = _mm_andnot_si128(imm4_2, _pi32avx_4);
++
++            imm4_1 = _mm_slli_epi32(imm4_1, 29);
++            imm4_2 = _mm_slli_epi32(imm4_2, 29);
++
++            //COPY_XMM_TO_IMM(imm4_1, imm4_2, imm4);
++            //_mm256_set_m128i not defined in some versions of immintrin.h
++            //imm4 = _mm256_set_m128i (imm4_2, imm4_1);
++            imm4 = _mm256_insertf128_si256(_mm256_castsi128_si256(imm4_1),(imm4_2),1);
++
++            sign_bit_cos = _mm256_castsi256_ps(imm4);
++
++            sign_bit_sin = _mm256_xor_ps(sign_bit_sin, swap_sign_bit_sin);
++
++            /* Evaluate the first polynom (0 <= x <= Pi/4) */
++            z = _mm256_mul_ps(x,x);
++            y = _ps256_coscof_p0;
++
++            y = _mm256_mul_ps(y, z);
++            y = _mm256_add_ps(y, _ps256_coscof_p1);
++            y = _mm256_mul_ps(y, z);
++            y = _mm256_add_ps(y, _ps256_coscof_p2);
++            y = _mm256_mul_ps(y, z);
++            y = _mm256_mul_ps(y, z);
++            tmp = _mm256_mul_ps(z, _ps256_0p5);
++            y = _mm256_sub_ps(y, tmp);
++            y = _mm256_add_ps(y, _ps256_1);
++
++            /* Evaluate the second polynom (Pi/4 <= x <= 0) */
++
++            y2 = _ps256_sincof_p0;
++            y2 = _mm256_mul_ps(y2, z);
++            y2 = _mm256_add_ps(y2, _ps256_sincof_p1);
++            y2 = _mm256_mul_ps(y2, z);
++            y2 = _mm256_add_ps(y2, _ps256_sincof_p2);
++            y2 = _mm256_mul_ps(y2, z);
++            y2 = _mm256_mul_ps(y2, x);
++            y2 = _mm256_add_ps(y2, x);
++
++            /* select the correct result from the two polynoms */
++            xmm3 = poly_mask;
++            ysin2 = _mm256_and_ps(xmm3, y2);
++            ysin1 = _mm256_andnot_ps(xmm3, y);
++            y2 = _mm256_sub_ps(y2,ysin2);
++            y = _mm256_sub_ps(y, ysin1);
++
++            xmm1 = _mm256_add_ps(ysin1,ysin2);
++            xmm2 = _mm256_add_ps(y,y2);
++
++            /* update the sign */
++            s = _mm256_xor_ps(xmm1, sign_bit_sin);
++            c = _mm256_xor_ps(xmm2, sign_bit_cos);
++
++            //GNSS-SDR needs to return -sin
++            s = _mm256_xor_ps(s, _ps256_sign_mask);
++
++            _mm256_storeu_ps ((float*)sin_value, s);
++            _mm256_storeu_ps ((float*)cos_value, c);
++
++            for(int j = 0; j < 8; j++) // own index name: no longer shadows the outer loop counter
++                {
++                    d_carr_sign[j] = lv_cmake(cos_value[j], sin_value[j]);
++                }
++            d_carr_sign += 8;
++
++            phase_rad_array = _mm256_add_ps (phase_rad_array, phase_step_rad_array);
++        }
++
++    if (num_points%8!=0)
++        {
++            __VOLK_ATTR_ALIGNED(32) float phase_rad_store[8];
++            _mm256_storeu_ps (phase_rad_store, phase_rad_array); // float store: phase_rad_array is a __m256, not a __m256i
++
++            float phase_rad = phase_rad_store[0]; // lane 0 holds the phase of the first sample not yet generated
++
++            for(unsigned int i = 0; i < num_points%8; i++)
++                {
++                    *d_carr_sign = lv_cmake(cos(phase_rad), -sin(phase_rad));
++                    d_carr_sign++;
++                    phase_rad += phase_step_rad;
++                }
++        }
++}
++#endif /* LV_HAVE_AVX */
++
++
++#ifdef LV_HAVE_SSE2
++#include
++/*!
++ \brief Accumulates the values in the input buffer
++ \param result The accumulated result
++ \param inputBuffer The buffer of data to be accumulated
++ \param num_points The number of values in inputBuffer to be accumulated
++*/
++static inline void volk_gnsssdr_s32f_x2_update_local_carrier_32fc_u_sse2(lv_32fc_t* d_carr_sign, const float phase_rad_init, const float phase_step_rad, unsigned int num_points){
++
++// float* pointer1 = (float*)&phase_rad_init;
++// *pointer1 = 0;
++// float* pointer2 = (float*)&phase_step_rad;
++// *pointer2 = 0.5;
++
++ const unsigned int sse_iters = num_points / 4;
++
++ __m128 _ps_minus_cephes_DP1 = _mm_set1_ps(-0.78515625f);
++ __m128 _ps_minus_cephes_DP2 = _mm_set1_ps(-2.4187564849853515625e-4f);
++ __m128 _ps_minus_cephes_DP3 = _mm_set1_ps(-3.77489497744594108e-8f);
++ __m128 _ps_sign_mask = _mm_set1_ps(-0.f);
++ __m128i _pi32_1 = _mm_set1_epi32(1);
++ __m128i _pi32_inv1 = _mm_set1_epi32(~1);
++ __m128i _pi32_2 = _mm_set1_epi32(2);
++ __m128i _pi32_4 = _mm_set1_epi32(4);
++ __m128 _ps_cephes_FOPI = _mm_set1_ps(1.27323954473516f); // 4 / PI
++ __m128 _ps_sincof_p0 = _mm_set1_ps(-1.9515295891E-4f);
++ __m128 _ps_sincof_p1 = _mm_set1_ps( 8.3321608736E-3f);
++ __m128 _ps_sincof_p2 = _mm_set1_ps(-1.6666654611E-1f);
++ __m128 _ps_coscof_p0 = _mm_set1_ps( 2.443315711809948E-005f);
++ __m128 _ps_coscof_p1 = _mm_set1_ps(-1.388731625493765E-003f);
++ __m128 _ps_coscof_p2 = _mm_set1_ps( 4.166664568298827E-002f);
++ __m128 _ps_1 = _mm_set1_ps(1.f);
++ __m128 _ps_0p5 = _mm_set1_ps(0.5f);
++
++ __m128 phase_step_rad_array = _mm_set1_ps(4*phase_step_rad);
++
++ __m128 phase_rad_array, x, s, c,
swap_sign_bit_sin, sign_bit_cos, poly_mask, z, tmp, y, y2, ysin1, ysin2; ++ __m128 xmm1, xmm2, xmm3, sign_bit_sin; ++ __m128i emm0, emm2, emm4; ++ __VOLK_ATTR_ALIGNED(16) float sin_value[4]; ++ __VOLK_ATTR_ALIGNED(16) float cos_value[4]; ++ ++ phase_rad_array = _mm_set_ps (phase_rad_init+3*phase_step_rad, phase_rad_init+2*phase_step_rad, phase_rad_init+phase_step_rad, phase_rad_init); ++ ++ for(int i = 0; i < sse_iters; i++) ++ { ++ x = phase_rad_array; ++ ++ /* extract the sign bit (upper one) */ ++ sign_bit_sin = _mm_and_ps(x, _ps_sign_mask); ++ ++ /* take the absolute value */ ++ x = _mm_xor_ps(x, sign_bit_sin); ++ ++ /* scale by 4/Pi */ ++ y = _mm_mul_ps(x, _ps_cephes_FOPI); ++ ++ /* store the integer part of y in emm2 */ ++ emm2 = _mm_cvttps_epi32(y); ++ ++ /* j=(j+1) & (~1) (see the cephes sources) */ ++ emm2 = _mm_add_epi32(emm2, _pi32_1); ++ emm2 = _mm_and_si128(emm2, _pi32_inv1); ++ y = _mm_cvtepi32_ps(emm2); ++ ++ emm4 = emm2; ++ ++ /* get the swap sign flag for the sine */ ++ emm0 = _mm_and_si128(emm2, _pi32_4); ++ emm0 = _mm_slli_epi32(emm0, 29); ++ swap_sign_bit_sin = _mm_castsi128_ps(emm0); ++ ++ /* get the polynom selection mask for the sine*/ ++ emm2 = _mm_and_si128(emm2, _pi32_2); ++ emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128()); ++ poly_mask = _mm_castsi128_ps(emm2); ++ ++ /* The magic pass: "Extended precision modular arithmetic" ++ x = ((x - y * DP1) - y * DP2) - y * DP3; */ ++ xmm1 = _mm_mul_ps(y, _ps_minus_cephes_DP1); ++ xmm2 = _mm_mul_ps(y, _ps_minus_cephes_DP2); ++ xmm3 = _mm_mul_ps(y, _ps_minus_cephes_DP3); ++ x = _mm_add_ps(_mm_add_ps(x, xmm1), _mm_add_ps(xmm2, xmm3)); ++ ++ emm4 = _mm_sub_epi32(emm4, _pi32_2); ++ emm4 = _mm_andnot_si128(emm4, _pi32_4); ++ emm4 = _mm_slli_epi32(emm4, 29); ++ sign_bit_cos = _mm_castsi128_ps(emm4); ++ ++ sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin); ++ ++ /* Evaluate the first polynom (0 <= x <= Pi/4) */ ++ z = _mm_mul_ps(x,x); ++ y = _ps_coscof_p0; ++ y = _mm_mul_ps(y, z); ++ y = 
_mm_add_ps(y, _ps_coscof_p1); ++ y = _mm_mul_ps(y, z); ++ y = _mm_add_ps(y, _ps_coscof_p2); ++ y = _mm_mul_ps(y, _mm_mul_ps(z, z)); ++ tmp = _mm_mul_ps(z, _ps_0p5); ++ y = _mm_sub_ps(y, tmp); ++ y = _mm_add_ps(y, _ps_1); ++ ++ /* Evaluate the second polynom (Pi/4 <= x <= 0) */ ++ y2 = _ps_sincof_p0; ++ y2 = _mm_mul_ps(y2, z); ++ y2 = _mm_add_ps(y2, _ps_sincof_p1); ++ y2 = _mm_mul_ps(y2, z); ++ y2 = _mm_add_ps(y2, _ps_sincof_p2); ++ y2 = _mm_mul_ps(y2, _mm_mul_ps(z, x)); ++ y2 = _mm_add_ps(y2, x); ++ ++ /* select the correct result from the two polynoms */ ++ xmm3 = poly_mask; ++ ysin2 = _mm_and_ps(xmm3, y2); ++ ysin1 = _mm_andnot_ps(xmm3, y); ++ y2 = _mm_sub_ps(y2,ysin2); ++ y = _mm_sub_ps(y, ysin1); ++ ++ xmm1 = _mm_add_ps(ysin1,ysin2); ++ xmm2 = _mm_add_ps(y,y2); ++ ++ /* update the sign */ ++ s = _mm_xor_ps(xmm1, sign_bit_sin); ++ c = _mm_xor_ps(xmm2, sign_bit_cos); ++ ++ //GNSS-SDR needs to return -sin ++ s = _mm_xor_ps(s, _ps_sign_mask); ++ ++ _mm_storeu_ps ((float*)sin_value, s); ++ _mm_storeu_ps ((float*)cos_value, c); ++ ++ for(int i = 0; i < 4; i++) ++ { ++ d_carr_sign[i] = lv_cmake(cos_value[i], sin_value[i]); ++ } ++ d_carr_sign += 4; ++ ++ phase_rad_array = _mm_add_ps (phase_rad_array, phase_step_rad_array); ++ } ++ ++ if (num_points%4!=0) ++ { ++ __VOLK_ATTR_ALIGNED(16) float phase_rad_store[4]; ++ _mm_storeu_si128 ((__m128i*)phase_rad_store, phase_rad_array); ++ ++ float phase_rad = phase_rad_store[0]; ++ ++ for(int i = 0; i < num_points%4; i++) ++ { ++ *d_carr_sign = lv_cmake(cos(phase_rad), -sin(phase_rad)); ++ d_carr_sign++; ++ phase_rad += phase_step_rad; ++ } ++ } ++} ++#endif /* LV_HAVE_SSE2 */ ++ ++#ifdef LV_HAVE_GENERIC ++/*! 
++ \brief Generates the local carrier replica (generic version): d_carr_sign[n] = lv_cmake(cos(phase_n), -sin(phase_n)), where phase_n starts at phase_rad_init and advances by phase_step_rad per sample ++ \param d_carr_sign Output buffer that receives num_points complex carrier samples ++ \param phase_rad_init,phase_step_rad Phase of the first sample and phase increment between consecutive samples, in radians ++ \param num_points The number of carrier samples to write into d_carr_sign ++*/ ++static inline void volk_gnsssdr_s32f_x2_update_local_carrier_32fc_generic(lv_32fc_t* d_carr_sign, const float phase_rad_init, const float phase_step_rad, unsigned int num_points){ ++ ++// float* pointer1 = (float*)&phase_rad_init; ++// *pointer1 = 0; ++// float* pointer2 = (float*)&phase_step_rad; ++// *pointer2 = 0.5; ++ ++ float phase_rad = phase_rad_init; ++ for(unsigned int i = 0; i < num_points; i++) /* counter has the same type as num_points, so large counts cannot overflow it */ ++ { ++ *d_carr_sign = lv_cmake(cos(phase_rad), -sin(phase_rad)); ++ d_carr_sign++; ++ phase_rad += phase_step_rad; ++ } ++} ++#endif /* LV_HAVE_GENERIC */ ++#endif /* INCLUDED_volk_gnsssdr_32fc_s32f_x2_update_local_carrier_32fc_u_H */ ++ ++ ++#ifndef INCLUDED_volk_gnsssdr_32fc_s32f_x2_update_local_carrier_32fc_a_H ++#define INCLUDED_volk_gnsssdr_32fc_s32f_x2_update_local_carrier_32fc_a_H ++ ++#include /* NOTE(review): the header names of these three includes are missing - restore them from the original file */ ++#include ++#include ++ ++#ifdef LV_HAVE_AVX ++#include <immintrin.h> ++/*! 
++ \brief Generates the local carrier replica with AVX (_a variant): d_carr_sign[n] = lv_cmake(cos(phase_n), -sin(phase_n)), where phase_n starts at phase_rad_init and advances by phase_step_rad per sample ++ \param d_carr_sign Output buffer that receives num_points complex carrier samples ++ \param phase_rad_init,phase_step_rad Phase of the first sample and phase increment between consecutive samples, in radians ++ \param num_points The number of carrier samples to write into d_carr_sign ++ */ ++static inline void volk_gnsssdr_s32f_x2_update_local_carrier_32fc_a_avx(lv_32fc_t* d_carr_sign, const float phase_rad_init, const float phase_step_rad, unsigned int num_points){ ++ ++ // float* pointer1 = (float*)&phase_rad_init; ++ // *pointer1 = 0; ++ // float* pointer2 = (float*)&phase_step_rad; ++ // *pointer2 = 0.5; ++ ++ const unsigned int sse_iters = num_points / 8; ++ ++ __m256 _ps256_minus_cephes_DP1 = _mm256_set1_ps(-0.78515625f); ++ __m256 _ps256_minus_cephes_DP2 = _mm256_set1_ps(-2.4187564849853515625e-4f); ++ __m256 _ps256_minus_cephes_DP3 = _mm256_set1_ps(-3.77489497744594108e-8f); ++ __m256 _ps256_sign_mask = _mm256_set1_ps(-0.f); ++ __m128i _pi32avx_1 = _mm_set1_epi32(1); ++ __m128i _pi32avx_inv1 = _mm_set1_epi32(~1); ++ __m128i _pi32avx_2 = _mm_set1_epi32(2); ++ __m128i _pi32avx_4 = _mm_set1_epi32(4); ++ __m256 _ps256_cephes_FOPI = _mm256_set1_ps(1.27323954473516f); // 4 / PI ++ __m256 _ps256_sincof_p0 = _mm256_set1_ps(-1.9515295891E-4f); ++ __m256 _ps256_sincof_p1 = _mm256_set1_ps( 8.3321608736E-3f); ++ __m256 _ps256_sincof_p2 = _mm256_set1_ps(-1.6666654611E-1f); ++ __m256 _ps256_coscof_p0 = _mm256_set1_ps( 2.443315711809948E-005f); ++ __m256 _ps256_coscof_p1 = _mm256_set1_ps(-1.388731625493765E-003f); ++ __m256 _ps256_coscof_p2 = _mm256_set1_ps( 4.166664568298827E-002f); ++ __m256 _ps256_1 = _mm256_set1_ps(1.f); ++ __m256 _ps256_0p5 = _mm256_set1_ps(0.5f); ++ ++ __m256 phase_step_rad_array = _mm256_set1_ps(8*phase_step_rad); /* one vector iteration yields 8 samples, so every lane advances by 8 steps */ ++ ++ __m256 phase_rad_array, x, s, c, swap_sign_bit_sin, sign_bit_cos, poly_mask, z, tmp, y, y2, ysin1, ysin2; ++ __m256 xmm1, xmm2, xmm3, sign_bit_sin; ++ __m256i imm0, imm2, imm4; ++ __m128i imm0_1, imm0_2, imm2_1, imm2_2, imm4_1, imm4_2; ++ __VOLK_ATTR_ALIGNED(32) float sin_value[8]; ++ 
__VOLK_ATTR_ALIGNED(32) float cos_value[8]; ++ ++ phase_rad_array = _mm256_set_ps (phase_rad_init+7*phase_step_rad, phase_rad_init+6*phase_step_rad, phase_rad_init+5*phase_step_rad, phase_rad_init+4*phase_step_rad, phase_rad_init+3*phase_step_rad, phase_rad_init+2*phase_step_rad, phase_rad_init+phase_step_rad, phase_rad_init); ++ ++ for(unsigned int i = 0; i < sse_iters; i++) ++ { ++ ++ x = phase_rad_array; ++ ++ /* extract the sign bit (upper one) */ ++ sign_bit_sin = _mm256_and_ps(x, _ps256_sign_mask); ++ ++ /* take the absolute value */ ++ x = _mm256_xor_ps(x, sign_bit_sin); ++ ++ /* scale by 4/Pi */ ++ y = _mm256_mul_ps(x, _ps256_cephes_FOPI); ++ ++ /* we use SSE2 routines to perform the integer ops */ ++ ++ //COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y),imm2_1,imm2_2); ++ imm2 = _mm256_cvttps_epi32(y); /* keep the integer result in an integer vector instead of punning it through the float register y */ ++ imm2_1 = _mm256_extractf128_si256 (imm2, 0); ++ imm2_2 = _mm256_extractf128_si256 (imm2, 1); ++ ++ imm2_1 = _mm_add_epi32(imm2_1, _pi32avx_1); ++ imm2_2 = _mm_add_epi32(imm2_2, _pi32avx_1); ++ ++ imm2_1 = _mm_and_si128(imm2_1, _pi32avx_inv1); ++ imm2_2 = _mm_and_si128(imm2_2, _pi32avx_inv1); ++ ++ //COPY_XMM_TO_IMM(imm2_1,imm2_2,imm2); ++ //_mm256_set_m128i not defined in some versions of immintrin.h ++ //imm2 = _mm256_set_m128i (imm2_2, imm2_1); ++ imm2 = _mm256_insertf128_si256(_mm256_castsi128_si256(imm2_1),(imm2_2),1); ++ ++ y = _mm256_cvtepi32_ps(imm2); ++ ++ imm4_1 = imm2_1; ++ imm4_2 = imm2_2; ++ ++ imm0_1 = _mm_and_si128(imm2_1, _pi32avx_4); ++ imm0_2 = _mm_and_si128(imm2_2, _pi32avx_4); ++ ++ imm0_1 = _mm_slli_epi32(imm0_1, 29); ++ imm0_2 = _mm_slli_epi32(imm0_2, 29); ++ ++ //COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0); ++ //_mm256_set_m128i not defined in some versions of immintrin.h ++ //imm0 = _mm256_set_m128i (imm0_2, imm0_1); ++ imm0 = _mm256_insertf128_si256(_mm256_castsi128_si256(imm0_1),(imm0_2),1); ++ ++ imm2_1 = _mm_and_si128(imm2_1, _pi32avx_2); ++ imm2_2 = _mm_and_si128(imm2_2, _pi32avx_2); ++ ++ imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128()); ++ imm2_2 = 
_mm_cmpeq_epi32(imm2_2, _mm_setzero_si128()); ++ ++ //COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2); ++ //_mm256_set_m128i not defined in some versions of immintrin.h ++ //imm2 = _mm256_set_m128i (imm2_2, imm2_1); ++ imm2 = _mm256_insertf128_si256(_mm256_castsi128_si256(imm2_1),(imm2_2),1); ++ ++ swap_sign_bit_sin = _mm256_castsi256_ps(imm0); ++ poly_mask = _mm256_castsi256_ps(imm2); ++ ++ /* The magic pass: "Extended precision modular arithmetic" ++ x = ((x - y * DP1) - y * DP2) - y * DP3; */ ++ xmm1 = _ps256_minus_cephes_DP1; ++ xmm2 = _ps256_minus_cephes_DP2; ++ xmm3 = _ps256_minus_cephes_DP3; ++ xmm1 = _mm256_mul_ps(y, xmm1); ++ xmm2 = _mm256_mul_ps(y, xmm2); ++ xmm3 = _mm256_mul_ps(y, xmm3); ++ x = _mm256_add_ps(x, xmm1); ++ x = _mm256_add_ps(x, xmm2); ++ x = _mm256_add_ps(x, xmm3); ++ ++ imm4_1 = _mm_sub_epi32(imm4_1, _pi32avx_2); ++ imm4_2 = _mm_sub_epi32(imm4_2, _pi32avx_2); ++ ++ imm4_1 = _mm_andnot_si128(imm4_1, _pi32avx_4); ++ imm4_2 = _mm_andnot_si128(imm4_2, _pi32avx_4); ++ ++ imm4_1 = _mm_slli_epi32(imm4_1, 29); ++ imm4_2 = _mm_slli_epi32(imm4_2, 29); ++ ++ //COPY_XMM_TO_IMM(imm4_1, imm4_2, imm4); ++ //_mm256_set_m128i not defined in some versions of immintrin.h ++ //imm4 = _mm256_set_m128i (imm4_2, imm4_1); ++ imm4 = _mm256_insertf128_si256(_mm256_castsi128_si256(imm4_1),(imm4_2),1); ++ ++ sign_bit_cos = _mm256_castsi256_ps(imm4); ++ ++ sign_bit_sin = _mm256_xor_ps(sign_bit_sin, swap_sign_bit_sin); ++ ++ /* Evaluate the cosine polynomial (coscof coefficients) on the reduced argument */ ++ z = _mm256_mul_ps(x,x); ++ y = _ps256_coscof_p0; ++ ++ y = _mm256_mul_ps(y, z); ++ y = _mm256_add_ps(y, _ps256_coscof_p1); ++ y = _mm256_mul_ps(y, z); ++ y = _mm256_add_ps(y, _ps256_coscof_p2); ++ y = _mm256_mul_ps(y, z); ++ y = _mm256_mul_ps(y, z); ++ tmp = _mm256_mul_ps(z, _ps256_0p5); ++ y = _mm256_sub_ps(y, tmp); ++ y = _mm256_add_ps(y, _ps256_1); ++ ++ /* Evaluate the sine polynomial (sincof coefficients) on the reduced argument */ ++ ++ y2 = _ps256_sincof_p0; ++ y2 = _mm256_mul_ps(y2, z); ++ y2 = _mm256_add_ps(y2, 
_ps256_sincof_p1); ++ y2 = _mm256_mul_ps(y2, z); ++ y2 = _mm256_add_ps(y2, _ps256_sincof_p2); ++ y2 = _mm256_mul_ps(y2, z); ++ y2 = _mm256_mul_ps(y2, x); ++ y2 = _mm256_add_ps(y2, x); ++ ++ /* select the correct result from the two polynoms */ ++ xmm3 = poly_mask; ++ ysin2 = _mm256_and_ps(xmm3, y2); ++ ysin1 = _mm256_andnot_ps(xmm3, y); ++ y2 = _mm256_sub_ps(y2,ysin2); ++ y = _mm256_sub_ps(y, ysin1); ++ ++ xmm1 = _mm256_add_ps(ysin1,ysin2); ++ xmm2 = _mm256_add_ps(y,y2); ++ ++ /* update the sign */ ++ s = _mm256_xor_ps(xmm1, sign_bit_sin); ++ c = _mm256_xor_ps(xmm2, sign_bit_cos); ++ ++ //GNSS-SDR needs to return -sin ++ s = _mm256_xor_ps(s, _ps256_sign_mask); ++ ++ _mm256_store_ps ((float*)sin_value, s); ++ _mm256_store_ps ((float*)cos_value, c); ++ ++ for(int k = 0; k < 8; k++) ++ { ++ d_carr_sign[k] = lv_cmake(cos_value[k], sin_value[k]); ++ } ++ d_carr_sign += 8; ++ ++ phase_rad_array = _mm256_add_ps (phase_rad_array, phase_step_rad_array); ++ } ++ ++ if (num_points%8!=0) ++ { ++ __VOLK_ATTR_ALIGNED(32) float phase_rad_store[8]; ++ _mm256_store_ps ((float*)phase_rad_store, phase_rad_array); ++ ++ float phase_rad = phase_rad_store[0]; /* lane 0 holds the phase of the next sample to generate */ ++ ++ for(unsigned int i = 0; i < num_points%8; i++) ++ { ++ *d_carr_sign = lv_cmake(cos(phase_rad), -sin(phase_rad)); ++ d_carr_sign++; ++ phase_rad += phase_step_rad; ++ } ++ } ++} ++#endif /* LV_HAVE_AVX */ ++ ++#ifdef LV_HAVE_SSE2 ++#include <emmintrin.h> ++/*! 
++ \brief Generates the local carrier replica with SSE2 (_a variant): d_carr_sign[n] = lv_cmake(cos(phase_n), -sin(phase_n)), where phase_n starts at phase_rad_init and advances by phase_step_rad per sample ++ \param d_carr_sign Output buffer that receives num_points complex carrier samples ++ \param phase_rad_init,phase_step_rad Phase of the first sample and phase increment between consecutive samples, in radians ++ \param num_points The number of carrier samples to write into d_carr_sign ++ */ ++static inline void volk_gnsssdr_s32f_x2_update_local_carrier_32fc_a_sse2(lv_32fc_t* d_carr_sign, const float phase_rad_init, const float phase_step_rad, unsigned int num_points){ ++ ++// float* pointer1 = (float*)&phase_rad_init; ++// *pointer1 = 0; ++// float* pointer2 = (float*)&phase_step_rad; ++// *pointer2 = 0.5; ++ ++ const unsigned int sse_iters = num_points / 4; ++ ++ __m128 _ps_minus_cephes_DP1 = _mm_set1_ps(-0.78515625f); ++ __m128 _ps_minus_cephes_DP2 = _mm_set1_ps(-2.4187564849853515625e-4f); ++ __m128 _ps_minus_cephes_DP3 = _mm_set1_ps(-3.77489497744594108e-8f); ++ __m128 _ps_sign_mask = _mm_set1_ps(-0.f); ++ __m128i _pi32_1 = _mm_set1_epi32(1); ++ __m128i _pi32_inv1 = _mm_set1_epi32(~1); ++ __m128i _pi32_2 = _mm_set1_epi32(2); ++ __m128i _pi32_4 = _mm_set1_epi32(4); ++ __m128 _ps_cephes_FOPI = _mm_set1_ps(1.27323954473516f); // 4 / PI ++ __m128 _ps_sincof_p0 = _mm_set1_ps(-1.9515295891E-4f); ++ __m128 _ps_sincof_p1 = _mm_set1_ps( 8.3321608736E-3f); ++ __m128 _ps_sincof_p2 = _mm_set1_ps(-1.6666654611E-1f); ++ __m128 _ps_coscof_p0 = _mm_set1_ps( 2.443315711809948E-005f); ++ __m128 _ps_coscof_p1 = _mm_set1_ps(-1.388731625493765E-003f); ++ __m128 _ps_coscof_p2 = _mm_set1_ps( 4.166664568298827E-002f); ++ __m128 _ps_1 = _mm_set1_ps(1.f); ++ __m128 _ps_0p5 = _mm_set1_ps(0.5f); ++ ++ __m128 phase_step_rad_array = _mm_set1_ps(4*phase_step_rad); /* one vector iteration yields 4 samples, so every lane advances by 4 steps */ ++ ++ __m128 phase_rad_array, x, s, c, swap_sign_bit_sin, sign_bit_cos, poly_mask, z, tmp, y, y2, ysin1, ysin2; ++ __m128 xmm1, xmm2, xmm3, sign_bit_sin; ++ __m128i emm0, emm2, emm4; ++ __VOLK_ATTR_ALIGNED(16) float sin_value[4]; ++ __VOLK_ATTR_ALIGNED(16) float cos_value[4]; ++ ++ phase_rad_array = _mm_set_ps (phase_rad_init+3*phase_step_rad, phase_rad_init+2*phase_step_rad, 
phase_rad_init+phase_step_rad, phase_rad_init); ++ ++ for(unsigned int i = 0; i < sse_iters; i++) ++ { ++ x = phase_rad_array; ++ ++ /* extract the sign bit (upper one) */ ++ sign_bit_sin = _mm_and_ps(x, _ps_sign_mask); ++ ++ /* take the absolute value */ ++ x = _mm_xor_ps(x, sign_bit_sin); ++ ++ /* scale by 4/Pi */ ++ y = _mm_mul_ps(x, _ps_cephes_FOPI); ++ ++ /* store the integer part of y in emm2 */ ++ emm2 = _mm_cvttps_epi32(y); ++ ++ /* j=(j+1) & (~1) (see the cephes sources) */ ++ emm2 = _mm_add_epi32(emm2, _pi32_1); ++ emm2 = _mm_and_si128(emm2, _pi32_inv1); ++ y = _mm_cvtepi32_ps(emm2); ++ ++ emm4 = emm2; ++ ++ /* get the swap sign flag for the sine */ ++ emm0 = _mm_and_si128(emm2, _pi32_4); ++ emm0 = _mm_slli_epi32(emm0, 29); ++ swap_sign_bit_sin = _mm_castsi128_ps(emm0); ++ ++ /* get the polynom selection mask for the sine*/ ++ emm2 = _mm_and_si128(emm2, _pi32_2); ++ emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128()); ++ poly_mask = _mm_castsi128_ps(emm2); ++ ++ /* The magic pass: "Extended precision modular arithmetic" ++ x = ((x - y * DP1) - y * DP2) - y * DP3; */ ++ xmm1 = _mm_mul_ps(y, _ps_minus_cephes_DP1); ++ xmm2 = _mm_mul_ps(y, _ps_minus_cephes_DP2); ++ xmm3 = _mm_mul_ps(y, _ps_minus_cephes_DP3); ++ x = _mm_add_ps(_mm_add_ps(x, xmm1), _mm_add_ps(xmm2, xmm3)); ++ ++ emm4 = _mm_sub_epi32(emm4, _pi32_2); ++ emm4 = _mm_andnot_si128(emm4, _pi32_4); ++ emm4 = _mm_slli_epi32(emm4, 29); ++ sign_bit_cos = _mm_castsi128_ps(emm4); ++ ++ sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin); ++ ++ /* Evaluate the cosine polynomial (coscof coefficients) on the reduced argument */ ++ z = _mm_mul_ps(x,x); ++ y = _ps_coscof_p0; ++ y = _mm_mul_ps(y, z); ++ y = _mm_add_ps(y, _ps_coscof_p1); ++ y = _mm_mul_ps(y, z); ++ y = _mm_add_ps(y, _ps_coscof_p2); ++ y = _mm_mul_ps(y, _mm_mul_ps(z, z)); ++ tmp = _mm_mul_ps(z, _ps_0p5); ++ y = _mm_sub_ps(y, tmp); ++ y = _mm_add_ps(y, _ps_1); ++ ++ /* Evaluate the sine polynomial (sincof coefficients) on the reduced argument */ ++ y2 = _ps_sincof_p0; ++ y2 = _mm_mul_ps(y2, z); ++ y2 = 
_mm_add_ps(y2, _ps_sincof_p1); ++ y2 = _mm_mul_ps(y2, z); ++ y2 = _mm_add_ps(y2, _ps_sincof_p2); ++ y2 = _mm_mul_ps(y2, _mm_mul_ps(z, x)); ++ y2 = _mm_add_ps(y2, x); ++ ++ /* select the correct result from the two polynoms */ ++ xmm3 = poly_mask; ++ ysin2 = _mm_and_ps(xmm3, y2); ++ ysin1 = _mm_andnot_ps(xmm3, y); ++ y2 = _mm_sub_ps(y2,ysin2); ++ y = _mm_sub_ps(y, ysin1); ++ ++ xmm1 = _mm_add_ps(ysin1,ysin2); ++ xmm2 = _mm_add_ps(y,y2); ++ ++ /* update the sign */ ++ s = _mm_xor_ps(xmm1, sign_bit_sin); ++ c = _mm_xor_ps(xmm2, sign_bit_cos); ++ ++ //GNSS-SDR needs to return -sin ++ s = _mm_xor_ps(s, _ps_sign_mask); ++ ++ _mm_store_ps ((float*)sin_value, s); ++ _mm_store_ps ((float*)cos_value, c); ++ ++ for(int k = 0; k < 4; k++) ++ { ++ d_carr_sign[k] = lv_cmake(cos_value[k], sin_value[k]); ++ } ++ d_carr_sign += 4; ++ ++ phase_rad_array = _mm_add_ps (phase_rad_array, phase_step_rad_array); ++ } ++ ++ if (num_points%4!=0) ++ { ++ __VOLK_ATTR_ALIGNED(16) float phase_rad_store[4]; ++ _mm_store_ps (phase_rad_store, phase_rad_array); ++ ++ float phase_rad = phase_rad_store[0]; /* lane 0 holds the phase of the next sample to generate */ ++ ++ for(unsigned int i = 0; i < num_points%4; i++) ++ { ++ *d_carr_sign = lv_cmake(cos(phase_rad), -sin(phase_rad)); ++ d_carr_sign++; ++ phase_rad += phase_step_rad; ++ } ++ } ++} ++#endif /* LV_HAVE_SSE2 */ ++ ++#ifdef LV_HAVE_GENERIC ++/*! 
++ \brief Generates the local carrier replica (generic version, _a variant): d_carr_sign[n] = lv_cmake(cos(phase_n), -sin(phase_n)), where phase_n starts at phase_rad_init and advances by phase_step_rad per sample ++ \param d_carr_sign Output buffer that receives num_points complex carrier samples ++ \param phase_rad_init,phase_step_rad Phase of the first sample and phase increment between consecutive samples, in radians ++ \param num_points The number of carrier samples to write into d_carr_sign ++ */ ++static inline void volk_gnsssdr_s32f_x2_update_local_carrier_32fc_a_generic(lv_32fc_t* d_carr_sign, const float phase_rad_init, const float phase_step_rad, unsigned int num_points){ ++ ++// float* pointer1 = (float*)&phase_rad_init; ++// *pointer1 = 0; ++// float* pointer2 = (float*)&phase_step_rad; ++// *pointer2 = 0.5; ++ ++ float phase_rad = phase_rad_init; ++ for(unsigned int i = 0; i < num_points; i++) /* counter has the same type as num_points, so large counts cannot overflow it */ ++ { ++ *d_carr_sign = lv_cmake(cos(phase_rad), -sin(phase_rad)); ++ d_carr_sign++; ++ phase_rad += phase_step_rad; ++ } ++} ++#endif /* LV_HAVE_GENERIC */ ++#endif /* INCLUDED_volk_gnsssdr_32fc_s32f_x2_update_local_carrier_32fc_a_H */ ++ +diff -rupN /Users/andres/Desktop/volk_gnsssdr/lib/CMakeLists.txt /Users/andres/Desktop/volk_gnsssdr_original/lib/CMakeLists.txt +--- /Users/andres/Desktop/volk_gnsssdr/lib/CMakeLists.txt 2014-10-17 05:07:22.000000000 +0200 ++++ /Users/andres/Desktop/volk_gnsssdr_original/lib/CMakeLists.txt 2014-10-17 04:50:28.000000000 +0200 +@@ -517,7 +517,19 @@ if(MSVC) + endif() + + #create the volk_gnsssdr runtime library +-add_library(volk_gnsssdr SHARED ${volk_gnsssdr_sources}) ++ ++#MODIFICATIONS BY GNSS-SDR (paths are anchored to this lib/ directory so they also resolve when volk_gnsssdr is built through add_subdirectory) ++file(GLOB orc ${CMAKE_CURRENT_SOURCE_DIR}/../orc/*.orc) ++file(GLOB CommonMacros ${CMAKE_CURRENT_SOURCE_DIR}/../kernels/CommonMacros/*.h ${CMAKE_CURRENT_SOURCE_DIR}/../kernels/CommonMacros/README.txt) ++ ++#add_library(volk_gnsssdr SHARED ${volk_gnsssdr_sources}) ++add_library(volk_gnsssdr SHARED ${volk_gnsssdr_sources} ${h_files} ${CommonMacros} ${orc}) ++ ++source_group("Kernels" FILES ${h_files}) ++source_group("Common Macros" FILES ${CommonMacros}) ++source_group("ORC Files" FILES ${orc}) ++#END OF MODIFICATIONS ++ + target_link_libraries(volk_gnsssdr ${volk_gnsssdr_libraries}) + set_target_properties(volk_gnsssdr PROPERTIES SOVERSION 
${LIBVER}) + set_target_properties(volk_gnsssdr PROPERTIES DEFINE_SYMBOL "volk_gnsssdr_EXPORTS") +diff -rupN /Users/andres/Desktop/volk_gnsssdr/lib/qa_utils.cc /Users/andres/Desktop/volk_gnsssdr_original/lib/qa_utils.cc +--- /Users/andres/Desktop/volk_gnsssdr/lib/qa_utils.cc 2014-10-17 05:07:25.000000000 +0200 ++++ /Users/andres/Desktop/volk_gnsssdr_original/lib/qa_utils.cc 2014-10-17 04:21:03.000000000 +0200 +@@ -217,6 +217,72 @@ inline void run_cast_test3_s32fc(volk_gn + while(iter--) func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str()); + } + ++//ADDED BY GNSS-SDR. START - run_cast_testN[_<scalar>]: call func 'iter' times with the first N buffers of buffs, the optional scalar, vlen and the architecture name ++inline void run_cast_test1_s8i(volk_gnsssdr_fn_1arg_s8i func, std::vector<void *> &buffs, char scalar, unsigned int vlen, unsigned int iter, std::string arch) { ++ while(iter--) func(buffs[0], scalar, vlen, arch.c_str()); ++} ++ ++inline void run_cast_test2_s8i(volk_gnsssdr_fn_2arg_s8i func, std::vector<void *> &buffs, char scalar, unsigned int vlen, unsigned int iter, std::string arch) { ++ while(iter--) func(buffs[0], buffs[1], scalar, vlen, arch.c_str()); ++} ++ ++inline void run_cast_test3_s8i(volk_gnsssdr_fn_3arg_s8i func, std::vector<void *> &buffs, char scalar, unsigned int vlen, unsigned int iter, std::string arch) { ++ while(iter--) func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str()); ++} ++ ++inline void run_cast_test1_s8ic(volk_gnsssdr_fn_1arg_s8ic func, std::vector<void *> &buffs, lv_8sc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { ++ while(iter--) func(buffs[0], scalar, vlen, arch.c_str()); ++} ++ ++inline void run_cast_test2_s8ic(volk_gnsssdr_fn_2arg_s8ic func, std::vector<void *> &buffs, lv_8sc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { ++ while(iter--) func(buffs[0], buffs[1], scalar, vlen, arch.c_str()); ++} ++ ++inline void run_cast_test3_s8ic(volk_gnsssdr_fn_3arg_s8ic func, std::vector<void *> &buffs, lv_8sc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { ++ while(iter--) func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str()); 
++} ++ ++inline void run_cast_test8(volk_gnsssdr_fn_8arg func, std::vector<void *> &buffs, unsigned int vlen, unsigned int iter, std::string arch) { ++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], vlen, arch.c_str()); ++} ++ ++inline void run_cast_test8_s8i(volk_gnsssdr_fn_8arg_s8i func, std::vector<void *> &buffs, char scalar, unsigned int vlen, unsigned int iter, std::string arch) { ++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], scalar, vlen, arch.c_str()); ++} ++ ++inline void run_cast_test8_s8ic(volk_gnsssdr_fn_8arg_s8ic func, std::vector<void *> &buffs, lv_8sc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { ++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], scalar, vlen, arch.c_str()); ++} ++ ++inline void run_cast_test8_s32f(volk_gnsssdr_fn_8arg_s32f func, std::vector<void *> &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) { ++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], scalar, vlen, arch.c_str()); ++} ++ ++inline void run_cast_test8_s32fc(volk_gnsssdr_fn_8arg_s32fc func, std::vector<void *> &buffs, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { ++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], scalar, vlen, arch.c_str()); ++} ++ ++inline void run_cast_test12(volk_gnsssdr_fn_12arg func, std::vector<void *> &buffs, unsigned int vlen, unsigned int iter, std::string arch) { ++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], buffs[8], buffs[9], buffs[10], buffs[11], vlen, arch.c_str()); ++} ++ ++inline void run_cast_test12_s8i(volk_gnsssdr_fn_12arg_s8i func, std::vector<void *> &buffs, char scalar, unsigned int vlen, unsigned int iter, std::string arch) { ++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], 
buffs[5], buffs[6], buffs[7], buffs[8], buffs[9], buffs[10], buffs[11], scalar, vlen, arch.c_str()); ++} ++ ++inline void run_cast_test12_s8ic(volk_gnsssdr_fn_12arg_s8ic func, std::vector<void *> &buffs, lv_8sc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { ++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], buffs[8], buffs[9], buffs[10], buffs[11], scalar, vlen, arch.c_str()); ++} ++ ++inline void run_cast_test12_s32f(volk_gnsssdr_fn_12arg_s32f func, std::vector<void *> &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) { ++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], buffs[8], buffs[9], buffs[10], buffs[11], scalar, vlen, arch.c_str()); ++} ++ ++inline void run_cast_test12_s32fc(volk_gnsssdr_fn_12arg_s32fc func, std::vector<void *> &buffs, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { ++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], buffs[8], buffs[9], buffs[10], buffs[11], scalar, vlen, arch.c_str()); ++} ++//ADDED BY GNSS-SDR. END ++ + // This function is a nop that helps resolve GNU Radio bugs 582 and 583. + // Without this the cast in run_volk_gnsssdr_tests for tol_i = static_cast<int>(float tol) + // won't happen on armhf (reported on cortex A9 and A15). +@@ -426,7 +492,17 @@ bool run_volk_gnsssdr_tests(volk_gnsssdr + } else { + run_cast_test1_s32f((volk_gnsssdr_fn_1arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); + } +- } else throw "unsupported 1 arg function >1 scalars"; ++ } ++ //ADDED BY GNSS-SDR. 
START ++ else if(inputsc.size() == 1 && !inputsc[0].is_float) { ++ if(inputsc[0].is_complex) { ++ run_cast_test1_s8ic((volk_gnsssdr_fn_1arg_s8ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); ++ } else { ++ run_cast_test1_s8i((volk_gnsssdr_fn_1arg_s8i)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); ++ } ++ } ++ //ADDED BY GNSS-SDR. END ++ else throw "unsupported 1 arg function >1 scalars"; + break; + case 2: + if(inputsc.size() == 0) { +@@ -437,7 +513,17 @@ bool run_volk_gnsssdr_tests(volk_gnsssdr + } else { + run_cast_test2_s32f((volk_gnsssdr_fn_2arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); + } +- } else throw "unsupported 2 arg function >1 scalars"; ++ } ++ //ADDED BY GNSS-SDR. START ++ else if(inputsc.size() == 1 && !inputsc[0].is_float) { ++ if(inputsc[0].is_complex) { ++ run_cast_test2_s8ic((volk_gnsssdr_fn_2arg_s8ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); ++ } else { ++ run_cast_test2_s8i((volk_gnsssdr_fn_2arg_s8i)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); ++ } ++ } ++ //ADDED BY GNSS-SDR. END ++ else throw "unsupported 2 arg function >1 scalars"; + break; + case 3: + if(inputsc.size() == 0) { +@@ -448,11 +534,61 @@ bool run_volk_gnsssdr_tests(volk_gnsssdr + } else { + run_cast_test3_s32f((volk_gnsssdr_fn_3arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); + } +- } else throw "unsupported 3 arg function >1 scalars"; ++ } ++ //ADDED BY GNSS-SDR. START ++ else if(inputsc.size() == 1 && !inputsc[0].is_float) { ++ if(inputsc[0].is_complex) { ++ run_cast_test3_s8ic((volk_gnsssdr_fn_3arg_s8ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); ++ } else { ++ run_cast_test3_s8i((volk_gnsssdr_fn_3arg_s8i)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); ++ } ++ } ++ //ADDED BY GNSS-SDR. 
END ++ else throw "unsupported 3 arg function >1 scalars"; + break; + case 4: + run_cast_test4((volk_gnsssdr_fn_4arg)(manual_func), test_data[i], vlen, iter, arch_list[i]); + break; ++ //ADDED BY GNSS-SDR. START ++ case 8: ++ if(inputsc.size() == 0) { ++ run_cast_test8((volk_gnsssdr_fn_8arg)(manual_func), test_data[i], vlen, iter, arch_list[i]); ++ } else if(inputsc.size() == 1 && inputsc[0].is_float) { ++ if(inputsc[0].is_complex) { ++ run_cast_test8_s32fc((volk_gnsssdr_fn_8arg_s32fc)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); ++ } else { ++ run_cast_test8_s32f((volk_gnsssdr_fn_8arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); ++ } ++ } ++ else if(inputsc.size() == 1 && !inputsc[0].is_float) { ++ if(inputsc[0].is_complex) { ++ run_cast_test8_s8ic((volk_gnsssdr_fn_8arg_s8ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); ++ } else { ++ run_cast_test8_s8i((volk_gnsssdr_fn_8arg_s8i)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); ++ } ++ } ++ else throw "unsupported 8 arg function >1 scalars"; ++ break; ++ case 12: ++ if(inputsc.size() == 0) { ++ run_cast_test12((volk_gnsssdr_fn_12arg)(manual_func), test_data[i], vlen, iter, arch_list[i]); ++ } else if(inputsc.size() == 1 && inputsc[0].is_float) { ++ if(inputsc[0].is_complex) { ++ run_cast_test12_s32fc((volk_gnsssdr_fn_12arg_s32fc)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); ++ } else { ++ run_cast_test12_s32f((volk_gnsssdr_fn_12arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); ++ } ++ } ++ else if(inputsc.size() == 1 && !inputsc[0].is_float) { ++ if(inputsc[0].is_complex) { ++ run_cast_test12_s8ic((volk_gnsssdr_fn_12arg_s8ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); ++ } else { ++ run_cast_test12_s8i((volk_gnsssdr_fn_12arg_s8i)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); ++ } ++ } ++ else throw "unsupported 12 arg function >1 
scalars"; ++ break; ++ //ADDED BY GNSS-SDR. END + default: + throw "no function handler for this signature"; + break; +diff -rupN /Users/andres/Desktop/volk_gnsssdr/lib/qa_utils.h /Users/andres/Desktop/volk_gnsssdr_original/lib/qa_utils.h +--- /Users/andres/Desktop/volk_gnsssdr/lib/qa_utils.h 2014-10-17 05:07:24.000000000 +0200 ++++ /Users/andres/Desktop/volk_gnsssdr_original/lib/qa_utils.h 2014-10-17 04:21:51.000000000 +0200 +@@ -77,4 +77,26 @@ typedef void (*volk_gnsssdr_fn_1arg_s32f + typedef void (*volk_gnsssdr_fn_2arg_s32fc)(void *, void *, lv_32fc_t, unsigned int, const char*); + typedef void (*volk_gnsssdr_fn_3arg_s32fc)(void *, void *, void *, lv_32fc_t, unsigned int, const char*); + ++//ADDED BY GNSS-SDR. START ++typedef void (*volk_gnsssdr_fn_1arg_s8i)(void *, char, unsigned int, const char*); //one input vector, one scalar char input ++typedef void (*volk_gnsssdr_fn_2arg_s8i)(void *, void *, char, unsigned int, const char*); ++typedef void (*volk_gnsssdr_fn_3arg_s8i)(void *, void *, void *, char, unsigned int, const char*); ++typedef void (*volk_gnsssdr_fn_1arg_s8ic)(void *, lv_8sc_t, unsigned int, const char*); //one input vector, one scalar lv_8sc_t vector input ++typedef void (*volk_gnsssdr_fn_2arg_s8ic)(void *, void *, lv_8sc_t, unsigned int, const char*); ++typedef void (*volk_gnsssdr_fn_3arg_s8ic)(void *, void *, void *, lv_8sc_t, unsigned int, const char*); ++ ++typedef void (*volk_gnsssdr_fn_8arg)(void *, void *, void *, void *, void *, void *, void *, void *, unsigned int, const char*); /* eight buffer pointers, then the vector length and the architecture name */ ++typedef void (*volk_gnsssdr_fn_8arg_s32f)(void *, void *, void *, void *, void *, void *, void *, void *, float, unsigned int, const char*); /* the _s32f/_s32fc/_s8i/_s8ic variants insert one scalar of that type after the buffers */ ++typedef void (*volk_gnsssdr_fn_8arg_s32fc)(void *, void *, void *, void *, void *, void *, void *, void *, lv_32fc_t, unsigned int, const char*); ++typedef void (*volk_gnsssdr_fn_8arg_s8i)(void *, void *, void *, void *, void *, void *, void *, void *, char, unsigned int, const char*); ++typedef void 
(*volk_gnsssdr_fn_8arg_s8ic)(void *, void *, void *, void *, void *, void *, void *, void *, lv_8sc_t, unsigned int, const char*); ++ ++typedef void (*volk_gnsssdr_fn_12arg)(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, unsigned int, const char*); /* twelve buffer pointers, then the vector length and the architecture name */ ++typedef void (*volk_gnsssdr_fn_12arg_s32f)(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, float, unsigned int, const char*); ++typedef void (*volk_gnsssdr_fn_12arg_s32fc)(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, lv_32fc_t, unsigned int, const char*); ++typedef void (*volk_gnsssdr_fn_12arg_s8i)(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, char, unsigned int, const char*); ++typedef void (*volk_gnsssdr_fn_12arg_s8ic)(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, lv_8sc_t, unsigned int, const char*); ++//ADDED BY GNSS-SDR. 
END ++ ++ + #endif //VOLK_QA_UTILS_H +diff -rupN /Users/andres/Desktop/volk_gnsssdr/lib/testqa.cc /Users/andres/Desktop/volk_gnsssdr_original/lib/testqa.cc +--- /Users/andres/Desktop/volk_gnsssdr/lib/testqa.cc 2014-10-17 05:07:25.000000000 +0200 ++++ /Users/andres/Desktop/volk_gnsssdr_original/lib/testqa.cc 2014-10-15 01:55:08.000000000 +0200 +@@ -24,6 +24,58 @@ + #include + #include + ++//VOLK PROTOKERNELS OBTAINED FROM THE GNURADIO BASE ++VOLK_RUN_TESTS(volk_gnsssdr_32fc_x2_multiply_32fc, 1e-4, 0, 20462, 1); ++VOLK_RUN_TESTS(volk_gnsssdr_32fc_x2_dot_prod_32fc, 1e-4, 0, 204603, 1); ++VOLK_RUN_TESTS(volk_gnsssdr_32fc_s32fc_multiply_32fc, 1e-4, 0, 20462, 1); ++VOLK_RUN_TESTS(volk_gnsssdr_32fc_conjugate_32fc, 1e-4, 0, 20462, 1); ++VOLK_RUN_TESTS(volk_gnsssdr_32f_x2_add_32f, 1e-4, 0, 20462, 1); ++VOLK_RUN_TESTS(volk_gnsssdr_32f_index_max_16u, 3, 0, 20462, 1); ++VOLK_RUN_TESTS(volk_gnsssdr_32f_accumulator_s32f, 1e-4, 0, 20462, 1); ++VOLK_RUN_TESTS(volk_gnsssdr_32fc_magnitude_squared_32f, 1e-4, 0, 20462, 1); ++VOLK_RUN_TESTS(volk_gnsssdr_32f_s32f_convert_16i, 3, 0, 20462, 1); ++ ++//GNSS-SDR PROTO-KERNELS ++VOLK_RUN_TESTS(volk_gnsssdr_8ic_x2_multiply_8ic, 1e-4, 0, 20462, 1); ++VOLK_RUN_TESTS(volk_gnsssdr_8u_x2_multiply_8u, 1e-4, 0, 20462, 1); ++VOLK_RUN_TESTS(volk_gnsssdr_8ic_x2_dot_prod_8ic, 1e-4, 0, 204603, 1); ++VOLK_RUN_TESTS(volk_gnsssdr_8ic_s8ic_multiply_8ic, 1e-4, 0, 20462, 1); ++VOLK_RUN_TESTS(volk_gnsssdr_8ic_conjugate_8ic, 1e-4, 0, 20462, 1); ++VOLK_RUN_TESTS(volk_gnsssdr_8i_x2_add_8i, 1e-4, 0, 20462, 1); ++VOLK_RUN_TESTS(volk_gnsssdr_8i_index_max_16u, 3, 0, 20462, 1); ++VOLK_RUN_TESTS(volk_gnsssdr_8i_accumulator_s8i, 1e-4, 0, 20462, 1); ++VOLK_RUN_TESTS(volk_gnsssdr_8ic_magnitude_squared_8i, 1e-4, 0, 20462, 1); ++ ++VOLK_RUN_TESTS(volk_gnsssdr_8i_max_s8i, 3, 0, 20462, 1); ++VOLK_RUN_TESTS(volk_gnsssdr_64f_accumulator_64f, 3, 0, 20462, 1); ++ ++VOLK_RUN_TESTS(volk_gnsssdr_32fc_convert_16ic, 3, 0, 20462, 1); ++VOLK_RUN_TESTS(volk_gnsssdr_32fc_s32f_convert_8ic, 
3, 0, 20462, 1); ++VOLK_RUN_TESTS(volk_gnsssdr_32fc_convert_8ic, 3, 0, 20462, 1); ++VOLK_RUN_TESTS(volk_gnsssdr_16i_s32f_convert_32f, 3, 0, 20462, 1); ++ ++VOLK_RUN_TESTS(volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3, 1e-4, 0, 20462, 1); ++VOLK_RUN_TESTS(volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3, 1e-4, 0, 20462, 1); ++VOLK_RUN_TESTS(volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3, 1e-4, 0, 20462, 1); ++VOLK_RUN_TESTS(volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3, 1e-4, 0, 20462, 1); ++VOLK_RUN_TESTS(volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3, 1e-4, 0, 20462, 1); ++ ++VOLK_RUN_TESTS(volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5, 1e-4, 0, 20462, 1); ++VOLK_RUN_TESTS(volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5, 1e-4, 0, 20462, 1); ++VOLK_RUN_TESTS(volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5, 1e-4, 0, 20462, 1); ++VOLK_RUN_TESTS(volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5, 1e-4, 0, 20462, 1); ++VOLK_RUN_TESTS(volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5, 1e-4, 0, 20462, 1); ++VOLK_RUN_TESTS(volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5, 1e-4, 0, 20462, 1); ++ ++VOLK_RUN_TESTS(volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc, 1e-4, 0, 20462, 1); ++VOLK_RUN_TESTS(volk_gnsssdr_s32f_x2_update_local_carrier_32fc, 1e-4, 0, 20462, 1); ++ ++ ++ ++ ++ ++ ++ + //VOLK_RUN_TESTS(volk_gnsssdr_16i_x5_add_quad_16i_x4, 1e-4, 2046, 10000); + //VOLK_RUN_TESTS(volk_gnsssdr_16i_branch_4_state_8, 1e-4, 2046, 10000); + //VOLK_RUN_TESTS(volk_gnsssdr_16i_max_star_16i, 0, 0, 20462, 10000); +diff -rupN /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_32f_x2_add_32f.orc /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_32f_x2_add_32f.orc +--- /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_32f_x2_add_32f.orc 1970-01-01 01:00:00.000000000 +0100 ++++ /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_32f_x2_add_32f.orc 2014-10-15 01:55:08.000000000 +0200 +@@ -0,0 +1,5 @@ ++.function volk_gnsssdr_32f_x2_add_32f_a_orc_impl ++.dest 4 dst ++.source 4 src1 ++.source 4 src2 ++addf dst, src1, src2 
+diff -rupN /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_32fc_s32fc_multiply_32fc.orc /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_32fc_s32fc_multiply_32fc.orc +--- /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_32fc_s32fc_multiply_32fc.orc 1970-01-01 01:00:00.000000000 +0100 ++++ /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_32fc_s32fc_multiply_32fc.orc 2014-10-15 01:55:08.000000000 +0200 +@@ -0,0 +1,18 @@ ++.function volk_gnsssdr_32fc_s32fc_multiply_32fc_a_orc_impl ++.source 8 src1 ++.floatparam 8 scalar ++.dest 8 dst ++.temp 8 iqprod ++.temp 4 real ++.temp 4 imag ++.temp 4 ac ++.temp 4 bd ++.temp 8 swapped ++x2 mulf iqprod, src1, scalar ++splitql bd, ac, iqprod ++subf real, ac, bd ++swaplq swapped, src1 ++x2 mulf iqprod, swapped, scalar ++splitql bd, ac, iqprod ++addf imag, ac, bd ++mergelq dst, real, imag +diff -rupN /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_32fc_x2_multiply_32fc.orc /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_32fc_x2_multiply_32fc.orc +--- /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_32fc_x2_multiply_32fc.orc 1970-01-01 01:00:00.000000000 +0100 ++++ /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_32fc_x2_multiply_32fc.orc 2014-10-15 01:55:08.000000000 +0200 +@@ -0,0 +1,18 @@ ++.function volk_gnsssdr_32fc_x2_multiply_32fc_a_orc_impl ++.source 8 src1 ++.source 8 src2 ++.dest 8 dst ++.temp 8 iqprod ++.temp 4 real ++.temp 4 imag ++.temp 4 ac ++.temp 4 bd ++.temp 8 swapped ++x2 mulf iqprod, src1, src2 ++splitql bd, ac, iqprod ++subf real, ac, bd ++swaplq swapped, src1 ++x2 mulf iqprod, swapped, src2 ++splitql bd, ac, iqprod ++addf imag, ac, bd ++mergelq dst, real, imag +diff -rupN /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8i_accumulator_s8i.orc /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8i_accumulator_s8i.orc +--- /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8i_accumulator_s8i.orc 1970-01-01 01:00:00.000000000 +0100 ++++ 
/Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8i_accumulator_s8i.orc 2014-10-15 01:55:08.000000000 +0200 +@@ -0,0 +1,40 @@ ++#/*! ++# * \file volk_gnsssdr_8i_accumulator_s8i.orc ++# * \brief ORC implementation: 8 bits (char) scalar accumulator ++# * \authors
<ul> ++# *
<li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com ++# * </ul>
++# * ++# * ORC code that implements an accumulator of char values ++# * ++# * ------------------------------------------------------------------------- ++# * ++# * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) ++# * ++# * GNSS-SDR is a software defined Global Navigation ++# * Satellite Systems receiver ++# * ++# * This file is part of GNSS-SDR. ++# * ++# * GNSS-SDR is free software: you can redistribute it and/or modify ++# * it under the terms of the GNU General Public License as published by ++# * the Free Software Foundation, either version 3 of the License, or ++# * (at your option) any later version. ++# * ++# * GNSS-SDR is distributed in the hope that it will be useful, ++# * but WITHOUT ANY WARRANTY; without even the implied warranty of ++# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++# * GNU General Public License for more details. ++# * ++# * You should have received a copy of the GNU General Public License ++# * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>. ++# * ++# * ------------------------------------------------------------------------- ++# */ ++ ++.function volk_gnsssdr_8i_accumulator_s8i_a_orc_impl ++.source 1 src1 ++.accumulator 2 acc ++.temp 2 sum ++mergebw sum, 0, src1 ++accw acc, sum +diff -rupN /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8i_x2_add_8i.orc /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8i_x2_add_8i.orc +--- /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8i_x2_add_8i.orc 1970-01-01 01:00:00.000000000 +0100 ++++ /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8i_x2_add_8i.orc 2014-10-15 01:55:08.000000000 +0200 +@@ -0,0 +1,39 @@ ++#/*! ++# * \file volk_gnsssdr_8i_x2_add_8i.orc ++# * \brief ORC implementation: adds pairs of 8 bits (char) scalars ++# * \authors
<ul> ++# *
<li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com ++# * </ul>
++# * ++# * ORC code that adds pairs of 8 bits (char) scalars ++# * ++# * ------------------------------------------------------------------------- ++# * ++# * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) ++# * ++# * GNSS-SDR is a software defined Global Navigation ++# * Satellite Systems receiver ++# * ++# * This file is part of GNSS-SDR. ++# * ++# * GNSS-SDR is free software: you can redistribute it and/or modify ++# * it under the terms of the GNU General Public License as published by ++# * the Free Software Foundation, either version 3 of the License, or ++# * (at your option) any later version. ++# * ++# * GNSS-SDR is distributed in the hope that it will be useful, ++# * but WITHOUT ANY WARRANTY; without even the implied warranty of ++# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++# * GNU General Public License for more details. ++# * ++# * You should have received a copy of the GNU General Public License ++# * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>. ++# * ++# * ------------------------------------------------------------------------- ++# */ ++ ++.function volk_gnsssdr_8i_x2_add_8i_a_orc_impl ++.dest 1 dst ++.source 1 src1 ++.source 1 src2 ++addb dst, src1, src2 +diff -rupN /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8ic_conjugate_8ic.orc /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8ic_conjugate_8ic.orc +--- /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8ic_conjugate_8ic.orc 1970-01-01 01:00:00.000000000 +0100 ++++ /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8ic_conjugate_8ic.orc 2014-10-15 01:55:08.000000000 +0200 +@@ -0,0 +1,42 @@ ++#/*! ++# * \file volk_gnsssdr_8ic_conjugate_8ic.orc ++# * \brief ORC implementation: calculates the conjugate of a 16 bits vector ++# * \authors
<ul> ++# *
<li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com ++# * </ul>
++# * ++# * ORC code that calculates the conjugate of a ++# * 16 bits vector (8 bits the real part and 8 bits the imaginary part) ++# * result = real - j*imag ++# * ++# * ------------------------------------------------------------------------- ++# * ++# * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) ++# * ++# * GNSS-SDR is a software defined Global Navigation ++# * Satellite Systems receiver ++# * ++# * This file is part of GNSS-SDR. ++# * ++# * GNSS-SDR is free software: you can redistribute it and/or modify ++# * it under the terms of the GNU General Public License as published by ++# * the Free Software Foundation, either version 3 of the License, or ++# * (at your option) any later version. ++# * ++# * GNSS-SDR is distributed in the hope that it will be useful, ++# * but WITHOUT ANY WARRANTY; without even the implied warranty of ++# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++# * GNU General Public License for more details. ++# * ++# * You should have received a copy of the GNU General Public License ++# * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>. ++# * ++# * ------------------------------------------------------------------------- ++# */ ++ ++.function volk_gnsssdr_8ic_conjugate_8ic_a_orc_impl ++.source 2 src1 ++.dest 2 dst ++.temp 2 merged ++mergebw merged, 1, -1 ++x2 mullb dst, merged, src1 +diff -rupN /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8ic_magnitude_squared_8i.orc /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8ic_magnitude_squared_8i.orc +--- /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8ic_magnitude_squared_8i.orc 1970-01-01 01:00:00.000000000 +0100 ++++ /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8ic_magnitude_squared_8i.orc 2014-10-15 01:55:08.000000000 +0200 +@@ -0,0 +1,45 @@ ++#/*! ++# * \file volk_gnsssdr_8ic_magnitude_squared_8i.orc ++# * \brief ORC implementation: calculates the magnitude squared of a 16 bits vector ++# * \authors
<ul> ++# *
<li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com ++# * </ul>
++# * ++# * ORC code that calculates the magnitude squared of a ++# * 16 bits vector (8 bits the real part and 8 bits the imaginary part) ++# * result = (real*real) + (imag*imag) ++# * ++# * ------------------------------------------------------------------------- ++# * ++# * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) ++# * ++# * GNSS-SDR is a software defined Global Navigation ++# * Satellite Systems receiver ++# * ++# * This file is part of GNSS-SDR. ++# * ++# * GNSS-SDR is free software: you can redistribute it and/or modify ++# * it under the terms of the GNU General Public License as published by ++# * the Free Software Foundation, either version 3 of the License, or ++# * (at your option) any later version. ++# * ++# * GNSS-SDR is distributed in the hope that it will be useful, ++# * but WITHOUT ANY WARRANTY; without even the implied warranty of ++# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++# * GNU General Public License for more details. ++# * ++# * You should have received a copy of the GNU General Public License ++# * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>. ++# * ++# * ------------------------------------------------------------------------- ++# */ ++ ++.function volk_gnsssdr_8ic_magnitude_squared_8i_a_orc_impl ++.source 2 src1 ++.dest 1 dst ++.temp 2 iqprod ++.temp 1 ac ++.temp 1 bd ++x2 mullb iqprod, src1, src1 ++splitwb bd, ac, iqprod ++addb dst, ac, bd +diff -rupN /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8ic_s8ic_multiply_8ic.orc /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8ic_s8ic_multiply_8ic.orc +--- /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8ic_s8ic_multiply_8ic.orc 1970-01-01 01:00:00.000000000 +0100 ++++ /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8ic_s8ic_multiply_8ic.orc 2014-10-15 01:55:08.000000000 +0200 +@@ -0,0 +1,58 @@ ++#/*!
++# * \file volk_gnsssdr_8ic_s8ic_multiply_8ic.orc ++# * \brief ORC implementation: multiplies a group of 16 bits vectors by one constant vector ++# * \authors
<ul> ++# *
<li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com ++# * </ul>
++# * ++# * ORC code that multiplies a group of 16 bits vectors ++# * (8 bits the real part and 8 bits the imaginary part) by one constant vector ++# * ++# * ------------------------------------------------------------------------- ++# * ++# * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) ++# * ++# * GNSS-SDR is a software defined Global Navigation ++# * Satellite Systems receiver ++# * ++# * This file is part of GNSS-SDR. ++# * ++# * GNSS-SDR is free software: you can redistribute it and/or modify ++# * it under the terms of the GNU General Public License as published by ++# * the Free Software Foundation, either version 3 of the License, or ++# * (at your option) any later version. ++# * ++# * GNSS-SDR is distributed in the hope that it will be useful, ++# * but WITHOUT ANY WARRANTY; without even the implied warranty of ++# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++# * GNU General Public License for more details. ++# * ++# * You should have received a copy of the GNU General Public License ++# * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
++# * ++# * ------------------------------------------------------------------------- ++# */ ++ ++.function volk_gnsssdr_8ic_s8ic_multiply_8ic_a_orc_impl ++.source 2 src1 ++.param 2 src2real ++.param 2 src2imag ++.dest 2 dst ++.temp 2 iqprod ++.temp 1 real ++.temp 1 imag ++.temp 1 rr ++.temp 1 ii ++.temp 1 ri ++.temp 1 ir ++x2 mullb iqprod, src1, src2real ++splitwb ir, rr, iqprod ++x2 mullb iqprod, src1, src2imag ++splitwb ii, ri, iqprod ++subb real, rr, ii ++addb imag, ri, ir ++mergebw dst, real, imag ++ ++ ++ ++ +diff -rupN /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8ic_x2_dot_prod_8ic.orc /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8ic_x2_dot_prod_8ic.orc +--- /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8ic_x2_dot_prod_8ic.orc 1970-01-01 01:00:00.000000000 +0100 ++++ /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8ic_x2_dot_prod_8ic.orc 2014-10-15 01:55:08.000000000 +0200 +@@ -0,0 +1,59 @@ ++#/*! ++# * \file volk_gnsssdr_8ic_x2_dot_prod_8ic.orc ++# * \brief ORC implementation: multiplies two 16 bits vectors and accumulates them ++# * \authors
<ul> ++# *
<li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com ++# * </ul>
++# * ++# * ORC code that multiplies two 16 bits vectors (8 bits the real part ++# * and 8 bits the imaginary part) and accumulates them ++# * ++# * ------------------------------------------------------------------------- ++# * ++# * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) ++# * ++# * GNSS-SDR is a software defined Global Navigation ++# * Satellite Systems receiver ++# * ++# * This file is part of GNSS-SDR. ++# * ++# * GNSS-SDR is free software: you can redistribute it and/or modify ++# * it under the terms of the GNU General Public License as published by ++# * the Free Software Foundation, either version 3 of the License, or ++# * (at your option) any later version. ++# * ++# * GNSS-SDR is distributed in the hope that it will be useful, ++# * but WITHOUT ANY WARRANTY; without even the implied warranty of ++# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++# * GNU General Public License for more details. ++# * ++# * You should have received a copy of the GNU General Public License ++# * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
++# * ++# * ------------------------------------------------------------------------- ++# */ ++ ++.function volk_gnsssdr_8ic_x2_dot_prod_8ic_a_orc_impl ++.source 2 src1 ++.source 2 src2 ++.accumulator 2 accreal ++.accumulator 2 accimag ++.temp 2 iqprod ++.temp 1 real ++.temp 1 imag ++.temp 2 real2 ++.temp 2 imag2 ++.temp 1 ac ++.temp 1 bd ++.temp 2 swapped ++x2 mullb iqprod, src1, src2 ++splitwb bd, ac, iqprod ++subb real, ac, bd ++swapw swapped, src1 ++x2 mullb iqprod, swapped, src2 ++splitwb bd, ac, iqprod ++addb imag, ac, bd ++mergebw real2, 0, real ++accw accreal, real2 ++mergebw imag2, 0, imag ++accw accimag, imag2 +diff -rupN /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8ic_x2_multiply_8ic.orc /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8ic_x2_multiply_8ic.orc +--- /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8ic_x2_multiply_8ic.orc 1970-01-01 01:00:00.000000000 +0100 ++++ /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8ic_x2_multiply_8ic.orc 2014-10-15 01:55:08.000000000 +0200 +@@ -0,0 +1,57 @@ ++#/*! ++# * \file volk_gnsssdr_8ic_x2_multiply_8ic.orc ++# * \brief ORC implementation: multiplies two 16 bits vectors ++# * \authors
<ul> ++# *
<li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com ++# * </ul>
++# * ++# * ORC code that multiplies two 16 bits vectors (8 bits the real part ++# * and 8 bits the imaginary part) ++# * ++# * ------------------------------------------------------------------------- ++# * ++# * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) ++# * ++# * GNSS-SDR is a software defined Global Navigation ++# * Satellite Systems receiver ++# * ++# * This file is part of GNSS-SDR. ++# * ++# * GNSS-SDR is free software: you can redistribute it and/or modify ++# * it under the terms of the GNU General Public License as published by ++# * the Free Software Foundation, either version 3 of the License, or ++# * (at your option) any later version. ++# * ++# * GNSS-SDR is distributed in the hope that it will be useful, ++# * but WITHOUT ANY WARRANTY; without even the implied warranty of ++# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++# * GNU General Public License for more details. ++# * ++# * You should have received a copy of the GNU General Public License ++# * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
++# * ++# * ------------------------------------------------------------------------- ++# */ ++ ++.function volk_gnsssdr_8ic_x2_multiply_8ic_a_orc_impl ++.source 2 src1 ++.source 2 src2 ++.dest 2 dst ++.temp 2 iqprod ++.temp 1 real ++.temp 1 imag ++.temp 1 ac ++.temp 1 bd ++.temp 2 swapped ++x2 mullb iqprod, src1, src2 ++splitwb bd, ac, iqprod ++subb real, ac, bd ++swapw swapped, src1 ++x2 mullb iqprod, swapped, src2 ++splitwb bd, ac, iqprod ++addb imag, ac, bd ++mergebw dst, real, imag ++ ++ ++ ++ +diff -rupN /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3.orc /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3.orc +--- /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3.orc 1970-01-01 01:00:00.000000000 +0100 ++++ /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3.orc 2014-10-15 01:55:08.000000000 +0200 +@@ -0,0 +1,139 @@ ++#/*! ++# * \file volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3.orc ++# * \brief ORC implementation: performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation with 16 bits vectors ++# * \authors
<ul> ++# *
<li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com ++# * </ul>
++# * ++# * ORC code that performs the carrier wipe-off mixing and the ++# * Early, Prompt, and Late correlation with 16 bits vectors (8 bits the ++# * real part and 8 bits the imaginary part): ++# * - The carrier wipe-off is done by multiplying the input signal by the ++# * carrier (multiplication of 16 bits vectors). It returns the input ++# * signal in base band (BB) ++# * - Early values are calculated by multiplying the input signal in BB by the ++# * early code (multiplication of 16 bits vectors), accumulating the results ++# * - Prompt values are calculated by multiplying the input signal in BB by the ++# * prompt code (multiplication of 16 bits vectors), accumulating the results ++# * - Late values are calculated by multiplying the input signal in BB by the ++# * late code (multiplication of 16 bits vectors), accumulating the results ++# * ++# * ------------------------------------------------------------------------- ++# * ++# * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) ++# * ++# * GNSS-SDR is a software defined Global Navigation ++# * Satellite Systems receiver ++# * ++# * This file is part of GNSS-SDR. ++# * ++# * GNSS-SDR is free software: you can redistribute it and/or modify ++# * it under the terms of the GNU General Public License as published by ++# * the Free Software Foundation, either version 3 of the License, or ++# * (at your option) any later version. ++# * ++# * GNSS-SDR is distributed in the hope that it will be useful, ++# * but WITHOUT ANY WARRANTY; without even the implied warranty of ++# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++# * GNU General Public License for more details. ++# * ++# * You should have received a copy of the GNU General Public License ++# * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
++# * ++# * ------------------------------------------------------------------------- ++# */ ++ ++.function volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_first_a_orc_impl ++.source 2 input ++.source 2 carrier ++.source 2 E_code ++.source 2 P_code ++.accumulator 2 E_out_real ++.accumulator 2 E_out_imag ++.accumulator 2 P_out_real ++.accumulator 2 P_out_imag ++.temp 2 bb_signal_sample ++.temp 2 iqprod ++.temp 1 real ++.temp 1 imag ++.temp 1 ac ++.temp 1 bd ++.temp 2 swapped ++ ++.temp 2 real2 ++.temp 2 imag2 ++ ++x2 mullb iqprod, input, carrier ++splitwb bd, ac, iqprod ++subb real, ac, bd ++swapw swapped, input ++x2 mullb iqprod, swapped, carrier ++splitwb bd, ac, iqprod ++addb imag, ac, bd ++mergebw bb_signal_sample, real, imag ++ ++swapw swapped, bb_signal_sample ++ ++x2 mullb iqprod, bb_signal_sample, E_code ++splitwb bd, ac, iqprod ++subb real, ac, bd ++x2 mullb iqprod, swapped, E_code ++splitwb bd, ac, iqprod ++addb imag, ac, bd ++mergebw real2, 0, real ++mergebw imag2, 0, imag ++accw E_out_real, real2 ++accw E_out_imag, imag2 ++ ++x2 mullb iqprod, bb_signal_sample, P_code ++splitwb bd, ac, iqprod ++subb real, ac, bd ++x2 mullb iqprod, swapped, P_code ++splitwb bd, ac, iqprod ++addb imag, ac, bd ++mergebw real2, 0, real ++mergebw imag2, 0, imag ++accw P_out_real, real2 ++accw P_out_imag, imag2 ++ ++.function volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_second_a_orc_impl ++.source 2 input ++.source 2 carrier ++.source 2 L_code ++.accumulator 2 L_out_real ++.accumulator 2 L_out_imag ++ ++.temp 2 bb_signal_sample ++.temp 2 iqprod ++.temp 1 real ++.temp 1 imag ++.temp 1 ac ++.temp 1 bd ++.temp 2 swapped ++ ++.temp 2 real2 ++.temp 2 imag2 ++ ++x2 mullb iqprod, input, carrier ++splitwb bd, ac, iqprod ++subb real, ac, bd ++swapw swapped, input ++x2 mullb iqprod, swapped, carrier ++splitwb bd, ac, iqprod ++addb imag, ac, bd ++mergebw bb_signal_sample, real, imag ++ ++swapw swapped, bb_signal_sample ++ ++x2 mullb iqprod, bb_signal_sample, L_code ++splitwb bd, ac, iqprod ++subb real, 
ac, bd ++x2 mullb iqprod, swapped, L_code ++splitwb bd, ac, iqprod ++addb imag, ac, bd ++mergebw real2, 0, real ++mergebw imag2, 0, imag ++accw L_out_real, real2 ++accw L_out_imag, imag2 ++ ++ +diff -rupN /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8u_x2_multiply_8u.orc /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8u_x2_multiply_8u.orc +--- /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8u_x2_multiply_8u.orc 1970-01-01 01:00:00.000000000 +0100 ++++ /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8u_x2_multiply_8u.orc 2014-10-15 01:55:08.000000000 +0200 +@@ -0,0 +1,39 @@ ++#/*! ++# * \file volk_gnsssdr_8u_x2_multiply_8u.orc ++# * \brief ORC implementation: multiplies unsigned char values ++# * \authors
<ul> ++# *
<li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com ++# * </ul>
++# * ++# * ORC code that multiplies unsigned char values (8 bits data) ++# * ++# * ------------------------------------------------------------------------- ++# * ++# * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) ++# * ++# * GNSS-SDR is a software defined Global Navigation ++# * Satellite Systems receiver ++# * ++# * This file is part of GNSS-SDR. ++# * ++# * GNSS-SDR is free software: you can redistribute it and/or modify ++# * it under the terms of the GNU General Public License as published by ++# * the Free Software Foundation, either version 3 of the License, or ++# * (at your option) any later version. ++# * ++# * GNSS-SDR is distributed in the hope that it will be useful, ++# * but WITHOUT ANY WARRANTY; without even the implied warranty of ++# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++# * GNU General Public License for more details. ++# * ++# * You should have received a copy of the GNU General Public License ++# * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
++# * ++# * ------------------------------------------------------------------------- ++# */ ++ ++.function volk_gnsssdr_8u_x2_multiply_8u_a_orc_impl ++.source 1 src1 ++.source 1 src2 ++.dest 1 dst ++mullb dst, src1, src2 +diff -rupN /Users/andres/Desktop/volk_gnsssdr/patches for generating volk_gnsssdr/2014-10-17_Patch.patch /Users/andres/Desktop/volk_gnsssdr_original/patches for generating volk_gnsssdr/2014-10-17_Patch.patch +--- /Users/andres/Desktop/volk_gnsssdr/patches for generating volk_gnsssdr/2014-10-17_Patch.patch 1970-01-01 01:00:00.000000000 +0100 ++++ /Users/andres/Desktop/volk_gnsssdr_original/patches for generating volk_gnsssdr/2014-10-17_Patch.patch 2014-10-17 04:29:54.000000000 +0200 +@@ -0,0 +1,329 @@ ++diff -rupN /Users/andres/Desktop/volk_gnsssdr/lib/CMakeLists.txt /Users/andres/Desktop/volk_gnsssdr_original/lib/CMakeLists.txt ++--- /Users/andres/Desktop/volk_gnsssdr/lib/CMakeLists.txt 2014-10-17 04:26:38.000000000 +0200 +++++ /Users/andres/Desktop/volk_gnsssdr_original/lib/CMakeLists.txt 2014-10-17 04:17:37.000000000 +0200 ++@@ -517,7 +517,19 @@ if(MSVC) ++ endif() ++ ++ #create the volk_gnsssdr runtime library ++-add_library(volk_gnsssdr SHARED ${volk_gnsssdr_sources}) +++ +++#MODIFICATIONS BY GNSS-SDR +++file(GLOB orc ${CMAKE_SOURCE_DIR}/orc/*.orc) +++file(GLOB CommonMacros ${CMAKE_SOURCE_DIR}/kernels/CommonMacros/*.h ${CMAKE_SOURCE_DIR}/kernels/CommonMacros/README.txt) +++ +++#add_library(volk_gnsssdr SHARED ${volk_gnsssdr_sources}) +++add_library(volk_gnsssdr SHARED ${volk_gnsssdr_sources} ${h_files} ${CommonMacros} ${orc}) +++ +++source_group("Kernels" FILES ${h_files}) +++source_group("Common Macros" FILES ${CommonMacros}) +++source_group("ORC Files" FILES ${orc}) +++#END OF MODIFICATIONS +++ ++ target_link_libraries(volk_gnsssdr ${volk_gnsssdr_libraries}) ++ set_target_properties(volk_gnsssdr PROPERTIES SOVERSION ${LIBVER}) ++ set_target_properties(volk_gnsssdr PROPERTIES DEFINE_SYMBOL "volk_gnsssdr_EXPORTS") ++diff -rupN 
/Users/andres/Desktop/volk_gnsssdr/lib/qa_utils.cc /Users/andres/Desktop/volk_gnsssdr_original/lib/qa_utils.cc ++--- /Users/andres/Desktop/volk_gnsssdr/lib/qa_utils.cc 2014-10-17 04:26:39.000000000 +0200 +++++ /Users/andres/Desktop/volk_gnsssdr_original/lib/qa_utils.cc 2014-10-17 04:21:03.000000000 +0200 ++@@ -217,6 +217,72 @@ inline void run_cast_test3_s32fc(volk_gn ++ while(iter--) func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str()); ++ } ++ +++//ADDED BY GNSS-SDR. START +++inline void run_cast_test1_s8i(volk_gnsssdr_fn_1arg_s8i func, std::vector &buffs, char scalar, unsigned int vlen, unsigned int iter, std::string arch) { +++ while(iter--) func(buffs[0], scalar, vlen, arch.c_str()); +++} +++ +++inline void run_cast_test2_s8i(volk_gnsssdr_fn_2arg_s8i func, std::vector &buffs, char scalar, unsigned int vlen, unsigned int iter, std::string arch) { +++ while(iter--) func(buffs[0], buffs[1], scalar, vlen, arch.c_str()); +++} +++ +++inline void run_cast_test3_s8i(volk_gnsssdr_fn_3arg_s8i func, std::vector &buffs, char scalar, unsigned int vlen, unsigned int iter, std::string arch) { +++ while(iter--) func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str()); +++} +++ +++inline void run_cast_test1_s8ic(volk_gnsssdr_fn_1arg_s8ic func, std::vector &buffs, lv_8sc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { +++ while(iter--) func(buffs[0], scalar, vlen, arch.c_str()); +++} +++ +++inline void run_cast_test2_s8ic(volk_gnsssdr_fn_2arg_s8ic func, std::vector &buffs, lv_8sc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { +++ while(iter--) func(buffs[0], buffs[1], scalar, vlen, arch.c_str()); +++} +++ +++inline void run_cast_test3_s8ic(volk_gnsssdr_fn_3arg_s8ic func, std::vector &buffs, lv_8sc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { +++ while(iter--) func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str()); +++} +++ +++inline void run_cast_test8(volk_gnsssdr_fn_8arg func, std::vector 
&buffs, unsigned int vlen, unsigned int iter, std::string arch) { +++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], vlen, arch.c_str()); +++} +++ +++inline void run_cast_test8_s8i(volk_gnsssdr_fn_8arg_s8i func, std::vector &buffs, char scalar, unsigned int vlen, unsigned int iter, std::string arch) { +++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], scalar, vlen, arch.c_str()); +++} +++ +++inline void run_cast_test8_s8ic(volk_gnsssdr_fn_8arg_s8ic func, std::vector &buffs, lv_8sc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { +++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], scalar, vlen, arch.c_str()); +++} +++ +++inline void run_cast_test8_s32f(volk_gnsssdr_fn_8arg_s32f func, std::vector &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) { +++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], scalar, vlen, arch.c_str()); +++} +++ +++inline void run_cast_test8_s32fc(volk_gnsssdr_fn_8arg_s32fc func, std::vector &buffs, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { +++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], scalar, vlen, arch.c_str()); +++} +++ +++inline void run_cast_test12(volk_gnsssdr_fn_12arg func, std::vector &buffs, unsigned int vlen, unsigned int iter, std::string arch) { +++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], buffs[8], buffs[9], buffs[10], buffs[11], vlen, arch.c_str()); +++} +++ +++inline void run_cast_test12_s8i(volk_gnsssdr_fn_12arg_s8i func, std::vector &buffs, char scalar, unsigned int vlen, unsigned int iter, std::string arch) { +++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], buffs[8], buffs[9], 
buffs[10], buffs[11], scalar, vlen, arch.c_str()); +++} +++ +++inline void run_cast_test12_s8ic(volk_gnsssdr_fn_12arg_s8ic func, std::vector &buffs, lv_8sc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { +++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], buffs[8], buffs[9], buffs[10], buffs[11], scalar, vlen, arch.c_str()); +++} +++ +++inline void run_cast_test12_s32f(volk_gnsssdr_fn_12arg_s32f func, std::vector &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) { +++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], buffs[8], buffs[9], buffs[10], buffs[11], scalar, vlen, arch.c_str()); +++} +++ +++inline void run_cast_test12_s32fc(volk_gnsssdr_fn_12arg_s32fc func, std::vector &buffs, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { +++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], buffs[8], buffs[9], buffs[10], buffs[11], scalar, vlen, arch.c_str()); +++} +++//ADDED BY GNSS-SDR. END +++ ++ // This function is a nop that helps resolve GNU Radio bugs 582 and 583. ++ // Without this the cast in run_volk_gnsssdr_tests for tol_i = static_cast(float tol) ++ // won't happen on armhf (reported on cortex A9 and A15). ++@@ -426,7 +492,17 @@ bool run_volk_gnsssdr_tests(volk_gnsssdr ++ } else { ++ run_cast_test1_s32f((volk_gnsssdr_fn_1arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); ++ } ++- } else throw "unsupported 1 arg function >1 scalars"; +++ } +++ //ADDED BY GNSS-SDR. 
START +++ else if(inputsc.size() == 1 && !inputsc[0].is_float) { +++ if(inputsc[0].is_complex) { +++ run_cast_test1_s8ic((volk_gnsssdr_fn_1arg_s8ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); +++ } else { +++ run_cast_test1_s8i((volk_gnsssdr_fn_1arg_s8i)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); +++ } +++ } +++ //ADDED BY GNSS-SDR. END +++ else throw "unsupported 1 arg function >1 scalars"; ++ break; ++ case 2: ++ if(inputsc.size() == 0) { ++@@ -437,7 +513,17 @@ bool run_volk_gnsssdr_tests(volk_gnsssdr ++ } else { ++ run_cast_test2_s32f((volk_gnsssdr_fn_2arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); ++ } ++- } else throw "unsupported 2 arg function >1 scalars"; +++ } +++ //ADDED BY GNSS-SDR. START +++ else if(inputsc.size() == 1 && !inputsc[0].is_float) { +++ if(inputsc[0].is_complex) { +++ run_cast_test2_s8ic((volk_gnsssdr_fn_2arg_s8ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); +++ } else { +++ run_cast_test2_s8i((volk_gnsssdr_fn_2arg_s8i)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); +++ } +++ } +++ //ADDED BY GNSS-SDR. END +++ else throw "unsupported 2 arg function >1 scalars"; ++ break; ++ case 3: ++ if(inputsc.size() == 0) { ++@@ -448,11 +534,61 @@ bool run_volk_gnsssdr_tests(volk_gnsssdr ++ } else { ++ run_cast_test3_s32f((volk_gnsssdr_fn_3arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); ++ } ++- } else throw "unsupported 3 arg function >1 scalars"; +++ } +++ //ADDED BY GNSS-SDR. START +++ else if(inputsc.size() == 1 && !inputsc[0].is_float) { +++ if(inputsc[0].is_complex) { +++ run_cast_test3_s8ic((volk_gnsssdr_fn_3arg_s8ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); +++ } else { +++ run_cast_test3_s8i((volk_gnsssdr_fn_3arg_s8i)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); +++ } +++ } +++ //ADDED BY GNSS-SDR. 
END +++ else throw "unsupported 3 arg function >1 scalars"; ++ break; ++ case 4: ++ run_cast_test4((volk_gnsssdr_fn_4arg)(manual_func), test_data[i], vlen, iter, arch_list[i]); ++ break; +++ //ADDED BY GNSS-SDR. START +++ case 8: +++ if(inputsc.size() == 0) { +++ run_cast_test8((volk_gnsssdr_fn_8arg)(manual_func), test_data[i], vlen, iter, arch_list[i]); +++ } else if(inputsc.size() == 1 && inputsc[0].is_float) { +++ if(inputsc[0].is_complex) { +++ run_cast_test8_s32fc((volk_gnsssdr_fn_8arg_s32fc)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); +++ } else { +++ run_cast_test8_s32f((volk_gnsssdr_fn_8arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); +++ } +++ } +++ else if(inputsc.size() == 1 && !inputsc[0].is_float) { +++ if(inputsc[0].is_complex) { +++ run_cast_test8_s8ic((volk_gnsssdr_fn_8arg_s8ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); +++ } else { +++ run_cast_test8_s8i((volk_gnsssdr_fn_8arg_s8i)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); +++ } +++ } +++ else throw "unsupported 8 arg function >1 scalars"; +++ break; +++ case 12: +++ if(inputsc.size() == 0) { +++ run_cast_test12((volk_gnsssdr_fn_12arg)(manual_func), test_data[i], vlen, iter, arch_list[i]); +++ } else if(inputsc.size() == 1 && inputsc[0].is_float) { +++ if(inputsc[0].is_complex) { +++ run_cast_test12_s32fc((volk_gnsssdr_fn_12arg_s32fc)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); +++ } else { +++ run_cast_test12_s32f((volk_gnsssdr_fn_12arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); +++ } +++ } +++ else if(inputsc.size() == 1 && !inputsc[0].is_float) { +++ if(inputsc[0].is_complex) { +++ run_cast_test12_s8ic((volk_gnsssdr_fn_12arg_s8ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); +++ } else { +++ run_cast_test12_s8i((volk_gnsssdr_fn_12arg_s8i)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); +++ } +++ } +++ else 
throw "unsupported 12 arg function >1 scalars"; +++ break; +++ //ADDED BY GNSS-SDR. END ++ default: ++ throw "no function handler for this signature"; ++ break; ++diff -rupN /Users/andres/Desktop/volk_gnsssdr/lib/qa_utils.h /Users/andres/Desktop/volk_gnsssdr_original/lib/qa_utils.h ++--- /Users/andres/Desktop/volk_gnsssdr/lib/qa_utils.h 2014-10-17 04:26:39.000000000 +0200 +++++ /Users/andres/Desktop/volk_gnsssdr_original/lib/qa_utils.h 2014-10-17 04:21:51.000000000 +0200 ++@@ -77,4 +77,26 @@ typedef void (*volk_gnsssdr_fn_1arg_s32f ++ typedef void (*volk_gnsssdr_fn_2arg_s32fc)(void *, void *, lv_32fc_t, unsigned int, const char*); ++ typedef void (*volk_gnsssdr_fn_3arg_s32fc)(void *, void *, void *, lv_32fc_t, unsigned int, const char*); ++ +++//ADDED BY GNSS-SDR. START +++typedef void (*volk_gnsssdr_fn_1arg_s8i)(void *, char, unsigned int, const char*); //one input vector, one scalar char input +++typedef void (*volk_gnsssdr_fn_2arg_s8i)(void *, void *, char, unsigned int, const char*); +++typedef void (*volk_gnsssdr_fn_3arg_s8i)(void *, void *, void *, char, unsigned int, const char*); +++typedef void (*volk_gnsssdr_fn_1arg_s8ic)(void *, lv_8sc_t, unsigned int, const char*); //one input vector, one scalar lv_8sc_t vector input +++typedef void (*volk_gnsssdr_fn_2arg_s8ic)(void *, void *, lv_8sc_t, unsigned int, const char*); +++typedef void (*volk_gnsssdr_fn_3arg_s8ic)(void *, void *, void *, lv_8sc_t, unsigned int, const char*); +++ +++typedef void (*volk_gnsssdr_fn_8arg)(void *, void *, void *, void *, void *, void *, void *, void *, unsigned int, const char*); +++typedef void (*volk_gnsssdr_fn_8arg_s32f)(void *, void *, void *, void *, void *, void *, void *, void *, float, unsigned int, const char*); +++typedef void (*volk_gnsssdr_fn_8arg_s32fc)(void *, void *, void *, void *, void *, void *, void *, void *, lv_32fc_t, unsigned int, const char*); +++typedef void (*volk_gnsssdr_fn_8arg_s8i)(void *, void *, void *, void *, void *, void *, void *, void *, char, 
unsigned int, const char*); +++typedef void (*volk_gnsssdr_fn_8arg_s8ic)(void *, void *, void *, void *, void *, void *, void *, void *, lv_8sc_t, unsigned int, const char*); +++ +++typedef void (*volk_gnsssdr_fn_12arg)(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, unsigned int, const char*); +++typedef void (*volk_gnsssdr_fn_12arg_s32f)(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, float, unsigned int, const char*); +++typedef void (*volk_gnsssdr_fn_12arg_s32fc)(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, lv_32fc_t, unsigned int, const char*); +++typedef void (*volk_gnsssdr_fn_12arg_s8i)(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, char, unsigned int, const char*); +++typedef void (*volk_gnsssdr_fn_12arg_s8ic)(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, lv_8sc_t, unsigned int, const char*); +++//ADDED BY GNSS-SDR. END +++ +++ ++ #endif //VOLK_QA_UTILS_H ++diff -rupN /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr.tmpl.h /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr.tmpl.h ++--- /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr.tmpl.h 2014-10-17 04:26:39.000000000 +0200 +++++ /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr.tmpl.h 2014-10-17 04:23:30.000000000 +0200 ++@@ -19,8 +19,8 @@ ++ * Boston, MA 02110-1301, USA. 
++ */ ++ ++-#ifndef INCLUDED_VOLK_RUNTIME ++-#define INCLUDED_VOLK_RUNTIME +++#ifndef INCLUDED_VOLK_GNSSSDR_RUNTIME +++#define INCLUDED_VOLK_GNSSSDR_RUNTIME ++ ++ #include ++ #include ++@@ -91,4 +91,4 @@ extern VOLK_API volk_gnsssdr_func_desc_t ++ ++ __VOLK_DECL_END ++ ++-#endif /*INCLUDED_VOLK_RUNTIME*/ +++#endif /*INCLUDED_VOLK_GNSSSDR_RUNTIME*/ ++diff -rupN /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_config_fixed.tmpl.h /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_config_fixed.tmpl.h ++--- /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_config_fixed.tmpl.h 2014-10-17 04:26:39.000000000 +0200 +++++ /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_config_fixed.tmpl.h 2014-10-17 04:22:58.000000000 +0200 ++@@ -19,11 +19,11 @@ ++ * Boston, MA 02110-1301, USA. ++ */ ++ ++-#ifndef INCLUDED_VOLK_CONFIG_FIXED_H ++-#define INCLUDED_VOLK_CONFIG_FIXED_H +++#ifndef INCLUDED_VOLK_GNSSSDR_CONFIG_FIXED_H +++#define INCLUDED_VOLK_GNSSSDR_CONFIG_FIXED_H ++ ++ #for $i, $arch in enumerate($archs) ++ #define LV_$(arch.name.upper()) $i ++ #end for ++ ++-#endif /*INCLUDED_VOLK_CONFIG_FIXED*/ +++#endif /*INCLUDED_VOLK_GNSSSDR_CONFIG_FIXED*/ ++diff -rupN /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_cpu.tmpl.h /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_cpu.tmpl.h ++--- /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_cpu.tmpl.h 2014-10-17 04:26:39.000000000 +0200 +++++ /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_cpu.tmpl.h 2014-10-17 04:23:07.000000000 +0200 ++@@ -19,8 +19,8 @@ ++ * Boston, MA 02110-1301, USA. 
++ */ ++ ++-#ifndef INCLUDED_VOLK_CPU_H ++-#define INCLUDED_VOLK_CPU_H +++#ifndef INCLUDED_VOLK_GNSSSDR_CPU_H +++#define INCLUDED_VOLK_GNSSSDR_CPU_H ++ ++ #include ++ ++@@ -39,4 +39,4 @@ unsigned int volk_gnsssdr_get_lvarch (); ++ ++ __VOLK_DECL_END ++ ++-#endif /*INCLUDED_VOLK_CPU_H*/ +++#endif /*INCLUDED_VOLK_GNSSSDR_CPU_H*/ ++diff -rupN /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_machines.tmpl.h /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_machines.tmpl.h ++--- /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_machines.tmpl.h 2014-10-17 04:26:39.000000000 +0200 +++++ /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_machines.tmpl.h 2014-10-17 04:23:16.000000000 +0200 ++@@ -19,8 +19,8 @@ ++ * Boston, MA 02110-1301, USA. ++ */ ++ ++-#ifndef INCLUDED_LIBVOLK_MACHINES_H ++-#define INCLUDED_LIBVOLK_MACHINES_H +++#ifndef INCLUDED_LIBVOLK_GNSSSDR_MACHINES_H +++#define INCLUDED_LIBVOLK_GNSSSDR_MACHINES_H ++ ++ #include ++ #include ++@@ -52,4 +52,4 @@ extern struct volk_gnsssdr_machine volk_ ++ ++ __VOLK_DECL_END ++ ++-#endif //INCLUDED_LIBVOLK_MACHINES_H +++#endif //INCLUDED_LIBVOLK_GNSSSDR_MACHINES_H ++diff -rupN /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_typedefs.tmpl.h /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_typedefs.tmpl.h ++--- /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_typedefs.tmpl.h 2014-10-17 04:26:39.000000000 +0200 +++++ /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_typedefs.tmpl.h 2014-10-17 04:23:23.000000000 +0200 ++@@ -19,8 +19,8 @@ ++ * Boston, MA 02110-1301, USA. 
++ */ ++ ++-#ifndef INCLUDED_VOLK_TYPEDEFS ++-#define INCLUDED_VOLK_TYPEDEFS +++#ifndef INCLUDED_VOLK_GNSSSDR_TYPEDEFS +++#define INCLUDED_VOLK_GNSSSDR_TYPEDEFS ++ ++ #include ++ #include ++@@ -29,4 +29,4 @@ ++ typedef void (*$(kern.pname))($kern.arglist_types); ++ #end for ++ ++-#endif /*INCLUDED_VOLK_TYPEDEFS*/ +++#endif /*INCLUDED_VOLK_GNSSSDR_TYPEDEFS*/ +diff -rupN /Users/andres/Desktop/volk_gnsssdr/patches for generating volk_gnsssdr/2014-10-17_Patch_with_protokernels.patch /Users/andres/Desktop/volk_gnsssdr_original/patches for generating volk_gnsssdr/2014-10-17_Patch_with_protokernels.patch +--- /Users/andres/Desktop/volk_gnsssdr/patches for generating volk_gnsssdr/2014-10-17_Patch_with_protokernels.patch 1970-01-01 01:00:00.000000000 +0100 ++++ /Users/andres/Desktop/volk_gnsssdr_original/patches for generating volk_gnsssdr/2014-10-17_Patch_with_protokernels.patch 2014-10-17 04:27:54.000000000 +0200 +@@ -0,0 +1,38251 @@ ++Binary files /Users/andres/Desktop/volk_gnsssdr/.DS_Store and /Users/andres/Desktop/volk_gnsssdr_original/.DS_Store differ ++diff -rupN /Users/andres/Desktop/volk_gnsssdr/apps/volk_gnsssdr_profile.cc /Users/andres/Desktop/volk_gnsssdr_original/apps/volk_gnsssdr_profile.cc ++--- /Users/andres/Desktop/volk_gnsssdr/apps/volk_gnsssdr_profile.cc 2014-10-17 04:26:39.000000000 +0200 +++++ /Users/andres/Desktop/volk_gnsssdr_original/apps/volk_gnsssdr_profile.cc 2014-10-17 01:45:18.000000000 +0200 ++@@ -37,49 +37,6 @@ ++ ++ namespace fs = boost::filesystem; ++ ++-void write_json(std::ofstream &json_file, std::vector results) { ++- json_file << "{" << std::endl; ++- json_file << " \"volk_gnsssdr_tests\": [" << std::endl; ++- size_t len = results.size(); ++- size_t i = 0; ++- BOOST_FOREACH(volk_gnsssdr_test_results_t &result, results) { ++- json_file << " {" << std::endl; ++- json_file << " \"name\": \"" << result.name << "\"," << std::endl; ++- json_file << " \"vlen\": " << result.vlen << "," << std::endl; ++- json_file << " \"iter\": " << 
result.iter << "," << std::endl; ++- json_file << " \"best_arch_a\": \"" << result.best_arch_a ++- << "\"," << std::endl; ++- json_file << " \"best_arch_u\": \"" << result.best_arch_u ++- << "\"," << std::endl; ++- json_file << " \"results\": {" << std::endl; ++- size_t results_len = result.results.size(); ++- size_t ri = 0; ++- typedef std::pair tpair; ++- BOOST_FOREACH(tpair pair, result.results) { ++- volk_gnsssdr_test_time_t time = pair.second; ++- json_file << " \"" << time.name << "\": {" << std::endl; ++- json_file << " \"name\": \"" << time.name << "\"," << std::endl; ++- json_file << " \"time\": " << time.time << "," << std::endl; ++- json_file << " \"units\": \"" << time.units << "\"" << std::endl; ++- json_file << " }" ; ++- if(ri+1 != results_len) { ++- json_file << ","; ++- } ++- json_file << std::endl; ++- ri++; ++- } ++- json_file << " }" << std::endl; ++- json_file << " }"; ++- if(i+1 != len) { ++- json_file << ","; ++- } ++- json_file << std::endl; ++- i++; ++- } ++- json_file << " ]" << std::endl; ++- json_file << "}" << std::endl; ++-} ++- ++ int main(int argc, char *argv[]) { ++ // Adding program options ++ boost::program_options::options_description desc("Options"); ++@@ -92,9 +49,6 @@ int main(int argc, char *argv[]) { ++ ("tests-regex,R", ++ boost::program_options::value(), ++ "Run tests matching regular expression.") ++- ("json,j", ++- boost::program_options::value(), ++- "JSON output file") ++ ; ++ ++ // Handle the options that were given ++@@ -102,8 +56,6 @@ int main(int argc, char *argv[]) { ++ bool benchmark_mode; ++ std::string kernel_regex; ++ bool store_results = true; ++- std::ofstream json_file; ++- ++ try { ++ boost::program_options::store(boost::program_options::parse_command_line(argc, argv, desc), vm); ++ boost::program_options::notify(vm); ++@@ -131,14 +83,9 @@ int main(int argc, char *argv[]) { ++ return 0; ++ } ++ ++- if ( vm.count("json") ) ++- { ++- json_file.open( vm["json"].as().c_str() ); ++- } ++- ++ ++ // Run tests ++- 
std::vector results; +++ std::vector results; ++ ++ //VOLK_PROFILE(volk_gnsssdr_16i_x5_add_quad_16i_x4, 1e-4, 2046, 10000, &results, benchmark_mode, kernel_regex); ++ //VOLK_PROFILE(volk_gnsssdr_16i_branch_4_state_8, 1e-4, 2046, 10000, &results, benchmark_mode, kernel_regex); ++@@ -155,6 +102,55 @@ int main(int argc, char *argv[]) { ++ ++ // Until we can update the config on a kernel by kernel basis ++ // do not overwrite volk_gnsssdr_config when using a regex. +++ +++ //GNSS-SDR PROTO-KERNELS +++ //lv_32fc_t sfv = lv_cmake((float)1, (float)2); +++ //example: VOLK_PROFILE(volk_gnsssdr_8ic_s8ic_multiply_8ic, 1e-4, sfv, 204602, 1000, &results, benchmark_mode, kernel_regex); +++ +++ //CAN NOT BE TESTED YET BECAUSE VOLK MODULE DOES NOT SUPPORT IT: +++ //VOLK_PROFILE(volk_gnsssdr_s32f_x2_update_local_carrier_32fc, 1e-4, 0, 16007, 1, &results, benchmark_mode, kernel_regex); +++ //VOLK_PROFILE(volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc, 1e-4, 0, 7, 1, &results, benchmark_mode, kernel_regex); +++ +++ VOLK_PROFILE(volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex); +++ VOLK_PROFILE(volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex); +++ VOLK_PROFILE(volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex); +++ VOLK_PROFILE(volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex); +++ +++ VOLK_PROFILE(volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex); +++ VOLK_PROFILE(volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex); +++ VOLK_PROFILE(volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex); +++ +++ VOLK_PROFILE(volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3, 1e-4, 0, 16000, 250, &results, benchmark_mode, 
kernel_regex); +++ VOLK_PROFILE(volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex); +++ VOLK_PROFILE(volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex); +++ VOLK_PROFILE(volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex); +++ +++ VOLK_PROFILE(volk_gnsssdr_32fc_convert_16ic, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex); +++ VOLK_PROFILE(volk_gnsssdr_32fc_convert_8ic, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex); +++ VOLK_PROFILE(volk_gnsssdr_32fc_s32f_convert_8ic, 1e-4, 5, 16000, 250, &results, benchmark_mode, kernel_regex); +++ +++ /*VOLK_PROFILE(volk_gnsssdr_32f_accumulator_s32f, 1e-4, 0, 204602, 10000, &results, benchmark_mode, kernel_regex); +++ VOLK_PROFILE(volk_gnsssdr_8i_accumulator_s8i, 1e-4, 0, 204602, 10000, &results, benchmark_mode, kernel_regex); +++ VOLK_PROFILE(volk_gnsssdr_32f_index_max_16u, 3, 0, 204602, 5000, &results, benchmark_mode, kernel_regex); +++ VOLK_PROFILE(volk_gnsssdr_8i_index_max_16u, 3, 0, 204602, 5000, &results, benchmark_mode, kernel_regex); +++ VOLK_PROFILE(volk_gnsssdr_8i_max_s8i, 3, 0, 204602, 5000, &results, benchmark_mode, kernel_regex); +++ VOLK_PROFILE(volk_gnsssdr_32f_x2_add_32f, 1e-4, 0, 204602, 10000, &results, benchmark_mode, kernel_regex); +++ VOLK_PROFILE(volk_gnsssdr_8i_x2_add_8i, 1e-4, 0, 204602, 10000, &results, benchmark_mode, kernel_regex); +++ VOLK_PROFILE(volk_gnsssdr_32fc_conjugate_32fc, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); +++ VOLK_PROFILE(volk_gnsssdr_8ic_conjugate_8ic, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); +++ VOLK_PROFILE(volk_gnsssdr_32fc_magnitude_squared_32f, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); +++ VOLK_PROFILE(volk_gnsssdr_8ic_magnitude_squared_8i, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); +++ 
VOLK_PROFILE(volk_gnsssdr_32fc_s32fc_multiply_32fc, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); +++ VOLK_PROFILE(volk_gnsssdr_8ic_s8ic_multiply_8ic, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); +++ VOLK_PROFILE(volk_gnsssdr_32fc_x2_dot_prod_32fc, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); +++ VOLK_PROFILE(volk_gnsssdr_8ic_x2_dot_prod_8ic, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); +++ VOLK_PROFILE(volk_gnsssdr_32fc_x2_multiply_32fc, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); +++ VOLK_PROFILE(volk_gnsssdr_8ic_x2_multiply_8ic, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); +++ VOLK_PROFILE(volk_gnsssdr_8u_x2_multiply_8u, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); +++ VOLK_PROFILE(volk_gnsssdr_64f_accumulator_64f, 1e-4, 0, 16000, 1000, &results, benchmark_mode, kernel_regex); +++ VOLK_PROFILE(volk_gnsssdr_32f_s32f_convert_16i, 1e-4, 1, 204602, 250, &results, benchmark_mode, kernel_regex); +++ VOLK_PROFILE(volk_gnsssdr_16i_s32f_convert_32f, 1e-4, 1, 204602, 250, &results, benchmark_mode, kernel_regex);*/ +++ ++ if(store_results) { ++ char path[1024]; ++ volk_gnsssdr_get_config_path(path); ++@@ -178,10 +174,8 @@ int main(int argc, char *argv[]) { ++ #the function name is followed by the preferred architecture.\n\ ++ "; ++ ++- BOOST_FOREACH(volk_gnsssdr_test_results_t result, results) { ++- config << result.config_name << " " ++- << result.best_arch_a << " " ++- << result.best_arch_u << std::endl; +++ BOOST_FOREACH(std::string result, results) { +++ config << result << std::endl; ++ } ++ config.close(); ++ } ++Binary files /Users/andres/Desktop/volk_gnsssdr/kernels/.DS_Store and /Users/andres/Desktop/volk_gnsssdr_original/kernels/.DS_Store differ ++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/CommonMacros/CommonMacros.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/CommonMacros/CommonMacros.h ++--- 
/Users/andres/Desktop/volk_gnsssdr/kernels/CommonMacros/CommonMacros.h 1970-01-01 01:00:00.000000000 +0100 +++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/CommonMacros/CommonMacros.h 2014-10-15 01:55:08.000000000 +0200 ++@@ -0,0 +1,174 @@ +++/*! +++ * \file CommonMacros.h +++ * \brief Common macros used inside the volk protokernels. +++ * \authors
    +++ *
  • Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com +++ *
+++ * +++ * ------------------------------------------------------------------------- +++ * +++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) +++ * +++ * GNSS-SDR is a software defined Global Navigation +++ * Satellite Systems receiver +++ * +++ * This file is part of GNSS-SDR. +++ * +++ * GNSS-SDR is free software: you can redistribute it and/or modify +++ * it under the terms of the GNU General Public License as published by +++ * the Free Software Foundation, either version 3 of the License, or +++ * at your option) any later version. +++ * +++ * GNSS-SDR is distributed in the hope that it will be useful, +++ * but WITHOUT ANY WARRANTY; without even the implied warranty of +++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +++ * GNU General Public License for more details. +++ * +++ * You should have received a copy of the GNU General Public License +++ * along with GNSS-SDR. If not, see . +++ * +++ * ------------------------------------------------------------------------- +++ */ +++#ifndef INCLUDED_gnsssdr_CommonMacros_u_H +++#define INCLUDED_gnsssdr_CommonMacros_u_H +++ +++ #ifdef LV_HAVE_SSE4_1 +++ /*! 
+++ \brief Macros for U_SSE4_1 +++ */ +++ +++ #ifndef CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1 +++ #define CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(input1, input2, real, imag)\ +++ imag = _mm_srli_si128 (input1, 2);\ +++ imag = _mm_blend_epi16 (input2, imag, 85);\ +++ real = _mm_slli_si128 (input2, 2);\ +++ real = _mm_blend_epi16 (real, input1, 85); +++ #endif /* CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1 */ +++ +++ #ifndef CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1 +++ #define CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(input, input_i_1, input_i_2, output_i32, output_ps)\ +++ input_i_1 = _mm_cvtepi16_epi32(input);\ +++ input = _mm_srli_si128 (input, 8);\ +++ input_i_2 = _mm_cvtepi16_epi32(input);\ +++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2);\ +++ output_ps = _mm_cvtepi32_ps(output_i32); +++ #endif /* CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1 */ +++ +++ #ifndef CM_8IC_CONVERT_AND_ACC_32FC_U_SSE4_1 +++ #define CM_8IC_CONVERT_AND_ACC_32FC_U_SSE4_1(input, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)\ +++ input_i_1 = _mm_cvtepi8_epi32(input);\ +++ input = _mm_srli_si128 (input, 4);\ +++ input_i_2 = _mm_cvtepi8_epi32(input);\ +++ input = _mm_srli_si128 (input, 4);\ +++ output_i32_1 = _mm_add_epi32 (input_i_1, input_i_2);\ +++ input_i_1 = _mm_cvtepi8_epi32(input);\ +++ input = _mm_srli_si128 (input, 4);\ +++ input_i_2 = _mm_cvtepi8_epi32(input);\ +++ input = _mm_srli_si128 (input, 4);\ +++ output_i32_2 = _mm_add_epi32 (input_i_1, input_i_2);\ +++ output_i32 = _mm_add_epi32 (output_i32_1, output_i32_2);\ +++ output_ps = _mm_cvtepi32_ps(output_i32); +++ #endif /* CM_8IC_CONVERT_AND_ACC_32FC_U_SSE4_1 */ +++ +++ #endif /* LV_HAVE_SSE4_1 */ +++ +++ #ifdef LV_HAVE_SSE2 +++ /*! +++ \brief Macros for U_SSE2 +++ */ +++ +++ #ifdef LV_HAVE_SSSE3 +++ /*! 
+++ \brief Macros for U_SSSE3 +++ */ +++ +++ #ifndef CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3 +++ #define CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3(y, x, check_sign_sequence, rearrange_sequence, y_aux, x_abs, real_output, imag_output)\ +++ y_aux = _mm_sign_epi8 (y, x);\ +++ y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence);\ +++ real_output = _mm_maddubs_epi16 (x_abs, y_aux);\ +++ \ +++ y_aux = _mm_shuffle_epi8 (y, rearrange_sequence);\ +++ y_aux = _mm_sign_epi8 (y_aux, x);\ +++ imag_output = _mm_maddubs_epi16 (x_abs, y_aux); +++ #endif /* CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3 */ +++ +++ #endif /* LV_HAVE_SSSE3 */ +++ +++ #ifndef CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2 +++ #define CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output)\ +++ realx_mult_realy = _mm_mullo_epi16 (realx, realy);\ +++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);\ +++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);\ +++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);\ +++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);\ +++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); +++ #endif /* CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2 */ +++ +++ #ifndef CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2 +++ #define CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(input, mult1, real, imag)\ +++ imag = _mm_srli_si128 (input, 1);\ +++ imag = _mm_and_si128 (imag, mult1);\ +++ real = _mm_and_si128 (input, mult1); +++ #endif /* CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2 */ +++ +++ #ifndef CM_8IC_CONVERT_AND_ACC_32FC_U_SSE2 +++ #define CM_8IC_CONVERT_AND_ACC_32FC_U_SSE2(input, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)\ +++ input_i_1 = _mm_unpacklo_epi8(_mm_setzero_si128(), input);\ +++ input_i_2 = _mm_unpacklo_epi16(_mm_setzero_si128(), input_i_1);\ +++ input_i_1 = _mm_unpackhi_epi16(_mm_setzero_si128(), 
input_i_1);\ +++ input_i_1 = _mm_srai_epi32(input_i_1, 24);\ +++ input_i_2 = _mm_srai_epi32(input_i_2, 24);\ +++ output_i32 = _mm_add_epi32(input_i_1, input_i_2);\ +++ output_ps_1 = _mm_cvtepi32_ps(output_i32);\ +++ \ +++ input_i_1 = _mm_unpackhi_epi8(_mm_setzero_si128(), input);\ +++ input_i_2 = _mm_unpacklo_epi16(_mm_setzero_si128(), input_i_1);\ +++ input_i_1 = _mm_unpackhi_epi16(_mm_setzero_si128(), input_i_1);\ +++ input_i_1 = _mm_srai_epi32(input_i_1, 24);\ +++ input_i_2 = _mm_srai_epi32(input_i_2, 24);\ +++ output_i32 = _mm_add_epi32(input_i_1, input_i_2);\ +++ output_ps_2 = _mm_cvtepi32_ps(output_i32); +++ #endif /* CM_8IC_CONVERT_AND_ACC_32FC_U_SSE2 */ +++ +++ #ifndef CM_8IC_CONTROLMINUS128_8IC_U_SSE2 +++ #define CM_8IC_CONTROLMINUS128_8IC_U_SSE2(y, minus128, minus128control)\ +++ minus128control = _mm_cmpeq_epi8 (y, minus128);\ +++ y = _mm_sub_epi8 (y, minus128control); +++ #endif /* CM_8IC_CONTROLMINUS128_8IC_U_SSE2 */ +++ +++ #endif /* LV_HAVE_SSE2 */ +++ +++ #ifdef LV_HAVE_GENERIC +++ /*! +++ \brief Macros for U_GENERIC +++ */ +++ +++ #endif /* LV_HAVE_GENERIC */ +++#endif /* INCLUDED_gnsssdr_CommonMacros_u_H */ +++ +++ +++#ifndef INCLUDED_gnsssdr_CommonMacros_a_H +++#define INCLUDED_gnsssdr_CommonMacros_a_H +++ +++ #ifdef LV_HAVE_SSE4_1 +++ /*! +++ \brief Macros for A_SSE4_1 +++ */ +++ +++ #endif /* LV_HAVE_SSE4_1 */ +++ +++ #ifdef LV_HAVE_SSE2 +++ /*! +++ \brief Macros for U_SSE2 +++ */ +++ +++ #endif /* LV_HAVE_SSE2 */ +++ +++ #ifdef LV_HAVE_GENERIC +++ /*! 
+++ \brief Macros for A_GENERIC +++ */ +++ +++ #endif /* LV_HAVE_GENERIC */ +++#endif /* INCLUDED_gnsssdr_CommonMacros_a_H */ ++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/CommonMacros/CommonMacros_16ic_cw_epl_corr_32fc.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/CommonMacros/CommonMacros_16ic_cw_epl_corr_32fc.h ++--- /Users/andres/Desktop/volk_gnsssdr/kernels/CommonMacros/CommonMacros_16ic_cw_epl_corr_32fc.h 1970-01-01 01:00:00.000000000 +0100 +++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/CommonMacros/CommonMacros_16ic_cw_epl_corr_32fc.h 2014-10-15 01:55:08.000000000 +0200 ++@@ -0,0 +1,76 @@ +++/*! +++ * \file CommonMacros_16ic_cw_corr_32fc.h +++ * \brief Common macros used inside the 16ic_cw_corr_32fc volk protokernels. +++ * \authors
    +++ *
  • Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com +++ *
+++ * +++ * ------------------------------------------------------------------------- +++ * +++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) +++ * +++ * GNSS-SDR is a software defined Global Navigation +++ * Satellite Systems receiver +++ * +++ * This file is part of GNSS-SDR. +++ * +++ * GNSS-SDR is free software: you can redistribute it and/or modify +++ * it under the terms of the GNU General Public License as published by +++ * the Free Software Foundation, either version 3 of the License, or +++ * at your option) any later version. +++ * +++ * GNSS-SDR is distributed in the hope that it will be useful, +++ * but WITHOUT ANY WARRANTY; without even the implied warranty of +++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +++ * GNU General Public License for more details. +++ * +++ * You should have received a copy of the GNU General Public License +++ * along with GNSS-SDR. If not, see . +++ * +++ * ------------------------------------------------------------------------- +++ */ +++#ifndef INCLUDED_gnsssdr_CommonMacros_16ic_cw_corr_32fc_u_H +++#define INCLUDED_gnsssdr_CommonMacros_16ic_cw_corr_32fc_u_H +++#include "CommonMacros/CommonMacros.h" +++ +++ #ifdef LV_HAVE_SSE4_1 +++ /*! 
+++ \brief Macros for U_SSE4_1 +++ */ +++ +++ #ifndef CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1 +++ #define CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)\ +++ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy)\ +++ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(real_bb_signal_sample, imag_bb_signal_sample, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output)\ +++ CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(real_output, input_i_1, input_i_2, output_i32, real_output_ps)\ +++ CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(imag_output, input_i_1, input_i_2, output_i32, imag_output_ps) +++ #endif /* CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1 */ +++ +++ #endif /* LV_HAVE_SSE4_1 */ +++ +++ #ifdef LV_HAVE_GENERIC +++ /*! +++ \brief Macros for U_GENERIC +++ */ +++ +++ #endif /* LV_HAVE_GENERIC */ +++#endif /* INCLUDED_gnsssdr_CommonMacros_16ic_cw_corr_32fc_u_H */ +++ +++ +++#ifndef INCLUDED_gnsssdr_CommonMacros_16ic_cw_corr_32fc_a_H +++#define INCLUDED_gnsssdr_CommonMacros_16ic_cw_corr_32fc_a_H +++ +++ #ifdef LV_HAVE_SSE4_1 +++ /*! +++ \brief Macros for A_SSE4_1 +++ */ +++ +++ #endif /* LV_HAVE_SSE4_1 */ +++ +++ #ifdef LV_HAVE_GENERIC +++ /*! 
+++ \brief Macros for A_GENERIC +++ */ +++ +++ #endif /* LV_HAVE_GENERIC */ +++#endif /* INCLUDED_gnsssdr_CommonMacros_16ic_cw_corr_32fc_a_H */ ++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h ++--- /Users/andres/Desktop/volk_gnsssdr/kernels/CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h 1970-01-01 01:00:00.000000000 +0100 +++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h 2014-10-15 01:55:08.000000000 +0200 ++@@ -0,0 +1,114 @@ +++/*! +++ * \file CommonMacros_8ic_cw_epl_corr_32fc.h +++ * \brief Common macros used inside the 8ic_cw_corr_32fc volk protokernels. +++ * \authors
    +++ *
  • Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com +++ *
+++ * +++ * ------------------------------------------------------------------------- +++ * +++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) +++ * +++ * GNSS-SDR is a software defined Global Navigation +++ * Satellite Systems receiver +++ * +++ * This file is part of GNSS-SDR. +++ * +++ * GNSS-SDR is free software: you can redistribute it and/or modify +++ * it under the terms of the GNU General Public License as published by +++ * the Free Software Foundation, either version 3 of the License, or +++ * (at your option) any later version. +++ * +++ * GNSS-SDR is distributed in the hope that it will be useful, +++ * but WITHOUT ANY WARRANTY; without even the implied warranty of +++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +++ * GNU General Public License for more details. +++ * +++ * You should have received a copy of the GNU General Public License +++ * along with GNSS-SDR. If not, see http://www.gnu.org/licenses/. +++ * +++ * ------------------------------------------------------------------------- +++ */ +++#ifndef INCLUDED_gnsssdr_CommonMacros_8ic_cw_corr_32fc_u_H +++#define INCLUDED_gnsssdr_CommonMacros_8ic_cw_corr_32fc_u_H +++#include "CommonMacros/CommonMacros.h" +++ +++ #ifdef LV_HAVE_SSE4_1 +++ /*! 
+++ \brief Macros for U_SSE4_1. NOTE: CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1 assigns to a variable named output that is NOT in its parameter list; the caller must declare it (it receives the result of _mm_blendv_epi8) before expanding the macro. +++ */ +++ +++ #ifndef CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1 +++ #define CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)\ +++ CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)\ +++ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(real_bb_signal_sample, imag_bb_signal_sample, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output)\ +++ \ +++ imag_output = _mm_slli_si128 (imag_output, 1);\ +++ output = _mm_blendv_epi8 (imag_output, real_output, mult1);\ +++ \ +++ CM_8IC_CONVERT_AND_ACC_32FC_U_SSE4_1(output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps) +++ #endif /* CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1 */ +++ +++ #ifndef CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1 +++ #define CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)\ +++ CM_8IC_CONTROLMINUS128_8IC_U_SSE2(y, minus128, minus128control)\ +++ CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output)\ +++ CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(real_output, input_i_1, input_i_2, output_i32, real_output_ps)\ +++ CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(imag_output, input_i_1, input_i_2, output_i32, imag_output_ps) +++ #endif /* CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1 */ +++ +++ #ifndef CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1 +++ #define CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, 
bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)\ +++ CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output)\ +++ CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(real_output, input_i_1, input_i_2, output_i32, real_output_ps)\ +++ CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(imag_output, input_i_1, input_i_2, output_i32, imag_output_ps) +++ #endif /* CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1 */ +++ +++ #endif /* LV_HAVE_SSE4_1 */ +++ +++ #ifdef LV_HAVE_SSE2 +++ /*! +++ \brief Macros for U_SSE2. NOTE: CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2 assigns to a variable named output that is NOT in its parameter list; the caller must declare it (it receives the result of _mm_or_si128) before expanding the macro. +++ */ +++ +++ #ifndef CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2 +++ #define CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)\ +++ CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)\ +++ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(real_bb_signal_sample, imag_bb_signal_sample, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output)\ +++ \ +++ real_output = _mm_and_si128 (real_output, mult1);\ +++ imag_output = _mm_and_si128 (imag_output, mult1);\ +++ imag_output = _mm_slli_si128 (imag_output, 1);\ +++ output = _mm_or_si128 (real_output, imag_output);\ +++ \ +++ CM_8IC_CONVERT_AND_ACC_32FC_U_SSE2(output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2) +++ #endif /* CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2 */ +++ +++ #endif /* LV_HAVE_SSE2 */ +++ +++ #ifdef LV_HAVE_GENERIC +++ /*! 
+++ \brief Macros for U_GENERIC +++ */ +++ +++ #endif /* LV_HAVE_GENERIC */ +++#endif /* INCLUDED_gnsssdr_CommonMacros_8ic_cw_corr_32fc_u_H */ +++ +++ +++#ifndef INCLUDED_gnsssdr_CommonMacros_8ic_cw_corr_32fc_a_H +++#define INCLUDED_gnsssdr_CommonMacros_8ic_cw_corr_32fc_a_H +++ +++ #ifdef LV_HAVE_SSE4_1 +++ /*! +++ \brief Macros for A_SSE4_1 +++ */ +++ +++ #endif /* LV_HAVE_SSE4_1 */ +++ +++ #ifdef LV_HAVE_GENERIC +++ /*! +++ \brief Macros for A_GENERIC +++ */ +++ +++ #endif /* LV_HAVE_GENERIC */ +++#endif /* INCLUDED_gnsssdr_CommonMacros_8ic_cw_corr_32fc_a_H */ ++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/CommonMacros/README.txt /Users/andres/Desktop/volk_gnsssdr_original/kernels/CommonMacros/README.txt ++--- /Users/andres/Desktop/volk_gnsssdr/kernels/CommonMacros/README.txt 1970-01-01 01:00:00.000000000 +0100 +++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/CommonMacros/README.txt 2014-10-15 01:55:08.000000000 +0200 ++@@ -0,0 +1,34 @@ +++#################################################################### +++Common Macros inside volk_gnsssdr module +++#################################################################### +++ +++First of all, sorry for making you read this: macros are evil, they cannot be debugged, you do not know where the errors come from, and the syntax is annoying... BUT this is the only way I found to share one piece of code between various proto-kernels without performance penalties. +++Inline functions have been tested, and they introduce a really small time penalty, but it becomes huge because of long loops, with thousands of samples. +++ +++#################################################################### +++Syntax +++#################################################################### +++ +++In order to allow better understanding of the code I created the macros with a specific syntax. +++ +++1) Inside CommonMacros.h you will find macros for common operations. 
I will explain the syntax with an example: +++ +++example: CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output) +++ +++First of all, you find the characters “CM”, which means CommonMacros. After that come the type and the number of inputs: “_16IC_X4” (16 bits complex integers, four inputs). The syntax for the type is the same as the one used with volk protokernels; refer to the GNURadio documentation for more help. Then comes the name of the macro (“_SCALAR_PRODUCT”), and after that the type and the number of outputs (“_16IC_X2”). Finally comes the minimum SSE version needed to run (“_U_SSE2”). In the arguments you will find (from left to right) the inputs (four inputs: realx, imagx, realy, imagy), some variables that the macro needs to work (realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy) and finally the outputs (two outputs: real_output, imag_output). +++The variables that the macro needs are specified when calling it in order to avoid problems after compilation: if you want to use a macro you will need to declare all the variables it needs beforehand, or you will not be able to compile. +++ +++2) Inside all the other headers, CommonMacros_XXXXXX.h, you will find macros for a specific group of proto-kernels. The syntax is the same as in CommonMacros.h +++ +++#################################################################### +++Workflow +++#################################################################### +++ +++In order to use the macros easily, I usually test the code without macros inside a testing proto-kernel, where you are able to test it, debug it and use breakpoints. +++When it works I place the code inside a macro and I test it again. 
+++ +++#################################################################### +++Why macros +++#################################################################### +++1) They are the only way I could find for sharing code between proto-kernels without performance penalty. +++2) It is true that they are really difficult to debug, but if you work with them responsibly it is not so hard. Volk_gnsssdr checks all the SSE proto-kernels implementations results against the generic implementation results, so if your macro is not working you will appreciate it after profiling it. ++\ No newline at end of file ++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16i_s32f_convert_32f.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_16i_s32f_convert_32f.h ++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16i_s32f_convert_32f.h 1970-01-01 01:00:00.000000000 +0100 +++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_16i_s32f_convert_32f.h 2014-10-15 01:55:08.000000000 +0200 ++@@ -0,0 +1,241 @@ +++#ifndef INCLUDED_volk_gnsssdr_16i_s32f_convert_32f_u_H +++#define INCLUDED_volk_gnsssdr_16i_s32f_convert_32f_u_H +++ +++#include +++#include +++ +++#ifdef LV_HAVE_SSE4_1 +++#include +++ +++ /*! 
+++ \brief Converts the input 16 bit integer data into floating point data, and divides each floating point output data point by the scalar value (the SSE loop handles 8 values per iteration by multiplying with 1.0/scalar; the remaining num_points % 8 values are divided by scalar in a scalar loop) +++ \param inputVector The 16 bit input data buffer +++ \param outputVector The floating point output data buffer +++ \param scalar The value each point in the output buffer is divided by +++ \param num_points The number of data values to be converted +++ \note Output buffer does NOT need to be properly aligned +++ */ +++static inline void volk_gnsssdr_16i_s32f_convert_32f_u_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; +++ +++ float* outputVectorPtr = outputVector; +++ __m128 invScalar = _mm_set_ps1(1.0/scalar); +++ int16_t* inputPtr = (int16_t*)inputVector; +++ __m128i inputVal; +++ __m128i inputVal2; +++ __m128 ret; +++ +++ for(;number < eighthPoints; number++){ +++ +++ // Load the 8 input values (unaligned load) +++ inputVal = _mm_loadu_si128((__m128i*)inputPtr); +++ +++ // Shift the input data to the right by 64 bits ( 8 bytes ) so the upper 4 values land in the lower half +++ inputVal2 = _mm_srli_si128(inputVal, 8); +++ +++ // Sign-extend the lower 4 values of each register into 32 bit words +++ inputVal = _mm_cvtepi16_epi32(inputVal); +++ inputVal2 = _mm_cvtepi16_epi32(inputVal2); +++ +++ ret = _mm_cvtepi32_ps(inputVal); +++ ret = _mm_mul_ps(ret, invScalar); +++ _mm_storeu_ps(outputVectorPtr, ret); +++ outputVectorPtr += 4; +++ +++ ret = _mm_cvtepi32_ps(inputVal2); +++ ret = _mm_mul_ps(ret, invScalar); +++ _mm_storeu_ps(outputVectorPtr, ret); +++ +++ outputVectorPtr += 4; +++ +++ inputPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for(; number < num_points; number++){ +++ outputVector[number] =((float)(inputVector[number])) / scalar; +++ } +++} +++#endif /* LV_HAVE_SSE4_1 */ +++ +++#ifdef LV_HAVE_SSE +++#include +++ +++ /*! 
+++ \brief Converts the input 16 bit integer data into floating point data, and divides each floating point output data point by the scalar value (the SSE loop handles 4 values per iteration by multiplying with 1.0/scalar; the remaining num_points % 4 values are divided by scalar in a scalar loop) +++ \param inputVector The 16 bit input data buffer +++ \param outputVector The floating point output data buffer +++ \param scalar The value each point in the output buffer is divided by +++ \param num_points The number of data values to be converted +++ \note Output buffer does NOT need to be properly aligned +++ */ +++static inline void volk_gnsssdr_16i_s32f_convert_32f_u_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; +++ +++ float* outputVectorPtr = outputVector; +++ __m128 invScalar = _mm_set_ps1(1.0/scalar); +++ int16_t* inputPtr = (int16_t*)inputVector; +++ __m128 ret; +++ +++ for(;number < quarterPoints; number++){ +++ ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0])); +++ +++ ret = _mm_mul_ps(ret, invScalar); +++ _mm_storeu_ps(outputVectorPtr, ret); +++ +++ inputPtr += 4; +++ outputVectorPtr += 4; +++ } +++ +++ number = quarterPoints * 4; +++ for(; number < num_points; number++){ +++ outputVector[number] = (float)(inputVector[number]) / scalar; +++ } +++} +++#endif /* LV_HAVE_SSE */ +++ +++#ifdef LV_HAVE_GENERIC +++ /*! 
+++ \brief Converts the input 16 bit integer data into floating point data, and divides each floating point output data point by the scalar value (plain C loop, one value per iteration) +++ \param inputVector The 16 bit input data buffer +++ \param outputVector The floating point output data buffer +++ \param scalar The value each point in the output buffer is divided by +++ \param num_points The number of data values to be converted +++ \note Output buffer does NOT need to be properly aligned +++ */ +++static inline void volk_gnsssdr_16i_s32f_convert_32f_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ +++ float* outputVectorPtr = outputVector; +++ const int16_t* inputVectorPtr = inputVector; +++ unsigned int number = 0; +++ +++ for(number = 0; number < num_points; number++){ +++ *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar; +++ } +++} +++#endif /* LV_HAVE_GENERIC */ +++ +++ +++ +++ +++#endif /* INCLUDED_volk_gnsssdr_16i_s32f_convert_32f_u_H */ +++#ifndef INCLUDED_volk_gnsssdr_16i_s32f_convert_32f_a_H +++#define INCLUDED_volk_gnsssdr_16i_s32f_convert_32f_a_H +++ +++#include +++#include +++ +++#ifdef LV_HAVE_SSE4_1 +++#include +++ +++ /*! 
+++ \brief Converts the input 16 bit integer data into floating point data, and divides each floating point output data point by the scalar value (the SSE loop handles 8 values per iteration by multiplying with 1.0/scalar; the remaining num_points % 8 values are divided by scalar in a scalar loop). Although this is the aligned (_a_) variant, it still uses the unaligned _mm_loadu_si128 / _mm_storeu_ps intrinsics. +++ \param inputVector The 16 bit input data buffer +++ \param outputVector The floating point output data buffer +++ \param scalar The value each point in the output buffer is divided by +++ \param num_points The number of data values to be converted +++ */ +++static inline void volk_gnsssdr_16i_s32f_convert_32f_a_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ +++ unsigned int number = 0; +++ const unsigned int eighthPoints = num_points / 8; +++ +++ float* outputVectorPtr = outputVector; +++ __m128 invScalar = _mm_set_ps1(1.0/scalar); +++ int16_t* inputPtr = (int16_t*)inputVector; +++ __m128i inputVal; +++ __m128i inputVal2; +++ __m128 ret; +++ +++ for(;number < eighthPoints; number++){ +++ +++ // Load the 8 input values (unaligned load) +++ inputVal = _mm_loadu_si128((__m128i*)inputPtr); +++ +++ // Shift the input data to the right by 64 bits ( 8 bytes ) so the upper 4 values land in the lower half +++ inputVal2 = _mm_srli_si128(inputVal, 8); +++ +++ // Sign-extend the lower 4 values of each register into 32 bit words +++ inputVal = _mm_cvtepi16_epi32(inputVal); +++ inputVal2 = _mm_cvtepi16_epi32(inputVal2); +++ +++ ret = _mm_cvtepi32_ps(inputVal); +++ ret = _mm_mul_ps(ret, invScalar); +++ _mm_storeu_ps(outputVectorPtr, ret); +++ outputVectorPtr += 4; +++ +++ ret = _mm_cvtepi32_ps(inputVal2); +++ ret = _mm_mul_ps(ret, invScalar); +++ _mm_storeu_ps(outputVectorPtr, ret); +++ +++ outputVectorPtr += 4; +++ +++ inputPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for(; number < num_points; number++){ +++ outputVector[number] =((float)(inputVector[number])) / scalar; +++ } +++} +++#endif /* LV_HAVE_SSE4_1 */ +++ +++#ifdef LV_HAVE_SSE +++#include +++ +++ /*! 
+++ \brief Converts the input 16 bit integer data into floating point data, and divides each floating point output data point by the scalar value (the SSE loop handles 4 values per iteration by multiplying with 1.0/scalar; the remaining num_points % 4 values are divided by scalar in a scalar loop). Although this is the aligned (_a_) variant, it still stores with the unaligned _mm_storeu_ps intrinsic. +++ \param inputVector The 16 bit input data buffer +++ \param outputVector The floating point output data buffer +++ \param scalar The value each point in the output buffer is divided by +++ \param num_points The number of data values to be converted +++ */ +++static inline void volk_gnsssdr_16i_s32f_convert_32f_a_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; +++ +++ float* outputVectorPtr = outputVector; +++ __m128 invScalar = _mm_set_ps1(1.0/scalar); +++ int16_t* inputPtr = (int16_t*)inputVector; +++ __m128 ret; +++ +++ for(;number < quarterPoints; number++){ +++ ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0])); +++ +++ ret = _mm_mul_ps(ret, invScalar); +++ _mm_storeu_ps(outputVectorPtr, ret); +++ +++ inputPtr += 4; +++ outputVectorPtr += 4; +++ } +++ +++ number = quarterPoints * 4; +++ for(; number < num_points; number++){ +++ outputVector[number] = (float)(inputVector[number]) / scalar; +++ } +++} +++#endif /* LV_HAVE_SSE */ +++ +++#ifdef LV_HAVE_GENERIC +++ /*! 
+++ \brief Converts the input 16 bit integer data into floating point data, and divides each floating point output data point by the scalar value (plain C loop, one value per iteration) +++ \param inputVector The 16 bit input data buffer +++ \param outputVector The floating point output data buffer +++ \param scalar The value each point in the output buffer is divided by +++ \param num_points The number of data values to be converted +++ */ +++static inline void volk_gnsssdr_16i_s32f_convert_32f_a_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ +++ float* outputVectorPtr = outputVector; +++ const int16_t* inputVectorPtr = inputVector; +++ unsigned int number = 0; +++ +++ for(number = 0; number < num_points; number++){ +++ *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar; +++ } +++} +++#endif /* LV_HAVE_GENERIC */ +++ +++ +++ +++ +++#endif /* INCLUDED_volk_gnsssdr_16i_s32f_convert_32f_a_H */ ++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3.h ++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3.h 1970-01-01 01:00:00.000000000 +0100 +++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3.h 2014-10-15 01:55:08.000000000 +0200 ++@@ -0,0 +1,461 @@ +++/*! +++ * \file volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3.h +++ * \brief Volk protokernel: performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation with 32 bits vectors +++ * \authors
    +++ *
  • Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com +++ *
+++ * +++ * Volk protokernel that performs the carrier wipe-off mixing and the +++ * Early, Prompt, and Late correlation with 32 bits vectors (16 bits the +++ * real part and 16 bits the imaginary part): +++ * - The carrier wipe-off is done by multiplying the input signal by the +++ * carrier (multiplication of 32 bits vectors). It returns the input +++ * signal in base band (BB) +++ * - Early values are calculated by multiplying the input signal in BB by the +++ * early code (multiplication of 32 bits vectors), accumulating the results +++ * - Prompt values are calculated by multiplying the input signal in BB by the +++ * prompt code (multiplication of 32 bits vectors), accumulating the results +++ * - Late values are calculated by multiplying the input signal in BB by the +++ * late code (multiplication of 32 bits vectors), accumulating the results +++ * +++ * ------------------------------------------------------------------------- +++ * +++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) +++ * +++ * GNSS-SDR is a software defined Global Navigation +++ * Satellite Systems receiver +++ * +++ * This file is part of GNSS-SDR. +++ * +++ * GNSS-SDR is free software: you can redistribute it and/or modify +++ * it under the terms of the GNU General Public License as published by +++ * the Free Software Foundation, either version 3 of the License, or +++ * (at your option) any later version. +++ * +++ * GNSS-SDR is distributed in the hope that it will be useful, +++ * but WITHOUT ANY WARRANTY; without even the implied warranty of +++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +++ * GNU General Public License for more details. +++ * +++ * You should have received a copy of the GNU General Public License +++ * along with GNSS-SDR. If not, see http://www.gnu.org/licenses/. 
+++ * +++ * ------------------------------------------------------------------------- +++ */ +++ +++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_u_H +++#define INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_u_H +++ +++#include +++#include +++#include +++#include +++#include +++ +++#ifdef LV_HAVE_SSE4_1 +++#include "smmintrin.h" +++#include "CommonMacros/CommonMacros_16ic_cw_epl_corr_32fc.h" +++#include "CommonMacros/CommonMacros.h" +++ /*! +++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (unaligned SSE4.1 version: 8 complex samples per SSE iteration, the remaining num_points % 8 samples are handled by a scalar tail loop) +++ \param input The input signal input +++ \param carrier The carrier signal input +++ \param E_code Early PRN code replica input +++ \param P_code Prompt PRN code replica input +++ \param L_code Late PRN code replica input +++ \param E_out Early correlation output +++ \param P_out Prompt correlation output +++ \param L_out Late correlation output +++ \param num_points The number of complex values in vectors +++ */ +++static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_u_sse4_1(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points) +++{ +++ const unsigned int sse_iters = num_points / 8; +++ +++ __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample; +++ __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output; +++ +++ __m128 real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc; +++ __m128i input_i_1, input_i_2, output_i32; +++ __m128 real_output_ps, imag_output_ps; +++ +++ float E_out_real = 0; +++ float E_out_imag = 0; +++ float P_out_real = 0; +++ float P_out_imag = 0; +++ float L_out_real = 0; +++ float L_out_imag = 0; +++ +++ const lv_16sc_t* input_ptr = input; +++ const lv_16sc_t* carrier_ptr = carrier; 
+++ +++ const lv_16sc_t* E_code_ptr = E_code; +++ lv_32fc_t* E_out_ptr = E_out; +++ const lv_16sc_t* L_code_ptr = L_code; +++ lv_32fc_t* L_out_ptr = L_out; +++ const lv_16sc_t* P_code_ptr = P_code; +++ lv_32fc_t* P_out_ptr = P_out; +++ +++ *E_out_ptr = 0; +++ *P_out_ptr = 0; +++ *L_out_ptr = 0; +++ +++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); /* NOTE(review): mult1 is not passed to any macro in this function; it looks unused unless a macro references it implicitly - confirm */ +++ +++ real_E_code_acc = _mm_setzero_ps(); +++ imag_E_code_acc = _mm_setzero_ps(); +++ real_P_code_acc = _mm_setzero_ps(); +++ imag_P_code_acc = _mm_setzero_ps(); +++ real_L_code_acc = _mm_setzero_ps(); +++ imag_L_code_acc = _mm_setzero_ps(); +++ +++ if (sse_iters>0) +++ { +++ for(int number = 0;number < sse_iters; number++){ +++ +++ //Perform the carrier wipe-off (two unaligned 128-bit loads each for input and carrier) +++ x1 = _mm_lddqu_si128((__m128i*)input_ptr); +++ input_ptr += 4; +++ x2 = _mm_lddqu_si128((__m128i*)input_ptr); +++ +++ y1 = _mm_lddqu_si128((__m128i*)carrier_ptr); +++ carrier_ptr += 4; +++ y2 = _mm_lddqu_si128((__m128i*)carrier_ptr); +++ +++ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(x1, x2, realx, imagx) +++ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy) +++ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample) +++ +++ //Get early values +++ y1 = _mm_lddqu_si128((__m128i*)E_code_ptr); +++ E_code_ptr += 4; +++ y2 = _mm_lddqu_si128((__m128i*)E_code_ptr); +++ +++ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) +++ +++ //Accumulate the float 32 early results +++ real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps); +++ imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps); +++ +++ //Get prompt values +++ y1 = 
_mm_lddqu_si128((__m128i*)P_code_ptr); +++ P_code_ptr += 4; +++ y2 = _mm_lddqu_si128((__m128i*)P_code_ptr); +++ +++ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) +++ +++ real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps); +++ imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps); +++ +++ //Get late values +++ y1 = _mm_lddqu_si128((__m128i*)L_code_ptr); +++ L_code_ptr += 4; +++ y2 = _mm_lddqu_si128((__m128i*)L_code_ptr); +++ +++ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) +++ +++ real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps); +++ imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps); +++ +++ input_ptr += 4; +++ carrier_ptr += 4; +++ E_code_ptr += 4; +++ P_code_ptr += 4; +++ L_code_ptr += 4; +++ } +++ +++ __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4]; +++ +++ _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector +++ _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector +++ _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector +++ 
_mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector +++ _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector +++ _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector +++ +++ for (int i = 0; i<4; ++i) +++ { +++ E_out_real += real_E_dotProductVector[i]; +++ E_out_imag += imag_E_dotProductVector[i]; +++ P_out_real += real_P_dotProductVector[i]; +++ P_out_imag += imag_P_dotProductVector[i]; +++ L_out_real += real_L_dotProductVector[i]; +++ L_out_imag += imag_L_dotProductVector[i]; +++ } +++ *E_out_ptr = lv_cmake(E_out_real, E_out_imag); +++ *P_out_ptr = lv_cmake(P_out_real, P_out_imag); +++ *L_out_ptr = lv_cmake(L_out_real, L_out_imag); +++ } +++ +++ lv_16sc_t bb_signal_sample; +++ for(int i=0; i < num_points%8; ++i) +++ { +++ //Perform the carrier wipe-off on the samples left over after the SSE loop +++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); +++ // Now get early, prompt, and late values for each remaining sample +++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); +++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); +++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); +++ } +++ +++} +++#endif /* LV_HAVE_SSE4_1 */ +++ +++#ifdef LV_HAVE_GENERIC +++/*! 
+++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (generic C version, one complex sample per iteration; the outputs are zeroed first and then accumulated) +++ \param input The input signal input +++ \param carrier The carrier signal input +++ \param E_code Early PRN code replica input +++ \param P_code Prompt PRN code replica input +++ \param L_code Late PRN code replica input +++ \param E_out Early correlation output +++ \param P_out Prompt correlation output +++ \param L_out Late correlation output +++ \param num_points The number of complex values in vectors +++ */ +++static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_generic(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points) +++{ +++ lv_16sc_t bb_signal_sample; +++ lv_16sc_t tmp1; +++ lv_16sc_t tmp2; +++ lv_16sc_t tmp3; +++ +++ bb_signal_sample = lv_cmake(0, 0); +++ +++ *E_out = 0; +++ *P_out = 0; +++ *L_out = 0; +++ // perform Early, Prompt and Late correlation +++ +++ for(int i=0; i < num_points; ++i) +++ { +++ //Perform the carrier wipe-off +++ bb_signal_sample = input[i] * carrier[i]; +++ +++ tmp1 = bb_signal_sample * E_code[i]; +++ tmp2 = bb_signal_sample * P_code[i]; +++ tmp3 = bb_signal_sample * L_code[i]; +++ +++ // Accumulate the early, prompt, and late products of this sample +++ *E_out += (lv_32fc_t)tmp1; +++ *P_out += (lv_32fc_t)tmp2; +++ *L_out += (lv_32fc_t)tmp3; +++ } +++} +++#endif /* LV_HAVE_GENERIC */ +++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_u_H */ +++ +++ +++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_a_H +++#define INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_a_H +++ +++#include +++#include +++#include +++#include +++#include +++ +++#ifdef LV_HAVE_SSE4_1 +++#include "smmintrin.h" +++#include "CommonMacros/CommonMacros_16ic_cw_epl_corr_32fc.h" +++#include "CommonMacros/CommonMacros.h" +++/*! 
+++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation
+++ \param input The input signal input
+++ \param carrier The carrier signal input
+++ \param E_code Early PRN code replica input
+++ \param P_code Prompt PRN code replica input
+++ \param L_code Late PRN code replica input
+++ \param E_out Early correlation output
+++ \param P_out Prompt correlation output
+++ \param L_out Late correlation output
+++ \param num_points The number of complex values in vectors
+++ \note All five input vectors are read with _mm_load_si128 and must therefore be 16-byte aligned.
+++ */
+++static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_a_sse4_1(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points)
+++{
+++    // Each SIMD iteration consumes two 128-bit loads (2 x 4 complex samples) per vector.
+++    const unsigned int sse_iters = num_points / 8;
+++
+++    __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample;
+++    __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output;
+++
+++    __m128 real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc;
+++    __m128i input_i_1, input_i_2, output_i32;
+++    __m128 real_output_ps, imag_output_ps;
+++
+++    float E_out_real = 0;
+++    float E_out_imag = 0;
+++    float P_out_real = 0;
+++    float P_out_imag = 0;
+++    float L_out_real = 0;
+++    float L_out_imag = 0;
+++
+++    const lv_16sc_t* input_ptr = input;
+++    const lv_16sc_t* carrier_ptr = carrier;
+++
+++    const lv_16sc_t* E_code_ptr = E_code;
+++    lv_32fc_t* E_out_ptr = E_out;
+++    const lv_16sc_t* L_code_ptr = L_code;
+++    lv_32fc_t* L_out_ptr = L_out;
+++    const lv_16sc_t* P_code_ptr = P_code;
+++    lv_32fc_t* P_out_ptr = P_out;
+++
+++    // The outputs are cleared up front because the scalar tail loop accumulates into them.
+++    *E_out_ptr = 0;
+++    *P_out_ptr = 0;
+++    *L_out_ptr = 0;
+++
+++    // NOTE(review): mult1 is not passed to any macro below; it is kept because the CM_* macro
+++    // bodies are defined outside this file section and could reference it by name - confirm.
+++    mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
+++
+++    real_E_code_acc = _mm_setzero_ps();
+++    imag_E_code_acc = _mm_setzero_ps();
+++    real_P_code_acc = _mm_setzero_ps();
+++    imag_P_code_acc = _mm_setzero_ps();
+++    real_L_code_acc = _mm_setzero_ps();
+++    imag_L_code_acc = _mm_setzero_ps();
+++
+++    if (sse_iters>0)
+++    {
+++        // Unsigned counter: sse_iters is unsigned, so a signed counter would mix signedness in the comparison.
+++        for(unsigned int number = 0; number < sse_iters; number++){
+++
+++            //Perform the carrier wipe-off
+++            x1 = _mm_load_si128((__m128i*)input_ptr);
+++            input_ptr += 4;
+++            x2 = _mm_load_si128((__m128i*)input_ptr);
+++
+++            y1 = _mm_load_si128((__m128i*)carrier_ptr);
+++            carrier_ptr += 4;
+++            y2 = _mm_load_si128((__m128i*)carrier_ptr);
+++
+++            // The CM_* helpers are macros defined elsewhere; presumably they split the interleaved
+++            // samples into real/imag vectors and form the complex product - confirm against their definition.
+++            CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(x1, x2, realx, imagx)
+++            CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy)
+++            CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
+++
+++            //Get early values
+++            y1 = _mm_load_si128((__m128i*)E_code_ptr);
+++            E_code_ptr += 4;
+++            y2 = _mm_load_si128((__m128i*)E_code_ptr);
+++
+++            CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+++
+++            //Adds the float 32 results
+++            real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps);
+++            imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps);
+++
+++            //Get prompt values
+++            y1 = _mm_load_si128((__m128i*)P_code_ptr);
+++            P_code_ptr += 4;
+++            y2 = _mm_load_si128((__m128i*)P_code_ptr);
+++
+++            CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+++
+++            real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps);
+++            imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps);
+++
+++            //Get late values
+++            y1 = _mm_load_si128((__m128i*)L_code_ptr);
+++            L_code_ptr += 4;
+++            y2 = _mm_load_si128((__m128i*)L_code_ptr);
+++
+++            CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+++
+++            real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps);
+++            imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps);
+++
+++            // Second half of the 8-sample step (the first +4 was applied between the two loads).
+++            input_ptr += 4;
+++            carrier_ptr += 4;
+++            E_code_ptr += 4;
+++            P_code_ptr += 4;
+++            L_code_ptr += 4;
+++        }
+++
+++        __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
+++        __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
+++        __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
+++        __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
+++        __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
+++        __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
+++
+++        _mm_store_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector
+++        _mm_store_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector
+++        _mm_store_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector
+++        _mm_store_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector
+++        _mm_store_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector
+++        _mm_store_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector
+++
+++        // Horizontal sum of the four float lanes of every accumulator.
+++        for (int i = 0; i<4; ++i)
+++        {
+++            E_out_real += real_E_dotProductVector[i];
+++            E_out_imag += imag_E_dotProductVector[i];
+++            P_out_real += real_P_dotProductVector[i];
+++            P_out_imag += imag_P_dotProductVector[i];
+++            L_out_real += real_L_dotProductVector[i];
+++            L_out_imag += imag_L_dotProductVector[i];
+++        }
+++        *E_out_ptr = lv_cmake(E_out_real, E_out_imag);
+++        *P_out_ptr = lv_cmake(P_out_real, P_out_imag);
+++        *L_out_ptr = lv_cmake(L_out_real, L_out_imag);
+++    }
+++
+++    // Scalar tail: the num_points % 8 samples the SIMD loop did not consume.
+++    lv_16sc_t bb_signal_sample;
+++    for(unsigned int i = 0; i < num_points%8; ++i)
+++    {
+++        //Perform the carrier wipe-off
+++        bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
+++        // Now get early, late, and prompt values for each
+++        *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
+++        *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
+++        *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
+++    }
+++
+++}
+++#endif /* LV_HAVE_SSE4_1 */
+++
+++#ifdef LV_HAVE_GENERIC
+++/*!
+++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation
+++ \param input The input signal input
+++ \param carrier The carrier signal input
+++ \param E_code Early PRN code replica input
+++ \param P_code Prompt PRN code replica input
+++ \param L_code Late PRN code replica input
+++ \param E_out Early correlation output
+++ \param P_out Prompt correlation output
+++ \param L_out Late correlation output
+++ \param num_points The number of complex values in vectors
+++ */
+++static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_a_generic(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points)
+++{
+++    lv_16sc_t bb_signal_sample;
+++    lv_16sc_t tmp1;
+++    lv_16sc_t tmp2;
+++    lv_16sc_t tmp3;
+++
+++    bb_signal_sample = lv_cmake(0, 0);
+++
+++    *E_out = 0;
+++    *P_out = 0;
+++    *L_out = 0;
+++    // perform Early, Prompt and Late correlation
+++
+++    // Unsigned index: num_points is unsigned, so a signed index could overflow before reaching it.
+++    for(unsigned int i = 0; i < num_points; ++i)
+++    {
+++        //Perform the carrier wipe-off
+++        bb_signal_sample = input[i] * carrier[i];
+++
+++        // The products are formed in lv_16sc_t and only widened to lv_32fc_t when accumulated.
+++        tmp1 = bb_signal_sample * E_code[i];
+++        tmp2 = bb_signal_sample * P_code[i];
+++        tmp3 = bb_signal_sample * L_code[i];
+++
+++        // Now get early, late, and prompt values for each
+++        *E_out += (lv_32fc_t)tmp1;
+++        *P_out += (lv_32fc_t)tmp2;
+++        *L_out += (lv_32fc_t)tmp3;
+++    }
+++}
+++#endif /* LV_HAVE_GENERIC */
+++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_a_H */
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3.h
++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3.h 1970-01-01 01:00:00.000000000 +0100
+++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3.h 2014-10-15 01:55:08.000000000 +0200
++@@ -0,0 +1,1568 @@
+++/*!
+++ * \file volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3.h
+++ * \brief Volk protokernel: performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation with 32 bits vectors using different methods: inside u_sse4_1_first there is one method, inside u_sse4_1_second there is another... This protokernel has been created to test the performance of different methods.
+++ * \authors
+++ *          <ul>
+++ *          <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+++ *          </ul>
+++ *
+++ * Volk protokernel that performs the carrier wipe-off mixing and the
+++ * Early, Prompt, and Late correlation with 32 bits vectors (16 bits the
+++ * real part and 16 bits the imaginary part):
+++ * - The carrier wipe-off is done by multiplying the input signal by the
+++ * carrier (multiplication of 32 bits vectors). It returns the input
+++ * signal in base band (BB)
+++ * - Early values are calculated by multiplying the input signal in BB by the
+++ * early code (multiplication of 32 bits vectors), accumulating the results
+++ * - Prompt values are calculated by multiplying the input signal in BB by the
+++ * prompt code (multiplication of 32 bits vectors), accumulating the results
+++ * - Late values are calculated by multiplying the input signal in BB by the
+++ * late code (multiplication of 32 bits vectors), accumulating the results
+++ *
+++ * -------------------------------------------------------------------------
+++ *
+++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+++ *
+++ * GNSS-SDR is a software defined Global Navigation
+++ * Satellite Systems receiver
+++ *
+++ * This file is part of GNSS-SDR.
+++ *
+++ * GNSS-SDR is free software: you can redistribute it and/or modify
+++ * it under the terms of the GNU General Public License as published by
+++ * the Free Software Foundation, either version 3 of the License, or
+++ * (at your option) any later version.
+++ *
+++ * GNSS-SDR is distributed in the hope that it will be useful,
+++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+++ * GNU General Public License for more details.
+++ *
+++ * You should have received a copy of the GNU General Public License
+++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+++ * +++ * ------------------------------------------------------------------------- +++ */ +++ +++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_u_H +++#define INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_u_H +++ +++#include +++#include +++#include +++#include +++#include +++ +++#ifdef LV_HAVE_SSE4_1 +++#include "smmintrin.h" +++ /*! +++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation +++ \param input The input signal input +++ \param carrier The carrier signal input +++ \param E_code Early PRN code replica input +++ \param P_code Early PRN code replica input +++ \param L_code Early PRN code replica input +++ \param E_out Early correlation output +++ \param P_out Early correlation output +++ \param L_out Early correlation output +++ \param num_points The number of complex values in vectors +++ */ +++static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_u_sse4_1_first(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points) +++{ +++ const unsigned int sse_iters = num_points / 4; +++ +++ __m128i x, y, yaux, yl, yh, tmp1, tmp2, z, bb_signal_sample, bb_signal_sample_suffled; +++ +++ __m128 z_ps_1, z_ps_2, z_E, z_P, z_L; +++ __m128i z_i_1, z_i_2; +++ +++ lv_32fc_t dotProduct_E; +++ lv_32fc_t dotProduct_P; +++ lv_32fc_t dotProduct_L; +++ +++ z_E = _mm_setzero_ps(); +++ z_P = _mm_setzero_ps(); +++ z_L = _mm_setzero_ps(); +++ +++ const lv_16sc_t* _input = input; +++ const lv_16sc_t* _carrier = carrier; +++ const lv_16sc_t* _E_code = E_code; +++ const lv_16sc_t* _P_code = P_code; +++ const lv_16sc_t* _L_code = L_code; +++ +++ if (sse_iters>0) +++ { +++ for(int number = 0;number < sse_iters; number++) +++ { +++ //Perform the carrier wipe-off +++ x = _mm_lddqu_si128((__m128i*)_input); // Load the ar + ai, br + bi as ar,ai,br,bi +++ y = 
_mm_lddqu_si128((__m128i*)_carrier); // Load the cr + ci, dr + di as cr,ci,dr,di +++ +++ // Load yl with cr,cr,dr,dr +++ // Load yh with ci,ci,di,di +++ yaux = _mm_shuffle_epi8 (y, _mm_set_epi8 (15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0)); +++ yl = _mm_unpacklo_epi16(yaux, yaux); +++ yh = _mm_unpackhi_epi16(yaux, yaux); +++ +++ tmp1 = _mm_mullo_epi16(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr +++ +++ x = _mm_shuffle_epi8 (x, _mm_set_epi8 (13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2)); // Re-arrange x to be ai,ar,bi,br +++ +++ tmp2 = _mm_mullo_epi16(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di +++ +++ tmp2 = _mm_mullo_epi16(tmp2,_mm_set_epi16 (1, -1, 1, -1, 1, -1, 1, -1)); +++ bb_signal_sample = _mm_add_epi16(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +++ bb_signal_sample_suffled = _mm_shuffle_epi8 (bb_signal_sample, _mm_set_epi8 (13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2)); // Re-arrange bb_signal_sample to be ai,ar,bi,br +++ +++ // correlation E,P,L (3x vector scalar product) +++ // Early +++ y = _mm_lddqu_si128((__m128i*)_E_code); // Load the cr + ci, dr + di as cr,ci,dr,di +++ +++ yaux = _mm_shuffle_epi8 (y, _mm_set_epi8 (15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0)); +++ yl = _mm_unpacklo_epi16(yaux, yaux); +++ yh = _mm_unpackhi_epi16(yaux, yaux); +++ +++ tmp1 = _mm_mullo_epi16(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr +++ +++ tmp2 = _mm_mullo_epi16(bb_signal_sample_suffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di +++ +++ tmp2 = _mm_mullo_epi16(tmp2,_mm_set_epi16 (1, -1, 1, -1, 1, -1, 1, -1)); +++ z = _mm_add_epi16(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +++ +++ z_i_1 = _mm_cvtepi16_epi32(z); +++ z_ps_1 = _mm_cvtepi32_ps(z_i_1); +++ z = _mm_srli_si128 (z, 8); +++ z_i_2 = _mm_cvtepi16_epi32(z); +++ z_ps_2 = _mm_cvtepi32_ps(z_i_2); +++ +++ z_E = _mm_add_ps(z_E, z_ps_1); // Add the complex multiplication results together +++ z_E = _mm_add_ps(z_E, z_ps_2); // Add the 
complex multiplication results together +++ +++ // Prompt +++ y = _mm_lddqu_si128((__m128i*)_P_code); // Load the cr + ci, dr + di as cr,ci,dr,di +++ +++ yaux = _mm_shuffle_epi8 (y, _mm_set_epi8 (15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0)); +++ yl = _mm_unpacklo_epi16(yaux, yaux); +++ yh = _mm_unpackhi_epi16(yaux, yaux); +++ +++ tmp1 = _mm_mullo_epi16(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr +++ +++ tmp2 = _mm_mullo_epi16(bb_signal_sample_suffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di +++ +++ tmp2 = _mm_mullo_epi16(tmp2,_mm_set_epi16 (1, -1, 1, -1, 1, -1, 1, -1)); +++ z = _mm_add_epi16(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +++ +++ z_i_1 = _mm_cvtepi16_epi32(z); +++ z_ps_1 = _mm_cvtepi32_ps(z_i_1); +++ z = _mm_srli_si128 (z, 8); +++ z_i_2 = _mm_cvtepi16_epi32(z); +++ z_ps_2 = _mm_cvtepi32_ps(z_i_2); +++ +++ z_P = _mm_add_ps(z_P, z_ps_1); // Add the complex multiplication results together +++ z_P = _mm_add_ps(z_P, z_ps_2); // Add the complex multiplication results together +++ +++ // Late +++ y = _mm_lddqu_si128((__m128i*)_L_code); // Load the cr + ci, dr + di as cr,ci,dr,di +++ +++ yaux = _mm_shuffle_epi8 (y, _mm_set_epi8 (15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0)); +++ yl = _mm_unpacklo_epi16(yaux, yaux); +++ yh = _mm_unpackhi_epi16(yaux, yaux); +++ +++ tmp1 = _mm_mullo_epi16(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr +++ +++ tmp2 = _mm_mullo_epi16(bb_signal_sample_suffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di +++ +++ tmp2 = _mm_mullo_epi16(tmp2,_mm_set_epi16 (1, -1, 1, -1, 1, -1, 1, -1)); +++ z = _mm_add_epi16(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +++ +++ z_i_1 = _mm_cvtepi16_epi32(z); +++ z_ps_1 = _mm_cvtepi32_ps(z_i_1); +++ z = _mm_srli_si128 (z, 8); +++ z_i_2 = _mm_cvtepi16_epi32(z); +++ z_ps_2 = _mm_cvtepi32_ps(z_i_2); +++ +++ z_L = _mm_add_ps(z_L, z_ps_1); // Add the complex multiplication results together +++ z_L = _mm_add_ps(z_L, z_ps_2); // Add the 
complex multiplication results together +++ +++ _input += 4; +++ _carrier += 4; +++ _E_code += 4; +++ _L_code += 4; +++ _P_code += 4; +++ } +++ +++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_E[2]; +++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_P[2]; +++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_L[2]; +++ +++ _mm_storeu_ps((float*)dotProductVector_E,z_E); // Store the results back into the dot product vector +++ _mm_storeu_ps((float*)dotProductVector_P,z_P); // Store the results back into the dot product vector +++ _mm_storeu_ps((float*)dotProductVector_L,z_L); // Store the results back into the dot product vector +++ +++ dotProduct_E = ( dotProductVector_E[0] + dotProductVector_E[1] ); +++ dotProduct_P = ( dotProductVector_P[0] + dotProductVector_P[1] ); +++ dotProduct_L = ( dotProductVector_L[0] + dotProductVector_L[1] ); +++ } +++ +++ for(int i=0; i < num_points%4; ++i) +++ { +++ dotProduct_E += (lv_32fc_t)((*_input) * (*_E_code++)*(*_carrier)); +++ dotProduct_P += (lv_32fc_t)((*_input) * (*_P_code++)*(*_carrier)); +++ dotProduct_L += (lv_32fc_t)((*_input++) * (*_L_code++)*(*_carrier++)); +++ } +++ +++ *E_out = dotProduct_E; +++ *P_out = dotProduct_P; +++ *L_out = dotProduct_L; +++ +++ +++ +++} +++#endif /* LV_HAVE_SSE4_1 */ +++ +++#ifdef LV_HAVE_SSE4_1 +++#include "smmintrin.h" +++/*! 
+++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation
+++ \param input The input signal input
+++ \param carrier The carrier signal input
+++ \param E_code Early PRN code replica input
+++ \param P_code Prompt PRN code replica input
+++ \param L_code Late PRN code replica input
+++ \param E_out Early correlation output
+++ \param P_out Prompt correlation output
+++ \param L_out Late correlation output
+++ \param num_points The number of complex values in vectors
+++ \note "Second" test method: two unaligned 128-bit loads (8 complex samples) per vector are
+++       split into separate real and imaginary vectors before multiplying.
+++ */
+++static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_u_sse4_1_second(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points)
+++{
+++    const unsigned int sse_iters = num_points / 8;
+++
+++    // The former "mult1" mask was assigned but never read in this function, so it was removed.
+++    __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample;
+++    __m128i realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output;
+++
+++    __m128 real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc;
+++    __m128i real_output_i_1, real_output_i_2, imag_output_i_1, imag_output_i_2;
+++    __m128 real_output_ps_1, real_output_ps_2, imag_output_ps_1, imag_output_ps_2;
+++
+++    float E_out_real = 0;
+++    float E_out_imag = 0;
+++    float P_out_real = 0;
+++    float P_out_imag = 0;
+++    float L_out_real = 0;
+++    float L_out_imag = 0;
+++
+++    const lv_16sc_t* input_ptr = input;
+++    const lv_16sc_t* carrier_ptr = carrier;
+++
+++    const lv_16sc_t* E_code_ptr = E_code;
+++    lv_32fc_t* E_out_ptr = E_out;
+++    const lv_16sc_t* L_code_ptr = L_code;
+++    lv_32fc_t* L_out_ptr = L_out;
+++    const lv_16sc_t* P_code_ptr = P_code;
+++    lv_32fc_t* P_out_ptr = P_out;
+++
+++    // The outputs are cleared up front because the scalar tail loop accumulates into them.
+++    *E_out_ptr = 0;
+++    *P_out_ptr = 0;
+++    *L_out_ptr = 0;
+++
+++    real_E_code_acc = _mm_setzero_ps();
+++    imag_E_code_acc = _mm_setzero_ps();
+++    real_P_code_acc = _mm_setzero_ps();
+++    imag_P_code_acc = _mm_setzero_ps();
+++    real_L_code_acc = _mm_setzero_ps();
+++    imag_L_code_acc = _mm_setzero_ps();
+++
+++    if (sse_iters>0)
+++    {
+++        // Unsigned counter to match the unsigned bound sse_iters.
+++        for(unsigned int number = 0; number < sse_iters; number++){
+++
+++            //Perform the carrier wipe-off
+++            x1 = _mm_lddqu_si128((__m128i*)input_ptr);
+++            input_ptr += 4;
+++            x2 = _mm_lddqu_si128((__m128i*)input_ptr);
+++
+++            y1 = _mm_lddqu_si128((__m128i*)carrier_ptr);
+++            carrier_ptr += 4;
+++            y2 = _mm_lddqu_si128((__m128i*)carrier_ptr);
+++
+++            // De-interleave: blend mask 85 (0b01010101) takes the even 16-bit lanes from the
+++            // second operand. After the 2-byte shifts this gathers all imaginary parts in
+++            // imagx and all real parts in realx, both in the same permuted sample order.
+++            imagx = _mm_srli_si128 (x1, 2);
+++            imagx = _mm_blend_epi16 (x2, imagx, 85);
+++            realx = _mm_slli_si128 (x2, 2);
+++            realx = _mm_blend_epi16 (realx, x1, 85);
+++
+++            imagy = _mm_srli_si128 (y1, 2);
+++            imagy = _mm_blend_epi16 (y2, imagy, 85);
+++            realy = _mm_slli_si128 (y2, 2);
+++            realy = _mm_blend_epi16 (realy, y1, 85);
+++
+++            realx_mult_realy = _mm_mullo_epi16 (realx, realy);
+++            imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
+++            realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
+++            imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
+++
+++            // Complex product input * carrier, kept as separate 16-bit real/imag vectors.
+++            real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
+++            imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
+++
+++            //Get early values
+++            y1 = _mm_lddqu_si128((__m128i*)E_code_ptr);
+++            E_code_ptr += 4;
+++            y2 = _mm_lddqu_si128((__m128i*)E_code_ptr);
+++
+++            imagy = _mm_srli_si128 (y1, 2);
+++            imagy = _mm_blend_epi16 (y2, imagy, 85);
+++            realy = _mm_slli_si128 (y2, 2);
+++            realy = _mm_blend_epi16 (realy, y1, 85);
+++
+++            realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
+++            imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
+++            realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
+++            imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
+++
+++            real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
+++            imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
+++
+++            // Widen the low and high four 16-bit lanes to 32-bit integers, then to floats.
+++            real_output_i_1 = _mm_cvtepi16_epi32(real_output);
+++            real_output_ps_1 = _mm_cvtepi32_ps(real_output_i_1);
+++            real_output = _mm_srli_si128 (real_output, 8);
+++            real_output_i_2 = _mm_cvtepi16_epi32(real_output);
+++            real_output_ps_2 = _mm_cvtepi32_ps(real_output_i_2);
+++
+++            imag_output_i_1 = _mm_cvtepi16_epi32(imag_output);
+++            imag_output_ps_1 = _mm_cvtepi32_ps(imag_output_i_1);
+++            imag_output = _mm_srli_si128 (imag_output, 8);
+++            imag_output_i_2 = _mm_cvtepi16_epi32(imag_output);
+++            imag_output_ps_2 = _mm_cvtepi32_ps(imag_output_i_2);
+++
+++            real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps_1);
+++            real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps_2);
+++            imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps_1);
+++            imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps_2);
+++
+++            //Get prompt values
+++            y1 = _mm_lddqu_si128((__m128i*)P_code_ptr);
+++            P_code_ptr += 4;
+++            y2 = _mm_lddqu_si128((__m128i*)P_code_ptr);
+++
+++            imagy = _mm_srli_si128 (y1, 2);
+++            imagy = _mm_blend_epi16 (y2, imagy, 85);
+++            realy = _mm_slli_si128 (y2, 2);
+++            realy = _mm_blend_epi16 (realy, y1, 85);
+++
+++            realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
+++            imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
+++            realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
+++            imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
+++
+++            real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
+++            imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
+++
+++            real_output_i_1 = _mm_cvtepi16_epi32(real_output);
+++            real_output_ps_1 = _mm_cvtepi32_ps(real_output_i_1);
+++            real_output = _mm_srli_si128 (real_output, 8);
+++            real_output_i_2 = _mm_cvtepi16_epi32(real_output);
+++            real_output_ps_2 = _mm_cvtepi32_ps(real_output_i_2);
+++
+++            imag_output_i_1 = _mm_cvtepi16_epi32(imag_output);
+++            imag_output_ps_1 = _mm_cvtepi32_ps(imag_output_i_1);
+++            imag_output = _mm_srli_si128 (imag_output, 8);
+++            imag_output_i_2 = _mm_cvtepi16_epi32(imag_output);
+++            imag_output_ps_2 = _mm_cvtepi32_ps(imag_output_i_2);
+++
+++            real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps_1);
+++            real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps_2);
+++            imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps_1);
+++            imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps_2);
+++
+++            //Get late values
+++            y1 = _mm_lddqu_si128((__m128i*)L_code_ptr);
+++            L_code_ptr += 4;
+++            y2 = _mm_lddqu_si128((__m128i*)L_code_ptr);
+++
+++            imagy = _mm_srli_si128 (y1, 2);
+++            imagy = _mm_blend_epi16 (y2, imagy, 85);
+++            realy = _mm_slli_si128 (y2, 2);
+++            realy = _mm_blend_epi16 (realy, y1, 85);
+++
+++            realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
+++            imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
+++            realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
+++            imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
+++
+++            real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
+++            imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
+++
+++            real_output_i_1 = _mm_cvtepi16_epi32(real_output);
+++            real_output_ps_1 = _mm_cvtepi32_ps(real_output_i_1);
+++            real_output = _mm_srli_si128 (real_output, 8);
+++            real_output_i_2 = _mm_cvtepi16_epi32(real_output);
+++            real_output_ps_2 = _mm_cvtepi32_ps(real_output_i_2);
+++
+++            imag_output_i_1 = _mm_cvtepi16_epi32(imag_output);
+++            imag_output_ps_1 = _mm_cvtepi32_ps(imag_output_i_1);
+++            imag_output = _mm_srli_si128 (imag_output, 8);
+++            imag_output_i_2 = _mm_cvtepi16_epi32(imag_output);
+++            imag_output_ps_2 = _mm_cvtepi32_ps(imag_output_i_2);
+++
+++            real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps_1);
+++            real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps_2);
+++            imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps_1);
+++            imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps_2);
+++
+++            // Second half of the 8-sample step (the first +4 was applied between the two loads).
+++            input_ptr += 4;
+++            carrier_ptr += 4;
+++            E_code_ptr += 4;
+++            L_code_ptr += 4;
+++            P_code_ptr += 4;
+++        }
+++
+++        __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
+++        __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
+++        __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
+++        __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
+++        __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
+++        __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
+++
+++        _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector
+++        _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector
+++        _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector
+++        _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector
+++        _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector
+++        _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector
+++
+++        // Horizontal sum of the four float lanes of every accumulator.
+++        for (int i = 0; i<4; ++i)
+++        {
+++            E_out_real += real_E_dotProductVector[i];
+++            E_out_imag += imag_E_dotProductVector[i];
+++            P_out_real += real_P_dotProductVector[i];
+++            P_out_imag += imag_P_dotProductVector[i];
+++            L_out_real += real_L_dotProductVector[i];
+++            L_out_imag += imag_L_dotProductVector[i];
+++        }
+++        *E_out_ptr = lv_cmake(E_out_real, E_out_imag);
+++        *P_out_ptr = lv_cmake(P_out_real, P_out_imag);
+++        *L_out_ptr = lv_cmake(L_out_real, L_out_imag);
+++    }
+++
+++    // Scalar tail: the num_points % 8 samples the SIMD loop did not consume.
+++    lv_16sc_t bb_signal_sample;
+++    for(unsigned int i = 0; i < num_points%8; ++i)
+++    {
+++        //Perform the carrier wipe-off
+++        bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
+++        // Now get early, late, and prompt values for each
+++        *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
+++        *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
+++        *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
+++    }
+++}
+++#endif /* LV_HAVE_SSE4_1 */
+++
+++#ifdef LV_HAVE_SSE4_1
+++#include "smmintrin.h"
+++/*!
+++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation
+++ \param input The input signal input
+++ \param carrier The carrier signal input
+++ \param E_code Early PRN code replica input
+++ \param P_code Prompt PRN code replica input
+++ \param L_code Late PRN code replica input
+++ \param E_out Early correlation output
+++ \param P_out Prompt correlation output
+++ \param L_out Late correlation output
+++ \param num_points The number of complex values in vectors
+++ */
+++static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_u_sse4_1_third(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points)
+++{
+++    const unsigned int sse_iters = num_points / 8;
+++    unsigned int index = 0;
+++    unsigned int indexPlus4 = 0;
+++
+++    __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample;
+++    __m128i realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, real_output_i32, imag_output_i32;
+++
+++    __m128 real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc;
+++    __m128i real_output_i_1, real_output_i_2, imag_output_i_1, imag_output_i_2;
+++    __m128 real_output_ps, imag_output_ps;
+++
+++    float E_out_real = 0;
+++    float E_out_imag = 0;
+++    float P_out_real = 0;
+++    float P_out_imag = 0;
+++    float L_out_real = 0;
+++    float L_out_imag = 0;
+++
+++    const lv_16sc_t* input_ptr = input;
+++    const lv_16sc_t* carrier_ptr = carrier;
+++
+++ 
const lv_16sc_t* E_code_ptr = E_code; +++ lv_32fc_t* E_out_ptr = E_out; +++ const lv_16sc_t* L_code_ptr = L_code; +++ lv_32fc_t* L_out_ptr = L_out; +++ const lv_16sc_t* P_code_ptr = P_code; +++ lv_32fc_t* P_out_ptr = P_out; +++ +++ *E_out_ptr = 0; +++ *P_out_ptr = 0; +++ *L_out_ptr = 0; +++ +++ real_E_code_acc = _mm_setzero_ps(); +++ imag_E_code_acc = _mm_setzero_ps(); +++ real_P_code_acc = _mm_setzero_ps(); +++ imag_P_code_acc = _mm_setzero_ps(); +++ real_L_code_acc = _mm_setzero_ps(); +++ imag_L_code_acc = _mm_setzero_ps(); +++ +++ if (sse_iters>0) +++ { +++ for(index = 0;index < 8*sse_iters; index+=8){ +++ indexPlus4 = index + 4; +++ //Perform the carrier wipe-off +++ x1 = _mm_lddqu_si128((__m128i*)&input_ptr[index]); +++ x2 = _mm_lddqu_si128((__m128i*)&input_ptr[indexPlus4]); +++ +++ y1 = _mm_lddqu_si128((__m128i*)&carrier_ptr[index]); +++ y2 = _mm_lddqu_si128((__m128i*)&carrier_ptr[indexPlus4]); +++ +++ imagx = _mm_srli_si128 (x1, 2); +++ imagx = _mm_blend_epi16 (x2, imagx, 85); +++ realx = _mm_slli_si128 (x2, 2); +++ realx = _mm_blend_epi16 (realx, x1, 85); +++ +++ imagy = _mm_srli_si128 (y1, 2); +++ imagy = _mm_blend_epi16 (y2, imagy, 85); +++ realy = _mm_slli_si128 (y2, 2); +++ realy = _mm_blend_epi16 (realy, y1, 85); +++ +++ realx_mult_realy = _mm_mullo_epi16 (realx, realy); +++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); +++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); +++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); +++ +++ real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); +++ imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); +++ +++ //Get early values +++ y1 = _mm_lddqu_si128((__m128i*)&E_code_ptr[index]); +++ y2 = _mm_lddqu_si128((__m128i*)&E_code_ptr[indexPlus4]); +++ +++ imagy = _mm_srli_si128 (y1, 2); +++ imagy = _mm_blend_epi16 (y2, imagy, 85); +++ realy = _mm_slli_si128 (y2, 2); +++ realy = _mm_blend_epi16 (realy, y1, 85); +++ +++ realx_mult_realy = _mm_mullo_epi16 
(real_bb_signal_sample, realy); +++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); +++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); +++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); +++ +++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); +++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); +++ +++ real_output_i_1 = _mm_cvtepi16_epi32(real_output); +++ real_output = _mm_srli_si128 (real_output, 8); +++ real_output_i_2 = _mm_cvtepi16_epi32(real_output); +++ real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); +++ real_output_ps = _mm_cvtepi32_ps(real_output_i32); +++ +++ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); +++ imag_output = _mm_srli_si128 (imag_output, 8); +++ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); +++ imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); +++ imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); +++ +++ real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps); +++ imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps); +++ +++ //Get prompt values +++ y1 = _mm_lddqu_si128((__m128i*)&P_code_ptr[index]); +++ y2 = _mm_lddqu_si128((__m128i*)&P_code_ptr[indexPlus4]); +++ +++ imagy = _mm_srli_si128 (y1, 2); +++ imagy = _mm_blend_epi16 (y2, imagy, 85); +++ realy = _mm_slli_si128 (y2, 2); +++ realy = _mm_blend_epi16 (realy, y1, 85); +++ +++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); +++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); +++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); +++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); +++ +++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); +++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); +++ +++ real_output_i_1 = _mm_cvtepi16_epi32(real_output); +++ real_output = _mm_srli_si128 (real_output, 8); +++ real_output_i_2 = 
_mm_cvtepi16_epi32(real_output); +++ real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); +++ real_output_ps = _mm_cvtepi32_ps(real_output_i32); +++ +++ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); +++ imag_output = _mm_srli_si128 (imag_output, 8); +++ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); +++ imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); +++ imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); +++ +++ real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps); +++ imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps); +++ +++ //Get late values +++ y1 = _mm_lddqu_si128((__m128i*)&L_code_ptr[index]); +++ y2 = _mm_lddqu_si128((__m128i*)&L_code_ptr[indexPlus4]); +++ +++ imagy = _mm_srli_si128 (y1, 2); +++ imagy = _mm_blend_epi16 (y2, imagy, 85); +++ realy = _mm_slli_si128 (y2, 2); +++ realy = _mm_blend_epi16 (realy, y1, 85); +++ +++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); +++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); +++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); +++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); +++ +++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); +++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); +++ +++ real_output_i_1 = _mm_cvtepi16_epi32(real_output); +++ real_output = _mm_srli_si128 (real_output, 8); +++ real_output_i_2 = _mm_cvtepi16_epi32(real_output); +++ real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); +++ real_output_ps = _mm_cvtepi32_ps(real_output_i32); +++ +++ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); +++ imag_output = _mm_srli_si128 (imag_output, 8); +++ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); +++ imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); +++ imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); +++ +++ real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps); +++ 
imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps); +++ } +++ +++ __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4]; +++ +++ _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector +++ _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector +++ _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector +++ _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector +++ _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector +++ _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector +++ +++ for (int i = 0; i<4; ++i) +++ { +++ E_out_real += real_E_dotProductVector[i]; +++ E_out_imag += imag_E_dotProductVector[i]; +++ P_out_real += real_P_dotProductVector[i]; +++ P_out_imag += imag_P_dotProductVector[i]; +++ L_out_real += real_L_dotProductVector[i]; +++ L_out_imag += imag_L_dotProductVector[i]; +++ } +++ *E_out_ptr = lv_cmake(E_out_real, E_out_imag); +++ *P_out_ptr = lv_cmake(P_out_real, P_out_imag); +++ *L_out_ptr = lv_cmake(L_out_real, L_out_imag); +++ } +++ +++ lv_16sc_t bb_signal_sample; +++ for(; index < num_points; index++) +++ { +++ //Perform the carrier wipe-off +++ bb_signal_sample = input_ptr[index] * carrier_ptr[index]; +++ // Now get early, late, and prompt values for each +++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * E_code_ptr[index]); +++ *P_out_ptr += (lv_32fc_t) 
(bb_signal_sample * P_code_ptr[index]); +++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * L_code_ptr[index]); +++ } +++} +++#endif /* LV_HAVE_SSE4_1 */ +++ +++#ifdef LV_HAVE_SSE4_1 +++#include "smmintrin.h" +++/*! +++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation +++ \param input The input signal input +++ \param carrier The carrier signal input +++ \param E_code Early PRN code replica input +++ \param P_code Early PRN code replica input +++ \param L_code Early PRN code replica input +++ \param E_out Early correlation output +++ \param P_out Early correlation output +++ \param L_out Early correlation output +++ \param num_points The number of complex values in vectors +++ */ +++static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_u_sse4_1_fourth(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points) +++{ +++ const unsigned int sse_iters = num_points / 8; +++ +++ __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample; +++ __m128i realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, real_output_i32, imag_output_i32; +++ +++ __m128 real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc; +++ __m128i real_output_i_1, real_output_i_2, imag_output_i_1, imag_output_i_2; +++ __m128 real_output_ps, imag_output_ps; +++ +++ float E_out_real = 0; +++ float E_out_imag = 0; +++ float P_out_real = 0; +++ float P_out_imag = 0; +++ float L_out_real = 0; +++ float L_out_imag = 0; +++ +++ const lv_16sc_t* input_ptr = input; +++ const lv_16sc_t* carrier_ptr = carrier; +++ +++ const lv_16sc_t* E_code_ptr = E_code; +++ lv_32fc_t* E_out_ptr = E_out; +++ const lv_16sc_t* L_code_ptr = L_code; +++ lv_32fc_t* L_out_ptr = L_out; +++ const lv_16sc_t* P_code_ptr = 
P_code; +++ lv_32fc_t* P_out_ptr = P_out; +++ +++ *E_out_ptr = 0; +++ *P_out_ptr = 0; +++ *L_out_ptr = 0; +++ +++ real_E_code_acc = _mm_setzero_ps(); +++ imag_E_code_acc = _mm_setzero_ps(); +++ real_P_code_acc = _mm_setzero_ps(); +++ imag_P_code_acc = _mm_setzero_ps(); +++ real_L_code_acc = _mm_setzero_ps(); +++ imag_L_code_acc = _mm_setzero_ps(); +++ +++ if (sse_iters>0) +++ { +++ for(int number = 0;number < sse_iters; number++){ +++ +++ //Perform the carrier wipe-off +++ x1 = _mm_lddqu_si128((__m128i*)input_ptr); +++ input_ptr += 4; +++ x2 = _mm_lddqu_si128((__m128i*)input_ptr); +++ +++ y1 = _mm_lddqu_si128((__m128i*)carrier_ptr); +++ carrier_ptr += 4; +++ y2 = _mm_lddqu_si128((__m128i*)carrier_ptr); +++ +++ imagx = _mm_srli_si128 (x1, 2); +++ imagx = _mm_blend_epi16 (x2, imagx, 85); +++ realx = _mm_slli_si128 (x2, 2); +++ realx = _mm_blend_epi16 (realx, x1, 85); +++ +++ imagy = _mm_srli_si128 (y1, 2); +++ imagy = _mm_blend_epi16 (y2, imagy, 85); +++ realy = _mm_slli_si128 (y2, 2); +++ realy = _mm_blend_epi16 (realy, y1, 85); +++ +++ realx_mult_realy = _mm_mullo_epi16 (realx, realy); +++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); +++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); +++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); +++ +++ real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); +++ imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); +++ +++ //Get early values +++ y1 = _mm_lddqu_si128((__m128i*)E_code_ptr); +++ E_code_ptr += 4; +++ y2 = _mm_lddqu_si128((__m128i*)E_code_ptr); +++ +++ imagy = _mm_srli_si128 (y1, 2); +++ imagy = _mm_blend_epi16 (y2, imagy, 85); +++ realy = _mm_slli_si128 (y2, 2); +++ realy = _mm_blend_epi16 (realy, y1, 85); +++ +++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); +++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); +++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); +++ imagx_mult_realy = _mm_mullo_epi16 
(imag_bb_signal_sample, realy); +++ +++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); +++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); +++ +++ real_output_i_1 = _mm_cvtepi16_epi32(real_output); +++ real_output = _mm_srli_si128 (real_output, 8); +++ real_output_i_2 = _mm_cvtepi16_epi32(real_output); +++ real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); +++ real_output_ps = _mm_cvtepi32_ps(real_output_i32); +++ +++ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); +++ imag_output = _mm_srli_si128 (imag_output, 8); +++ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); +++ imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); +++ imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); +++ +++ real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps); +++ imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps); +++ +++ //Get prompt values +++ y1 = _mm_lddqu_si128((__m128i*)P_code_ptr); +++ P_code_ptr += 4; +++ y2 = _mm_lddqu_si128((__m128i*)P_code_ptr); +++ +++ imagy = _mm_srli_si128 (y1, 2); +++ imagy = _mm_blend_epi16 (y2, imagy, 85); +++ realy = _mm_slli_si128 (y2, 2); +++ realy = _mm_blend_epi16 (realy, y1, 85); +++ +++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); +++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); +++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); +++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); +++ +++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); +++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); +++ +++ real_output_i_1 = _mm_cvtepi16_epi32(real_output); +++ real_output = _mm_srli_si128 (real_output, 8); +++ real_output_i_2 = _mm_cvtepi16_epi32(real_output); +++ real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); +++ real_output_ps = _mm_cvtepi32_ps(real_output_i32); +++ +++ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); +++ 
imag_output = _mm_srli_si128 (imag_output, 8); +++ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); +++ imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); +++ imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); +++ +++ real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps); +++ imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps); +++ +++ //Get late values +++ y1 = _mm_lddqu_si128((__m128i*)L_code_ptr); +++ L_code_ptr += 4; +++ y2 = _mm_lddqu_si128((__m128i*)L_code_ptr); +++ +++ imagy = _mm_srli_si128 (y1, 2); +++ imagy = _mm_blend_epi16 (y2, imagy, 85); +++ realy = _mm_slli_si128 (y2, 2); +++ realy = _mm_blend_epi16 (realy, y1, 85); +++ +++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); +++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); +++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); +++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); +++ +++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); +++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); +++ +++ real_output_i_1 = _mm_cvtepi16_epi32(real_output); +++ real_output = _mm_srli_si128 (real_output, 8); +++ real_output_i_2 = _mm_cvtepi16_epi32(real_output); +++ real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); +++ real_output_ps = _mm_cvtepi32_ps(real_output_i32); +++ +++ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); +++ imag_output = _mm_srli_si128 (imag_output, 8); +++ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); +++ imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); +++ imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); +++ +++ real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps); +++ imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps); +++ +++ input_ptr += 4; +++ carrier_ptr += 4; +++ E_code_ptr += 4; +++ L_code_ptr += 4; +++ P_code_ptr += 4; +++ } +++ +++ __VOLK_ATTR_ALIGNED(16) float 
real_E_dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4]; +++ +++ _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector +++ _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector +++ _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector +++ _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector +++ _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector +++ _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector +++ +++ for (int i = 0; i<4; ++i) +++ { +++ E_out_real += real_E_dotProductVector[i]; +++ E_out_imag += imag_E_dotProductVector[i]; +++ P_out_real += real_P_dotProductVector[i]; +++ P_out_imag += imag_P_dotProductVector[i]; +++ L_out_real += real_L_dotProductVector[i]; +++ L_out_imag += imag_L_dotProductVector[i]; +++ } +++ *E_out_ptr = lv_cmake(E_out_real, E_out_imag); +++ *P_out_ptr = lv_cmake(P_out_real, P_out_imag); +++ *L_out_ptr = lv_cmake(L_out_real, L_out_imag); +++ } +++ +++ lv_16sc_t bb_signal_sample; +++ for(int i=0; i < num_points%8; ++i) +++ { +++ //Perform the carrier wipe-off +++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); +++ // Now get early, late, and prompt values for each +++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); +++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); +++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); +++ } +++} 
+++#endif /* LV_HAVE_SSE4_1 */ +++ +++#ifdef LV_HAVE_SSE4_1 +++#include "smmintrin.h" +++#include "CommonMacros/CommonMacros.h" +++/*! +++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation +++ \param input The input signal input +++ \param carrier The carrier signal input +++ \param E_code Early PRN code replica input +++ \param P_code Early PRN code replica input +++ \param L_code Early PRN code replica input +++ \param E_out Early correlation output +++ \param P_out Early correlation output +++ \param L_out Early correlation output +++ \param num_points The number of complex values in vectors +++ */ +++ +++static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_u_sse4_1_fifth(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points) +++{ +++ const unsigned int sse_iters = num_points / 8; +++ +++ __m128i realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy; +++ __m128i input_i_1, input_i_2, output_i32; +++ +++ __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample; +++ __m128i realx, imagx, realy, imagy, real_output, imag_output; +++ +++ __m128 real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc; +++ __m128 real_output_ps, imag_output_ps; +++ +++ float E_out_real = 0; +++ float E_out_imag = 0; +++ float P_out_real = 0; +++ float P_out_imag = 0; +++ float L_out_real = 0; +++ float L_out_imag = 0; +++ +++ const lv_16sc_t* input_ptr = input; +++ const lv_16sc_t* carrier_ptr = carrier; +++ +++ const lv_16sc_t* E_code_ptr = E_code; +++ lv_32fc_t* E_out_ptr = E_out; +++ const lv_16sc_t* L_code_ptr = L_code; +++ lv_32fc_t* L_out_ptr = L_out; +++ const lv_16sc_t* P_code_ptr = P_code; +++ lv_32fc_t* P_out_ptr = P_out; +++ +++ *E_out_ptr = 0; +++ *P_out_ptr = 0; +++ *L_out_ptr = 0; +++ +++ real_E_code_acc 
= _mm_setzero_ps(); +++ imag_E_code_acc = _mm_setzero_ps(); +++ real_P_code_acc = _mm_setzero_ps(); +++ imag_P_code_acc = _mm_setzero_ps(); +++ real_L_code_acc = _mm_setzero_ps(); +++ imag_L_code_acc = _mm_setzero_ps(); +++ +++ if (sse_iters>0) +++ { +++ for(int number = 0;number < sse_iters; number++){ +++ +++ //Perform the carrier wipe-off +++ x1 = _mm_lddqu_si128((__m128i*)input_ptr); +++ input_ptr += 4; +++ x2 = _mm_lddqu_si128((__m128i*)input_ptr); +++ +++ y1 = _mm_lddqu_si128((__m128i*)carrier_ptr); +++ carrier_ptr += 4; +++ y2 = _mm_lddqu_si128((__m128i*)carrier_ptr); +++ +++ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(x1, x2, realx, imagx) +++ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy) +++ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample) +++ +++ //Get early values +++ y1 = _mm_lddqu_si128((__m128i*)E_code_ptr); +++ E_code_ptr += 4; +++ y2 = _mm_lddqu_si128((__m128i*)E_code_ptr); +++ +++ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy) +++ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(real_bb_signal_sample, imag_bb_signal_sample, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output) +++ +++ CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(real_output, input_i_1, input_i_2, output_i32, real_output_ps) +++ CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(imag_output, input_i_1, input_i_2, output_i32, imag_output_ps) +++ +++ real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps); +++ imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps); +++ +++ //Get prompt values +++ y1 = _mm_lddqu_si128((__m128i*)P_code_ptr); +++ P_code_ptr += 4; +++ y2 = _mm_lddqu_si128((__m128i*)P_code_ptr); +++ +++ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy) +++ 
CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(real_bb_signal_sample, imag_bb_signal_sample, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output) +++ +++ CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(real_output, input_i_1, input_i_2, output_i32, real_output_ps) +++ CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(imag_output, input_i_1, input_i_2, output_i32, imag_output_ps) +++ +++ real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps); +++ imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps); +++ +++ //Get late values +++ y1 = _mm_lddqu_si128((__m128i*)L_code_ptr); +++ L_code_ptr += 4; +++ y2 = _mm_lddqu_si128((__m128i*)L_code_ptr); +++ +++ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy) +++ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(real_bb_signal_sample, imag_bb_signal_sample, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output) +++ +++ CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(real_output, input_i_1, input_i_2, output_i32, real_output_ps) +++ CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(imag_output, input_i_1, input_i_2, output_i32, imag_output_ps) +++ +++ real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps); +++ imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps); +++ +++ input_ptr += 4; +++ carrier_ptr += 4; +++ E_code_ptr += 4; +++ L_code_ptr += 4; +++ P_code_ptr += 4; +++ } +++ +++ __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4]; +++ +++ _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector +++ 
_mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector +++ _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector +++ _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector +++ _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector +++ _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector +++ +++ for (int i = 0; i<4; ++i) +++ { +++ E_out_real += real_E_dotProductVector[i]; +++ E_out_imag += imag_E_dotProductVector[i]; +++ P_out_real += real_P_dotProductVector[i]; +++ P_out_imag += imag_P_dotProductVector[i]; +++ L_out_real += real_L_dotProductVector[i]; +++ L_out_imag += imag_L_dotProductVector[i]; +++ } +++ *E_out_ptr = lv_cmake(E_out_real, E_out_imag); +++ *P_out_ptr = lv_cmake(P_out_real, P_out_imag); +++ *L_out_ptr = lv_cmake(L_out_real, L_out_imag); +++ } +++ +++ lv_16sc_t bb_signal_sample; +++ for(int i=0; i < num_points%8; ++i) +++ { +++ //Perform the carrier wipe-off +++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); +++ // Now get early, late, and prompt values for each +++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); +++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); +++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); +++ } +++} +++#endif /* LV_HAVE_SSE4_1 */ +++ +++#ifdef LV_HAVE_SSE4_1 +++#include "smmintrin.h" +++#include "CommonMacros/CommonMacros_16ic_cw_epl_corr_32fc.h" +++#include "CommonMacros/CommonMacros.h" +++/*! 
+++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation +++ \param input The input signal input +++ \param carrier The carrier signal input +++ \param E_code Early PRN code replica input +++ \param P_code Early PRN code replica input +++ \param L_code Early PRN code replica input +++ \param E_out Early correlation output +++ \param P_out Early correlation output +++ \param L_out Early correlation output +++ \param num_points The number of complex values in vectors +++ */ +++ +++static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_u_sse4_1_sixth(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points) +++{ +++ const unsigned int sse_iters = num_points / 8; +++ +++ __m128i realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy; +++ __m128i input_i_1, input_i_2, output_i32; +++ +++ __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample; +++ __m128i realx, imagx, realy, imagy, real_output, imag_output; +++ +++ __m128 real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc; +++ __m128 real_output_ps, imag_output_ps; +++ +++ float E_out_real = 0; +++ float E_out_imag = 0; +++ float P_out_real = 0; +++ float P_out_imag = 0; +++ float L_out_real = 0; +++ float L_out_imag = 0; +++ +++ const lv_16sc_t* input_ptr = input; +++ const lv_16sc_t* carrier_ptr = carrier; +++ +++ const lv_16sc_t* E_code_ptr = E_code; +++ lv_32fc_t* E_out_ptr = E_out; +++ const lv_16sc_t* L_code_ptr = L_code; +++ lv_32fc_t* L_out_ptr = L_out; +++ const lv_16sc_t* P_code_ptr = P_code; +++ lv_32fc_t* P_out_ptr = P_out; +++ +++ *E_out_ptr = 0; +++ *P_out_ptr = 0; +++ *L_out_ptr = 0; +++ +++ real_E_code_acc = _mm_setzero_ps(); +++ imag_E_code_acc = _mm_setzero_ps(); +++ real_P_code_acc = _mm_setzero_ps(); +++ imag_P_code_acc = 
_mm_setzero_ps(); +++ real_L_code_acc = _mm_setzero_ps(); +++ imag_L_code_acc = _mm_setzero_ps(); +++ +++ if (sse_iters>0) +++ { +++ for(int number = 0;number < sse_iters; number++){ +++ +++ //Perform the carrier wipe-off +++ x1 = _mm_lddqu_si128((__m128i*)input_ptr); +++ input_ptr += 4; +++ x2 = _mm_lddqu_si128((__m128i*)input_ptr); +++ +++ y1 = _mm_lddqu_si128((__m128i*)carrier_ptr); +++ carrier_ptr += 4; +++ y2 = _mm_lddqu_si128((__m128i*)carrier_ptr); +++ +++ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(x1, x2, realx, imagx) +++ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy) +++ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample) +++ +++ //Get early values +++ y1 = _mm_lddqu_si128((__m128i*)E_code_ptr); +++ E_code_ptr += 4; +++ y2 = _mm_lddqu_si128((__m128i*)E_code_ptr); +++ +++ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) +++ +++ real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps); +++ imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps); +++ +++ //Get prompt values +++ y1 = _mm_lddqu_si128((__m128i*)P_code_ptr); +++ P_code_ptr += 4; +++ y2 = _mm_lddqu_si128((__m128i*)P_code_ptr); +++ +++ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) +++ +++ real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps); +++ imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps); +++ +++ //Get late values +++ y1 = 
_mm_lddqu_si128((__m128i*)L_code_ptr); +++ L_code_ptr += 4; +++ y2 = _mm_lddqu_si128((__m128i*)L_code_ptr); +++ +++ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) +++ +++ real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps); +++ imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps); +++ +++ input_ptr += 4; +++ carrier_ptr += 4; +++ E_code_ptr += 4; +++ L_code_ptr += 4; +++ P_code_ptr += 4; +++ } +++ +++ __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4]; +++ +++ _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector +++ _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector +++ _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector +++ _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector +++ _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector +++ _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector +++ +++ for (int i = 0; i<4; ++i) +++ { +++ E_out_real += real_E_dotProductVector[i]; +++ E_out_imag += imag_E_dotProductVector[i]; +++ P_out_real += real_P_dotProductVector[i]; +++ P_out_imag += imag_P_dotProductVector[i]; +++ L_out_real += 
real_L_dotProductVector[i]; +++ L_out_imag += imag_L_dotProductVector[i]; +++ } +++ *E_out_ptr = lv_cmake(E_out_real, E_out_imag); +++ *P_out_ptr = lv_cmake(P_out_real, P_out_imag); +++ *L_out_ptr = lv_cmake(L_out_real, L_out_imag); +++ } +++ +++ lv_16sc_t bb_signal_sample; +++ for(int i=0; i < num_points%8; ++i) +++ { +++ //Perform the carrier wipe-off +++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); +++ // Now get early, late, and prompt values for each +++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); +++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); +++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); +++ } +++} +++#endif /* LV_HAVE_SSE4_1 */ +++ +++#ifdef LV_HAVE_GENERIC +++/*! +++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation +++ \param input The input signal input +++ \param carrier The carrier signal input +++ \param E_code Early PRN code replica input +++ \param P_code Prompt PRN code replica input +++ \param L_code Late PRN code replica input +++ \param E_out Early correlation output +++ \param P_out Prompt correlation output +++ \param L_out Late correlation output +++ \param num_points The number of complex values in vectors +++ */ +++static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_generic(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points) +++{ +++ lv_16sc_t bb_signal_sample; +++ lv_16sc_t tmp1; +++ lv_16sc_t tmp2; +++ lv_16sc_t tmp3; +++ +++ bb_signal_sample = lv_cmake(0, 0); +++ +++ *E_out = 0; +++ *P_out = 0; +++ *L_out = 0; +++ // perform Early, Prompt and Late correlation +++ +++ for(int i=0; i < num_points; ++i) +++ { +++ //Perform the carrier wipe-off +++ bb_signal_sample = input[i] * carrier[i]; +++ +++ tmp1 = bb_signal_sample * E_code[i]; +++ tmp2 = bb_signal_sample *
P_code[i]; +++ tmp3 = bb_signal_sample * L_code[i]; +++ +++ // Accumulate the early, prompt and late products into the float32 complex outputs +++ *E_out += (lv_32fc_t)tmp1; +++ *P_out += (lv_32fc_t)tmp2; +++ *L_out += (lv_32fc_t)tmp3; +++ } +++} +++#endif /* LV_HAVE_GENERIC */ +++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_u_H */ +++ +++ +++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_a_H +++#define INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_a_H +++ +++#include +++#include +++#include +++#include +++#include +++// +++//#ifdef LV_HAVE_SSE4_1 +++//#include "smmintrin.h" +++///*! +++// \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation +++// \param input The input signal input +++// \param carrier The carrier signal input +++// \param E_code Early PRN code replica input +++// \param P_code Prompt PRN code replica input +++// \param L_code Late PRN code replica input +++// \param E_out Early correlation output +++// \param P_out Prompt correlation output +++// \param L_out Late correlation output +++// \param num_points The number of complex values in vectors +++// */ +++//static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_a_sse4_1(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points) +++//{ +++// const unsigned int sse_iters = num_points / 8; +++// +++// __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample; +++// __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output; +++// +++// __m128 real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc; +++// __m128i real_output_i_1, real_output_i_2, imag_output_i_1, imag_output_i_2; +++// __m128 real_output_ps_1, real_output_ps_2,
imag_output_ps_1, imag_output_ps_2; +++// +++// float E_out_real = 0; +++// float E_out_imag = 0; +++// float P_out_real = 0; +++// float P_out_imag = 0; +++// float L_out_real = 0; +++// float L_out_imag = 0; +++// +++// const lv_16sc_t* input_ptr = input; +++// const lv_16sc_t* carrier_ptr = carrier; +++// +++// const lv_16sc_t* E_code_ptr = E_code; +++// lv_32fc_t* E_out_ptr = E_out; +++// const lv_16sc_t* L_code_ptr = L_code; +++// lv_32fc_t* L_out_ptr = L_out; +++// const lv_16sc_t* P_code_ptr = P_code; +++// lv_32fc_t* P_out_ptr = P_out; +++// +++// *E_out_ptr = 0; +++// *P_out_ptr = 0; +++// *L_out_ptr = 0; +++// +++// mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); +++// +++// real_E_code_acc = _mm_setzero_ps(); +++// imag_E_code_acc = _mm_setzero_ps(); +++// real_P_code_acc = _mm_setzero_ps(); +++// imag_P_code_acc = _mm_setzero_ps(); +++// real_L_code_acc = _mm_setzero_ps(); +++// imag_L_code_acc = _mm_setzero_ps(); +++// +++// if (sse_iters>0) +++// { +++// for(int number = 0;number < sse_iters; number++){ +++// +++// //Perform the carrier wipe-off +++// x1 = _mm_lddqu_si128((__m128i*)input_ptr); +++// input_ptr += 4; +++// x2 = _mm_lddqu_si128((__m128i*)input_ptr); +++// +++// y1 = _mm_lddqu_si128((__m128i*)carrier_ptr); +++// carrier_ptr += 4; +++// y2 = _mm_lddqu_si128((__m128i*)carrier_ptr); +++// +++// imagx = _mm_srli_si128 (x1, 2); +++// imagx = _mm_blend_epi16 (x2, imagx, 85); +++// realx = _mm_slli_si128 (x2, 2); +++// realx = _mm_blend_epi16 (realx, x1, 85); +++// +++// imagy = _mm_srli_si128 (y1, 2); +++// imagy = _mm_blend_epi16 (y2, imagy, 85); +++// realy = _mm_slli_si128 (y2, 2); +++// realy = _mm_blend_epi16 (realy, y1, 85); +++// +++// realx_mult_realy = _mm_mullo_epi16 (realx, realy); +++// imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); +++// realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); +++// imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); +++// +++// real_bb_signal_sample = 
_mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); +++// imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); +++// +++// //Get early values +++// y1 = _mm_lddqu_si128((__m128i*)E_code_ptr); +++// E_code_ptr += 4; +++// y2 = _mm_lddqu_si128((__m128i*)E_code_ptr); +++// +++// imagy = _mm_srli_si128 (y1, 2); +++// imagy = _mm_blend_epi16 (y2, imagy, 85); +++// realy = _mm_slli_si128 (y2, 2); +++// realy = _mm_blend_epi16 (realy, y1, 85); +++// +++// realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); +++// imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); +++// realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); +++// imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); +++// +++// real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); +++// imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); +++// +++// real_output_i_1 = _mm_cvtepi16_epi32(real_output); +++// real_output_ps_1 = _mm_cvtepi32_ps(real_output_i_1); +++// real_output = _mm_srli_si128 (real_output, 8); +++// real_output_i_2 = _mm_cvtepi16_epi32(real_output); +++// real_output_ps_2 = _mm_cvtepi32_ps(real_output_i_2); +++// +++// imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); +++// imag_output_ps_1 = _mm_cvtepi32_ps(imag_output_i_1); +++// imag_output = _mm_srli_si128 (imag_output, 8); +++// imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); +++// imag_output_ps_2 = _mm_cvtepi32_ps(imag_output_i_2); +++// +++// real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps_1); +++// real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps_2); +++// imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps_1); +++// imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps_2); +++// +++// //Get prompt values +++// y1 = _mm_lddqu_si128((__m128i*)P_code_ptr); +++// P_code_ptr += 4; +++// y2 = _mm_lddqu_si128((__m128i*)P_code_ptr); +++// +++// imagy = _mm_srli_si128 (y1, 2); +++// imagy 
= _mm_blend_epi16 (y2, imagy, 85); +++// realy = _mm_slli_si128 (y2, 2); +++// realy = _mm_blend_epi16 (realy, y1, 85); +++// +++// realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); +++// imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); +++// realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); +++// imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); +++// +++// real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); +++// imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); +++// +++// real_output_i_1 = _mm_cvtepi16_epi32(real_output); +++// real_output_ps_1 = _mm_cvtepi32_ps(real_output_i_1); +++// real_output = _mm_srli_si128 (real_output, 8); +++// real_output_i_2 = _mm_cvtepi16_epi32(real_output); +++// real_output_ps_2 = _mm_cvtepi32_ps(real_output_i_2); +++// +++// imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); +++// imag_output_ps_1 = _mm_cvtepi32_ps(imag_output_i_1); +++// imag_output = _mm_srli_si128 (imag_output, 8); +++// imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); +++// imag_output_ps_2 = _mm_cvtepi32_ps(imag_output_i_2); +++// +++// real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps_1); +++// real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps_2); +++// imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps_1); +++// imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps_2); +++// +++// //Get late values +++// y1 = _mm_lddqu_si128((__m128i*)L_code_ptr); +++// L_code_ptr += 4; +++// y2 = _mm_lddqu_si128((__m128i*)L_code_ptr); +++// +++// imagy = _mm_srli_si128 (y1, 2); +++// imagy = _mm_blend_epi16 (y2, imagy, 85); +++// realy = _mm_slli_si128 (y2, 2); +++// realy = _mm_blend_epi16 (realy, y1, 85); +++// +++// realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); +++// imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); +++// realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); 
+++// imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); +++// +++// real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); +++// imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); +++// +++// real_output_i_1 = _mm_cvtepi16_epi32(real_output); +++// real_output_ps_1 = _mm_cvtepi32_ps(real_output_i_1); +++// real_output = _mm_srli_si128 (real_output, 8); +++// real_output_i_2 = _mm_cvtepi16_epi32(real_output); +++// real_output_ps_2 = _mm_cvtepi32_ps(real_output_i_2); +++// +++// imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); +++// imag_output_ps_1 = _mm_cvtepi32_ps(imag_output_i_1); +++// imag_output = _mm_srli_si128 (imag_output, 8); +++// imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); +++// imag_output_ps_2 = _mm_cvtepi32_ps(imag_output_i_2); +++// +++// real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps_1); +++// real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps_2); +++// imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps_1); +++// imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps_2); +++// +++// input_ptr += 4; +++// carrier_ptr += 4; +++// E_code_ptr += 4; +++// L_code_ptr += 4; +++// P_code_ptr += 4; +++// } +++// +++// __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; +++// __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; +++// __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; +++// __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; +++// __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; +++// __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4]; +++// +++// _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector +++// _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector +++// _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector 
+++// _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector +++// _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector +++// _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector +++// +++// for (int i = 0; i<4; ++i) +++// { +++// E_out_real += real_E_dotProductVector[i]; +++// E_out_imag += imag_E_dotProductVector[i]; +++// P_out_real += real_P_dotProductVector[i]; +++// P_out_imag += imag_P_dotProductVector[i]; +++// L_out_real += real_L_dotProductVector[i]; +++// L_out_imag += imag_L_dotProductVector[i]; +++// } +++// *E_out_ptr = lv_cmake(E_out_real, E_out_imag); +++// *P_out_ptr = lv_cmake(P_out_real, P_out_imag); +++// *L_out_ptr = lv_cmake(L_out_real, L_out_imag); +++// } +++// +++// lv_16sc_t bb_signal_sample; +++// for(int i=0; i < num_points%8; ++i) +++// { +++// //Perform the carrier wipe-off +++// bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); +++// // Now get early, late, and prompt values for each +++// *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); +++// *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); +++// *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); +++// } +++//} +++//#endif /* LV_HAVE_SSE4_1 */ +++// +++#ifdef LV_HAVE_GENERIC +++/*! 
+++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation +++ \param input The input signal input +++ \param carrier The carrier signal input +++ \param E_code Early PRN code replica input +++ \param P_code Prompt PRN code replica input +++ \param L_code Late PRN code replica input +++ \param E_out Early correlation output +++ \param P_out Prompt correlation output +++ \param L_out Late correlation output +++ \param num_points The number of complex values in vectors +++ */ +++static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_a_generic(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points) +++{ +++ lv_16sc_t bb_signal_sample; +++ lv_16sc_t tmp1; +++ lv_16sc_t tmp2; +++ lv_16sc_t tmp3; +++ +++ bb_signal_sample = lv_cmake(0, 0); +++ +++ *E_out = 0; +++ *P_out = 0; +++ *L_out = 0; +++ // perform Early, Prompt and Late correlation +++ +++ for(int i=0; i < num_points; ++i) +++ { +++ //Perform the carrier wipe-off +++ bb_signal_sample = input[i] * carrier[i]; +++ +++ tmp1 = bb_signal_sample * E_code[i]; +++ tmp2 = bb_signal_sample * P_code[i]; +++ tmp3 = bb_signal_sample * L_code[i]; +++ +++ // Accumulate the early, prompt and late products into the float32 complex outputs +++ *E_out += (lv_32fc_t)tmp1; +++ *P_out += (lv_32fc_t)tmp2; +++ *L_out += (lv_32fc_t)tmp3; +++ } +++} +++#endif /* LV_HAVE_GENERIC */ +++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_a_H */ ++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5.h ++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5.h 1970-01-01 01:00:00.000000000 +0100 +++++
/Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5.h 2014-10-15 01:55:08.000000000 +0200 ++@@ -0,0 +1,595 @@ +++/*! +++ * \file volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5.h +++ * \brief Volk protokernel: performs the carrier wipe-off mixing and the Very early, Early, Prompt, Late and very late correlation with 32 bits vectors and returns float32 values. +++ * \authors
    +++ *
  • Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com +++ *
+++ * +++ * Volk protokernel that performs the carrier wipe-off mixing and the +++ * Very Early, Early, Prompt, Late and Very Late correlation with 32 bits vectors (16 bits the +++ * real part and 16 bits the imaginary part) and accumulates into float32 values, returning them: +++ * - The carrier wipe-off is done by multiplying the input signal by the +++ * carrier (multiplication of 32 bits vectors). It returns the input +++ * signal in base band (BB) +++ * - Very Early values are calculated by multiplying the input signal in BB by the +++ * very early code (multiplication of 32 bits vectors), converting that to float32 and accumulating the results +++ * - Early values are calculated by multiplying the input signal in BB by the +++ * early code (multiplication of 32 bits vectors), converting that to float32 and accumulating the results +++ * - Prompt values are calculated by multiplying the input signal in BB by the +++ * prompt code (multiplication of 32 bits vectors), converting that to float32 and accumulating the results +++ * - Late values are calculated by multiplying the input signal in BB by the +++ * late code (multiplication of 32 bits vectors), converting that to float32 and accumulating the results +++ * - Very Late values are calculated by multiplying the input signal in BB by the +++ * very late code (multiplication of 32 bits vectors), converting that to float32 and accumulating the results +++ * +++ * ------------------------------------------------------------------------- +++ * +++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) +++ * +++ * GNSS-SDR is a software defined Global Navigation +++ * Satellite Systems receiver +++ * +++ * This file is part of GNSS-SDR. +++ * +++ * GNSS-SDR is free software: you can redistribute it and/or modify +++ * it under the terms of the GNU General Public License as published by +++ * the Free Software Foundation, either version 3 of the License, or +++ * (at your option) any later version.
+++ * +++ * GNSS-SDR is distributed in the hope that it will be useful, +++ * but WITHOUT ANY WARRANTY; without even the implied warranty of +++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +++ * GNU General Public License for more details. +++ * +++ * You should have received a copy of the GNU General Public License +++ * along with GNSS-SDR. If not, see http://www.gnu.org/licenses/. +++ * +++ * ------------------------------------------------------------------------- +++ */ +++ +++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_u_H +++#define INCLUDED_gnsssdr_volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_u_H +++ +++#include +++#include +++#include +++#include +++#include +++ +++#ifdef LV_HAVE_SSE4_1 +++#include "smmintrin.h" +++#include "CommonMacros/CommonMacros_16ic_cw_epl_corr_32fc.h" +++#include "CommonMacros/CommonMacros.h" +++ /*! +++ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation (SSE4.1 version using unaligned loads; consumes 8 complex samples per vector iteration, the remaining num_points%8 samples go through a scalar tail loop) +++ \param input The input signal input +++ \param carrier The carrier signal input +++ \param VE_code Very Early PRN code replica input +++ \param E_code Early PRN code replica input +++ \param P_code Prompt PRN code replica input +++ \param L_code Late PRN code replica input +++ \param VL_code Very Late PRN code replica input +++ \param VE_out Very Early correlation output +++ \param E_out Early correlation output +++ \param P_out Prompt correlation output +++ \param L_out Late correlation output +++ \param VL_out Very Late correlation output +++ \param num_points The number of complex values in vectors +++ */ +++static inline void volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_u_sse4_1(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* VE_code, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, const lv_16sc_t* VL_code, unsigned int num_points) +++{ +++ const unsigned int sse_iters =
num_points / 8; +++ +++ __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample; +++ __m128i realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output; +++ +++ __m128 real_VE_code_acc, imag_VE_code_acc, real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc, real_VL_code_acc, imag_VL_code_acc; +++ __m128i input_i_1, input_i_2, output_i32; +++ __m128 real_output_ps, imag_output_ps; +++ +++ float VE_out_real = 0; +++ float VE_out_imag = 0; +++ float E_out_real = 0; +++ float E_out_imag = 0; +++ float P_out_real = 0; +++ float P_out_imag = 0; +++ float L_out_real = 0; +++ float L_out_imag = 0; +++ float VL_out_real = 0; +++ float VL_out_imag = 0; +++ +++ const lv_16sc_t* input_ptr = input; +++ const lv_16sc_t* carrier_ptr = carrier; +++ +++ const lv_16sc_t* VE_code_ptr = VE_code; +++ lv_32fc_t* VE_out_ptr = VE_out; +++ const lv_16sc_t* E_code_ptr = E_code; +++ lv_32fc_t* E_out_ptr = E_out; +++ const lv_16sc_t* L_code_ptr = L_code; +++ lv_32fc_t* L_out_ptr = L_out; +++ const lv_16sc_t* P_code_ptr = P_code; +++ lv_32fc_t* P_out_ptr = P_out; +++ const lv_16sc_t* VL_code_ptr = VL_code; +++ lv_32fc_t* VL_out_ptr = VL_out; +++ +++ *VE_out_ptr = 0; +++ *E_out_ptr = 0; +++ *P_out_ptr = 0; +++ *L_out_ptr = 0; +++ *VL_out_ptr = 0; +++ +++ real_VE_code_acc = _mm_setzero_ps(); +++ imag_VE_code_acc = _mm_setzero_ps(); +++ real_E_code_acc = _mm_setzero_ps(); +++ imag_E_code_acc = _mm_setzero_ps(); +++ real_P_code_acc = _mm_setzero_ps(); +++ imag_P_code_acc = _mm_setzero_ps(); +++ real_L_code_acc = _mm_setzero_ps(); +++ imag_L_code_acc = _mm_setzero_ps(); +++ real_VL_code_acc = _mm_setzero_ps(); +++ imag_VL_code_acc = _mm_setzero_ps(); +++ +++ if (sse_iters>0) +++ { +++ for(int number = 0;number < sse_iters; number++){ +++ +++ //Perform the carrier wipe-off +++ x1 = _mm_lddqu_si128((__m128i*)input_ptr); +++ input_ptr += 4; +++ x2 =
_mm_lddqu_si128((__m128i*)input_ptr); +++ +++ y1 = _mm_lddqu_si128((__m128i*)carrier_ptr); +++ carrier_ptr += 4; +++ y2 = _mm_lddqu_si128((__m128i*)carrier_ptr); +++ +++ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(x1, x2, realx, imagx) +++ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy) +++ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample) +++ +++ //Get very early values +++ y1 = _mm_lddqu_si128((__m128i*)VE_code_ptr); +++ VE_code_ptr += 4; +++ y2 = _mm_lddqu_si128((__m128i*)VE_code_ptr); +++ +++ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) +++ +++ real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps); +++ imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps); +++ +++ //Get early values +++ y1 = _mm_lddqu_si128((__m128i*)E_code_ptr); +++ E_code_ptr += 4; +++ y2 = _mm_lddqu_si128((__m128i*)E_code_ptr); +++ +++ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) +++ +++ real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps); +++ imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps); +++ +++ //Get prompt values +++ y1 = _mm_lddqu_si128((__m128i*)P_code_ptr); +++ P_code_ptr += 4; +++ y2 = _mm_lddqu_si128((__m128i*)P_code_ptr); +++ +++ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output,
imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) +++ +++ real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps); +++ imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps); +++ +++ //Get late values +++ y1 = _mm_lddqu_si128((__m128i*)L_code_ptr); +++ L_code_ptr += 4; +++ y2 = _mm_lddqu_si128((__m128i*)L_code_ptr); +++ +++ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) +++ +++ real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps); +++ imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps); +++ +++ //Get very late values +++ y1 = _mm_lddqu_si128((__m128i*)VL_code_ptr); +++ VL_code_ptr += 4; +++ y2 = _mm_lddqu_si128((__m128i*)VL_code_ptr); +++ +++ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) +++ +++ real_VL_code_acc = _mm_add_ps (real_VL_code_acc, real_output_ps); +++ imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps); +++ +++ input_ptr += 4; +++ carrier_ptr += 4; +++ VE_code_ptr += 4; +++ E_code_ptr += 4; +++ P_code_ptr += 4; +++ L_code_ptr += 4; +++ VL_code_ptr += 4; +++ } +++ +++ __VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(16) float
imag_L_dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4]; +++ +++ _mm_storeu_ps((float*)real_VE_dotProductVector,real_VE_code_acc); // Store the results back into the dot product vector +++ _mm_storeu_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc); // Store the results back into the dot product vector +++ _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector +++ _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector +++ _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector +++ _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector +++ _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector +++ _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector +++ _mm_storeu_ps((float*)real_VL_dotProductVector,real_VL_code_acc); // Store the results back into the dot product vector +++ _mm_storeu_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc); // Store the results back into the dot product vector +++ +++ for (int i = 0; i<4; ++i) +++ { +++ VE_out_real += real_VE_dotProductVector[i]; +++ VE_out_imag += imag_VE_dotProductVector[i]; +++ E_out_real += real_E_dotProductVector[i]; +++ E_out_imag += imag_E_dotProductVector[i]; +++ P_out_real += real_P_dotProductVector[i]; +++ P_out_imag += imag_P_dotProductVector[i]; +++ L_out_real += real_L_dotProductVector[i]; +++ L_out_imag += imag_L_dotProductVector[i]; +++ VL_out_real += real_VL_dotProductVector[i]; +++ VL_out_imag += imag_VL_dotProductVector[i]; +++ } +++ *VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag); +++ *E_out_ptr = lv_cmake(E_out_real,
E_out_imag); +++ *P_out_ptr = lv_cmake(P_out_real, P_out_imag); +++ *L_out_ptr = lv_cmake(L_out_real, L_out_imag); +++ *VL_out_ptr = lv_cmake(VL_out_real, VL_out_imag); +++ } +++ +++ lv_16sc_t bb_signal_sample; +++ for(int i=0; i < num_points%8; ++i) +++ { +++ //Perform the carrier wipe-off +++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); +++ // Now get very early, early, prompt, late and very late values for each +++ *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VE_code_ptr++)); +++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); +++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); +++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); +++ *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VL_code_ptr++)); +++ } +++ +++} +++#endif /* LV_HAVE_SSE4_1 */ +++ +++#ifdef LV_HAVE_GENERIC +++/*! +++ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation +++ \param input The input signal input +++ \param carrier The carrier signal input +++ \param VE_code Very Early PRN code replica input +++ \param E_code Early PRN code replica input +++ \param P_code Prompt PRN code replica input +++ \param L_code Late PRN code replica input +++ \param VL_code Very Late PRN code replica input +++ \param VE_out Very Early correlation output +++ \param E_out Early correlation output +++ \param P_out Prompt correlation output +++ \param L_out Late correlation output +++ \param VL_out Very Late correlation output +++ \param num_points The number of complex values in vectors +++ */ +++static inline void volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* VE_code, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, const lv_16sc_t* VL_code, unsigned int num_points) +++{ +++ lv_16sc_t bb_signal_sample; +++ lv_16sc_t tmp1; +++
lv_16sc_t tmp2; +++ lv_16sc_t tmp3; +++ lv_16sc_t tmp4; +++ lv_16sc_t tmp5; +++ +++ bb_signal_sample = lv_cmake(0, 0); +++ +++ *VE_out = 0; +++ *E_out = 0; +++ *P_out = 0; +++ *L_out = 0; +++ *VL_out = 0; +++ // perform Very Early, Early, Prompt, Late and Very Late correlation +++ +++ for(int i=0; i < num_points; ++i) +++ { +++ //Perform the carrier wipe-off +++ bb_signal_sample = input[i] * carrier[i]; +++ +++ tmp1 = bb_signal_sample * VE_code[i]; +++ tmp2 = bb_signal_sample * E_code[i]; +++ tmp3 = bb_signal_sample * P_code[i]; +++ tmp4 = bb_signal_sample * L_code[i]; +++ tmp5 = bb_signal_sample * VL_code[i]; +++ +++ // Accumulate the very early, early, prompt, late and very late products into the float32 complex outputs +++ *VE_out += (lv_32fc_t)tmp1; +++ *E_out += (lv_32fc_t)tmp2; +++ *P_out += (lv_32fc_t)tmp3; +++ *L_out += (lv_32fc_t)tmp4; +++ *VL_out += (lv_32fc_t)tmp5; +++ } +++} +++#endif /* LV_HAVE_GENERIC */ +++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_u_H */ +++ +++ +++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_a_H +++#define INCLUDED_gnsssdr_volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_a_H +++ +++#include +++#include +++#include +++#include +++#include +++ +++#ifdef LV_HAVE_SSE4_1 +++#include "smmintrin.h" +++#include "CommonMacros/CommonMacros_16ic_cw_epl_corr_32fc.h" +++#include "CommonMacros/CommonMacros.h" +++/*!
+++ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Vate correlation +++ \param input The input signal input +++ \param carrier The carrier signal input +++ \param VE_code Very Early PRN code replica input +++ \param E_code Early PRN code replica input +++ \param P_code Prompt PRN code replica input +++ \param L_code Late PRN code replica input +++ \param VL_code Very Late PRN code replica input +++ \param VE_out Very Early correlation output +++ \param E_out Early correlation output +++ \param P_out Prompt correlation output +++ \param L_out Late correlation output +++ \param VL_out Very Late correlation output +++ \param num_points The number of complex values in vectors +++ */ +++static inline void volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_a_sse4_1(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* VE_code, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, const lv_16sc_t* VL_code, unsigned int num_points) +++{ +++ const unsigned int sse_iters = num_points / 8; +++ +++ __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample; +++ __m128i realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output; +++ +++ __m128 real_VE_code_acc, imag_VE_code_acc, real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc, real_VL_code_acc, imag_VL_code_acc; +++ __m128i input_i_1, input_i_2, output_i32; +++ __m128 real_output_ps, imag_output_ps; +++ +++ float VE_out_real = 0; +++ float VE_out_imag = 0; +++ float E_out_real = 0; +++ float E_out_imag = 0; +++ float P_out_real = 0; +++ float P_out_imag = 0; +++ float L_out_real = 0; +++ float L_out_imag = 0; +++ float VL_out_real = 0; +++ float VL_out_imag = 0; +++ +++ const lv_16sc_t* input_ptr = input; +++ const lv_16sc_t* carrier_ptr = 
carrier; +++ +++ const lv_16sc_t* VE_code_ptr = VE_code; +++ lv_32fc_t* VE_out_ptr = VE_out; +++ const lv_16sc_t* E_code_ptr = E_code; +++ lv_32fc_t* E_out_ptr = E_out; +++ const lv_16sc_t* L_code_ptr = L_code; +++ lv_32fc_t* L_out_ptr = L_out; +++ const lv_16sc_t* P_code_ptr = P_code; +++ lv_32fc_t* P_out_ptr = P_out; +++ const lv_16sc_t* VL_code_ptr = VL_code; +++ lv_32fc_t* VL_out_ptr = VL_out; +++ +++ *VE_out_ptr = 0; +++ *E_out_ptr = 0; +++ *P_out_ptr = 0; +++ *L_out_ptr = 0; +++ *VL_out_ptr = 0; +++ +++ real_VE_code_acc = _mm_setzero_ps(); +++ imag_VE_code_acc = _mm_setzero_ps(); +++ real_E_code_acc = _mm_setzero_ps(); +++ imag_E_code_acc = _mm_setzero_ps(); +++ real_P_code_acc = _mm_setzero_ps(); +++ imag_P_code_acc = _mm_setzero_ps(); +++ real_L_code_acc = _mm_setzero_ps(); +++ imag_L_code_acc = _mm_setzero_ps(); +++ real_VL_code_acc = _mm_setzero_ps(); +++ imag_VL_code_acc = _mm_setzero_ps(); +++ +++ if (sse_iters>0) +++ { +++ for(int number = 0;number < sse_iters; number++){ +++ +++ //Perform the carrier wipe-off +++ x1 = _mm_load_si128((__m128i*)input_ptr); +++ input_ptr += 4; +++ x2 = _mm_load_si128((__m128i*)input_ptr); +++ +++ y1 = _mm_load_si128((__m128i*)carrier_ptr); +++ carrier_ptr += 4; +++ y2 = _mm_load_si128((__m128i*)carrier_ptr); +++ +++ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(x1, x2, realx, imagx) +++ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy) +++ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample) +++ +++ //Get very early values +++ y1 = _mm_load_si128((__m128i*)VE_code_ptr); +++ VE_code_ptr += 4; +++ y2 = _mm_load_si128((__m128i*)VE_code_ptr); +++ +++ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, 
input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) +++ +++ real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps); +++ imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps); +++ +++ //Get early values +++ y1 = _mm_load_si128((__m128i*)E_code_ptr); +++ E_code_ptr += 4; +++ y2 = _mm_load_si128((__m128i*)E_code_ptr); +++ +++ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) +++ +++ real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps); +++ imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps); +++ +++ //Get prompt values +++ y1 = _mm_load_si128((__m128i*)P_code_ptr); +++ P_code_ptr += 4; +++ y2 = _mm_load_si128((__m128i*)P_code_ptr); +++ +++ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) +++ +++ real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps); +++ imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps); +++ +++ //Get late values +++ y1 = _mm_load_si128((__m128i*)L_code_ptr); +++ L_code_ptr += 4; +++ y2 = _mm_load_si128((__m128i*)L_code_ptr); +++ +++ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) +++ +++ real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps); +++ imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps); +++ +++ //Get very late values +++ y1 = _mm_load_si128((__m128i*)VL_code_ptr); +++ VL_code_ptr += 4; +++ y2 = 
_mm_load_si128((__m128i*)VL_code_ptr); +++ +++ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) +++ +++ real_VL_code_acc = _mm_add_ps (real_VL_code_acc, real_output_ps); +++ imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps); +++ +++ input_ptr += 4; +++ carrier_ptr += 4; +++ VE_code_ptr += 4; +++ E_code_ptr += 4; +++ P_code_ptr += 4; +++ L_code_ptr += 4; +++ VL_code_ptr += 4; +++ } +++ +++ __VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4]; +++ +++ _mm_store_ps((float*)real_VE_dotProductVector,real_VE_code_acc); // Store the results back into the dot product vector +++ _mm_store_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc); // Store the results back into the dot product vector +++ _mm_store_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector +++ _mm_store_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector +++ _mm_store_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector +++ _mm_store_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector +++ 
_mm_store_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector +++ _mm_store_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector +++ _mm_store_ps((float*)real_VL_dotProductVector,real_VL_code_acc); // Store the results back into the dot product vector +++ _mm_store_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc); // Store the results back into the dot product vector +++ +++ for (int i = 0; i<4; ++i) +++ { +++ VE_out_real += real_VE_dotProductVector[i]; +++ VE_out_imag += imag_VE_dotProductVector[i]; +++ E_out_real += real_E_dotProductVector[i]; +++ E_out_imag += imag_E_dotProductVector[i]; +++ P_out_real += real_P_dotProductVector[i]; +++ P_out_imag += imag_P_dotProductVector[i]; +++ L_out_real += real_L_dotProductVector[i]; +++ L_out_imag += imag_L_dotProductVector[i]; +++ VL_out_real += real_VL_dotProductVector[i]; +++ VL_out_imag += imag_VL_dotProductVector[i]; +++ } +++ *VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag); +++ *E_out_ptr = lv_cmake(E_out_real, E_out_imag); +++ *P_out_ptr = lv_cmake(P_out_real, P_out_imag); +++ *L_out_ptr = lv_cmake(L_out_real, L_out_imag); +++ *VL_out_ptr = lv_cmake(VL_out_real, VL_out_imag); +++ } +++ +++ lv_16sc_t bb_signal_sample; +++ for(int i=0; i < num_points%8; ++i) +++ { +++ //Perform the carrier wipe-off +++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); +++ // Now get early, late, and prompt values for each +++ *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VE_code_ptr++)); +++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); +++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); +++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); +++ *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VL_code_ptr++)); +++ } +++ +++} +++#endif /* LV_HAVE_SSE4_1 */ +++ +++#ifdef LV_HAVE_GENERIC +++/*! 
+++ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation +++ \param input The input signal input +++ \param carrier The carrier signal input +++ \param VE_code Very Early PRN code replica input +++ \param E_code Early PRN code replica input +++ \param P_code Prompt PRN code replica input +++ \param L_code Late PRN code replica input +++ \param VL_code Very Late PRN code replica input +++ \param VE_out Very Early correlation output (single value, reset to 0 on entry) +++ \param E_out Early correlation output (single value, reset to 0 on entry) +++ \param P_out Prompt correlation output (single value, reset to 0 on entry) +++ \param L_out Late correlation output (single value, reset to 0 on entry) +++ \param VL_out Very Late correlation output (single value, reset to 0 on entry) +++ \param num_points The number of complex samples read from each input vector +++ */ +++static inline void volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_a_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* VE_code, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, const lv_16sc_t* VL_code, unsigned int num_points) +++{ +++ lv_16sc_t bb_signal_sample; +++ lv_16sc_t tmp1; +++ lv_16sc_t tmp2; +++ lv_16sc_t tmp3; +++ lv_16sc_t tmp4; +++ lv_16sc_t tmp5; +++ +++ bb_signal_sample = lv_cmake(0, 0); +++ +++ *VE_out = 0; +++ *E_out = 0; +++ *P_out = 0; +++ *L_out = 0; +++ *VL_out = 0; +++ // perform Very Early, Early, Prompt, Late and Very Late correlation +++ +++ for(int i=0; i < num_points; ++i) +++ { +++ //Perform the carrier wipe-off (input sample times carrier sample) +++ bb_signal_sample = input[i] * carrier[i]; +++ +++ tmp1 = bb_signal_sample * VE_code[i]; +++ tmp2 = bb_signal_sample * E_code[i]; +++ tmp3 = bb_signal_sample * P_code[i]; +++ tmp4 = bb_signal_sample * L_code[i]; +++ tmp5 = bb_signal_sample * VL_code[i]; +++ +++ // Cast each product to lv_32fc_t and add it to the matching output. NOTE(review): the products above are still held in lv_16sc_t - confirm they cannot overflow +++ *VE_out += (lv_32fc_t)tmp1; +++ *E_out += (lv_32fc_t)tmp2; +++ *P_out += (lv_32fc_t)tmp3; +++ *L_out += (lv_32fc_t)tmp4; +++ *VL_out += (lv_32fc_t)tmp5; +++ } +++} +++#endif /*
LV_HAVE_GENERIC */ +++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_a_H */ ++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_accumulator_s32f.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32f_accumulator_s32f.h ++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_accumulator_s32f.h 1970-01-01 01:00:00.000000000 +0100 +++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32f_accumulator_s32f.h 2014-10-15 01:55:08.000000000 +0200 ++@@ -0,0 +1,68 @@ +++#ifndef INCLUDED_volk_gnsssdr_32f_accumulator_s32f_a_H +++#define INCLUDED_volk_gnsssdr_32f_accumulator_s32f_a_H +++ +++#include +++#include +++#include +++ +++#ifdef LV_HAVE_SSE +++#include +++/*! +++ \brief Sums all values in the input buffer, four floats per SSE iteration followed by a scalar tail loop +++ \param result Receives the sum of all num_points values +++ \param inputBuffer The buffer of data to be accumulated (read with the aligned load _mm_load_ps) +++ \param num_points The number of values in inputBuffer to be accumulated +++*/ +++static inline void volk_gnsssdr_32f_accumulator_s32f_a_sse(float* result, const float* inputBuffer, unsigned int num_points){ +++ float returnValue = 0; +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; +++ +++ const float* aPtr = inputBuffer; +++ __VOLK_ATTR_ALIGNED(16) float tempBuffer[4]; +++ +++ __m128 accumulator = _mm_setzero_ps(); +++ __m128 aVal = _mm_setzero_ps(); +++ +++ for(;number < quarterPoints; number++){ +++ aVal = _mm_load_ps(aPtr); +++ accumulator = _mm_add_ps(accumulator, aVal); +++ aPtr += 4; +++ } +++ _mm_store_ps(tempBuffer,accumulator); // Spill the four partial sums so they can be added together below +++ returnValue = tempBuffer[0]; +++ returnValue += tempBuffer[1]; +++ returnValue += tempBuffer[2]; +++ returnValue += tempBuffer[3]; +++ +++ number = quarterPoints * 4; +++ for(;number < num_points; number++){ +++ returnValue += (*aPtr++); +++ } +++ *result = returnValue; +++} +++#endif /* LV_HAVE_SSE */ +++
+++#ifdef LV_HAVE_GENERIC +++/*! +++ \brief Sums all values in the input buffer with a plain scalar loop +++ \param result Receives the sum of all num_points values +++ \param inputBuffer The buffer of data to be accumulated +++ \param num_points The number of values in inputBuffer to be accumulated +++*/ +++static inline void volk_gnsssdr_32f_accumulator_s32f_generic(float* result, const float* inputBuffer, unsigned int num_points){ +++ const float* aPtr = inputBuffer; +++ unsigned int number = 0; +++ float returnValue = 0; +++ +++ for(;number < num_points; number++){ +++ returnValue += (*aPtr++); +++ } +++ *result = returnValue; +++} +++#endif /* LV_HAVE_GENERIC */ +++ +++ +++ +++ +++#endif /* INCLUDED_volk_gnsssdr_32f_accumulator_s32f_a_H */ ++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_index_max_16u.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32f_index_max_16u.h ++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_index_max_16u.h 1970-01-01 01:00:00.000000000 +0100 +++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32f_index_max_16u.h 2014-10-15 01:55:08.000000000 +0200 ++@@ -0,0 +1,149 @@ +++#ifndef INCLUDED_volk_gnsssdr_32f_index_max_16u_a_H +++#define INCLUDED_volk_gnsssdr_32f_index_max_16u_a_H +++ +++#include +++#include +++#include +++#include +++ +++#ifdef LV_HAVE_SSE4_1 +++#include +++ +++static inline void volk_gnsssdr_32f_index_max_16u_a_sse4_1(unsigned int* target, const float* src0, unsigned int num_points) { +++ if(num_points > 0){ +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; +++ +++ float* inputPtr = (float*)src0; +++ +++ __m128 indexIncrementValues = _mm_set1_ps(4); +++ __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4); +++ +++ float max = src0[0]; +++ float index = 0; +++ __m128 maxValues = _mm_set1_ps(max); +++ __m128 maxValuesIndex = _mm_setzero_ps(); +++ __m128 compareResults; +++ __m128 currentValues; +++ +++
__VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; +++ __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; +++ +++ for(;number < quarterPoints; number++){ +++ +++ currentValues = _mm_load_ps(inputPtr); inputPtr += 4; +++ currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); +++ +++ compareResults = _mm_cmpgt_ps(maxValues, currentValues); +++ +++ maxValuesIndex = _mm_blendv_ps(currentIndexes, maxValuesIndex, compareResults); +++ maxValues = _mm_blendv_ps(currentValues, maxValues, compareResults); +++ } +++ +++ // Reduce the four per-lane maxima and their indexes to one result. NOTE(review): the loop above replaces the stored index when values are equal, so on ties this can report a later index than the generic kernel - confirm intended +++ _mm_store_ps(maxValuesBuffer, maxValues); +++ _mm_store_ps(maxIndexesBuffer, maxValuesIndex); +++ +++ for(number = 0; number < 4; number++){ +++ if(maxValuesBuffer[number] > max){ +++ index = maxIndexesBuffer[number]; +++ max = maxValuesBuffer[number]; +++ } +++ } +++ +++ number = quarterPoints * 4; +++ for(;number < num_points; number++){ +++ if(src0[number] > max){ +++ index = number; +++ max = src0[number]; +++ } +++ } +++ target[0] = (unsigned int)index; +++ } +++} +++ +++#endif /*LV_HAVE_SSE4_1*/ +++ +++#ifdef LV_HAVE_SSE +++#include +++ +++static inline void volk_gnsssdr_32f_index_max_16u_a_sse(unsigned int* target, const float* src0, unsigned int num_points) { +++ if(num_points > 0){ +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; +++ +++ float* inputPtr = (float*)src0; +++ +++ __m128 indexIncrementValues = _mm_set1_ps(4); +++ __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4); +++ +++ float max = src0[0]; +++ float index = 0; +++ __m128 maxValues = _mm_set1_ps(max); +++ __m128 maxValuesIndex = _mm_setzero_ps(); +++ __m128 compareResults; +++ __m128 currentValues; +++ +++ __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; +++ __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; +++ +++ for(;number < quarterPoints; number++){ +++ +++ currentValues = _mm_load_ps(inputPtr); inputPtr += 4; +++ currentIndexes = _mm_add_ps(currentIndexes,
indexIncrementValues); +++ +++ compareResults = _mm_cmpgt_ps(maxValues, currentValues); +++ +++ maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, maxValuesIndex) , _mm_andnot_ps(compareResults, currentIndexes)); +++ +++ maxValues = _mm_or_ps(_mm_and_ps(compareResults, maxValues) , _mm_andnot_ps(compareResults, currentValues)); +++ } +++ +++ // Reduce the four per-lane maxima and their indexes to one result. NOTE(review): the loop above replaces the stored index when values are equal, so on ties this can report a later index than the generic kernel - confirm intended +++ _mm_store_ps(maxValuesBuffer, maxValues); +++ _mm_store_ps(maxIndexesBuffer, maxValuesIndex); +++ +++ for(number = 0; number < 4; number++){ +++ if(maxValuesBuffer[number] > max){ +++ index = maxIndexesBuffer[number]; +++ max = maxValuesBuffer[number]; +++ } +++ } +++ +++ number = quarterPoints * 4; +++ for(;number < num_points; number++){ +++ if(src0[number] > max){ +++ index = number; +++ max = src0[number]; +++ } +++ } +++ target[0] = (unsigned int)index; +++ } +++} +++ +++#endif /*LV_HAVE_SSE*/ +++ +++#ifdef LV_HAVE_GENERIC +++static inline void volk_gnsssdr_32f_index_max_16u_generic(unsigned int* target, const float* src0, unsigned int num_points) { +++ if(num_points > 0){ +++ float max = src0[0]; +++ unsigned int index = 0; +++ +++ unsigned int i = 1; +++ +++ for(; i < num_points; ++i) { +++ +++ if(src0[i] > max){ +++ index = i; +++ max = src0[i]; +++ } +++ +++ } +++ target[0] = index; +++ } +++} +++ +++#endif /*LV_HAVE_GENERIC*/ +++ +++ +++#endif /*INCLUDED_volk_gnsssdr_32f_index_max_16u_a_H*/ ++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_s32f_convert_16i.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32f_s32f_convert_16i.h ++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_s32f_convert_16i.h 1970-01-01 01:00:00.000000000 +0100 +++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32f_s32f_convert_16i.h 2014-10-15 01:55:08.000000000 +0200 ++@@ -0,0 +1,302 @@ +++#ifndef INCLUDED_volk_gnsssdr_32f_s32f_convert_16i_u_H +++#define
INCLUDED_volk_gnsssdr_32f_s32f_convert_16i_u_H +++ +++#include +++#include +++#include +++ +++#ifdef LV_HAVE_SSE2 +++#include +++ /*! +++ \brief Multiplies each point in the input buffer by the scalar value, clamps it to [-32768, 32767], then converts the result into a 16 bit integer value +++ \param inputVector The floating point input data buffer +++ \param outputVector The 16 bit output data buffer +++ \param scalar The value multiplied against each point in the input buffer +++ \param num_points The number of data values to be converted +++ \note Neither buffer needs to be 16-byte aligned (unaligned loads and stores are used) +++ */ +++static inline void volk_gnsssdr_32f_s32f_convert_16i_u_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ +++ unsigned int number = 0; +++ +++ const unsigned int eighthPoints = num_points / 8; +++ +++ const float* inputVectorPtr = (const float*)inputVector; +++ int16_t* outputVectorPtr = outputVector; +++ +++ float min_val = -32768; +++ float max_val = 32767; +++ float r; +++ +++ __m128 vScalar = _mm_set_ps1(scalar); +++ __m128 inputVal1, inputVal2; +++ __m128i intInputVal1, intInputVal2; +++ __m128 ret1, ret2; +++ __m128 vmin_val = _mm_set_ps1(min_val); +++ __m128 vmax_val = _mm_set_ps1(max_val); +++ +++ for(;number < eighthPoints; number++){ +++ inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; +++ inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; +++ +++ // Scale, then clamp to [min_val, max_val] +++ ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); +++ ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val); +++ +++ intInputVal1 = _mm_cvtps_epi32(ret1); +++ intInputVal2 = _mm_cvtps_epi32(ret2); +++ +++ intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); +++ +++ _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); +++ outputVectorPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for(; number < num_points; number++){ +++ r = inputVector[number] * scalar; +++
if(r > max_val) +++ r = max_val; +++ else if(r < min_val) +++ r = min_val; +++ outputVector[number] = (int16_t)rintf(r); +++ } +++} +++#endif /* LV_HAVE_SSE2 */ +++ +++#ifdef LV_HAVE_SSE +++#include +++ /*! +++ \brief Multiplies each point in the input buffer by the scalar value, clamps it to [-32768, 32767], then converts the result into a 16 bit integer value +++ \param inputVector The floating point input data buffer +++ \param outputVector The 16 bit output data buffer +++ \param scalar The value multiplied against each point in the input buffer +++ \param num_points The number of data values to be converted +++ \note Input buffer does NOT need to be 16-byte aligned (unaligned loads are used) +++ */ +++static inline void volk_gnsssdr_32f_s32f_convert_16i_u_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ +++ unsigned int number = 0; +++ +++ const unsigned int quarterPoints = num_points / 4; +++ +++ const float* inputVectorPtr = (const float*)inputVector; +++ int16_t* outputVectorPtr = outputVector; +++ +++ float min_val = -32768; +++ float max_val = 32767; +++ float r; +++ +++ __m128 vScalar = _mm_set_ps1(scalar); +++ __m128 ret; +++ __m128 vmin_val = _mm_set_ps1(min_val); +++ __m128 vmax_val = _mm_set_ps1(max_val); +++ +++ __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; +++ +++ for(;number < quarterPoints; number++){ +++ ret = _mm_loadu_ps(inputVectorPtr); +++ inputVectorPtr += 4; +++ +++ // Scale, then clamp to [min_val, max_val] +++ ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); +++ +++ _mm_store_ps(outputFloatBuffer, ret); +++ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]); +++ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]); +++ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]); +++ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]); +++ } +++ +++ number = quarterPoints * 4; +++ for(; number < num_points; number++){ +++ r = inputVector[number] * scalar; +++ if(r > max_val) +++ r = max_val; +++ else if(r < min_val) +++
r = min_val; +++ outputVector[number] = (int16_t)rintf(r); +++ } +++} +++#endif /* LV_HAVE_SSE */ +++ +++#ifdef LV_HAVE_GENERIC +++ /*! +++ \brief Multiplies each point in the input buffer by the scalar value, clamps it to [-32768, 32767], then converts the result into a 16 bit integer value +++ \param inputVector The floating point input data buffer +++ \param outputVector The 16 bit output data buffer +++ \param scalar The value multiplied against each point in the input buffer +++ \param num_points The number of data values to be converted +++ \note Input buffer does NOT need to be properly aligned (plain scalar loop) +++ */ +++static inline void volk_gnsssdr_32f_s32f_convert_16i_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ +++ int16_t* outputVectorPtr = outputVector; +++ const float* inputVectorPtr = inputVector; +++ unsigned int number = 0; +++ float min_val = -32768; +++ float max_val = 32767; +++ float r; +++ +++ for(number = 0; number < num_points; number++){ +++ r = *inputVectorPtr++ * scalar; +++ if(r > max_val) +++ r = max_val; +++ else if(r < min_val) +++ r = min_val; +++ *outputVectorPtr++ = (int16_t)rintf(r); +++ } +++} +++#endif /* LV_HAVE_GENERIC */ +++ +++ +++ +++ +++#endif /* INCLUDED_volk_gnsssdr_32f_s32f_convert_16i_u_H */ +++#ifndef INCLUDED_volk_gnsssdr_32f_s32f_convert_16i_a_H +++#define INCLUDED_volk_gnsssdr_32f_s32f_convert_16i_a_H +++ +++#include +++#include +++#include +++#include +++ +++#ifdef LV_HAVE_SSE2 +++#include +++ /*!
+++ \brief Multiplies each point in the input buffer by the scalar value, clamps it to [-32768, 32767], then converts the result into a 16 bit integer value +++ \param inputVector The floating point input data buffer (read with the aligned load _mm_load_ps) +++ \param outputVector The 16 bit output data buffer (written with the aligned store _mm_store_si128) +++ \param scalar The value multiplied against each point in the input buffer +++ \param num_points The number of data values to be converted +++ */ +++static inline void volk_gnsssdr_32f_s32f_convert_16i_a_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ +++ unsigned int number = 0; +++ +++ const unsigned int eighthPoints = num_points / 8; +++ +++ const float* inputVectorPtr = (const float*)inputVector; +++ int16_t* outputVectorPtr = outputVector; +++ +++ float min_val = -32768; +++ float max_val = 32767; +++ float r; +++ +++ __m128 vScalar = _mm_set_ps1(scalar); +++ __m128 inputVal1, inputVal2; +++ __m128i intInputVal1, intInputVal2; +++ __m128 ret1, ret2; +++ __m128 vmin_val = _mm_set_ps1(min_val); +++ __m128 vmax_val = _mm_set_ps1(max_val); +++ +++ for(;number < eighthPoints; number++){ +++ inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; +++ inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; +++ +++ // Scale, then clamp to [min_val, max_val] +++ ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); +++ ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val); +++ +++ intInputVal1 = _mm_cvtps_epi32(ret1); +++ intInputVal2 = _mm_cvtps_epi32(ret2); +++ +++ intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); +++ +++ _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); +++ outputVectorPtr += 8; +++ } +++ +++ number = eighthPoints * 8; +++ for(; number < num_points; number++){ +++ r = inputVector[number] * scalar; +++ if(r > max_val) +++ r = max_val; +++ else if(r < min_val) +++ r = min_val; +++ outputVector[number] = (int16_t)rintf(r); +++ } +++} +++#endif /* LV_HAVE_SSE2 */ +++ +++#ifdef LV_HAVE_SSE +++#include
+++ /*! +++ \brief Multiplies each point in the input buffer by the scalar value, clamps it to [-32768, 32767], then converts the result into a 16 bit integer value +++ \param inputVector The floating point input data buffer (read with the aligned load _mm_load_ps) +++ \param outputVector The 16 bit output data buffer (written one element at a time) +++ \param scalar The value multiplied against each point in the input buffer +++ \param num_points The number of data values to be converted +++ */ +++static inline void volk_gnsssdr_32f_s32f_convert_16i_a_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ +++ unsigned int number = 0; +++ +++ const unsigned int quarterPoints = num_points / 4; +++ +++ const float* inputVectorPtr = (const float*)inputVector; +++ int16_t* outputVectorPtr = outputVector; +++ +++ float min_val = -32768; +++ float max_val = 32767; +++ float r; +++ +++ __m128 vScalar = _mm_set_ps1(scalar); +++ __m128 ret; +++ __m128 vmin_val = _mm_set_ps1(min_val); +++ __m128 vmax_val = _mm_set_ps1(max_val); +++ +++ __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; +++ +++ for(;number < quarterPoints; number++){ +++ ret = _mm_load_ps(inputVectorPtr); +++ inputVectorPtr += 4; +++ +++ // Scale, then clamp to [min_val, max_val] +++ ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); +++ +++ _mm_store_ps(outputFloatBuffer, ret); +++ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]); +++ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]); +++ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]); +++ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]); +++ } +++ +++ number = quarterPoints * 4; +++ for(; number < num_points; number++){ +++ r = inputVector[number] * scalar; +++ if(r > max_val) +++ r = max_val; +++ else if(r < min_val) +++ r = min_val; +++ outputVector[number] = (int16_t)rintf(r); +++ } +++} +++#endif /* LV_HAVE_SSE */ +++ +++#ifdef LV_HAVE_GENERIC +++ /*!
+++ \brief Multiplies each point in the input buffer by the scalar value, clamps it to [-32768, 32767], then converts the result into a 16 bit integer value using a plain scalar loop +++ \param inputVector The floating point input data buffer +++ \param outputVector The 16 bit output data buffer +++ \param scalar The value multiplied against each point in the input buffer +++ \param num_points The number of data values to be converted +++ */ +++static inline void volk_gnsssdr_32f_s32f_convert_16i_a_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ +++ int16_t* outputVectorPtr = outputVector; +++ const float* inputVectorPtr = inputVector; +++ unsigned int number = 0; +++ float min_val = -32768; +++ float max_val = 32767; +++ float r; +++ +++ for(number = 0; number < num_points; number++){ +++ r = *inputVectorPtr++ * scalar; +++ if(r < min_val) +++ r = min_val; +++ else if(r > max_val) +++ r = max_val; +++ *outputVectorPtr++ = (int16_t)rintf(r); +++ } +++} +++#endif /* LV_HAVE_GENERIC */ +++ +++ +++ +++ +++#endif /* INCLUDED_volk_gnsssdr_32f_s32f_convert_16i_a_H */ ++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_x2_add_32f.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32f_x2_add_32f.h ++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_x2_add_32f.h 1970-01-01 01:00:00.000000000 +0100 +++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32f_x2_add_32f.h 2014-10-15 01:55:08.000000000 +0200 ++@@ -0,0 +1,147 @@ +++#ifndef INCLUDED_volk_gnsssdr_32f_x2_add_32f_u_H +++#define INCLUDED_volk_gnsssdr_32f_x2_add_32f_u_H +++ +++#include +++#include +++ +++#ifdef LV_HAVE_SSE +++#include +++/*!
+++ \brief Adds the two input vectors element by element and stores the result in the third vector (unaligned SSE loads and stores, scalar tail loop) +++ \param cVector The vector where the results will be stored +++ \param aVector One of the vectors to be added +++ \param bVector One of the vectors to be added +++ \param num_points The number of values in aVector and bVector to be added together and stored into cVector +++*/ +++static inline void volk_gnsssdr_32f_x2_add_32f_u_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; +++ +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr= bVector; +++ +++ __m128 aVal, bVal, cVal; +++ for(;number < quarterPoints; number++){ +++ +++ aVal = _mm_loadu_ps(aPtr); +++ bVal = _mm_loadu_ps(bPtr); +++ +++ cVal = _mm_add_ps(aVal, bVal); +++ +++ _mm_storeu_ps(cPtr,cVal); // Write the four sums to cVector (unaligned store) +++ +++ aPtr += 4; +++ bPtr += 4; +++ cPtr += 4; +++ } +++ +++ number = quarterPoints * 4; +++ for(;number < num_points; number++){ +++ *cPtr++ = (*aPtr++) + (*bPtr++); +++ } +++} +++#endif /* LV_HAVE_SSE */ +++ +++#ifdef LV_HAVE_GENERIC +++/*!
+++ \brief Adds the two input vectors element by element and stores the result in the third vector (plain scalar loop) +++ \param cVector The vector where the results will be stored +++ \param aVector One of the vectors to be added +++ \param bVector One of the vectors to be added +++ \param num_points The number of values in aVector and bVector to be added together and stored into cVector +++*/ +++static inline void volk_gnsssdr_32f_x2_add_32f_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr= bVector; +++ unsigned int number = 0; +++ +++ for(number = 0; number < num_points; number++){ +++ *cPtr++ = (*aPtr++) + (*bPtr++); +++ } +++} +++#endif /* LV_HAVE_GENERIC */ +++ +++#endif /* INCLUDED_volk_gnsssdr_32f_x2_add_32f_u_H */ +++#ifndef INCLUDED_volk_gnsssdr_32f_x2_add_32f_a_H +++#define INCLUDED_volk_gnsssdr_32f_x2_add_32f_a_H +++ +++#include +++#include +++ +++#ifdef LV_HAVE_SSE +++#include +++/*! +++ \brief Adds the two input vectors element by element and stores the result in the third vector (aligned SSE loads and stores, scalar tail loop) +++ \param cVector The vector where the results will be stored +++ \param aVector One of the vectors to be added +++ \param bVector One of the vectors to be added +++ \param num_points The number of values in aVector and bVector to be added together and stored into cVector +++*/ +++static inline void volk_gnsssdr_32f_x2_add_32f_a_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; +++ +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr= bVector; +++ +++ __m128 aVal, bVal, cVal; +++ for(;number < quarterPoints; number++){ +++ +++ aVal = _mm_load_ps(aPtr); +++ bVal = _mm_load_ps(bPtr); +++ +++ cVal = _mm_add_ps(aVal, bVal); +++ +++ _mm_store_ps(cPtr,cVal); // Write the four sums to cVector (aligned store) +++ +++ aPtr += 4; +++ bPtr += 4; +++ cPtr += 4; +++ }
+++ +++ number = quarterPoints * 4; +++ for(;number < num_points; number++){ +++ *cPtr++ = (*aPtr++) + (*bPtr++); +++ } +++} +++#endif /* LV_HAVE_SSE */ +++ +++#ifdef LV_HAVE_GENERIC +++/*! +++ \brief Adds the two input vectors element by element and stores the result in the third vector (plain scalar loop) +++ \param cVector The vector where the results will be stored +++ \param aVector One of the vectors to be added +++ \param bVector One of the vectors to be added +++ \param num_points The number of values in aVector and bVector to be added together and stored into cVector +++*/ +++static inline void volk_gnsssdr_32f_x2_add_32f_a_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ +++ float* cPtr = cVector; +++ const float* aPtr = aVector; +++ const float* bPtr= bVector; +++ unsigned int number = 0; +++ +++ for(number = 0; number < num_points; number++){ +++ *cPtr++ = (*aPtr++) + (*bPtr++); +++ } +++} +++#endif /* LV_HAVE_GENERIC */ +++ +++#ifdef LV_HAVE_ORC +++/*! +++ \brief Adds the two input vectors and stores the result in the third vector by forwarding to volk_gnsssdr_32f_x2_add_32f_a_orc_impl +++ \param cVector The vector where the results will be stored +++ \param aVector One of the vectors to be added +++ \param bVector One of the vectors to be added +++ \param num_points The number of values in aVector and bVector to be added together and stored into cVector +++*/ +++extern void volk_gnsssdr_32f_x2_add_32f_a_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); +++static inline void volk_gnsssdr_32f_x2_add_32f_u_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ +++ volk_gnsssdr_32f_x2_add_32f_a_orc_impl(cVector, aVector, bVector, num_points); +++} +++#endif /* LV_HAVE_ORC */ +++ +++ +++#endif /* INCLUDED_volk_gnsssdr_32f_x2_add_32f_a_H */ ++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_conjugate_32fc.h
/Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_conjugate_32fc.h ++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_conjugate_32fc.h 1970-01-01 01:00:00.000000000 +0100 +++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_conjugate_32fc.h 2014-10-15 01:55:08.000000000 +0200 ++@@ -0,0 +1,127 @@ +++#ifndef INCLUDED_volk_gnsssdr_32fc_conjugate_32fc_u_H +++#define INCLUDED_volk_gnsssdr_32fc_conjugate_32fc_u_H +++ +++#include +++#include +++#include +++#include +++ +++#ifdef LV_HAVE_SSE3 +++#include +++ /*! +++ \brief Takes the conjugate of a complex vector. +++ \param cVector The vector where the results will be stored +++ \param aVector Vector to be conjugated +++ \param num_points The number of complex values in aVector to be conjugated and stored into cVector +++ */ +++static inline void volk_gnsssdr_32fc_conjugate_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){ +++ unsigned int number = 0; +++ const unsigned int halfPoints = num_points / 2; +++ +++ __m128 x; +++ lv_32fc_t* c = cVector; +++ const lv_32fc_t* a = aVector; +++ +++ __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f); +++ +++ for(;number < halfPoints; number++){ +++ +++ x = _mm_loadu_ps((float*)a); // Load the complex data as ar,ai,br,bi +++ +++ x = _mm_xor_ps(x, conjugator); // conjugate register +++ +++ _mm_storeu_ps((float*)c,x); // Store the results back into the C container +++ +++ a += 2; +++ c += 2; +++ } +++ +++ if((num_points % 2) != 0) { +++ *c = lv_conj(*a); +++ } +++} +++#endif /* LV_HAVE_SSE3 */ +++ +++#ifdef LV_HAVE_GENERIC +++ /*! +++ \brief Takes the conjugate of a complex vector. 
+++ \param cVector The vector where the results will be stored +++ \param aVector Vector to be conjugated +++ \param num_points The number of complex values in aVector to be conjugated and stored into cVector +++ */ +++static inline void volk_gnsssdr_32fc_conjugate_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){ +++ lv_32fc_t* cPtr = cVector; +++ const lv_32fc_t* aPtr = aVector; +++ unsigned int number = 0; +++ +++ for(number = 0; number < num_points; number++){ +++ *cPtr++ = lv_conj(*aPtr++); +++ } +++} +++#endif /* LV_HAVE_GENERIC */ +++ +++ +++#endif /* INCLUDED_volk_gnsssdr_32fc_conjugate_32fc_u_H */ +++#ifndef INCLUDED_volk_gnsssdr_32fc_conjugate_32fc_a_H +++#define INCLUDED_volk_gnsssdr_32fc_conjugate_32fc_a_H +++ +++#include +++#include +++#include +++#include +++ +++#ifdef LV_HAVE_SSE3 +++#include +++ /*! +++ \brief Takes the conjugate of a complex vector. +++ \param cVector The vector where the results will be stored +++ \param aVector Vector to be conjugated +++ \param num_points The number of complex values in aVector to be conjugated and stored into cVector +++ */ +++static inline void volk_gnsssdr_32fc_conjugate_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){ +++ unsigned int number = 0; +++ const unsigned int halfPoints = num_points / 2; +++ +++ __m128 x; +++ lv_32fc_t* c = cVector; +++ const lv_32fc_t* a = aVector; +++ +++ __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f); +++ +++ for(;number < halfPoints; number++){ +++ +++ x = _mm_load_ps((float*)a); // Load the complex data as ar,ai,br,bi +++ +++ x = _mm_xor_ps(x, conjugator); // conjugate register +++ +++ _mm_store_ps((float*)c,x); // Store the results back into the C container +++ +++ a += 2; +++ c += 2; +++ } +++ +++ if((num_points % 2) != 0) { +++ *c = lv_conj(*a); +++ } +++} +++#endif /* LV_HAVE_SSE3 */ +++ +++#ifdef LV_HAVE_GENERIC +++ /*! +++ \brief Takes the conjugate of a complex vector. 
+++ \param cVector The vector where the results will be stored +++ \param aVector Vector to be conjugated +++ \param num_points The number of complex values in aVector to be conjugated and stored into cVector +++ */ +++static inline void volk_gnsssdr_32fc_conjugate_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){ +++ lv_32fc_t* cPtr = cVector; +++ const lv_32fc_t* aPtr = aVector; +++ unsigned int number = 0; +++ +++ for(number = 0; number < num_points; number++){ +++ *cPtr++ = lv_conj(*aPtr++); +++ } +++} +++#endif /* LV_HAVE_GENERIC */ +++ +++#endif /* INCLUDED_volk_gnsssdr_32fc_conjugate_32fc_a_H */ ++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_16ic.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_16ic.h ++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_16ic.h 1970-01-01 01:00:00.000000000 +0100 +++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_16ic.h 2014-10-15 01:55:08.000000000 +0200 ++@@ -0,0 +1,295 @@ +++/*! +++ * \file volk_gnsssdr_32fc_convert_16ic.h +++ * \brief Volk protokernel: converts float32 complex values to 16 integer complex values taking care of overflow +++ * \authors
+++ *          <ul>
+++ *          <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+++ *          </ul>
+++ *
+++ * -------------------------------------------------------------------------
+++ *
+++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+++ *
+++ * GNSS-SDR is a software defined Global Navigation
+++ * Satellite Systems receiver
+++ *
+++ * This file is part of GNSS-SDR.
+++ *
+++ * GNSS-SDR is free software: you can redistribute it and/or modify
+++ * it under the terms of the GNU General Public License as published by
+++ * the Free Software Foundation, either version 3 of the License, or
+++ * (at your option) any later version.
+++ *
+++ * GNSS-SDR is distributed in the hope that it will be useful,
+++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+++ * GNU General Public License for more details.
+++ *
+++ * You should have received a copy of the GNU General Public License
+++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+++ *
+++ * -------------------------------------------------------------------------
+++ */
+++
+++#ifndef INCLUDED_volk_gnsssdr_32fc_convert_16ic_u_H
+++#define INCLUDED_volk_gnsssdr_32fc_convert_16ic_u_H
+++
+++#include
+++#include
+++#include
+++
+++#ifdef LV_HAVE_SSE2
+++#include
+++/*!
+++ \brief Converts a float vector of 64 bits (32 bits each part) into a 32 integer vector (16 bits each part) +++ \param inputVector The floating point input data buffer +++ \param outputVector The 16 bit output data buffer +++ \param num_points The number of data values to be converted +++ */ +++static inline void volk_gnsssdr_32fc_convert_16ic_u_sse2(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){ +++ const unsigned int sse_iters = num_points/4; +++ +++ float* inputVectorPtr = (float*)inputVector; +++ int16_t* outputVectorPtr = (int16_t*)outputVector; +++ +++ float min_val = -32768; +++ float max_val = 32767; +++ +++ __m128 inputVal1, inputVal2; +++ __m128i intInputVal1, intInputVal2; +++ __m128 ret1, ret2; +++ __m128 vmin_val = _mm_set_ps1(min_val); +++ __m128 vmax_val = _mm_set_ps1(max_val); +++ +++ for(unsigned int i = 0;i < sse_iters; i++){ +++ inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; +++ inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; +++ +++ // Clip +++ ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val); +++ ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val); +++ +++ intInputVal1 = _mm_cvtps_epi32(ret1); +++ intInputVal2 = _mm_cvtps_epi32(ret2); +++ +++ intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); +++ +++ _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); +++ outputVectorPtr += 8; +++ } +++ +++ for(unsigned int i = 0; i < (num_points%4)*2; i++){ +++ if(inputVectorPtr[i] > max_val) +++ inputVectorPtr[i] = max_val; +++ else if(inputVectorPtr[i] < min_val) +++ inputVectorPtr[i] = min_val; +++ outputVectorPtr[i] = (int16_t)rintf(inputVectorPtr[i]); +++ } +++} +++#endif /* LV_HAVE_SSE2 */ +++ +++#ifdef LV_HAVE_SSE +++#include +++/*! 
+++ \brief Converts a float vector of 64 bits (32 bits each part) into a 32 integer vector (16 bits each part) +++ \param inputVector The floating point input data buffer +++ \param outputVector The 16 bit output data buffer +++ \param num_points The number of data values to be converted +++ */ +++static inline void volk_gnsssdr_32fc_convert_16ic_u_sse(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){ +++ const unsigned int sse_iters = num_points/4; +++ +++ float* inputVectorPtr = (float*)inputVector; +++ int16_t* outputVectorPtr = (int16_t*)outputVector; +++ +++ float min_val = -32768; +++ float max_val = 32767; +++ +++ __m128 inputVal1, inputVal2; +++ __m128i intInputVal1, intInputVal2; +++ __m128 ret1, ret2; +++ __m128 vmin_val = _mm_set_ps1(min_val); +++ __m128 vmax_val = _mm_set_ps1(max_val); +++ +++ for(unsigned int i = 0;i < sse_iters; i++){ +++ inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; +++ inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; +++ +++ // Clip +++ ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val); +++ ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val); +++ +++ intInputVal1 = _mm_cvtps_epi32(ret1); +++ intInputVal2 = _mm_cvtps_epi32(ret2); +++ +++ intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); +++ +++ _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); +++ outputVectorPtr += 8; +++ } +++ +++ for(unsigned int i = 0; i < (num_points%4)*2; i++){ +++ if(inputVectorPtr[i] > max_val) +++ inputVectorPtr[i] = max_val; +++ else if(inputVectorPtr[i] < min_val) +++ inputVectorPtr[i] = min_val; +++ outputVectorPtr[i] = (int16_t)rintf(inputVectorPtr[i]); +++ } +++} +++#endif /* LV_HAVE_SSE */ +++ +++#ifdef LV_HAVE_GENERIC +++/*! 
+++ \brief Converts a float vector of 64 bits (32 bits each part) into a 32 integer vector (16 bits each part) +++ \param inputVector The floating point input data buffer +++ \param outputVector The 16 bit output data buffer +++ \param num_points The number of data values to be converted +++ */ +++static inline void volk_gnsssdr_32fc_convert_16ic_generic(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){ +++ float* inputVectorPtr = (float*)inputVector; +++ int16_t* outputVectorPtr = (int16_t*)outputVector; +++ float min_val = -32768; +++ float max_val = 32767; +++ +++ for(unsigned int i = 0; i < num_points*2; i++){ +++ if(inputVectorPtr[i] > max_val) +++ inputVectorPtr[i] = max_val; +++ else if(inputVectorPtr[i] < min_val) +++ inputVectorPtr[i] = min_val; +++ outputVectorPtr[i] = (int16_t)rintf(inputVectorPtr[i]); +++ } +++} +++#endif /* LV_HAVE_GENERIC */ +++#endif /* INCLUDED_volk_gnsssdr_32fc_convert_16ic_u_H */ +++ +++ +++#ifndef INCLUDED_volk_gnsssdr_32fc_convert_16ic_a_H +++#define INCLUDED_volk_gnsssdr_32fc_convert_16ic_a_H +++ +++#include +++#include +++#include +++#include +++ +++#ifdef LV_HAVE_SSE2 +++#include +++/*! 
+++ \brief Converts a float vector of 64 bits (32 bits each part) into a 32 integer vector (16 bits each part) +++ \param inputVector The floating point input data buffer +++ \param outputVector The 16 bit output data buffer +++ \param num_points The number of data values to be converted +++ */ +++static inline void volk_gnsssdr_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){ +++ const unsigned int sse_iters = num_points/4; +++ +++ float* inputVectorPtr = (float*)inputVector; +++ int16_t* outputVectorPtr = (int16_t*)outputVector; +++ +++ float min_val = -32768; +++ float max_val = 32767; +++ +++ __m128 inputVal1, inputVal2; +++ __m128i intInputVal1, intInputVal2; +++ __m128 ret1, ret2; +++ __m128 vmin_val = _mm_set_ps1(min_val); +++ __m128 vmax_val = _mm_set_ps1(max_val); +++ +++ for(unsigned int i = 0;i < sse_iters; i++){ +++ inputVal1 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; +++ inputVal2 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; +++ +++ // Clip +++ ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val); +++ ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val); +++ +++ intInputVal1 = _mm_cvtps_epi32(ret1); +++ intInputVal2 = _mm_cvtps_epi32(ret2); +++ +++ intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); +++ +++ _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); +++ outputVectorPtr += 8; +++ } +++ +++ for(unsigned int i = 0; i < (num_points%4)*2; i++){ +++ if(inputVectorPtr[i] > max_val) +++ inputVectorPtr[i] = max_val; +++ else if(inputVectorPtr[i] < min_val) +++ inputVectorPtr[i] = min_val; +++ outputVectorPtr[i] = (int16_t)rintf(inputVectorPtr[i]); +++ } +++} +++#endif /* LV_HAVE_SSE2 */ +++ +++#ifdef LV_HAVE_SSE +++#include +++/*! 
+++ \brief Converts a float vector of 64 bits (32 bits each part) into a 32 integer vector (16 bits each part) +++ \param inputVector The floating point input data buffer +++ \param outputVector The 16 bit output data buffer +++ \param num_points The number of data values to be converted +++ */ +++static inline void volk_gnsssdr_32fc_convert_16ic_a_sse(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){ +++ const unsigned int sse_iters = num_points/4; +++ +++ float* inputVectorPtr = (float*)inputVector; +++ int16_t* outputVectorPtr = (int16_t*)outputVector; +++ +++ float min_val = -32768; +++ float max_val = 32767; +++ +++ __m128 inputVal1, inputVal2; +++ __m128i intInputVal1, intInputVal2; +++ __m128 ret1, ret2; +++ __m128 vmin_val = _mm_set_ps1(min_val); +++ __m128 vmax_val = _mm_set_ps1(max_val); +++ +++ for(unsigned int i = 0;i < sse_iters; i++){ +++ inputVal1 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; +++ inputVal2 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; +++ +++ // Clip +++ ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val); +++ ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val); +++ +++ intInputVal1 = _mm_cvtps_epi32(ret1); +++ intInputVal2 = _mm_cvtps_epi32(ret2); +++ +++ intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); +++ +++ _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); +++ outputVectorPtr += 8; +++ } +++ +++ for(unsigned int i = 0; i < (num_points%4)*2; i++){ +++ if(inputVectorPtr[i] > max_val) +++ inputVectorPtr[i] = max_val; +++ else if(inputVectorPtr[i] < min_val) +++ inputVectorPtr[i] = min_val; +++ outputVectorPtr[i] = (int16_t)rintf(inputVectorPtr[i]); +++ } +++} +++#endif /* LV_HAVE_SSE */ +++ +++#ifdef LV_HAVE_GENERIC +++/*! 
+++ \brief Converts a float vector of 64 bits (32 bits each part) into a 32 integer vector (16 bits each part) +++ \param inputVector The floating point input data buffer +++ \param outputVector The 16 bit output data buffer +++ \param num_points The number of data values to be converted +++ */ +++static inline void volk_gnsssdr_32fc_convert_16ic_a_generic(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){ +++ float* inputVectorPtr = (float*)inputVector; +++ int16_t* outputVectorPtr = (int16_t*)outputVector; +++ float min_val = -32768; +++ float max_val = 32767; +++ +++ for(unsigned int i = 0; i < num_points*2; i++){ +++ if(inputVectorPtr[i] > max_val) +++ inputVectorPtr[i] = max_val; +++ else if(inputVectorPtr[i] < min_val) +++ inputVectorPtr[i] = min_val; +++ outputVectorPtr[i] = (int16_t)rintf(inputVectorPtr[i]); +++ } +++} +++#endif /* LV_HAVE_GENERIC */ +++#endif /* INCLUDED_volk_gnsssdr_32fc_convert_16ic_a_H */ ++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_8ic.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_8ic.h ++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_8ic.h 1970-01-01 01:00:00.000000000 +0100 +++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_8ic.h 2014-10-15 01:55:08.000000000 +0200 ++@@ -0,0 +1,213 @@ +++/*! +++ * \file volk_gnsssdr_32fc_convert_8ic.h +++ * \brief Volk protokernel: converts float32 complex values to 8 integer complex values taking care of overflow +++ * \authors
+++ *          <ul>
+++ *          <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+++ *          </ul>
+++ *
+++ * -------------------------------------------------------------------------
+++ *
+++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+++ *
+++ * GNSS-SDR is a software defined Global Navigation
+++ * Satellite Systems receiver
+++ *
+++ * This file is part of GNSS-SDR.
+++ *
+++ * GNSS-SDR is free software: you can redistribute it and/or modify
+++ * it under the terms of the GNU General Public License as published by
+++ * the Free Software Foundation, either version 3 of the License, or
+++ * (at your option) any later version.
+++ *
+++ * GNSS-SDR is distributed in the hope that it will be useful,
+++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+++ * GNU General Public License for more details.
+++ *
+++ * You should have received a copy of the GNU General Public License
+++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+++ *
+++ * -------------------------------------------------------------------------
+++ */
+++
+++#ifndef INCLUDED_volk_gnsssdr_32fc_convert_8ic_u_H
+++#define INCLUDED_volk_gnsssdr_32fc_convert_8ic_u_H
+++
+++#include
+++#include
+++#include
+++
+++#ifdef LV_HAVE_SSE2
+++#include
+++/*!
+++ \brief Converts a float vector of 64 bits (32 bits each part) into a 16 integer vector (8 bits each part) +++ \param inputVector The floating point input data buffer +++ \param outputVector The 16 bit output data buffer +++ \param num_points The number of data values to be converted +++ */ +++static inline void volk_gnsssdr_32fc_convert_8ic_u_sse2(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){ +++ const unsigned int sse_iters = num_points/8; +++ +++ float* inputVectorPtr = (float*)inputVector; +++ int8_t* outputVectorPtr = (int8_t*)outputVector; +++ +++ float min_val = -128; +++ float max_val = 127; +++ +++ __m128 inputVal1, inputVal2, inputVal3, inputVal4; +++ __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4; +++ __m128i int8InputVal; +++ __m128 ret1, ret2, ret3, ret4; +++ __m128 vmin_val = _mm_set_ps1(min_val); +++ __m128 vmax_val = _mm_set_ps1(max_val); +++ +++ for(unsigned int i = 0;i < sse_iters; i++){ +++ inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; +++ inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; +++ inputVal3 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; +++ inputVal4 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; +++ +++ // Clip +++ ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val); +++ ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val); +++ ret3 = _mm_max_ps(_mm_min_ps(inputVal3, vmax_val), vmin_val); +++ ret4 = _mm_max_ps(_mm_min_ps(inputVal4, vmax_val), vmin_val); +++ +++ intInputVal1 = _mm_cvtps_epi32(ret1); +++ intInputVal2 = _mm_cvtps_epi32(ret2); +++ intInputVal3 = _mm_cvtps_epi32(ret3); +++ intInputVal4 = _mm_cvtps_epi32(ret4); +++ +++ intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); +++ intInputVal2 = _mm_packs_epi32(intInputVal3, intInputVal4); +++ int8InputVal = _mm_packs_epi16(intInputVal1, intInputVal2); +++ +++ _mm_storeu_si128((__m128i*)outputVectorPtr, int8InputVal); +++ 
outputVectorPtr += 16; +++ } +++ +++ for(unsigned int i = 0; i < (num_points%4)*4; i++){ +++ if(inputVectorPtr[i] > max_val) +++ inputVectorPtr[i] = max_val; +++ else if(inputVectorPtr[i] < min_val) +++ inputVectorPtr[i] = min_val; +++ outputVectorPtr[i] = (int8_t)rintf(inputVectorPtr[i]); +++ } +++} +++#endif /* LV_HAVE_SSE2 */ +++ +++#ifdef LV_HAVE_GENERIC +++/*! +++ \brief Converts a float vector of 64 bits (32 bits each part) into a 16 integer vector (8 bits each part) +++ \param inputVector The floating point input data buffer +++ \param outputVector The 16 bit output data buffer +++ \param num_points The number of data values to be converted +++ */ +++static inline void volk_gnsssdr_32fc_convert_8ic_generic(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){ +++ float* inputVectorPtr = (float*)inputVector; +++ int8_t* outputVectorPtr = (int8_t*)outputVector; +++ float min_val = -128; +++ float max_val = 127; +++ +++ for(unsigned int i = 0; i < num_points*2; i++){ +++ if(inputVectorPtr[i] > max_val) +++ inputVectorPtr[i] = max_val; +++ else if(inputVectorPtr[i] < min_val) +++ inputVectorPtr[i] = min_val; +++ outputVectorPtr[i] = (int8_t)rintf(inputVectorPtr[i]); +++ } +++} +++#endif /* LV_HAVE_GENERIC */ +++#endif /* INCLUDED_volk_gnsssdr_32fc_convert_8ic_u_H */ +++ +++ +++#ifndef INCLUDED_volk_gnsssdr_32fc_convert_8ic_a_H +++#define INCLUDED_volk_gnsssdr_32fc_convert_8ic_a_H +++ +++#include +++#include +++#include +++#include +++ +++#ifdef LV_HAVE_SSE2 +++#include +++/*! 
+++ \brief Converts a float vector of 64 bits (32 bits each part) into a 16 integer vector (8 bits each part) +++ \param inputVector The floating point input data buffer +++ \param outputVector The 16 bit output data buffer +++ \param num_points The number of data values to be converted +++ */ +++static inline void volk_gnsssdr_32fc_convert_8ic_a_sse2(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){ +++ const unsigned int sse_iters = num_points/8; +++ +++ float* inputVectorPtr = (float*)inputVector; +++ int8_t* outputVectorPtr = (int8_t*)outputVector; +++ +++ float min_val = -128; +++ float max_val = 127; +++ +++ __m128 inputVal1, inputVal2, inputVal3, inputVal4; +++ __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4; +++ __m128i int8InputVal; +++ __m128 ret1, ret2, ret3, ret4; +++ __m128 vmin_val = _mm_set_ps1(min_val); +++ __m128 vmax_val = _mm_set_ps1(max_val); +++ +++ for(unsigned int i = 0;i < sse_iters; i++){ +++ inputVal1 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; +++ inputVal2 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; +++ inputVal3 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; +++ inputVal4 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; +++ +++ // Clip +++ ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val); +++ ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val); +++ ret3 = _mm_max_ps(_mm_min_ps(inputVal3, vmax_val), vmin_val); +++ ret4 = _mm_max_ps(_mm_min_ps(inputVal4, vmax_val), vmin_val); +++ +++ intInputVal1 = _mm_cvtps_epi32(ret1); +++ intInputVal2 = _mm_cvtps_epi32(ret2); +++ intInputVal3 = _mm_cvtps_epi32(ret3); +++ intInputVal4 = _mm_cvtps_epi32(ret4); +++ +++ intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); +++ intInputVal2 = _mm_packs_epi32(intInputVal3, intInputVal4); +++ int8InputVal = _mm_packs_epi16(intInputVal1, intInputVal2); +++ +++ _mm_store_si128((__m128i*)outputVectorPtr, int8InputVal); +++ 
outputVectorPtr += 16; +++ } +++ +++ for(unsigned int i = 0; i < (num_points%4)*4; i++){ +++ if(inputVectorPtr[i] > max_val) +++ inputVectorPtr[i] = max_val; +++ else if(inputVectorPtr[i] < min_val) +++ inputVectorPtr[i] = min_val; +++ outputVectorPtr[i] = (int8_t)rintf(inputVectorPtr[i]); +++ } +++} +++#endif /* LV_HAVE_SSE2 */ +++ +++#ifdef LV_HAVE_GENERIC +++/*! +++ \brief Converts a float vector of 64 bits (32 bits each part) into a 16 integer vector (8 bits each part) +++ \param inputVector The floating point input data buffer +++ \param outputVector The 16 bit output data buffer +++ \param num_points The number of data values to be converted +++ */ +++static inline void volk_gnsssdr_32fc_convert_8ic_a_generic(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){ +++ float* inputVectorPtr = (float*)inputVector; +++ int8_t* outputVectorPtr = (int8_t*)outputVector; +++ float min_val = -128; +++ float max_val = 127; +++ +++ for(unsigned int i = 0; i < num_points*2; i++){ +++ if(inputVectorPtr[i] > max_val) +++ inputVectorPtr[i] = max_val; +++ else if(inputVectorPtr[i] < min_val) +++ inputVectorPtr[i] = min_val; +++ outputVectorPtr[i] = (int8_t)rintf(inputVectorPtr[i]); +++ } +++} +++#endif /* LV_HAVE_GENERIC */ +++#endif /* INCLUDED_volk_gnsssdr_32fc_convert_8ic_a_H */ ++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_magnitude_squared_32f.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_magnitude_squared_32f.h ++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_magnitude_squared_32f.h 1970-01-01 01:00:00.000000000 +0100 +++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_magnitude_squared_32f.h 2014-10-15 01:55:08.000000000 +0200 ++@@ -0,0 +1,228 @@ +++#ifndef INCLUDED_volk_gnsssdr_32fc_magnitude_squared_32f_u_H +++#define INCLUDED_volk_gnsssdr_32fc_magnitude_squared_32f_u_H +++ +++#include 
+++#include +++#include +++ +++#ifdef LV_HAVE_SSE3 +++#include +++ /*! +++ \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector +++ \param complexVector The vector containing the complex input values +++ \param magnitudeVector The vector containing the real output values +++ \param num_points The number of complex values in complexVector to be calculated and stored into cVector +++ */ +++static inline void volk_gnsssdr_32fc_magnitude_squared_32f_u_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; +++ +++ const float* complexVectorPtr = (float*)complexVector; +++ float* magnitudeVectorPtr = magnitudeVector; +++ +++ __m128 cplxValue1, cplxValue2, result; +++ for(;number < quarterPoints; number++){ +++ cplxValue1 = _mm_loadu_ps(complexVectorPtr); +++ complexVectorPtr += 4; +++ +++ cplxValue2 = _mm_loadu_ps(complexVectorPtr); +++ complexVectorPtr += 4; +++ +++ cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values +++ cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values +++ +++ result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values +++ +++ _mm_storeu_ps(magnitudeVectorPtr, result); +++ magnitudeVectorPtr += 4; +++ } +++ +++ number = quarterPoints * 4; +++ for(; number < num_points; number++){ +++ float val1Real = *complexVectorPtr++; +++ float val1Imag = *complexVectorPtr++; +++ *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); +++ } +++} +++#endif /* LV_HAVE_SSE3 */ +++ +++#ifdef LV_HAVE_SSE +++#include +++ /*! 
+++ \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector +++ \param complexVector The vector containing the complex input values +++ \param magnitudeVector The vector containing the real output values +++ \param num_points The number of complex values in complexVector to be calculated and stored into cVector +++ */ +++static inline void volk_gnsssdr_32fc_magnitude_squared_32f_u_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; +++ +++ const float* complexVectorPtr = (float*)complexVector; +++ float* magnitudeVectorPtr = magnitudeVector; +++ +++ __m128 cplxValue1, cplxValue2, iValue, qValue, result; +++ for(;number < quarterPoints; number++){ +++ cplxValue1 = _mm_loadu_ps(complexVectorPtr); +++ complexVectorPtr += 4; +++ +++ cplxValue2 = _mm_loadu_ps(complexVectorPtr); +++ complexVectorPtr += 4; +++ +++ // Arrange in i1i2i3i4 format +++ iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); +++ // Arrange in q1q2q3q4 format +++ qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); +++ +++ iValue = _mm_mul_ps(iValue, iValue); // Square the I values +++ qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values +++ +++ result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values +++ +++ _mm_storeu_ps(magnitudeVectorPtr, result); +++ magnitudeVectorPtr += 4; +++ } +++ +++ number = quarterPoints * 4; +++ for(; number < num_points; number++){ +++ float val1Real = *complexVectorPtr++; +++ float val1Imag = *complexVectorPtr++; +++ *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); +++ } +++} +++#endif /* LV_HAVE_SSE */ +++ +++#ifdef LV_HAVE_GENERIC +++ /*! 
+++ \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector +++ \param complexVector The vector containing the complex input values +++ \param magnitudeVector The vector containing the real output values +++ \param num_points The number of complex values in complexVector to be calculated and stored into cVector +++ */ +++static inline void volk_gnsssdr_32fc_magnitude_squared_32f_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ +++ const float* complexVectorPtr = (float*)complexVector; +++ float* magnitudeVectorPtr = magnitudeVector; +++ unsigned int number = 0; +++ for(number = 0; number < num_points; number++){ +++ const float real = *complexVectorPtr++; +++ const float imag = *complexVectorPtr++; +++ *magnitudeVectorPtr++ = (real*real) + (imag*imag); +++ } +++} +++#endif /* LV_HAVE_GENERIC */ +++ +++#endif /* INCLUDED_volk_gnsssdr_32fc_magnitude_32f_u_H */ +++#ifndef INCLUDED_volk_gnsssdr_32fc_magnitude_squared_32f_a_H +++#define INCLUDED_volk_gnsssdr_32fc_magnitude_squared_32f_a_H +++ +++#include +++#include +++#include +++ +++#ifdef LV_HAVE_SSE3 +++#include +++ /*! 
+++ \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector +++ \param complexVector The vector containing the complex input values +++ \param magnitudeVector The vector containing the real output values +++ \param num_points The number of complex values in complexVector to be calculated and stored into cVector +++ */ +++static inline void volk_gnsssdr_32fc_magnitude_squared_32f_a_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; +++ +++ const float* complexVectorPtr = (float*)complexVector; +++ float* magnitudeVectorPtr = magnitudeVector; +++ +++ __m128 cplxValue1, cplxValue2, result; +++ for(;number < quarterPoints; number++){ +++ cplxValue1 = _mm_load_ps(complexVectorPtr); +++ complexVectorPtr += 4; +++ +++ cplxValue2 = _mm_load_ps(complexVectorPtr); +++ complexVectorPtr += 4; +++ +++ cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values +++ cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values +++ +++ result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values +++ +++ _mm_store_ps(magnitudeVectorPtr, result); +++ magnitudeVectorPtr += 4; +++ } +++ +++ number = quarterPoints * 4; +++ for(; number < num_points; number++){ +++ float val1Real = *complexVectorPtr++; +++ float val1Imag = *complexVectorPtr++; +++ *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); +++ } +++} +++#endif /* LV_HAVE_SSE3 */ +++ +++#ifdef LV_HAVE_SSE +++#include +++ /*! 
+++ \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector +++ \param complexVector The vector containing the complex input values +++ \param magnitudeVector The vector containing the real output values +++ \param num_points The number of complex values in complexVector to be calculated and stored into cVector +++ */ +++static inline void volk_gnsssdr_32fc_magnitude_squared_32f_a_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ +++ unsigned int number = 0; +++ const unsigned int quarterPoints = num_points / 4; +++ +++ const float* complexVectorPtr = (float*)complexVector; +++ float* magnitudeVectorPtr = magnitudeVector; +++ +++ __m128 cplxValue1, cplxValue2, iValue, qValue, result; +++ for(;number < quarterPoints; number++){ +++ cplxValue1 = _mm_load_ps(complexVectorPtr); +++ complexVectorPtr += 4; +++ +++ cplxValue2 = _mm_load_ps(complexVectorPtr); +++ complexVectorPtr += 4; +++ +++ // Arrange in i1i2i3i4 format +++ iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); +++ // Arrange in q1q2q3q4 format +++ qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); +++ +++ iValue = _mm_mul_ps(iValue, iValue); // Square the I values +++ qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values +++ +++ result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values +++ +++ _mm_store_ps(magnitudeVectorPtr, result); +++ magnitudeVectorPtr += 4; +++ } +++ +++ number = quarterPoints * 4; +++ for(; number < num_points; number++){ +++ float val1Real = *complexVectorPtr++; +++ float val1Imag = *complexVectorPtr++; +++ *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); +++ } +++} +++#endif /* LV_HAVE_SSE */ +++ +++#ifdef LV_HAVE_GENERIC +++ /*! 
+++    \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector
+++    \param complexVector The vector containing the complex input values
+++    \param magnitudeVector The vector containing the real output values
+++    \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
+++  */
+++static inline void volk_gnsssdr_32fc_magnitude_squared_32f_a_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+++    const float* complexVectorPtr = (float*)complexVector; // read the input as interleaved real/imag floats
+++    float* magnitudeVectorPtr = magnitudeVector;
+++    unsigned int number = 0;
+++    for(number = 0; number < num_points; number++){
+++        const float real = *complexVectorPtr++;
+++        const float imag = *complexVectorPtr++;
+++        *magnitudeVectorPtr++ = (real*real) + (imag*imag);
+++    }
+++}
+++#endif /* LV_HAVE_GENERIC */
+++
+++#endif /* INCLUDED_volk_gnsssdr_32fc_magnitude_32f_a_H */
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32f_convert_8ic.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32f_convert_8ic.h
++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32f_convert_8ic.h 1970-01-01 01:00:00.000000000 +0100
+++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32f_convert_8ic.h 2014-10-15 01:55:08.000000000 +0200
++@@ -0,0 +1,231 @@
+++/*!
+++ * \file volk_gnsssdr_32fc_s32f_convert_8ic.h
+++ * \brief Volk protokernel: converts float32 complex values to 8-bit integer complex values taking care of overflow
+++ * \authors <ul>
+++ *          <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+++ *          </ul>
+++ *
+++ * -------------------------------------------------------------------------
+++ *
+++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+++ *
+++ * GNSS-SDR is a software defined Global Navigation
+++ * Satellite Systems receiver
+++ *
+++ * This file is part of GNSS-SDR.
+++ *
+++ * GNSS-SDR is free software: you can redistribute it and/or modify
+++ * it under the terms of the GNU General Public License as published by
+++ * the Free Software Foundation, either version 3 of the License, or
+++ * (at your option) any later version.
+++ *
+++ * GNSS-SDR is distributed in the hope that it will be useful,
+++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+++ * GNU General Public License for more details.
+++ *
+++ * You should have received a copy of the GNU General Public License
+++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+++ *
+++ * -------------------------------------------------------------------------
+++ */
+++
+++#ifndef INCLUDED_volk_gnsssdr_32fc_s32f_convert_8ic_u_H
+++#define INCLUDED_volk_gnsssdr_32fc_s32f_convert_8ic_u_H
+++
+++#include
+++#include
+++#include
+++
+++#ifdef LV_HAVE_SSE2
+++#include
+++/*!
+++ \brief Converts a complex float vector (32 bits per component) into a complex 8-bit integer vector; each component is divided by scalar and clipped to [-128, 127]
+++ \param inputVector The floating point input data buffer
+++ \param outputVector The 8-bit complex output data buffer
+++ \param num_points The number of complex values to be converted
+++ */
+++static inline void volk_gnsssdr_32fc_s32f_convert_8ic_u_sse2(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, const float scalar, unsigned int num_points){
+++    const unsigned int sse_iters = num_points/8; // each iteration converts 8 complex values (16 floats)
+++
+++    float* inputVectorPtr = (float*)inputVector;
+++    int8_t* outputVectorPtr = (int8_t*)outputVector;
+++    __m128 invScalar = _mm_set_ps1(1.0/scalar);
+++
+++    float min_val = -128;
+++    float max_val = 127;
+++
+++    __m128 inputVal1, inputVal2, inputVal3, inputVal4;
+++    __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
+++    __m128i int8InputVal;
+++    __m128 ret1, ret2, ret3, ret4;
+++    __m128 vmin_val = _mm_set_ps1(min_val);
+++    __m128 vmax_val = _mm_set_ps1(max_val);
+++
+++    for(unsigned int i = 0;i < sse_iters; i++){
+++        inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
+++        inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
+++        inputVal3 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
+++        inputVal4 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
+++
+++        inputVal1 = _mm_mul_ps(inputVal1, invScalar);
+++        inputVal2 = _mm_mul_ps(inputVal2, invScalar);
+++        inputVal3 = _mm_mul_ps(inputVal3, invScalar);
+++        inputVal4 = _mm_mul_ps(inputVal4, invScalar);
+++        // Clip
+++        ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
+++        ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
+++        ret3 = _mm_max_ps(_mm_min_ps(inputVal3, vmax_val), vmin_val);
+++        ret4 = _mm_max_ps(_mm_min_ps(inputVal4, vmax_val), vmin_val);
+++
+++        intInputVal1 = _mm_cvtps_epi32(ret1);
+++        intInputVal2 = _mm_cvtps_epi32(ret2);
+++        intInputVal3 = _mm_cvtps_epi32(ret3);
+++        intInputVal4 = _mm_cvtps_epi32(ret4);
+++
+++        intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
+++        intInputVal2 = _mm_packs_epi32(intInputVal3, intInputVal4);
+++        int8InputVal = _mm_packs_epi16(intInputVal1, intInputVal2);
+++
+++        _mm_storeu_si128((__m128i*)outputVectorPtr, int8InputVal);
+++        outputVectorPtr += 16;
+++    }
+++
+++    float scaled = 0;
+++    for(unsigned int i = 0; i < (num_points%8)*2; i++){ // tail: the num_points % 8 complex values left over, two floats each
+++        scaled = inputVectorPtr[i]/scalar;
+++        if(scaled > max_val)
+++            scaled = max_val;
+++        else if(scaled < min_val)
+++            scaled = min_val;
+++        outputVectorPtr[i] = (int8_t)rintf(scaled);
+++    }
+++}
+++#endif /* LV_HAVE_SSE2 */
+++
+++#ifdef LV_HAVE_GENERIC
+++/*!
+++ \brief Converts a complex float vector (32 bits per component) into a complex 8-bit integer vector; each component is divided by scalar and clipped to [-128, 127]
+++ \param inputVector The floating point input data buffer
+++ \param outputVector The 8-bit complex output data buffer
+++ \param num_points The number of complex values to be converted
+++ */
+++static inline void volk_gnsssdr_32fc_s32f_convert_8ic_generic(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, const float scalar, unsigned int num_points){
+++    float* inputVectorPtr = (float*)inputVector;
+++    int8_t* outputVectorPtr = (int8_t*)outputVector;
+++    float scaled = 0;
+++    float min_val = -128;
+++    float max_val = 127;
+++
+++    for(unsigned int i = 0; i < num_points*2; i++){ // two components (real, imaginary) per complex value
+++        scaled = (inputVectorPtr[i])/scalar;
+++        if(scaled > max_val)
+++            scaled = max_val;
+++        else if(scaled < min_val)
+++            scaled = min_val;
+++        outputVectorPtr[i] = (int8_t)rintf(scaled);
+++    }
+++}
+++#endif /* LV_HAVE_GENERIC */
+++#endif /* INCLUDED_volk_gnsssdr_32fc_s32f_convert_8ic_u_H */
+++
+++
+++#ifndef INCLUDED_volk_gnsssdr_32fc_s32f_convert_8ic_a_H
+++#define INCLUDED_volk_gnsssdr_32fc_s32f_convert_8ic_a_H
+++
+++#include
+++#include
+++#include
+++#include
+++
+++#ifdef LV_HAVE_SSE2
+++#include
+++/*!
+++ \brief Converts a complex float vector (32 bits per component) into a complex 8-bit integer vector; each component is divided by scalar and clipped to [-128, 127]
+++ \param inputVector The floating point input data buffer
+++ \param outputVector The 8-bit complex output data buffer
+++ \param num_points The number of complex values to be converted
+++ */
+++static inline void volk_gnsssdr_32fc_s32f_convert_8ic_a_sse2(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, const float scalar, unsigned int num_points){
+++    const unsigned int sse_iters = num_points/8; // each iteration converts 8 complex values (16 floats)
+++
+++    float* inputVectorPtr = (float*)inputVector;
+++    int8_t* outputVectorPtr = (int8_t*)outputVector;
+++    __m128 invScalar = _mm_set_ps1(1.0/scalar);
+++
+++    float min_val = -128;
+++    float max_val = 127;
+++
+++    __m128 inputVal1, inputVal2, inputVal3, inputVal4;
+++    __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
+++    __m128i int8InputVal;
+++    __m128 ret1, ret2, ret3, ret4;
+++    __m128 vmin_val = _mm_set_ps1(min_val);
+++    __m128 vmax_val = _mm_set_ps1(max_val);
+++
+++    for(unsigned int i = 0;i < sse_iters; i++){
+++        inputVal1 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
+++        inputVal2 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
+++        inputVal3 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
+++        inputVal4 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
+++
+++        inputVal1 = _mm_mul_ps(inputVal1, invScalar);
+++        inputVal2 = _mm_mul_ps(inputVal2, invScalar);
+++        inputVal3 = _mm_mul_ps(inputVal3, invScalar);
+++        inputVal4 = _mm_mul_ps(inputVal4, invScalar);
+++        // Clip
+++        ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
+++        ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
+++        ret3 = _mm_max_ps(_mm_min_ps(inputVal3, vmax_val), vmin_val);
+++        ret4 = _mm_max_ps(_mm_min_ps(inputVal4, vmax_val), vmin_val);
+++
+++        intInputVal1 = _mm_cvtps_epi32(ret1);
+++        intInputVal2 = _mm_cvtps_epi32(ret2);
+++        intInputVal3 = _mm_cvtps_epi32(ret3);
+++        intInputVal4 = _mm_cvtps_epi32(ret4);
+++
+++        intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
+++        intInputVal2 = _mm_packs_epi32(intInputVal3, intInputVal4);
+++        int8InputVal = _mm_packs_epi16(intInputVal1, intInputVal2);
+++
+++        _mm_store_si128((__m128i*)outputVectorPtr, int8InputVal);
+++        outputVectorPtr += 16;
+++    }
+++
+++    float scaled = 0;
+++    for(unsigned int i = 0; i < (num_points%8)*2; i++){ // tail: the num_points % 8 complex values left over, two floats each
+++        scaled = inputVectorPtr[i]/scalar;
+++        if(scaled > max_val)
+++            scaled = max_val;
+++        else if(scaled < min_val)
+++            scaled = min_val;
+++        outputVectorPtr[i] = (int8_t)rintf(scaled);
+++    }
+++}
+++#endif /* LV_HAVE_SSE2 */
+++
+++#ifdef LV_HAVE_GENERIC
+++/*!
+++ \brief Converts a complex float vector (32 bits per component) into a complex 8-bit integer vector; each component is divided by scalar and clipped to [-128, 127]
+++ \param inputVector The floating point input data buffer
+++ \param outputVector The 8-bit complex output data buffer
+++ \param num_points The number of complex values to be converted
+++ */
+++static inline void volk_gnsssdr_32fc_s32f_convert_8ic_a_generic(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, const float scalar, unsigned int num_points){
+++    float* inputVectorPtr = (float*)inputVector;
+++    int8_t* outputVectorPtr = (int8_t*)outputVector;
+++    float scaled = 0;
+++    float min_val = -128;
+++    float max_val = 127;
+++
+++    for(unsigned int i = 0; i < num_points*2; i++){ // two components (real, imaginary) per complex value
+++        scaled = inputVectorPtr[i]/scalar;
+++        if(scaled > max_val)
+++            scaled = max_val;
+++        else if(scaled < min_val)
+++            scaled = min_val;
+++        outputVectorPtr[i] = (int8_t)rintf(scaled);
+++    }
+++}
+++#endif /* LV_HAVE_GENERIC */
+++#endif /* INCLUDED_volk_gnsssdr_32fc_s32f_convert_8ic_a_H */
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc.h
++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc.h 1970-01-01
01:00:00.000000000 +0100
+++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc.h 2014-10-15 01:55:08.000000000 +0200
++@@ -0,0 +1,266 @@
+++/*!
+++ * \file volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc.h
+++ * \brief Volk protokernel: replaces the tracking function for update_local_code
+++ * \authors <ul>
+++ *          <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+++ *          </ul>
+++ *
+++ * Volk protokernel that replaces the tracking function for update_local_code
+++ *
+++ * -------------------------------------------------------------------------
+++ *
+++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+++ *
+++ * GNSS-SDR is a software defined Global Navigation
+++ * Satellite Systems receiver
+++ *
+++ * This file is part of GNSS-SDR.
+++ *
+++ * GNSS-SDR is free software: you can redistribute it and/or modify
+++ * it under the terms of the GNU General Public License as published by
+++ * the Free Software Foundation, either version 3 of the License, or
+++ * (at your option) any later version.
+++ *
+++ * GNSS-SDR is distributed in the hope that it will be useful,
+++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+++ * GNU General Public License for more details.
+++ *
+++ * You should have received a copy of the GNU General Public License
+++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+++ *
+++ * -------------------------------------------------------------------------
+++ */
+++
+++#ifndef INCLUDED_volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_u_H
+++#define INCLUDED_volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_u_H
+++
+++#include
+++#include
+++#include
+++#include
+++
+++#ifdef LV_HAVE_SSE4_1
+++#include
+++  /*!
+++    \brief Fills d_very_early_code by copying, for every point, one chip of d_ca_code selected by a computed index.
+++    \param d_very_early_code Output vector that receives one chip of d_ca_code per point
+++    \param d_very_early_late_spc_chips,code_length_half_chips,code_phase_step_half_chips,tcode_half_chips_input Scalars of the chip index computation: index = 2 + round(fmod(tcode - 2*d_very_early_late_spc_chips, code_length_half_chips)), with tcode starting at tcode_half_chips_input and increasing by code_phase_step_half_chips per point
+++    \param d_ca_code Input code vector read at the computed index. \param num_points The number of points written to d_very_early_code
+++  */
+++static inline void volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_u_sse4_1(lv_32fc_t* d_very_early_code, const float d_very_early_late_spc_chips, const float code_length_half_chips, const float code_phase_step_half_chips, const float tcode_half_chips_input, const lv_32fc_t* d_ca_code, unsigned int num_points){
+++
+++//    float* pointer1 = (float*)&d_very_early_late_spc_chips;
+++//    *pointer1 = 1;
+++//    float* pointer2 = (float*)&code_length_half_chips;
+++//    *pointer2 = 6;
+++//    float* pointer3 = (float*)&code_phase_step_half_chips;
+++//    *pointer3 = 7;
+++//    float* pointer4 = (float*)&tcode_half_chips_input;
+++//    *pointer4 = 8;
+++
+++    const unsigned int sse_iters = num_points / 4; // four points are produced per SIMD iteration
+++
+++    __m128 tquot, fmod_num, fmod_result, associated_chip_index_array;
+++
+++    __m128 tcode_half_chips_array = _mm_set_ps (tcode_half_chips_input+3*code_phase_step_half_chips, tcode_half_chips_input+2*code_phase_step_half_chips, tcode_half_chips_input+code_phase_step_half_chips, tcode_half_chips_input);
+++    __m128 code_phase_step_half_chips_array = _mm_set1_ps (code_phase_step_half_chips*4);
+++    __m128 d_very_early_late_spc_chips_Multiplied_by_2 = _mm_set1_ps (2*d_very_early_late_spc_chips);
+++    __m128 code_length_half_chips_array = _mm_set1_ps (code_length_half_chips);
+++    __m128 twos = _mm_set1_ps (2);
+++    __m128i associated_chip_index_array_int;
+++
+++    __VOLK_ATTR_ALIGNED(16) int32_t output[4];
+++
+++    for (unsigned int i = 0; i < sse_iters; i++)
+++    {
+++        //fmod = numer - tquot * denom; tquot = numer/denom truncated
+++        //associated_chip_index = 2 + round(fmod(tcode_half_chips - 2*d_very_early_late_spc_chips, code_length_half_chips));
+++        fmod_num = _mm_sub_ps (tcode_half_chips_array, d_very_early_late_spc_chips_Multiplied_by_2);
+++        tquot = _mm_div_ps (fmod_num, code_length_half_chips_array);
+++        tquot = _mm_round_ps (tquot, (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) );
+++        fmod_result = _mm_sub_ps (fmod_num, _mm_mul_ps (tquot, code_length_half_chips_array));
+++
+++        associated_chip_index_array = _mm_round_ps (fmod_result, (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC));
+++        associated_chip_index_array = _mm_add_ps(twos, associated_chip_index_array);
+++        associated_chip_index_array_int = _mm_cvtps_epi32 (associated_chip_index_array);
+++        _mm_storeu_si128 ((__m128i*)output, associated_chip_index_array_int);
+++
+++        //d_very_early_code[i] = d_ca_code[associated_chip_index];
+++        *d_very_early_code++ = d_ca_code[output[0]];
+++        *d_very_early_code++ = d_ca_code[output[1]];
+++        *d_very_early_code++ = d_ca_code[output[2]];
+++        *d_very_early_code++ = d_ca_code[output[3]];
+++
+++        //tcode_half_chips = tcode_half_chips + code_phase_step_half_chips;
+++        tcode_half_chips_array = _mm_add_ps (tcode_half_chips_array, code_phase_step_half_chips_array);
+++    }
+++
+++    if (num_points%4!=0)
+++    {
+++        __VOLK_ATTR_ALIGNED(16) float tcode_half_chips_stored[4];
+++        _mm_storeu_ps (tcode_half_chips_stored, tcode_half_chips_array); // float store: the source is a __m128, not a __m128i
+++
+++        int associated_chip_index;
+++        float tcode_half_chips = tcode_half_chips_stored[0]; // lane 0 holds the code phase of the first unprocessed point
+++        float d_very_early_late_spc_chips_multiplied_by_2 = 2*d_very_early_late_spc_chips;
+++
+++        for (unsigned int i = 0; i < num_points%4; i++)
+++        {
+++            associated_chip_index = 2 + round(fmod(tcode_half_chips - d_very_early_late_spc_chips_multiplied_by_2, code_length_half_chips));
+++            d_very_early_code[i] = d_ca_code[associated_chip_index];
+++            tcode_half_chips = tcode_half_chips + code_phase_step_half_chips;
+++        }
+++    }
+++}
+++#endif /* LV_HAVE_SSE4_1 */
+++
+++#ifdef LV_HAVE_GENERIC
+++  /*!
+++    \brief Fills d_very_early_code by copying, for every point, one chip of d_ca_code selected by a computed index.
+++    \param d_very_early_code Output vector that receives one chip of d_ca_code per point
+++    \param d_very_early_late_spc_chips,code_length_half_chips,code_phase_step_half_chips,tcode_half_chips_input Scalars of the chip index computation: index = 2 + round(fmod(tcode - 2*d_very_early_late_spc_chips, code_length_half_chips)), with tcode starting at tcode_half_chips_input and increasing by code_phase_step_half_chips per point
+++    \param d_ca_code Input code vector read at the computed index. \param num_points The number of points written to d_very_early_code
+++  */
+++static inline void volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_generic(lv_32fc_t* d_very_early_code, const float d_very_early_late_spc_chips, const float code_length_half_chips, const float code_phase_step_half_chips, const float tcode_half_chips_input, const lv_32fc_t* d_ca_code, unsigned int num_points){
+++    // Debug leftover kept disabled: these writes cast away const and replaced the caller's arguments with fixed test values.
+++    // float* pointer1 = (float*)&d_very_early_late_spc_chips;
+++    // *pointer1 = 1;
+++    // float* pointer2 = (float*)&code_length_half_chips;
+++    // *pointer2 = 6;
+++    // float* pointer3 = (float*)&code_phase_step_half_chips;
+++    // *pointer3 = 7;
+++    // float* pointer4 = (float*)&tcode_half_chips_input;
+++    // *pointer4 = 8;
+++
+++    int associated_chip_index;
+++    float tcode_half_chips = tcode_half_chips_input;
+++    float d_very_early_late_spc_chips_multiplied_by_2 = 2*d_very_early_late_spc_chips;
+++
+++    for (unsigned int i = 0; i < num_points; i++)
+++    {
+++        associated_chip_index = 2 + round(fmod(tcode_half_chips - d_very_early_late_spc_chips_multiplied_by_2, code_length_half_chips));
+++        d_very_early_code[i] = d_ca_code[associated_chip_index];
+++        tcode_half_chips = tcode_half_chips + code_phase_step_half_chips;
+++    }
+++}
+++#endif /* LV_HAVE_GENERIC */
+++
+++
+++#endif /* INCLUDED_volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_u_H */
+++#ifndef INCLUDED_volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_a_H
+++#define INCLUDED_volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_a_H
+++
+++#include
+++#include
+++#include
+++#include
+++
+++#ifdef LV_HAVE_SSE4_1
+++#include
+++  /*!
+++    \brief Fills d_very_early_code by copying, for every point, one chip of d_ca_code selected by a computed index.
+++    \param d_very_early_code Output vector that receives one chip of d_ca_code per point
+++    \param d_very_early_late_spc_chips,code_length_half_chips,code_phase_step_half_chips,tcode_half_chips_input Scalars of the chip index computation: index = 2 + round(fmod(tcode - 2*d_very_early_late_spc_chips, code_length_half_chips)), with tcode starting at tcode_half_chips_input and increasing by code_phase_step_half_chips per point
+++    \param d_ca_code Input code vector read at the computed index. \param num_points The number of points written to d_very_early_code
+++  */
+++static inline void volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_a_sse4_1(lv_32fc_t* d_very_early_code, const float d_very_early_late_spc_chips, const float code_length_half_chips, const float code_phase_step_half_chips, const float tcode_half_chips_input, const lv_32fc_t* d_ca_code, unsigned int num_points){
+++
+++    // float* pointer1 = (float*)&d_very_early_late_spc_chips;
+++    // *pointer1 = 1;
+++    // float* pointer2 = (float*)&code_length_half_chips;
+++    // *pointer2 = 6;
+++    // float* pointer3 = (float*)&code_phase_step_half_chips;
+++    // *pointer3 = 7;
+++    // float* pointer4 = (float*)&tcode_half_chips_input;
+++    // *pointer4 = 8;
+++
+++    const unsigned int sse_iters = num_points / 4; // four points are produced per SIMD iteration
+++
+++    __m128 tquot, fmod_num, fmod_result, associated_chip_index_array;
+++
+++    __m128 tcode_half_chips_array = _mm_set_ps (tcode_half_chips_input+3*code_phase_step_half_chips, tcode_half_chips_input+2*code_phase_step_half_chips, tcode_half_chips_input+code_phase_step_half_chips, tcode_half_chips_input);
+++    __m128 code_phase_step_half_chips_array = _mm_set1_ps (code_phase_step_half_chips*4);
+++    __m128 d_very_early_late_spc_chips_Multiplied_by_2 = _mm_set1_ps (2*d_very_early_late_spc_chips);
+++    __m128 code_length_half_chips_array = _mm_set1_ps (code_length_half_chips);
+++    __m128 twos = _mm_set1_ps (2);
+++    __m128i associated_chip_index_array_int;
+++
+++    __VOLK_ATTR_ALIGNED(16) int32_t output[4];
+++
+++    for (unsigned int i = 0; i < sse_iters; i++)
+++    {
+++        //fmod = numer - tquot * denom; tquot = numer/denom truncated
+++        //associated_chip_index = 2 + round(fmod(tcode_half_chips - 2*d_very_early_late_spc_chips, code_length_half_chips));
+++        fmod_num = _mm_sub_ps (tcode_half_chips_array, d_very_early_late_spc_chips_Multiplied_by_2);
+++        tquot = _mm_div_ps (fmod_num, code_length_half_chips_array);
+++        tquot = _mm_round_ps (tquot, (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) );
+++        fmod_result = _mm_sub_ps (fmod_num, _mm_mul_ps (tquot, code_length_half_chips_array));
+++
+++        associated_chip_index_array = _mm_round_ps (fmod_result, (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC));
+++        associated_chip_index_array = _mm_add_ps(twos, associated_chip_index_array);
+++        associated_chip_index_array_int = _mm_cvtps_epi32 (associated_chip_index_array);
+++        _mm_store_si128 ((__m128i*)output, associated_chip_index_array_int);
+++
+++        //d_very_early_code[i] = d_ca_code[associated_chip_index];
+++        *d_very_early_code++ = d_ca_code[output[0]];
+++        *d_very_early_code++ = d_ca_code[output[1]];
+++        *d_very_early_code++ = d_ca_code[output[2]];
+++        *d_very_early_code++ = d_ca_code[output[3]];
+++
+++        //tcode_half_chips = tcode_half_chips + code_phase_step_half_chips;
+++        tcode_half_chips_array = _mm_add_ps (tcode_half_chips_array, code_phase_step_half_chips_array);
+++    }
+++
+++    if (num_points%4!=0)
+++    {
+++        __VOLK_ATTR_ALIGNED(16) float tcode_half_chips_stored[4];
+++        _mm_store_ps (tcode_half_chips_stored, tcode_half_chips_array); // float store: the source is a __m128, not a __m128i
+++
+++        int associated_chip_index;
+++        float tcode_half_chips = tcode_half_chips_stored[0]; // lane 0 holds the code phase of the first unprocessed point
+++        float d_very_early_late_spc_chips_multiplied_by_2 = 2*d_very_early_late_spc_chips;
+++
+++        for (unsigned int i = 0; i < num_points%4; i++)
+++        {
+++            associated_chip_index = 2 + round(fmod(tcode_half_chips - d_very_early_late_spc_chips_multiplied_by_2, code_length_half_chips));
+++            d_very_early_code[i] = d_ca_code[associated_chip_index];
+++            tcode_half_chips = tcode_half_chips + code_phase_step_half_chips;
+++        }
+++    }
+++
+++}
+++#endif /* LV_HAVE_SSE4_1 */
+++
+++#ifdef LV_HAVE_GENERIC
+++  /*!
+++    \brief Fills d_very_early_code by copying, for every point, one chip of d_ca_code selected by a computed index.
+++    \param d_very_early_code Output vector that receives one chip of d_ca_code per point
+++    \param d_very_early_late_spc_chips,code_length_half_chips,code_phase_step_half_chips,tcode_half_chips_input Scalars of the chip index computation: index = 2 + round(fmod(tcode - 2*d_very_early_late_spc_chips, code_length_half_chips)), with tcode starting at tcode_half_chips_input and increasing by code_phase_step_half_chips per point
+++    \param d_ca_code Input code vector read at the computed index. \param num_points The number of points written to d_very_early_code
+++  */
+++static inline void volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_a_generic(lv_32fc_t* d_very_early_code, const float d_very_early_late_spc_chips, const float code_length_half_chips, const float code_phase_step_half_chips, const float tcode_half_chips_input, const lv_32fc_t* d_ca_code, unsigned int num_points){
+++
+++    // float* pointer1 = (float*)&d_very_early_late_spc_chips;
+++    // *pointer1 = 1;
+++    // float* pointer2 = (float*)&code_length_half_chips;
+++    // *pointer2 = 6;
+++    // float* pointer3 = (float*)&code_phase_step_half_chips;
+++    // *pointer3 = 7;
+++    // float* pointer4 = (float*)&tcode_half_chips_input;
+++    // *pointer4 = 8;
+++
+++    int associated_chip_index;
+++    float tcode_half_chips = tcode_half_chips_input;
+++    float d_very_early_late_spc_chips_multiplied_by_2 = 2*d_very_early_late_spc_chips;
+++
+++    for (unsigned int i = 0; i < num_points; i++)
+++    {
+++        associated_chip_index = 2 + round(fmod(tcode_half_chips - d_very_early_late_spc_chips_multiplied_by_2, code_length_half_chips));
+++        d_very_early_code[i] = d_ca_code[associated_chip_index];
+++        tcode_half_chips = tcode_half_chips + code_phase_step_half_chips;
+++    }
+++}
+++#endif /* LV_HAVE_GENERIC */
+++
+++#endif /* INCLUDED_volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_a_H */
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32fc_multiply_32fc.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32fc_multiply_32fc.h
++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32fc_multiply_32fc.h 1970-01-01 01:00:00.000000000 +0100
+++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32fc_multiply_32fc.h 2014-10-15 01:55:08.000000000 +0200
++@@ -0,0
+1,178 @@
+++#ifndef INCLUDED_volk_gnsssdr_32fc_s32fc_multiply_32fc_u_H
+++#define INCLUDED_volk_gnsssdr_32fc_s32fc_multiply_32fc_u_H
+++
+++#include
+++#include
+++#include
+++#include
+++
+++#ifdef LV_HAVE_SSE3
+++#include
+++/*!
+++ \brief Multiplies the input vector by a complex scalar and stores the results in the output vector
+++ \param cVector The vector where the results will be stored
+++ \param aVector The vector to be multiplied
+++ \param scalar The complex scalar to multiply aVector
+++ \param num_points The number of complex values in aVector to be multiplied by scalar and stored into cVector
+++*/
+++static inline void volk_gnsssdr_32fc_s32fc_multiply_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
+++    unsigned int number = 0;
+++    const unsigned int halfPoints = num_points / 2;
+++
+++    __m128 x, yl, yh, z, tmp1, tmp2;
+++    lv_32fc_t* c = cVector;
+++    const lv_32fc_t* a = aVector;
+++
+++    // Set up constant scalar vector (sr = real part of scalar, si = imaginary part)
+++    yl = _mm_set_ps1(lv_creal(scalar));
+++    yh = _mm_set_ps1(lv_cimag(scalar));
+++
+++    for(;number < halfPoints; number++){
+++
+++        x = _mm_loadu_ps((float*)a); // Load two complex values a and b as ar,ai,br,bi
+++
+++        tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*sr,ai*sr,br*sr,bi*sr
+++
+++        x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
+++
+++        tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*si,ar*si,bi*si,br*si
+++
+++        z = _mm_addsub_ps(tmp1,tmp2); // ar*sr-ai*si, ai*sr+ar*si, br*sr-bi*si, bi*sr+br*si
+++
+++        _mm_storeu_ps((float*)c,z); // Store the results back into the C container
+++
+++        a += 2;
+++        c += 2;
+++    }
+++
+++    if((num_points % 2) != 0) { // odd count: one value is left for the scalar path
+++        *c = (*a) * scalar;
+++    }
+++}
+++#endif /* LV_HAVE_SSE3 */
+++
+++#ifdef LV_HAVE_GENERIC
+++/*!
+++ \brief Multiplies the input vector by a complex scalar and stores the results in the output vector
+++ \param cVector The vector where the results will be stored
+++ \param aVector The vector to be multiplied
+++ \param scalar The complex scalar to multiply aVector
+++ \param num_points The number of complex values in aVector to be multiplied by scalar and stored into cVector
+++*/
+++static inline void volk_gnsssdr_32fc_s32fc_multiply_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
+++    lv_32fc_t* cPtr = cVector;
+++    const lv_32fc_t* aPtr = aVector;
+++    unsigned int number = num_points;
+++
+++    // unrolled loop: eight products per pass
+++    while (number >= 8){
+++        *cPtr++ = (*aPtr++) * scalar;
+++        *cPtr++ = (*aPtr++) * scalar;
+++        *cPtr++ = (*aPtr++) * scalar;
+++        *cPtr++ = (*aPtr++) * scalar;
+++        *cPtr++ = (*aPtr++) * scalar;
+++        *cPtr++ = (*aPtr++) * scalar;
+++        *cPtr++ = (*aPtr++) * scalar;
+++        *cPtr++ = (*aPtr++) * scalar;
+++        number -= 8;
+++    }
+++
+++    // clean up any remaining (fewer than eight) values
+++    while (number-- > 0)
+++        *cPtr++ = *aPtr++ * scalar;
+++}
+++#endif /* LV_HAVE_GENERIC */
+++
+++
+++#endif /* INCLUDED_volk_gnsssdr_32fc_s32fc_multiply_32fc_u_H */
+++#ifndef INCLUDED_volk_gnsssdr_32fc_s32fc_multiply_32fc_a_H
+++#define INCLUDED_volk_gnsssdr_32fc_s32fc_multiply_32fc_a_H
+++
+++#include
+++#include
+++#include
+++#include
+++
+++#ifdef LV_HAVE_SSE3
+++#include
+++  /*!
+++    \brief Multiplies the input vector by a complex scalar and stores the results in the output vector
+++    \param cVector The vector where the results will be stored
+++    \param aVector The vector to be multiplied
+++    \param scalar The complex scalar to multiply aVector
+++    \param num_points The number of complex values in aVector to be multiplied by scalar and stored into cVector
+++  */
+++static inline void volk_gnsssdr_32fc_s32fc_multiply_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
+++    unsigned int number = 0;
+++    const unsigned int halfPoints = num_points / 2;
+++
+++    __m128 x, yl, yh, z, tmp1, tmp2;
+++    lv_32fc_t* c = cVector;
+++    const lv_32fc_t* a = aVector;
+++
+++    // Set up constant scalar vector (sr = real part of scalar, si = imaginary part)
+++    yl = _mm_set_ps1(lv_creal(scalar));
+++    yh = _mm_set_ps1(lv_cimag(scalar));
+++
+++    for(;number < halfPoints; number++){
+++
+++        x = _mm_load_ps((float*)a); // Load two complex values a and b as ar,ai,br,bi
+++
+++        tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*sr,ai*sr,br*sr,bi*sr
+++
+++        x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
+++
+++        tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*si,ar*si,bi*si,br*si
+++
+++        z = _mm_addsub_ps(tmp1,tmp2); // ar*sr-ai*si, ai*sr+ar*si, br*sr-bi*si, bi*sr+br*si
+++
+++        _mm_store_ps((float*)c,z); // Store the results back into the C container
+++
+++        a += 2;
+++        c += 2;
+++    }
+++
+++    if((num_points % 2) != 0) { // odd count: one value is left for the scalar path
+++        *c = (*a) * scalar;
+++    }
+++}
+++#endif /* LV_HAVE_SSE3 */
+++
+++
+++#ifdef LV_HAVE_GENERIC
+++  /*!
+++    \brief Multiplies the input vector by a complex scalar and stores the results in the output vector
+++    \param cVector The vector where the results will be stored
+++    \param aVector The vector to be multiplied
+++    \param scalar The complex scalar to multiply aVector
+++    \param num_points The number of complex values in aVector to be multiplied by scalar and stored into cVector
+++  */
+++static inline void volk_gnsssdr_32fc_s32fc_multiply_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
+++    lv_32fc_t* cPtr = cVector;
+++    const lv_32fc_t* aPtr = aVector;
+++    unsigned int number = num_points;
+++
+++    // unrolled loop: eight products per pass
+++    while (number >= 8){
+++        *cPtr++ = (*aPtr++) * scalar;
+++        *cPtr++ = (*aPtr++) * scalar;
+++        *cPtr++ = (*aPtr++) * scalar;
+++        *cPtr++ = (*aPtr++) * scalar;
+++        *cPtr++ = (*aPtr++) * scalar;
+++        *cPtr++ = (*aPtr++) * scalar;
+++        *cPtr++ = (*aPtr++) * scalar;
+++        *cPtr++ = (*aPtr++) * scalar;
+++        number -= 8;
+++    }
+++
+++    // clean up any remaining (fewer than eight) values
+++    while (number-- > 0)
+++        *cPtr++ = *aPtr++ * scalar;
+++}
+++#endif /* LV_HAVE_GENERIC */
+++
+++
+++
+++
+++
+++#endif /* INCLUDED_volk_gnsssdr_32fc_s32fc_multiply_32fc_a_H */
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_dot_prod_32fc.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_dot_prod_32fc.h
++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_dot_prod_32fc.h 1970-01-01 01:00:00.000000000 +0100
+++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_dot_prod_32fc.h 2014-10-15 01:55:08.000000000 +0200
++@@ -0,0 +1,763 @@
+++#ifndef INCLUDED_volk_gnsssdr_32fc_x2_dot_prod_32fc_u_H
+++#define INCLUDED_volk_gnsssdr_32fc_x2_dot_prod_32fc_u_H
+++
+++#include
+++#include
+++#include
+++#include
+++
+++
+++#ifdef LV_HAVE_GENERIC
+++
+++
+++static inline void
volk_gnsssdr_32fc_x2_dot_prod_32fc_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { +++ +++ float * res = (float*) result; +++ float * in = (float*) input; +++ float * tp = (float*) taps; +++ unsigned int n_2_ccomplex_blocks = num_points/2; +++ unsigned int isodd = num_points & 1; +++ +++ float sum0[2] = {0,0}; +++ float sum1[2] = {0,0}; +++ unsigned int i = 0; +++ +++ for(i = 0; i < n_2_ccomplex_blocks; ++i) { +++ sum0[0] += in[0] * tp[0] - in[1] * tp[1]; +++ sum0[1] += in[0] * tp[1] + in[1] * tp[0]; +++ sum1[0] += in[2] * tp[2] - in[3] * tp[3]; +++ sum1[1] += in[2] * tp[3] + in[3] * tp[2]; +++ +++ in += 4; +++ tp += 4; +++ } +++ +++ res[0] = sum0[0] + sum1[0]; +++ res[1] = sum0[1] + sum1[1]; +++ +++ // Cleanup if we had an odd number of points +++ for(i = 0; i < isodd; ++i) { +++ *result += input[num_points - 1] * taps[num_points - 1]; +++ } +++} +++ +++#endif /*LV_HAVE_GENERIC*/ +++ +++ +++ +++#if LV_HAVE_SSE && LV_HAVE_64 +++ +++static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_u_sse_64(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { +++ +++ const unsigned int num_bytes = num_points*8; +++ unsigned int isodd = num_points & 1; +++ +++ asm +++ ( +++ "# ccomplex_dotprod_generic (float* result, const float *input,\n\t" +++ "# const float *taps, unsigned num_bytes)\n\t" +++ "# float sum0 = 0;\n\t" +++ "# float sum1 = 0;\n\t" +++ "# float sum2 = 0;\n\t" +++ "# float sum3 = 0;\n\t" +++ "# do {\n\t" +++ "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t" +++ "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t" +++ "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t" +++ "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t" +++ "# input += 4;\n\t" +++ "# taps += 4; \n\t" +++ "# } while (--n_2_ccomplex_blocks != 0);\n\t" +++ "# result[0] = sum0 + sum2;\n\t" +++ "# result[1] = sum1 + sum3;\n\t" +++ "# TODO: prefetch and better scheduling\n\t" +++ " xor 
%%r9, %%r9\n\t" +++ " xor %%r10, %%r10\n\t" +++ " movq %%rcx, %%rax\n\t" +++ " movq %%rcx, %%r8\n\t" +++ " movq %[rsi], %%r9\n\t" +++ " movq %[rdx], %%r10\n\t" +++ " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" +++ " movups 0(%%r9), %%xmm0\n\t" +++ " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" +++ " movups 0(%%r10), %%xmm2\n\t" +++ " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t" +++ " shr $4, %%r8\n\t" +++ " jmp .%=L1_test\n\t" +++ " # 4 taps / loop\n\t" +++ " # something like ?? cycles / loop\n\t" +++ ".%=Loop1: \n\t" +++ "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" +++ "# movups (%%r9), %%xmmA\n\t" +++ "# movups (%%r10), %%xmmB\n\t" +++ "# movups %%xmmA, %%xmmZ\n\t" +++ "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" +++ "# mulps %%xmmB, %%xmmA\n\t" +++ "# mulps %%xmmZ, %%xmmB\n\t" +++ "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" +++ "# xorps %%xmmPN, %%xmmA\n\t" +++ "# movups %%xmmA, %%xmmZ\n\t" +++ "# unpcklps %%xmmB, %%xmmA\n\t" +++ "# unpckhps %%xmmB, %%xmmZ\n\t" +++ "# movups %%xmmZ, %%xmmY\n\t" +++ "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" +++ "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" +++ "# addps %%xmmZ, %%xmmA\n\t" +++ "# addps %%xmmA, %%xmmC\n\t" +++ "# A=xmm0, B=xmm2, Z=xmm4\n\t" +++ "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" +++ " movups 16(%%r9), %%xmm1\n\t" +++ " movups %%xmm0, %%xmm4\n\t" +++ " mulps %%xmm2, %%xmm0\n\t" +++ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" +++ " movups 16(%%r10), %%xmm3\n\t" +++ " movups %%xmm1, %%xmm5\n\t" +++ " addps %%xmm0, %%xmm6\n\t" +++ " mulps %%xmm3, %%xmm1\n\t" +++ " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" +++ " addps %%xmm1, %%xmm6\n\t" +++ " mulps %%xmm4, %%xmm2\n\t" +++ " movups 32(%%r9), %%xmm0\n\t" +++ " addps %%xmm2, %%xmm7\n\t" +++ " mulps %%xmm5, %%xmm3\n\t" +++ " add $32, %%r9\n\t" +++ " movups 32(%%r10), %%xmm2\n\t" +++ " addps %%xmm3, %%xmm7\n\t" +++ " add $32, %%r10\n\t" +++ ".%=L1_test:\n\t" +++ " dec %%rax\n\t" +++ 
" jge .%=Loop1\n\t" +++ " # We've handled the bulk of multiplies up to here.\n\t" +++ " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" +++ " # If so, we've got 2 more taps to do.\n\t" +++ " and $1, %%r8\n\t" +++ " je .%=Leven\n\t" +++ " # The count was odd, do 2 more taps.\n\t" +++ " # Note that we've already got mm0/mm2 preloaded\n\t" +++ " # from the main loop.\n\t" +++ " movups %%xmm0, %%xmm4\n\t" +++ " mulps %%xmm2, %%xmm0\n\t" +++ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" +++ " addps %%xmm0, %%xmm6\n\t" +++ " mulps %%xmm4, %%xmm2\n\t" +++ " addps %%xmm2, %%xmm7\n\t" +++ ".%=Leven:\n\t" +++ " # neg inversor\n\t" +++ " xorps %%xmm1, %%xmm1\n\t" +++ " mov $0x80000000, %%r9\n\t" +++ " movd %%r9, %%xmm1\n\t" +++ " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" +++ " # pfpnacc\n\t" +++ " xorps %%xmm1, %%xmm6\n\t" +++ " movups %%xmm6, %%xmm2\n\t" +++ " unpcklps %%xmm7, %%xmm6\n\t" +++ " unpckhps %%xmm7, %%xmm2\n\t" +++ " movups %%xmm2, %%xmm3\n\t" +++ " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" +++ " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" +++ " addps %%xmm2, %%xmm6\n\t" +++ " # xmm6 = r1 i2 r3 i4\n\t" +++ " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" +++ " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? 
??\n\t" +++ " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) to memory\n\t" +++ : +++ :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result) +++ :"rax", "r8", "r9", "r10", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "memory", "cc" /* xmm0-xmm7 are scratch, flags are modified, and the result is stored through %[rdi] */ +++ ); +++ +++ +++ if(isodd) { +++ *result += input[num_points - 1] * taps[num_points - 1]; +++ } +++ +++ return; +++ +++} +++ +++#endif /* LV_HAVE_SSE && LV_HAVE_64 */ +++ +++ +++ +++ +++#ifdef LV_HAVE_SSE3 +++ +++#include +++ +++static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_u_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { +++ +++ lv_32fc_t dotProduct; +++ memset(&dotProduct, 0x0, 2*sizeof(float)); +++ +++ unsigned int number = 0; +++ const unsigned int halfPoints = num_points/2; +++ unsigned int isodd = num_points & 1; +++ +++ __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal; +++ +++ const lv_32fc_t* a = input; +++ const lv_32fc_t* b = taps; +++ +++ dotProdVal = _mm_setzero_ps(); +++ +++ for(;number < halfPoints; number++){ +++ +++ x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi +++ y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di +++ +++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr +++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di +++ +++ tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr +++ +++ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br +++ +++ tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di +++ +++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +++ +++ dotProdVal = _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together +++ +++ a += 2; +++ b += 2; +++ } +++ +++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2]; +++ +++ _mm_storeu_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector +++ +++ dotProduct += ( dotProductVector[0] + dotProductVector[1] ); +++ +++ if(isodd) { +++ dotProduct += input[num_points 
- 1] * taps[num_points - 1]; +++ } +++ +++ *result = dotProduct; +++} +++ +++#endif /*LV_HAVE_SSE3*/ +++ +++#ifdef LV_HAVE_SSE4_1 +++ +++#include +++ +++static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_u_sse4_1(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { +++ +++ unsigned int i = 0; +++ const unsigned int qtr_points = num_points/4; +++ const unsigned int isodd = num_points & 3; +++ +++ __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1; +++ float *p_input, *p_taps; +++ __m64 *p_result; +++ +++ p_result = (__m64*)result; +++ p_input = (float*)input; +++ p_taps = (float*)taps; +++ +++ static const __m128i neg = {0x000000000000000080000000}; +++ +++ real0 = _mm_setzero_ps(); +++ real1 = _mm_setzero_ps(); +++ im0 = _mm_setzero_ps(); +++ im1 = _mm_setzero_ps(); +++ +++ for(; i < qtr_points; ++i) { +++ xmm0 = _mm_loadu_ps(p_input); +++ xmm1 = _mm_loadu_ps(p_taps); +++ +++ p_input += 4; +++ p_taps += 4; +++ +++ xmm2 = _mm_loadu_ps(p_input); +++ xmm3 = _mm_loadu_ps(p_taps); +++ +++ p_input += 4; +++ p_taps += 4; +++ +++ xmm4 = _mm_unpackhi_ps(xmm0, xmm2); +++ xmm5 = _mm_unpackhi_ps(xmm1, xmm3); +++ xmm0 = _mm_unpacklo_ps(xmm0, xmm2); +++ xmm2 = _mm_unpacklo_ps(xmm1, xmm3); +++ +++ //imaginary vector from input +++ xmm1 = _mm_unpackhi_ps(xmm0, xmm4); +++ //real vector from input +++ xmm3 = _mm_unpacklo_ps(xmm0, xmm4); +++ //imaginary vector from taps +++ xmm0 = _mm_unpackhi_ps(xmm2, xmm5); +++ //real vector from taps +++ xmm2 = _mm_unpacklo_ps(xmm2, xmm5); +++ +++ xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1); +++ xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1); +++ +++ xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2); +++ xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2); +++ +++ real0 = _mm_add_ps(xmm4, real0); +++ real1 = _mm_add_ps(xmm5, real1); +++ im0 = _mm_add_ps(xmm6, im0); +++ im1 = _mm_add_ps(xmm7, im1); +++ } +++ +++ real1 = _mm_xor_ps(real1, bit128_p(&neg)->float_vec); +++ +++ im0 = _mm_add_ps(im0, im1); +++ real0 = _mm_add_ps(real0, 
real1); +++ +++ im0 = _mm_add_ps(im0, real0); +++ +++ _mm_storel_pi(p_result, im0); +++ +++ for(i = num_points-isodd; i < num_points; i++) { +++ *result += input[i] * taps[i]; +++ } +++} +++ +++#endif /*LV_HAVE_SSE4_1*/ +++ +++ +++ +++ +++#endif /*INCLUDED_volk_gnsssdr_32fc_x2_dot_prod_32fc_u_H*/ +++#ifndef INCLUDED_volk_gnsssdr_32fc_x2_dot_prod_32fc_a_H +++#define INCLUDED_volk_gnsssdr_32fc_x2_dot_prod_32fc_a_H +++ +++#include +++#include +++#include +++#include +++ +++ +++#ifdef LV_HAVE_GENERIC +++ +++ +++static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_a_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { +++ +++ const unsigned int num_bytes = num_points*8; +++ +++ float * res = (float*) result; +++ float * in = (float*) input; +++ float * tp = (float*) taps; +++ unsigned int n_2_ccomplex_blocks = num_bytes >> 4; +++ unsigned int isodd = num_points & 1; +++ +++ float sum0[2] = {0,0}; +++ float sum1[2] = {0,0}; +++ unsigned int i = 0; +++ +++ for(i = 0; i < n_2_ccomplex_blocks; ++i) { +++ sum0[0] += in[0] * tp[0] - in[1] * tp[1]; +++ sum0[1] += in[0] * tp[1] + in[1] * tp[0]; +++ sum1[0] += in[2] * tp[2] - in[3] * tp[3]; +++ sum1[1] += in[2] * tp[3] + in[3] * tp[2]; +++ +++ in += 4; +++ tp += 4; +++ } +++ +++ res[0] = sum0[0] + sum1[0]; +++ res[1] = sum0[1] + sum1[1]; +++ +++ for(i = 0; i < isodd; ++i) { +++ *result += input[num_points - 1] * taps[num_points - 1]; +++ } +++} +++ +++#endif /*LV_HAVE_GENERIC*/ +++ +++ +++#if LV_HAVE_SSE && LV_HAVE_64 +++ +++ +++static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_a_sse_64(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { +++ +++ const unsigned int num_bytes = num_points*8; +++ unsigned int isodd = num_points & 1; +++ +++ asm +++ ( +++ "# ccomplex_dotprod_generic (float* result, const float *input,\n\t" +++ "# const float *taps, unsigned num_bytes)\n\t" +++ "# float sum0 = 0;\n\t" +++ "# float sum1 = 0;\n\t" +++ "# float 
sum2 = 0;\n\t" +++ "# float sum3 = 0;\n\t" +++ "# do {\n\t" +++ "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t" +++ "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t" +++ "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t" +++ "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t" +++ "# input += 4;\n\t" +++ "# taps += 4; \n\t" +++ "# } while (--n_2_ccomplex_blocks != 0);\n\t" +++ "# result[0] = sum0 + sum2;\n\t" +++ "# result[1] = sum1 + sum3;\n\t" +++ "# TODO: prefetch and better scheduling\n\t" +++ " xor %%r9, %%r9\n\t" +++ " xor %%r10, %%r10\n\t" +++ " movq %%rcx, %%rax\n\t" +++ " movq %%rcx, %%r8\n\t" +++ " movq %[rsi], %%r9\n\t" +++ " movq %[rdx], %%r10\n\t" +++ " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" +++ " movaps 0(%%r9), %%xmm0\n\t" +++ " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" +++ " movaps 0(%%r10), %%xmm2\n\t" +++ " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t" +++ " shr $4, %%r8\n\t" +++ " jmp .%=L1_test\n\t" +++ " # 4 taps / loop\n\t" +++ " # something like ?? 
cycles / loop\n\t" +++ ".%=Loop1: \n\t" +++ "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" +++ "# movaps (%%r9), %%xmmA\n\t" +++ "# movaps (%%r10), %%xmmB\n\t" +++ "# movaps %%xmmA, %%xmmZ\n\t" +++ "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" +++ "# mulps %%xmmB, %%xmmA\n\t" +++ "# mulps %%xmmZ, %%xmmB\n\t" +++ "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" +++ "# xorps %%xmmPN, %%xmmA\n\t" +++ "# movaps %%xmmA, %%xmmZ\n\t" +++ "# unpcklps %%xmmB, %%xmmA\n\t" +++ "# unpckhps %%xmmB, %%xmmZ\n\t" +++ "# movaps %%xmmZ, %%xmmY\n\t" +++ "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" +++ "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" +++ "# addps %%xmmZ, %%xmmA\n\t" +++ "# addps %%xmmA, %%xmmC\n\t" +++ "# A=xmm0, B=xmm2, Z=xmm4\n\t" +++ "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" +++ " movaps 16(%%r9), %%xmm1\n\t" +++ " movaps %%xmm0, %%xmm4\n\t" +++ " mulps %%xmm2, %%xmm0\n\t" +++ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" +++ " movaps 16(%%r10), %%xmm3\n\t" +++ " movaps %%xmm1, %%xmm5\n\t" +++ " addps %%xmm0, %%xmm6\n\t" +++ " mulps %%xmm3, %%xmm1\n\t" +++ " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" +++ " addps %%xmm1, %%xmm6\n\t" +++ " mulps %%xmm4, %%xmm2\n\t" +++ " movaps 32(%%r9), %%xmm0\n\t" +++ " addps %%xmm2, %%xmm7\n\t" +++ " mulps %%xmm5, %%xmm3\n\t" +++ " add $32, %%r9\n\t" +++ " movaps 32(%%r10), %%xmm2\n\t" +++ " addps %%xmm3, %%xmm7\n\t" +++ " add $32, %%r10\n\t" +++ ".%=L1_test:\n\t" +++ " dec %%rax\n\t" +++ " jge .%=Loop1\n\t" +++ " # We've handled the bulk of multiplies up to here.\n\t" +++ " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" +++ " # If so, we've got 2 more taps to do.\n\t" +++ " and $1, %%r8\n\t" +++ " je .%=Leven\n\t" +++ " # The count was odd, do 2 more taps.\n\t" +++ " # Note that we've already got mm0/mm2 preloaded\n\t" +++ " # from the main loop.\n\t" +++ " movaps %%xmm0, %%xmm4\n\t" +++ " mulps %%xmm2, %%xmm0\n\t" +++ " shufps $0xb1, %%xmm4, %%xmm4 # swap 
internals\n\t" +++ " addps %%xmm0, %%xmm6\n\t" +++ " mulps %%xmm4, %%xmm2\n\t" +++ " addps %%xmm2, %%xmm7\n\t" +++ ".%=Leven:\n\t" +++ " # neg inversor\n\t" +++ " xorps %%xmm1, %%xmm1\n\t" +++ " mov $0x80000000, %%r9\n\t" +++ " movd %%r9, %%xmm1\n\t" +++ " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" +++ " # pfpnacc\n\t" +++ " xorps %%xmm1, %%xmm6\n\t" +++ " movaps %%xmm6, %%xmm2\n\t" +++ " unpcklps %%xmm7, %%xmm6\n\t" +++ " unpckhps %%xmm7, %%xmm2\n\t" +++ " movaps %%xmm2, %%xmm3\n\t" +++ " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" +++ " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" +++ " addps %%xmm2, %%xmm6\n\t" +++ " # xmm6 = r1 i2 r3 i4\n\t" +++ " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" +++ " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t" +++ " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) to memory\n\t" +++ : +++ :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result) +++ :"rax", "r8", "r9", "r10", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "memory", "cc" /* xmm0-xmm7 are scratch, flags are modified, and the result is stored through %[rdi] */ +++ ); +++ +++ +++ if(isodd) { +++ *result += input[num_points - 1] * taps[num_points - 1]; +++ } +++ +++ return; +++ +++} +++ +++#endif /* LV_HAVE_SSE && LV_HAVE_64 */ +++ +++#if LV_HAVE_SSE && LV_HAVE_32 +++ +++static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_a_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { +++ +++ volk_gnsssdr_32fc_x2_dot_prod_32fc_a_generic(result, input, taps, num_points); +++ +++#if 0 +++ const unsigned int num_bytes = num_points*8; +++ unsigned int isodd = num_points & 1; +++ +++ asm volatile +++ ( +++ " #pushl %%ebp\n\t" +++ " #movl %%esp, %%ebp\n\t" +++ " movl 12(%%ebp), %%eax # input\n\t" +++ " movl 16(%%ebp), %%edx # taps\n\t" +++ " movl 20(%%ebp), %%ecx # n_bytes\n\t" +++ " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" +++ " movaps 0(%%eax), %%xmm0\n\t" +++ " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" +++ " movaps 0(%%edx), %%xmm2\n\t" +++ " shrl $5, %%ecx # ecx = n_2_ccomplex_blocks / 2\n\t" +++ " jmp .%=L1_test\n\t" +++ " # 4 taps / loop\n\t" +++ " 
# something like ?? cycles / loop\n\t" +++ ".%=Loop1: \n\t" +++ "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" +++ "# movaps (%%eax), %%xmmA\n\t" +++ "# movaps (%%edx), %%xmmB\n\t" +++ "# movaps %%xmmA, %%xmmZ\n\t" +++ "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" +++ "# mulps %%xmmB, %%xmmA\n\t" +++ "# mulps %%xmmZ, %%xmmB\n\t" +++ "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" +++ "# xorps %%xmmPN, %%xmmA\n\t" +++ "# movaps %%xmmA, %%xmmZ\n\t" +++ "# unpcklps %%xmmB, %%xmmA\n\t" +++ "# unpckhps %%xmmB, %%xmmZ\n\t" +++ "# movaps %%xmmZ, %%xmmY\n\t" +++ "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" +++ "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" +++ "# addps %%xmmZ, %%xmmA\n\t" +++ "# addps %%xmmA, %%xmmC\n\t" +++ "# A=xmm0, B=xmm2, Z=xmm4\n\t" +++ "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" +++ " movaps 16(%%eax), %%xmm1\n\t" +++ " movaps %%xmm0, %%xmm4\n\t" +++ " mulps %%xmm2, %%xmm0\n\t" +++ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" +++ " movaps 16(%%edx), %%xmm3\n\t" +++ " movaps %%xmm1, %%xmm5\n\t" +++ " addps %%xmm0, %%xmm6\n\t" +++ " mulps %%xmm3, %%xmm1\n\t" +++ " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" +++ " addps %%xmm1, %%xmm6\n\t" +++ " mulps %%xmm4, %%xmm2\n\t" +++ " movaps 32(%%eax), %%xmm0\n\t" +++ " addps %%xmm2, %%xmm7\n\t" +++ " mulps %%xmm5, %%xmm3\n\t" +++ " addl $32, %%eax\n\t" +++ " movaps 32(%%edx), %%xmm2\n\t" +++ " addps %%xmm3, %%xmm7\n\t" +++ " addl $32, %%edx\n\t" +++ ".%=L1_test:\n\t" +++ " decl %%ecx\n\t" +++ " jge .%=Loop1\n\t" +++ " # We've handled the bulk of multiplies up to here.\n\t" +++ " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" +++ " # If so, we've got 2 more taps to do.\n\t" +++ " movl 20(%%ebp), %%ecx # n_2_ccomplex_blocks\n\t" +++ " shrl $4, %%ecx\n\t" +++ " andl $1, %%ecx\n\t" +++ " je .%=Leven\n\t" +++ " # The count was odd, do 2 more taps.\n\t" +++ " # Note that we've already got mm0/mm2 preloaded\n\t" +++ " # from the main loop.\n\t" 
+++ " movaps %%xmm0, %%xmm4\n\t" +++ " mulps %%xmm2, %%xmm0\n\t" +++ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" +++ " addps %%xmm0, %%xmm6\n\t" +++ " mulps %%xmm4, %%xmm2\n\t" +++ " addps %%xmm2, %%xmm7\n\t" +++ ".%=Leven:\n\t" +++ " # neg inversor\n\t" +++ " movl 8(%%ebp), %%eax \n\t" +++ " xorps %%xmm1, %%xmm1\n\t" +++ " movl $0x80000000, (%%eax)\n\t" +++ " movss (%%eax), %%xmm1\n\t" +++ " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" +++ " # pfpnacc\n\t" +++ " xorps %%xmm1, %%xmm6\n\t" +++ " movaps %%xmm6, %%xmm2\n\t" +++ " unpcklps %%xmm7, %%xmm6\n\t" +++ " unpckhps %%xmm7, %%xmm2\n\t" +++ " movaps %%xmm2, %%xmm3\n\t" +++ " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" +++ " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" +++ " addps %%xmm2, %%xmm6\n\t" +++ " # xmm6 = r1 i2 r3 i4\n\t" +++ " #movl 8(%%ebp), %%eax # @result\n\t" +++ " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" +++ " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t" +++ " movlps %%xmm6, (%%eax) # store low 2x32 bits (complex) to memory\n\t" +++ " #popl %%ebp\n\t" +++ : +++ : +++ : "eax", "ecx", "edx" +++ ); +++ +++ +++ int getem = num_bytes % 16; +++ +++ if(isodd) { +++ *result += (input[num_points - 1] * taps[num_points - 1]); +++ } +++ +++ return; +++#endif +++} +++ +++#endif /*LV_HAVE_SSE*/ +++ +++#ifdef LV_HAVE_SSE3 +++ +++#include +++ +++static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_a_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { +++ +++ const unsigned int num_bytes = num_points*8; +++ unsigned int isodd = num_points & 1; +++ +++ lv_32fc_t dotProduct; +++ memset(&dotProduct, 0x0, 2*sizeof(float)); +++ +++ unsigned int number = 0; +++ const unsigned int halfPoints = num_bytes >> 4; +++ +++ __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal; +++ +++ const lv_32fc_t* a = input; +++ const lv_32fc_t* b = taps; +++ +++ dotProdVal = _mm_setzero_ps(); +++ +++ for(;number < halfPoints; number++){ +++ +++ x = 
_mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi +++ y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di +++ +++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr +++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di +++ +++ tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr +++ +++ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br +++ +++ tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di +++ +++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +++ +++ dotProdVal = _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together +++ +++ a += 2; +++ b += 2; +++ } +++ +++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2]; +++ +++ _mm_store_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector +++ +++ dotProduct += ( dotProductVector[0] + dotProductVector[1] ); +++ +++ if(isodd) { +++ dotProduct += input[num_points - 1] * taps[num_points - 1]; +++ } +++ +++ *result = dotProduct; +++} +++ +++#endif /*LV_HAVE_SSE3*/ +++ +++#ifdef LV_HAVE_SSE4_1 +++ +++#include +++ +++static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { +++ +++ unsigned int i = 0; +++ const unsigned int qtr_points = num_points/4; +++ const unsigned int isodd = num_points & 3; +++ +++ __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1; +++ float *p_input, *p_taps; +++ __m64 *p_result; +++ +++ static const __m128i neg = {0x000000000000000080000000}; +++ +++ p_result = (__m64*)result; +++ p_input = (float*)input; +++ p_taps = (float*)taps; +++ +++ real0 = _mm_setzero_ps(); +++ real1 = _mm_setzero_ps(); +++ im0 = _mm_setzero_ps(); +++ im1 = _mm_setzero_ps(); +++ +++ for(; i < qtr_points; ++i) { +++ xmm0 = _mm_load_ps(p_input); +++ xmm1 = _mm_load_ps(p_taps); +++ +++ p_input += 4; +++ p_taps += 4; +++ +++ xmm2 = 
_mm_load_ps(p_input); +++ xmm3 = _mm_load_ps(p_taps); +++ +++ p_input += 4; +++ p_taps += 4; +++ +++ xmm4 = _mm_unpackhi_ps(xmm0, xmm2); +++ xmm5 = _mm_unpackhi_ps(xmm1, xmm3); +++ xmm0 = _mm_unpacklo_ps(xmm0, xmm2); +++ xmm2 = _mm_unpacklo_ps(xmm1, xmm3); +++ +++ //imaginary vector from input +++ xmm1 = _mm_unpackhi_ps(xmm0, xmm4); +++ //real vector from input +++ xmm3 = _mm_unpacklo_ps(xmm0, xmm4); +++ //imaginary vector from taps +++ xmm0 = _mm_unpackhi_ps(xmm2, xmm5); +++ //real vector from taps +++ xmm2 = _mm_unpacklo_ps(xmm2, xmm5); +++ +++ xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1); +++ xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1); +++ +++ xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2); +++ xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2); +++ +++ real0 = _mm_add_ps(xmm4, real0); +++ real1 = _mm_add_ps(xmm5, real1); +++ im0 = _mm_add_ps(xmm6, im0); +++ im1 = _mm_add_ps(xmm7, im1); +++ } +++ +++ real1 = _mm_xor_ps(real1, bit128_p(&neg)->float_vec); +++ +++ im0 = _mm_add_ps(im0, im1); +++ real0 = _mm_add_ps(real0, real1); +++ +++ im0 = _mm_add_ps(im0, real0); +++ +++ _mm_storel_pi(p_result, im0); +++ +++ for(i = num_points-isodd; i < num_points; i++) { +++ *result += input[i] * taps[i]; +++ } +++} +++ +++#endif /*LV_HAVE_SSE4_1*/ +++ +++#endif /*INCLUDED_volk_gnsssdr_32fc_x2_dot_prod_32fc_a_H*/ ++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_multiply_32fc.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_multiply_32fc.h ++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_multiply_32fc.h 1970-01-01 01:00:00.000000000 +0100 +++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_multiply_32fc.h 2014-10-15 01:55:08.000000000 +0200 ++@@ -0,0 +1,170 @@ +++#ifndef INCLUDED_volk_gnsssdr_32fc_x2_multiply_32fc_u_H +++#define INCLUDED_volk_gnsssdr_32fc_x2_multiply_32fc_u_H +++ +++#include +++#include +++#include +++#include +++ +++#ifdef LV_HAVE_SSE3 +++#include +++ 
/*! +++ \brief Multiplies the two input complex vectors element by element and stores the products in the third vector (SSE3, unaligned loads/stores) +++ \param cVector The vector where the results will be stored +++ \param aVector One of the vectors to be multiplied +++ \param bVector One of the vectors to be multiplied +++ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector +++ */ +++static inline void volk_gnsssdr_32fc_x2_multiply_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ +++ unsigned int number = 0; +++ const unsigned int halfPoints = num_points / 2; +++ +++ __m128 x, y, yl, yh, z, tmp1, tmp2; +++ lv_32fc_t* c = cVector; +++ const lv_32fc_t* a = aVector; +++ const lv_32fc_t* b = bVector; +++ +++ for(;number < halfPoints; number++){ +++ +++ x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi +++ y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di +++ +++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr +++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di +++ +++ tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr +++ +++ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br +++ +++ tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di +++ +++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +++ +++ _mm_storeu_ps((float*)c,z); // Store the results back into the C container +++ +++ a += 2; +++ b += 2; +++ c += 2; +++ } +++ +++ if((num_points % 2) != 0) { +++ *c = (*a) * (*b); +++ } +++} +++#endif /* LV_HAVE_SSE3 */ +++ +++#ifdef LV_HAVE_GENERIC +++ /*! 
+++ \brief Multiplies the two input complex vectors and stores their results in the third vector +++ \param cVector The vector where the results will be stored +++ \param aVector One of the vectors to be multiplied +++ \param bVector One of the vectors to be multiplied +++ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector +++ */ +++static inline void volk_gnsssdr_32fc_x2_multiply_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ +++ lv_32fc_t* cPtr = cVector; +++ const lv_32fc_t* aPtr = aVector; +++ const lv_32fc_t* bPtr= bVector; +++ unsigned int number = 0; +++ +++ for(number = 0; number < num_points; number++){ +++ *cPtr++ = (*aPtr++) * (*bPtr++); +++ } +++} +++#endif /* LV_HAVE_GENERIC */ +++ +++ +++#endif /* INCLUDED_volk_gnsssdr_32fc_x2_multiply_32fc_u_H */ +++#ifndef INCLUDED_volk_gnsssdr_32fc_x2_multiply_32fc_a_H +++#define INCLUDED_volk_gnsssdr_32fc_x2_multiply_32fc_a_H +++ +++#include +++#include +++#include +++#include +++ +++#ifdef LV_HAVE_SSE3 +++#include +++ /*! 
+++ \brief Multiplies the two input complex vectors element by element and stores the products in the third vector (SSE3, aligned loads/stores) +++ \param cVector The vector where the results will be stored +++ \param aVector One of the vectors to be multiplied +++ \param bVector One of the vectors to be multiplied +++ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector +++ */ +++static inline void volk_gnsssdr_32fc_x2_multiply_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ +++ unsigned int number = 0; +++ const unsigned int halfPoints = num_points / 2; +++ +++ __m128 x, y, yl, yh, z, tmp1, tmp2; +++ lv_32fc_t* c = cVector; +++ const lv_32fc_t* a = aVector; +++ const lv_32fc_t* b = bVector; +++ for(;number < halfPoints; number++){ +++ +++ x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi +++ y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di +++ +++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr +++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di +++ +++ tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr +++ +++ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br +++ +++ tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di +++ +++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +++ +++ _mm_store_ps((float*)c,z); // Store the results back into the C container +++ +++ a += 2; +++ b += 2; +++ c += 2; +++ } +++ +++ if((num_points % 2) != 0) { +++ *c = (*a) * (*b); +++ } +++} +++#endif /* LV_HAVE_SSE3 */ +++ +++#ifdef LV_HAVE_GENERIC +++ /*! 
+++ \brief Multiplies the two input complex vectors and stores their results in the third vector +++ \param cVector The vector where the results will be stored +++ \param aVector One of the vectors to be multiplied +++ \param bVector One of the vectors to be multiplied +++ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector +++ */ +++static inline void volk_gnsssdr_32fc_x2_multiply_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ +++ lv_32fc_t* cPtr = cVector; +++ const lv_32fc_t* aPtr = aVector; +++ const lv_32fc_t* bPtr= bVector; +++ unsigned int number = 0; +++ +++ for(number = 0; number < num_points; number++){ +++ *cPtr++ = (*aPtr++) * (*bPtr++); +++ } +++} +++#endif /* LV_HAVE_GENERIC */ +++ +++#ifdef LV_HAVE_ORC +++ /*! +++ \brief Multiplies the two input complex vectors and stores their results in the third vector +++ \param cVector The vector where the results will be stored +++ \param aVector One of the vectors to be multiplied +++ \param bVector One of the vectors to be multiplied +++ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector +++ */ +++extern void volk_gnsssdr_32fc_x2_multiply_32fc_a_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points); +++static inline void volk_gnsssdr_32fc_x2_multiply_32fc_u_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ +++ volk_gnsssdr_32fc_x2_multiply_32fc_a_orc_impl(cVector, aVector, bVector, num_points); +++} +++#endif /* LV_HAVE_ORC */ +++ +++ +++ +++ +++ +++#endif /* INCLUDED_volk_gnsssdr_32fc_x2_multiply_32fc_a_H */ ++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3.h 
/Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3.h ++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3.h 1970-01-01 01:00:00.000000000 +0100 +++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3.h 2014-10-15 01:55:08.000000000 +0200 ++@@ -0,0 +1,409 @@ +++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_u_H +++#define INCLUDED_gnsssdr_volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_u_H +++ +++#include +++#include +++#include +++#include +++#include +++ +++/*! +++ * TODO: Code the SSE4 version and benchmark it +++ */ +++#ifdef LV_HAVE_SSE3 +++#include +++ +++ +++ /*! +++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (unaligned version: no input buffer needs to be 16-byte aligned) +++ \param input The input signal input +++ \param carrier The carrier signal input +++ \param E_code Early PRN code replica input +++ \param P_code Prompt PRN code replica input +++ \param L_code Late PRN code replica input +++ \param E_out Early correlation output +++ \param P_out Prompt correlation output +++ \param L_out Late correlation output +++ \param num_points The number of complex values in vectors +++ */ +++static inline void volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_u_sse3(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, unsigned int num_points) +++{ +++ unsigned int number = 0; +++ const unsigned int halfPoints = num_points / 2; +++ +++ lv_32fc_t dotProduct_E; +++ memset(&dotProduct_E, 0x0, 2*sizeof(float)); +++ lv_32fc_t dotProduct_P; +++ memset(&dotProduct_P, 0x0, 2*sizeof(float)); +++ lv_32fc_t dotProduct_L; +++ memset(&dotProduct_L, 0x0, 2*sizeof(float)); +++ +++ // Aux vars +++ __m128 x, y, yl, yh, z, tmp1, tmp2, z_E, z_P, z_L; +++ +++ z_E = _mm_setzero_ps(); +++ z_P = _mm_setzero_ps(); +++ z_L 
= _mm_setzero_ps(); +++ +++ //input and output vectors +++ //lv_32fc_t* _input_BB = input_BB; +++ const lv_32fc_t* _input = input; +++ const lv_32fc_t* _carrier = carrier; +++ const lv_32fc_t* _E_code = E_code; +++ const lv_32fc_t* _P_code = P_code; +++ const lv_32fc_t* _L_code = L_code; +++ +++ for(;number < halfPoints; number++) +++ { +++ // carrier wipe-off (vector point-to-point product) +++ x = _mm_loadu_ps((float*)_input); // Load the ar + ai, br + bi as ar,ai,br,bi +++ y = _mm_loadu_ps((float*)_carrier); // Load the cr + ci, dr + di as cr,ci,dr,di +++ +++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr +++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di +++ +++ tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr +++ +++ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br +++ +++ tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di +++ +++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +++ +++ //_mm_storeu_ps((float*)_input_BB,z); // Store the results back into the _input_BB container +++ +++ // correlation E,P,L (3x vector scalar product) +++ // Early +++ //x = _mm_load_ps((float*)_input_BB); // Load the ar + ai, br + bi as ar,ai,br,bi +++ x = z; +++ +++ y = _mm_loadu_ps((float*)_E_code); // Load the cr + ci, dr + di as cr,ci,dr,di (unaligned load: this is the _u_ kernel) +++ +++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr +++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di +++ +++ tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr +++ +++ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br +++ +++ tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di +++ +++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +++ +++ z_E = _mm_add_ps(z_E, z); // Add the complex multiplication results together +++ +++ // Prompt +++ //x = _mm_load_ps((float*)_input_BB); // Load the ar + ai, br + bi as ar,ai,br,bi +++ y = _mm_loadu_ps((float*)_P_code); // Load the cr + ci, dr + 
di as cr,ci,dr,di +++ +++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr +++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di +++ +++ x = _mm_shuffle_ps(x,x,0xB1); // Restore x to ar,ai,br,bi (the Early stage left it swapped) +++ +++ tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr +++ +++ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br +++ +++ tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di +++ +++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +++ +++ z_P = _mm_add_ps(z_P, z); // Add the complex multiplication results together +++ +++ // Late +++ //x = _mm_load_ps((float*)_input_BB); // Load the ar + ai, br + bi as ar,ai,br,bi +++ y = _mm_loadu_ps((float*)_L_code); // Load the cr + ci, dr + di as cr,ci,dr,di +++ +++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr +++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di +++ +++ x = _mm_shuffle_ps(x,x,0xB1); // Restore x to ar,ai,br,bi (the Prompt stage left it swapped) +++ +++ tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr +++ +++ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br +++ +++ tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di +++ +++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +++ +++ z_L = _mm_add_ps(z_L, z); // Add the complex multiplication results together +++ +++ /*pointer increment*/ +++ _carrier += 2; +++ _input += 2; +++ //_input_BB += 2; +++ _E_code += 2; +++ _P_code += 2; +++ _L_code +=2; +++ } +++ +++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_E[2]; +++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_P[2]; +++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_L[2]; +++ //__VOLK_ATTR_ALIGNED(16) lv_32fc_t _input_BB; +++ +++ _mm_store_ps((float*)dotProductVector_E,z_E); // Store the results back into the dot product vector +++ _mm_store_ps((float*)dotProductVector_P,z_P); // Store the results back into the dot product vector +++ _mm_store_ps((float*)dotProductVector_L,z_L); // 
Store the results back into the dot product vector +++ +++ dotProduct_E += ( dotProductVector_E[0] + dotProductVector_E[1] ); +++ dotProduct_P += ( dotProductVector_P[0] + dotProductVector_P[1] ); +++ dotProduct_L += ( dotProductVector_L[0] + dotProductVector_L[1] ); +++ +++ if((num_points % 2) != 0) +++ { +++ //_input_BB = (*_input) * (*_carrier); +++ dotProduct_E += (*_input) * (*_E_code)*(*_carrier); +++ dotProduct_P += (*_input) * (*_P_code)*(*_carrier); +++ dotProduct_L += (*_input) * (*_L_code)*(*_carrier); +++ } +++ +++ *E_out = dotProduct_E; +++ *P_out = dotProduct_P; +++ *L_out = dotProduct_L; +++} +++ +++#endif /* LV_HAVE_SSE3 */ +++ +++#ifdef LV_HAVE_GENERIC +++/*! +++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation +++ \param input The input signal input +++ \param carrier The carrier signal input +++ \param E_code Early PRN code replica input +++ \param P_code Prompt PRN code replica input +++ \param L_code Late PRN code replica input +++ \param E_out Early correlation output +++ \param P_out Prompt correlation output +++ \param L_out Late correlation output +++ \param num_points The number of complex values in vectors +++ */ +++static inline void volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_generic(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, unsigned int num_points) +++{ +++ lv_32fc_t bb_signal_sample; +++ +++ bb_signal_sample = lv_cmake(0, 0); +++ +++ *E_out = 0; +++ *P_out = 0; +++ *L_out = 0; +++ // perform Early, Prompt and Late correlation +++ for(unsigned int i=0; i < num_points; ++i) +++ { +++ //Perform the carrier wipe-off +++ bb_signal_sample = input[i] * carrier[i]; +++ // Now get early, late, and prompt values for each +++ *E_out += bb_signal_sample * E_code[i]; +++ *P_out += bb_signal_sample * P_code[i]; +++ *L_out += bb_signal_sample * L_code[i]; +++ } +++} +++ +++#endif /* 
LV_HAVE_GENERIC */ +++ +++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_u_H */ +++ +++ +++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_a_H +++#define INCLUDED_gnsssdr_volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_a_H +++ +++#include +++#include +++#include +++#include +++#include +++ +++#ifdef LV_HAVE_SSE3 +++#include +++/*! +++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation +++ \param input The input signal input +++ \param carrier The carrier signal input +++ \param E_code Early PRN code replica input +++ \param P_code Early PRN code replica input +++ \param L_code Early PRN code replica input +++ \param E_out Early correlation output +++ \param P_out Early correlation output +++ \param L_out Early correlation output +++ \param num_points The number of complex values in vectors +++ */ +++static inline void volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_a_sse3(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, unsigned int num_points) +++{ +++ unsigned int number = 0; +++ const unsigned int halfPoints = num_points / 2; +++ +++ lv_32fc_t dotProduct_E; +++ memset(&dotProduct_E, 0x0, 2*sizeof(float)); +++ lv_32fc_t dotProduct_P; +++ memset(&dotProduct_P, 0x0, 2*sizeof(float)); +++ lv_32fc_t dotProduct_L; +++ memset(&dotProduct_L, 0x0, 2*sizeof(float)); +++ +++ // Aux vars +++ __m128 x, y, yl, yh, z, tmp1, tmp2, z_E, z_P, z_L; +++ +++ z_E = _mm_setzero_ps(); +++ z_P = _mm_setzero_ps(); +++ z_L = _mm_setzero_ps(); +++ +++ //input and output vectors +++ //lv_32fc_t* _input_BB = input_BB; +++ const lv_32fc_t* _input = input; +++ const lv_32fc_t* _carrier = carrier; +++ const lv_32fc_t* _E_code = E_code; +++ const lv_32fc_t* _P_code = P_code; +++ const lv_32fc_t* _L_code = L_code; +++ +++ for(;number < halfPoints; number++) +++ { +++ // carrier wipe-off (vector 
point-to-point product) +++ x = _mm_load_ps((float*)_input); // Load the ar + ai, br + bi as ar,ai,br,bi +++ y = _mm_load_ps((float*)_carrier); // Load the cr + ci, dr + di as cr,ci,dr,di +++ +++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr +++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di +++ +++ tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr +++ +++ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br +++ +++ tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di +++ +++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +++ +++ //_mm_storeu_ps((float*)_input_BB,z); // Store the results back into the _input_BB container +++ +++ // correlation E,P,L (3x vector scalar product) +++ // Early +++ //x = _mm_load_ps((float*)_input_BB); // Load the ar + ai, br + bi as ar,ai,br,bi +++ x = z; +++ +++ y = _mm_load_ps((float*)_E_code); // Load the cr + ci, dr + di as cr,ci,dr,di +++ +++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr +++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di +++ +++ tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr +++ +++ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br +++ +++ tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di +++ +++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +++ +++ z_E = _mm_add_ps(z_E, z); // Add the complex multiplication results together +++ +++ // Prompt +++ //x = _mm_load_ps((float*)_input_BB); // Load the ar + ai, br + bi as ar,ai,br,bi +++ y = _mm_load_ps((float*)_P_code); // Load the cr + ci, dr + di as cr,ci,dr,di +++ +++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr +++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di +++ +++ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br +++ +++ tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr +++ +++ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br +++ +++ tmp2 = 
_mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di +++ +++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +++ +++ z_P = _mm_add_ps(z_P, z); // Add the complex multiplication results together +++ +++ // Late +++ //x = _mm_load_ps((float*)_input_BB); // Load the ar + ai, br + bi as ar,ai,br,bi +++ y = _mm_load_ps((float*)_L_code); // Load the cr + ci, dr + di as cr,ci,dr,di +++ +++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr +++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di +++ +++ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br +++ +++ tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr +++ +++ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br +++ +++ tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di +++ +++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +++ +++ z_L = _mm_add_ps(z_L, z); // Add the complex multiplication results together +++ +++ /*pointer increment*/ +++ _carrier += 2; +++ _input += 2; +++ //_input_BB += 2; +++ _E_code += 2; +++ _P_code += 2; +++ _L_code +=2; +++ } +++ +++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_E[2]; +++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_P[2]; +++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_L[2]; +++ //__VOLK_ATTR_ALIGNED(16) lv_32fc_t _input_BB; +++ +++ _mm_store_ps((float*)dotProductVector_E,z_E); // Store the results back into the dot product vector +++ _mm_store_ps((float*)dotProductVector_P,z_P); // Store the results back into the dot product vector +++ _mm_store_ps((float*)dotProductVector_L,z_L); // Store the results back into the dot product vector +++ +++ dotProduct_E += ( dotProductVector_E[0] + dotProductVector_E[1] ); +++ dotProduct_P += ( dotProductVector_P[0] + dotProductVector_P[1] ); +++ dotProduct_L += ( dotProductVector_L[0] + dotProductVector_L[1] ); +++ +++ if((num_points % 2) != 0) +++ { +++ //_input_BB = (*_input) * (*_carrier); +++ 
dotProduct_E += (*_input) * (*_E_code)*(*_carrier); +++ dotProduct_P += (*_input) * (*_P_code)*(*_carrier); +++ dotProduct_L += (*_input) * (*_L_code)*(*_carrier); +++ } +++ +++ *E_out = dotProduct_E; +++ *P_out = dotProduct_P; +++ *L_out = dotProduct_L; +++} +++ +++#endif /* LV_HAVE_SSE3 */ +++ +++#ifdef LV_HAVE_GENERIC +++/*! +++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation +++ \param input The input signal input +++ \param carrier The carrier signal input +++ \param E_code Early PRN code replica input +++ \param P_code Early PRN code replica input +++ \param L_code Early PRN code replica input +++ \param E_out Early correlation output +++ \param P_out Early correlation output +++ \param L_out Early correlation output +++ \param num_points The number of complex values in vectors +++ */ +++static inline void volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_a_generic(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, unsigned int num_points) +++{ +++ lv_32fc_t bb_signal_sample; +++ +++ bb_signal_sample = lv_cmake(0, 0); +++ +++ *E_out = 0; +++ *P_out = 0; +++ *L_out = 0; +++ // perform Early, Prompt and Late correlation +++ for(int i=0; i < num_points; ++i) +++ { +++ //Perform the carrier wipe-off +++ bb_signal_sample = input[i] * carrier[i]; +++ // Now get early, late, and prompt values for each +++ *E_out += bb_signal_sample * E_code[i]; +++ *P_out += bb_signal_sample * P_code[i]; +++ *L_out += bb_signal_sample * L_code[i]; +++ } +++} +++ +++#endif /* LV_HAVE_GENERIC */ +++ +++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_a_H */ ++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5.h ++--- 
/Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5.h 1970-01-01 01:00:00.000000000 +0100
+++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5.h 2014-10-15 01:55:08.000000000 +0200
++@@ -0,0 +1,848 @@
+++/*!
+++ * \file volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5.h
+++ * \brief Volk protokernel: performs the carrier wipe-off mixing and the VE, Early, Prompt, Late and VL correlation with 64 bits vectors
+++ * \authors <ul>
+++ *          <li> Javier Arribas, 2011. jarribas(at)cttc.es
+++ *          <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+++ *          </ul>
+++ *
+++ * Volk protokernel that performs the carrier wipe-off mixing and the
+++ * VE, Early, Prompt, Late and VL correlation with 64 bits vectors (32 bits the
+++ * real part and 32 bits the imaginary part):
+++ * - The carrier wipe-off is done by multiplying the input signal by the
+++ *   carrier (multiplication of 64 bits vectors). It returns the input
+++ *   signal in base band (BB)
+++ * - VE values are calculated by multiplying the input signal in BB by the
+++ *   VE code (multiplication of 64 bits vectors), accumulating the results
+++ * - Early values are calculated by multiplying the input signal in BB by the
+++ *   early code (multiplication of 64 bits vectors), accumulating the results
+++ * - Prompt values are calculated by multiplying the input signal in BB by the
+++ *   prompt code (multiplication of 64 bits vectors), accumulating the results
+++ * - Late values are calculated by multiplying the input signal in BB by the
+++ *   late code (multiplication of 64 bits vectors), accumulating the results
+++ * - VL values are calculated by multiplying the input signal in BB by the
+++ *   VL code (multiplication of 64 bits vectors), accumulating the results
+++ *
+++ * -------------------------------------------------------------------------
+++ *
+++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
+++ *
+++ * GNSS-SDR is a software defined Global Navigation
+++ * Satellite Systems receiver
+++ *
+++ * This file is part of GNSS-SDR.
+++ *
+++ * GNSS-SDR is free software: you can redistribute it and/or modify
+++ * it under the terms of the GNU General Public License as published by
+++ * the Free Software Foundation, either version 3 of the License, or
+++ * (at your option) any later version.
+++ *
+++ * GNSS-SDR is distributed in the hope that it will be useful,
+++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the +++ * GNU General Public License for more details. +++ * +++ * You should have received a copy of the GNU General Public License +++ * along with GNSS-SDR. If not, see . +++ * +++ * ------------------------------------------------------------------------- +++ */ +++ +++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_u_H +++#define INCLUDED_gnsssdr_volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_u_H +++ +++#include +++#include +++#include +++#include +++#include +++ +++#ifdef LV_HAVE_AVX +++#include +++/*! +++ \brief Performs the carrier wipe-off mixing and the VE, Early, Prompt, Late and VL correlation +++ \param input The input signal input +++ \param carrier The carrier signal input +++ \param VE_code VE PRN code replica input +++ \param E_code Early PRN code replica input +++ \param P_code Early PRN code replica input +++ \param L_code Early PRN code replica input +++ \param VL_code VL PRN code replica input +++ \param VE_out VE correlation output +++ \param E_out Early correlation output +++ \param P_out Early correlation output +++ \param L_out Early correlation output +++ \param VL_out VL correlation output +++ \param num_points The number of complex values in vectors +++ */ +++static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_u_avx(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* VE_code, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, const lv_32fc_t* VL_code, unsigned int num_points) +++{ +++ unsigned int number = 0; +++ const unsigned int halfPoints = num_points / 4; +++ +++ lv_32fc_t dotProduct_VE; +++ lv_32fc_t dotProduct_E; +++ lv_32fc_t dotProduct_P; +++ lv_32fc_t dotProduct_L; +++ lv_32fc_t dotProduct_VL; +++ +++ // Aux vars +++ __m256 x, y, yl, yh, z, tmp1, tmp2, z_VE, z_E, z_P, z_L, z_VL; +++ __m256 bb_signal_sample, bb_signal_sample_shuffled; +++ +++ z_VE = _mm256_setzero_ps(); +++ 
z_E = _mm256_setzero_ps(); +++ z_P = _mm256_setzero_ps(); +++ z_L = _mm256_setzero_ps(); +++ z_VL = _mm256_setzero_ps(); +++ +++ //input and output vectors +++ const lv_32fc_t* _input = input; +++ const lv_32fc_t* _carrier = carrier; +++ const lv_32fc_t* _VE_code = VE_code; +++ const lv_32fc_t* _E_code = E_code; +++ const lv_32fc_t* _P_code = P_code; +++ const lv_32fc_t* _L_code = L_code; +++ const lv_32fc_t* _VL_code = VL_code; +++ +++ for(;number < halfPoints; number++) +++ { +++ // carrier wipe-off (vector point-to-point product) +++ x = _mm256_loadu_ps((float*)_input); // Load the ar + ai, br + bi as ar,ai,br,bi +++ y = _mm256_loadu_ps((float*)_carrier); // Load the cr + ci, dr + di as cr,ci,dr,di +++ +++ yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr +++ yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di +++ +++ tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr +++ +++ x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br +++ +++ tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di +++ +++ bb_signal_sample = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +++ bb_signal_sample_shuffled = _mm256_shuffle_ps(bb_signal_sample,bb_signal_sample,0xB1); // Re-arrange bb_signal_sample to be ai,ar,bi,br +++ +++ // correlation VE,E,P,L,VL (5x vector scalar product) +++ // VE +++ y = _mm256_loadu_ps((float*)_VE_code); // Load the cr + ci, dr + di as cr,ci,dr,di +++ +++ yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr +++ yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di +++ +++ tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr +++ tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di +++ +++ z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +++ z_VE = _mm256_add_ps(z_VE, z); // Add the complex multiplication results together +++ +++ // Early +++ y = _mm256_loadu_ps((float*)_E_code); 
// Load the cr + ci, dr + di as cr,ci,dr,di +++ +++ yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr +++ yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di +++ +++ tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr +++ tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di +++ +++ z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +++ z_E = _mm256_add_ps(z_E, z); // Add the complex multiplication results together +++ +++ // Prompt +++ y = _mm256_loadu_ps((float*)_P_code); // Load the cr + ci, dr + di as cr,ci,dr,di +++ +++ yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr +++ yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di +++ +++ tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr +++ tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di +++ +++ z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +++ z_P = _mm256_add_ps(z_P, z); // Add the complex multiplication results together +++ +++ // Late +++ y = _mm256_loadu_ps((float*)_L_code); // Load the cr + ci, dr + di as cr,ci,dr,di +++ +++ yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr +++ yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di +++ +++ tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr +++ tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di +++ +++ z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +++ z_L = _mm256_add_ps(z_L, z); // Add the complex multiplication results together +++ +++ // VL +++ y = _mm256_loadu_ps((float*)_VL_code); // Load the cr + ci, dr + di as cr,ci,dr,di +++ +++ yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr +++ yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di +++ +++ tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr +++ 
tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di +++ +++ z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +++ z_VL = _mm256_add_ps(z_VL, z); // Add the complex multiplication results together +++ +++ /*pointer increment*/ +++ _carrier += 4; +++ _input += 4; +++ _VE_code += 4; +++ _E_code += 4; +++ _P_code += 4; +++ _L_code += 4; +++ _VL_code += 4; +++ } +++ +++ __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_VE[4]; +++ __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_E[4]; +++ __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_P[4]; +++ __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_L[4]; +++ __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_VL[4]; +++ +++ _mm256_storeu_ps((float*)dotProductVector_VE,z_VE); // Store the results back into the dot product vector +++ _mm256_storeu_ps((float*)dotProductVector_E,z_E); // Store the results back into the dot product vector +++ _mm256_storeu_ps((float*)dotProductVector_P,z_P); // Store the results back into the dot product vector +++ _mm256_storeu_ps((float*)dotProductVector_L,z_L); // Store the results back into the dot product vector +++ _mm256_storeu_ps((float*)dotProductVector_VL,z_VL); // Store the results back into the dot product vector +++ +++ dotProduct_VE = ( dotProductVector_VE[0] + dotProductVector_VE[1] + dotProductVector_VE[2] + dotProductVector_VE[3] ); +++ dotProduct_E = ( dotProductVector_E[0] + dotProductVector_E[1] + dotProductVector_E[2] + dotProductVector_E[3] ); +++ dotProduct_P = ( dotProductVector_P[0] + dotProductVector_P[1] + dotProductVector_P[2] + dotProductVector_P[3] ); +++ dotProduct_L = ( dotProductVector_L[0] + dotProductVector_L[1] + dotProductVector_L[2] + dotProductVector_L[3] ); +++ dotProduct_VL = ( dotProductVector_VL[0] + dotProductVector_VL[1] + dotProductVector_VL[2] + dotProductVector_VL[3] ); +++ +++ for (int i = 0; i<(num_points % 4); ++i) +++ { +++ dotProduct_VE += (*_input) * (*_VE_code++) * 
(*_carrier); +++ dotProduct_E += (*_input) * (*_E_code++) * (*_carrier); +++ dotProduct_P += (*_input) * (*_P_code++) * (*_carrier); +++ dotProduct_L += (*_input) * (*_L_code++) * (*_carrier); +++ dotProduct_VL += (*_input++) * (*_VL_code++) * (*_carrier++); +++ } +++ +++ *VE_out = dotProduct_VE; +++ *E_out = dotProduct_E; +++ *P_out = dotProduct_P; +++ *L_out = dotProduct_L; +++ *VL_out = dotProduct_VL; +++} +++#endif /* LV_HAVE_AVX */ +++ +++#ifdef LV_HAVE_SSE3 +++#include +++ /*! +++ \brief Performs the carrier wipe-off mixing and the VE, Early, Prompt, Late and VL correlation +++ \param input The input signal input +++ \param carrier The carrier signal input +++ \param VE_code VE PRN code replica input +++ \param E_code Early PRN code replica input +++ \param P_code Early PRN code replica input +++ \param L_code Early PRN code replica input +++ \param VL_code VL PRN code replica input +++ \param VE_out VE correlation output +++ \param E_out Early correlation output +++ \param P_out Early correlation output +++ \param L_out Early correlation output +++ \param VL_out VL correlation output +++ \param num_points The number of complex values in vectors +++ */ +++static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_u_sse3(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* VE_code, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, const lv_32fc_t* VL_code, unsigned int num_points) +++{ +++ unsigned int number = 0; +++ const unsigned int halfPoints = num_points / 2; +++ +++ lv_32fc_t dotProduct_VE; +++ lv_32fc_t dotProduct_E; +++ lv_32fc_t dotProduct_P; +++ lv_32fc_t dotProduct_L; +++ lv_32fc_t dotProduct_VL; +++ +++ // Aux vars +++ __m128 x, y, yl, yh, z, tmp1, tmp2, z_VE, z_E, z_P, z_L, z_VL; +++ __m128 bb_signal_sample, bb_signal_sample_shuffled; +++ +++ z_VE = _mm_setzero_ps(); +++ z_E = _mm_setzero_ps(); +++ z_P = 
_mm_setzero_ps(); +++ z_L = _mm_setzero_ps(); +++ z_VL = _mm_setzero_ps(); +++ +++ //input and output vectors +++ const lv_32fc_t* _input = input; +++ const lv_32fc_t* _carrier = carrier; +++ const lv_32fc_t* _VE_code = VE_code; +++ const lv_32fc_t* _E_code = E_code; +++ const lv_32fc_t* _P_code = P_code; +++ const lv_32fc_t* _L_code = L_code; +++ const lv_32fc_t* _VL_code = VL_code; +++ +++ for(;number < halfPoints; number++) +++ { +++ // carrier wipe-off (vector point-to-point product) +++ x = _mm_loadu_ps((float*)_input); // Load the ar + ai, br + bi as ar,ai,br,bi +++ y = _mm_loadu_ps((float*)_carrier); // Load the cr + ci, dr + di as cr,ci,dr,di +++ +++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr +++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di +++ +++ tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr +++ +++ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br +++ +++ tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di +++ +++ bb_signal_sample = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +++ bb_signal_sample_shuffled = _mm_shuffle_ps(bb_signal_sample,bb_signal_sample,0xB1); // Re-arrange bb_signal_sample to be ai,ar,bi,br +++ +++ // correlation VE,E,P,L,VL (5x vector scalar product) +++ // VE +++ y = _mm_loadu_ps((float*)_VE_code); // Load the cr + ci, dr + di as cr,ci,dr,di +++ +++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr +++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di +++ +++ tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr +++ tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di +++ +++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +++ z_VE = _mm_add_ps(z_VE, z); // Add the complex multiplication results together +++ +++ // Early +++ y = _mm_loadu_ps((float*)_E_code); // Load the cr + ci, dr + di as cr,ci,dr,di +++ +++ yl = _mm_moveldup_ps(y); // Load yl with 
cr,cr,dr,dr +++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di +++ +++ tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr +++ tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di +++ +++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +++ z_E = _mm_add_ps(z_E, z); // Add the complex multiplication results together +++ +++ // Prompt +++ y = _mm_loadu_ps((float*)_P_code); // Load the cr + ci, dr + di as cr,ci,dr,di +++ +++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr +++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di +++ +++ tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr +++ tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di +++ +++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +++ z_P = _mm_add_ps(z_P, z); // Add the complex multiplication results together +++ +++ // Late +++ y = _mm_loadu_ps((float*)_L_code); // Load the cr + ci, dr + di as cr,ci,dr,di +++ +++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr +++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di +++ +++ tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr +++ tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di +++ +++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +++ z_L = _mm_add_ps(z_L, z); // Add the complex multiplication results together +++ +++ // VL +++ //x = _mm_load_ps((float*)_input_BB); // Load the ar + ai, br + bi as ar,ai,br,bi +++ y = _mm_loadu_ps((float*)_VL_code); // Load the cr + ci, dr + di as cr,ci,dr,di +++ +++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr +++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di +++ +++ tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr +++ tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = 
ai*ci,ar*ci,bi*di,br*di +++ +++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +++ z_VL = _mm_add_ps(z_VL, z); // Add the complex multiplication results together +++ +++ /*pointer increment*/ +++ _carrier += 2; +++ _input += 2; +++ _VE_code += 2; +++ _E_code += 2; +++ _P_code += 2; +++ _L_code +=2; +++ _VL_code +=2; +++ } +++ +++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_VE[2]; +++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_E[2]; +++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_P[2]; +++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_L[2]; +++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_VL[2]; +++ +++ _mm_storeu_ps((float*)dotProductVector_VE,z_VE); // Store the results back into the dot product vector +++ _mm_storeu_ps((float*)dotProductVector_E,z_E); // Store the results back into the dot product vector +++ _mm_storeu_ps((float*)dotProductVector_P,z_P); // Store the results back into the dot product vector +++ _mm_storeu_ps((float*)dotProductVector_L,z_L); // Store the results back into the dot product vector +++ _mm_storeu_ps((float*)dotProductVector_VL,z_VL); // Store the results back into the dot product vector +++ +++ dotProduct_VE = ( dotProductVector_VE[0] + dotProductVector_VE[1] ); +++ dotProduct_E = ( dotProductVector_E[0] + dotProductVector_E[1] ); +++ dotProduct_P = ( dotProductVector_P[0] + dotProductVector_P[1] ); +++ dotProduct_L = ( dotProductVector_L[0] + dotProductVector_L[1] ); +++ dotProduct_VL = ( dotProductVector_VL[0] + dotProductVector_VL[1] ); +++ +++ if((num_points % 2) != 0) +++ { +++ dotProduct_VE += (*_input) * (*_VE_code)*(*_carrier); +++ dotProduct_E += (*_input) * (*_E_code)*(*_carrier); +++ dotProduct_P += (*_input) * (*_P_code)*(*_carrier); +++ dotProduct_L += (*_input) * (*_L_code)*(*_carrier); +++ dotProduct_VL += (*_input) * (*_VL_code)*(*_carrier); +++ } +++ +++ *VE_out = dotProduct_VE; +++ *E_out = dotProduct_E; +++ *P_out = dotProduct_P; +++ *L_out = 
dotProduct_L; +++ *VL_out = dotProduct_VL; +++} +++#endif /* LV_HAVE_SSE3 */ +++ +++#ifdef LV_HAVE_GENERIC +++/*! +++ \brief Performs the carrier wipe-off mixing and the VE, Early, Prompt, Late and VL correlation +++ \param input The input signal input +++ \param carrier The carrier signal input +++ \param VE_code VE PRN code replica input +++ \param E_code Early PRN code replica input +++ \param P_code Early PRN code replica input +++ \param L_code Early PRN code replica input +++ \param VL_code VL PRN code replica input +++ \param VE_out VE correlation output +++ \param E_out Early correlation output +++ \param P_out Early correlation output +++ \param L_out Early correlation output +++ \param VL_out VL correlation output +++ \param num_points The number of complex values in vectors +++ */ +++static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* VE_code, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, const lv_32fc_t* VL_code, unsigned int num_points) +++{ +++ lv_32fc_t bb_signal_sample; +++ +++ bb_signal_sample = lv_cmake(0, 0); +++ +++ *VE_out = 0; +++ *E_out = 0; +++ *P_out = 0; +++ *L_out = 0; +++ *VL_out = 0; +++ // perform Early, Prompt and Late correlation +++ for(int i=0; i < num_points; ++i) +++ { +++ //Perform the carrier wipe-off +++ bb_signal_sample = input[i] * carrier[i]; +++ // Now get early, late, and prompt values for each +++ *VE_out += bb_signal_sample * VE_code[i]; +++ *E_out += bb_signal_sample * E_code[i]; +++ *P_out += bb_signal_sample * P_code[i]; +++ *L_out += bb_signal_sample * L_code[i]; +++ *VL_out += bb_signal_sample * VL_code[i]; +++ } +++} +++ +++#endif /* LV_HAVE_GENERIC */ +++ +++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_u_H */ +++ +++ +++#ifndef 
INCLUDED_gnsssdr_volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_a_H +++#define INCLUDED_gnsssdr_volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_a_H +++ +++#include +++#include +++#include +++#include +++#include +++ +++#ifdef LV_HAVE_AVX +++#include +++/*! +++ \brief Performs the carrier wipe-off mixing and the VE, Early, Prompt, Late and VL correlation +++ \param input The input signal input +++ \param carrier The carrier signal input +++ \param VE_code VE PRN code replica input +++ \param E_code Early PRN code replica input +++ \param P_code Early PRN code replica input +++ \param L_code Early PRN code replica input +++ \param VL_code VL PRN code replica input +++ \param VE_out VE correlation output +++ \param E_out Early correlation output +++ \param P_out Early correlation output +++ \param L_out Early correlation output +++ \param VL_out VL correlation output +++ \param num_points The number of complex values in vectors +++ */ +++static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_a_avx(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* VE_code, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, const lv_32fc_t* VL_code, unsigned int num_points) +++{ +++ unsigned int number = 0; +++ const unsigned int halfPoints = num_points / 4; +++ +++ lv_32fc_t dotProduct_VE; +++ lv_32fc_t dotProduct_E; +++ lv_32fc_t dotProduct_P; +++ lv_32fc_t dotProduct_L; +++ lv_32fc_t dotProduct_VL; +++ +++ // Aux vars +++ __m256 x, y, yl, yh, z, tmp1, tmp2, z_VE, z_E, z_P, z_L, z_VL; +++ __m256 bb_signal_sample, bb_signal_sample_shuffled; +++ +++ z_VE = _mm256_setzero_ps(); +++ z_E = _mm256_setzero_ps(); +++ z_P = _mm256_setzero_ps(); +++ z_L = _mm256_setzero_ps(); +++ z_VL = _mm256_setzero_ps(); +++ +++ //input and output vectors +++ const lv_32fc_t* _input = input; +++ const lv_32fc_t* _carrier = carrier; +++ const lv_32fc_t* _VE_code = VE_code; +++ const 
lv_32fc_t* _E_code = E_code; +++ const lv_32fc_t* _P_code = P_code; +++ const lv_32fc_t* _L_code = L_code; +++ const lv_32fc_t* _VL_code = VL_code; +++ +++ for(;number < halfPoints; number++) +++ { +++ // carrier wipe-off (vector point-to-point product) +++ x = _mm256_load_ps((float*)_input); // Load the ar + ai, br + bi as ar,ai,br,bi +++ y = _mm256_load_ps((float*)_carrier); // Load the cr + ci, dr + di as cr,ci,dr,di +++ +++ yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr +++ yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di +++ +++ tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr +++ +++ x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br +++ +++ tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di +++ +++ bb_signal_sample = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +++ bb_signal_sample_shuffled = _mm256_shuffle_ps(bb_signal_sample,bb_signal_sample,0xB1); // Re-arrange bb_signal_sample to be ai,ar,bi,br +++ +++ // correlation VE,E,P,L,VL (5x vector scalar product) +++ // VE +++ y = _mm256_load_ps((float*)_VE_code); // Load the cr + ci, dr + di as cr,ci,dr,di +++ +++ yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr +++ yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di +++ +++ tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr +++ tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di +++ +++ z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +++ z_VE = _mm256_add_ps(z_VE, z); // Add the complex multiplication results together +++ +++ // Early +++ y = _mm256_load_ps((float*)_E_code); // Load the cr + ci, dr + di as cr,ci,dr,di +++ +++ yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr +++ yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di +++ +++ tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr +++ tmp2 = 
_mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di +++ +++ z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +++ z_E = _mm256_add_ps(z_E, z); // Add the complex multiplication results together +++ +++ // Prompt +++ y = _mm256_load_ps((float*)_P_code); // Load the cr + ci, dr + di as cr,ci,dr,di +++ +++ yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr +++ yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di +++ +++ tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr +++ tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di +++ +++ z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +++ z_P = _mm256_add_ps(z_P, z); // Add the complex multiplication results together +++ +++ // Late +++ y = _mm256_load_ps((float*)_L_code); // Load the cr + ci, dr + di as cr,ci,dr,di +++ +++ yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr +++ yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di +++ +++ tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr +++ tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di +++ +++ z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +++ z_L = _mm256_add_ps(z_L, z); // Add the complex multiplication results together +++ +++ // VL +++ y = _mm256_load_ps((float*)_VL_code); // Load the cr + ci, dr + di as cr,ci,dr,di +++ +++ yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr +++ yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di +++ +++ tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr +++ tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di +++ +++ z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +++ z_VL = _mm256_add_ps(z_VL, z); // Add the complex multiplication results together 
+++ +++ /*pointer increment*/ +++ _carrier += 4; +++ _input += 4; +++ _VE_code += 4; +++ _E_code += 4; +++ _P_code += 4; +++ _L_code += 4; +++ _VL_code += 4; +++ } +++ +++ __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_VE[4]; +++ __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_E[4]; +++ __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_P[4]; +++ __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_L[4]; +++ __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_VL[4]; +++ +++ _mm256_store_ps((float*)dotProductVector_VE,z_VE); // Store the results back into the dot product vector +++ _mm256_store_ps((float*)dotProductVector_E,z_E); // Store the results back into the dot product vector +++ _mm256_store_ps((float*)dotProductVector_P,z_P); // Store the results back into the dot product vector +++ _mm256_store_ps((float*)dotProductVector_L,z_L); // Store the results back into the dot product vector +++ _mm256_store_ps((float*)dotProductVector_VL,z_VL); // Store the results back into the dot product vector +++ +++ dotProduct_VE = ( dotProductVector_VE[0] + dotProductVector_VE[1] + dotProductVector_VE[2] + dotProductVector_VE[3] ); +++ dotProduct_E = ( dotProductVector_E[0] + dotProductVector_E[1] + dotProductVector_E[2] + dotProductVector_E[3] ); +++ dotProduct_P = ( dotProductVector_P[0] + dotProductVector_P[1] + dotProductVector_P[2] + dotProductVector_P[3] ); +++ dotProduct_L = ( dotProductVector_L[0] + dotProductVector_L[1] + dotProductVector_L[2] + dotProductVector_L[3] ); +++ dotProduct_VL = ( dotProductVector_VL[0] + dotProductVector_VL[1] + dotProductVector_VL[2] + dotProductVector_VL[3] ); +++ +++ for (int i = 0; i<(num_points % 4); ++i) +++ { +++ dotProduct_VE += (*_input) * (*_VE_code++) * (*_carrier); +++ dotProduct_E += (*_input) * (*_E_code++) * (*_carrier); +++ dotProduct_P += (*_input) * (*_P_code++) * (*_carrier); +++ dotProduct_L += (*_input) * (*_L_code++) * (*_carrier); +++ dotProduct_VL += (*_input++) * (*_VL_code++) * (*_carrier++); +++ } +++ +++ 
*VE_out = dotProduct_VE; +++ *E_out = dotProduct_E; +++ *P_out = dotProduct_P; +++ *L_out = dotProduct_L; +++ *VL_out = dotProduct_VL; +++} +++#endif /* LV_HAVE_AVX */ +++ +++#ifdef LV_HAVE_SSE3 +++#include +++/*! +++ \brief Performs the carrier wipe-off mixing and the VE, Early, Prompt, Late and VL correlation +++ \param input The input signal input +++ \param carrier The carrier signal input +++ \param VE_code VE PRN code replica input +++ \param E_code Early PRN code replica input +++ \param P_code Early PRN code replica input +++ \param L_code Early PRN code replica input +++ \param VL_code VL PRN code replica input +++ \param VE_out VE correlation output +++ \param E_out Early correlation output +++ \param P_out Early correlation output +++ \param L_out Early correlation output +++ \param VL_out VL correlation output +++ \param num_points The number of complex values in vectors +++ */ +++static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_a_sse3(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* VE_code, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, const lv_32fc_t* VL_code, unsigned int num_points) +++{ +++ unsigned int number = 0; +++ const unsigned int halfPoints = num_points / 2; +++ +++ lv_32fc_t dotProduct_VE; +++ lv_32fc_t dotProduct_E; +++ lv_32fc_t dotProduct_P; +++ lv_32fc_t dotProduct_L; +++ lv_32fc_t dotProduct_VL; +++ +++ // Aux vars +++ __m128 x, y, yl, yh, z, tmp1, tmp2, z_VE, z_E, z_P, z_L, z_VL; +++ __m128 bb_signal_sample, bb_signal_sample_shuffled; +++ +++ z_VE = _mm_setzero_ps(); +++ z_E = _mm_setzero_ps(); +++ z_P = _mm_setzero_ps(); +++ z_L = _mm_setzero_ps(); +++ z_VL = _mm_setzero_ps(); +++ +++ //input and output vectors +++ const lv_32fc_t* _input = input; +++ const lv_32fc_t* _carrier = carrier; +++ const lv_32fc_t* _VE_code = VE_code; +++ const lv_32fc_t* _E_code = E_code; +++ const 
lv_32fc_t* _P_code = P_code; +++ const lv_32fc_t* _L_code = L_code; +++ const lv_32fc_t* _VL_code = VL_code; +++ +++ for(;number < halfPoints; number++) +++ { +++ // carrier wipe-off (vector point-to-point product) +++ x = _mm_load_ps((float*)_input); // Load the ar + ai, br + bi as ar,ai,br,bi +++ y = _mm_load_ps((float*)_carrier); // Load the cr + ci, dr + di as cr,ci,dr,di +++ +++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr +++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di +++ +++ tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr +++ +++ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br +++ +++ tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di +++ +++ bb_signal_sample = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +++ bb_signal_sample_shuffled = _mm_shuffle_ps(bb_signal_sample,bb_signal_sample,0xB1); // Re-arrange bb_signal_sample to be ai,ar,bi,br +++ +++ // correlation VE,E,P,L,VL (5x vector scalar product) +++ // VE +++ y = _mm_load_ps((float*)_VE_code); // Load the cr + ci, dr + di as cr,ci,dr,di +++ +++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr +++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di +++ +++ tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr +++ tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di +++ +++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +++ z_VE = _mm_add_ps(z_VE, z); // Add the complex multiplication results together +++ +++ // Early +++ y = _mm_load_ps((float*)_E_code); // Load the cr + ci, dr + di as cr,ci,dr,di +++ +++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr +++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di +++ +++ tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr +++ tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di +++ +++ z = _mm_addsub_ps(tmp1,tmp2); // 
ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +++ z_E = _mm_add_ps(z_E, z); // Add the complex multiplication results together +++ +++ // Prompt +++ y = _mm_load_ps((float*)_P_code); // Load the cr + ci, dr + di as cr,ci,dr,di +++ +++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr +++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di +++ +++ tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr +++ tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di +++ +++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +++ z_P = _mm_add_ps(z_P, z); // Add the complex multiplication results together +++ +++ // Late +++ y = _mm_load_ps((float*)_L_code); // Load the cr + ci, dr + di as cr,ci,dr,di +++ +++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr +++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di +++ +++ tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr +++ tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di +++ +++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +++ z_L = _mm_add_ps(z_L, z); // Add the complex multiplication results together +++ +++ // VL +++ //x = _mm_load_ps((float*)_input_BB); // Load the ar + ai, br + bi as ar,ai,br,bi +++ y = _mm_load_ps((float*)_VL_code); // Load the cr + ci, dr + di as cr,ci,dr,di +++ +++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr +++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di +++ +++ tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr +++ tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di +++ +++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +++ z_VL = _mm_add_ps(z_VL, z); // Add the complex multiplication results together +++ +++ /*pointer increment*/ +++ _carrier += 2; +++ _input += 2; +++ _VE_code += 2; +++ _E_code += 2; 
+++ _P_code += 2; +++ _L_code +=2; +++ _VL_code +=2; +++ } +++ +++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_VE[2]; +++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_E[2]; +++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_P[2]; +++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_L[2]; +++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_VL[2]; +++ +++ _mm_store_ps((float*)dotProductVector_VE,z_VE); // Store the results back into the dot product vector +++ _mm_store_ps((float*)dotProductVector_E,z_E); // Store the results back into the dot product vector +++ _mm_store_ps((float*)dotProductVector_P,z_P); // Store the results back into the dot product vector +++ _mm_store_ps((float*)dotProductVector_L,z_L); // Store the results back into the dot product vector +++ _mm_store_ps((float*)dotProductVector_VL,z_VL); // Store the results back into the dot product vector +++ +++ dotProduct_VE = ( dotProductVector_VE[0] + dotProductVector_VE[1] ); +++ dotProduct_E = ( dotProductVector_E[0] + dotProductVector_E[1] ); +++ dotProduct_P = ( dotProductVector_P[0] + dotProductVector_P[1] ); +++ dotProduct_L = ( dotProductVector_L[0] + dotProductVector_L[1] ); +++ dotProduct_VL = ( dotProductVector_VL[0] + dotProductVector_VL[1] ); +++ +++ if((num_points % 2) != 0) +++ { +++ dotProduct_VE += (*_input) * (*_VE_code)*(*_carrier); +++ dotProduct_E += (*_input) * (*_E_code)*(*_carrier); +++ dotProduct_P += (*_input) * (*_P_code)*(*_carrier); +++ dotProduct_L += (*_input) * (*_L_code)*(*_carrier); +++ dotProduct_VL += (*_input) * (*_VL_code)*(*_carrier); +++ } +++ +++ *VE_out = dotProduct_VE; +++ *E_out = dotProduct_E; +++ *P_out = dotProduct_P; +++ *L_out = dotProduct_L; +++ *VL_out = dotProduct_VL; +++} +++#endif /* LV_HAVE_SSE3 */ +++ +++#ifdef LV_HAVE_GENERIC +++/*! 
+++ \brief Performs the carrier wipe-off mixing and the VE, Early, Prompt, Late and VL correlation (portable C version: the five outputs are zeroed and then accumulated sample by sample) +++ \param input The input signal input +++ \param carrier The carrier signal input +++ \param VE_code VE PRN code replica input +++ \param E_code Early PRN code replica input +++ \param P_code Prompt PRN code replica input +++ \param L_code Late PRN code replica input +++ \param VL_code VL PRN code replica input +++ \param VE_out VE correlation output +++ \param E_out Early correlation output +++ \param P_out Prompt correlation output +++ \param L_out Late correlation output +++ \param VL_out VL correlation output +++ \param num_points The number of complex values in vectors +++ */ +++static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_a_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* VE_code, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, const lv_32fc_t* VL_code, unsigned int num_points) +++{ +++ lv_32fc_t bb_signal_sample; +++ +++ bb_signal_sample = lv_cmake(0, 0); +++ +++ *VE_out = 0; +++ *E_out = 0; +++ *P_out = 0; +++ *L_out = 0; +++ *VL_out = 0; +++ // perform VE, Early, Prompt, Late and VL correlation +++ for(int i=0; i < num_points; ++i) +++ { +++ // Perform the carrier wipe-off: multiply the input sample by the carrier sample +++ bb_signal_sample = input[i] * carrier[i]; +++ // Accumulate the product of the wiped-off sample with each of the five code replicas +++ *VE_out += bb_signal_sample * VE_code[i]; +++ *E_out += bb_signal_sample * E_code[i]; +++ *P_out += bb_signal_sample * P_code[i]; +++ *L_out += bb_signal_sample * L_code[i]; +++ *VL_out += bb_signal_sample * VL_code[i]; +++ } +++} +++#endif /* LV_HAVE_GENERIC */ +++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_a_H */ ++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_64f_accumulator_64f.h 
/Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_64f_accumulator_64f.h ++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_64f_accumulator_64f.h 1970-01-01 01:00:00.000000000 +0100 +++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_64f_accumulator_64f.h 2014-10-15 01:55:08.000000000 +0200 ++@@ -0,0 +1,243 @@ +++/*! +++ * \file volk_gnsssdr_64f_accumulator_64f.h +++ * \brief Volk protokernel: 64 bits (double) scalar accumulator +++ * \authors
    +++ *
  • Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com +++ *
+++ * +++ * Volk protokernel that implements an accumulator of double values +++ * +++ * ------------------------------------------------------------------------- +++ * +++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) +++ * +++ * GNSS-SDR is a software defined Global Navigation +++ * Satellite Systems receiver +++ * +++ * This file is part of GNSS-SDR. +++ * +++ * GNSS-SDR is free software: you can redistribute it and/or modify +++ * it under the terms of the GNU General Public License as published by +++ * the Free Software Foundation, either version 3 of the License, or +++ * (at your option) any later version. +++ * +++ * GNSS-SDR is distributed in the hope that it will be useful, +++ * but WITHOUT ANY WARRANTY; without even the implied warranty of +++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +++ * GNU General Public License for more details. +++ * +++ * You should have received a copy of the GNU General Public License +++ * along with GNSS-SDR. If not, see . +++ * +++ * ------------------------------------------------------------------------- +++ */ +++ +++#ifndef INCLUDED_volk_gnsssdr_64f_accumulator_64f_u_H +++#define INCLUDED_volk_gnsssdr_64f_accumulator_64f_u_H +++ +++#include +++#include +++#include +++ +++#ifdef LV_HAVE_AVX +++#include +++/*! 
+++ \brief Accumulates the values in the input buffer (AVX version using unaligned loads: four doubles are added per vector iteration and the num_points % 4 leftover values are added one by one) +++ \param result The accumulated result +++ \param inputBuffer The buffer of data to be accumulated +++ \param num_points The number of values in inputBuffer to be accumulated +++ */ +++static inline void volk_gnsssdr_64f_accumulator_64f_u_avx(double* result,const double* inputBuffer, unsigned int num_points){ +++ double returnValue = 0; +++ const unsigned int sse_iters = num_points / 4; /* number of full 4-double vectors */ +++ +++ const double* aPtr = inputBuffer; +++ +++ __VOLK_ATTR_ALIGNED(32) double tempBuffer[4]; /* receives the four per-lane partial sums */ +++ __m256d accumulator = _mm256_setzero_pd(); +++ __m256d aVal = _mm256_setzero_pd(); +++ +++ for(unsigned int number = 0; number < sse_iters; number++) +++ { +++ aVal = _mm256_loadu_pd(aPtr); +++ accumulator = _mm256_add_pd(accumulator, aVal); +++ aPtr += 4; +++ } +++ +++ _mm256_storeu_pd((double*)tempBuffer,accumulator); +++ +++ for(int i = 0; i<4; ++i){ /* horizontal sum of the partial sums */ +++ returnValue += tempBuffer[i]; +++ } +++ +++ for(int i = 0; i<(num_points % 4); ++i){ /* scalar tail: values that did not fill a whole vector */ +++ returnValue += (*aPtr++); +++ } +++ +++ *result = returnValue; +++} +++#endif /* LV_HAVE_AVX */ +++ +++#ifdef LV_HAVE_SSE3 +++#include +++/*! 
+++ \brief Accumulates the values in the input buffer (SSE3 version using unaligned loads: two doubles are added per vector iteration and the num_points % 2 leftover value is added separately) +++ \param result The accumulated result +++ \param inputBuffer The buffer of data to be accumulated +++ \param num_points The number of values in inputBuffer to be accumulated +++ */ +++static inline void volk_gnsssdr_64f_accumulator_64f_u_sse3(double* result,const double* inputBuffer, unsigned int num_points){ +++ double returnValue = 0; +++ const unsigned int sse_iters = num_points / 2; /* number of full 2-double vectors */ +++ +++ const double* aPtr = inputBuffer; +++ +++ __VOLK_ATTR_ALIGNED(16) double tempBuffer[2]; /* receives the two per-lane partial sums */ +++ __m128d accumulator = _mm_setzero_pd(); +++ __m128d aVal = _mm_setzero_pd(); +++ +++ for(unsigned int number = 0; number < sse_iters; number++) +++ { +++ aVal = _mm_loadu_pd(aPtr); +++ accumulator = _mm_add_pd(accumulator, aVal); +++ aPtr += 2; +++ } +++ +++ _mm_storeu_pd((double*)tempBuffer,accumulator); +++ +++ for(int i = 0; i<2; ++i){ /* horizontal sum of the partial sums */ +++ returnValue += tempBuffer[i]; +++ } +++ +++ for(int i = 0; i<(num_points % 2); ++i){ /* scalar tail: at most one leftover value */ +++ returnValue += (*aPtr++); +++ } +++ +++ *result = returnValue; +++} +++#endif /* LV_HAVE_SSE3 */ +++ +++#ifdef LV_HAVE_GENERIC +++/*! +++ \brief Accumulates the values in the input buffer (portable C version: plain sequential sum) +++ \param result The accumulated result +++ \param inputBuffer The buffer of data to be accumulated +++ \param num_points The number of values in inputBuffer to be accumulated +++ */ +++static inline void volk_gnsssdr_64f_accumulator_64f_generic(double* result,const double* inputBuffer, unsigned int num_points){ +++ const double* aPtr = inputBuffer; +++ double returnValue = 0; +++ +++ for(unsigned int number = 0;number < num_points; number++){ +++ returnValue += (*aPtr++); +++ } +++ *result = returnValue; +++} +++#endif /* LV_HAVE_GENERIC */ +++ +++#endif /* INCLUDED_volk_gnsssdr_64f_accumulator_64f_u_H */ +++ +++ +++#ifndef INCLUDED_volk_gnsssdr_64f_accumulator_64f_a_H +++#define INCLUDED_volk_gnsssdr_64f_accumulator_64f_a_H +++ +++#include +++#include +++#include +++ +++#ifdef LV_HAVE_AVX +++#include +++/*! 
+++ \brief Accumulates the values in the input buffer (AVX version using aligned loads: four doubles are added per vector iteration and the num_points % 4 leftover values are added one by one) +++ \param result The accumulated result +++ \param inputBuffer The buffer of data to be accumulated; read with _mm256_load_pd, which requires a 32-byte aligned address +++ \param num_points The number of values in inputBuffer to be accumulated +++ */ +++static inline void volk_gnsssdr_64f_accumulator_64f_a_avx(double* result,const double* inputBuffer, unsigned int num_points){ +++ double returnValue = 0; +++ const unsigned int sse_iters = num_points / 4; /* number of full 4-double vectors */ +++ +++ const double* aPtr = inputBuffer; +++ +++ __VOLK_ATTR_ALIGNED(32) double tempBuffer[4]; /* receives the four per-lane partial sums */ +++ __m256d accumulator = _mm256_setzero_pd(); +++ __m256d aVal = _mm256_setzero_pd(); +++ +++ for(unsigned int number = 0; number < sse_iters; number++) +++ { +++ aVal = _mm256_load_pd(aPtr); +++ accumulator = _mm256_add_pd(accumulator, aVal); +++ aPtr += 4; +++ } +++ +++ _mm256_store_pd((double*)tempBuffer,accumulator); +++ +++ for(int i = 0; i<4; ++i){ /* horizontal sum of the partial sums */ +++ returnValue += tempBuffer[i]; +++ } +++ +++ for(int i = 0; i<(num_points % 4); ++i){ /* scalar tail: values that did not fill a whole vector */ +++ returnValue += (*aPtr++); +++ } +++ +++ *result = returnValue; +++} +++#endif /* LV_HAVE_AVX */ +++ +++#ifdef LV_HAVE_SSE3 +++#include +++/*! 
+++ \brief Accumulates the values in the input buffer (SSE3 version using aligned loads: two doubles are added per vector iteration and the num_points % 2 leftover value is added separately) +++ \param result The accumulated result +++ \param inputBuffer The buffer of data to be accumulated; read with _mm_load_pd, which requires a 16-byte aligned address +++ \param num_points The number of values in inputBuffer to be accumulated +++ */ +++static inline void volk_gnsssdr_64f_accumulator_64f_a_sse3(double* result,const double* inputBuffer, unsigned int num_points){ +++ double returnValue = 0; +++ const unsigned int sse_iters = num_points / 2; /* number of full 2-double vectors */ +++ +++ const double* aPtr = inputBuffer; +++ +++ __VOLK_ATTR_ALIGNED(16) double tempBuffer[2]; /* receives the two per-lane partial sums */ +++ __m128d accumulator = _mm_setzero_pd(); +++ __m128d aVal = _mm_setzero_pd(); +++ +++ for(unsigned int number = 0; number < sse_iters; number++) +++ { +++ aVal = _mm_load_pd(aPtr); +++ accumulator = _mm_add_pd(accumulator, aVal); +++ aPtr += 2; +++ } +++ +++ _mm_store_pd((double*)tempBuffer,accumulator); +++ +++ for(int i = 0; i<2; ++i){ /* horizontal sum of the partial sums */ +++ returnValue += tempBuffer[i]; +++ } +++ +++ for(int i = 0; i<(num_points % 2); ++i){ /* scalar tail: at most one leftover value */ +++ returnValue += (*aPtr++); +++ } +++ +++ *result = returnValue; +++} +++#endif /* LV_HAVE_SSE3 */ +++ +++#ifdef LV_HAVE_GENERIC +++/*! 
+++ \brief Accumulates the values in the input buffer (portable C version: plain sequential sum) +++ \param result The accumulated result +++ \param inputBuffer The buffer of data to be accumulated +++ \param num_points The number of values in inputBuffer to be accumulated +++ */ +++static inline void volk_gnsssdr_64f_accumulator_64f_a_generic(double* result,const double* inputBuffer, unsigned int num_points){ +++ const double* aPtr = inputBuffer; +++ double returnValue = 0; +++ +++ for(unsigned int number = 0;number < num_points; number++){ +++ returnValue += (*aPtr++); +++ } +++ *result = returnValue; +++} +++#endif /* LV_HAVE_GENERIC */ +++#endif /* INCLUDED_volk_gnsssdr_64f_accumulator_64f_a_H */ ++\ No newline at end of file ++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_accumulator_s8i.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8i_accumulator_s8i.h ++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_accumulator_s8i.h 1970-01-01 01:00:00.000000000 +0100 +++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8i_accumulator_s8i.h 2014-10-15 01:55:08.000000000 +0200 ++@@ -0,0 +1,183 @@ +++/*! +++ * \file volk_gnsssdr_8i_accumulator_s8i.h +++ * \brief Volk protokernel: 8 bits (char) scalar accumulator +++ * \authors
    +++ *
  • Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com +++ *
+++ * +++ * Volk protokernel that implements an accumulator of char values +++ * +++ * ------------------------------------------------------------------------- +++ * +++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) +++ * +++ * GNSS-SDR is a software defined Global Navigation +++ * Satellite Systems receiver +++ * +++ * This file is part of GNSS-SDR. +++ * +++ * GNSS-SDR is free software: you can redistribute it and/or modify +++ * it under the terms of the GNU General Public License as published by +++ * the Free Software Foundation, either version 3 of the License, or +++ * (at your option) any later version. +++ * +++ * GNSS-SDR is distributed in the hope that it will be useful, +++ * but WITHOUT ANY WARRANTY; without even the implied warranty of +++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +++ * GNU General Public License for more details. +++ * +++ * You should have received a copy of the GNU General Public License +++ * along with GNSS-SDR. If not, see . +++ * +++ * ------------------------------------------------------------------------- +++ */ +++ +++#ifndef INCLUDED_volk_gnsssdr_8i_accumulator_s8i_u_H +++#define INCLUDED_volk_gnsssdr_8i_accumulator_s8i_u_H +++ +++#include +++#include +++#include +++ +++#ifdef LV_HAVE_SSE3 +++#include +++/*! 
+++ \brief Accumulates the values in the input buffer (SSE3 version using unaligned loads: sixteen chars are added per vector iteration and the num_points % 16 leftover values are added one by one) +++ \param result The accumulated result; it is a single char, so the sum is narrowed back to char width +++ \param inputBuffer The buffer of data to be accumulated +++ \param num_points The number of values in inputBuffer to be accumulated +++ */ +++static inline void volk_gnsssdr_8i_accumulator_s8i_u_sse3(char* result, const char* inputBuffer, unsigned int num_points){ +++ char returnValue = 0; +++ const unsigned int sse_iters = num_points / 16; /* number of full 16-char vectors */ +++ +++ const char* aPtr = inputBuffer; +++ +++ __VOLK_ATTR_ALIGNED(16) char tempBuffer[16]; /* receives the sixteen per-lane partial sums */ +++ __m128i accumulator = _mm_setzero_si128(); +++ __m128i aVal = _mm_setzero_si128(); +++ +++ for(unsigned int number = 0; number < sse_iters; number++){ +++ aVal = _mm_lddqu_si128((__m128i*)aPtr); +++ accumulator = _mm_add_epi8(accumulator, aVal); /* lane-wise 8-bit add */ +++ aPtr += 16; +++ } +++ _mm_storeu_si128((__m128i*)tempBuffer,accumulator); +++ +++ for(int i = 0; i<16; ++i){ /* horizontal sum of the partial sums */ +++ returnValue += tempBuffer[i]; +++ } +++ +++ for(int i = 0; i<(num_points % 16); ++i){ /* scalar tail: values that did not fill a whole vector */ +++ returnValue += (*aPtr++); +++ } +++ +++ *result = returnValue; +++} +++#endif /* LV_HAVE_SSE3 */ +++ +++#ifdef LV_HAVE_GENERIC +++/*! +++ \brief Accumulates the values in the input buffer (portable C version: plain sequential sum held in a char) +++ \param result The accumulated result +++ \param inputBuffer The buffer of data to be accumulated +++ \param num_points The number of values in inputBuffer to be accumulated +++ */ +++static inline void volk_gnsssdr_8i_accumulator_s8i_generic(char* result, const char* inputBuffer, unsigned int num_points){ +++ const char* aPtr = inputBuffer; +++ char returnValue = 0; +++ +++ for(unsigned int number = 0;number < num_points; number++){ +++ returnValue += (*aPtr++); +++ } +++ *result = returnValue; +++} +++#endif /* LV_HAVE_GENERIC */ +++ +++#endif /* INCLUDED_volk_gnsssdr_8i_accumulator_s8i_u_H */ +++ +++ +++#ifndef INCLUDED_volk_gnsssdr_8i_accumulator_s8i_a_H +++#define INCLUDED_volk_gnsssdr_8i_accumulator_s8i_a_H +++ +++#include +++#include +++#include +++ +++#ifdef LV_HAVE_SSE3 +++#include +++/*! 
+++ \brief Accumulates the values in the input buffer (SSE3 version using aligned loads: sixteen chars are added per vector iteration and the num_points % 16 leftover values are added one by one) +++ \param result The accumulated result; it is a single char, so the sum is narrowed back to char width +++ \param inputBuffer The buffer of data to be accumulated; read with _mm_load_si128, which requires a 16-byte aligned address +++ \param num_points The number of values in inputBuffer to be accumulated +++ */ +++static inline void volk_gnsssdr_8i_accumulator_s8i_a_sse3(char* result, const char* inputBuffer, unsigned int num_points){ +++ char returnValue = 0; +++ const unsigned int sse_iters = num_points / 16; /* number of full 16-char vectors */ +++ +++ const char* aPtr = inputBuffer; +++ +++ __VOLK_ATTR_ALIGNED(16) char tempBuffer[16]; /* receives the sixteen per-lane partial sums */ +++ __m128i accumulator = _mm_setzero_si128(); +++ __m128i aVal = _mm_setzero_si128(); +++ +++ for(unsigned int number = 0; number < sse_iters; number++){ +++ aVal = _mm_load_si128((__m128i*)aPtr); +++ accumulator = _mm_add_epi8(accumulator, aVal); /* lane-wise 8-bit add */ +++ aPtr += 16; +++ } +++ _mm_store_si128((__m128i*)tempBuffer,accumulator); +++ +++ for(int i = 0; i<16; ++i){ /* horizontal sum of the partial sums */ +++ returnValue += tempBuffer[i]; +++ } +++ +++ for(int i = 0; i<(num_points % 16); ++i){ /* scalar tail: values that did not fill a whole vector */ +++ returnValue += (*aPtr++); +++ } +++ +++ *result = returnValue; +++} +++#endif /* LV_HAVE_SSE3 */ +++ +++#ifdef LV_HAVE_GENERIC +++/*! +++ \brief Accumulates the values in the input buffer (portable C version: plain sequential sum held in a char) +++ \param result The accumulated result +++ \param inputBuffer The buffer of data to be accumulated +++ \param num_points The number of values in inputBuffer to be accumulated +++ */ +++static inline void volk_gnsssdr_8i_accumulator_s8i_a_generic(char* result, const char* inputBuffer, unsigned int num_points){ +++ const char* aPtr = inputBuffer; +++ char returnValue = 0; +++ +++ for(unsigned int number = 0;number < num_points; number++){ +++ returnValue += (*aPtr++); +++ } +++ *result = returnValue; +++} +++#endif /* LV_HAVE_GENERIC */ +++ +++#ifdef LV_HAVE_ORC +++/*! 
+++ \brief Accumulates the values in the input buffer by delegating to the external ORC routine, which writes a 16-bit result; one byte of that result is returned +++ \param result The accumulated result +++ \param inputBuffer The buffer of data to be accumulated +++ \param num_points The number of values in inputBuffer to be accumulated +++ */ +++extern void volk_gnsssdr_8i_accumulator_s8i_a_orc_impl(short* result, const char* inputBuffer, unsigned int num_points); +++static inline void volk_gnsssdr_8i_accumulator_s8i_u_orc(char* result, const char* inputBuffer, unsigned int num_points){ /* NOTE(review): named _u_orc but placed in the _a_H section and calling the _a_orc_impl routine - confirm the intended name */ +++ +++ short res = 0; +++ char* resc = (char*)&res; +++ resc++; /* resc now addresses the second byte of res in memory; NOTE(review): which half of the value that is depends on host byte order - confirm this is the intended byte */ +++ +++ volk_gnsssdr_8i_accumulator_s8i_a_orc_impl(&res, inputBuffer, num_points); +++ +++ *result = *resc; /* only that single byte of the 16-bit ORC result is returned */ +++} +++#endif /* LV_HAVE_ORC */ +++ +++#endif /* INCLUDED_volk_gnsssdr_8i_accumulator_s8i_a_H */ +++ ++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_index_max_16u.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8i_index_max_16u.h ++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_index_max_16u.h 1970-01-01 01:00:00.000000000 +0100 +++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8i_index_max_16u.h 2014-10-15 01:55:08.000000000 +0200 ++@@ -0,0 +1,493 @@ +++/*! +++ * \file volk_gnsssdr_8i_index_max_16u.h +++ * \brief Volk protokernel: calculates the index of the maximum value in a group of 8 bits (char) scalars +++ * \authors
    +++ *
  • Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com +++ *
+++ * +++ * Volk protokernel that returns the index of the maximum value of a group of 8 bits (char) scalars +++ * +++ * ------------------------------------------------------------------------- +++ * +++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) +++ * +++ * GNSS-SDR is a software defined Global Navigation +++ * Satellite Systems receiver +++ * +++ * This file is part of GNSS-SDR. +++ * +++ * GNSS-SDR is free software: you can redistribute it and/or modify +++ * it under the terms of the GNU General Public License as published by +++ * the Free Software Foundation, either version 3 of the License, or +++ * at your option) any later version. +++ * +++ * GNSS-SDR is distributed in the hope that it will be useful, +++ * but WITHOUT ANY WARRANTY; without even the implied warranty of +++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +++ * GNU General Public License for more details. +++ * +++ * You should have received a copy of the GNU General Public License +++ * along with GNSS-SDR. If not, see . +++ * +++ * ------------------------------------------------------------------------- +++ */ +++ +++#ifndef INCLUDED_volk_gnsssdr_8i_index_max_16u_u_H +++#define INCLUDED_volk_gnsssdr_8i_index_max_16u_u_H +++ +++#include +++#include +++#include +++ +++#ifdef LV_HAVE_AVX +++#include "immintrin.h" +++/*! 
+++ \brief Returns the index of the max value in src0 +++ \param target The index of the max value in src0 +++ \param src0 The buffer of data to be analysed +++ \param num_points The number of values in src0 to be analysed +++ */ +++static inline void volk_gnsssdr_8i_index_max_16u_u_avx(unsigned int* target, const char* src0, unsigned int num_points) { +++ if(num_points > 0){ +++ const unsigned int sse_iters = num_points / 32; +++ +++ char* basePtr = (char*)src0; +++ char* inputPtr = (char*)src0; +++ char max = src0[0]; +++ unsigned int index = 0; +++ __VOLK_ATTR_ALIGNED(32) char currentValuesBuffer[32]; +++ __m256i ones, compareResults, currentValues; +++ __m128i compareResultslo, compareResultshi, maxValues, lo, hi; +++ +++ ones = _mm256_set1_epi8(0xFF); +++ maxValues = _mm_set1_epi8(max); +++ +++ for(unsigned int number = 0; number < sse_iters; number++) +++ { +++ currentValues = _mm256_lddqu_si256((__m256i*)inputPtr); +++ +++ lo = _mm256_castsi256_si128(currentValues); +++ hi = _mm256_extractf128_si256(currentValues,1); +++ +++ compareResultslo = _mm_cmpgt_epi8(maxValues, lo); +++ compareResultshi = _mm_cmpgt_epi8(maxValues, hi); +++ +++ //compareResults = _mm256_set_m128i(compareResultshi , compareResultslo); //not defined in some versions of immintrin.h +++ compareResults = _mm256_insertf128_si256(_mm256_castsi128_si256(compareResultslo),(compareResultshi),1); +++ +++ if (!_mm256_testc_si256(compareResults, ones)) +++ { +++ _mm256_storeu_si256((__m256i*)¤tValuesBuffer, currentValues); +++ +++ for(int i = 0; i < 32; i++) +++ { +++ if(currentValuesBuffer[i] > max) +++ { +++ index = inputPtr - basePtr + i; +++ max = currentValuesBuffer[i]; +++ } +++ } +++ maxValues = _mm_set1_epi8(max); +++ } +++ +++ inputPtr += 32; +++ } +++ +++ for(int i = 0; i<(num_points % 32); ++i) +++ { +++ if(src0[i] > max) +++ { +++ index = i; +++ max = src0[i]; +++ } +++ } +++ target[0] = index; +++ } +++} +++ +++#endif /*LV_HAVE_AVX*/ +++ +++#ifdef LV_HAVE_SSE4_1 +++#include +++/*! 
+++ \brief Returns the index of the max value in src0 +++ \param target The index of the max value in src0 +++ \param src0 The buffer of data to be analysed +++ \param num_points The number of values in src0 to be analysed +++ */ +++static inline void volk_gnsssdr_8i_index_max_16u_u_sse4_1(unsigned int* target, const char* src0, unsigned int num_points) { +++ if(num_points > 0){ +++ const unsigned int sse_iters = num_points / 16; +++ +++ char* basePtr = (char*)src0; +++ char* inputPtr = (char*)src0; +++ char max = src0[0]; +++ unsigned int index = 0; +++ __VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16]; +++ __m128i maxValues, compareResults, currentValues; +++ +++ maxValues = _mm_set1_epi8(max); +++ +++ for(unsigned int number = 0; number < sse_iters; number++) +++ { +++ currentValues = _mm_lddqu_si128((__m128i*)inputPtr); +++ +++ compareResults = _mm_cmpgt_epi8(maxValues, currentValues); +++ +++ if (!_mm_test_all_ones(compareResults)) +++ { +++ _mm_storeu_si128((__m128i*)¤tValuesBuffer, currentValues); +++ +++ for(int i = 0; i < 16; i++) +++ { +++ if(currentValuesBuffer[i] > max) +++ { +++ index = inputPtr - basePtr + i; +++ max = currentValuesBuffer[i]; +++ } +++ } +++ maxValues = _mm_set1_epi8(max); +++ } +++ +++ inputPtr += 16; +++ } +++ +++ for(int i = 0; i<(num_points % 16); ++i) +++ { +++ if(src0[i] > max) +++ { +++ index = i; +++ max = src0[i]; +++ } +++ } +++ target[0] = index; +++ } +++} +++ +++#endif /*LV_HAVE_SSE4_1*/ +++ +++#ifdef LV_HAVE_SSE2 +++#include +++/*! 
+++ \brief Returns the index of the max value in src0 +++ \param target The index of the max value in src0 +++ \param src0 The buffer of data to be analysed +++ \param num_points The number of values in src0 to be analysed +++ */ +++static inline void volk_gnsssdr_8i_index_max_16u_u_sse2(unsigned int* target, const char* src0, unsigned int num_points) { +++ if(num_points > 0){ +++ const unsigned int sse_iters = num_points / 16; +++ +++ char* basePtr = (char*)src0; +++ char* inputPtr = (char*)src0; +++ char max = src0[0]; +++ unsigned int index = 0; +++ unsigned short mask; +++ __VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16]; +++ __m128i maxValues, compareResults, currentValues; +++ +++ maxValues = _mm_set1_epi8(max); +++ +++ for(unsigned int number = 0; number < sse_iters; number++) +++ { +++ currentValues = _mm_loadu_si128((__m128i*)inputPtr); +++ compareResults = _mm_cmpgt_epi8(maxValues, currentValues); +++ mask = _mm_movemask_epi8(compareResults); +++ +++ if (mask != 0xFFFF) +++ { +++ _mm_storeu_si128((__m128i*)¤tValuesBuffer, currentValues); +++ mask = ~mask; +++ int i = 0; +++ while (mask > 0) +++ { +++ if ((mask & 1) == 1) +++ { +++ if(currentValuesBuffer[i] > max) +++ { +++ index = inputPtr - basePtr + i; +++ max = currentValuesBuffer[i]; +++ } +++ } +++ i++; +++ mask >>= 1; +++ } +++ maxValues = _mm_set1_epi8(max); +++ } +++ inputPtr += 16; +++ } +++ +++ for(int i = 0; i<(num_points % 16); ++i) +++ { +++ if(src0[i] > max) +++ { +++ index = i; +++ max = src0[i]; +++ } +++ } +++ target[0] = index; +++ } +++} +++ +++#endif /*LV_HAVE_SSE2*/ +++ +++#ifdef LV_HAVE_GENERIC +++/*! 
+++ \brief Returns the index of the max value in src0 +++ \param target The index of the max value in src0 +++ \param src0 The buffer of data to be analysed +++ \param num_points The number of values in src0 to be analysed +++ */ +++static inline void volk_gnsssdr_8i_index_max_16u_generic(unsigned int* target, const char* src0, unsigned int num_points) { +++ +++ if(num_points > 0) +++ { +++ char max = src0[0]; +++ unsigned int index = 0; +++ +++ for(unsigned int i = 1; i < num_points; ++i) +++ { +++ if(src0[i] > max) +++ { +++ index = i; +++ max = src0[i]; +++ } +++ } +++ target[0] = index; +++ } +++} +++ +++#endif /*LV_HAVE_GENERIC*/ +++ +++#endif /*INCLUDED_volk_gnsssdr_8i_index_max_16u_u_H*/ +++ +++ +++#ifndef INCLUDED_volk_gnsssdr_8i_index_max_16u_a_H +++#define INCLUDED_volk_gnsssdr_8i_index_max_16u_a_H +++ +++#include +++#include +++#include +++ +++#ifdef LV_HAVE_AVX +++#include "immintrin.h" +++/*! +++ \brief Returns the index of the max value in src0 +++ \param target The index of the max value in src0 +++ \param src0 The buffer of data to be analysed +++ \param num_points The number of values in src0 to be analysed +++ */ +++static inline void volk_gnsssdr_8i_index_max_16u_a_avx(unsigned int* target, const char* src0, unsigned int num_points) { +++ if(num_points > 0){ +++ const unsigned int sse_iters = num_points / 32; +++ +++ char* basePtr = (char*)src0; +++ char* inputPtr = (char*)src0; +++ char max = src0[0]; +++ unsigned int index = 0; +++ __VOLK_ATTR_ALIGNED(32) char currentValuesBuffer[32]; +++ __m256i ones, compareResults, currentValues; +++ __m128i compareResultslo, compareResultshi, maxValues, lo, hi; +++ +++ ones = _mm256_set1_epi8(0xFF); +++ maxValues = _mm_set1_epi8(max); +++ +++ for(unsigned int number = 0; number < sse_iters; number++) +++ { +++ currentValues = _mm256_load_si256((__m256i*)inputPtr); +++ +++ lo = _mm256_castsi256_si128(currentValues); +++ hi = _mm256_extractf128_si256(currentValues,1); +++ +++ compareResultslo = 
_mm_cmpgt_epi8(maxValues, lo); +++ compareResultshi = _mm_cmpgt_epi8(maxValues, hi); +++ +++ //compareResults = _mm256_set_m128i(compareResultshi , compareResultslo); //not defined in some versions of immintrin.h +++ compareResults = _mm256_insertf128_si256(_mm256_castsi128_si256(compareResultslo),(compareResultshi),1); +++ +++ if (!_mm256_testc_si256(compareResults, ones)) +++ { +++ _mm256_store_si256((__m256i*)¤tValuesBuffer, currentValues); +++ +++ for(int i = 0; i < 32; i++) +++ { +++ if(currentValuesBuffer[i] > max) +++ { +++ index = inputPtr - basePtr + i; +++ max = currentValuesBuffer[i]; +++ } +++ } +++ maxValues = _mm_set1_epi8(max); +++ } +++ +++ inputPtr += 32; +++ } +++ +++ for(int i = 0; i<(num_points % 32); ++i) +++ { +++ if(src0[i] > max) +++ { +++ index = i; +++ max = src0[i]; +++ } +++ } +++ target[0] = index; +++ } +++} +++ +++#endif /*LV_HAVE_AVX*/ +++ +++#ifdef LV_HAVE_SSE4_1 +++#include "smmintrin.h" +++#include "emmintrin.h" +++/*! +++ \brief Returns the index of the max value in src0 +++ \param target The index of the max value in src0 +++ \param src0 The buffer of data to be analysed +++ \param num_points The number of values in src0 to be analysed +++ */ +++static inline void volk_gnsssdr_8i_index_max_16u_a_sse4_1(unsigned int* target, const char* src0, unsigned int num_points) { +++ if(num_points > 0){ +++ const unsigned int sse_iters = num_points / 16; +++ +++ char* basePtr = (char*)src0; +++ char* inputPtr = (char*)src0; +++ char max = src0[0]; +++ unsigned int index = 0; +++ __VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16]; +++ __m128i maxValues, compareResults, currentValues; +++ +++ maxValues = _mm_set1_epi8(max); +++ +++ for(unsigned int number = 0; number < sse_iters; number++) +++ { +++ currentValues = _mm_load_si128((__m128i*)inputPtr); +++ +++ compareResults = _mm_cmpgt_epi8(maxValues, currentValues); +++ +++ if (!_mm_test_all_ones(compareResults)) +++ { +++ _mm_store_si128((__m128i*)¤tValuesBuffer, currentValues); +++ +++ 
for(int i = 0; i < 16; i++) +++ { +++ if(currentValuesBuffer[i] > max) +++ { +++ index = inputPtr - basePtr + i; +++ max = currentValuesBuffer[i]; +++ } +++ } +++ maxValues = _mm_set1_epi8(max); +++ } +++ +++ inputPtr += 16; +++ } +++ +++ for(int i = 0; i<(num_points % 16); ++i) +++ { +++ if(src0[i] > max) +++ { +++ index = i; +++ max = src0[i]; +++ } +++ } +++ target[0] = index; +++ } +++} +++ +++#endif /*LV_HAVE_SSE4_1*/ +++ +++#ifdef LV_HAVE_SSE2 +++#include "emmintrin.h" +++/*! +++ \brief Returns the index of the max value in src0 +++ \param target The index of the max value in src0 +++ \param src0 The buffer of data to be analysed +++ \param num_points The number of values in src0 to be analysed +++ */ +++static inline void volk_gnsssdr_8i_index_max_16u_a_sse2(unsigned int* target, const char* src0, unsigned int num_points) { +++ if(num_points > 0){ +++ const unsigned int sse_iters = num_points / 16; +++ +++ char* basePtr = (char*)src0; +++ char* inputPtr = (char*)src0; +++ char max = src0[0]; +++ unsigned int index = 0; +++ unsigned short mask; +++ __VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16]; +++ __m128i maxValues, compareResults, currentValues; +++ +++ maxValues = _mm_set1_epi8(max); +++ +++ for(unsigned int number = 0; number < sse_iters; number++) +++ { +++ currentValues = _mm_load_si128((__m128i*)inputPtr); +++ compareResults = _mm_cmpgt_epi8(maxValues, currentValues); +++ mask = _mm_movemask_epi8(compareResults); +++ +++ if (mask != 0xFFFF) +++ { +++ _mm_store_si128((__m128i*)¤tValuesBuffer, currentValues); +++ mask = ~mask; +++ int i = 0; +++ while (mask > 0) +++ { +++ if ((mask & 1) == 1) +++ { +++ if(currentValuesBuffer[i] > max) +++ { +++ index = inputPtr - basePtr + i; +++ max = currentValuesBuffer[i]; +++ } +++ } +++ i++; +++ mask >>= 1; +++ } +++ maxValues = _mm_set1_epi8(max); +++ } +++ inputPtr += 16; +++ } +++ +++ for(int i = 0; i<(num_points % 16); ++i) +++ { +++ if(src0[i] > max) +++ { +++ index = i; +++ max = src0[i]; +++ } +++ } 
+++ target[0] = index; +++ } +++} +++ +++#endif /*LV_HAVE_SSE2*/ +++ +++#ifdef LV_HAVE_GENERIC +++/*! +++ \brief Returns the index of the max value in src0 +++ \param target The index of the max value in src0 +++ \param src0 The buffer of data to be analysed +++ \param num_points The number of values in src0 to be analysed +++ */ +++static inline void volk_gnsssdr_8i_index_max_16u_a_generic(unsigned int* target, const char* src0, unsigned int num_points) { +++ +++ if(num_points > 0) +++ { +++ char max = src0[0]; +++ unsigned int index = 0; +++ +++ for(unsigned int i = 1; i < num_points; ++i) +++ { +++ if(src0[i] > max) +++ { +++ index = i; +++ max = src0[i]; +++ } +++ } +++ target[0] = index; +++ } +++} +++ +++#endif /*LV_HAVE_GENERIC*/ +++ +++#endif /*INCLUDED_volk_gnsssdr_8i_index_max_16u_a_H*/ ++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_max_s8i.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8i_max_s8i.h ++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_max_s8i.h 1970-01-01 01:00:00.000000000 +0100 +++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8i_max_s8i.h 2014-10-15 01:55:08.000000000 +0200 ++@@ -0,0 +1,327 @@ +++/*! +++ * \file volk_gnsssdr_8i_max_s8i.h +++ * \brief Volk protokernel: calculates the maximum value in a group of 8 bits (char) scalars +++ * \authors
    +++ *
  • Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com +++ *
+++ * +++ * Volk protokernel that returns the maximum value of a group of 8 bits (char) scalars +++ * +++ * ------------------------------------------------------------------------- +++ * +++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) +++ * +++ * GNSS-SDR is a software defined Global Navigation +++ * Satellite Systems receiver +++ * +++ * This file is part of GNSS-SDR. +++ * +++ * GNSS-SDR is free software: you can redistribute it and/or modify +++ * it under the terms of the GNU General Public License as published by +++ * the Free Software Foundation, either version 3 of the License, or +++ * at your option) any later version. +++ * +++ * GNSS-SDR is distributed in the hope that it will be useful, +++ * but WITHOUT ANY WARRANTY; without even the implied warranty of +++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +++ * GNU General Public License for more details. +++ * +++ * You should have received a copy of the GNU General Public License +++ * along with GNSS-SDR. If not, see . +++ * +++ * ------------------------------------------------------------------------- +++ */ +++ +++#ifndef INCLUDED_volk_gnsssdr_8i_max_s8i_u_H +++#define INCLUDED_volk_gnsssdr_8i_max_s8i_u_H +++ +++#include +++#include +++#include +++ +++#ifdef LV_HAVE_SSE4_1 +++#include +++/*! 
+++ \brief Returns the max value in src0 +++ \param target The max value in src0 +++ \param src0 The buffer of data to be analysed +++ \param num_points The number of values in src0 to be analysed +++ */ +++static inline void volk_gnsssdr_8i_max_s8i_u_sse4_1(char target, const char* src0, unsigned int num_points) { +++ if(num_points > 0){ +++ const unsigned int sse_iters = num_points / 16; +++ +++ char* inputPtr = (char*)src0; +++ char max = src0[0]; +++ __VOLK_ATTR_ALIGNED(16) char maxValuesBuffer[16]; +++ __m128i maxValues, compareResults, currentValues; +++ +++ maxValues = _mm_set1_epi8(max); +++ +++ for(unsigned int number = 0; number < sse_iters; number++) +++ { +++ currentValues = _mm_loadu_si128((__m128i*)inputPtr); +++ compareResults = _mm_cmpgt_epi8(maxValues, currentValues); +++ maxValues = _mm_blendv_epi8(currentValues, maxValues, compareResults); +++ inputPtr += 16; +++ } +++ +++ _mm_storeu_si128((__m128i*)maxValuesBuffer, maxValues); +++ +++ for(int i = 0; i<16; ++i) +++ { +++ if(maxValuesBuffer[i] > max) +++ { +++ max = maxValuesBuffer[i]; +++ } +++ } +++ +++ for(int i = 0; i<(num_points % 16); ++i) +++ { +++ if(src0[i] > max) +++ { +++ max = src0[i]; +++ } +++ } +++ target = max; +++ } +++} +++ +++#endif /*LV_HAVE_SSE4_1*/ +++ +++#ifdef LV_HAVE_SSE2 +++#include +++/*! 
+++ \brief Returns the max value in src0 +++ \param target The max value in src0 +++ \param src0 The buffer of data to be analysed +++ \param num_points The number of values in src0 to be analysed +++ */ +++static inline void volk_gnsssdr_8i_max_s8i_u_sse2(char target, const char* src0, unsigned int num_points) { +++ if(num_points > 0){ +++ const unsigned int sse_iters = num_points / 16; +++ +++ char* inputPtr = (char*)src0; +++ char max = src0[0]; +++ unsigned short mask; +++ __VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16]; +++ __m128i maxValues, compareResults, currentValues; +++ +++ maxValues = _mm_set1_epi8(max); +++ +++ for(unsigned int number = 0; number < sse_iters; number++) +++ { +++ currentValues = _mm_loadu_si128((__m128i*)inputPtr); +++ compareResults = _mm_cmpgt_epi8(maxValues, currentValues); +++ mask = _mm_movemask_epi8(compareResults); +++ +++ if (mask != 0xFFFF) +++ { +++ _mm_storeu_si128((__m128i*)¤tValuesBuffer, currentValues); +++ mask = ~mask; +++ int i = 0; +++ while (mask > 0) +++ { +++ if ((mask & 1) == 1) +++ { +++ if(currentValuesBuffer[i] > max) +++ { +++ max = currentValuesBuffer[i]; +++ } +++ } +++ i++; +++ mask >>= 1; +++ } +++ maxValues = _mm_set1_epi8(max); +++ } +++ inputPtr += 16; +++ } +++ +++ for(int i = 0; i<(num_points % 16); ++i) +++ { +++ if(src0[i] > max) +++ { +++ max = src0[i]; +++ } +++ } +++ target = max; +++ } +++} +++ +++#endif /*LV_HAVE_SSE2*/ +++ +++#ifdef LV_HAVE_GENERIC +++/*! 
+++ \brief Returns the max value in src0 +++ \param target The max value in src0 +++ \param src0 The buffer of data to be analysed +++ \param num_points The number of values in src0 to be analysed +++ */ +++static inline void volk_gnsssdr_8i_max_s8i_generic(char target, const char* src0, unsigned int num_points) { +++ if(num_points > 0) +++ { +++ char max = src0[0]; +++ +++ for(unsigned int i = 1; i < num_points; ++i) +++ { +++ if(src0[i] > max) +++ { +++ max = src0[i]; +++ } +++ } +++ target = max; +++ } +++} +++ +++#endif /*LV_HAVE_GENERIC*/ +++ +++#endif /*INCLUDED_volk_gnsssdr_8i_max_s8i_u_H*/ +++ +++ +++#ifndef INCLUDED_volk_gnsssdr_8i_max_s8i_a_H +++#define INCLUDED_volk_gnsssdr_8i_max_s8i_a_H +++ +++#include +++#include +++#include +++ +++#ifdef LV_HAVE_SSE4_1 +++#include "smmintrin.h" +++/*! +++ \brief Returns the max value in src0 +++ \param target The max value in src0 +++ \param src0 The buffer of data to be analysed +++ \param num_points The number of values in src0 to be analysed +++ */ +++static inline void volk_gnsssdr_8i_max_s8i_a_sse4_1(char target, const char* src0, unsigned int num_points) { +++ if(num_points > 0){ +++ const unsigned int sse_iters = num_points / 16; +++ +++ char* inputPtr = (char*)src0; +++ char max = src0[0]; +++ __VOLK_ATTR_ALIGNED(16) char maxValuesBuffer[16]; +++ __m128i maxValues, compareResults, currentValues; +++ +++ maxValues = _mm_set1_epi8(max); +++ +++ for(unsigned int number = 0; number < sse_iters; number++) +++ { +++ currentValues = _mm_load_si128((__m128i*)inputPtr); +++ compareResults = _mm_cmpgt_epi8(maxValues, currentValues); +++ maxValues = _mm_blendv_epi8(currentValues, maxValues, compareResults); +++ inputPtr += 16; +++ } +++ +++ _mm_store_si128((__m128i*)maxValuesBuffer, maxValues); +++ +++ for(int i = 0; i<16; ++i) +++ { +++ if(maxValuesBuffer[i] > max) +++ { +++ max = maxValuesBuffer[i]; +++ } +++ } +++ +++ for(int i = 0; i<(num_points % 16); ++i) +++ { +++ if(src0[i] > max) +++ { +++ max = src0[i]; +++ } 
+++ } +++ target = max; +++ } +++} +++ +++#endif /*LV_HAVE_SSE4_1*/ +++ +++#ifdef LV_HAVE_SSE2 +++#include "emmintrin.h" +++/*! +++ \brief Returns the max value in src0 +++ \param target The max value in src0 +++ \param src0 The buffer of data to be analysed +++ \param num_points The number of values in src0 to be analysed +++ */ +++static inline void volk_gnsssdr_8i_max_s8i_a_sse2(char target, const char* src0, unsigned int num_points) { +++ if(num_points > 0){ +++ const unsigned int sse_iters = num_points / 16; +++ +++ char* inputPtr = (char*)src0; +++ char max = src0[0]; +++ unsigned short mask; +++ __VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16]; +++ __m128i maxValues, compareResults, currentValues; +++ +++ maxValues = _mm_set1_epi8(max); +++ +++ for(unsigned int number = 0; number < sse_iters; number++) +++ { +++ currentValues = _mm_load_si128((__m128i*)inputPtr); +++ compareResults = _mm_cmpgt_epi8(maxValues, currentValues); +++ mask = _mm_movemask_epi8(compareResults); +++ +++ if (mask != 0xFFFF) +++ { +++ _mm_store_si128((__m128i*)¤tValuesBuffer, currentValues); +++ mask = ~mask; +++ int i = 0; +++ while (mask > 0) +++ { +++ if ((mask & 1) == 1) +++ { +++ if(currentValuesBuffer[i] > max) +++ { +++ max = currentValuesBuffer[i]; +++ } +++ } +++ i++; +++ mask >>= 1; +++ } +++ maxValues = _mm_set1_epi8(max); +++ } +++ inputPtr += 16; +++ } +++ +++ for(int i = 0; i<(num_points % 16); ++i) +++ { +++ if(src0[i] > max) +++ { +++ max = src0[i]; +++ } +++ } +++ target = max; +++ } +++} +++ +++#endif /*LV_HAVE_SSE2*/ +++ +++#ifdef LV_HAVE_GENERIC +++/*! 
+++ \brief Returns the max value in src0
+++ \param target The max value in src0
+++ \param src0 The buffer of data to be analysed
+++ \param num_points The number of values in src0 to be analysed
+++ */
+++static inline void volk_gnsssdr_8i_max_s8i_a_generic(char target, const char* src0, unsigned int num_points) {
+++    if(num_points > 0)
+++    {
+++        if(num_points > 0) /* same condition as the enclosing if; always true here */
+++        {
+++            char max = src0[0];
+++
+++            for(unsigned int i = 1; i < num_points; ++i)
+++            {
+++                if(src0[i] > max)
+++                {
+++                    max = src0[i];
+++                }
+++            }
+++            target = max; /* NOTE(review): target is a by-value char, so this assignment is not visible to the caller */
+++        }
+++    }
+++}
+++
+++#endif /*LV_HAVE_GENERIC*/
+++
+++#endif /*INCLUDED_volk_gnsssdr_8i_max_s8i_a_H*/
++\ No newline at end of file
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_x2_add_8i.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8i_x2_add_8i.h
++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_x2_add_8i.h 1970-01-01 01:00:00.000000000 +0100
+++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8i_x2_add_8i.h 2014-10-15 01:55:08.000000000 +0200
++@@ -0,0 +1,184 @@
+++/*!
+++ * \file volk_gnsssdr_8i_x2_add_8i.h
+++ * \brief Volk protokernel: adds pairs of 8 bits (char) scalars
+++ * \authors
    +++ *
  • Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com +++ *
+++ * +++ * Volk protokernel that adds pairs of 8 bits (char) scalars +++ * +++ * ------------------------------------------------------------------------- +++ * +++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) +++ * +++ * GNSS-SDR is a software defined Global Navigation +++ * Satellite Systems receiver +++ * +++ * This file is part of GNSS-SDR. +++ * +++ * GNSS-SDR is free software: you can redistribute it and/or modify +++ * it under the terms of the GNU General Public License as published by +++ * the Free Software Foundation, either version 3 of the License, or +++ * at your option) any later version. +++ * +++ * GNSS-SDR is distributed in the hope that it will be useful, +++ * but WITHOUT ANY WARRANTY; without even the implied warranty of +++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +++ * GNU General Public License for more details. +++ * +++ * You should have received a copy of the GNU General Public License +++ * along with GNSS-SDR. If not, see . +++ * +++ * ------------------------------------------------------------------------- +++ */ +++ +++#ifndef INCLUDED_volk_gnsssdr_8i_x2_add_8i_u_H +++#define INCLUDED_volk_gnsssdr_8i_x2_add_8i_u_H +++ +++#include +++#include +++ +++#ifdef LV_HAVE_SSE2 +++#include "pmmintrin.h" +++/*! 
+++ \brief Adds the two input vectors and store their results in the third vector +++ \param cVector The vector where the results will be stored +++ \param aVector One of the vectors to be added +++ \param bVector One of the vectors to be added +++ \param num_points The number of values in aVector and bVector to be added together and stored into cVector +++ */ +++static inline void volk_gnsssdr_8i_x2_add_8i_u_sse2(char* cVector, const char* aVector, const char* bVector, unsigned int num_points){ +++ +++ const unsigned int sse_iters = num_points / 16; +++ +++ char* cPtr = cVector; +++ const char* aPtr = aVector; +++ const char* bPtr= bVector; +++ +++ __m128i aVal, bVal, cVal; +++ +++ for(int number = 0; number < sse_iters; number++){ +++ +++ aVal = _mm_lddqu_si128((__m128i*)aPtr); +++ bVal = _mm_lddqu_si128((__m128i*)bPtr); +++ +++ cVal = _mm_add_epi8(aVal, bVal); +++ +++ _mm_storeu_si128((__m128i*)cPtr,cVal); // Store the results back into the C container +++ +++ aPtr += 16; +++ bPtr += 16; +++ cPtr += 16; +++ } +++ +++ for(int i = 0; i<(num_points % 16); ++i) +++ { +++ *cPtr++ = (*aPtr++) + (*bPtr++); +++ } +++} +++#endif /* LV_HAVE_SSE2 */ +++ +++#ifdef LV_HAVE_GENERIC +++/*! 
+++ \brief Adds the two input vectors and store their results in the third vector +++ \param cVector The vector where the results will be stored +++ \param aVector One of the vectors to be added +++ \param bVector One of the vectors to be added +++ \param num_points The number of values in aVector and bVector to be added together and stored into cVector +++ */ +++static inline void volk_gnsssdr_8i_x2_add_8i_generic(char* cVector, const char* aVector, const char* bVector, unsigned int num_points){ +++ char* cPtr = cVector; +++ const char* aPtr = aVector; +++ const char* bPtr= bVector; +++ unsigned int number = 0; +++ +++ for(number = 0; number < num_points; number++){ +++ *cPtr++ = (*aPtr++) + (*bPtr++); +++ } +++} +++#endif /* LV_HAVE_GENERIC */ +++ +++#endif /* INCLUDED_volk_gnsssdr_8i_x2_add_8i_u_H */ +++ +++ +++#ifndef INCLUDED_volk_gnsssdr_8i_x2_add_8i_a_H +++#define INCLUDED_volk_gnsssdr_8i_x2_add_8i_a_H +++ +++#include +++#include +++ +++#ifdef LV_HAVE_SSE2 +++#include "pmmintrin.h" +++/*! 
+++ \brief Adds the two input vectors and store their results in the third vector
+++ \param cVector The vector where the results will be stored
+++ \param aVector One of the vectors to be added
+++ \param bVector One of the vectors to be added
+++ \param num_points The number of values in aVector and bVector to be added together and stored into cVector
+++ */
+++static inline void volk_gnsssdr_8i_x2_add_8i_a_sse2(char* cVector, const char* aVector, const char* bVector, unsigned int num_points){
+++
+++    const unsigned int sse_iters = num_points / 16; /* whole 16-byte groups */
+++
+++    char* cPtr = cVector;
+++    const char* aPtr = aVector;
+++    const char* bPtr= bVector;
+++
+++    __m128i aVal, bVal, cVal;
+++
+++    for(int number = 0; number < sse_iters; number++){
+++
+++        aVal = _mm_load_si128((__m128i*)aPtr);
+++        bVal = _mm_load_si128((__m128i*)bPtr);
+++
+++        cVal = _mm_add_epi8(aVal, bVal); /* 16 byte-wise sums at once */
+++
+++        _mm_store_si128((__m128i*)cPtr,cVal); // Store the results back into the C container
+++
+++        aPtr += 16;
+++        bPtr += 16;
+++        cPtr += 16;
+++    }
+++
+++    for(int i = 0; i<(num_points % 16); ++i) /* leftover values, continuing from where the vector loop stopped */
+++    {
+++        *cPtr++ = (*aPtr++) + (*bPtr++);
+++    }
+++}
+++#endif /* LV_HAVE_SSE2 */
+++
+++#ifdef LV_HAVE_GENERIC
+++/*!
+++ \brief Adds the two input vectors and store their results in the third vector (plain C loop, one element per iteration)
+++ \param cVector The vector where the results will be stored
+++ \param aVector One of the vectors to be added
+++ \param bVector One of the vectors to be added
+++ \param num_points The number of values in aVector and bVector to be added together and stored into cVector
+++ */
+++static inline void volk_gnsssdr_8i_x2_add_8i_a_generic(char* cVector, const char* aVector, const char* bVector, unsigned int num_points){
+++    char* cPtr = cVector;
+++    const char* aPtr = aVector;
+++    const char* bPtr= bVector;
+++    unsigned int number = 0;
+++
+++    for(number = 0; number < num_points; number++){
+++        *cPtr++ = (*aPtr++) + (*bPtr++);
+++    }
+++}
+++#endif /* LV_HAVE_GENERIC */
+++
+++#ifdef LV_HAVE_ORC
+++/*!
+++ \brief Adds the two input vectors and store their results in the third vector
+++ \param cVector The vector where the results will be stored
+++ \param aVector One of the vectors to be added
+++ \param bVector One of the vectors to be added
+++ \param num_points The number of values in aVector and bVector to be added together and stored into cVector
+++ */
+++extern void volk_gnsssdr_8i_x2_add_8i_a_orc_impl(char* cVector, const char* aVector, const char* bVector, unsigned int num_points);
+++static inline void volk_gnsssdr_8i_x2_add_8i_u_orc(char* cVector, const char* aVector, const char* bVector, unsigned int num_points){
+++    volk_gnsssdr_8i_x2_add_8i_a_orc_impl(cVector, aVector, bVector, num_points); /* thin wrapper: forwards all arguments unchanged to the external implementation */
+++}
+++#endif /* LV_HAVE_ORC */
+++
+++#endif /* INCLUDED_volk_gnsssdr_8i_x2_add_8i_a_H */
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_conjugate_8ic.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_conjugate_8ic.h
++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_conjugate_8ic.h 1970-01-01 01:00:00.000000000 +0100
+++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_conjugate_8ic.h 2014-10-15 01:55:08.000000000 +0200
++@@ -0,0 +1,326 @@
+++/*!
+++ * \file volk_gnsssdr_8ic_conjugate_8ic.h
+++ * \brief Volk protokernel: calculates the conjugate of a 16 bits vector
+++ * \authors
    +++ *
  • Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com +++ *
+++ * +++ * Volk protokernel that calculates the conjugate of a +++ * 16 bits vector (8 bits the real part and 8 bits the imaginary part) +++ * +++ * ------------------------------------------------------------------------- +++ * +++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) +++ * +++ * GNSS-SDR is a software defined Global Navigation +++ * Satellite Systems receiver +++ * +++ * This file is part of GNSS-SDR. +++ * +++ * GNSS-SDR is free software: you can redistribute it and/or modify +++ * it under the terms of the GNU General Public License as published by +++ * the Free Software Foundation, either version 3 of the License, or +++ * at your option) any later version. +++ * +++ * GNSS-SDR is distributed in the hope that it will be useful, +++ * but WITHOUT ANY WARRANTY; without even the implied warranty of +++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +++ * GNU General Public License for more details. +++ * +++ * You should have received a copy of the GNU General Public License +++ * along with GNSS-SDR. If not, see . +++ * +++ * ------------------------------------------------------------------------- +++ */ +++ +++#ifndef INCLUDED_volk_gnsssdr_8ic_conjugate_8ic_u_H +++#define INCLUDED_volk_gnsssdr_8ic_conjugate_8ic_u_H +++ +++#include +++#include +++#include +++ +++#ifdef LV_HAVE_AVX +++#include "immintrin.h" +++/*! +++ \brief Takes the conjugate of an unsigned char vector. 
+++ \param cVector The vector where the results will be stored +++ \param aVector Vector to be conjugated +++ \param num_points The number of unsigned char values in aVector to be conjugated and stored into cVector +++ */ +++static inline void volk_gnsssdr_8ic_conjugate_8ic_u_avx(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){ +++ const unsigned int sse_iters = num_points / 16; +++ +++ lv_8sc_t* c = cVector; +++ const lv_8sc_t* a = aVector; +++ +++ __m256 tmp; +++ __m128i tmp128lo, tmp128hi; +++ __m256 conjugator1 = _mm256_castsi256_ps(_mm256_setr_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255)); +++ __m128i conjugator2 = _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1); +++ +++ for (int i = 0; i < sse_iters; ++i) +++ { +++ tmp = _mm256_loadu_ps((float*)a); +++ tmp = _mm256_xor_ps(tmp, conjugator1); +++ tmp128lo = _mm256_castsi256_si128(_mm256_castps_si256(tmp)); +++ tmp128lo = _mm_add_epi8(tmp128lo, conjugator2); +++ tmp128hi = _mm256_extractf128_si256(_mm256_castps_si256(tmp),1); +++ tmp128hi = _mm_add_epi8(tmp128hi, conjugator2); +++ //tmp = _mm256_set_m128i(tmp128hi , tmp128lo); //not defined in some versions of immintrin.h +++ tmp = _mm256_insertf128_si256(_mm256_castsi128_si256(tmp128lo),(tmp128hi),1); +++ _mm256_storeu_ps((float*)c, tmp); +++ +++ a += 16; +++ c += 16; +++ } +++ +++ for (int i = 0; i<(num_points % 16); ++i) +++ { +++ *c++ = lv_conj(*a++); +++ } +++} +++#endif /* LV_HAVE_AVX */ +++ +++#ifdef LV_HAVE_SSSE3 +++#include "tmmintrin.h" +++/*! +++ \brief Takes the conjugate of an unsigned char vector. 
+++ \param cVector The vector where the results will be stored +++ \param aVector Vector to be conjugated +++ \param num_points The number of unsigned char values in aVector to be conjugated and stored into cVector +++ */ +++static inline void volk_gnsssdr_8ic_conjugate_8ic_u_ssse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){ +++ const unsigned int sse_iters = num_points / 8; +++ +++ lv_8sc_t* c = cVector; +++ const lv_8sc_t* a = aVector; +++ __m128i tmp; +++ +++ __m128i conjugator = _mm_setr_epi8(1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1); +++ +++ for (int i = 0; i < sse_iters; ++i) +++ { +++ tmp = _mm_lddqu_si128((__m128i*)a); +++ tmp = _mm_sign_epi8(tmp, conjugator); +++ _mm_storeu_si128((__m128i*)c, tmp); +++ a += 8; +++ c += 8; +++ } +++ +++ for (int i = 0; i<(num_points % 8); ++i) +++ { +++ *c++ = lv_conj(*a++); +++ } +++ +++} +++#endif /* LV_HAVE_SSSE3 */ +++ +++#ifdef LV_HAVE_SSE3 +++#include +++/*! +++ \brief Takes the conjugate of an unsigned char vector. 
+++ \param cVector The vector where the results will be stored +++ \param aVector Vector to be conjugated +++ \param num_points The number of unsigned char values in aVector to be conjugated and stored into cVector +++ */ +++static inline void volk_gnsssdr_8ic_conjugate_8ic_u_sse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){ +++ const unsigned int sse_iters = num_points / 8; +++ +++ lv_8sc_t* c = cVector; +++ const lv_8sc_t* a = aVector; +++ __m128i tmp; +++ +++ __m128i conjugator1 = _mm_setr_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); +++ __m128i conjugator2 = _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1); +++ +++ for (int i = 0; i < sse_iters; ++i) +++ { +++ tmp = _mm_lddqu_si128((__m128i*)a); +++ tmp = _mm_xor_si128(tmp, conjugator1); +++ tmp = _mm_add_epi8(tmp, conjugator2); +++ _mm_storeu_si128((__m128i*)c, tmp); +++ a += 8; +++ c += 8; +++ } +++ +++ for (int i = 0; i<(num_points % 8); ++i) +++ { +++ *c++ = lv_conj(*a++); +++ } +++ +++} +++#endif /* LV_HAVE_SSE3 */ +++ +++#ifdef LV_HAVE_GENERIC +++/*! +++ \brief Takes the conjugate of an unsigned char vector. 
+++ \param cVector The vector where the results will be stored +++ \param aVector Vector to be conjugated +++ \param num_points The number of unsigned char values in aVector to be conjugated and stored into cVector +++ */ +++static inline void volk_gnsssdr_8ic_conjugate_8ic_generic(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){ +++ lv_8sc_t* cPtr = cVector; +++ const lv_8sc_t* aPtr = aVector; +++ unsigned int number = 0; +++ +++ for(number = 0; number < num_points; number++){ +++ *cPtr++ = lv_conj(*aPtr++); +++ } +++} +++#endif /* LV_HAVE_GENERIC */ +++ +++#endif /* INCLUDED_volk_gnsssdr_8ic_conjugate_8ic_u_H */ +++ +++ +++#ifndef INCLUDED_volk_gnsssdr_8ic_conjugate_8ic_a_H +++#define INCLUDED_volk_gnsssdr_8ic_conjugate_8ic_a_H +++ +++#include +++#include +++#include +++ +++#ifdef LV_HAVE_AVX +++#include "immintrin.h" +++/*! +++ \brief Takes the conjugate of an unsigned char vector. +++ \param cVector The vector where the results will be stored +++ \param aVector Vector to be conjugated +++ \param num_points The number of unsigned char values in aVector to be conjugated and stored into cVector +++ */ +++static inline void volk_gnsssdr_8ic_conjugate_8ic_a_avx(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){ +++ const unsigned int sse_iters = num_points / 16; +++ +++ lv_8sc_t* c = cVector; +++ const lv_8sc_t* a = aVector; +++ +++ __m256 tmp; +++ __m128i tmp128lo, tmp128hi; +++ __m256 conjugator1 = _mm256_castsi256_ps(_mm256_setr_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255)); +++ __m128i conjugator2 = _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1); +++ +++ for (int i = 0; i < sse_iters; ++i) +++ { +++ tmp = _mm256_load_ps((float*)a); +++ tmp = _mm256_xor_ps(tmp, conjugator1); +++ tmp128lo = _mm256_castsi256_si128(_mm256_castps_si256(tmp)); +++ tmp128lo = _mm_add_epi8(tmp128lo, conjugator2); +++ tmp128hi = 
_mm256_extractf128_si256(_mm256_castps_si256(tmp),1); +++ tmp128hi = _mm_add_epi8(tmp128hi, conjugator2); +++ //tmp = _mm256_set_m128i(tmp128hi , tmp128lo); //not defined in some versions of immintrin.h +++ tmp = _mm256_insertf128_si256(_mm256_castsi128_si256(tmp128lo),(tmp128hi),1); +++ _mm256_store_ps((float*)c, tmp); +++ +++ a += 16; +++ c += 16; +++ } +++ +++ for (int i = 0; i<(num_points % 16); ++i) +++ { +++ *c++ = lv_conj(*a++); +++ } +++} +++#endif /* LV_HAVE_AVX */ +++ +++#ifdef LV_HAVE_SSSE3 +++#include "tmmintrin.h" +++/*! +++ \brief Takes the conjugate of an unsigned char vector. +++ \param cVector The vector where the results will be stored +++ \param aVector Vector to be conjugated +++ \param num_points The number of unsigned char values in aVector to be conjugated and stored into cVector +++ */ +++static inline void volk_gnsssdr_8ic_conjugate_8ic_a_ssse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){ +++ const unsigned int sse_iters = num_points / 8; +++ +++ lv_8sc_t* c = cVector; +++ const lv_8sc_t* a = aVector; +++ __m128i tmp; +++ +++ __m128i conjugator = _mm_setr_epi8(1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1); +++ +++ for (int i = 0; i < sse_iters; ++i) +++ { +++ tmp = _mm_load_si128((__m128i*)a); +++ tmp = _mm_sign_epi8(tmp, conjugator); +++ _mm_store_si128((__m128i*)c, tmp); +++ a += 8; +++ c += 8; +++ } +++ +++ for (int i = 0; i<(num_points % 8); ++i) +++ { +++ *c++ = lv_conj(*a++); +++ } +++ +++} +++#endif /* LV_HAVE_SSSE3 */ +++ +++#ifdef LV_HAVE_SSE3 +++#include +++/*! +++ \brief Takes the conjugate of an unsigned char vector. 
+++ \param cVector The vector where the results will be stored +++ \param aVector Vector to be conjugated +++ \param num_points The number of unsigned char values in aVector to be conjugated and stored into cVector +++ */ +++static inline void volk_gnsssdr_8ic_conjugate_8ic_a_sse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){ +++ const unsigned int sse_iters = num_points / 8; +++ +++ lv_8sc_t* c = cVector; +++ const lv_8sc_t* a = aVector; +++ __m128i tmp; +++ +++ __m128i conjugator1 = _mm_setr_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); +++ __m128i conjugator2 = _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1); +++ +++ for (int i = 0; i < sse_iters; ++i) +++ { +++ tmp = _mm_load_si128((__m128i*)a); +++ tmp = _mm_xor_si128(tmp, conjugator1); +++ tmp = _mm_add_epi8(tmp, conjugator2); +++ _mm_store_si128((__m128i*)c, tmp); +++ a += 8; +++ c += 8; +++ } +++ +++ for (int i = 0; i<(num_points % 8); ++i) +++ { +++ *c++ = lv_conj(*a++); +++ } +++ +++} +++#endif /* LV_HAVE_SSE3 */ +++ +++#ifdef LV_HAVE_GENERIC +++/*! +++ \brief Takes the conjugate of an unsigned char vector. +++ \param cVector The vector where the results will be stored +++ \param aVector Vector to be conjugated +++ \param num_points The number of unsigned char values in aVector to be conjugated and stored into cVector +++ */ +++static inline void volk_gnsssdr_8ic_conjugate_8ic_a_generic(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){ +++ lv_8sc_t* cPtr = cVector; +++ const lv_8sc_t* aPtr = aVector; +++ unsigned int number = 0; +++ +++ for(number = 0; number < num_points; number++){ +++ *cPtr++ = lv_conj(*aPtr++); +++ } +++} +++#endif /* LV_HAVE_GENERIC */ +++ +++#ifdef LV_HAVE_ORC +++/*! +++ \brief Takes the conjugate of an unsigned char vector. 
+++ \param cVector The vector where the results will be stored
+++ \param aVector Vector to be conjugated
+++ \param num_points The number of complex (lv_8sc_t) values in aVector to be conjugated and stored into cVector
+++ \note Thin wrapper: all three arguments are forwarded unchanged to the externally defined
+++       volk_gnsssdr_8ic_conjugate_8ic_a_orc_impl().
+++ */
+++extern void volk_gnsssdr_8ic_conjugate_8ic_a_orc_impl(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points);
+++static inline void volk_gnsssdr_8ic_conjugate_8ic_u_orc(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){
+++    volk_gnsssdr_8ic_conjugate_8ic_a_orc_impl(cVector, aVector, num_points);
+++}
+++#endif /* LV_HAVE_ORC */
+++
+++#endif /* INCLUDED_volk_gnsssdr_8ic_conjugate_8ic_a_H */
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_magnitude_squared_8i.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_magnitude_squared_8i.h
++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_magnitude_squared_8i.h 1970-01-01 01:00:00.000000000 +0100
+++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_magnitude_squared_8i.h 2014-10-15 01:55:08.000000000 +0200
++@@ -0,0 +1,320 @@
+++/*!
+++ * \file volk_gnsssdr_8ic_magnitude_squared_8i.h
+++ * \brief Volk protokernel: calculates the magnitude squared of a 16 bits vector
+++ * \authors
    +++ *
  • Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com +++ *
+++ * +++ * Volk protokernel that calculates the magnitude squared of a +++ * 16 bits vector (8 bits the real part and 8 bits the imaginary part) +++ * result = (real*real) + (imag*imag) +++ * +++ * ------------------------------------------------------------------------- +++ * +++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) +++ * +++ * GNSS-SDR is a software defined Global Navigation +++ * Satellite Systems receiver +++ * +++ * This file is part of GNSS-SDR. +++ * +++ * GNSS-SDR is free software: you can redistribute it and/or modify +++ * it under the terms of the GNU General Public License as published by +++ * the Free Software Foundation, either version 3 of the License, or +++ * at your option) any later version. +++ * +++ * GNSS-SDR is distributed in the hope that it will be useful, +++ * but WITHOUT ANY WARRANTY; without even the implied warranty of +++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +++ * GNU General Public License for more details. +++ * +++ * You should have received a copy of the GNU General Public License +++ * along with GNSS-SDR. If not, see . +++ * +++ * ------------------------------------------------------------------------- +++ */ +++ +++#ifndef INCLUDED_volk_gnsssdr_8ic_magnitude_squared_8i_u_H +++#define INCLUDED_volk_gnsssdr_8ic_magnitude_squared_8i_u_H +++ +++#include +++#include +++#include +++ +++#ifdef LV_HAVE_SSE3 +++#include +++#include "tmmintrin.h" +++/*! 
+++ \brief Calculates the magnitude squared of complexVector and stores the results in magnitudeVector +++ \param complexVector The vector containing the complex input values +++ \param magnitudeVector The vector containing the real output values +++ \param num_points The number of complex values in complexVector to be calculated and stored into cVector +++ */ +++static inline void volk_gnsssdr_8ic_magnitude_squared_8i_u_sse3(char* magnitudeVector, const lv_8sc_t* complexVector, unsigned int num_points){ +++ +++ const unsigned int sse_iters = num_points / 16; +++ +++ const char* complexVectorPtr = (char*)complexVector; +++ char* magnitudeVectorPtr = magnitudeVector; +++ +++ __m128i zero, result8; +++ __m128i avector, avectorhi, avectorlo, avectorlomult, avectorhimult, aadded, maska; +++ __m128i bvector, bvectorhi, bvectorlo, bvectorlomult, bvectorhimult, badded, maskb; +++ +++ zero = _mm_setzero_si128(); +++ maska = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); +++ maskb = _mm_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); +++ +++ for(int number = 0;number < sse_iters; number++) +++ { +++ avector = _mm_lddqu_si128((__m128i*)complexVectorPtr); +++ avectorlo = _mm_unpacklo_epi8 (avector, zero); +++ avectorhi = _mm_unpackhi_epi8 (avector, zero); +++ avectorlomult = _mm_mullo_epi16 (avectorlo, avectorlo); +++ avectorhimult = _mm_mullo_epi16 (avectorhi, avectorhi); +++ aadded = _mm_hadd_epi16 (avectorlomult, avectorhimult); +++ +++ complexVectorPtr += 16; +++ +++ bvector = _mm_lddqu_si128((__m128i*)complexVectorPtr); +++ bvectorlo = _mm_unpacklo_epi8 (bvector, zero); +++ bvectorhi = _mm_unpackhi_epi8 (bvector, zero); +++ bvectorlomult = _mm_mullo_epi16 (bvectorlo, bvectorlo); +++ bvectorhimult = _mm_mullo_epi16 (bvectorhi, bvectorhi); +++ badded = _mm_hadd_epi16 (bvectorlomult, bvectorhimult); +++ +++ complexVectorPtr += 16; +++ +++ result8 = _mm_or_si128(_mm_shuffle_epi8(aadded, maska), 
_mm_shuffle_epi8(badded, maskb)); +++ +++ _mm_storeu_si128((__m128i*)magnitudeVectorPtr, result8); +++ +++ magnitudeVectorPtr += 16; +++ +++ +++ } +++ +++ for (int i = 0; i<(num_points % 16); ++i) +++ { +++ const char valReal = *complexVectorPtr++; +++ const char valImag = *complexVectorPtr++; +++ *magnitudeVectorPtr++ = (valReal * valReal) + (valImag * valImag); +++ } +++} +++#endif /* LV_HAVE_SSE3 */ +++ +++//#ifdef LV_HAVE_SSE +++//#include +++///*! +++// \brief Calculates the magnitude squared of complexVector and stores the results in magnitudeVector +++// \param complexVector The vector containing the complex input values +++// \param magnitudeVector The vector containing the real output values +++// \param num_points The number of complex values in complexVector to be calculated and stored into cVector +++// */ +++//static inline void volk_gnsssdr_8ic_magnitude_squared_8i_u_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ +++// unsigned int number = 0; +++// const unsigned int quarterPoints = num_points / 4; +++// +++// const float* complexVectorPtr = (float*)complexVector; +++// float* magnitudeVectorPtr = magnitudeVector; +++// +++// __m128 cplxValue1, cplxValue2, iValue, qValue, result; +++// for(;number < quarterPoints; number++){ +++// cplxValue1 = _mm_loadu_ps(complexVectorPtr); +++// complexVectorPtr += 4; +++// +++// cplxValue2 = _mm_loadu_ps(complexVectorPtr); +++// complexVectorPtr += 4; +++// +++// // Arrange in i1i2i3i4 format +++// iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); +++// // Arrange in q1q2q3q4 format +++// qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); +++// +++// iValue = _mm_mul_ps(iValue, iValue); // Square the I values +++// qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values +++// +++// result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values +++// +++// _mm_storeu_ps(magnitudeVectorPtr, result); +++// magnitudeVectorPtr += 4; 
+++// } +++// +++// number = quarterPoints * 4; +++// for(; number < num_points; number++){ +++// float val1Real = *complexVectorPtr++; +++// float val1Imag = *complexVectorPtr++; +++// *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); +++// } +++//} +++//#endif /* LV_HAVE_SSE */ +++ +++#ifdef LV_HAVE_GENERIC +++/*! +++ \brief Calculates the magnitude squared of complexVector and stores the results in magnitudeVector +++ \param complexVector The vector containing the complex input values +++ \param magnitudeVector The vector containing the real output values +++ \param num_points The number of complex values in complexVector to be calculated and stored into cVector +++ */ +++static inline void volk_gnsssdr_8ic_magnitude_squared_8i_generic(char* magnitudeVector, const lv_8sc_t* complexVector, unsigned int num_points){ +++ const char* complexVectorPtr = (char*)complexVector; +++ char* magnitudeVectorPtr = magnitudeVector; +++ +++ for(int number = 0; number < num_points; number++){ +++ const char real = *complexVectorPtr++; +++ const char imag = *complexVectorPtr++; +++ *magnitudeVectorPtr++ = (real*real) + (imag*imag); +++ } +++} +++#endif /* LV_HAVE_GENERIC */ +++ +++#endif /* INCLUDED_volk_gnsssdr_32fc_magnitude_32f_u_H */ +++ +++ +++#ifndef INCLUDED_volk_gnsssdr_8ic_magnitude_squared_8i_a_H +++#define INCLUDED_volk_gnsssdr_8ic_magnitude_squared_8i_a_H +++ +++#include +++#include +++#include +++ +++#ifdef LV_HAVE_SSE3 +++#include +++/*! 
+++ \brief Calculates the magnitude squared of complexVector and stores the results in magnitudeVector +++ \param complexVector The vector containing the complex input values +++ \param magnitudeVector The vector containing the real output values +++ \param num_points The number of complex values in complexVector to be calculated and stored into cVector +++ */ +++static inline void volk_gnsssdr_8ic_magnitude_squared_8i_a_sse3(char* magnitudeVector, const lv_8sc_t* complexVector, unsigned int num_points){ +++ +++ const unsigned int sse_iters = num_points / 16; +++ +++ const char* complexVectorPtr = (char*)complexVector; +++ char* magnitudeVectorPtr = magnitudeVector; +++ +++ __m128i zero, result8; +++ __m128i avector, avectorhi, avectorlo, avectorlomult, avectorhimult, aadded, maska; +++ __m128i bvector, bvectorhi, bvectorlo, bvectorlomult, bvectorhimult, badded, maskb; +++ +++ zero = _mm_setzero_si128(); +++ maska = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); +++ maskb = _mm_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); +++ +++ for(int number = 0;number < sse_iters; number++) +++ { +++ avector = _mm_load_si128((__m128i*)complexVectorPtr); +++ avectorlo = _mm_unpacklo_epi8 (avector, zero); +++ avectorhi = _mm_unpackhi_epi8 (avector, zero); +++ avectorlomult = _mm_mullo_epi16 (avectorlo, avectorlo); +++ avectorhimult = _mm_mullo_epi16 (avectorhi, avectorhi); +++ aadded = _mm_hadd_epi16 (avectorlomult, avectorhimult); +++ +++ complexVectorPtr += 16; +++ +++ bvector = _mm_load_si128((__m128i*)complexVectorPtr); +++ bvectorlo = _mm_unpacklo_epi8 (bvector, zero); +++ bvectorhi = _mm_unpackhi_epi8 (bvector, zero); +++ bvectorlomult = _mm_mullo_epi16 (bvectorlo, bvectorlo); +++ bvectorhimult = _mm_mullo_epi16 (bvectorhi, bvectorhi); +++ badded = _mm_hadd_epi16 (bvectorlomult, bvectorhimult); +++ +++ complexVectorPtr += 16; +++ +++ result8 = _mm_or_si128(_mm_shuffle_epi8(aadded, maska), 
_mm_shuffle_epi8(badded, maskb)); +++ +++ _mm_store_si128((__m128i*)magnitudeVectorPtr, result8); +++ +++ magnitudeVectorPtr += 16; +++ +++ +++ } +++ +++ for (int i = 0; i<(num_points % 16); ++i) +++ { +++ const char valReal = *complexVectorPtr++; +++ const char valImag = *complexVectorPtr++; +++ *magnitudeVectorPtr++ = (valReal * valReal) + (valImag * valImag); +++ } +++} +++#endif /* LV_HAVE_SSE3 */ +++ +++//#ifdef LV_HAVE_SSE +++//#include +++///*! +++// \brief Calculates the magnitude squared of complexVector and stores the results in magnitudeVector +++// \param complexVector The vector containing the complex input values +++// \param magnitudeVector The vector containing the real output values +++// \param num_points The number of complex values in complexVector to be calculated and stored into cVector +++// */ +++//static inline void volk_gnsssdr_8ic_magnitude_squared_8i_a_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ +++// unsigned int number = 0; +++// const unsigned int quarterPoints = num_points / 4; +++// +++// const float* complexVectorPtr = (float*)complexVector; +++// float* magnitudeVectorPtr = magnitudeVector; +++// +++// __m128 cplxValue1, cplxValue2, iValue, qValue, result; +++// for(;number < quarterPoints; number++){ +++// cplxValue1 = _mm_load_ps(complexVectorPtr); +++// complexVectorPtr += 4; +++// +++// cplxValue2 = _mm_load_ps(complexVectorPtr); +++// complexVectorPtr += 4; +++// +++// // Arrange in i1i2i3i4 format +++// iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); +++// // Arrange in q1q2q3q4 format +++// qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); +++// +++// iValue = _mm_mul_ps(iValue, iValue); // Square the I values +++// qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values +++// +++// result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values +++// +++// _mm_store_ps(magnitudeVectorPtr, result); +++// magnitudeVectorPtr += 4; +++// 
} +++// +++// number = quarterPoints * 4; +++// for(; number < num_points; number++){ +++// float val1Real = *complexVectorPtr++; +++// float val1Imag = *complexVectorPtr++; +++// *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); +++// } +++//} +++//#endif /* LV_HAVE_SSE */ +++ +++#ifdef LV_HAVE_GENERIC +++/*! +++ \brief Calculates the magnitude squared of complexVector and stores the results in magnitudeVector +++ \param complexVector The vector containing the complex input values +++ \param magnitudeVector The vector containing the real output values +++ \param num_points The number of complex values in complexVector to be calculated and stored into cVector +++ */ +++static inline void volk_gnsssdr_8ic_magnitude_squared_8i_a_generic(char* magnitudeVector, const lv_8sc_t* complexVector, unsigned int num_points){ +++ const char* complexVectorPtr = (char*)complexVector; +++ char* magnitudeVectorPtr = magnitudeVector; +++ +++ for(int number = 0; number < num_points; number++){ +++ const char real = *complexVectorPtr++; +++ const char imag = *complexVectorPtr++; +++ *magnitudeVectorPtr++ = (real*real) + (imag*imag); +++ } +++} +++#endif /* LV_HAVE_GENERIC */ +++ +++#ifdef LV_HAVE_ORC +++/*! 
+++ \brief Calculates the magnitude squared of complexVector and stores the results in magnitudeVector
+++ \param complexVector The vector containing the complex input values
+++ \param magnitudeVector The vector containing the real output values
+++ \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
+++ \note Thin wrapper: all three arguments are forwarded unchanged to the externally defined
+++       volk_gnsssdr_8ic_magnitude_squared_8i_a_orc_impl().
+++ */
+++extern void volk_gnsssdr_8ic_magnitude_squared_8i_a_orc_impl(char* magnitudeVector, const lv_8sc_t* complexVector, unsigned int num_points);
+++static inline void volk_gnsssdr_8ic_magnitude_squared_8i_u_orc(char* magnitudeVector, const lv_8sc_t* complexVector, unsigned int num_points){
+++    volk_gnsssdr_8ic_magnitude_squared_8i_a_orc_impl(magnitudeVector, complexVector, num_points);
+++}
+++#endif /* LV_HAVE_ORC */
+++
+++#endif /* INCLUDED_volk_gnsssdr_8ic_magnitude_squared_8i_a_H */
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_s8ic_multiply_8ic.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_s8ic_multiply_8ic.h
++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_s8ic_multiply_8ic.h 1970-01-01 01:00:00.000000000 +0100
+++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_s8ic_multiply_8ic.h 2014-10-15 01:55:08.000000000 +0200
++@@ -0,0 +1,271 @@
+++/*!
+++ * \file volk_gnsssdr_8ic_s8ic_multiply_8ic.h
+++ * \brief Volk protokernel: multiplies a group of 16 bits vectors by one constant vector
+++ * \authors
    +++ *
  • Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com +++ *
+++ * +++ * Volk protokernel that multiplies a group of 16 bits vectors +++ * (8 bits the real part and 8 bits the imaginary part) by one constant vector +++ * +++ * ------------------------------------------------------------------------- +++ * +++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) +++ * +++ * GNSS-SDR is a software defined Global Navigation +++ * Satellite Systems receiver +++ * +++ * This file is part of GNSS-SDR. +++ * +++ * GNSS-SDR is free software: you can redistribute it and/or modify +++ * it under the terms of the GNU General Public License as published by +++ * the Free Software Foundation, either version 3 of the License, or +++ * at your option) any later version. +++ * +++ * GNSS-SDR is distributed in the hope that it will be useful, +++ * but WITHOUT ANY WARRANTY; without even the implied warranty of +++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +++ * GNU General Public License for more details. +++ * +++ * You should have received a copy of the GNU General Public License +++ * along with GNSS-SDR. If not, see . +++ * +++ * ------------------------------------------------------------------------- +++ */ +++ +++#ifndef INCLUDED_volk_gnsssdr_8ic_s8ic_multiply_8ic_u_H +++#define INCLUDED_volk_gnsssdr_8ic_s8ic_multiply_8ic_u_H +++ +++#include +++#include +++#include +++#include +++ +++#ifdef LV_HAVE_SSE3 +++#include +++/*! 
+++ \brief Multiplies the input vector by a scalar and stores the results in the third vector +++ \param cVector The vector where the results will be stored +++ \param aVector The vector to be multiplied +++ \param scalar The complex scalar to multiply aVector +++ \param num_points The number of complex values in aVector to be multiplied by sacalar and stored into cVector +++ */ +++static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_u_sse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t scalar, unsigned int num_points){ +++ +++ const unsigned int sse_iters = num_points / 8; +++ +++ __m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc; +++ +++ lv_8sc_t* c = cVector; +++ const lv_8sc_t* a = aVector; +++ +++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); +++ +++ y = _mm_set1_epi16 (*(short*)&scalar); +++ imagy = _mm_srli_si128 (y, 1); +++ imagy = _mm_and_si128 (imagy, mult1); +++ realy = _mm_and_si128 (y, mult1); +++ +++ for(int number = 0;number < sse_iters; number++){ +++ +++ x = _mm_lddqu_si128((__m128i*)a); +++ +++ imagx = _mm_srli_si128 (x, 1); +++ imagx = _mm_and_si128 (imagx, mult1); +++ realx = _mm_and_si128 (x, mult1); +++ +++ realx_mult_realy = _mm_mullo_epi16 (realx, realy); +++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); +++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); +++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); +++ +++ realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); +++ realc = _mm_and_si128 (realc, mult1); +++ imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); +++ imagc = _mm_and_si128 (imagc, mult1); +++ imagc = _mm_slli_si128 (imagc, 1); +++ +++ totalc = _mm_or_si128 (realc, imagc); +++ +++ _mm_storeu_si128((__m128i*)c, totalc); +++ +++ a += 8; +++ c += 8; +++ } +++ +++ for (int i = 0; i<(num_points % 8); ++i) +++ { +++ *c++ = (*a++) * scalar; +++ } +++ +++} +++#endif 
/* LV_HAVE_SSE3 */ +++ +++#ifdef LV_HAVE_GENERIC +++/*! +++ \brief Multiplies the input vector by a scalar and stores the results in the third vector +++ \param cVector The vector where the results will be stored +++ \param aVector The vector to be multiplied +++ \param scalar The complex scalar to multiply aVector +++ \param num_points The number of complex values in aVector to be multiplied by sacalar and stored into cVector +++ */ +++static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_generic(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t scalar, unsigned int num_points){ +++ +++ /*lv_8sc_t* cPtr = cVector; +++ const lv_8sc_t* aPtr = aVector; +++ +++ for (int i = 0; i= 8){ +++ *cPtr++ = (*aPtr++) * scalar; +++ *cPtr++ = (*aPtr++) * scalar; +++ *cPtr++ = (*aPtr++) * scalar; +++ *cPtr++ = (*aPtr++) * scalar; +++ *cPtr++ = (*aPtr++) * scalar; +++ *cPtr++ = (*aPtr++) * scalar; +++ *cPtr++ = (*aPtr++) * scalar; +++ *cPtr++ = (*aPtr++) * scalar; +++ number -= 8; +++ } +++ +++ // clean up any remaining +++ while (number-- > 0) +++ *cPtr++ = *aPtr++ * scalar; +++} +++#endif /* LV_HAVE_GENERIC */ +++ +++#endif /* INCLUDED_volk_gnsssdr_32fc_x2_multiply_32fc_u_H */ +++ +++ +++#ifndef INCLUDED_volk_gnsssdr_8ic_s8ic_multiply_8ic_a_H +++#define INCLUDED_volk_gnsssdr_8ic_s8ic_multiply_8ic_a_H +++ +++#include +++#include +++#include +++#include +++ +++#ifdef LV_HAVE_SSE3 +++#include +++/*! 
+++ \brief Multiplies the input vector by a scalar and stores the results in the third vector +++ \param cVector The vector where the results will be stored +++ \param aVector The vector to be multiplied +++ \param scalar The complex scalar to multiply aVector +++ \param num_points The number of complex values in aVector to be multiplied by sacalar and stored into cVector +++ */ +++static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_a_sse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t scalar, unsigned int num_points){ +++ +++ const unsigned int sse_iters = num_points / 8; +++ +++ __m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc; +++ +++ lv_8sc_t* c = cVector; +++ const lv_8sc_t* a = aVector; +++ +++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); +++ +++ y = _mm_set1_epi16 (*(short*)&scalar); +++ imagy = _mm_srli_si128 (y, 1); +++ imagy = _mm_and_si128 (imagy, mult1); +++ realy = _mm_and_si128 (y, mult1); +++ +++ for(int number = 0;number < sse_iters; number++){ +++ +++ x = _mm_load_si128((__m128i*)a); +++ +++ imagx = _mm_srli_si128 (x, 1); +++ imagx = _mm_and_si128 (imagx, mult1); +++ realx = _mm_and_si128 (x, mult1); +++ +++ realx_mult_realy = _mm_mullo_epi16 (realx, realy); +++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); +++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); +++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); +++ +++ realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); +++ realc = _mm_and_si128 (realc, mult1); +++ imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); +++ imagc = _mm_and_si128 (imagc, mult1); +++ imagc = _mm_slli_si128 (imagc, 1); +++ +++ totalc = _mm_or_si128 (realc, imagc); +++ +++ _mm_store_si128((__m128i*)c, totalc); +++ +++ a += 8; +++ c += 8; +++ } +++ +++ for (int i = 0; i<(num_points % 8); ++i) +++ { +++ *c++ = (*a++) * scalar; +++ } +++ +++} +++#endif /* 
LV_HAVE_SSE3 */ +++ +++#ifdef LV_HAVE_GENERIC +++/*! +++ \brief Multiplies the input vector by a scalar and stores the results in the third vector +++ \param cVector The vector where the results will be stored +++ \param aVector The vector to be multiplied +++ \param scalar The complex scalar to multiply aVector +++ \param num_points The number of complex values in aVector to be multiplied by sacalar and stored into cVector +++ */ +++static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_a_generic(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t scalar, unsigned int num_points){ +++ +++ /*lv_8sc_t* cPtr = cVector; +++ const lv_8sc_t* aPtr = aVector; +++ +++ for (int i = 0; i= 8){ +++ *cPtr++ = (*aPtr++) * scalar; +++ *cPtr++ = (*aPtr++) * scalar; +++ *cPtr++ = (*aPtr++) * scalar; +++ *cPtr++ = (*aPtr++) * scalar; +++ *cPtr++ = (*aPtr++) * scalar; +++ *cPtr++ = (*aPtr++) * scalar; +++ *cPtr++ = (*aPtr++) * scalar; +++ *cPtr++ = (*aPtr++) * scalar; +++ number -= 8; +++ } +++ +++ // clean up any remaining +++ while (number-- > 0) +++ *cPtr++ = *aPtr++ * scalar; +++} +++#endif /* LV_HAVE_GENERIC */ +++ +++#ifdef LV_HAVE_ORC +++/*! 
+++ \brief Multiplies the input vector by a scalar and stores the results in the third vector
+++ \param cVector The vector where the results will be stored
+++ \param aVector The vector to be multiplied
+++ \param scalar The complex scalar to multiply aVector
+++ \param num_points The number of complex values in aVector to be multiplied by scalar and stored into cVector
+++ \note Thin wrapper: the scalar is split with lv_creal()/lv_cimag() and passed as two separate
+++       char arguments to the externally defined volk_gnsssdr_8ic_s8ic_multiply_8ic_a_orc_impl().
+++ */
+++extern void volk_gnsssdr_8ic_s8ic_multiply_8ic_a_orc_impl(lv_8sc_t* cVector, const lv_8sc_t* aVector, const char scalarreal, const char scalarimag, unsigned int num_points);
+++static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_u_orc(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t scalar, unsigned int num_points){
+++    volk_gnsssdr_8ic_s8ic_multiply_8ic_a_orc_impl(cVector, aVector, lv_creal(scalar), lv_cimag(scalar), num_points);
+++}
+++#endif /* LV_HAVE_ORC */
+++
+++#endif /* INCLUDED_volk_gnsssdr_8ic_s8ic_multiply_8ic_a_H */
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_dot_prod_8ic.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_dot_prod_8ic.h
++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_dot_prod_8ic.h 1970-01-01 01:00:00.000000000 +0100
+++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_dot_prod_8ic.h 2014-10-15 01:55:08.000000000 +0200
++@@ -0,0 +1,499 @@
+++/*!
+++ * \file volk_gnsssdr_8ic_x2_dot_prod_8ic.h
+++ * \brief Volk protokernel: multiplies two 16 bits vectors and accumulates them
+++ * \authors
    +++ *
  • Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com +++ *
+++ * +++ * Volk protokernel that multiplies two 16 bits vectors (8 bits the real part +++ * and 8 bits the imaginary part) and accumulates them +++ * +++ * ------------------------------------------------------------------------- +++ * +++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) +++ * +++ * GNSS-SDR is a software defined Global Navigation +++ * Satellite Systems receiver +++ * +++ * This file is part of GNSS-SDR. +++ * +++ * GNSS-SDR is free software: you can redistribute it and/or modify +++ * it under the terms of the GNU General Public License as published by +++ * the Free Software Foundation, either version 3 of the License, or +++ * at your option) any later version. +++ * +++ * GNSS-SDR is distributed in the hope that it will be useful, +++ * but WITHOUT ANY WARRANTY; without even the implied warranty of +++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +++ * GNU General Public License for more details. +++ * +++ * You should have received a copy of the GNU General Public License +++ * along with GNSS-SDR. If not, see . +++ * +++ * ------------------------------------------------------------------------- +++ */ +++ +++#ifndef INCLUDED_volk_gnsssdr_8ic_x2_dot_prod_8ic_u_H +++#define INCLUDED_volk_gnsssdr_8ic_x2_dot_prod_8ic_u_H +++ +++#include +++#include +++#include +++#include +++ +++#ifdef LV_HAVE_GENERIC +++/*! 
+++ \brief Multiplies the two input complex vectors element by element and accumulates the products, storing the single accumulated value in *result +++ \param result Pointer to the complex value where the accumulated result will be stored +++ \param input One of the vectors to be multiplied and accumulated +++ \param taps One of the vectors to be multiplied and accumulated +++ \param num_points The number of complex values in input and taps to be multiplied together, accumulated and stored into *result +++ */ +++static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_generic(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points) { +++ +++ /*lv_8sc_t* cPtr = result; +++ const lv_8sc_t* aPtr = input; +++ const lv_8sc_t* bPtr = taps; +++ +++ for(int number = 0; number < num_points; number++){ +++ *cPtr += (*aPtr++) * (*bPtr++); +++ }*/ +++ +++ char * res = (char*) result; /* byte view of the operands: byte 0 of each sample is used as the real part, byte 1 as the imaginary part */ +++ char * in = (char*) input; +++ char * tp = (char*) taps; +++ unsigned int n_2_ccomplex_blocks = num_points/2; /* number of pairs of complex samples */ +++ unsigned int isodd = num_points & 1; +++ +++ char sum0[2] = {0,0}; /* {real, imag} sum over the first sample of every pair; stored as char, so only 8 bits are kept */ +++ char sum1[2] = {0,0}; /* {real, imag} sum over the second sample of every pair */ +++ unsigned int i = 0; +++ +++ for(i = 0; i < n_2_ccomplex_blocks; ++i) { +++ sum0[0] += in[0] * tp[0] - in[1] * tp[1]; +++ sum0[1] += in[0] * tp[1] + in[1] * tp[0]; +++ sum1[0] += in[2] * tp[2] - in[3] * tp[3]; +++ sum1[1] += in[2] * tp[3] + in[3] * tp[2]; +++ +++ in += 4; +++ tp += 4; +++ } +++ +++ res[0] = sum0[0] + sum1[0]; /* overwrites *result: any previous content is discarded */ +++ res[1] = sum0[1] + sum1[1]; +++ +++ // Cleanup if we had an odd number of points +++ for(i = 0; i < isodd; ++i) { /* runs at most once, since isodd is 0 or 1 */ +++ *result += input[num_points - 1] * taps[num_points - 1]; +++ } +++} +++ +++#endif /*LV_HAVE_GENERIC*/ +++ +++#ifdef LV_HAVE_SSE2 +++#include "emmintrin.h" +++/*!
+++ \brief Multiplies the two input complex vectors element by element and accumulates the products, storing the single accumulated value in *result (unaligned loads are used) +++ \param result Pointer to the complex value where the accumulated result will be stored +++ \param input One of the vectors to be multiplied and accumulated +++ \param taps One of the vectors to be multiplied and accumulated +++ \param num_points The number of complex values in input and taps to be multiplied together, accumulated and stored into *result +++ */ +++static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse2(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points) { +++ +++ lv_8sc_t dotProduct; +++ memset(&dotProduct, 0x0, 2*sizeof(char)); +++ +++ const lv_8sc_t* a = input; +++ const lv_8sc_t* b = taps; +++ +++ const unsigned int sse_iters = num_points/8; /* 8 complex samples (16 bytes) per 128-bit register */ +++ +++ if (sse_iters>0) +++ { +++ __m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc, realcacc, imagcacc; +++ +++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); /* 0x00FF in every 16-bit lane: keeps the even (real) byte of each sample */ +++ realcacc = _mm_setzero_si128(); +++ imagcacc = _mm_setzero_si128(); +++ +++ for(int number = 0; number < sse_iters; number++){ +++ +++ x = _mm_loadu_si128((__m128i*)a); /* unaligned load; _mm_loadu_si128 is the SSE2 intrinsic from emmintrin.h, whereas _mm_lddqu_si128 requires SSE3 (pmmintrin.h) */ +++ y = _mm_loadu_si128((__m128i*)b); +++ +++ imagx = _mm_srli_si128 (x, 1); /* shift right by one byte so the imaginary bytes land on the even positions */ +++ imagx = _mm_and_si128 (imagx, mult1); +++ realx = _mm_and_si128 (x, mult1); +++ +++ imagy = _mm_srli_si128 (y, 1); +++ imagy = _mm_and_si128 (imagy, mult1); +++ realy = _mm_and_si128 (y, mult1); +++ +++ realx_mult_realy = _mm_mullo_epi16 (realx, realy); +++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); +++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); +++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); +++ +++ realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); /* (a+jb)(c+jd): real = ac - bd */ +++ imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); /* imag = ad + bc */ +++ +++ realcacc = _mm_add_epi16 (realcacc, realc); +++ imagcacc = _mm_add_epi16 
(imagcacc, imagc); +++ +++ a += 8; +++ b += 8; +++ } +++ +++ realcacc = _mm_and_si128 (realcacc, mult1); /* keep only the low byte of every 16-bit partial sum */ +++ imagcacc = _mm_and_si128 (imagcacc, mult1); +++ imagcacc = _mm_slli_si128 (imagcacc, 1); /* move the imaginary bytes back to the odd positions */ +++ +++ totalc = _mm_or_si128 (realcacc, imagcacc); +++ +++ __VOLK_ATTR_ALIGNED(16) lv_8sc_t dotProductVector[8]; +++ +++ _mm_storeu_si128((__m128i*)dotProductVector,totalc); // Store the results back into the dot product vector +++ +++ for (int i = 0; i<8; ++i) +++ { +++ dotProduct += dotProductVector[i]; +++ } +++ } +++ +++ for (int i = 0; i<(num_points % 8); ++i) /* scalar tail for the num_points % 8 remaining samples */ +++ { +++ dotProduct += (*a++) * (*b++); +++ } +++ +++ *result = dotProduct; +++} +++ +++#endif /*LV_HAVE_SSE2*/ +++ +++#ifdef LV_HAVE_SSE4_1 +++#include "smmintrin.h" +++/*! +++ \brief Multiplies the two input complex vectors element by element and accumulates the products, storing the single accumulated value in *result (unaligned loads are used) +++ \param result Pointer to the complex value where the accumulated result will be stored +++ \param input One of the vectors to be multiplied and accumulated +++ \param taps One of the vectors to be multiplied and accumulated +++ \param num_points The number of complex values in input and taps to be multiplied together, accumulated and stored into *result +++ */ +++static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse4_1(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points) { +++ +++ lv_8sc_t dotProduct; +++ memset(&dotProduct, 0x0, 2*sizeof(char)); +++ +++ const lv_8sc_t* a = input; +++ const lv_8sc_t* b = taps; +++ +++ const unsigned int sse_iters = num_points/8; /* 8 complex samples (16 bytes) per 128-bit register */ +++ +++ if (sse_iters>0) +++ { +++ __m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc, realcacc, imagcacc; +++ +++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); /* 0x00FF in every 16-bit lane: keeps the even (real) byte of each sample */ +++ realcacc = _mm_setzero_si128(); +++ imagcacc = _mm_setzero_si128(); +++ +++ for(int number = 0; number < sse_iters; number++){ +++ +++ x = 
_mm_lddqu_si128((__m128i*)a); +++ y = _mm_lddqu_si128((__m128i*)b); +++ +++ imagx = _mm_srli_si128 (x, 1); /* shift right by one byte so the imaginary bytes land on the even positions */ +++ imagx = _mm_and_si128 (imagx, mult1); +++ realx = _mm_and_si128 (x, mult1); +++ +++ imagy = _mm_srli_si128 (y, 1); +++ imagy = _mm_and_si128 (imagy, mult1); +++ realy = _mm_and_si128 (y, mult1); +++ +++ realx_mult_realy = _mm_mullo_epi16 (realx, realy); +++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); +++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); +++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); +++ +++ realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); /* (a+jb)(c+jd): real = ac - bd */ +++ imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); /* imag = ad + bc */ +++ +++ realcacc = _mm_add_epi16 (realcacc, realc); +++ imagcacc = _mm_add_epi16 (imagcacc, imagc); +++ +++ a += 8; +++ b += 8; +++ } +++ +++ imagcacc = _mm_slli_si128 (imagcacc, 1); /* move the low byte of every imaginary sum to the odd positions */ +++ +++ totalc = _mm_blendv_epi8 (imagcacc, realcacc, mult1); /* even bytes come from realcacc, odd bytes from the shifted imagcacc */ +++ +++ __VOLK_ATTR_ALIGNED(16) lv_8sc_t dotProductVector[8]; +++ +++ _mm_storeu_si128((__m128i*)dotProductVector,totalc); // Store the results back into the dot product vector +++ +++ for (int i = 0; i<8; ++i) +++ { +++ dotProduct += dotProductVector[i]; +++ } +++ } +++ +++ for (int i = 0; i<(num_points % 8); ++i) /* scalar tail for the num_points % 8 remaining samples */ +++ { +++ dotProduct += (*a++) * (*b++); +++ } +++ +++ *result = dotProduct; +++} +++ +++#endif /*LV_HAVE_SSE4_1*/ +++ +++#endif /*INCLUDED_volk_gnsssdr_8ic_x2_dot_prod_8ic_u_H*/ +++ +++ +++#ifndef INCLUDED_volk_gnsssdr_8ic_x2_dot_prod_8ic_a_H +++#define INCLUDED_volk_gnsssdr_8ic_x2_dot_prod_8ic_a_H +++ +++#include +++#include +++#include +++#include +++ +++ +++#ifdef LV_HAVE_GENERIC +++/*!
+++ \brief Multiplies the two input complex vectors element by element and accumulates the products, storing the single accumulated value in *result +++ \param result Pointer to the complex value where the accumulated result will be stored +++ \param input One of the vectors to be multiplied and accumulated +++ \param taps One of the vectors to be multiplied and accumulated +++ \param num_points The number of complex values in input and taps to be multiplied together, accumulated and stored into *result +++ */ +++static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_generic(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points) { +++ +++ /*lv_8sc_t* cPtr = result; +++ const lv_8sc_t* aPtr = input; +++ const lv_8sc_t* bPtr = taps; +++ +++ for(int number = 0; number < num_points; number++){ +++ *cPtr += (*aPtr++) * (*bPtr++); +++ }*/ +++ +++ char * res = (char*) result; /* byte view of the operands: byte 0 of each sample is used as the real part, byte 1 as the imaginary part */ +++ char * in = (char*) input; +++ char * tp = (char*) taps; +++ unsigned int n_2_ccomplex_blocks = num_points/2; /* number of pairs of complex samples */ +++ unsigned int isodd = num_points & 1; +++ +++ char sum0[2] = {0,0}; /* {real, imag} sum over the first sample of every pair; stored as char, so only 8 bits are kept */ +++ char sum1[2] = {0,0}; /* {real, imag} sum over the second sample of every pair */ +++ unsigned int i = 0; +++ +++ for(i = 0; i < n_2_ccomplex_blocks; ++i) { +++ sum0[0] += in[0] * tp[0] - in[1] * tp[1]; +++ sum0[1] += in[0] * tp[1] + in[1] * tp[0]; +++ sum1[0] += in[2] * tp[2] - in[3] * tp[3]; +++ sum1[1] += in[2] * tp[3] + in[3] * tp[2]; +++ +++ in += 4; +++ tp += 4; +++ } +++ +++ res[0] = sum0[0] + sum1[0]; /* overwrites *result: any previous content is discarded */ +++ res[1] = sum0[1] + sum1[1]; +++ +++ // Cleanup if we had an odd number of points +++ for(i = 0; i < isodd; ++i) { /* runs at most once, since isodd is 0 or 1 */ +++ *result += input[num_points - 1] * taps[num_points - 1]; +++ } +++} +++ +++#endif /*LV_HAVE_GENERIC*/ +++ +++#ifdef LV_HAVE_SSE2 +++#include "emmintrin.h" +++/*!
+++ \brief Multiplies the two input complex vectors element by element and accumulates the products, storing the single accumulated value in *result (aligned loads are used) +++ \param result Pointer to the complex value where the accumulated result will be stored +++ \param input One of the vectors to be multiplied and accumulated; read with _mm_load_si128, so it must be 16-byte aligned +++ \param taps One of the vectors to be multiplied and accumulated; read with _mm_load_si128, so it must be 16-byte aligned +++ \param num_points The number of complex values in input and taps to be multiplied together, accumulated and stored into *result +++ */ +++static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_sse2(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points) { +++ +++ lv_8sc_t dotProduct; +++ memset(&dotProduct, 0x0, 2*sizeof(char)); +++ +++ const lv_8sc_t* a = input; +++ const lv_8sc_t* b = taps; +++ +++ const unsigned int sse_iters = num_points/8; /* 8 complex samples (16 bytes) per 128-bit register */ +++ +++ if (sse_iters>0) +++ { +++ __m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc, realcacc, imagcacc; +++ +++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); /* 0x00FF in every 16-bit lane: keeps the even (real) byte of each sample */ +++ realcacc = _mm_setzero_si128(); +++ imagcacc = _mm_setzero_si128(); +++ +++ for(int number = 0; number < sse_iters; number++){ +++ +++ x = _mm_load_si128((__m128i*)a); +++ y = _mm_load_si128((__m128i*)b); +++ +++ imagx = _mm_srli_si128 (x, 1); /* shift right by one byte so the imaginary bytes land on the even positions */ +++ imagx = _mm_and_si128 (imagx, mult1); +++ realx = _mm_and_si128 (x, mult1); +++ +++ imagy = _mm_srli_si128 (y, 1); +++ imagy = _mm_and_si128 (imagy, mult1); +++ realy = _mm_and_si128 (y, mult1); +++ +++ realx_mult_realy = _mm_mullo_epi16 (realx, realy); +++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); +++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); +++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); +++ +++ realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); /* (a+jb)(c+jd): real = ac - bd */ +++ imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); /* imag = ad + bc */ +++ +++ realcacc = _mm_add_epi16 (realcacc, realc); +++ imagcacc = _mm_add_epi16 
(imagcacc, imagc); +++ +++ a += 8; +++ b += 8; +++ } +++ +++ realcacc = _mm_and_si128 (realcacc, mult1); /* keep only the low byte of every 16-bit partial sum */ +++ imagcacc = _mm_and_si128 (imagcacc, mult1); +++ imagcacc = _mm_slli_si128 (imagcacc, 1); /* move the imaginary bytes back to the odd positions */ +++ +++ totalc = _mm_or_si128 (realcacc, imagcacc); +++ +++ __VOLK_ATTR_ALIGNED(16) lv_8sc_t dotProductVector[8]; +++ +++ _mm_store_si128((__m128i*)dotProductVector,totalc); // Store the results back into the dot product vector +++ +++ for (int i = 0; i<8; ++i) +++ { +++ dotProduct += dotProductVector[i]; +++ } +++ } +++ +++ for (int i = 0; i<(num_points % 8); ++i) /* scalar tail for the num_points % 8 remaining samples */ +++ { +++ dotProduct += (*a++) * (*b++); +++ } +++ +++ *result = dotProduct; +++} +++ +++#endif /*LV_HAVE_SSE2*/ +++ +++#ifdef LV_HAVE_SSE4_1 +++#include "smmintrin.h" +++/*! +++ \brief Multiplies the two input complex vectors element by element and accumulates the products, storing the single accumulated value in *result (aligned loads are used) +++ \param result Pointer to the complex value where the accumulated result will be stored +++ \param input One of the vectors to be multiplied and accumulated; read with _mm_load_si128, so it must be 16-byte aligned +++ \param taps One of the vectors to be multiplied and accumulated; read with _mm_load_si128, so it must be 16-byte aligned +++ \param num_points The number of complex values in input and taps to be multiplied together, accumulated and stored into *result +++ */ +++static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_sse4_1(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points) { +++ +++ lv_8sc_t dotProduct; +++ memset(&dotProduct, 0x0, 2*sizeof(char)); +++ +++ const lv_8sc_t* a = input; +++ const lv_8sc_t* b = taps; +++ +++ const unsigned int sse_iters = num_points/8; /* 8 complex samples (16 bytes) per 128-bit register */ +++ +++ if (sse_iters>0) +++ { +++ __m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc, realcacc, imagcacc; +++ +++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); /* 0x00FF in every 16-bit lane: keeps the even (real) byte of each sample */ +++ realcacc = _mm_setzero_si128(); +++ imagcacc = _mm_setzero_si128(); +++ +++ for(int number = 0; number < sse_iters; number++){ +++ +++ x = 
_mm_load_si128((__m128i*)a); +++ y = _mm_load_si128((__m128i*)b); +++ +++ imagx = _mm_srli_si128 (x, 1); /* shift right by one byte so the imaginary bytes land on the even positions */ +++ imagx = _mm_and_si128 (imagx, mult1); +++ realx = _mm_and_si128 (x, mult1); +++ +++ imagy = _mm_srli_si128 (y, 1); +++ imagy = _mm_and_si128 (imagy, mult1); +++ realy = _mm_and_si128 (y, mult1); +++ +++ realx_mult_realy = _mm_mullo_epi16 (realx, realy); +++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); +++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); +++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); +++ +++ realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); /* (a+jb)(c+jd): real = ac - bd */ +++ imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); /* imag = ad + bc */ +++ +++ realcacc = _mm_add_epi16 (realcacc, realc); +++ imagcacc = _mm_add_epi16 (imagcacc, imagc); +++ +++ a += 8; +++ b += 8; +++ } +++ +++ imagcacc = _mm_slli_si128 (imagcacc, 1); /* move the low byte of every imaginary sum to the odd positions */ +++ +++ totalc = _mm_blendv_epi8 (imagcacc, realcacc, mult1); /* even bytes come from realcacc, odd bytes from the shifted imagcacc */ +++ +++ __VOLK_ATTR_ALIGNED(16) lv_8sc_t dotProductVector[8]; +++ +++ _mm_store_si128((__m128i*)dotProductVector,totalc); // Store the results back into the dot product vector +++ +++ for (int i = 0; i<8; ++i) +++ { +++ dotProduct += dotProductVector[i]; +++ } +++ } +++ +++ for (int i = 0; i<(num_points % 8); ++i) /* scalar tail for the num_points % 8 remaining samples */ +++ { +++ dotProduct += (*a++) * (*b++); +++ } +++ +++ *result = dotProduct; +++} +++ +++#endif /*LV_HAVE_SSE4_1*/ +++ +++#ifdef LV_HAVE_ORC +++/*!
+++ \brief Multiplies the two input complex vectors element by element and accumulates the products by delegating to the ORC implementation, storing the single accumulated value in *result +++ \param result Pointer to the complex value where the accumulated result will be stored +++ \param input One of the vectors to be multiplied and accumulated +++ \param taps One of the vectors to be multiplied and accumulated +++ \param num_points The number of complex values in input and taps to be multiplied together, accumulated and stored into *result +++ */ +++extern void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_orc_impl(short* resRealShort, short* resImagShort, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points); +++static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_orc(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points){ +++ +++ short resReal = 0; +++ char* resRealChar = (char*)&resReal; +++ resRealChar++; /* now points at the second byte (offset 1) of resReal. NOTE(review): on a little-endian host that is the high-order byte, while the SSE kernels of this file keep the low-order byte of their 16-bit sums - confirm against the ORC implementation */ +++ +++ short resImag = 0; +++ char* resImagChar = (char*)&resImag; +++ resImagChar++; /* same byte selection for the imaginary accumulator */ +++ +++ volk_gnsssdr_8ic_x2_dot_prod_8ic_a_orc_impl(&resReal, &resImag, input, taps, num_points); +++ +++ *result = lv_cmake(*resRealChar, *resImagChar); /* build the 8-bit complex result from the selected byte of each short */ +++} +++#endif /* LV_HAVE_ORC */ +++ +++#endif /*INCLUDED_volk_gnsssdr_8ic_x2_dot_prod_8ic_a_H*/ ++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_multiply_8ic.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_multiply_8ic.h ++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_multiply_8ic.h 1970-01-01 01:00:00.000000000 +0100 +++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_multiply_8ic.h 2014-10-15 01:55:08.000000000 +0200 ++@@ -0,0 +1,346 @@ +++/*! +++ * \file volk_gnsssdr_8ic_x2_multiply_8ic.h +++ * \brief Volk protokernel: multiplies two 16 bits vectors +++ * \authors
    +++ *
  • Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com +++ *
+++ * +++ * Volk protokernel that multiplies two 16 bits vectors (8 bits the real part +++ * and 8 bits the imaginary part) +++ * +++ * ------------------------------------------------------------------------- +++ * +++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) +++ * +++ * GNSS-SDR is a software defined Global Navigation +++ * Satellite Systems receiver +++ * +++ * This file is part of GNSS-SDR. +++ * +++ * GNSS-SDR is free software: you can redistribute it and/or modify +++ * it under the terms of the GNU General Public License as published by +++ * the Free Software Foundation, either version 3 of the License, or +++ * at your option) any later version. +++ * +++ * GNSS-SDR is distributed in the hope that it will be useful, +++ * but WITHOUT ANY WARRANTY; without even the implied warranty of +++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +++ * GNU General Public License for more details. +++ * +++ * You should have received a copy of the GNU General Public License +++ * along with GNSS-SDR. If not, see . +++ * +++ * ------------------------------------------------------------------------- +++ */ +++ +++#ifndef INCLUDED_volk_gnsssdr_8ic_x2_multiply_8ic_u_H +++#define INCLUDED_volk_gnsssdr_8ic_x2_multiply_8ic_u_H +++ +++#include +++#include +++#include +++ +++#ifdef LV_HAVE_SSE2 +++#include "emmintrin.h" +++/*! 
+++ \brief Multiplies the two input complex vectors element by element and stores their results in the third vector (unaligned loads and stores are used) +++ \param cVector The vector where the results will be stored +++ \param aVector One of the vectors to be multiplied +++ \param bVector One of the vectors to be multiplied +++ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector +++ */ +++static inline void volk_gnsssdr_8ic_x2_multiply_8ic_u_sse2(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ +++ +++ const unsigned int sse_iters = num_points / 8; /* 8 complex samples (16 bytes) per 128-bit register */ +++ +++ __m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc; +++ lv_8sc_t* c = cVector; +++ const lv_8sc_t* a = aVector; +++ const lv_8sc_t* b = bVector; +++ +++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); /* 0x00FF in every 16-bit lane: keeps the even (real) byte of each sample */ +++ +++ for(int number = 0;number < sse_iters; number++){ +++ +++ x = _mm_loadu_si128((__m128i*)a); /* unaligned load; _mm_loadu_si128 is the SSE2 intrinsic from emmintrin.h, whereas _mm_lddqu_si128 requires SSE3 (pmmintrin.h) */ +++ y = _mm_loadu_si128((__m128i*)b); +++ +++ imagx = _mm_srli_si128 (x, 1); /* shift right by one byte so the imaginary bytes land on the even positions */ +++ imagx = _mm_and_si128 (imagx, mult1); +++ realx = _mm_and_si128 (x, mult1); +++ +++ imagy = _mm_srli_si128 (y, 1); +++ imagy = _mm_and_si128 (imagy, mult1); +++ realy = _mm_and_si128 (y, mult1); +++ +++ realx_mult_realy = _mm_mullo_epi16 (realx, realy); +++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); +++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); +++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); +++ +++ realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); /* (a+jb)(c+jd): real = ac - bd */ +++ realc = _mm_and_si128 (realc, mult1); /* keep only the low byte of every 16-bit result */ +++ imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); /* imag = ad + bc */ +++ imagc = _mm_and_si128 (imagc, mult1); +++ imagc = _mm_slli_si128 (imagc, 1); /* move the imaginary bytes back to the odd positions */ +++ +++ totalc = _mm_or_si128 (realc, imagc); +++ +++ _mm_storeu_si128((__m128i*)c, totalc); +++ +++ a += 8; +++ b += 8; +++ c += 8; +++ } +++ +++ for (int i = 0; i<(num_points % 8); ++i) /* scalar tail for the num_points % 8 remaining samples */ +++ { 
+++ *c++ = (*a++) * (*b++); +++ } +++} +++#endif /* LV_HAVE_SSE2 */ +++ +++#ifdef LV_HAVE_SSE4_1 +++#include "smmintrin.h" +++/*! +++ \brief Multiplies the two input complex vectors element by element and stores their results in the third vector (unaligned loads and stores are used) +++ \param cVector The vector where the results will be stored +++ \param aVector One of the vectors to be multiplied +++ \param bVector One of the vectors to be multiplied +++ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector +++ */ +++static inline void volk_gnsssdr_8ic_x2_multiply_8ic_u_sse4_1(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ +++ +++ const unsigned int sse_iters = num_points / 8; /* 8 complex samples (16 bytes) per 128-bit register */ +++ +++ __m128i x, y, zero; +++ __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc; +++ lv_8sc_t* c = cVector; +++ const lv_8sc_t* a = aVector; +++ const lv_8sc_t* b = bVector; +++ +++ zero = _mm_setzero_si128(); /* NOTE(review): zero is assigned here but never read in this kernel */ +++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); /* 0x00FF in every 16-bit lane: keeps the even (real) byte of each sample */ +++ +++ for(int number = 0;number < sse_iters; number++){ +++ +++ x = _mm_lddqu_si128((__m128i*)a); +++ y = _mm_lddqu_si128((__m128i*)b); +++ +++ imagx = _mm_srli_si128 (x, 1); /* shift right by one byte so the imaginary bytes land on the even positions */ +++ imagx = _mm_and_si128 (imagx, mult1); +++ realx = _mm_and_si128 (x, mult1); +++ +++ imagy = _mm_srli_si128 (y, 1); +++ imagy = _mm_and_si128 (imagy, mult1); +++ realy = _mm_and_si128 (y, mult1); +++ +++ realx_mult_realy = _mm_mullo_epi16 (realx, realy); +++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); +++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); +++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); +++ +++ realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); /* (a+jb)(c+jd): real = ac - bd */ +++ imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); /* imag = ad + bc */ +++ imagc = _mm_slli_si128 (imagc, 1); /* move the low byte of every imaginary result to the odd positions */ +++ +++ totalc = _mm_blendv_epi8 (imagc, realc, mult1); /* even bytes come from realc, odd bytes from the shifted imagc */ +++ +++ _mm_storeu_si128((__m128i*)c, 
totalc); +++ +++ a += 8; +++ b += 8; +++ c += 8; +++ } +++ +++ for (int i = 0; i<(num_points % 8); ++i) /* scalar tail for the num_points % 8 remaining samples */ +++ { +++ *c++ = (*a++) * (*b++); +++ } +++} +++#endif /* LV_HAVE_SSE4_1 */ +++ +++#ifdef LV_HAVE_GENERIC +++/*! +++ \brief Multiplies the two input complex vectors element by element and stores their results in the third vector +++ \param cVector The vector where the results will be stored +++ \param aVector One of the vectors to be multiplied +++ \param bVector One of the vectors to be multiplied +++ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector +++ */ +++static inline void volk_gnsssdr_8ic_x2_multiply_8ic_generic(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ +++ lv_8sc_t* cPtr = cVector; +++ const lv_8sc_t* aPtr = aVector; +++ const lv_8sc_t* bPtr = bVector; +++ +++ for(int number = 0; number < num_points; number++){ +++ *cPtr++ = (*aPtr++) * (*bPtr++); +++ } +++} +++#endif /* LV_HAVE_GENERIC */ +++ +++#endif /* INCLUDED_volk_gnsssdr_8ic_x2_multiply_8ic_u_H */ +++ +++ +++#ifndef INCLUDED_volk_gnsssdr_8ic_x2_multiply_8ic_a_H +++#define INCLUDED_volk_gnsssdr_8ic_x2_multiply_8ic_a_H +++ +++#include +++#include +++#include +++ +++#ifdef LV_HAVE_SSE2 +++#include "emmintrin.h" +++/*!
+++ \brief Multiplies the two input complex vectors element by element and stores their results in the third vector (aligned loads and stores are used) +++ \param cVector The vector where the results will be stored; written with _mm_store_si128, so it must be 16-byte aligned +++ \param aVector One of the vectors to be multiplied; read with _mm_load_si128, so it must be 16-byte aligned +++ \param bVector One of the vectors to be multiplied; read with _mm_load_si128, so it must be 16-byte aligned +++ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector +++ */ +++static inline void volk_gnsssdr_8ic_x2_multiply_8ic_a_sse2(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ +++ +++ const unsigned int sse_iters = num_points / 8; /* 8 complex samples (16 bytes) per 128-bit register */ +++ +++ __m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc; +++ lv_8sc_t* c = cVector; +++ const lv_8sc_t* a = aVector; +++ const lv_8sc_t* b = bVector; +++ +++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); /* 0x00FF in every 16-bit lane: keeps the even (real) byte of each sample */ +++ +++ for(int number = 0;number < sse_iters; number++){ +++ +++ x = _mm_load_si128((__m128i*)a); +++ y = _mm_load_si128((__m128i*)b); +++ +++ imagx = _mm_srli_si128 (x, 1); /* shift right by one byte so the imaginary bytes land on the even positions */ +++ imagx = _mm_and_si128 (imagx, mult1); +++ realx = _mm_and_si128 (x, mult1); +++ +++ imagy = _mm_srli_si128 (y, 1); +++ imagy = _mm_and_si128 (imagy, mult1); +++ realy = _mm_and_si128 (y, mult1); +++ +++ realx_mult_realy = _mm_mullo_epi16 (realx, realy); +++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); +++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); +++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); +++ +++ realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); /* (a+jb)(c+jd): real = ac - bd */ +++ realc = _mm_and_si128 (realc, mult1); /* keep only the low byte of every 16-bit result */ +++ imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); /* imag = ad + bc */ +++ imagc = _mm_and_si128 (imagc, mult1); +++ imagc = _mm_slli_si128 (imagc, 1); /* move the imaginary bytes back to the odd positions */ +++ +++ totalc = _mm_or_si128 (realc, imagc); +++ +++ _mm_store_si128((__m128i*)c, totalc); +++ +++ a += 8; +++ b += 8; +++ c += 8; +++ } +++ +++ for (int i = 0; i<(num_points % 8); ++i) /* scalar tail for the num_points % 8 remaining samples */ +++ { 
+++ *c++ = (*a++) * (*b++); +++ } +++} +++#endif /* LV_HAVE_SSE2 */ +++ +++#ifdef LV_HAVE_SSE4_1 +++#include "smmintrin.h" +++/*! +++ \brief Multiplies the two input complex vectors element by element and stores their results in the third vector (aligned loads and stores are used) +++ \param cVector The vector where the results will be stored; written with _mm_store_si128, so it must be 16-byte aligned +++ \param aVector One of the vectors to be multiplied; read with _mm_load_si128, so it must be 16-byte aligned +++ \param bVector One of the vectors to be multiplied; read with _mm_load_si128, so it must be 16-byte aligned +++ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector +++ */ +++static inline void volk_gnsssdr_8ic_x2_multiply_8ic_a_sse4_1(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ +++ +++ const unsigned int sse_iters = num_points / 8; /* 8 complex samples (16 bytes) per 128-bit register */ +++ +++ __m128i x, y, zero; +++ __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc; +++ lv_8sc_t* c = cVector; +++ const lv_8sc_t* a = aVector; +++ const lv_8sc_t* b = bVector; +++ +++ zero = _mm_setzero_si128(); /* NOTE(review): zero is assigned here but never read in this kernel */ +++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); /* 0x00FF in every 16-bit lane: keeps the even (real) byte of each sample */ +++ +++ for(int number = 0;number < sse_iters; number++){ +++ +++ x = _mm_load_si128((__m128i*)a); +++ y = _mm_load_si128((__m128i*)b); +++ +++ imagx = _mm_srli_si128 (x, 1); /* shift right by one byte so the imaginary bytes land on the even positions */ +++ imagx = _mm_and_si128 (imagx, mult1); +++ realx = _mm_and_si128 (x, mult1); +++ +++ imagy = _mm_srli_si128 (y, 1); +++ imagy = _mm_and_si128 (imagy, mult1); +++ realy = _mm_and_si128 (y, mult1); +++ +++ realx_mult_realy = _mm_mullo_epi16 (realx, realy); +++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); +++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); +++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); +++ +++ realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); /* (a+jb)(c+jd): real = ac - bd */ +++ imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); /* imag = ad + bc */ +++ imagc = _mm_slli_si128 (imagc, 1); /* move the low byte of every imaginary result to the odd positions */ +++ +++ totalc = _mm_blendv_epi8 (imagc, realc, mult1); /* even bytes come from realc, odd bytes from the shifted imagc */ +++ +++ _mm_store_si128((__m128i*)c, 
totalc); +++ +++ a += 8; +++ b += 8; +++ c += 8; +++ } +++ +++ for (int i = 0; i<(num_points % 8); ++i) /* scalar tail for the num_points % 8 remaining samples */ +++ { +++ *c++ = (*a++) * (*b++); +++ } +++} +++#endif /* LV_HAVE_SSE4_1 */ +++ +++#ifdef LV_HAVE_GENERIC +++/*! +++ \brief Multiplies the two input complex vectors element by element and stores their results in the third vector +++ \param cVector The vector where the results will be stored +++ \param aVector One of the vectors to be multiplied +++ \param bVector One of the vectors to be multiplied +++ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector +++ */ +++static inline void volk_gnsssdr_8ic_x2_multiply_8ic_a_generic(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ +++ lv_8sc_t* cPtr = cVector; +++ const lv_8sc_t* aPtr = aVector; +++ const lv_8sc_t* bPtr = bVector; +++ +++ for(int number = 0; number < num_points; number++){ +++ *cPtr++ = (*aPtr++) * (*bPtr++); +++ } +++ +++} +++#endif /* LV_HAVE_GENERIC */ +++ +++#ifdef LV_HAVE_ORC +++/*!
+++ \brief Multiplies the two input complex vectors element by element and stores their results in the third vector; thin wrapper that forwards its arguments unchanged to the ORC implementation +++ \param cVector The vector where the results will be stored +++ \param aVector One of the vectors to be multiplied +++ \param bVector One of the vectors to be multiplied +++ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector +++ */ +++extern void volk_gnsssdr_8ic_x2_multiply_8ic_a_orc_impl(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points); +++static inline void volk_gnsssdr_8ic_x2_multiply_8ic_u_orc(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ +++ volk_gnsssdr_8ic_x2_multiply_8ic_a_orc_impl(cVector, aVector, bVector, num_points); +++} +++#endif /* LV_HAVE_ORC */ +++ +++#endif /* INCLUDED_volk_gnsssdr_8ic_x2_multiply_8ic_a_H */ ++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3.h ++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3.h 1970-01-01 01:00:00.000000000 +0100 +++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3.h 2014-10-15 01:55:08.000000000 +0200 ++@@ -0,0 +1,613 @@ +++/*! +++ * \file volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3.h +++ * \brief Volk protokernel: performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation with 16 bits vectors, and accumulates the results into float32. +++ * \authors
    +++ *
  • Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com +++ *
+++ * +++ * Volk protokernel that performs the carrier wipe-off mixing and the +++ * Early, Prompt, and Late correlation with 16 bits vectors (8 bits the +++ * real part and 8 bits the imaginary part), and accumulates the result +++ * in 32 bits single point values, returning float32 values: +++ * - The carrier wipe-off is done by multiplying the input signal by the +++ * carrier (multiplication of 16 bits vectors) It returns the input +++ * signal in base band (BB) +++ * - Early values are calculated by multiplying the input signal in BB by the +++ * early code (multiplication of 16 bits vectors), accumulating the results into float32 values +++ * - Prompt values are calculated by multiplying the input signal in BB by the +++ * prompt code (multiplication of 16 bits vectors), accumulating the results into float32 values +++ * - Late values are calculated by multiplying the input signal in BB by the +++ * late code (multiplication of 16 bits vectors), accumulating the results into float32 values +++ * +++ * ------------------------------------------------------------------------- +++ * +++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) +++ * +++ * GNSS-SDR is a software defined Global Navigation +++ * Satellite Systems receiver +++ * +++ * This file is part of GNSS-SDR. +++ * +++ * GNSS-SDR is free software: you can redistribute it and/or modify +++ * it under the terms of the GNU General Public License as published by +++ * the Free Software Foundation, either version 3 of the License, or +++ * at your option) any later version. +++ * +++ * GNSS-SDR is distributed in the hope that it will be useful, +++ * but WITHOUT ANY WARRANTY; without even the implied warranty of +++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +++ * GNU General Public License for more details. +++ * +++ * You should have received a copy of the GNU General Public License +++ * along with GNSS-SDR. If not, see . 
+++ * +++ * ------------------------------------------------------------------------- +++ */ +++ +++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_u_H +++#define INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_u_H +++ +++#include +++#include +++#include +++#include +++#include +++ +++#ifdef LV_HAVE_SSE4_1 +++#include "smmintrin.h" +++#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h" +++#include "CommonMacros/CommonMacros.h" +++/*! +++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation +++ \param input The input signal input +++ \param carrier The carrier signal input +++ \param E_code Early PRN code replica input +++ \param P_code Early PRN code replica input +++ \param L_code Early PRN code replica input +++ \param E_out Early correlation output +++ \param P_out Early correlation output +++ \param L_out Early correlation output +++ \param num_points The number of complex values in vectors +++ */ +++static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_u_sse4_1(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points) +++{ +++ const unsigned int sse_iters = num_points / 8; +++ +++ __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample; +++ __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output; +++ +++ __m128 E_code_acc, P_code_acc, L_code_acc; +++ __m128i input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2; +++ __m128 output_ps; +++ +++ const lv_8sc_t* input_ptr = input; +++ const lv_8sc_t* carrier_ptr = carrier; +++ +++ const lv_8sc_t* E_code_ptr = E_code; +++ lv_32fc_t* E_out_ptr = E_out; +++ const lv_8sc_t* L_code_ptr = L_code; +++ lv_32fc_t* L_out_ptr = L_out; +++ const lv_8sc_t* P_code_ptr = P_code; +++ lv_32fc_t* P_out_ptr = P_out; +++ 
+++ *E_out_ptr = 0; +++ *P_out_ptr = 0; +++ *L_out_ptr = 0; +++ +++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); +++ +++ E_code_acc = _mm_setzero_ps(); +++ L_code_acc = _mm_setzero_ps(); +++ P_code_acc = _mm_setzero_ps(); +++ +++ if (sse_iters>0) +++ { +++ for(int number = 0;number < sse_iters; number++){ +++ +++ //Perform the carrier wipe-off +++ x = _mm_lddqu_si128((__m128i*)input_ptr); +++ y = _mm_lddqu_si128((__m128i*)carrier_ptr); +++ +++ CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx) +++ CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy) +++ +++ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample) +++ +++ //Get early values +++ y = _mm_lddqu_si128((__m128i*)E_code_ptr); +++ +++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps) +++ +++ E_code_acc = _mm_add_ps (E_code_acc, output_ps); +++ +++ //Get prompt values +++ y = _mm_lddqu_si128((__m128i*)P_code_ptr); +++ +++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps) +++ +++ P_code_acc = _mm_add_ps (P_code_acc, output_ps); +++ +++ //Get late values +++ y = _mm_lddqu_si128((__m128i*)L_code_ptr); +++ +++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, 
output_i32_1, output_i32_2, output_ps) +++ +++ L_code_acc = _mm_add_ps (L_code_acc, output_ps); +++ +++ input_ptr += 8; +++ carrier_ptr += 8; +++ E_code_ptr += 8; +++ P_code_ptr += 8; +++ L_code_ptr += 8; +++ } +++ +++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2]; +++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2]; +++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2]; +++ +++ _mm_storeu_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector +++ _mm_storeu_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector +++ _mm_storeu_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector +++ +++ for (int i = 0; i<2; ++i) +++ { +++ *E_out_ptr += E_dotProductVector[i]; +++ *P_out_ptr += P_dotProductVector[i]; +++ *L_out_ptr += L_dotProductVector[i]; +++ } +++ } +++ +++ lv_8sc_t bb_signal_sample; +++ for(int i=0; i < num_points%8; ++i) +++ { +++ //Perform the carrier wipe-off +++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); +++ // Now get early, late, and prompt values for each +++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); +++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); +++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); +++ } +++} +++#endif /* LV_HAVE_SSE4_1 */ +++ +++#ifdef LV_HAVE_SSE2 +++#include "emmintrin.h" +++#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h" +++#include "CommonMacros/CommonMacros.h" +++/*! 
+++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation +++ \param input The input signal input +++ \param carrier The carrier signal input +++ \param E_code Early PRN code replica input +++ \param P_code Early PRN code replica input +++ \param L_code Early PRN code replica input +++ \param E_out Early correlation output +++ \param P_out Early correlation output +++ \param L_out Early correlation output +++ \param num_points The number of complex values in vectors +++ */ +++static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_u_sse2(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points) +++{ +++ const unsigned int sse_iters = num_points / 8; +++ +++ __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample; +++ __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output; +++ +++ __m128 E_code_acc, P_code_acc, L_code_acc; +++ __m128i input_i_1, input_i_2, output_i32; +++ __m128 output_ps_1, output_ps_2; +++ +++ const lv_8sc_t* input_ptr = input; +++ const lv_8sc_t* carrier_ptr = carrier; +++ +++ const lv_8sc_t* E_code_ptr = E_code; +++ lv_32fc_t* E_out_ptr = E_out; +++ const lv_8sc_t* L_code_ptr = L_code; +++ lv_32fc_t* L_out_ptr = L_out; +++ const lv_8sc_t* P_code_ptr = P_code; +++ lv_32fc_t* P_out_ptr = P_out; +++ +++ *E_out_ptr = 0; +++ *P_out_ptr = 0; +++ *L_out_ptr = 0; +++ +++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); +++ +++ E_code_acc = _mm_setzero_ps(); +++ L_code_acc = _mm_setzero_ps(); +++ P_code_acc = _mm_setzero_ps(); +++ +++ if (sse_iters>0) +++ { +++ for(int number = 0;number < sse_iters; number++){ +++ +++ //Perform the carrier wipe-off +++ x = _mm_lddqu_si128((__m128i*)input_ptr); +++ y = _mm_lddqu_si128((__m128i*)carrier_ptr); +++ +++ 
CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx) +++ CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy) +++ +++ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample) +++ +++ //Get early values +++ y = _mm_lddqu_si128((__m128i*)E_code_ptr); +++ +++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2) +++ +++ E_code_acc = _mm_add_ps (E_code_acc, output_ps_1); +++ E_code_acc = _mm_add_ps (E_code_acc, output_ps_2); +++ +++ //Get prompt values +++ y = _mm_lddqu_si128((__m128i*)P_code_ptr); +++ +++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2) +++ +++ P_code_acc = _mm_add_ps (P_code_acc, output_ps_1); +++ P_code_acc = _mm_add_ps (P_code_acc, output_ps_2); +++ +++ //Get late values +++ y = _mm_lddqu_si128((__m128i*)L_code_ptr); +++ +++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2) +++ +++ L_code_acc = _mm_add_ps (L_code_acc, output_ps_1); +++ L_code_acc = _mm_add_ps (L_code_acc, output_ps_2); +++ +++ input_ptr += 8; +++ carrier_ptr += 8; +++ E_code_ptr += 8; +++ P_code_ptr += 8; +++ L_code_ptr += 8; +++ } +++ +++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2]; +++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2]; +++ __VOLK_ATTR_ALIGNED(16) 
lv_32fc_t L_dotProductVector[2]; +++ +++ _mm_storeu_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector +++ _mm_storeu_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector +++ _mm_storeu_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector +++ +++ for (int i = 0; i<2; ++i) +++ { +++ *E_out_ptr += E_dotProductVector[i]; +++ *P_out_ptr += P_dotProductVector[i]; +++ *L_out_ptr += L_dotProductVector[i]; +++ } +++ } +++ +++ lv_8sc_t bb_signal_sample; +++ for(int i=0; i < num_points%8; ++i) +++ { +++ //Perform the carrier wipe-off +++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); +++ // Now get early, late, and prompt values for each +++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); +++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); +++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); +++ } +++} +++#endif /* LV_HAVE_SSE2 */ +++ +++#ifdef LV_HAVE_GENERIC +++/*! 
+++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation +++ \param input The input signal input +++ \param carrier The carrier signal input +++ \param E_code Early PRN code replica input +++ \param P_code Early PRN code replica input +++ \param L_code Early PRN code replica input +++ \param E_out Early correlation output +++ \param P_out Early correlation output +++ \param L_out Early correlation output +++ \param num_points The number of complex values in vectors +++ */ +++static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_generic(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points) +++{ +++ lv_8sc_t bb_signal_sample; +++ +++ bb_signal_sample = lv_cmake(0, 0); +++ +++ *E_out = 0; +++ *P_out = 0; +++ *L_out = 0; +++ // perform Early, Prompt and Late correlation +++ for(int i=0; i < num_points; ++i) +++ { +++ //Perform the carrier wipe-off +++ bb_signal_sample = input[i] * carrier[i]; +++ // Now get early, late, and prompt values for each +++ *E_out += (lv_32fc_t) (bb_signal_sample * E_code[i]); +++ *P_out += (lv_32fc_t) (bb_signal_sample * P_code[i]); +++ *L_out += (lv_32fc_t) (bb_signal_sample * L_code[i]); +++ } +++} +++ +++#endif /* LV_HAVE_GENERIC */ +++ +++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_u_H */ +++ +++ +++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_a_H +++#define INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_a_H +++ +++#include +++#include +++#include +++#include +++#include +++ +++#ifdef LV_HAVE_SSE4_1 +++#include "smmintrin.h" +++#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h" +++#include "CommonMacros/CommonMacros.h" +++/*! 
+++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation +++ \param input The input signal input +++ \param carrier The carrier signal input +++ \param E_code Early PRN code replica input +++ \param P_code Early PRN code replica input +++ \param L_code Early PRN code replica input +++ \param E_out Early correlation output +++ \param P_out Early correlation output +++ \param L_out Early correlation output +++ \param num_points The number of complex values in vectors +++ */ +++static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_a_sse4_1(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points) +++{ +++ const unsigned int sse_iters = num_points / 8; +++ +++ __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample; +++ __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output; +++ +++ __m128 E_code_acc, P_code_acc, L_code_acc; +++ __m128i input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2; +++ __m128 output_ps; +++ +++ const lv_8sc_t* input_ptr = input; +++ const lv_8sc_t* carrier_ptr = carrier; +++ +++ const lv_8sc_t* E_code_ptr = E_code; +++ lv_32fc_t* E_out_ptr = E_out; +++ const lv_8sc_t* L_code_ptr = L_code; +++ lv_32fc_t* L_out_ptr = L_out; +++ const lv_8sc_t* P_code_ptr = P_code; +++ lv_32fc_t* P_out_ptr = P_out; +++ +++ *E_out_ptr = 0; +++ *P_out_ptr = 0; +++ *L_out_ptr = 0; +++ +++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); +++ +++ E_code_acc = _mm_setzero_ps(); +++ L_code_acc = _mm_setzero_ps(); +++ P_code_acc = _mm_setzero_ps(); +++ +++ if (sse_iters>0) +++ { +++ for(int number = 0;number < sse_iters; number++){ +++ +++ //Perform the carrier wipe-off +++ x = _mm_load_si128((__m128i*)input_ptr); +++ y = 
_mm_load_si128((__m128i*)carrier_ptr); +++ +++ CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx) +++ CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy) +++ +++ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample) +++ +++ //Get early values +++ y = _mm_load_si128((__m128i*)E_code_ptr); +++ +++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps) +++ +++ E_code_acc = _mm_add_ps (E_code_acc, output_ps); +++ +++ //Get prompt values +++ y = _mm_load_si128((__m128i*)P_code_ptr); +++ +++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps) +++ +++ P_code_acc = _mm_add_ps (P_code_acc, output_ps); +++ +++ //Get late values +++ y = _mm_load_si128((__m128i*)L_code_ptr); +++ +++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps) +++ +++ L_code_acc = _mm_add_ps (L_code_acc, output_ps); +++ +++ input_ptr += 8; +++ carrier_ptr += 8; +++ E_code_ptr += 8; +++ P_code_ptr += 8; +++ L_code_ptr += 8; +++ } +++ +++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2]; +++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2]; +++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2]; +++ +++ 
_mm_store_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector +++ _mm_store_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector +++ _mm_store_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector +++ +++ for (int i = 0; i<2; ++i) +++ { +++ *E_out_ptr += E_dotProductVector[i]; +++ *P_out_ptr += P_dotProductVector[i]; +++ *L_out_ptr += L_dotProductVector[i]; +++ } +++ } +++ +++ lv_8sc_t bb_signal_sample; +++ for(int i=0; i < num_points%8; ++i) +++ { +++ //Perform the carrier wipe-off +++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); +++ // Now get early, late, and prompt values for each +++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); +++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); +++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); +++ } +++} +++#endif /* LV_HAVE_SSE4_1 */ +++ +++#ifdef LV_HAVE_SSE2 +++#include "emmintrin.h" +++#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h" +++#include "CommonMacros/CommonMacros.h" +++/*! 
+++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation +++ \param input The input signal input +++ \param carrier The carrier signal input +++ \param E_code Early PRN code replica input +++ \param P_code Early PRN code replica input +++ \param L_code Early PRN code replica input +++ \param E_out Early correlation output +++ \param P_out Early correlation output +++ \param L_out Early correlation output +++ \param num_points The number of complex values in vectors +++ */ +++static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_a_sse2(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points) +++{ +++ const unsigned int sse_iters = num_points / 8; +++ +++ __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample; +++ __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output; +++ +++ __m128 E_code_acc, P_code_acc, L_code_acc; +++ __m128i input_i_1, input_i_2, output_i32; +++ __m128 output_ps_1, output_ps_2; +++ +++ const lv_8sc_t* input_ptr = input; +++ const lv_8sc_t* carrier_ptr = carrier; +++ +++ const lv_8sc_t* E_code_ptr = E_code; +++ lv_32fc_t* E_out_ptr = E_out; +++ const lv_8sc_t* L_code_ptr = L_code; +++ lv_32fc_t* L_out_ptr = L_out; +++ const lv_8sc_t* P_code_ptr = P_code; +++ lv_32fc_t* P_out_ptr = P_out; +++ +++ *E_out_ptr = 0; +++ *P_out_ptr = 0; +++ *L_out_ptr = 0; +++ +++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); +++ +++ E_code_acc = _mm_setzero_ps(); +++ L_code_acc = _mm_setzero_ps(); +++ P_code_acc = _mm_setzero_ps(); +++ +++ if (sse_iters>0) +++ { +++ for(int number = 0;number < sse_iters; number++){ +++ +++ //Perform the carrier wipe-off +++ x = _mm_load_si128((__m128i*)input_ptr); +++ y = _mm_load_si128((__m128i*)carrier_ptr); +++ +++ 
CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx) +++ CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy) +++ +++ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample) +++ +++ //Get early values +++ y = _mm_load_si128((__m128i*)E_code_ptr); +++ +++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2) +++ +++ E_code_acc = _mm_add_ps (E_code_acc, output_ps_1); +++ E_code_acc = _mm_add_ps (E_code_acc, output_ps_2); +++ +++ //Get prompt values +++ y = _mm_load_si128((__m128i*)P_code_ptr); +++ +++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2) +++ +++ P_code_acc = _mm_add_ps (P_code_acc, output_ps_1); +++ P_code_acc = _mm_add_ps (P_code_acc, output_ps_2); +++ +++ //Get late values +++ y = _mm_load_si128((__m128i*)L_code_ptr); +++ +++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2) +++ +++ L_code_acc = _mm_add_ps (L_code_acc, output_ps_1); +++ L_code_acc = _mm_add_ps (L_code_acc, output_ps_2); +++ +++ input_ptr += 8; +++ carrier_ptr += 8; +++ E_code_ptr += 8; +++ P_code_ptr += 8; +++ L_code_ptr += 8; +++ } +++ +++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2]; +++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2]; +++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t 
L_dotProductVector[2]; +++ +++ _mm_store_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector +++ _mm_store_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector +++ _mm_store_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector +++ +++ for (int i = 0; i<2; ++i) +++ { +++ *E_out_ptr += E_dotProductVector[i]; +++ *P_out_ptr += P_dotProductVector[i]; +++ *L_out_ptr += L_dotProductVector[i]; +++ } +++ } +++ +++ lv_8sc_t bb_signal_sample; +++ for(int i=0; i < num_points%8; ++i) +++ { +++ //Perform the carrier wipe-off +++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); +++ // Now get early, late, and prompt values for each +++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); +++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); +++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); +++ } +++} +++#endif /* LV_HAVE_SSE2 */ +++ +++#ifdef LV_HAVE_GENERIC +++/*! 
+++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation +++ \param input The input signal input +++ \param carrier The carrier signal input +++ \param E_code Early PRN code replica input +++ \param P_code Early PRN code replica input +++ \param L_code Early PRN code replica input +++ \param E_out Early correlation output +++ \param P_out Early correlation output +++ \param L_out Early correlation output +++ \param num_points The number of complex values in vectors +++ */ +++static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_a_generic(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points) +++{ +++ lv_8sc_t bb_signal_sample; +++ +++ bb_signal_sample = lv_cmake(0, 0); +++ +++ *E_out = 0; +++ *P_out = 0; +++ *L_out = 0; +++ // perform Early, Prompt and Late correlation +++ for(int i=0; i < num_points; ++i) +++ { +++ //Perform the carrier wipe-off +++ bb_signal_sample = input[i] * carrier[i]; +++ // Now get early, late, and prompt values for each +++ *E_out += (lv_32fc_t) (bb_signal_sample * E_code[i]); +++ *P_out += (lv_32fc_t) (bb_signal_sample * P_code[i]); +++ *L_out += (lv_32fc_t) (bb_signal_sample * L_code[i]); +++ } +++} +++ +++#endif /* LV_HAVE_GENERIC */ +++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_a_H */ ++\ No newline at end of file ++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3.h ++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3.h 1970-01-01 01:00:00.000000000 +0100 +++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3.h 2014-10-15 01:55:08.000000000 +0200 ++@@ -0,0 +1,874 @@ 
+++/*! +++ * \file volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3.h +++ * \brief Volk protokernel: performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation with 16-bit vectors +++ * \authors <ul>
+++ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+++ * </ul>
+++ * +++ * Volk protokernel that performs the carrier wipe-off mixing and the +++ * Early, Prompt, and Late correlation with 16-bit vectors (8 bits for the +++ * real part and 8 bits for the imaginary part): +++ * - The carrier wipe-off is done by multiplying the input signal by the +++ * carrier (multiplication of 16-bit vectors). It returns the input +++ * signal in baseband (BB) +++ * - Early values are calculated by multiplying the input signal in BB by the +++ * early code (multiplication of 16-bit vectors), accumulating the results +++ * - Prompt values are calculated by multiplying the input signal in BB by the +++ * prompt code (multiplication of 16-bit vectors), accumulating the results +++ * - Late values are calculated by multiplying the input signal in BB by the +++ * late code (multiplication of 16-bit vectors), accumulating the results +++ * +++ * ------------------------------------------------------------------------- +++ * +++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) +++ * +++ * GNSS-SDR is a software defined Global Navigation +++ * Satellite Systems receiver +++ * +++ * This file is part of GNSS-SDR. +++ * +++ * GNSS-SDR is free software: you can redistribute it and/or modify +++ * it under the terms of the GNU General Public License as published by +++ * the Free Software Foundation, either version 3 of the License, or +++ * (at your option) any later version. +++ * +++ * GNSS-SDR is distributed in the hope that it will be useful, +++ * but WITHOUT ANY WARRANTY; without even the implied warranty of +++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +++ * GNU General Public License for more details. +++ * +++ * You should have received a copy of the GNU General Public License +++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+++ * +++ * ------------------------------------------------------------------------- +++ */ +++ +++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_u_H +++#define INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_u_H +++ +++#include +++#include +++#include +++#include +++#include +++ +++#ifdef LV_HAVE_SSE4_1 +++#include "smmintrin.h" +++ /*! +++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation +++ \param input The input signal input +++ \param carrier The carrier signal input +++ \param E_code Early PRN code replica input +++ \param P_code Early PRN code replica input +++ \param L_code Early PRN code replica input +++ \param E_out Early correlation output +++ \param P_out Early correlation output +++ \param L_out Early correlation output +++ \param num_points The number of complex values in vectors +++ */ +++static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_u_sse4_1(lv_8sc_t* E_out, lv_8sc_t* P_out, lv_8sc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points) +++{ +++ const unsigned int sse_iters = num_points / 8; +++ +++ __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample, real_E_code_acc, imag_E_code_acc, real_L_code_acc, imag_L_code_acc, real_P_code_acc, imag_P_code_acc; +++ __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output; +++ +++ const lv_8sc_t* input_ptr = input; +++ const lv_8sc_t* carrier_ptr = carrier; +++ +++ const lv_8sc_t* E_code_ptr = E_code; +++ lv_8sc_t* E_out_ptr = E_out; +++ const lv_8sc_t* L_code_ptr = L_code; +++ lv_8sc_t* L_out_ptr = L_out; +++ const lv_8sc_t* P_code_ptr = P_code; +++ lv_8sc_t* P_out_ptr = P_out; +++ +++ *E_out_ptr = 0; +++ *P_out_ptr = 0; +++ *L_out_ptr = 0; +++ +++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); +++ 
+++ real_E_code_acc = _mm_setzero_si128(); +++ imag_E_code_acc = _mm_setzero_si128(); +++ real_L_code_acc = _mm_setzero_si128(); +++ imag_L_code_acc = _mm_setzero_si128(); +++ real_P_code_acc = _mm_setzero_si128(); +++ imag_P_code_acc = _mm_setzero_si128(); +++ +++ if (sse_iters>0) +++ { +++ for(int number = 0;number < sse_iters; number++){ +++ +++ //Perform the carrier wipe-off +++ x = _mm_lddqu_si128((__m128i*)input_ptr); +++ y = _mm_lddqu_si128((__m128i*)carrier_ptr); +++ +++ imagx = _mm_srli_si128 (x, 1); +++ imagx = _mm_and_si128 (imagx, mult1); +++ realx = _mm_and_si128 (x, mult1); +++ +++ imagy = _mm_srli_si128 (y, 1); +++ imagy = _mm_and_si128 (imagy, mult1); +++ realy = _mm_and_si128 (y, mult1); +++ +++ realx_mult_realy = _mm_mullo_epi16 (realx, realy); +++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); +++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); +++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); +++ +++ real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); +++ imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); +++ +++ //Get early values +++ y = _mm_lddqu_si128((__m128i*)E_code_ptr); +++ +++ imagy = _mm_srli_si128 (y, 1); +++ imagy = _mm_and_si128 (imagy, mult1); +++ realy = _mm_and_si128 (y, mult1); +++ +++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); +++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); +++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); +++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); +++ +++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); +++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); +++ +++ real_E_code_acc = _mm_add_epi16 (real_E_code_acc, real_output); +++ imag_E_code_acc = _mm_add_epi16 (imag_E_code_acc, imag_output); +++ +++ //Get late values +++ y = _mm_lddqu_si128((__m128i*)L_code_ptr); +++ +++ imagy = _mm_srli_si128 (y, 1); +++ imagy = 
_mm_and_si128 (imagy, mult1); +++ realy = _mm_and_si128 (y, mult1); +++ +++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); +++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); +++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); +++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); +++ +++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); +++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); +++ +++ real_L_code_acc = _mm_add_epi16 (real_L_code_acc, real_output); +++ imag_L_code_acc = _mm_add_epi16 (imag_L_code_acc, imag_output); +++ +++ //Get prompt values +++ y = _mm_lddqu_si128((__m128i*)P_code_ptr); +++ +++ imagy = _mm_srli_si128 (y, 1); +++ imagy = _mm_and_si128 (imagy, mult1); +++ realy = _mm_and_si128 (y, mult1); +++ +++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); +++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); +++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); +++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); +++ +++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); +++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); +++ +++ real_P_code_acc = _mm_add_epi16 (real_P_code_acc, real_output); +++ imag_P_code_acc = _mm_add_epi16 (imag_P_code_acc, imag_output); +++ +++ input_ptr += 8; +++ carrier_ptr += 8; +++ E_code_ptr += 8; +++ L_code_ptr += 8; +++ P_code_ptr += 8; +++ } +++ +++ __VOLK_ATTR_ALIGNED(16) lv_8sc_t E_dotProductVector[8]; +++ __VOLK_ATTR_ALIGNED(16) lv_8sc_t L_dotProductVector[8]; +++ __VOLK_ATTR_ALIGNED(16) lv_8sc_t P_dotProductVector[8]; +++ +++ imag_E_code_acc = _mm_slli_si128 (imag_E_code_acc, 1); +++ output = _mm_blendv_epi8 (imag_E_code_acc, real_E_code_acc, mult1); +++ _mm_storeu_si128((__m128i*)E_dotProductVector, output); +++ +++ imag_L_code_acc = _mm_slli_si128 (imag_L_code_acc, 1); +++ output = _mm_blendv_epi8 
(imag_L_code_acc, real_L_code_acc, mult1);
+++        _mm_storeu_si128((__m128i*)L_dotProductVector, output);
+++
+++        imag_P_code_acc = _mm_slli_si128 (imag_P_code_acc, 1);
+++        output = _mm_blendv_epi8 (imag_P_code_acc, real_P_code_acc, mult1);
+++        _mm_storeu_si128((__m128i*)P_dotProductVector, output);
+++
+++        for (int i = 0; i<8; ++i)
+++        {
+++            *E_out_ptr += E_dotProductVector[i];
+++            *L_out_ptr += L_dotProductVector[i];
+++            *P_out_ptr += P_dotProductVector[i];
+++        }
+++    }
+++
+++    lv_8sc_t bb_signal_sample;
+++    for(int i=0; i < num_points%8; ++i)
+++    {
+++        //Perform the carrier wipe-off
+++        bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
+++        // Now get early, late, and prompt values for each
+++        *E_out_ptr += bb_signal_sample * (*E_code_ptr++);
+++        *P_out_ptr += bb_signal_sample * (*P_code_ptr++);
+++        *L_out_ptr += bb_signal_sample * (*L_code_ptr++);
+++    }
+++}
+++
+++#endif /* LV_HAVE_SSE4_1 */
+++
+++#ifdef LV_HAVE_SSE2
+++#include "emmintrin.h"
+++/*!
+++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (SSE2, unaligned loads)
+++ \param E_out Early correlation output (a single complex value is written)
+++ \param P_out Prompt correlation output (a single complex value is written)
+++ \param L_out Late correlation output (a single complex value is written)
+++ \param input The input signal input
+++ \param carrier The carrier signal input
+++ \param E_code Early PRN code replica input
+++ \param P_code Prompt PRN code replica input
+++ \param L_code Late PRN code replica input
+++ \param num_points The number of complex values in vectors
+++
+++ Eight complex samples are processed per iteration; the remaining
+++ num_points % 8 samples are handled by a scalar loop. Only the low byte of
+++ every 16-bit accumulator lane is kept when the result is packed back.
+++ */
+++static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_u_sse2(lv_8sc_t* E_out, lv_8sc_t* P_out, lv_8sc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points)
+++{
+++    const unsigned int sse_iters = num_points / 8;   /* full groups of 8 complex samples */
+++    const unsigned int tail_points = num_points % 8; /* leftovers for the scalar loop */
+++    unsigned int n, i;
+++
+++    /* 0x00FF in every 16-bit lane: keeps the low byte of each lane */
+++    const __m128i low_byte_mask = _mm_set1_epi16(0x00FF);
+++
+++    __m128i packed, re_a, im_a, re_b, im_b, bb_re, bb_im;
+++    __m128i E_re = _mm_setzero_si128();
+++    __m128i E_im = _mm_setzero_si128();
+++    __m128i L_re = _mm_setzero_si128();
+++    __m128i L_im = _mm_setzero_si128();
+++    __m128i P_re = _mm_setzero_si128();
+++    __m128i P_im = _mm_setzero_si128();
+++
+++    const lv_8sc_t* input_ptr = input;
+++    const lv_8sc_t* carrier_ptr = carrier;
+++    const lv_8sc_t* E_code_ptr = E_code;
+++    const lv_8sc_t* L_code_ptr = L_code;
+++    const lv_8sc_t* P_code_ptr = P_code;
+++
+++    *E_out = 0;
+++    *P_out = 0;
+++    *L_out = 0;
+++
+++    if (sse_iters > 0)
+++    {
+++        __VOLK_ATTR_ALIGNED(16) lv_8sc_t E_dotProductVector[8];
+++        __VOLK_ATTR_ALIGNED(16) lv_8sc_t L_dotProductVector[8];
+++        __VOLK_ATTR_ALIGNED(16) lv_8sc_t P_dotProductVector[8];
+++
+++        for (n = 0; n < sse_iters; ++n)
+++        {
+++            /* Carrier wipe-off: split (real, imag) bytes into 16-bit lanes and
+++               take the complex product input * carrier.
+++               _mm_loadu_si128 is the SSE2 unaligned load; _mm_lddqu_si128
+++               would require SSE3 (pmmintrin.h). */
+++            packed = _mm_loadu_si128((const __m128i*)input_ptr);
+++            re_a = _mm_and_si128(packed, low_byte_mask);
+++            im_a = _mm_and_si128(_mm_srli_si128(packed, 1), low_byte_mask);
+++
+++            packed = _mm_loadu_si128((const __m128i*)carrier_ptr);
+++            re_b = _mm_and_si128(packed, low_byte_mask);
+++            im_b = _mm_and_si128(_mm_srli_si128(packed, 1), low_byte_mask);
+++
+++            bb_re = _mm_sub_epi16(_mm_mullo_epi16(re_a, re_b), _mm_mullo_epi16(im_a, im_b));
+++            bb_im = _mm_add_epi16(_mm_mullo_epi16(re_a, im_b), _mm_mullo_epi16(im_a, re_b));
+++
+++            /* Early branch */
+++            packed = _mm_loadu_si128((const __m128i*)E_code_ptr);
+++            re_b = _mm_and_si128(packed, low_byte_mask);
+++            im_b = _mm_and_si128(_mm_srli_si128(packed, 1), low_byte_mask);
+++            E_re = _mm_add_epi16(E_re, _mm_sub_epi16(_mm_mullo_epi16(bb_re, re_b), _mm_mullo_epi16(bb_im, im_b)));
+++            E_im = _mm_add_epi16(E_im, _mm_add_epi16(_mm_mullo_epi16(bb_re, im_b), _mm_mullo_epi16(bb_im, re_b)));
+++
+++            /* Late branch */
+++            packed = _mm_loadu_si128((const __m128i*)L_code_ptr);
+++            re_b = _mm_and_si128(packed, low_byte_mask);
+++            im_b = _mm_and_si128(_mm_srli_si128(packed, 1), low_byte_mask);
+++            L_re = _mm_add_epi16(L_re, _mm_sub_epi16(_mm_mullo_epi16(bb_re, re_b), _mm_mullo_epi16(bb_im, im_b)));
+++            L_im = _mm_add_epi16(L_im, _mm_add_epi16(_mm_mullo_epi16(bb_re, im_b), _mm_mullo_epi16(bb_im, re_b)));
+++
+++            /* Prompt branch */
+++            packed = _mm_loadu_si128((const __m128i*)P_code_ptr);
+++            re_b = _mm_and_si128(packed, low_byte_mask);
+++            im_b = _mm_and_si128(_mm_srli_si128(packed, 1), low_byte_mask);
+++            P_re = _mm_add_epi16(P_re, _mm_sub_epi16(_mm_mullo_epi16(bb_re, re_b), _mm_mullo_epi16(bb_im, im_b)));
+++            P_im = _mm_add_epi16(P_im, _mm_add_epi16(_mm_mullo_epi16(bb_re, im_b), _mm_mullo_epi16(bb_im, re_b)));
+++
+++            input_ptr += 8;
+++            carrier_ptr += 8;
+++            E_code_ptr += 8;
+++            L_code_ptr += 8;
+++            P_code_ptr += 8;
+++        }
+++
+++        /* Keep the low byte of each accumulator lane and re-interleave the
+++           results as (real, imag) byte pairs. */
+++        _mm_storeu_si128((__m128i*)E_dotProductVector,
+++            _mm_or_si128(_mm_and_si128(E_re, low_byte_mask),
+++                         _mm_slli_si128(_mm_and_si128(E_im, low_byte_mask), 1)));
+++        _mm_storeu_si128((__m128i*)L_dotProductVector,
+++            _mm_or_si128(_mm_and_si128(L_re, low_byte_mask),
+++                         _mm_slli_si128(_mm_and_si128(L_im, low_byte_mask), 1)));
+++        _mm_storeu_si128((__m128i*)P_dotProductVector,
+++            _mm_or_si128(_mm_and_si128(P_re, low_byte_mask),
+++                         _mm_slli_si128(_mm_and_si128(P_im, low_byte_mask), 1)));
+++
+++        /* Horizontal sum of the 8 partial results of each branch */
+++        for (i = 0; i < 8; ++i)
+++        {
+++            *E_out += E_dotProductVector[i];
+++            *L_out += L_dotProductVector[i];
+++            *P_out += P_dotProductVector[i];
+++        }
+++    }
+++
+++    /* Scalar tail: samples that did not fill a whole SSE register */
+++    for (i = 0; i < tail_points; ++i)
+++    {
+++        /* Perform the carrier wipe-off */
+++        const lv_8sc_t bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
+++        /* Now get early, prompt, and late values */
+++        *E_out += bb_signal_sample * (*E_code_ptr++);
+++        *P_out += bb_signal_sample * (*P_code_ptr++);
+++        *L_out += bb_signal_sample * (*L_code_ptr++);
+++    }
+++}
+++
+++#endif /* LV_HAVE_SSE2 */
+++
+++#ifdef LV_HAVE_GENERIC
+++/*!
+++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (generic C version)
+++ \param E_out Early correlation output (a single complex value is written)
+++ \param P_out Prompt correlation output (a single complex value is written)
+++ \param L_out Late correlation output (a single complex value is written)
+++ \param input The input signal input
+++ \param carrier The carrier signal input
+++ \param E_code Early PRN code replica input
+++ \param P_code Prompt PRN code replica input
+++ \param L_code Late PRN code replica input
+++ \param num_points The number of complex values in vectors
+++
+++ The three outputs are reset to zero and then accumulated directly in
+++ lv_8sc_t arithmetic, one input sample at a time.
+++ */
+++static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_generic(lv_8sc_t* E_out, lv_8sc_t* P_out, lv_8sc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points)
+++{
+++    /* Same type as num_points: avoids a signed/unsigned comparison and
+++       signed overflow of the index for very large vectors. */
+++    unsigned int i;
+++
+++    *E_out = 0;
+++    *P_out = 0;
+++    *L_out = 0;
+++
+++    /* Perform Early, Prompt and Late correlation */
+++    for (i = 0; i < num_points; ++i)
+++    {
+++        /* Carrier wipe-off: mix the input sample with the local carrier */
+++        const lv_8sc_t bb_signal_sample = input[i] * carrier[i];
+++
+++        /* Correlate the baseband sample with each code replica */
+++        *E_out += bb_signal_sample * E_code[i];
+++        *P_out += bb_signal_sample * P_code[i];
+++        *L_out += bb_signal_sample * L_code[i];
+++    }
+++}
+++
+++#endif /* LV_HAVE_GENERIC */
+++
+++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_u_H */
+++
+++
+++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_a_H
+++#define INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_a_H
+++
+++#include
+++#include
+++#include
+++#include
+++#include
+++
+++#ifdef LV_HAVE_SSE4_1
+++#include "smmintrin.h"
+++/*!
+++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (SSE4.1, aligned loads)
+++ \param E_out Early correlation output (a single complex value is written)
+++ \param P_out Prompt correlation output (a single complex value is written)
+++ \param L_out Late correlation output (a single complex value is written)
+++ \param input The input signal input
+++ \param carrier The carrier signal input
+++ \param E_code Early PRN code replica input
+++ \param P_code Prompt PRN code replica input
+++ \param L_code Late PRN code replica input
+++ \param num_points The number of complex values in vectors
+++
+++ Uses _mm_load_si128, so input, carrier and the three code buffers are
+++ expected to be 16-byte aligned. Eight complex samples are processed per
+++ iteration; the remaining num_points % 8 samples go through a scalar loop.
+++ */
+++static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_a_sse4_1(lv_8sc_t* E_out, lv_8sc_t* P_out, lv_8sc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points)
+++{
+++    const unsigned int sse_iters = num_points / 8;
+++    const unsigned int tail_points = num_points % 8;
+++    unsigned int n, i;
+++
+++    /* 0x00FF in every 16-bit lane: keeps the low byte of each lane and
+++       doubles as the byte selector for the final blend. */
+++    const __m128i low_byte_mask = _mm_set1_epi16(0x00FF);
+++
+++    __m128i packed, re_a, im_a, re_b, im_b, bb_re, bb_im;
+++    __m128i E_re = _mm_setzero_si128();
+++    __m128i E_im = _mm_setzero_si128();
+++    __m128i L_re = _mm_setzero_si128();
+++    __m128i L_im = _mm_setzero_si128();
+++    __m128i P_re = _mm_setzero_si128();
+++    __m128i P_im = _mm_setzero_si128();
+++
+++    const lv_8sc_t* input_ptr = input;
+++    const lv_8sc_t* carrier_ptr = carrier;
+++    const lv_8sc_t* E_code_ptr = E_code;
+++    const lv_8sc_t* L_code_ptr = L_code;
+++    const lv_8sc_t* P_code_ptr = P_code;
+++
+++    *E_out = 0;
+++    *P_out = 0;
+++    *L_out = 0;
+++
+++    if (sse_iters > 0)
+++    {
+++        __VOLK_ATTR_ALIGNED(16) lv_8sc_t E_dotProductVector[8];
+++        __VOLK_ATTR_ALIGNED(16) lv_8sc_t L_dotProductVector[8];
+++        __VOLK_ATTR_ALIGNED(16) lv_8sc_t P_dotProductVector[8];
+++
+++        for (n = 0; n < sse_iters; ++n)
+++        {
+++            /* Carrier wipe-off: split (real, imag) bytes into 16-bit lanes
+++               and take the complex product input * carrier. */
+++            packed = _mm_load_si128((const __m128i*)input_ptr);
+++            re_a = _mm_and_si128(packed, low_byte_mask);
+++            im_a = _mm_and_si128(_mm_srli_si128(packed, 1), low_byte_mask);
+++
+++            packed = _mm_load_si128((const __m128i*)carrier_ptr);
+++            re_b = _mm_and_si128(packed, low_byte_mask);
+++            im_b = _mm_and_si128(_mm_srli_si128(packed, 1), low_byte_mask);
+++
+++            bb_re = _mm_sub_epi16(_mm_mullo_epi16(re_a, re_b), _mm_mullo_epi16(im_a, im_b));
+++            bb_im = _mm_add_epi16(_mm_mullo_epi16(re_a, im_b), _mm_mullo_epi16(im_a, re_b));
+++
+++            /* Early branch */
+++            packed = _mm_load_si128((const __m128i*)E_code_ptr);
+++            re_b = _mm_and_si128(packed, low_byte_mask);
+++            im_b = _mm_and_si128(_mm_srli_si128(packed, 1), low_byte_mask);
+++            E_re = _mm_add_epi16(E_re, _mm_sub_epi16(_mm_mullo_epi16(bb_re, re_b), _mm_mullo_epi16(bb_im, im_b)));
+++            E_im = _mm_add_epi16(E_im, _mm_add_epi16(_mm_mullo_epi16(bb_re, im_b), _mm_mullo_epi16(bb_im, re_b)));
+++
+++            /* Late branch */
+++            packed = _mm_load_si128((const __m128i*)L_code_ptr);
+++            re_b = _mm_and_si128(packed, low_byte_mask);
+++            im_b = _mm_and_si128(_mm_srli_si128(packed, 1), low_byte_mask);
+++            L_re = _mm_add_epi16(L_re, _mm_sub_epi16(_mm_mullo_epi16(bb_re, re_b), _mm_mullo_epi16(bb_im, im_b)));
+++            L_im = _mm_add_epi16(L_im, _mm_add_epi16(_mm_mullo_epi16(bb_re, im_b), _mm_mullo_epi16(bb_im, re_b)));
+++
+++            /* Prompt branch */
+++            packed = _mm_load_si128((const __m128i*)P_code_ptr);
+++            re_b = _mm_and_si128(packed, low_byte_mask);
+++            im_b = _mm_and_si128(_mm_srli_si128(packed, 1), low_byte_mask);
+++            P_re = _mm_add_epi16(P_re, _mm_sub_epi16(_mm_mullo_epi16(bb_re, re_b), _mm_mullo_epi16(bb_im, im_b)));
+++            P_im = _mm_add_epi16(P_im, _mm_add_epi16(_mm_mullo_epi16(bb_re, im_b), _mm_mullo_epi16(bb_im, re_b)));
+++
+++            input_ptr += 8;
+++            carrier_ptr += 8;
+++            E_code_ptr += 8;
+++            L_code_ptr += 8;
+++            P_code_ptr += 8;
+++        }
+++
+++        /* Re-interleave: even bytes come from the real accumulator, odd
+++           bytes from the imaginary accumulator shifted up by one byte. */
+++        _mm_store_si128((__m128i*)E_dotProductVector,
+++            _mm_blendv_epi8(_mm_slli_si128(E_im, 1), E_re, low_byte_mask));
+++        _mm_store_si128((__m128i*)L_dotProductVector,
+++            _mm_blendv_epi8(_mm_slli_si128(L_im, 1), L_re, low_byte_mask));
+++        _mm_store_si128((__m128i*)P_dotProductVector,
+++            _mm_blendv_epi8(_mm_slli_si128(P_im, 1), P_re, low_byte_mask));
+++
+++        /* Horizontal sum of the 8 partial results of each branch */
+++        for (i = 0; i < 8; ++i)
+++        {
+++            *E_out += E_dotProductVector[i];
+++            *L_out += L_dotProductVector[i];
+++            *P_out += P_dotProductVector[i];
+++        }
+++    }
+++
+++    /* Scalar tail: samples that did not fill a whole SSE register */
+++    for (i = 0; i < tail_points; ++i)
+++    {
+++        /* Perform the carrier wipe-off */
+++        const lv_8sc_t bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
+++        /* Now get early, prompt, and late values */
+++        *E_out += bb_signal_sample * (*E_code_ptr++);
+++        *P_out += bb_signal_sample * (*P_code_ptr++);
+++        *L_out += bb_signal_sample * (*L_code_ptr++);
+++    }
+++}
+++
+++#endif /* LV_HAVE_SSE4_1 */
+++
+++#ifdef LV_HAVE_SSE2
+++#include "emmintrin.h"
+++/*!
+++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (SSE2, aligned loads)
+++ \param E_out Early correlation output (a single complex value is written)
+++ \param P_out Prompt correlation output (a single complex value is written)
+++ \param L_out Late correlation output (a single complex value is written)
+++ \param input The input signal input
+++ \param carrier The carrier signal input
+++ \param E_code Early PRN code replica input
+++ \param P_code Prompt PRN code replica input
+++ \param L_code Late PRN code replica input
+++ \param num_points The number of complex values in vectors
+++
+++ Uses _mm_load_si128, so input, carrier and the three code buffers are
+++ expected to be 16-byte aligned. Only the low byte of every 16-bit
+++ accumulator lane is kept when the result is packed back.
+++ */
+++static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_a_sse2(lv_8sc_t* E_out, lv_8sc_t* P_out, lv_8sc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points)
+++{
+++    const unsigned int sse_iters = num_points / 8;
+++    const unsigned int tail_points = num_points % 8;
+++    unsigned int n, i;
+++
+++    /* 0x00FF in every 16-bit lane: keeps the low byte of each lane */
+++    const __m128i low_byte_mask = _mm_set1_epi16(0x00FF);
+++
+++    __m128i packed, re_a, im_a, re_b, im_b, bb_re, bb_im;
+++    __m128i E_re = _mm_setzero_si128();
+++    __m128i E_im = _mm_setzero_si128();
+++    __m128i L_re = _mm_setzero_si128();
+++    __m128i L_im = _mm_setzero_si128();
+++    __m128i P_re = _mm_setzero_si128();
+++    __m128i P_im = _mm_setzero_si128();
+++
+++    const lv_8sc_t* input_ptr = input;
+++    const lv_8sc_t* carrier_ptr = carrier;
+++    const lv_8sc_t* E_code_ptr = E_code;
+++    const lv_8sc_t* L_code_ptr = L_code;
+++    const lv_8sc_t* P_code_ptr = P_code;
+++
+++    *E_out = 0;
+++    *P_out = 0;
+++    *L_out = 0;
+++
+++    if (sse_iters > 0)
+++    {
+++        __VOLK_ATTR_ALIGNED(16) lv_8sc_t E_dotProductVector[8];
+++        __VOLK_ATTR_ALIGNED(16) lv_8sc_t L_dotProductVector[8];
+++        __VOLK_ATTR_ALIGNED(16) lv_8sc_t P_dotProductVector[8];
+++
+++        for (n = 0; n < sse_iters; ++n)
+++        {
+++            /* Carrier wipe-off: split (real, imag) bytes into 16-bit lanes
+++               and take the complex product input * carrier. */
+++            packed = _mm_load_si128((const __m128i*)input_ptr);
+++            re_a = _mm_and_si128(packed, low_byte_mask);
+++            im_a = _mm_and_si128(_mm_srli_si128(packed, 1), low_byte_mask);
+++
+++            packed = _mm_load_si128((const __m128i*)carrier_ptr);
+++            re_b = _mm_and_si128(packed, low_byte_mask);
+++            im_b = _mm_and_si128(_mm_srli_si128(packed, 1), low_byte_mask);
+++
+++            bb_re = _mm_sub_epi16(_mm_mullo_epi16(re_a, re_b), _mm_mullo_epi16(im_a, im_b));
+++            bb_im = _mm_add_epi16(_mm_mullo_epi16(re_a, im_b), _mm_mullo_epi16(im_a, re_b));
+++
+++            /* Early branch */
+++            packed = _mm_load_si128((const __m128i*)E_code_ptr);
+++            re_b = _mm_and_si128(packed, low_byte_mask);
+++            im_b = _mm_and_si128(_mm_srli_si128(packed, 1), low_byte_mask);
+++            E_re = _mm_add_epi16(E_re, _mm_sub_epi16(_mm_mullo_epi16(bb_re, re_b), _mm_mullo_epi16(bb_im, im_b)));
+++            E_im = _mm_add_epi16(E_im, _mm_add_epi16(_mm_mullo_epi16(bb_re, im_b), _mm_mullo_epi16(bb_im, re_b)));
+++
+++            /* Late branch */
+++            packed = _mm_load_si128((const __m128i*)L_code_ptr);
+++            re_b = _mm_and_si128(packed, low_byte_mask);
+++            im_b = _mm_and_si128(_mm_srli_si128(packed, 1), low_byte_mask);
+++            L_re = _mm_add_epi16(L_re, _mm_sub_epi16(_mm_mullo_epi16(bb_re, re_b), _mm_mullo_epi16(bb_im, im_b)));
+++            L_im = _mm_add_epi16(L_im, _mm_add_epi16(_mm_mullo_epi16(bb_re, im_b), _mm_mullo_epi16(bb_im, re_b)));
+++
+++            /* Prompt branch */
+++            packed = _mm_load_si128((const __m128i*)P_code_ptr);
+++            re_b = _mm_and_si128(packed, low_byte_mask);
+++            im_b = _mm_and_si128(_mm_srli_si128(packed, 1), low_byte_mask);
+++            P_re = _mm_add_epi16(P_re, _mm_sub_epi16(_mm_mullo_epi16(bb_re, re_b), _mm_mullo_epi16(bb_im, im_b)));
+++            P_im = _mm_add_epi16(P_im, _mm_add_epi16(_mm_mullo_epi16(bb_re, im_b), _mm_mullo_epi16(bb_im, re_b)));
+++
+++            input_ptr += 8;
+++            carrier_ptr += 8;
+++            E_code_ptr += 8;
+++            L_code_ptr += 8;
+++            P_code_ptr += 8;
+++        }
+++
+++        /* Keep the low byte of each accumulator lane and re-interleave the
+++           results as (real, imag) byte pairs. */
+++        _mm_store_si128((__m128i*)E_dotProductVector,
+++            _mm_or_si128(_mm_and_si128(E_re, low_byte_mask),
+++                         _mm_slli_si128(_mm_and_si128(E_im, low_byte_mask), 1)));
+++        _mm_store_si128((__m128i*)L_dotProductVector,
+++            _mm_or_si128(_mm_and_si128(L_re, low_byte_mask),
+++                         _mm_slli_si128(_mm_and_si128(L_im, low_byte_mask), 1)));
+++        _mm_store_si128((__m128i*)P_dotProductVector,
+++            _mm_or_si128(_mm_and_si128(P_re, low_byte_mask),
+++                         _mm_slli_si128(_mm_and_si128(P_im, low_byte_mask), 1)));
+++
+++        /* Horizontal sum of the 8 partial results of each branch */
+++        for (i = 0; i < 8; ++i)
+++        {
+++            *E_out += E_dotProductVector[i];
+++            *L_out += L_dotProductVector[i];
+++            *P_out += P_dotProductVector[i];
+++        }
+++    }
+++
+++    /* Scalar tail: samples that did not fill a whole SSE register */
+++    for (i = 0; i < tail_points; ++i)
+++    {
+++        /* Perform the carrier wipe-off */
+++        const lv_8sc_t bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
+++        /* Now get early, prompt, and late values */
+++        *E_out += bb_signal_sample * (*E_code_ptr++);
+++        *P_out += bb_signal_sample * (*P_code_ptr++);
+++        *L_out += bb_signal_sample * (*L_code_ptr++);
+++    }
+++}
+++
+++#endif /* LV_HAVE_SSE2 */
+++
+++#ifdef LV_HAVE_GENERIC
+++/*!
+++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (generic C version)
+++ \param E_out Early correlation output (a single complex value is written)
+++ \param P_out Prompt correlation output (a single complex value is written)
+++ \param L_out Late correlation output (a single complex value is written)
+++ \param input The input signal input
+++ \param carrier The carrier signal input
+++ \param E_code Early PRN code replica input
+++ \param P_code Prompt PRN code replica input
+++ \param L_code Late PRN code replica input
+++ \param num_points The number of complex values in vectors
+++
+++ The three outputs are reset to zero and then accumulated directly in
+++ lv_8sc_t arithmetic, one input sample at a time.
+++ */
+++static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_a_generic(lv_8sc_t* E_out, lv_8sc_t* P_out, lv_8sc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points)
+++{
+++    /* Same type as num_points: avoids a signed/unsigned comparison and
+++       signed overflow of the index for very large vectors. */
+++    unsigned int i;
+++
+++    *E_out = 0;
+++    *P_out = 0;
+++    *L_out = 0;
+++
+++    /* Perform Early, Prompt and Late correlation */
+++    for (i = 0; i < num_points; ++i)
+++    {
+++        /* Carrier wipe-off: mix the input sample with the local carrier */
+++        const lv_8sc_t bb_signal_sample = input[i] * carrier[i];
+++
+++        /* Correlate the baseband sample with each code replica */
+++        *E_out += bb_signal_sample * E_code[i];
+++        *P_out += bb_signal_sample * P_code[i];
+++        *L_out += bb_signal_sample * L_code[i];
+++    }
+++}
+++
+++#endif /* LV_HAVE_GENERIC */
+++
+++#ifdef LV_HAVE_ORC
+++/*!
+++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (ORC version)
+++ \param E_out Early correlation output (a single complex value is written)
+++ \param P_out Prompt correlation output (a single complex value is written)
+++ \param L_out Late correlation output (a single complex value is written)
+++ \param input The input signal input
+++ \param carrier The carrier signal input
+++ \param E_code Early PRN code replica input
+++ \param P_code Prompt PRN code replica input
+++ \param L_code Late PRN code replica input
+++ \param num_points The number of complex values in vectors
+++
+++ The work is delegated to two external ORC routines that fill 16-bit
+++ accumulators; the byte at offset 1 of each accumulator becomes the
+++ corresponding 8-bit output component.
+++ NOTE(review): offset 1 is the most significant byte only on little-endian
+++ targets - confirm this is what the ORC programs are meant to deliver.
+++ */
+++
+++extern void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_first_a_orc_impl(short* E_out_real, short* E_out_imag, short* P_out_real, short* P_out_imag, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, unsigned int num_points);
+++extern void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_second_a_orc_impl(short* L_out_real, short* L_out_imag, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* L_code, unsigned int num_points);
+++static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_u_orc(lv_8sc_t* E_out, lv_8sc_t* P_out, lv_8sc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points)
+++{
+++    /* 16-bit accumulators written by the ORC routines */
+++    short E_acc_re = 0, E_acc_im = 0;
+++    short P_acc_re = 0, P_acc_im = 0;
+++    short L_acc_re = 0, L_acc_im = 0;
+++
+++    /* The ORC implementation of 8ic_x5_cw_epl_corr_8ic_x3 is split into two
+++       routines because a single routine of that length gave memory problems
+++       (bad access, segmentation fault), and because at most 4 accumulators
+++       can be used while 6 are needed. As a consequence the carrier wipe-off
+++       is computed twice, once per routine; a single routine would be quicker
+++       since the wipe-off would be done only once. */
+++    volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_first_a_orc_impl(&E_acc_re, &E_acc_im, &P_acc_re, &P_acc_im, input, carrier, E_code, P_code, num_points);
+++    volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_second_a_orc_impl(&L_acc_re, &L_acc_im, input, carrier, L_code, num_points);
+++
+++    /* Take the byte at offset 1 of every accumulator as the 8-bit result */
+++    *E_out = lv_cmake(((char*)&E_acc_re)[1], ((char*)&E_acc_im)[1]);
+++    *P_out = lv_cmake(((char*)&P_acc_re)[1], ((char*)&P_acc_im)[1]);
+++    *L_out = lv_cmake(((char*)&L_acc_re)[1], ((char*)&L_acc_im)[1]);
+++}
+++#endif /* LV_HAVE_ORC */
+++
+++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_a_H */
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5.h
++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5.h 1970-01-01 01:00:00.000000000 +0100
+++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5.h 2014-10-15 01:55:08.000000000 +0200
++@@ -0,0 +1,797 @@
+++/*!
+++ * \file volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5.h
+++ * \brief Volk protokernel: performs the carrier wipe-off mixing and the Very early, Early, Prompt, Late and very late correlation with 16 bits vectors, and accumulates the results into float32. In order to avoid overflow, If input, carrier and XX_code have the same number of bits, they must be values between —3 and 3 (2 bits).
+++ * \authors
    +++ *
  • Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com +++ *
+++ * +++ * Volk protokernel that performs the carrier wipe-off mixing and the +++ * Very early, Early, Prompt, Late and very late correlation with 16 bits vectors (8 bits the +++ * real part and 8 bits the imaginary part), and accumulates the result +++ * in 32 bits single point values, returning float32 values: +++ * - The carrier wipe-off is done by multiplying the input signal by the +++ * carrier (multiplication of 16 bits vectors) It returns the input +++ * signal in base band (BB) +++ * - Very Early values are calculated by multiplying the input signal in BB by the +++ * very early code (multiplication of 16 bits vectors), accumulating the results into float32 values +++ * - Early values are calculated by multiplying the input signal in BB by the +++ * early code (multiplication of 16 bits vectors), accumulating the results into float32 values +++ * - Prompt values are calculated by multiplying the input signal in BB by the +++ * prompt code (multiplication of 16 bits vectors), accumulating the results into float32 values +++ * - Late values are calculated by multiplying the input signal in BB by the +++ * late code (multiplication of 16 bits vectors), accumulating the results into float32 values +++ * - Very Late values are calculated by multiplying the input signal in BB by the +++ * very late code (multiplication of 16 bits vectors), accumulating the results into float32 values +++ * +++ * ------------------------------------------------------------------------- +++ * Bits analysis +++ * +++ * input = 8 bits +++ * carrier = 8 bits +++ * XX_code = 8 bits +++ * XX_out = 8 bits +++ * bb_signal_sample = 8 bits +++ * +++ * bb_signal_sample = input*carrier -> 17 bits limited to 8 bits = input and carrier must be values between —7 and 7 to avoid overflow (3 bits) +++ * +++ * XX_out16 = XX_code*bb_signal_sample -> 17 bits limited to 8 bits = XX_code and bb_signal_sample must be values between —7 and 7 to avoid overflow (3 bits) +++ * +++ * conclusion = input and 
carrier must be values between —1 and 1 (1 bit) and XX_code must be values between —7 and 7 to avoid overflow (3 bits) +++ * If input, carrier and XX_code have the same number of bits, they must be values between —3 and 3 to avoid overflow (2 bits). +++ * ------------------------------------------------------------------------- +++ * +++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) +++ * +++ * GNSS-SDR is a software defined Global Navigation +++ * Satellite Systems receiver +++ * +++ * This file is part of GNSS-SDR. +++ * +++ * GNSS-SDR is free software: you can redistribute it and/or modify +++ * it under the terms of the GNU General Public License as published by +++ * the Free Software Foundation, either version 3 of the License, or +++ * (at your option) any later version. +++ * +++ * GNSS-SDR is distributed in the hope that it will be useful, +++ * but WITHOUT ANY WARRANTY; without even the implied warranty of +++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +++ * GNU General Public License for more details. +++ * +++ * You should have received a copy of the GNU General Public License +++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>. +++ * +++ * ------------------------------------------------------------------------- +++ */ +++ +++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_u_H +++#define INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_u_H +++ +++#include +++#include +++#include +++#include +++#include +++ +++#ifdef LV_HAVE_SSE4_1 +++#include "smmintrin.h" +++#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h" +++#include "CommonMacros/CommonMacros.h" +++/*!
+++ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation (SSE4.1 kernel using unaligned loads)
+++ \param input The input signal input
+++ \param carrier The carrier signal input
+++ \param VE_code Very Early PRN code replica input
+++ \param E_code Early PRN code replica input
+++ \param P_code Prompt PRN code replica input
+++ \param L_code Late PRN code replica input
+++ \param VL_code Very Late PRN code replica input
+++ \param VE_out Very Early correlation output (a single accumulated value is written to *VE_out)
+++ \param E_out Early correlation output (a single accumulated value is written to *E_out)
+++ \param P_out Prompt correlation output (a single accumulated value is written to *P_out)
+++ \param L_out Late correlation output (a single accumulated value is written to *L_out)
+++ \param VL_out Very Late correlation output (a single accumulated value is written to *VL_out)
+++ \param num_points The number of complex values in vectors
+++ */
+++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_u_sse4_1(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
+++{
+++    // Number of full vector iterations; every iteration consumes 8 samples per input vector
+++    const unsigned int sse_iters = num_points / 8;
+++
+++    // Scratch registers handed by name to the CM_* macros (defined in the CommonMacros headers)
+++    __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample;
+++    __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output;
+++
+++    // One float accumulator register per correlator
+++    __m128 VE_code_acc, E_code_acc, P_code_acc, L_code_acc, VL_code_acc;
+++    __m128i input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2;
+++    __m128 output_ps;
+++
+++    const lv_8sc_t* input_ptr = input;
+++    const lv_8sc_t* carrier_ptr = carrier;
+++
+++    const lv_8sc_t* VE_code_ptr = VE_code;
+++    lv_32fc_t* VE_out_ptr = VE_out;
+++    const lv_8sc_t* E_code_ptr = E_code;
+++    lv_32fc_t* E_out_ptr = E_out;
+++    const lv_8sc_t* P_code_ptr = P_code;
+++    lv_32fc_t* P_out_ptr = P_out;
+++    const lv_8sc_t* L_code_ptr = L_code;
+++    lv_32fc_t* L_out_ptr = L_out;
+++    const lv_8sc_t* VL_code_ptr = VL_code;
+++    lv_32fc_t* VL_out_ptr = VL_out;
+++
+++    // The outputs are accumulated with += below, so they must start from zero
+++    *VE_out_ptr = 0;
+++    *E_out_ptr = 0;
+++    *P_out_ptr = 0;
+++    *L_out_ptr = 0;
+++    *VL_out_ptr = 0;
+++
+++    // Byte mask: 0xFF in the low byte and 0x00 in the high byte of every 16-bit lane.
+++    // NOTE(review): presumably used by the CM_* macros to separate real and imaginary bytes - confirm in CommonMacros.h
+++    mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
+++
+++    VE_code_acc = _mm_setzero_ps();
+++    E_code_acc = _mm_setzero_ps();
+++    P_code_acc = _mm_setzero_ps();
+++    L_code_acc = _mm_setzero_ps();
+++    VL_code_acc = _mm_setzero_ps();
+++
+++    if (sse_iters>0)
+++    {
+++        for(int number = 0;number < sse_iters; number++){
+++
+++            //Perform the carrier wipe-off
+++            // _mm_lddqu_si128 is an unaligned 128-bit load, so the inputs need no particular alignment
+++            x = _mm_lddqu_si128((__m128i*)input_ptr);
+++            y = _mm_lddqu_si128((__m128i*)carrier_ptr);
+++
+++            CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx)
+++            CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)
+++
+++            CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
+++
+++            //Get very early values
+++            // y is reused for each code replica; the baseband sample registers are shared by all five correlators
+++            y = _mm_lddqu_si128((__m128i*)VE_code_ptr);
+++
+++            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
+++
+++            VE_code_acc = _mm_add_ps (VE_code_acc, output_ps);
+++
+++            //Get early values
+++            y = _mm_lddqu_si128((__m128i*)E_code_ptr);
+++
+++            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
+++
+++            E_code_acc = _mm_add_ps (E_code_acc, output_ps);
+++
+++            //Get prompt values
+++            y = _mm_lddqu_si128((__m128i*)P_code_ptr);
+++
+++            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
+++
+++            P_code_acc = _mm_add_ps (P_code_acc, output_ps);
+++
+++            //Get late values
+++            y = _mm_lddqu_si128((__m128i*)L_code_ptr);
+++
+++            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
+++
+++            L_code_acc = _mm_add_ps (L_code_acc, output_ps);
+++
+++            //Get very late values
+++            y = _mm_lddqu_si128((__m128i*)VL_code_ptr);
+++
+++            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
+++
+++            VL_code_acc = _mm_add_ps (VL_code_acc, output_ps);
+++
+++            // Advance every input vector by the 8 samples consumed in this iteration
+++            input_ptr += 8;
+++            carrier_ptr += 8;
+++            VE_code_ptr += 8;
+++            E_code_ptr += 8;
+++            P_code_ptr += 8;
+++            L_code_ptr += 8;
+++            VL_code_ptr += 8;
+++        }
+++
+++        // Each accumulator register is spilled into a two-element complex array (four floats)
+++        __VOLK_ATTR_ALIGNED(16) lv_32fc_t VE_dotProductVector[2];
+++        __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2];
+++        __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2];
+++        __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2];
+++        __VOLK_ATTR_ALIGNED(16) lv_32fc_t VL_dotProductVector[2];
+++
+++        _mm_storeu_ps((float*)VE_dotProductVector,VE_code_acc); // Store the results back into the dot product vector
+++        _mm_storeu_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector
+++        _mm_storeu_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector
+++        _mm_storeu_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector
+++        _mm_storeu_ps((float*)VL_dotProductVector,VL_code_acc); // Store the results back into the dot product vector
+++
+++        // Horizontal reduction: add the two partial complex sums of every correlator into its output
+++        for (int i = 0; i<2; ++i)
+++        {
+++            *VE_out_ptr += VE_dotProductVector[i];
+++            *E_out_ptr += E_dotProductVector[i];
+++            *P_out_ptr += P_dotProductVector[i];
+++            *L_out_ptr += L_dotProductVector[i];
+++            *VL_out_ptr += VL_dotProductVector[i];
+++        }
+++    }
+++
+++    // Scalar tail: process the num_points % 8 samples left over after the vector loop
+++    lv_8sc_t bb_signal_sample;
+++    for(int i=0; i < num_points%8; ++i)
+++    {
+++        //Perform the carrier wipe-off
+++        bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
+++        // Now get very early, early, prompt, late and very late values for each
+++        *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VE_code_ptr++));
+++        *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
+++        *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
+++        *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
+++        *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VL_code_ptr++));
+++    }
+++}
+++#endif /* LV_HAVE_SSE4_1 */
+++
+++#ifdef LV_HAVE_SSE2
+++#include "emmintrin.h"
+++#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
+++#include "CommonMacros/CommonMacros.h"
+++/*!
+++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation +++ \param input The input signal input +++ \param carrier The carrier signal input +++ \param VE_code Very Early PRN code replica input +++ \param E_code Early PRN code replica input +++ \param P_code Prompt PRN code replica input +++ \param L_code Late PRN code replica input +++ \param VL_code Very Late PRN code replica input +++ \param VE_out Very Early correlation output +++ \param E_out Early correlation output +++ \param P_out Prompt correlation output +++ \param L_out Late correlation output +++ \param VL_out Very Late correlation output +++ \param num_points The number of complex values in vectors +++ */ +++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_u_sse2(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points) +++{ +++ const unsigned int sse_iters = num_points / 8; +++ +++ __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample; +++ __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output; +++ +++ __m128 VE_code_acc, E_code_acc, P_code_acc, L_code_acc, VL_code_acc; +++ __m128i input_i_1, input_i_2, output_i32; +++ __m128 output_ps_1, output_ps_2; +++ +++ const lv_8sc_t* input_ptr = input; +++ const lv_8sc_t* carrier_ptr = carrier; +++ +++ const lv_8sc_t* VE_code_ptr = VE_code; +++ lv_32fc_t* VE_out_ptr = VE_out; +++ const lv_8sc_t* E_code_ptr = E_code; +++ lv_32fc_t* E_out_ptr = E_out; +++ const lv_8sc_t* P_code_ptr = P_code; +++ lv_32fc_t* P_out_ptr = P_out; +++ const lv_8sc_t* L_code_ptr = L_code; +++ lv_32fc_t* L_out_ptr = L_out; +++ const lv_8sc_t* VL_code_ptr = VL_code; +++ lv_32fc_t* VL_out_ptr = VL_out; +++ +++ *VE_out_ptr 
= 0; +++ *E_out_ptr = 0; +++ *P_out_ptr = 0; +++ *L_out_ptr = 0; +++ *VL_out_ptr = 0; +++ +++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); +++ +++ VE_code_acc = _mm_setzero_ps(); +++ E_code_acc = _mm_setzero_ps(); +++ P_code_acc = _mm_setzero_ps(); +++ L_code_acc = _mm_setzero_ps(); +++ VL_code_acc = _mm_setzero_ps(); +++ +++ if (sse_iters>0) +++ { +++ for(int number = 0;number < sse_iters; number++){ +++ +++ //Perform the carrier wipe-off +++ x = _mm_lddqu_si128((__m128i*)input_ptr); +++ y = _mm_lddqu_si128((__m128i*)carrier_ptr); +++ +++ CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx) +++ CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy) +++ +++ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample) +++ +++ //Get very early values +++ y = _mm_lddqu_si128((__m128i*)VE_code_ptr); +++ +++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2) +++ +++ VE_code_acc = _mm_add_ps (VE_code_acc, output_ps_1); +++ VE_code_acc = _mm_add_ps (VE_code_acc, output_ps_2); +++ +++ //Get early values +++ y = _mm_lddqu_si128((__m128i*)E_code_ptr); +++ +++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2) +++ +++ E_code_acc = _mm_add_ps (E_code_acc, output_ps_1); +++ E_code_acc = _mm_add_ps (E_code_acc, output_ps_2); +++ +++ //Get prompt values +++ y = _mm_lddqu_si128((__m128i*)P_code_ptr); +++ +++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, 
imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2) +++ +++ P_code_acc = _mm_add_ps (P_code_acc, output_ps_1); +++ P_code_acc = _mm_add_ps (P_code_acc, output_ps_2); +++ +++ //Get late values +++ y = _mm_lddqu_si128((__m128i*)L_code_ptr); +++ +++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2) +++ +++ L_code_acc = _mm_add_ps (L_code_acc, output_ps_1); +++ L_code_acc = _mm_add_ps (L_code_acc, output_ps_2); +++ +++ //Get very late values +++ y = _mm_lddqu_si128((__m128i*)VL_code_ptr); +++ +++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2) +++ +++ VL_code_acc = _mm_add_ps (VL_code_acc, output_ps_1); +++ VL_code_acc = _mm_add_ps (VL_code_acc, output_ps_2); +++ +++ input_ptr += 8; +++ carrier_ptr += 8; +++ VE_code_ptr += 8; +++ E_code_ptr += 8; +++ P_code_ptr += 8; +++ L_code_ptr += 8; +++ VL_code_ptr += 8; +++ } +++ +++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t VE_dotProductVector[2]; +++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2]; +++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2]; +++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2]; +++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t VL_dotProductVector[2]; +++ +++ _mm_storeu_ps((float*)VE_dotProductVector,VE_code_acc); // Store the results back into the dot product vector +++ _mm_storeu_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector +++ 
_mm_storeu_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector +++ _mm_storeu_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector +++ _mm_storeu_ps((float*)VL_dotProductVector,VL_code_acc); // Store the results back into the dot product vector +++ +++ for (int i = 0; i<2; ++i) +++ { +++ *VE_out_ptr += VE_dotProductVector[i]; +++ *E_out_ptr += E_dotProductVector[i]; +++ *P_out_ptr += P_dotProductVector[i]; +++ *L_out_ptr += L_dotProductVector[i]; +++ *VL_out_ptr += VL_dotProductVector[i]; +++ } +++ } +++ +++ lv_8sc_t bb_signal_sample; +++ for(int i=0; i < num_points%8; ++i) +++ { +++ //Perform the carrier wipe-off +++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); +++ // Now get very early, early, prompt, late and very late values for each +++ *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VE_code_ptr++)); +++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); +++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); +++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); +++ *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VL_code_ptr++)); +++ } +++} +++#endif /* LV_HAVE_SSE2 */ +++ +++#ifdef LV_HAVE_GENERIC +++/*! 
+++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation +++ \param input The input signal input +++ \param carrier The carrier signal input +++ \param VE_code Very Early PRN code replica input +++ \param E_code Early PRN code replica input +++ \param P_code Prompt PRN code replica input +++ \param L_code Late PRN code replica input +++ \param VL_code Very Late PRN code replica input +++ \param VE_out Very Early correlation output +++ \param E_out Early correlation output +++ \param P_out Prompt correlation output +++ \param L_out Late correlation output +++ \param VL_out Very Late correlation output +++ \param num_points The number of complex values in vectors +++ */ +++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points) +++{ +++ lv_8sc_t bb_signal_sample; +++ +++ bb_signal_sample = lv_cmake(0, 0); +++ +++ *VE_out = 0; +++ *E_out = 0; +++ *P_out = 0; +++ *L_out = 0; +++ *VL_out = 0; +++ // perform very early, Early, Prompt, Late and very late correlation +++ for(int i=0; i < num_points; ++i) +++ { +++ //Perform the carrier wipe-off +++ bb_signal_sample = input[i] * carrier[i]; +++ +++ *VE_out += (lv_32fc_t) (bb_signal_sample * VE_code[i]); +++ *E_out += (lv_32fc_t) (bb_signal_sample * E_code[i]); +++ *P_out += (lv_32fc_t) (bb_signal_sample * P_code[i]); +++ *L_out += (lv_32fc_t) (bb_signal_sample * L_code[i]); +++ *VL_out += (lv_32fc_t) (bb_signal_sample * VL_code[i]); +++ } +++} +++ +++#endif /* LV_HAVE_GENERIC */ +++ +++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_u_H */ +++ +++ +++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_a_H +++#define 
INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_a_H
+++
+++#include
+++#include
+++#include
+++#include
+++#include
+++
+++#ifdef LV_HAVE_SSE4_1
+++#include "smmintrin.h"
+++#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
+++#include "CommonMacros/CommonMacros.h"
+++/*!
+++ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation (SSE4.1 kernel using aligned loads; the input vectors must be 16-byte aligned)
+++ \param input The input signal input
+++ \param carrier The carrier signal input
+++ \param VE_code Very Early PRN code replica input
+++ \param E_code Early PRN code replica input
+++ \param P_code Prompt PRN code replica input
+++ \param L_code Late PRN code replica input
+++ \param VL_code Very Late PRN code replica input
+++ \param VE_out Very Early correlation output (a single accumulated value is written to *VE_out)
+++ \param E_out Early correlation output (a single accumulated value is written to *E_out)
+++ \param P_out Prompt correlation output (a single accumulated value is written to *P_out)
+++ \param L_out Late correlation output (a single accumulated value is written to *L_out)
+++ \param VL_out Very Late correlation output (a single accumulated value is written to *VL_out)
+++ \param num_points The number of complex values in vectors
+++ */
+++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_a_sse4_1(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
+++{
+++    // Number of full vector iterations; every iteration consumes 8 samples per input vector
+++    const unsigned int sse_iters = num_points / 8;
+++
+++    // Scratch registers handed by name to the CM_* macros (defined in the CommonMacros headers)
+++    __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample;
+++    __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output;
+++
+++    // One float accumulator register per correlator
+++    __m128 VE_code_acc, E_code_acc, P_code_acc, L_code_acc, VL_code_acc;
+++    __m128i input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2;
+++    __m128 output_ps;
+++
+++    const lv_8sc_t* input_ptr = input;
+++    const lv_8sc_t* carrier_ptr = carrier;
+++
+++    const lv_8sc_t* VE_code_ptr = VE_code;
+++    lv_32fc_t* VE_out_ptr = VE_out;
+++    const lv_8sc_t* E_code_ptr = E_code;
+++    lv_32fc_t* E_out_ptr = E_out;
+++    const lv_8sc_t* P_code_ptr = P_code;
+++    lv_32fc_t* P_out_ptr = P_out;
+++    const lv_8sc_t* L_code_ptr = L_code;
+++    lv_32fc_t* L_out_ptr = L_out;
+++    const lv_8sc_t* VL_code_ptr = VL_code;
+++    lv_32fc_t* VL_out_ptr = VL_out;
+++
+++    // The outputs are accumulated with += below, so they must start from zero
+++    *VE_out_ptr = 0;
+++    *E_out_ptr = 0;
+++    *P_out_ptr = 0;
+++    *L_out_ptr = 0;
+++    *VL_out_ptr = 0;
+++
+++    // Byte mask: 0xFF in the low byte and 0x00 in the high byte of every 16-bit lane.
+++    // NOTE(review): presumably used by the CM_* macros to separate real and imaginary bytes - confirm in CommonMacros.h
+++    mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
+++
+++    VE_code_acc = _mm_setzero_ps();
+++    E_code_acc = _mm_setzero_ps();
+++    P_code_acc = _mm_setzero_ps();
+++    L_code_acc = _mm_setzero_ps();
+++    VL_code_acc = _mm_setzero_ps();
+++
+++    if (sse_iters>0)
+++    {
+++        for(int number = 0;number < sse_iters; number++){
+++
+++            //Perform the carrier wipe-off
+++            // _mm_load_si128 is an aligned 128-bit load: the addresses must be 16-byte aligned
+++            x = _mm_load_si128((__m128i*)input_ptr);
+++            y = _mm_load_si128((__m128i*)carrier_ptr);
+++
+++            CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx)
+++            CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)
+++
+++            CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
+++
+++            //Get very early values
+++            // y is reused for each code replica; the baseband sample registers are shared by all five correlators
+++            y = _mm_load_si128((__m128i*)VE_code_ptr);
+++
+++            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
+++
+++            VE_code_acc = _mm_add_ps (VE_code_acc, output_ps);
+++
+++            //Get early values
+++            y = _mm_load_si128((__m128i*)E_code_ptr);
+++
+++            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
+++
+++            E_code_acc = _mm_add_ps (E_code_acc, output_ps);
+++
+++            //Get prompt values
+++            y = _mm_load_si128((__m128i*)P_code_ptr);
+++
+++            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
+++
+++            P_code_acc = _mm_add_ps (P_code_acc, output_ps);
+++
+++            //Get late values
+++            y = _mm_load_si128((__m128i*)L_code_ptr);
+++
+++            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
+++
+++            L_code_acc = _mm_add_ps (L_code_acc, output_ps);
+++
+++            //Get very late values
+++            y = _mm_load_si128((__m128i*)VL_code_ptr);
+++
+++            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
+++
+++            VL_code_acc = _mm_add_ps (VL_code_acc, output_ps);
+++
+++            // Advance every input vector by the 8 samples consumed in this iteration
+++            input_ptr += 8;
+++            carrier_ptr += 8;
+++            VE_code_ptr += 8;
+++            E_code_ptr += 8;
+++            P_code_ptr += 8;
+++            L_code_ptr += 8;
+++            VL_code_ptr += 8;
+++        }
+++
+++        // Each accumulator register is spilled into a 16-byte aligned two-element complex array (four floats)
+++        __VOLK_ATTR_ALIGNED(16) lv_32fc_t VE_dotProductVector[2];
+++        __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2];
+++        __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2];
+++        __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2];
+++        __VOLK_ATTR_ALIGNED(16) lv_32fc_t VL_dotProductVector[2];
+++
+++        _mm_store_ps((float*)VE_dotProductVector,VE_code_acc); // Store the results back into the dot product vector
+++        _mm_store_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector
+++        _mm_store_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector
+++        _mm_store_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector
+++        _mm_store_ps((float*)VL_dotProductVector,VL_code_acc); // Store the results back into the dot product vector
+++
+++        // Horizontal reduction: add the two partial complex sums of every correlator into its output
+++        for (int i = 0; i<2; ++i)
+++        {
+++            *VE_out_ptr += VE_dotProductVector[i];
+++            *E_out_ptr += E_dotProductVector[i];
+++            *P_out_ptr += P_dotProductVector[i];
+++            *L_out_ptr += L_dotProductVector[i];
+++            *VL_out_ptr += VL_dotProductVector[i];
+++        }
+++    }
+++
+++    // Scalar tail: process the num_points % 8 samples left over after the vector loop
+++    lv_8sc_t bb_signal_sample;
+++    for(int i=0; i < num_points%8; ++i)
+++    {
+++        //Perform the carrier wipe-off
+++        bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
+++        // Now get very early, early, prompt, late and very late values for each
+++        *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VE_code_ptr++));
+++        *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
+++        *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
+++        *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
+++        *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VL_code_ptr++));
+++    }
+++}
+++#endif /* LV_HAVE_SSE4_1 */
+++
+++#ifdef LV_HAVE_SSE2
+++#include "emmintrin.h"
+++#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
+++#include "CommonMacros/CommonMacros.h"
+++/*!
+++ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation (SSE2 kernel using aligned loads; the input vectors must be 16-byte aligned)
+++ \param input The input signal input
+++ \param carrier The carrier signal input
+++ \param VE_code Very Early PRN code replica input
+++ \param E_code Early PRN code replica input
+++ \param P_code Prompt PRN code replica input
+++ \param L_code Late PRN code replica input
+++ \param VL_code Very Late PRN code replica input
+++ \param VE_out Very Early correlation output (a single accumulated value is written to *VE_out)
+++ \param E_out Early correlation output (a single accumulated value is written to *E_out)
+++ \param P_out Prompt correlation output (a single accumulated value is written to *P_out)
+++ \param L_out Late correlation output (a single accumulated value is written to *L_out)
+++ \param VL_out Very Late correlation output (a single accumulated value is written to *VL_out)
+++ \param num_points The number of complex values in vectors
+++ */
+++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_a_sse2(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
+++{
+++    // Number of full vector iterations; every iteration consumes 8 samples per input vector
+++    const unsigned int sse_iters = num_points / 8;
+++
+++    // Scratch registers handed by name to the CM_* macros (defined in the CommonMacros headers)
+++    __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample;
+++    __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output;
+++
+++    // One float accumulator register per correlator
+++    __m128 VE_code_acc, E_code_acc, P_code_acc, L_code_acc, VL_code_acc;
+++    __m128i input_i_1, input_i_2, output_i32;
+++    __m128 output_ps_1, output_ps_2;
+++
+++    const lv_8sc_t* input_ptr = input;
+++    const lv_8sc_t* carrier_ptr = carrier;
+++
+++    const lv_8sc_t* VE_code_ptr = VE_code;
+++    lv_32fc_t* VE_out_ptr = VE_out;
+++    const lv_8sc_t* E_code_ptr = E_code;
+++    lv_32fc_t* E_out_ptr = E_out;
+++    const lv_8sc_t* P_code_ptr = P_code;
+++    lv_32fc_t* P_out_ptr = P_out;
+++    const lv_8sc_t* L_code_ptr = L_code;
+++    lv_32fc_t* L_out_ptr = L_out;
+++    const lv_8sc_t* VL_code_ptr = VL_code;
+++    lv_32fc_t* VL_out_ptr = VL_out;
+++
+++    // The outputs are accumulated with += below, so they must start from zero
+++    *VE_out_ptr = 0;
+++    *E_out_ptr = 0;
+++    *P_out_ptr = 0;
+++    *L_out_ptr = 0;
+++    *VL_out_ptr = 0;
+++
+++    // Byte mask: 0xFF in the low byte and 0x00 in the high byte of every 16-bit lane.
+++    // NOTE(review): presumably used by the CM_* macros to separate real and imaginary bytes - confirm in CommonMacros.h
+++    mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
+++
+++    VE_code_acc = _mm_setzero_ps();
+++    E_code_acc = _mm_setzero_ps();
+++    P_code_acc = _mm_setzero_ps();
+++    L_code_acc = _mm_setzero_ps();
+++    VL_code_acc = _mm_setzero_ps();
+++
+++    if (sse_iters>0)
+++    {
+++        for(int number = 0;number < sse_iters; number++){
+++
+++            //Perform the carrier wipe-off
+++            // _mm_load_si128 is an aligned 128-bit load: the addresses must be 16-byte aligned
+++            x = _mm_load_si128((__m128i*)input_ptr);
+++            y = _mm_load_si128((__m128i*)carrier_ptr);
+++
+++            CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx)
+++            CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)
+++
+++            CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
+++
+++            //Get very early values
+++            // y is reused for each code replica; the SSE2 correlation macro yields two float registers,
+++            // and both are added into the correlator's accumulator
+++            y = _mm_load_si128((__m128i*)VE_code_ptr);
+++
+++            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
+++
+++            VE_code_acc = _mm_add_ps (VE_code_acc, output_ps_1);
+++            VE_code_acc = _mm_add_ps (VE_code_acc, output_ps_2);
+++
+++            //Get early values
+++            y = _mm_load_si128((__m128i*)E_code_ptr);
+++
+++            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
+++
+++            E_code_acc = _mm_add_ps (E_code_acc, output_ps_1);
+++            E_code_acc = _mm_add_ps (E_code_acc, output_ps_2);
+++
+++            //Get prompt values
+++            y = _mm_load_si128((__m128i*)P_code_ptr);
+++
+++            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
+++
+++            P_code_acc = _mm_add_ps (P_code_acc, output_ps_1);
+++            P_code_acc = _mm_add_ps (P_code_acc, output_ps_2);
+++
+++            //Get late values
+++            y = _mm_load_si128((__m128i*)L_code_ptr);
+++
+++            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
+++
+++            L_code_acc = _mm_add_ps (L_code_acc, output_ps_1);
+++            L_code_acc = _mm_add_ps (L_code_acc, output_ps_2);
+++
+++            //Get very late values
+++            y = _mm_load_si128((__m128i*)VL_code_ptr);
+++
+++            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
+++
+++            VL_code_acc = _mm_add_ps (VL_code_acc, output_ps_1);
+++            VL_code_acc = _mm_add_ps (VL_code_acc, output_ps_2);
+++
+++            // Advance every input vector by the 8 samples consumed in this iteration
+++            input_ptr += 8;
+++            carrier_ptr += 8;
+++            VE_code_ptr += 8;
+++            E_code_ptr += 8;
+++            P_code_ptr += 8;
+++            L_code_ptr += 8;
+++            VL_code_ptr += 8;
+++        }
+++
+++        // Each accumulator register is spilled into a 16-byte aligned two-element complex array (four floats)
+++        __VOLK_ATTR_ALIGNED(16) lv_32fc_t VE_dotProductVector[2];
+++        __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2];
+++        __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2];
+++        __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2];
+++        __VOLK_ATTR_ALIGNED(16) lv_32fc_t VL_dotProductVector[2];
+++
+++        _mm_store_ps((float*)VE_dotProductVector,VE_code_acc); // Store the results back into the dot product vector
+++        _mm_store_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector
+++        _mm_store_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector
+++        _mm_store_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector
+++        _mm_store_ps((float*)VL_dotProductVector,VL_code_acc); // Store the results back into the dot product vector
+++
+++        // Horizontal reduction: add the two partial complex sums of every correlator into its output
+++        for (int i = 0; i<2; ++i)
+++        {
+++            *VE_out_ptr += VE_dotProductVector[i];
+++            *E_out_ptr += E_dotProductVector[i];
+++            *P_out_ptr += P_dotProductVector[i];
+++            *L_out_ptr += L_dotProductVector[i];
+++            *VL_out_ptr += VL_dotProductVector[i];
+++        }
+++    }
+++
+++    // Scalar tail: process the num_points % 8 samples left over after the vector loop
+++    lv_8sc_t bb_signal_sample;
+++    for(int i=0; i < num_points%8; ++i)
+++    {
+++        //Perform the carrier wipe-off
+++        bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
+++        // Now get very early, early, prompt, late and very late values for each
+++        *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VE_code_ptr++));
+++        *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
+++        *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
+++        *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
+++        *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VL_code_ptr++));
+++    }
+++}
+++#endif /* LV_HAVE_SSE2 */
+++
+++#ifdef LV_HAVE_GENERIC
+++/*!
+++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation +++ \param input The input signal input +++ \param carrier The carrier signal input +++ \param VE_code Very Early PRN code replica input +++ \param E_code Early PRN code replica input +++ \param P_code Prompt PRN code replica input +++ \param L_code Late PRN code replica input +++ \param VL_code Very Late PRN code replica input +++ \param VE_out Very Early correlation output +++ \param E_out Early correlation output +++ \param P_out Prompt correlation output +++ \param L_out Late correlation output +++ \param VL_out Very Late correlation output +++ \param num_points The number of complex values in vectors +++ */ +++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_a_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points) +++{ +++ lv_8sc_t bb_signal_sample; +++ +++ bb_signal_sample = lv_cmake(0, 0); +++ +++ *VE_out = 0; +++ *E_out = 0; +++ *P_out = 0; +++ *L_out = 0; +++ *VL_out = 0; +++ // perform very early, Early, Prompt, Late and very late correlation +++ for(int i=0; i < num_points; ++i) +++ { +++ //Perform the carrier wipe-off +++ bb_signal_sample = input[i] * carrier[i]; +++ +++ *VE_out += (lv_32fc_t) (bb_signal_sample * VE_code[i]); +++ *E_out += (lv_32fc_t) (bb_signal_sample * E_code[i]); +++ *P_out += (lv_32fc_t) (bb_signal_sample * P_code[i]); +++ *L_out += (lv_32fc_t) (bb_signal_sample * L_code[i]); +++ *VL_out += (lv_32fc_t) (bb_signal_sample * VL_code[i]); +++ } +++} +++ +++#endif /* LV_HAVE_GENERIC */ +++ +++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_a_H */ ++\ No newline at end of file ++diff -rupN 
/Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5.h ++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5.h 1970-01-01 01:00:00.000000000 +0100 +++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5.h 2014-10-15 01:55:08.000000000 +0200 ++@@ -0,0 +1,1520 @@ +++/*! +++ * \file volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5.h +++ * \brief Volk protokernel: performs the carrier wipe-off mixing and the Very early, Early, Prompt, Late and very late correlation with 16 bits vectors using different methods: inside u_sse4_1_first there is one method, inside u_sse4_1_second there is another... This protokernel has been created to test the performance of different methods. +++ * \authors
    +++ *
  • Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com +++ *
+++ * +++ * Volk protokernel that performs the carrier wipe-off mixing and the +++ * Very Early, Early, Prompt, Late and Very Late correlation with 16-bit vectors (8 bits for the +++ * real part and 8 bits for the imaginary part), and accumulates the result +++ * in 32-bit single-precision values, returning float32 values: +++ * - The carrier wipe-off is done by multiplying the input signal by the +++ * carrier (multiplication of 16-bit vectors). It returns the input +++ * signal in baseband (BB) +++ * - Very Early values are calculated by multiplying the input signal in BB by the +++ * very early code (multiplication of 16-bit vectors), accumulating the results into float32 values +++ * - Early values are calculated by multiplying the input signal in BB by the +++ * early code (multiplication of 16-bit vectors), accumulating the results into float32 values +++ * - Prompt values are calculated by multiplying the input signal in BB by the +++ * prompt code (multiplication of 16-bit vectors), accumulating the results into float32 values +++ * - Late values are calculated by multiplying the input signal in BB by the +++ * late code (multiplication of 16-bit vectors), accumulating the results into float32 values +++ * - Very Late values are calculated by multiplying the input signal in BB by the +++ * very late code (multiplication of 16-bit vectors), accumulating the results into float32 values +++ * +++ * ------------------------------------------------------------------------- +++ * +++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) +++ * +++ * GNSS-SDR is a software defined Global Navigation +++ * Satellite Systems receiver +++ * +++ * This file is part of GNSS-SDR. +++ * +++ * GNSS-SDR is free software: you can redistribute it and/or modify +++ * it under the terms of the GNU General Public License as published by +++ * the Free Software Foundation, either version 3 of the License, or +++ * (at your option) any later version.
+++ * +++ * GNSS-SDR is distributed in the hope that it will be useful, +++ * but WITHOUT ANY WARRANTY; without even the implied warranty of +++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +++ * GNU General Public License for more details. +++ * +++ * You should have received a copy of the GNU General Public License +++ * along with GNSS-SDR. If not, see . +++ * +++ * ------------------------------------------------------------------------- +++ */ +++ +++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_u_H +++#define INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_u_H +++ +++#include +++#include +++#include +++#include +++#include +++ +++#ifdef LV_HAVE_SSE4_1 +++#include "smmintrin.h" +++#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h" +++#include "CommonMacros/CommonMacros.h" +++/*! +++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation +++ \param input The input signal input +++ \param carrier The carrier signal input +++ \param VE_code Very Early PRN code replica input +++ \param E_code Early PRN code replica input +++ \param P_code Prompt PRN code replica input +++ \param L_code Late PRN code replica input +++ \param VL_code Very Late PRN code replica input +++ \param VE_out Very Early correlation output +++ \param E_out Early correlation output +++ \param P_out Prompt correlation output +++ \param L_out Late correlation output +++ \param VL_out Very Late correlation output +++ \param num_points The number of complex values in vectors +++ */ +++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_u_sse4_1_first(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points) +++{ +++ const unsigned int sse_iters = num_points / 8; 
+++ +++ __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample; +++ __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output; +++ +++ __m128 VE_code_acc, E_code_acc, P_code_acc, L_code_acc, VL_code_acc; +++ __m128i input_i_1, input_i_2, output_i32; +++ __m128 output_ps_1, output_ps_2; +++ +++ const lv_8sc_t* input_ptr = input; +++ const lv_8sc_t* carrier_ptr = carrier; +++ +++ const lv_8sc_t* VE_code_ptr = VE_code; +++ lv_32fc_t* VE_out_ptr = VE_out; +++ const lv_8sc_t* E_code_ptr = E_code; +++ lv_32fc_t* E_out_ptr = E_out; +++ const lv_8sc_t* P_code_ptr = P_code; +++ lv_32fc_t* P_out_ptr = P_out; +++ const lv_8sc_t* L_code_ptr = L_code; +++ lv_32fc_t* L_out_ptr = L_out; +++ const lv_8sc_t* VL_code_ptr = VL_code; +++ lv_32fc_t* VL_out_ptr = VL_out; +++ +++ *VE_out_ptr = 0; +++ *E_out_ptr = 0; +++ *P_out_ptr = 0; +++ *L_out_ptr = 0; +++ *VL_out_ptr = 0; +++ +++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); +++ +++ VE_code_acc = _mm_setzero_ps(); +++ E_code_acc = _mm_setzero_ps(); +++ P_code_acc = _mm_setzero_ps(); +++ L_code_acc = _mm_setzero_ps(); +++ VL_code_acc = _mm_setzero_ps(); +++ +++ if (sse_iters>0) +++ { +++ for(int number = 0;number < sse_iters; number++){ +++ +++ //Perform the carrier wipe-off +++ x = _mm_lddqu_si128((__m128i*)input_ptr); +++ y = _mm_lddqu_si128((__m128i*)carrier_ptr); +++ +++ imagx = _mm_srli_si128 (x, 1); +++ imagx = _mm_and_si128 (imagx, mult1); +++ realx = _mm_and_si128 (x, mult1); +++ +++ imagy = _mm_srli_si128 (y, 1); +++ imagy = _mm_and_si128 (imagy, mult1); +++ realy = _mm_and_si128 (y, mult1); +++ +++ realx_mult_realy = _mm_mullo_epi16 (realx, realy); +++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); +++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); +++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); +++ +++ real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); 
+++ imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); +++ +++ //Get very early values +++ y = _mm_lddqu_si128((__m128i*)VE_code_ptr); +++ +++ imagy = _mm_srli_si128 (y, 1); +++ imagy = _mm_and_si128 (imagy, mult1); +++ realy = _mm_and_si128 (y, mult1); +++ +++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); +++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); +++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); +++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); +++ +++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); +++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); +++ +++ imag_output = _mm_slli_si128 (imag_output, 1); +++ output = _mm_blendv_epi8 (imag_output, real_output, mult1); +++ +++ input_i_1 = _mm_cvtepi8_epi32(output); +++ output = _mm_srli_si128 (output, 4); +++ input_i_2 = _mm_cvtepi8_epi32(output); +++ output = _mm_srli_si128 (output, 4); +++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2); +++ output_ps_1 = _mm_cvtepi32_ps(output_i32); +++ +++ input_i_1 = _mm_cvtepi8_epi32(output); +++ output = _mm_srli_si128 (output, 4); +++ input_i_2 = _mm_cvtepi8_epi32(output); +++ output = _mm_srli_si128 (output, 4); +++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2); +++ output_ps_2 = _mm_cvtepi32_ps(output_i32); +++ +++ VE_code_acc = _mm_add_ps (VE_code_acc, output_ps_1); +++ VE_code_acc = _mm_add_ps (VE_code_acc, output_ps_2); +++ +++ //Get early values +++ y = _mm_lddqu_si128((__m128i*)E_code_ptr); +++ +++ imagy = _mm_srli_si128 (y, 1); +++ imagy = _mm_and_si128 (imagy, mult1); +++ realy = _mm_and_si128 (y, mult1); +++ +++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); +++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); +++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); +++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); +++ +++ real_output = 
_mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); +++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); +++ +++ imag_output = _mm_slli_si128 (imag_output, 1); +++ output = _mm_blendv_epi8 (imag_output, real_output, mult1); +++ +++ input_i_1 = _mm_cvtepi8_epi32(output); +++ output = _mm_srli_si128 (output, 4); +++ input_i_2 = _mm_cvtepi8_epi32(output); +++ output = _mm_srli_si128 (output, 4); +++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2); +++ output_ps_1 = _mm_cvtepi32_ps(output_i32); +++ +++ input_i_1 = _mm_cvtepi8_epi32(output); +++ output = _mm_srli_si128 (output, 4); +++ input_i_2 = _mm_cvtepi8_epi32(output); +++ output = _mm_srli_si128 (output, 4); +++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2); +++ output_ps_2 = _mm_cvtepi32_ps(output_i32); +++ +++ E_code_acc = _mm_add_ps (E_code_acc, output_ps_1); +++ E_code_acc = _mm_add_ps (E_code_acc, output_ps_2); +++ +++ //Get prompt values +++ y = _mm_lddqu_si128((__m128i*)P_code_ptr); +++ +++ imagy = _mm_srli_si128 (y, 1); +++ imagy = _mm_and_si128 (imagy, mult1); +++ realy = _mm_and_si128 (y, mult1); +++ +++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); +++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); +++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); +++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); +++ +++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); +++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); +++ +++ imag_output = _mm_slli_si128 (imag_output, 1); +++ output = _mm_blendv_epi8 (imag_output, real_output, mult1); +++ +++ input_i_1 = _mm_cvtepi8_epi32(output); +++ output = _mm_srli_si128 (output, 4); +++ input_i_2 = _mm_cvtepi8_epi32(output); +++ output = _mm_srli_si128 (output, 4); +++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2); +++ output_ps_1 = _mm_cvtepi32_ps(output_i32); +++ +++ input_i_1 = _mm_cvtepi8_epi32(output); +++ output = _mm_srli_si128 
(output, 4); +++ input_i_2 = _mm_cvtepi8_epi32(output); +++ output = _mm_srli_si128 (output, 4); +++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2); +++ output_ps_2 = _mm_cvtepi32_ps(output_i32); +++ +++ P_code_acc = _mm_add_ps (P_code_acc, output_ps_1); +++ P_code_acc = _mm_add_ps (P_code_acc, output_ps_2); +++ +++ //Get late values +++ y = _mm_lddqu_si128((__m128i*)L_code_ptr); +++ +++ imagy = _mm_srli_si128 (y, 1); +++ imagy = _mm_and_si128 (imagy, mult1); +++ realy = _mm_and_si128 (y, mult1); +++ +++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); +++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); +++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); +++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); +++ +++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); +++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); +++ +++ imag_output = _mm_slli_si128 (imag_output, 1); +++ output = _mm_blendv_epi8 (imag_output, real_output, mult1); +++ +++ input_i_1 = _mm_cvtepi8_epi32(output); +++ output = _mm_srli_si128 (output, 4); +++ input_i_2 = _mm_cvtepi8_epi32(output); +++ output = _mm_srli_si128 (output, 4); +++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2); +++ output_ps_1 = _mm_cvtepi32_ps(output_i32); +++ +++ input_i_1 = _mm_cvtepi8_epi32(output); +++ output = _mm_srli_si128 (output, 4); +++ input_i_2 = _mm_cvtepi8_epi32(output); +++ output = _mm_srli_si128 (output, 4); +++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2); +++ output_ps_2 = _mm_cvtepi32_ps(output_i32); +++ +++ L_code_acc = _mm_add_ps (L_code_acc, output_ps_1); +++ L_code_acc = _mm_add_ps (L_code_acc, output_ps_2); +++ +++ //Get very late values +++ y = _mm_lddqu_si128((__m128i*)VL_code_ptr); +++ +++ imagy = _mm_srli_si128 (y, 1); +++ imagy = _mm_and_si128 (imagy, mult1); +++ realy = _mm_and_si128 (y, mult1); +++ +++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); +++ 
imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); +++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); +++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); +++ +++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); +++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); +++ +++ imag_output = _mm_slli_si128 (imag_output, 1); +++ output = _mm_blendv_epi8 (imag_output, real_output, mult1); +++ +++ input_i_1 = _mm_cvtepi8_epi32(output); +++ output = _mm_srli_si128 (output, 4); +++ input_i_2 = _mm_cvtepi8_epi32(output); +++ output = _mm_srli_si128 (output, 4); +++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2); +++ output_ps_1 = _mm_cvtepi32_ps(output_i32); +++ +++ input_i_1 = _mm_cvtepi8_epi32(output); +++ output = _mm_srli_si128 (output, 4); +++ input_i_2 = _mm_cvtepi8_epi32(output); +++ output = _mm_srli_si128 (output, 4); +++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2); +++ output_ps_2 = _mm_cvtepi32_ps(output_i32); +++ +++ VL_code_acc = _mm_add_ps (VL_code_acc, output_ps_1); +++ VL_code_acc = _mm_add_ps (VL_code_acc, output_ps_2); +++ +++ input_ptr += 8; +++ carrier_ptr += 8; +++ VE_code_ptr += 8; +++ E_code_ptr += 8; +++ P_code_ptr += 8; +++ L_code_ptr += 8; +++ VL_code_ptr += 8; +++ } +++ +++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t VE_dotProductVector[2]; +++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2]; +++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2]; +++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2]; +++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t VL_dotProductVector[2]; +++ +++ _mm_storeu_ps((float*)VE_dotProductVector,VE_code_acc); // Store the results back into the dot product vector +++ _mm_storeu_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector +++ _mm_storeu_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector +++ 
_mm_storeu_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector +++ _mm_storeu_ps((float*)VL_dotProductVector,VL_code_acc); // Store the results back into the dot product vector +++ +++ for (int i = 0; i<2; ++i) +++ { +++ *VE_out_ptr += VE_dotProductVector[i]; +++ *E_out_ptr += E_dotProductVector[i]; +++ *P_out_ptr += P_dotProductVector[i]; +++ *L_out_ptr += L_dotProductVector[i]; +++ *VL_out_ptr += VL_dotProductVector[i]; +++ } +++ } +++ +++ lv_8sc_t bb_signal_sample; +++ for(int i=0; i < num_points%8; ++i) +++ { +++ //Perform the carrier wipe-off +++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); +++ // Now get very early, early, prompt, late and very late values for each +++ *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VE_code_ptr++)); +++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); +++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); +++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); +++ *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VL_code_ptr++)); +++ } +++} +++#endif /* LV_HAVE_SSE4_1 */ +++ +++#ifdef LV_HAVE_SSE4_1 +++#include "smmintrin.h" +++#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h" +++#include "CommonMacros/CommonMacros.h" +++/*! 
+++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation +++ \param input The input signal input +++ \param carrier The carrier signal input +++ \param VE_code Very Early PRN code replica input +++ \param E_code Early PRN code replica input +++ \param P_code Prompt PRN code replica input +++ \param L_code Late PRN code replica input +++ \param VL_code Very Late PRN code replica input +++ \param VE_out Very Early correlation output +++ \param E_out Early correlation output +++ \param P_out Prompt correlation output +++ \param L_out Late correlation output +++ \param VL_out Very Late correlation output +++ \param num_points The number of complex values in vectors +++ */ +++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_u_sse4_1_second(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points) +++{ +++ const unsigned int sse_iters = num_points / 8; +++ +++ __m128i x, x_abs, y, y_aux, bb_signal_sample_aux, bb_signal_sample_aux_abs;; +++ __m128i mult1, output, real_output, imag_output; +++ +++ __m128 VE_code_acc, E_code_acc, P_code_acc, L_code_acc, VL_code_acc; +++ __m128i input_i_1, input_i_2, output_i32; +++ __m128 output_ps_1, output_ps_2; +++ +++ __m128i check_sign_sequence = _mm_set_epi8 (255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1); +++ +++ const lv_8sc_t* input_ptr = input; +++ const lv_8sc_t* carrier_ptr = carrier; +++ +++ const lv_8sc_t* VE_code_ptr = VE_code; +++ lv_32fc_t* VE_out_ptr = VE_out; +++ const lv_8sc_t* E_code_ptr = E_code; +++ lv_32fc_t* E_out_ptr = E_out; +++ const lv_8sc_t* P_code_ptr = P_code; +++ lv_32fc_t* P_out_ptr = P_out; +++ const lv_8sc_t* L_code_ptr = L_code; +++ lv_32fc_t* L_out_ptr = L_out; +++ const lv_8sc_t* VL_code_ptr = VL_code; +++ 
lv_32fc_t* VL_out_ptr = VL_out; +++ +++ *VE_out_ptr = 0; +++ *E_out_ptr = 0; +++ *P_out_ptr = 0; +++ *L_out_ptr = 0; +++ *VL_out_ptr = 0; +++ +++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); +++ +++ VE_code_acc = _mm_setzero_ps(); +++ E_code_acc = _mm_setzero_ps(); +++ P_code_acc = _mm_setzero_ps(); +++ L_code_acc = _mm_setzero_ps(); +++ VL_code_acc = _mm_setzero_ps(); +++ +++ if (sse_iters>0) +++ { +++ for(int number = 0;number < sse_iters; number++){ +++ +++ //Perform the carrier wipe-off +++ x = _mm_lddqu_si128((__m128i*)input_ptr); +++ y = _mm_lddqu_si128((__m128i*)carrier_ptr); +++ +++ x_abs = _mm_abs_epi8 (x); +++ +++ y_aux = _mm_sign_epi8 (y, x); +++ y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); +++ real_output = _mm_maddubs_epi16 (x_abs, y_aux); +++ +++ y_aux = _mm_shuffle_epi8 (y, _mm_set_epi8 (14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1)); +++ y_aux = _mm_sign_epi8 (y_aux, x); +++ imag_output = _mm_maddubs_epi16 (x_abs, y_aux); +++ +++ imag_output = _mm_slli_si128 (imag_output, 1); +++ bb_signal_sample_aux = _mm_blendv_epi8 (imag_output, real_output, mult1); +++ +++ bb_signal_sample_aux_abs = _mm_abs_epi8 (bb_signal_sample_aux); +++ +++ //Get very early values +++ y = _mm_lddqu_si128((__m128i*)VE_code_ptr); +++ +++ y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux); +++ y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); +++ real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); +++ +++ y_aux = _mm_shuffle_epi8 (y, _mm_set_epi8 (14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1)); +++ y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux); +++ imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); +++ +++ imag_output = _mm_slli_si128 (imag_output, 1); +++ output = _mm_blendv_epi8 (imag_output, real_output, mult1); +++ +++ input_i_1 = _mm_cvtepi8_epi32(output); +++ output = _mm_srli_si128 (output, 4); +++ input_i_2 = _mm_cvtepi8_epi32(output); +++ output = _mm_srli_si128 (output, 
4); +++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2); +++ output_ps_1 = _mm_cvtepi32_ps(output_i32); +++ +++ input_i_1 = _mm_cvtepi8_epi32(output); +++ output = _mm_srli_si128 (output, 4); +++ input_i_2 = _mm_cvtepi8_epi32(output); +++ output = _mm_srli_si128 (output, 4); +++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2); +++ output_ps_2 = _mm_cvtepi32_ps(output_i32); +++ +++ VE_code_acc = _mm_add_ps (VE_code_acc, output_ps_1); +++ VE_code_acc = _mm_add_ps (VE_code_acc, output_ps_2); +++ +++ //Get early values +++ y = _mm_lddqu_si128((__m128i*)E_code_ptr); +++ +++ y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux); +++ y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); +++ real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); +++ +++ y_aux = _mm_shuffle_epi8 (y, _mm_set_epi8 (14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1)); +++ y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux); +++ imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); +++ +++ imag_output = _mm_slli_si128 (imag_output, 1); +++ output = _mm_blendv_epi8 (imag_output, real_output, mult1); +++ +++ input_i_1 = _mm_cvtepi8_epi32(output); +++ output = _mm_srli_si128 (output, 4); +++ input_i_2 = _mm_cvtepi8_epi32(output); +++ output = _mm_srli_si128 (output, 4); +++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2); +++ output_ps_1 = _mm_cvtepi32_ps(output_i32); +++ +++ input_i_1 = _mm_cvtepi8_epi32(output); +++ output = _mm_srli_si128 (output, 4); +++ input_i_2 = _mm_cvtepi8_epi32(output); +++ output = _mm_srli_si128 (output, 4); +++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2); +++ output_ps_2 = _mm_cvtepi32_ps(output_i32); +++ +++ E_code_acc = _mm_add_ps (E_code_acc, output_ps_1); +++ E_code_acc = _mm_add_ps (E_code_acc, output_ps_2); +++ +++ //Get prompt values +++ y = _mm_lddqu_si128((__m128i*)P_code_ptr); +++ +++ y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux); +++ y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); +++ real_output = _mm_maddubs_epi16 
(bb_signal_sample_aux_abs, y_aux); +++ +++ y_aux = _mm_shuffle_epi8 (y, _mm_set_epi8 (14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1)); +++ y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux); +++ imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); +++ +++ imag_output = _mm_slli_si128 (imag_output, 1); +++ output = _mm_blendv_epi8 (imag_output, real_output, mult1); +++ +++ input_i_1 = _mm_cvtepi8_epi32(output); +++ output = _mm_srli_si128 (output, 4); +++ input_i_2 = _mm_cvtepi8_epi32(output); +++ output = _mm_srli_si128 (output, 4); +++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2); +++ output_ps_1 = _mm_cvtepi32_ps(output_i32); +++ +++ input_i_1 = _mm_cvtepi8_epi32(output); +++ output = _mm_srli_si128 (output, 4); +++ input_i_2 = _mm_cvtepi8_epi32(output); +++ output = _mm_srli_si128 (output, 4); +++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2); +++ output_ps_2 = _mm_cvtepi32_ps(output_i32); +++ +++ P_code_acc = _mm_add_ps (P_code_acc, output_ps_1); +++ P_code_acc = _mm_add_ps (P_code_acc, output_ps_2); +++ +++ //Get late values +++ y = _mm_lddqu_si128((__m128i*)L_code_ptr); +++ +++ y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux); +++ y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); +++ real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); +++ +++ y_aux = _mm_shuffle_epi8 (y, _mm_set_epi8 (14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1)); +++ y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux); +++ imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); +++ +++ imag_output = _mm_slli_si128 (imag_output, 1); +++ output = _mm_blendv_epi8 (imag_output, real_output, mult1); +++ +++ input_i_1 = _mm_cvtepi8_epi32(output); +++ output = _mm_srli_si128 (output, 4); +++ input_i_2 = _mm_cvtepi8_epi32(output); +++ output = _mm_srli_si128 (output, 4); +++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2); +++ output_ps_1 = _mm_cvtepi32_ps(output_i32); +++ +++ input_i_1 = _mm_cvtepi8_epi32(output); +++ output = 
_mm_srli_si128 (output, 4); +++ input_i_2 = _mm_cvtepi8_epi32(output); +++ output = _mm_srli_si128 (output, 4); +++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2); +++ output_ps_2 = _mm_cvtepi32_ps(output_i32); +++ +++ L_code_acc = _mm_add_ps (L_code_acc, output_ps_1); +++ L_code_acc = _mm_add_ps (L_code_acc, output_ps_2); +++ +++ //Get very late values +++ y = _mm_lddqu_si128((__m128i*)VL_code_ptr); +++ +++ y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux); +++ y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); +++ real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); +++ +++ y_aux = _mm_shuffle_epi8 (y, _mm_set_epi8 (14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1)); +++ y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux); +++ imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); +++ +++ imag_output = _mm_slli_si128 (imag_output, 1); +++ output = _mm_blendv_epi8 (imag_output, real_output, mult1); +++ +++ input_i_1 = _mm_cvtepi8_epi32(output); +++ output = _mm_srli_si128 (output, 4); +++ input_i_2 = _mm_cvtepi8_epi32(output); +++ output = _mm_srli_si128 (output, 4); +++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2); +++ output_ps_1 = _mm_cvtepi32_ps(output_i32); +++ +++ input_i_1 = _mm_cvtepi8_epi32(output); +++ output = _mm_srli_si128 (output, 4); +++ input_i_2 = _mm_cvtepi8_epi32(output); +++ output = _mm_srli_si128 (output, 4); +++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2); +++ output_ps_2 = _mm_cvtepi32_ps(output_i32); +++ +++ VL_code_acc = _mm_add_ps (VL_code_acc, output_ps_1); +++ VL_code_acc = _mm_add_ps (VL_code_acc, output_ps_2); +++ +++ input_ptr += 8; +++ carrier_ptr += 8; +++ VE_code_ptr += 8; +++ E_code_ptr += 8; +++ P_code_ptr += 8; +++ L_code_ptr += 8; +++ VL_code_ptr += 8; +++ } +++ +++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t VE_dotProductVector[2]; +++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2]; +++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2]; +++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t 
L_dotProductVector[2]; +++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t VL_dotProductVector[2]; +++ +++ _mm_storeu_ps((float*)VE_dotProductVector,VE_code_acc); // Store the results back into the dot product vector +++ _mm_storeu_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector +++ _mm_storeu_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector +++ _mm_storeu_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector +++ _mm_storeu_ps((float*)VL_dotProductVector,VL_code_acc); // Store the results back into the dot product vector +++ +++ for (int i = 0; i<2; ++i) +++ { +++ *VE_out_ptr += VE_dotProductVector[i]; +++ *E_out_ptr += E_dotProductVector[i]; +++ *P_out_ptr += P_dotProductVector[i]; +++ *L_out_ptr += L_dotProductVector[i]; +++ *VL_out_ptr += VL_dotProductVector[i]; +++ } +++ } +++ +++ lv_8sc_t bb_signal_sample; +++ for(int i=0; i < num_points%8; ++i) +++ { +++ //Perform the carrier wipe-off +++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); +++ // Now get very early, early, prompt, late and very late values for each +++ *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VE_code_ptr++)); +++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); +++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); +++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); +++ *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VL_code_ptr++)); +++ } +++} +++#endif /* LV_HAVE_SSE4_1 */ +++ +++#ifdef LV_HAVE_SSE4_1 +++#include "smmintrin.h" +++#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h" +++#include "CommonMacros/CommonMacros.h" +++/*! 
+++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation +++ \param input The input signal input +++ \param carrier The carrier signal input +++ \param VE_code Very Early PRN code replica input +++ \param E_code Early PRN code replica input +++ \param P_code Prompt PRN code replica input +++ \param L_code Late PRN code replica input +++ \param VL_code Very Late PRN code replica input +++ \param VE_out Very Early correlation output +++ \param E_out Early correlation output +++ \param P_out Prompt correlation output +++ \param L_out Late correlation output +++ \param VL_out Very Late correlation output +++ \param num_points The number of complex values in vectors +++ */ +++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_u_sse4_1_third(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points) +++{ +++ const unsigned int sse_iters = num_points / 8; +++ +++ __m128i x, x_abs, y, y_aux, bb_signal_sample_aux, bb_signal_sample_aux_abs;; +++ __m128i mult1, real_output, imag_output; +++ +++ __m128 real_VE_code_acc, imag_VE_code_acc, real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc, real_VL_code_acc, imag_VL_code_acc; +++ __m128i real_output_i_1, real_output_i_2, imag_output_i_1, imag_output_i_2, real_output_i32, imag_output_i32; +++ __m128 real_output_ps, imag_output_ps; +++ +++ __m128i check_sign_sequence = _mm_set_epi8 (255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1); +++ __m128i rearrange_sequence = _mm_set_epi8 (14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1); +++ +++ const lv_8sc_t* input_ptr = input; +++ const lv_8sc_t* carrier_ptr = carrier; +++ +++ const lv_8sc_t* VE_code_ptr = VE_code; +++ lv_32fc_t* VE_out_ptr = 
VE_out; +++ const lv_8sc_t* E_code_ptr = E_code; +++ lv_32fc_t* E_out_ptr = E_out; +++ const lv_8sc_t* P_code_ptr = P_code; +++ lv_32fc_t* P_out_ptr = P_out; +++ const lv_8sc_t* L_code_ptr = L_code; +++ lv_32fc_t* L_out_ptr = L_out; +++ const lv_8sc_t* VL_code_ptr = VL_code; +++ lv_32fc_t* VL_out_ptr = VL_out; +++ +++ float VE_out_real = 0; +++ float VE_out_imag = 0; +++ float E_out_real = 0; +++ float E_out_imag = 0; +++ float P_out_real = 0; +++ float P_out_imag = 0; +++ float L_out_real = 0; +++ float L_out_imag = 0; +++ float VL_out_real = 0; +++ float VL_out_imag = 0; +++ +++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); +++ +++ real_VE_code_acc = _mm_setzero_ps(); +++ imag_VE_code_acc = _mm_setzero_ps(); +++ real_E_code_acc = _mm_setzero_ps(); +++ imag_E_code_acc = _mm_setzero_ps(); +++ real_P_code_acc = _mm_setzero_ps(); +++ imag_P_code_acc = _mm_setzero_ps(); +++ real_L_code_acc = _mm_setzero_ps(); +++ imag_L_code_acc = _mm_setzero_ps(); +++ real_VL_code_acc = _mm_setzero_ps(); +++ imag_VL_code_acc = _mm_setzero_ps(); +++ +++ if (sse_iters>0) +++ { +++ for(int number = 0;number < sse_iters; number++){ +++ +++ //Perform the carrier wipe-off +++ x = _mm_lddqu_si128((__m128i*)input_ptr); +++ y = _mm_lddqu_si128((__m128i*)carrier_ptr); +++ +++ x_abs = _mm_abs_epi8 (x); +++ +++ y_aux = _mm_sign_epi8 (y, x); +++ y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); +++ real_output = _mm_maddubs_epi16 (x_abs, y_aux); +++ +++ y_aux = _mm_shuffle_epi8 (y, rearrange_sequence); +++ y_aux = _mm_sign_epi8 (y_aux, x); +++ imag_output = _mm_maddubs_epi16 (x_abs, y_aux); +++ +++ imag_output = _mm_slli_si128 (imag_output, 1); +++ bb_signal_sample_aux = _mm_blendv_epi8 (imag_output, real_output, mult1); +++ bb_signal_sample_aux_abs = _mm_abs_epi8 (bb_signal_sample_aux); +++ +++ //Get very early values +++ y = _mm_lddqu_si128((__m128i*)VE_code_ptr); +++ +++ y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux); +++ y_aux = _mm_sign_epi8 (y_aux, 
check_sign_sequence); +++ real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); +++ +++ y_aux = _mm_shuffle_epi8 (y, rearrange_sequence); +++ y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux); +++ imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); +++ +++ real_output_i_1 = _mm_cvtepi16_epi32(real_output); +++ real_output = _mm_srli_si128 (real_output, 8); +++ real_output_i_2 = _mm_cvtepi16_epi32(real_output); +++ real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); +++ real_output_ps = _mm_cvtepi32_ps(real_output_i32); +++ +++ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); +++ imag_output = _mm_srli_si128 (imag_output, 8); +++ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); +++ imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); +++ imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); +++ +++ real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps); +++ imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps); +++ +++ //Get early values +++ y = _mm_lddqu_si128((__m128i*)E_code_ptr); +++ +++ y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux); +++ y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); +++ real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); +++ +++ y_aux = _mm_shuffle_epi8 (y, rearrange_sequence); +++ y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux); +++ imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); +++ +++ real_output_i_1 = _mm_cvtepi16_epi32(real_output); +++ real_output = _mm_srli_si128 (real_output, 8); +++ real_output_i_2 = _mm_cvtepi16_epi32(real_output); +++ real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); +++ real_output_ps = _mm_cvtepi32_ps(real_output_i32); +++ +++ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); +++ imag_output = _mm_srli_si128 (imag_output, 8); +++ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); +++ imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); +++ imag_output_ps 
= _mm_cvtepi32_ps(imag_output_i32); +++ +++ real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps); +++ imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps); +++ +++ //Get prompt values +++ y = _mm_lddqu_si128((__m128i*)P_code_ptr); +++ +++ y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux); +++ y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); +++ real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); +++ +++ y_aux = _mm_shuffle_epi8 (y, rearrange_sequence); +++ y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux); +++ imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); +++ +++ real_output_i_1 = _mm_cvtepi16_epi32(real_output); +++ real_output = _mm_srli_si128 (real_output, 8); +++ real_output_i_2 = _mm_cvtepi16_epi32(real_output); +++ real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); +++ real_output_ps = _mm_cvtepi32_ps(real_output_i32); +++ +++ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); +++ imag_output = _mm_srli_si128 (imag_output, 8); +++ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); +++ imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); +++ imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); +++ +++ real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps); +++ imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps); +++ +++ //Get late values +++ y = _mm_lddqu_si128((__m128i*)L_code_ptr); +++ +++ y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux); +++ y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); +++ real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); +++ +++ y_aux = _mm_shuffle_epi8 (y, rearrange_sequence); +++ y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux); +++ imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); +++ +++ real_output_i_1 = _mm_cvtepi16_epi32(real_output); +++ real_output = _mm_srli_si128 (real_output, 8); +++ real_output_i_2 = _mm_cvtepi16_epi32(real_output); +++ real_output_i32 = _mm_add_epi32 
(real_output_i_1, real_output_i_2); +++ real_output_ps = _mm_cvtepi32_ps(real_output_i32); +++ +++ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); +++ imag_output = _mm_srli_si128 (imag_output, 8); +++ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); +++ imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); +++ imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); +++ +++ real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps); +++ imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps); +++ +++ //Get very late values +++ y = _mm_lddqu_si128((__m128i*)VL_code_ptr); +++ +++ y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux); +++ y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); +++ real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); +++ +++ y_aux = _mm_shuffle_epi8 (y, _mm_set_epi8 (14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1)); +++ y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux); +++ imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); +++ +++ real_output_i_1 = _mm_cvtepi16_epi32(real_output); +++ real_output = _mm_srli_si128 (real_output, 8); +++ real_output_i_2 = _mm_cvtepi16_epi32(real_output); +++ real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); +++ real_output_ps = _mm_cvtepi32_ps(real_output_i32); +++ +++ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); +++ imag_output = _mm_srli_si128 (imag_output, 8); +++ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); +++ imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); +++ imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); +++ +++ real_VL_code_acc = _mm_add_ps (real_VL_code_acc, real_output_ps); +++ imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps); +++ +++ input_ptr += 8; +++ carrier_ptr += 8; +++ VE_code_ptr += 8; +++ E_code_ptr += 8; +++ P_code_ptr += 8; +++ L_code_ptr += 8; +++ VL_code_ptr += 8; +++ } +++ +++ __VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4]; +++ 
__VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4]; +++ +++ _mm_storeu_ps((float*)real_VE_dotProductVector,real_VE_code_acc); // Store the results back into the dot product vector +++ _mm_storeu_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc); // Store the results back into the dot product vector +++ _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector +++ _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector +++ _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector +++ _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector +++ _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector +++ _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector +++ _mm_storeu_ps((float*)real_VL_dotProductVector,real_VL_code_acc); // Store the results back into the dot product vector +++ _mm_storeu_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc); // Store the results back into the dot product vector +++ +++ for (int i = 0; i<4; ++i) +++ { +++ VE_out_real += real_VE_dotProductVector[i]; +++ VE_out_imag += imag_VE_dotProductVector[i]; +++ E_out_real += real_E_dotProductVector[i]; +++ E_out_imag += 
imag_E_dotProductVector[i]; +++ P_out_real += real_P_dotProductVector[i]; +++ P_out_imag += imag_P_dotProductVector[i]; +++ L_out_real += real_L_dotProductVector[i]; +++ L_out_imag += imag_L_dotProductVector[i]; +++ VL_out_real += real_VL_dotProductVector[i]; +++ VL_out_imag += imag_VL_dotProductVector[i]; +++ } +++ *VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag); +++ *E_out_ptr = lv_cmake(E_out_real, E_out_imag); +++ *P_out_ptr = lv_cmake(P_out_real, P_out_imag); +++ *L_out_ptr = lv_cmake(L_out_real, L_out_imag); +++ *VL_out_ptr = lv_cmake(VL_out_real, VL_out_imag); +++ } +++ +++ lv_16sc_t bb_signal_sample; +++ for(int i=0; i < num_points%8; ++i) +++ { +++ //Perform the carrier wipe-off +++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); +++ // Now get very early, early, prompt, late and very late values for each +++ *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VE_code_ptr++)); +++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*E_code_ptr++)); +++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*P_code_ptr++)); +++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*L_code_ptr++)); +++ *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VL_code_ptr++)); +++ } +++} +++#endif /* LV_HAVE_SSE4_1 */ +++ +++#ifdef LV_HAVE_SSE4_1 +++#include "smmintrin.h" +++#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h" +++#include "CommonMacros/CommonMacros.h" +++/*! 
+++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation +++ \param input The input signal input +++ \param carrier The carrier signal input +++ \param VE_code Very Early PRN code replica input +++ \param E_code Early PRN code replica input +++ \param P_code Prompt PRN code replica input +++ \param L_code Late PRN code replica input +++ \param VL_code Very Late PRN code replica input +++ \param VE_out Very Early correlation output +++ \param E_out Early correlation output +++ \param P_out Prompt correlation output +++ \param L_out Late correlation output +++ \param VL_out Very Late correlation output +++ \param num_points The number of complex values in vectors +++ */ +++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_u_sse4_1_fourth(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points) +++{ +++ const unsigned int sse_iters = num_points / 8; +++ +++ __m128i x, x_abs, y, y_aux, bb_signal_sample_aux, bb_signal_sample_aux_abs;; +++ __m128i real_output, imag_output; +++ __m128 real_VE_code_acc, imag_VE_code_acc, real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc, real_VL_code_acc, imag_VL_code_acc; +++ __m128i real_output_i_1, real_output_i_2, imag_output_i_1, imag_output_i_2, real_output_i32, imag_output_i32; +++ __m128 real_output_ps, imag_output_ps; +++ __m128i minus128control; +++ +++ __m128i minus128 = _mm_set1_epi8 (-128); +++ __m128i check_sign_sequence = _mm_set_epi8 (255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1); +++ __m128i rearrange_sequence = _mm_set_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1); +++ __m128i mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); +++ 
+++ const lv_8sc_t* input_ptr = input; +++ const lv_8sc_t* carrier_ptr = carrier; +++ +++ const lv_8sc_t* VE_code_ptr = VE_code; +++ lv_32fc_t* VE_out_ptr = VE_out; +++ const lv_8sc_t* E_code_ptr = E_code; +++ lv_32fc_t* E_out_ptr = E_out; +++ const lv_8sc_t* P_code_ptr = P_code; +++ lv_32fc_t* P_out_ptr = P_out; +++ const lv_8sc_t* L_code_ptr = L_code; +++ lv_32fc_t* L_out_ptr = L_out; +++ const lv_8sc_t* VL_code_ptr = VL_code; +++ lv_32fc_t* VL_out_ptr = VL_out; +++ +++ float VE_out_real = 0; +++ float VE_out_imag = 0; +++ float E_out_real = 0; +++ float E_out_imag = 0; +++ float P_out_real = 0; +++ float P_out_imag = 0; +++ float L_out_real = 0; +++ float L_out_imag = 0; +++ float VL_out_real = 0; +++ float VL_out_imag = 0; +++ +++ real_VE_code_acc = _mm_setzero_ps(); +++ imag_VE_code_acc = _mm_setzero_ps(); +++ real_E_code_acc = _mm_setzero_ps(); +++ imag_E_code_acc = _mm_setzero_ps(); +++ real_P_code_acc = _mm_setzero_ps(); +++ imag_P_code_acc = _mm_setzero_ps(); +++ real_L_code_acc = _mm_setzero_ps(); +++ imag_L_code_acc = _mm_setzero_ps(); +++ real_VL_code_acc = _mm_setzero_ps(); +++ imag_VL_code_acc = _mm_setzero_ps(); +++ +++ if (sse_iters>0) +++ { +++ for(int number = 0;number < sse_iters; number++){ +++ +++ //Perform the carrier wipe-off +++ x = _mm_lddqu_si128((__m128i*)input_ptr); +++ y = _mm_lddqu_si128((__m128i*)carrier_ptr); +++ +++ x_abs = _mm_abs_epi8 (x); +++ +++ y_aux = _mm_sign_epi8 (y, x); +++ y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); +++ real_output = _mm_maddubs_epi16 (x_abs, y_aux); +++ +++ y_aux = _mm_shuffle_epi8 (y, rearrange_sequence); +++ y_aux = _mm_sign_epi8 (y_aux, x); +++ imag_output = _mm_maddubs_epi16 (x_abs, y_aux); +++ +++ imag_output = _mm_slli_si128 (imag_output, 1); +++ bb_signal_sample_aux = _mm_blendv_epi8 (imag_output, real_output, mult1); +++ bb_signal_sample_aux_abs = _mm_abs_epi8 (bb_signal_sample_aux); +++ +++ //Get very early values +++ y = _mm_lddqu_si128((__m128i*)VE_code_ptr); +++ minus128control = 
_mm_cmpeq_epi8 (y, minus128); +++ y = _mm_sub_epi8 (y, minus128control); +++ +++ y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux); +++ y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); +++ real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); +++ +++ y_aux = _mm_shuffle_epi8 (y, rearrange_sequence); +++ y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux); +++ imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); +++ +++ real_output_i_1 = _mm_cvtepi16_epi32(real_output); +++ real_output = _mm_srli_si128 (real_output, 8); +++ real_output_i_2 = _mm_cvtepi16_epi32(real_output); +++ real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); +++ real_output_ps = _mm_cvtepi32_ps(real_output_i32); +++ +++ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); +++ imag_output = _mm_srli_si128 (imag_output, 8); +++ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); +++ imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); +++ imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); +++ +++ real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps); +++ imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps); +++ +++ //Get early values +++ y = _mm_lddqu_si128((__m128i*)E_code_ptr); +++ minus128control = _mm_cmpeq_epi8 (y, minus128); +++ y = _mm_sub_epi8 (y, minus128control); +++ +++ y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux); +++ y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); +++ real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); +++ +++ y_aux = _mm_shuffle_epi8 (y, rearrange_sequence); +++ y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux); +++ imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); +++ +++ real_output_i_1 = _mm_cvtepi16_epi32(real_output); +++ real_output = _mm_srli_si128 (real_output, 8); +++ real_output_i_2 = _mm_cvtepi16_epi32(real_output); +++ real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); +++ real_output_ps = 
_mm_cvtepi32_ps(real_output_i32); +++ +++ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); +++ imag_output = _mm_srli_si128 (imag_output, 8); +++ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); +++ imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); +++ imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); +++ +++ real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps); +++ imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps); +++ +++ //Get prompt values +++ y = _mm_lddqu_si128((__m128i*)P_code_ptr); +++ minus128control = _mm_cmpeq_epi8 (y, minus128); +++ y = _mm_sub_epi8 (y, minus128control); +++ +++ y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux); +++ y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); +++ real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); +++ +++ y_aux = _mm_shuffle_epi8 (y, rearrange_sequence); +++ y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux); +++ imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); +++ +++ real_output_i_1 = _mm_cvtepi16_epi32(real_output); +++ real_output = _mm_srli_si128 (real_output, 8); +++ real_output_i_2 = _mm_cvtepi16_epi32(real_output); +++ real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); +++ real_output_ps = _mm_cvtepi32_ps(real_output_i32); +++ +++ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); +++ imag_output = _mm_srli_si128 (imag_output, 8); +++ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); +++ imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); +++ imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); +++ +++ real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps); +++ imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps); +++ +++ //Get late values +++ y = _mm_lddqu_si128((__m128i*)L_code_ptr); +++ minus128control = _mm_cmpeq_epi8 (y, minus128); +++ y = _mm_sub_epi8 (y, minus128control); +++ +++ y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux); +++ y_aux = _mm_sign_epi8 (y_aux, 
check_sign_sequence); +++ real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); +++ +++ y_aux = _mm_shuffle_epi8 (y, rearrange_sequence); +++ y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux); +++ imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); +++ +++ real_output_i_1 = _mm_cvtepi16_epi32(real_output); +++ real_output = _mm_srli_si128 (real_output, 8); +++ real_output_i_2 = _mm_cvtepi16_epi32(real_output); +++ real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); +++ real_output_ps = _mm_cvtepi32_ps(real_output_i32); +++ +++ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); +++ imag_output = _mm_srli_si128 (imag_output, 8); +++ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); +++ imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); +++ imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); +++ +++ real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps); +++ imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps); +++ +++ //Get very late values +++ y = _mm_lddqu_si128((__m128i*)VL_code_ptr); +++ minus128control = _mm_cmpeq_epi8 (y, minus128); +++ y = _mm_sub_epi8 (y, minus128control); +++ +++ +++ y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux); +++ y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); +++ real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); +++ +++ y_aux = _mm_shuffle_epi8 (y, _mm_set_epi8 (14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1)); +++ y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux); +++ imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); +++ +++ real_output_i_1 = _mm_cvtepi16_epi32(real_output); +++ real_output = _mm_srli_si128 (real_output, 8); +++ real_output_i_2 = _mm_cvtepi16_epi32(real_output); +++ real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); +++ real_output_ps = _mm_cvtepi32_ps(real_output_i32); +++ +++ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); +++ imag_output = _mm_srli_si128 (imag_output, 
8); +++ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); +++ imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); +++ imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); +++ +++ real_VL_code_acc = _mm_add_ps (real_VL_code_acc, real_output_ps); +++ imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps); +++ +++ input_ptr += 8; +++ carrier_ptr += 8; +++ VE_code_ptr += 8; +++ E_code_ptr += 8; +++ P_code_ptr += 8; +++ L_code_ptr += 8; +++ VL_code_ptr += 8; +++ } +++ +++ __VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4]; +++ +++ _mm_storeu_ps((float*)real_VE_dotProductVector,real_VE_code_acc); // Store the results back into the dot product vector +++ _mm_storeu_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc); // Store the results back into the dot product vector +++ _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector +++ _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector +++ _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector +++ _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector +++ _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector +++ 
_mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector +++ _mm_storeu_ps((float*)real_VL_dotProductVector,real_VL_code_acc); // Store the results back into the dot product vector +++ _mm_storeu_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc); // Store the results back into the dot product vector +++ +++ for (int i = 0; i<4; ++i) +++ { +++ VE_out_real += real_VE_dotProductVector[i]; +++ VE_out_imag += imag_VE_dotProductVector[i]; +++ E_out_real += real_E_dotProductVector[i]; +++ E_out_imag += imag_E_dotProductVector[i]; +++ P_out_real += real_P_dotProductVector[i]; +++ P_out_imag += imag_P_dotProductVector[i]; +++ L_out_real += real_L_dotProductVector[i]; +++ L_out_imag += imag_L_dotProductVector[i]; +++ VL_out_real += real_VL_dotProductVector[i]; +++ VL_out_imag += imag_VL_dotProductVector[i]; +++ } +++ *VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag); +++ *E_out_ptr = lv_cmake(E_out_real, E_out_imag); +++ *P_out_ptr = lv_cmake(P_out_real, P_out_imag); +++ *L_out_ptr = lv_cmake(L_out_real, L_out_imag); +++ *VL_out_ptr = lv_cmake(VL_out_real, VL_out_imag); +++ } +++ +++ lv_16sc_t bb_signal_sample; +++ for(int i=0; i < num_points%8; ++i) +++ { +++ //Perform the carrier wipe-off +++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); +++ // Now get very early, early, prompt, late and very late values for each +++ *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VE_code_ptr++)); +++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*E_code_ptr++)); +++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*P_code_ptr++)); +++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*L_code_ptr++)); +++ *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VL_code_ptr++)); +++ } +++} +++#endif /* LV_HAVE_SSE4_1 */ +++ +++ +++#ifdef LV_HAVE_GENERIC +++#include +++#include +++ +++/*! 
+++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation +++ \param input The input signal input +++ \param carrier The carrier signal input +++ \param VE_code Very Early PRN code replica input +++ \param E_code Early PRN code replica input +++ \param P_code Prompt PRN code replica input +++ \param L_code Late PRN code replica input +++ \param VL_code Very Late PRN code replica input +++ \param VE_out Very Early correlation output +++ \param E_out Early correlation output +++ \param P_out Prompt correlation output +++ \param L_out Late correlation output +++ \param VL_out Very Late correlation output +++ \param num_points The number of complex values in vectors +++ */ +++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points) +++{ +++ *VE_out = 0; +++ *E_out = 0; +++ *P_out = 0; +++ *L_out = 0; +++ *VL_out = 0; +++ +++ +++ lv_16sc_t VE_code_value; +++ lv_16sc_t E_code_value; +++ lv_16sc_t P_code_value; +++ lv_16sc_t L_code_value; +++ lv_16sc_t VL_code_value; +++ lv_16sc_t bb_signal_sample; +++ +++ for(int i=0; i < num_points; ++i) +++ { +++ VE_code_value = VE_code[i]; +++ E_code_value = E_code[i]; +++ P_code_value = P_code[i]; +++ L_code_value = L_code[i]; +++ VL_code_value = VL_code[i]; +++ +++ if(lv_creal(VE_code_value) == -128) +++ { +++ VE_code_value = lv_cmake(-127, lv_cimag(VE_code_value)); +++ } +++ if(lv_cimag(VE_code_value) == -128) +++ { +++ VE_code_value = lv_cmake(lv_creal(VE_code_value), -127); +++ } +++ +++ if(lv_creal(E_code_value) == -128) +++ { +++ E_code_value = lv_cmake(-127, lv_cimag(E_code_value)); +++ } +++ if(lv_cimag(E_code_value) == -128) +++ { +++ E_code_value = lv_cmake(lv_creal(E_code_value), -127); +++ } 
+++ +++ if(lv_creal(P_code_value) == -128) +++ { +++ P_code_value = lv_cmake(-127, lv_cimag(P_code_value)); +++ } +++ if(lv_cimag(P_code_value) == -128) +++ { +++ P_code_value = lv_cmake(lv_creal(P_code_value), -127); +++ } +++ +++ if(lv_creal(L_code_value) == -128) +++ { +++ L_code_value = lv_cmake(-127, lv_cimag(L_code_value)); +++ } +++ if(lv_cimag(L_code_value) == -128) +++ { +++ L_code_value = lv_cmake(lv_creal(L_code_value), -127); +++ } +++ +++ if(lv_creal(VL_code_value) == -128) +++ { +++ VL_code_value = lv_cmake(-127, lv_cimag(VL_code_value)); +++ } +++ if(lv_cimag(VL_code_value) == -128) +++ { +++ VL_code_value = lv_cmake(lv_creal(VL_code_value), -127); +++ } +++ +++ //Perform the carrier wipe-off +++ bb_signal_sample = input[i] * carrier[i]; +++ // Now get very early, early, prompt, late and very late values for each +++ *VE_out += (lv_32fc_t) (bb_signal_sample * VE_code_value); +++ *E_out += (lv_32fc_t) (bb_signal_sample * E_code_value); +++ *P_out += (lv_32fc_t) (bb_signal_sample * P_code_value); +++ *L_out += (lv_32fc_t) (bb_signal_sample * L_code_value); +++ *VL_out += (lv_32fc_t) (bb_signal_sample * VL_code_value); +++ } +++} +++ +++#endif /* LV_HAVE_GENERIC */ +++ +++//#ifdef LV_HAVE_GENERIC +++//#include +++//#include +++//#include +++// +++//#ifndef MAX +++//#define MAX(a,b) ((a) > (b) ? a : b) +++//#endif +++// +++//#ifndef MIN +++//#define MIN(a,b) ((a) < (b) ? a : b) +++//#endif +++// +++///*! 
+++// \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation +++// \param input The input signal input +++// \param carrier The carrier signal input +++// \param VE_code Very Early PRN code replica input +++// \param E_code Early PRN code replica input +++// \param P_code Prompt PRN code replica input +++// \param L_code Late PRN code replica input +++// \param VL_code Very Late PRN code replica input +++// \param VE_out Very Early correlation output +++// \param E_out Early correlation output +++// \param P_out Prompt correlation output +++// \param L_out Late correlation output +++// \param VL_out Very Late correlation output +++// \param num_points The number of complex values in vectors +++// */ +++//static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points) +++//{ +++// *VE_out = 0; +++// *E_out = 0; +++// *P_out = 0; +++// *L_out = 0; +++// *VL_out = 0; +++// +++// lv_16sc_t VE_out16; +++// lv_16sc_t E_out16; +++// lv_16sc_t P_out16; +++// lv_16sc_t L_out16; +++// lv_16sc_t VL_out16; +++// +++// int32_t max = 32767; +++// int32_t min = -32768; +++// +++// int16_t real_real; +++// int16_t imag_imag; +++// int16_t real_imag; +++// int16_t imag_real; +++// int32_t out_real_32; +++// int32_t out_imag_32; +++// int16_t out_real_16; +++// int16_t out_imag_16; +++// int16_t aux1; +++// int16_t aux2; +++// +++// +++// lv_8sc_t bb_signal_sample = lv_cmake(0, 0); +++// +++// // perform very early, Early, Prompt, Late and very late correlation +++// for(int i=0; i < num_points; ++i) +++// { +++// //Perform the carrier wipe-off +++// bb_signal_sample = input[i] * carrier[i]; +++// +++// aux1 = (int16_t)lv_creal(bb_signal_sample); +++// aux2 = 
(int16_t)lv_creal(VE_code[i]); +++// real_real = aux1*aux2; +++// aux1 = (int16_t)lv_cimag(bb_signal_sample); +++// aux2 = (int16_t)lv_cimag(VE_code[i]); +++// imag_imag = aux1*aux2; +++// aux1 = (int16_t)lv_creal(bb_signal_sample); +++// aux2 = (int16_t)lv_cimag(VE_code[i]); +++// real_imag = aux1*aux2; +++// aux1 = (int16_t)lv_cimag(bb_signal_sample); +++// aux2 = (int16_t)lv_creal(VE_code[i]); +++// imag_real = aux1*aux2; +++// out_real_32 = (int32_t)real_real - (int32_t)imag_imag; +++// out_imag_32 = (int32_t)real_imag + (int32_t)imag_real; +++// out_real_16 = MIN(MAX(out_real_32, min), max); +++// out_imag_16 = MIN(MAX(out_imag_32, min), max); +++// VE_out16 = lv_cmake(out_real_16, out_imag_16); +++// +++// +++// +++// if(lv_creal(L_code[i]) == -128) +++// { +++// int8_t* L_pointer = (int8_t*)&L_code[i]; +++// *L_pointer = -127; +++// } +++// if(lv_cimag(L_code[i]) == -128) +++// { +++// int8_t* L_pointer = (int8_t*)&L_code[i]; +++// L_pointer++; +++// *L_pointer = -127; +++// } +++// aux1 = (int16_t)lv_creal(bb_signal_sample); +++// aux2 = (int16_t)lv_creal(L_code[i]); +++// real_real = aux1*aux2; +++// aux1 = (int16_t)lv_cimag(bb_signal_sample); +++// aux2 = (int16_t)lv_cimag(L_code[i]); +++// imag_imag = aux1*aux2; +++// aux1 = (int16_t)lv_creal(bb_signal_sample); +++// aux2 = (int16_t)lv_cimag(L_code[i]); +++// real_imag = aux1*aux2; +++// aux1 = (int16_t)lv_cimag(bb_signal_sample); +++// aux2 = (int16_t)lv_creal(L_code[i]); +++// imag_real = aux1*aux2; +++// out_real_32 = (int32_t)real_real - (int32_t)imag_imag; +++// out_imag_32 = (int32_t)real_imag + (int32_t)imag_real; +++// out_real_16 = MIN(MAX(out_real_32, min), max); +++// out_imag_16 = MIN(MAX(out_imag_32, min), max); +++// L_out16 = lv_cmake(out_real_16, out_imag_16); +++// +++// E_out16 = (lv_16sc_t)bb_signal_sample * (lv_16sc_t)E_code[i]; +++// P_out16 = (lv_16sc_t)bb_signal_sample * (lv_16sc_t)P_code[i]; +++// VL_out16 = (lv_16sc_t)bb_signal_sample * (lv_16sc_t)VL_code[i]; +++// +++// +++// 
*VE_out += (lv_32fc_t) VE_out16; +++// *E_out += (lv_32fc_t) E_out16; +++// *P_out += (lv_32fc_t) P_out16; +++// *L_out += (lv_32fc_t) L_out16; +++// *VL_out += (lv_32fc_t) VL_out16; +++// +++// //error en la parte real de L con 32 muestras +++// //*L_out = lv_cmake(12, 12); +++// } +++//} +++// +++//#endif /* LV_HAVE_GENERIC */ +++ +++//#ifdef LV_HAVE_GENERIC +++///*! +++// \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation +++// \param input The input signal input +++// \param carrier The carrier signal input +++// \param VE_code Very Early PRN code replica input +++// \param E_code Early PRN code replica input +++// \param P_code Prompt PRN code replica input +++// \param L_code Late PRN code replica input +++// \param VL_code Very Late PRN code replica input +++// \param VE_out Very Early correlation output +++// \param E_out Early correlation output +++// \param P_out Prompt correlation output +++// \param L_out Late correlation output +++// \param VL_out Very Late correlation output +++// \param num_points The number of complex values in vectors +++// */ +++//static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points) +++//{ +++// lv_8sc_t bb_signal_sample; +++// +++// bb_signal_sample = lv_cmake(0, 0); +++// +++// *VE_out = 0; +++// *E_out = 0; +++// *P_out = 0; +++// *L_out = 0; +++// *VL_out = 0; +++// // perform very early, Early, Prompt, Late and very late correlation +++// for(int i=0; i < num_points; ++i) +++// { +++// //Perform the carrier wipe-off +++// bb_signal_sample = input[i] * carrier[i]; +++// +++// *VE_out += (lv_32fc_t) (bb_signal_sample * VE_code[i]); +++// *E_out += (lv_32fc_t) (bb_signal_sample * E_code[i]); 
+++// *P_out += (lv_32fc_t) (bb_signal_sample * P_code[i]); +++// *L_out += (lv_32fc_t) (bb_signal_sample * L_code[i]); +++// *VL_out += (lv_32fc_t) (bb_signal_sample * VL_code[i]); +++// } +++//} +++// +++//#endif /* LV_HAVE_GENERIC */ +++ +++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_u_H */ ++\ No newline at end of file ++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5.h ++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5.h 1970-01-01 01:00:00.000000000 +0100 +++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5.h 2014-10-15 01:55:08.000000000 +0200 ++@@ -0,0 +1,772 @@ +++/*! +++ * \file volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5.h +++ * \brief Volk protokernel: performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation with 16-bit vectors, and accumulates the results into float32. This protokernel is called "safe" because it checks whether the inputs have a -128 value, and replaces it with a -127 value. By doing this it avoids malfunctioning, but it takes more time than the "unsafe" implementation. In order to avoid overflow, "input" and "carrier" must be values between -7 and 7 and the "XX_code" inputs must be values between -127 and 127. +++ * \authors <ul>
+++ * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
+++ * </ul>
+++ * +++ * Volk protokernel that performs the carrier wipe-off mixing and the +++ * Very Early, Early, Prompt, Late and Very Late correlation with 16-bit vectors (8 bits the +++ * real part and 8 bits the imaginary part), and accumulates the result +++ * in 32-bit single-precision values, returning float32 values: +++ * - The carrier wipe-off is done by multiplying the input signal by the +++ * carrier (multiplication of 16-bit vectors). It returns the input +++ * signal in base band (BB) +++ * - Very Early values are calculated by multiplying the input signal in BB by the +++ * very early code (multiplication of 16-bit vectors), accumulating the results into float32 values +++ * - Early values are calculated by multiplying the input signal in BB by the +++ * early code (multiplication of 16-bit vectors), accumulating the results into float32 values +++ * - Prompt values are calculated by multiplying the input signal in BB by the +++ * prompt code (multiplication of 16-bit vectors), accumulating the results into float32 values +++ * - Late values are calculated by multiplying the input signal in BB by the +++ * late code (multiplication of 16-bit vectors), accumulating the results into float32 values +++ * - Very Late values are calculated by multiplying the input signal in BB by the +++ * very late code (multiplication of 16-bit vectors), accumulating the results into float32 values +++ * +++ * ------------------------------------------------------------------------- +++ * Bits analysis +++ * +++ * input = 8 bits +++ * carrier = 8 bits +++ * XX_code = 8 bits +++ * XX_out16 = 16 bits +++ * bb_signal_sample = 8 bits +++ * +++ * bb_signal_sample = input*carrier -> 17 bits limited to 8 bits = input and carrier must be values between -7 and 7 to avoid overflow (3 bits) +++ * +++ * XX_out16 = XX_code*bb_signal_sample -> 17 bits limited to 16 bits = XX_code must be values between -127 and 127 to avoid overflow (7 bits) +++ * 
------------------------------------------------------------------------- +++ * +++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) +++ * +++ * GNSS-SDR is a software defined Global Navigation +++ * Satellite Systems receiver +++ * +++ * This file is part of GNSS-SDR. +++ * +++ * GNSS-SDR is free software: you can redistribute it and/or modify +++ * it under the terms of the GNU General Public License as published by +++ * the Free Software Foundation, either version 3 of the License, or +++ * at your option) any later version. +++ * +++ * GNSS-SDR is distributed in the hope that it will be useful, +++ * but WITHOUT ANY WARRANTY; without even the implied warranty of +++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +++ * GNU General Public License for more details. +++ * +++ * You should have received a copy of the GNU General Public License +++ * along with GNSS-SDR. If not, see . +++ * +++ * ------------------------------------------------------------------------- +++ */ +++ +++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_u_H +++#define INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_u_H +++ +++#include +++#include +++#include +++#include +++#include +++ +++#ifdef LV_HAVE_SSE4_1 +++#include "smmintrin.h" +++#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h" +++#include "CommonMacros/CommonMacros.h" +++/*! 
+++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation +++ \param input The input signal input +++ \param carrier The carrier signal input +++ \param VE_code Very Early PRN code replica input +++ \param E_code Early PRN code replica input +++ \param P_code Prompt PRN code replica input +++ \param L_code Late PRN code replica input +++ \param VL_code Very Late PRN code replica input +++ \param VE_out Very Early correlation output +++ \param E_out Early correlation output +++ \param P_out Prompt correlation output +++ \param L_out Late correlation output +++ \param VL_out Very Late correlation output +++ \param num_points The number of complex values in vectors +++ */ +++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_u_sse4_1(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points) +++{ +++ const unsigned int sse_iters = num_points / 8; +++ +++ __m128i x, x_abs, y, y_aux, bb_signal_sample_aux, bb_signal_sample_aux_abs;; +++ __m128i real_output, imag_output; +++ __m128 real_VE_code_acc, imag_VE_code_acc, real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc, real_VL_code_acc, imag_VL_code_acc; +++ __m128i input_i_1, input_i_2, output_i32; +++ __m128 real_output_ps, imag_output_ps; +++ __m128i minus128control; +++ +++ __m128i minus128 = _mm_set1_epi8 (-128); +++ __m128i check_sign_sequence = _mm_set_epi8 (255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1); +++ __m128i rearrange_sequence = _mm_set_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1); +++ __m128i mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); +++ +++ const lv_8sc_t* input_ptr = input; +++ const lv_8sc_t* carrier_ptr = 
carrier; +++ +++ const lv_8sc_t* VE_code_ptr = VE_code; +++ lv_32fc_t* VE_out_ptr = VE_out; +++ const lv_8sc_t* E_code_ptr = E_code; +++ lv_32fc_t* E_out_ptr = E_out; +++ const lv_8sc_t* P_code_ptr = P_code; +++ lv_32fc_t* P_out_ptr = P_out; +++ const lv_8sc_t* L_code_ptr = L_code; +++ lv_32fc_t* L_out_ptr = L_out; +++ const lv_8sc_t* VL_code_ptr = VL_code; +++ lv_32fc_t* VL_out_ptr = VL_out; +++ +++ float VE_out_real = 0; +++ float VE_out_imag = 0; +++ float E_out_real = 0; +++ float E_out_imag = 0; +++ float P_out_real = 0; +++ float P_out_imag = 0; +++ float L_out_real = 0; +++ float L_out_imag = 0; +++ float VL_out_real = 0; +++ float VL_out_imag = 0; +++ +++ real_VE_code_acc = _mm_setzero_ps(); +++ imag_VE_code_acc = _mm_setzero_ps(); +++ real_E_code_acc = _mm_setzero_ps(); +++ imag_E_code_acc = _mm_setzero_ps(); +++ real_P_code_acc = _mm_setzero_ps(); +++ imag_P_code_acc = _mm_setzero_ps(); +++ real_L_code_acc = _mm_setzero_ps(); +++ imag_L_code_acc = _mm_setzero_ps(); +++ real_VL_code_acc = _mm_setzero_ps(); +++ imag_VL_code_acc = _mm_setzero_ps(); +++ +++ if (sse_iters>0) +++ { +++ for(int number = 0;number < sse_iters; number++){ +++ +++ //Perform the carrier wipe-off +++ x = _mm_lddqu_si128((__m128i*)input_ptr); +++ y = _mm_lddqu_si128((__m128i*)carrier_ptr); +++ +++ x_abs = _mm_abs_epi8 (x); +++ +++ CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3(y, x, check_sign_sequence, rearrange_sequence, y_aux, x_abs, real_output, imag_output) +++ +++ imag_output = _mm_slli_si128 (imag_output, 1); +++ bb_signal_sample_aux = _mm_blendv_epi8 (imag_output, real_output, mult1); +++ bb_signal_sample_aux_abs = _mm_abs_epi8 (bb_signal_sample_aux); +++ +++ //Get very early values +++ y = _mm_lddqu_si128((__m128i*)VE_code_ptr); +++ +++ CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, 
imag_output_ps) +++ +++ real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps); +++ imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps); +++ +++ //Get early values +++ y = _mm_lddqu_si128((__m128i*)E_code_ptr); +++ +++ CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) +++ +++ real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps); +++ imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps); +++ +++ //Get prompt values +++ y = _mm_lddqu_si128((__m128i*)P_code_ptr); +++ +++ CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) +++ +++ real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps); +++ imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps); +++ +++ //Get late values +++ y = _mm_lddqu_si128((__m128i*)L_code_ptr); +++ +++ CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) +++ +++ real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps); +++ imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps); +++ +++ //Get very late values +++ y = _mm_lddqu_si128((__m128i*)VL_code_ptr); +++ +++ CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) +++ +++ real_VL_code_acc = _mm_add_ps 
(real_VL_code_acc, real_output_ps); +++ imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps); +++ +++ input_ptr += 8; +++ carrier_ptr += 8; +++ VE_code_ptr += 8; +++ E_code_ptr += 8; +++ P_code_ptr += 8; +++ L_code_ptr += 8; +++ VL_code_ptr += 8; +++ } +++ +++ __VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4]; +++ +++ _mm_storeu_ps((float*)real_VE_dotProductVector,real_VE_code_acc); // Store the results back into the dot product vector +++ _mm_storeu_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc); // Store the results back into the dot product vector +++ _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector +++ _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector +++ _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector +++ _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector +++ _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector +++ _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector +++ _mm_storeu_ps((float*)real_VL_dotProductVector,real_VL_code_acc); // Store the results back into the dot product 
vector +++ _mm_storeu_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc); // Store the results back into the dot product vector +++ +++ for (int i = 0; i<4; ++i) +++ { +++ VE_out_real += real_VE_dotProductVector[i]; +++ VE_out_imag += imag_VE_dotProductVector[i]; +++ E_out_real += real_E_dotProductVector[i]; +++ E_out_imag += imag_E_dotProductVector[i]; +++ P_out_real += real_P_dotProductVector[i]; +++ P_out_imag += imag_P_dotProductVector[i]; +++ L_out_real += real_L_dotProductVector[i]; +++ L_out_imag += imag_L_dotProductVector[i]; +++ VL_out_real += real_VL_dotProductVector[i]; +++ VL_out_imag += imag_VL_dotProductVector[i]; +++ } +++ *VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag); +++ *E_out_ptr = lv_cmake(E_out_real, E_out_imag); +++ *P_out_ptr = lv_cmake(P_out_real, P_out_imag); +++ *L_out_ptr = lv_cmake(L_out_real, L_out_imag); +++ *VL_out_ptr = lv_cmake(VL_out_real, VL_out_imag); +++ } +++ +++ if(num_points%8!=0) +++ { +++ lv_16sc_t bb_signal_sample; +++ lv_16sc_t VE_code_value; +++ lv_16sc_t E_code_value; +++ lv_16sc_t P_code_value; +++ lv_16sc_t L_code_value; +++ lv_16sc_t VL_code_value; +++ +++ for(int i=0; i < num_points%8; ++i) +++ { +++ VE_code_value = *VE_code_ptr++; +++ E_code_value = *E_code_ptr++; +++ P_code_value = *P_code_ptr++; +++ L_code_value = *L_code_ptr++; +++ VL_code_value = *VL_code_ptr++; +++ +++ if(lv_creal(VE_code_value) == -128) +++ { +++ VE_code_value = lv_cmake(-127, lv_cimag(VE_code_value)); +++ } +++ if(lv_cimag(VE_code_value) == -128) +++ { +++ VE_code_value = lv_cmake(lv_creal(VE_code_value), -127); +++ } +++ +++ if(lv_creal(E_code_value) == -128) +++ { +++ E_code_value = lv_cmake(-127, lv_cimag(E_code_value)); +++ } +++ if(lv_cimag(E_code_value) == -128) +++ { +++ E_code_value = lv_cmake(lv_creal(E_code_value), -127); +++ } +++ +++ if(lv_creal(P_code_value) == -128) +++ { +++ P_code_value = lv_cmake(-127, lv_cimag(P_code_value)); +++ } +++ if(lv_cimag(P_code_value) == -128) +++ { +++ P_code_value = 
lv_cmake(lv_creal(P_code_value), -127); +++ } +++ +++ if(lv_creal(L_code_value) == -128) +++ { +++ L_code_value = lv_cmake(-127, lv_cimag(L_code_value)); +++ } +++ if(lv_cimag(L_code_value) == -128) +++ { +++ L_code_value = lv_cmake(lv_creal(L_code_value), -127); +++ } +++ +++ //Perform the carrier wipe-off +++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); +++ // Now get very early, early, prompt, late and very late values for each +++ *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * VE_code_value); +++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * E_code_value); +++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * P_code_value); +++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * L_code_value); +++ *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * VL_code_value); +++ } +++ } +++} +++#endif /* LV_HAVE_SSE4_1 */ +++ +++#ifdef LV_HAVE_GENERIC +++#include +++#include +++ +++/*! +++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation +++ \param input The input signal input +++ \param carrier The carrier signal input +++ \param VE_code Very Early PRN code replica input +++ \param E_code Early PRN code replica input +++ \param P_code Prompt PRN code replica input +++ \param L_code Late PRN code replica input +++ \param VL_code Very Late PRN code replica input +++ \param VE_out Very Early correlation output +++ \param E_out Early correlation output +++ \param P_out Prompt correlation output +++ \param L_out Late correlation output +++ \param VL_out Very Late correlation output +++ \param num_points The number of complex values in vectors +++ */ +++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points) +++{ +++ *VE_out = 0; +++ *E_out = 0; 
+++ *P_out = 0; +++ *L_out = 0; +++ *VL_out = 0; +++ +++ lv_16sc_t VE_code_value; +++ lv_16sc_t E_code_value; +++ lv_16sc_t P_code_value; +++ lv_16sc_t L_code_value; +++ lv_16sc_t VL_code_value; +++ lv_16sc_t bb_signal_sample; +++ +++ for(int i=0; i < num_points; ++i) +++ { +++ VE_code_value = VE_code[i]; +++ E_code_value = E_code[i]; +++ P_code_value = P_code[i]; +++ L_code_value = L_code[i]; +++ VL_code_value = VL_code[i]; +++ +++ if(lv_creal(VE_code_value) == -128) +++ { +++ VE_code_value = lv_cmake(-127, lv_cimag(VE_code_value)); +++ } +++ if(lv_cimag(VE_code_value) == -128) +++ { +++ VE_code_value = lv_cmake(lv_creal(VE_code_value), -127); +++ } +++ +++ if(lv_creal(E_code_value) == -128) +++ { +++ E_code_value = lv_cmake(-127, lv_cimag(E_code_value)); +++ } +++ if(lv_cimag(E_code_value) == -128) +++ { +++ E_code_value = lv_cmake(lv_creal(E_code_value), -127); +++ } +++ +++ if(lv_creal(P_code_value) == -128) +++ { +++ P_code_value = lv_cmake(-127, lv_cimag(P_code_value)); +++ } +++ if(lv_cimag(P_code_value) == -128) +++ { +++ P_code_value = lv_cmake(lv_creal(P_code_value), -127); +++ } +++ +++ if(lv_creal(L_code_value) == -128) +++ { +++ L_code_value = lv_cmake(-127, lv_cimag(L_code_value)); +++ } +++ if(lv_cimag(L_code_value) == -128) +++ { +++ L_code_value = lv_cmake(lv_creal(L_code_value), -127); +++ } +++ +++ if(lv_creal(VL_code_value) == -128) +++ { +++ VL_code_value = lv_cmake(-127, lv_cimag(VL_code_value)); +++ } +++ if(lv_cimag(VL_code_value) == -128) +++ { +++ VL_code_value = lv_cmake(lv_creal(VL_code_value), -127); +++ } +++ +++ //Perform the carrier wipe-off +++ bb_signal_sample = input[i] * carrier[i]; +++ // Now get very early, early, prompt, late and very late values for each +++ *VE_out += (lv_32fc_t) (bb_signal_sample * VE_code_value); +++ *E_out += (lv_32fc_t) (bb_signal_sample * E_code_value); +++ *P_out += (lv_32fc_t) (bb_signal_sample * P_code_value); +++ *L_out += (lv_32fc_t) (bb_signal_sample * L_code_value); +++ *VL_out += (lv_32fc_t) 
(bb_signal_sample * VL_code_value); +++ } +++} +++#endif /* LV_HAVE_GENERIC */ +++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_u_H */ +++ +++ +++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_a_H +++#define INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_a_H +++ +++#include +++#include +++#include +++#include +++#include +++ +++#ifdef LV_HAVE_SSE4_1 +++#include "smmintrin.h" +++#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h" +++#include "CommonMacros/CommonMacros.h" +++/*! +++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation +++ \param input The input signal input +++ \param carrier The carrier signal input +++ \param VE_code Very Early PRN code replica input +++ \param E_code Early PRN code replica input +++ \param P_code Prompt PRN code replica input +++ \param L_code Late PRN code replica input +++ \param VL_code Very Late PRN code replica input +++ \param VE_out Very Early correlation output +++ \param E_out Early correlation output +++ \param P_out Prompt correlation output +++ \param L_out Late correlation output +++ \param VL_out Very Late correlation output +++ \param num_points The number of complex values in vectors +++ */ +++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_a_sse4_1(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points) +++{ +++ const unsigned int sse_iters = num_points / 8; +++ +++ __m128i x, x_abs, y, y_aux, bb_signal_sample_aux, bb_signal_sample_aux_abs;; +++ __m128i real_output, imag_output; +++ __m128 real_VE_code_acc, imag_VE_code_acc, real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc, real_VL_code_acc, imag_VL_code_acc; 
+++ __m128i input_i_1, input_i_2, output_i32; +++ __m128 real_output_ps, imag_output_ps; +++ __m128i minus128control; +++ +++ __m128i minus128 = _mm_set1_epi8 (-128); +++ __m128i check_sign_sequence = _mm_set_epi8 (255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1); +++ __m128i rearrange_sequence = _mm_set_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1); +++ __m128i mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); +++ +++ const lv_8sc_t* input_ptr = input; +++ const lv_8sc_t* carrier_ptr = carrier; +++ +++ const lv_8sc_t* VE_code_ptr = VE_code; +++ lv_32fc_t* VE_out_ptr = VE_out; +++ const lv_8sc_t* E_code_ptr = E_code; +++ lv_32fc_t* E_out_ptr = E_out; +++ const lv_8sc_t* P_code_ptr = P_code; +++ lv_32fc_t* P_out_ptr = P_out; +++ const lv_8sc_t* L_code_ptr = L_code; +++ lv_32fc_t* L_out_ptr = L_out; +++ const lv_8sc_t* VL_code_ptr = VL_code; +++ lv_32fc_t* VL_out_ptr = VL_out; +++ +++ float VE_out_real = 0; +++ float VE_out_imag = 0; +++ float E_out_real = 0; +++ float E_out_imag = 0; +++ float P_out_real = 0; +++ float P_out_imag = 0; +++ float L_out_real = 0; +++ float L_out_imag = 0; +++ float VL_out_real = 0; +++ float VL_out_imag = 0; +++ +++ real_VE_code_acc = _mm_setzero_ps(); +++ imag_VE_code_acc = _mm_setzero_ps(); +++ real_E_code_acc = _mm_setzero_ps(); +++ imag_E_code_acc = _mm_setzero_ps(); +++ real_P_code_acc = _mm_setzero_ps(); +++ imag_P_code_acc = _mm_setzero_ps(); +++ real_L_code_acc = _mm_setzero_ps(); +++ imag_L_code_acc = _mm_setzero_ps(); +++ real_VL_code_acc = _mm_setzero_ps(); +++ imag_VL_code_acc = _mm_setzero_ps(); +++ +++ if (sse_iters>0) +++ { +++ for(int number = 0;number < sse_iters; number++){ +++ +++ //Perform the carrier wipe-off +++ x = _mm_load_si128((__m128i*)input_ptr); +++ y = _mm_load_si128((__m128i*)carrier_ptr); +++ +++ x_abs = _mm_abs_epi8 (x); +++ +++ CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3(y, x, check_sign_sequence, rearrange_sequence, y_aux, x_abs, 
real_output, imag_output) +++ +++ imag_output = _mm_slli_si128 (imag_output, 1); +++ bb_signal_sample_aux = _mm_blendv_epi8 (imag_output, real_output, mult1); +++ bb_signal_sample_aux_abs = _mm_abs_epi8 (bb_signal_sample_aux); +++ +++ //Get very early values +++ y = _mm_load_si128((__m128i*)VE_code_ptr); +++ +++ CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) +++ +++ real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps); +++ imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps); +++ +++ //Get early values +++ y = _mm_load_si128((__m128i*)E_code_ptr); +++ +++ CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) +++ +++ real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps); +++ imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps); +++ +++ //Get prompt values +++ y = _mm_load_si128((__m128i*)P_code_ptr); +++ +++ CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) +++ +++ real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps); +++ imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps); +++ +++ //Get late values +++ y = _mm_load_si128((__m128i*)L_code_ptr); +++ +++ CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, 
imag_output_ps) +++ +++ real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps); +++ imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps); +++ +++ //Get very late values +++ y = _mm_load_si128((__m128i*)VL_code_ptr); +++ +++ CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) +++ +++ real_VL_code_acc = _mm_add_ps (real_VL_code_acc, real_output_ps); +++ imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps); +++ +++ input_ptr += 8; +++ carrier_ptr += 8; +++ VE_code_ptr += 8; +++ E_code_ptr += 8; +++ P_code_ptr += 8; +++ L_code_ptr += 8; +++ VL_code_ptr += 8; +++ } +++ +++ __VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4]; +++ __VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4]; +++ +++ _mm_store_ps((float*)real_VE_dotProductVector,real_VE_code_acc); // Store the results back into the dot product vector +++ _mm_store_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc); // Store the results back into the dot product vector +++ _mm_store_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector +++ _mm_store_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector +++ _mm_store_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store 
the results back into the dot product vector +++ _mm_store_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector +++ _mm_store_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector +++ _mm_store_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector +++ _mm_store_ps((float*)real_VL_dotProductVector,real_VL_code_acc); // Store the results back into the dot product vector +++ _mm_store_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc); // Store the results back into the dot product vector +++ +++ for (int i = 0; i<4; ++i) +++ { +++ VE_out_real += real_VE_dotProductVector[i]; +++ VE_out_imag += imag_VE_dotProductVector[i]; +++ E_out_real += real_E_dotProductVector[i]; +++ E_out_imag += imag_E_dotProductVector[i]; +++ P_out_real += real_P_dotProductVector[i]; +++ P_out_imag += imag_P_dotProductVector[i]; +++ L_out_real += real_L_dotProductVector[i]; +++ L_out_imag += imag_L_dotProductVector[i]; +++ VL_out_real += real_VL_dotProductVector[i]; +++ VL_out_imag += imag_VL_dotProductVector[i]; +++ } +++ *VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag); +++ *E_out_ptr = lv_cmake(E_out_real, E_out_imag); +++ *P_out_ptr = lv_cmake(P_out_real, P_out_imag); +++ *L_out_ptr = lv_cmake(L_out_real, L_out_imag); +++ *VL_out_ptr = lv_cmake(VL_out_real, VL_out_imag); +++ } +++ +++ if(num_points%8!=0) +++ { +++ lv_16sc_t bb_signal_sample; +++ lv_16sc_t VE_code_value; +++ lv_16sc_t E_code_value; +++ lv_16sc_t P_code_value; +++ lv_16sc_t L_code_value; +++ lv_16sc_t VL_code_value; +++ +++ for(int i=0; i < num_points%8; ++i) +++ { +++ VE_code_value = *VE_code_ptr++; +++ E_code_value = *E_code_ptr++; +++ P_code_value = *P_code_ptr++; +++ L_code_value = *L_code_ptr++; +++ VL_code_value = *VL_code_ptr++; +++ +++ if(lv_creal(VE_code_value) == -128) +++ { +++ VE_code_value = lv_cmake(-127, lv_cimag(VE_code_value)); +++ } +++ 
if(lv_cimag(VE_code_value) == -128) +++ { +++ VE_code_value = lv_cmake(lv_creal(VE_code_value), -127); +++ } +++ +++ if(lv_creal(E_code_value) == -128) +++ { +++ E_code_value = lv_cmake(-127, lv_cimag(E_code_value)); +++ } +++ if(lv_cimag(E_code_value) == -128) +++ { +++ E_code_value = lv_cmake(lv_creal(E_code_value), -127); +++ } +++ +++ if(lv_creal(P_code_value) == -128) +++ { +++ P_code_value = lv_cmake(-127, lv_cimag(P_code_value)); +++ } +++ if(lv_cimag(P_code_value) == -128) +++ { +++ P_code_value = lv_cmake(lv_creal(P_code_value), -127); +++ } +++ +++ if(lv_creal(L_code_value) == -128) +++ { +++ L_code_value = lv_cmake(-127, lv_cimag(L_code_value)); +++ } +++ if(lv_cimag(L_code_value) == -128) +++ { +++ L_code_value = lv_cmake(lv_creal(L_code_value), -127); +++ } +++ +++ //Perform the carrier wipe-off +++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); +++ // Now get very early, early, prompt, late and very late values for each +++ *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * VE_code_value); +++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * E_code_value); +++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * P_code_value); +++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * L_code_value); +++ *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * VL_code_value); +++ } +++ } +++} +++#endif /* LV_HAVE_SSE4_1 */ +++ +++#ifdef LV_HAVE_GENERIC +++#include +++#include +++ +++/*! 
+++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation +++ \param input The input signal input +++ \param carrier The carrier signal input +++ \param VE_code Very Early PRN code replica input +++ \param E_code Early PRN code replica input +++ \param P_code Prompt PRN code replica input +++ \param L_code Late PRN code replica input +++ \param VL_code Very Late PRN code replica input +++ \param VE_out Very Early correlation output +++ \param E_out Early correlation output +++ \param P_out Prompt correlation output +++ \param L_out Late correlation output +++ \param VL_out Very Late correlation output +++ \param num_points The number of complex values in vectors +++ */ +++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_a_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points) +++{ +++ *VE_out = 0; +++ *E_out = 0; +++ *P_out = 0; +++ *L_out = 0; +++ *VL_out = 0; +++ +++ lv_16sc_t VE_code_value; +++ lv_16sc_t E_code_value; +++ lv_16sc_t P_code_value; +++ lv_16sc_t L_code_value; +++ lv_16sc_t VL_code_value; +++ lv_16sc_t bb_signal_sample; +++ +++ for(int i=0; i < num_points; ++i) +++ { +++ VE_code_value = VE_code[i]; +++ E_code_value = E_code[i]; +++ P_code_value = P_code[i]; +++ L_code_value = L_code[i]; +++ VL_code_value = VL_code[i]; +++ +++ if(lv_creal(VE_code_value) == -128) +++ { +++ VE_code_value = lv_cmake(-127, lv_cimag(VE_code_value)); +++ } +++ if(lv_cimag(VE_code_value) == -128) +++ { +++ VE_code_value = lv_cmake(lv_creal(VE_code_value), -127); +++ } +++ +++ if(lv_creal(E_code_value) == -128) +++ { +++ E_code_value = lv_cmake(-127, lv_cimag(E_code_value)); +++ } +++ if(lv_cimag(E_code_value) == -128) +++ { +++ E_code_value = lv_cmake(lv_creal(E_code_value), -127); +++ } 
+++ +++ if(lv_creal(P_code_value) == -128) +++ { +++ P_code_value = lv_cmake(-127, lv_cimag(P_code_value)); +++ } +++ if(lv_cimag(P_code_value) == -128) +++ { +++ P_code_value = lv_cmake(lv_creal(P_code_value), -127); +++ } +++ +++ if(lv_creal(L_code_value) == -128) +++ { +++ L_code_value = lv_cmake(-127, lv_cimag(L_code_value)); +++ } +++ if(lv_cimag(L_code_value) == -128) +++ { +++ L_code_value = lv_cmake(lv_creal(L_code_value), -127); +++ } +++ +++ if(lv_creal(VL_code_value) == -128) +++ { +++ VL_code_value = lv_cmake(-127, lv_cimag(VL_code_value)); +++ } +++ if(lv_cimag(VL_code_value) == -128) +++ { +++ VL_code_value = lv_cmake(lv_creal(VL_code_value), -127); +++ } +++ +++ //Perform the carrier wipe-off +++ bb_signal_sample = input[i] * carrier[i]; +++ // Now get very early, early, prompt, late and very late values for each +++ *VE_out += (lv_32fc_t) (bb_signal_sample * VE_code_value); +++ *E_out += (lv_32fc_t) (bb_signal_sample * E_code_value); +++ *P_out += (lv_32fc_t) (bb_signal_sample * P_code_value); +++ *L_out += (lv_32fc_t) (bb_signal_sample * L_code_value); +++ *VL_out += (lv_32fc_t) (bb_signal_sample * VL_code_value); +++ } +++} +++#endif /* LV_HAVE_GENERIC */ +++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_a_H */ ++\ No newline at end of file ++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5.h ++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5.h 1970-01-01 01:00:00.000000000 +0100 +++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5.h 2014-10-15 01:55:08.000000000 +0200 ++@@ -0,0 +1,554 @@ +++/*! 
+++ * \file volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5.h +++ * \brief Volk protokernel: performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation with 16-bit vectors, and accumulates the results into float32. This protokernel is called "unsafe" because it does NOT check whether the inputs contain a -128 value. If you introduce a -128 value the protokernel will NOT operate properly (the generic implementation will give different results than the volk implementation). In order to avoid overflow, "input" and "carrier" must be values between -7 and 7 and "XX_code inputs" must be values between -127 and 127. +++ * \authors
    +++ *
  • Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com +++ *
+++ * +++ * Volk protokernel that performs the carrier wipe-off mixing and the +++ * Very Early, Early, Prompt, Late and Very Late correlation with 16-bit vectors (8 bits for the +++ * real part and 8 bits for the imaginary part), and accumulates the result +++ * in 32-bit single-precision values, returning float32 values: +++ * - The carrier wipe-off is done by multiplying the input signal by the +++ * carrier (multiplication of 16-bit vectors). It returns the input +++ * signal in base band (BB) +++ * - Very Early values are calculated by multiplying the input signal in BB by the +++ * very early code (multiplication of 16-bit vectors), accumulating the results into float32 values +++ * - Early values are calculated by multiplying the input signal in BB by the +++ * early code (multiplication of 16-bit vectors), accumulating the results into float32 values +++ * - Prompt values are calculated by multiplying the input signal in BB by the +++ * prompt code (multiplication of 16-bit vectors), accumulating the results into float32 values +++ * - Late values are calculated by multiplying the input signal in BB by the +++ * late code (multiplication of 16-bit vectors), accumulating the results into float32 values +++ * - Very Late values are calculated by multiplying the input signal in BB by the +++ * very late code (multiplication of 16-bit vectors), accumulating the results into float32 values +++ * +++ * ------------------------------------------------------------------------- +++ * Bits analysis +++ * +++ * input = 8 bits +++ * carrier = 8 bits +++ * XX_code = 8 bits +++ * XX_out16 = 16 bits +++ * bb_signal_sample = 8 bits +++ * +++ * bb_signal_sample = input*carrier -> 17 bits, limited to 8 bits; therefore input and carrier must be values between -7 and 7 to avoid overflow (3 bits) +++ * +++ * XX_out16 = XX_code*bb_signal_sample -> 17 bits, limited to 16 bits; therefore XX_code must be values between -127 and 127 to avoid overflow (7 bits) +++ * 
------------------------------------------------------------------------- +++ * +++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) +++ * +++ * GNSS-SDR is a software defined Global Navigation +++ * Satellite Systems receiver +++ * +++ * This file is part of GNSS-SDR. +++ * +++ * GNSS-SDR is free software: you can redistribute it and/or modify +++ * it under the terms of the GNU General Public License as published by +++ * the Free Software Foundation, either version 3 of the License, or +++ * (at your option) any later version. +++ * +++ * GNSS-SDR is distributed in the hope that it will be useful, +++ * but WITHOUT ANY WARRANTY; without even the implied warranty of +++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +++ * GNU General Public License for more details. +++ * +++ * You should have received a copy of the GNU General Public License +++ * along with GNSS-SDR. If not, see https://www.gnu.org/licenses/. +++ * +++ * ------------------------------------------------------------------------- +++ */ +++ +++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_u_H +++#define INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_u_H +++ +++#include +++#include +++#include +++#include +++#include +++ +++#ifdef LV_HAVE_SSE4_1 +++#include "smmintrin.h" +++#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h" +++#include "CommonMacros/CommonMacros.h" +++/*! 
+++ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation (SSE4.1, unaligned loads)
+++
+++ Eight complex samples are processed per SIMD iteration; the remaining num_points % 8 samples
+++ are processed by a scalar tail loop. The five outputs are always overwritten, including
+++ when num_points is smaller than 8.
+++ \param input The input signal input
+++ \param carrier The carrier signal input
+++ \param VE_code Very Early PRN code replica input
+++ \param E_code Early PRN code replica input
+++ \param P_code Prompt PRN code replica input
+++ \param L_code Late PRN code replica input
+++ \param VL_code Very Late PRN code replica input
+++ \param VE_out Very Early correlation output
+++ \param E_out Early correlation output
+++ \param P_out Prompt correlation output
+++ \param L_out Late correlation output
+++ \param VL_out Very Late correlation output
+++ \param num_points The number of complex values in vectors
+++ */
+++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_u_sse4_1(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
+++{
+++    const unsigned int sse_iters = num_points / 8;
+++    const unsigned int tail_points = num_points % 8;
+++    unsigned int number;
+++    unsigned int i;
+++
+++    __m128i x, x_abs, y, y_aux, bb_signal_sample_aux, bb_signal_sample_aux_abs;
+++    __m128i real_output, imag_output;
+++    __m128 real_VE_code_acc, imag_VE_code_acc, real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc, real_VL_code_acc, imag_VL_code_acc;
+++    __m128i input_i_1, input_i_2, output_i32;
+++    __m128 real_output_ps, imag_output_ps;
+++
+++    // -1 is the all-ones byte (0xFF); writing it as -1 keeps the constant inside the range of char.
+++    // check_sign_sequence and rearrange_sequence are consumed by the CommonMacros helpers below.
+++    __m128i check_sign_sequence = _mm_set_epi8(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
+++    __m128i rearrange_sequence = _mm_set_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
+++    // Blend mask: even bytes are taken from real_output, odd bytes from imag_output.
+++    __m128i mult1 = _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1);
+++
+++    const lv_8sc_t* input_ptr = input;
+++    const lv_8sc_t* carrier_ptr = carrier;
+++
+++    const lv_8sc_t* VE_code_ptr = VE_code;
+++    lv_32fc_t* VE_out_ptr = VE_out;
+++    const lv_8sc_t* E_code_ptr = E_code;
+++    lv_32fc_t* E_out_ptr = E_out;
+++    const lv_8sc_t* P_code_ptr = P_code;
+++    lv_32fc_t* P_out_ptr = P_out;
+++    const lv_8sc_t* L_code_ptr = L_code;
+++    lv_32fc_t* L_out_ptr = L_out;
+++    const lv_8sc_t* VL_code_ptr = VL_code;
+++    lv_32fc_t* VL_out_ptr = VL_out;
+++
+++    float VE_out_real = 0;
+++    float VE_out_imag = 0;
+++    float E_out_real = 0;
+++    float E_out_imag = 0;
+++    float P_out_real = 0;
+++    float P_out_imag = 0;
+++    float L_out_real = 0;
+++    float L_out_imag = 0;
+++    float VL_out_real = 0;
+++    float VL_out_imag = 0;
+++
+++    real_VE_code_acc = _mm_setzero_ps();
+++    imag_VE_code_acc = _mm_setzero_ps();
+++    real_E_code_acc = _mm_setzero_ps();
+++    imag_E_code_acc = _mm_setzero_ps();
+++    real_P_code_acc = _mm_setzero_ps();
+++    imag_P_code_acc = _mm_setzero_ps();
+++    real_L_code_acc = _mm_setzero_ps();
+++    imag_L_code_acc = _mm_setzero_ps();
+++    real_VL_code_acc = _mm_setzero_ps();
+++    imag_VL_code_acc = _mm_setzero_ps();
+++
+++    for (number = 0; number < sse_iters; number++)
+++    {
+++        //Perform the carrier wipe-off
+++        x = _mm_lddqu_si128((const __m128i*)input_ptr);
+++        y = _mm_lddqu_si128((const __m128i*)carrier_ptr);
+++
+++        x_abs = _mm_abs_epi8 (x);
+++
+++        CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3(y, x, check_sign_sequence, rearrange_sequence, y_aux, x_abs, real_output, imag_output)
+++
+++        // Re-interleave real and imaginary bytes into the base band sample vector
+++        imag_output = _mm_slli_si128 (imag_output, 1);
+++        bb_signal_sample_aux = _mm_blendv_epi8 (imag_output, real_output, mult1);
+++        bb_signal_sample_aux_abs = _mm_abs_epi8 (bb_signal_sample_aux);
+++
+++        //Get very early values
+++        y = _mm_lddqu_si128((const __m128i*)VE_code_ptr);
+++
+++        CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+++
+++        real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps);
+++        imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps);
+++
+++        //Get early values
+++        y = _mm_lddqu_si128((const __m128i*)E_code_ptr);
+++
+++        CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+++
+++        real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps);
+++        imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps);
+++
+++        //Get prompt values
+++        y = _mm_lddqu_si128((const __m128i*)P_code_ptr);
+++
+++        CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+++
+++        real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps);
+++        imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps);
+++
+++        //Get late values
+++        y = _mm_lddqu_si128((const __m128i*)L_code_ptr);
+++
+++        CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+++
+++        real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps);
+++        imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps);
+++
+++        //Get very late values
+++        y = _mm_lddqu_si128((const __m128i*)VL_code_ptr);
+++
+++        CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+++
+++        real_VL_code_acc = _mm_add_ps (real_VL_code_acc, real_output_ps);
+++        imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps);
+++
+++        input_ptr += 8;
+++        carrier_ptr += 8;
+++        VE_code_ptr += 8;
+++        E_code_ptr += 8;
+++        P_code_ptr += 8;
+++        L_code_ptr += 8;
+++        VL_code_ptr += 8;
+++    }
+++
+++    // The reduction and the output stores run unconditionally: with zero SIMD iterations the
+++    // accumulators are still zero, so the outputs start from 0 before the tail loop adds to them.
+++    __VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4];
+++    __VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4];
+++    __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
+++    __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
+++    __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
+++    __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
+++    __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
+++    __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
+++    __VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4];
+++    __VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4];
+++
+++    // Store the accumulators back into the dot product vectors
+++    _mm_storeu_ps((float*)real_VE_dotProductVector,real_VE_code_acc);
+++    _mm_storeu_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc);
+++    _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc);
+++    _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc);
+++    _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc);
+++    _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc);
+++    _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc);
+++    _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc);
+++    _mm_storeu_ps((float*)real_VL_dotProductVector,real_VL_code_acc);
+++    _mm_storeu_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc);
+++
+++    // Horizontal sum of the four partial sums of every accumulator
+++    for (i = 0; i < 4; ++i)
+++    {
+++        VE_out_real += real_VE_dotProductVector[i];
+++        VE_out_imag += imag_VE_dotProductVector[i];
+++        E_out_real += real_E_dotProductVector[i];
+++        E_out_imag += imag_E_dotProductVector[i];
+++        P_out_real += real_P_dotProductVector[i];
+++        P_out_imag += imag_P_dotProductVector[i];
+++        L_out_real += real_L_dotProductVector[i];
+++        L_out_imag += imag_L_dotProductVector[i];
+++        VL_out_real += real_VL_dotProductVector[i];
+++        VL_out_imag += imag_VL_dotProductVector[i];
+++    }
+++    *VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag);
+++    *E_out_ptr = lv_cmake(E_out_real, E_out_imag);
+++    *P_out_ptr = lv_cmake(P_out_real, P_out_imag);
+++    *L_out_ptr = lv_cmake(L_out_real, L_out_imag);
+++    *VL_out_ptr = lv_cmake(VL_out_real, VL_out_imag);
+++
+++    // Scalar tail: the num_points % 8 samples that do not fill a whole SIMD iteration
+++    lv_16sc_t bb_signal_sample;
+++    for (i = 0; i < tail_points; ++i)
+++    {
+++        //Perform the carrier wipe-off
+++        bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
+++        // Now get very early, early, prompt, late and very late values for each
+++        *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VE_code_ptr++));
+++        *E_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*E_code_ptr++));
+++        *P_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*P_code_ptr++));
+++        *L_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*L_code_ptr++));
+++        *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VL_code_ptr++));
+++    }
+++}
+++#endif /* LV_HAVE_SSE4_1 */
+++
+++#ifdef LV_HAVE_GENERIC
+++#include
+++#include
+++
+++/*!
+++ \brief Generic (plain C) carrier wipe-off mixing followed by the Very Early, Early, Prompt, Late and Very Late correlation
+++ \param input The input signal input
+++ \param carrier The carrier signal input
+++ \param VE_code Very Early PRN code replica input
+++ \param E_code Early PRN code replica input
+++ \param P_code Prompt PRN code replica input
+++ \param L_code Late PRN code replica input
+++ \param VL_code Very Late PRN code replica input
+++ \param VE_out Very Early correlation output
+++ \param E_out Early correlation output
+++ \param P_out Prompt correlation output
+++ \param L_out Late correlation output
+++ \param VL_out Very Late correlation output
+++ \param num_points The number of complex values in vectors
+++ */
+++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
+++{
+++    unsigned int n;
+++    lv_16sc_t mixed_sample;
+++
+++    // Every correlator starts from zero; the loop accumulates in place.
+++    *VE_out = 0;
+++    *E_out = 0;
+++    *P_out = 0;
+++    *L_out = 0;
+++    *VL_out = 0;
+++
+++    for (n = 0; n < num_points; n++)
+++    {
+++        // Carrier wipe-off: input sample times carrier sample
+++        mixed_sample = input[n] * carrier[n];
+++
+++        // Correlate the mixed sample against each of the five code replicas
+++        *VE_out += (lv_32fc_t) (mixed_sample * VE_code[n]);
+++        *E_out += (lv_32fc_t) (mixed_sample * E_code[n]);
+++        *P_out += (lv_32fc_t) (mixed_sample * P_code[n]);
+++        *L_out += (lv_32fc_t) (mixed_sample * L_code[n]);
+++        *VL_out += (lv_32fc_t) (mixed_sample * VL_code[n]);
+++    }
+++}
+++#endif /* LV_HAVE_GENERIC */
+++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_u_H */
+++
+++
+++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_a_H
+++#define INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_a_H
+++
+++#include
+++#include
+++#include
+++#include
+++#include
+++
+++#ifdef LV_HAVE_SSE4_1
+++#include "smmintrin.h"
+++#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
+++#include "CommonMacros/CommonMacros.h"
+++/*!
+++ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation (SSE4.1, aligned loads)
+++
+++ Eight complex samples are processed per SIMD iteration with aligned 128-bit loads, so the
+++ input, carrier and code vectors must be 16-byte aligned. The remaining num_points % 8 samples
+++ are processed by a scalar tail loop. The five outputs are always overwritten, including
+++ when num_points is smaller than 8.
+++ \param input The input signal input
+++ \param carrier The carrier signal input
+++ \param VE_code Very Early PRN code replica input
+++ \param E_code Early PRN code replica input
+++ \param P_code Prompt PRN code replica input
+++ \param L_code Late PRN code replica input
+++ \param VL_code Very Late PRN code replica input
+++ \param VE_out Very Early correlation output
+++ \param E_out Early correlation output
+++ \param P_out Prompt correlation output
+++ \param L_out Late correlation output
+++ \param VL_out Very Late correlation output
+++ \param num_points The number of complex values in vectors
+++ */
+++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_a_sse4_1(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
+++{
+++    const unsigned int sse_iters = num_points / 8;
+++    const unsigned int tail_points = num_points % 8;
+++    unsigned int number;
+++    unsigned int i;
+++
+++    __m128i x, x_abs, y, y_aux, bb_signal_sample_aux, bb_signal_sample_aux_abs;
+++    __m128i real_output, imag_output;
+++    __m128 real_VE_code_acc, imag_VE_code_acc, real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc, real_VL_code_acc, imag_VL_code_acc;
+++    __m128i input_i_1, input_i_2, output_i32;
+++    __m128 real_output_ps, imag_output_ps;
+++
+++    // -1 is the all-ones byte (0xFF); writing it as -1 keeps the constant inside the range of char.
+++    // check_sign_sequence and rearrange_sequence are consumed by the CommonMacros helpers below.
+++    __m128i check_sign_sequence = _mm_set_epi8(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
+++    __m128i rearrange_sequence = _mm_set_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
+++    // Blend mask: even bytes are taken from real_output, odd bytes from imag_output.
+++    __m128i mult1 = _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1);
+++
+++    const lv_8sc_t* input_ptr = input;
+++    const lv_8sc_t* carrier_ptr = carrier;
+++
+++    const lv_8sc_t* VE_code_ptr = VE_code;
+++    lv_32fc_t* VE_out_ptr = VE_out;
+++    const lv_8sc_t* E_code_ptr = E_code;
+++    lv_32fc_t* E_out_ptr = E_out;
+++    const lv_8sc_t* P_code_ptr = P_code;
+++    lv_32fc_t* P_out_ptr = P_out;
+++    const lv_8sc_t* L_code_ptr = L_code;
+++    lv_32fc_t* L_out_ptr = L_out;
+++    const lv_8sc_t* VL_code_ptr = VL_code;
+++    lv_32fc_t* VL_out_ptr = VL_out;
+++
+++    float VE_out_real = 0;
+++    float VE_out_imag = 0;
+++    float E_out_real = 0;
+++    float E_out_imag = 0;
+++    float P_out_real = 0;
+++    float P_out_imag = 0;
+++    float L_out_real = 0;
+++    float L_out_imag = 0;
+++    float VL_out_real = 0;
+++    float VL_out_imag = 0;
+++
+++    real_VE_code_acc = _mm_setzero_ps();
+++    imag_VE_code_acc = _mm_setzero_ps();
+++    real_E_code_acc = _mm_setzero_ps();
+++    imag_E_code_acc = _mm_setzero_ps();
+++    real_P_code_acc = _mm_setzero_ps();
+++    imag_P_code_acc = _mm_setzero_ps();
+++    real_L_code_acc = _mm_setzero_ps();
+++    imag_L_code_acc = _mm_setzero_ps();
+++    real_VL_code_acc = _mm_setzero_ps();
+++    imag_VL_code_acc = _mm_setzero_ps();
+++
+++    for (number = 0; number < sse_iters; number++)
+++    {
+++        //Perform the carrier wipe-off
+++        x = _mm_load_si128((const __m128i*)input_ptr);
+++        y = _mm_load_si128((const __m128i*)carrier_ptr);
+++
+++        x_abs = _mm_abs_epi8 (x);
+++
+++        CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3(y, x, check_sign_sequence, rearrange_sequence, y_aux, x_abs, real_output, imag_output)
+++
+++        // Re-interleave real and imaginary bytes into the base band sample vector
+++        imag_output = _mm_slli_si128 (imag_output, 1);
+++        bb_signal_sample_aux = _mm_blendv_epi8 (imag_output, real_output, mult1);
+++        bb_signal_sample_aux_abs = _mm_abs_epi8 (bb_signal_sample_aux);
+++
+++        //Get very early values
+++        y = _mm_load_si128((const __m128i*)VE_code_ptr);
+++
+++        CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+++
+++        real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps);
+++        imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps);
+++
+++        //Get early values
+++        y = _mm_load_si128((const __m128i*)E_code_ptr);
+++
+++        CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+++
+++        real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps);
+++        imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps);
+++
+++        //Get prompt values
+++        y = _mm_load_si128((const __m128i*)P_code_ptr);
+++
+++        CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+++
+++        real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps);
+++        imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps);
+++
+++        //Get late values
+++        y = _mm_load_si128((const __m128i*)L_code_ptr);
+++
+++        CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+++
+++        real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps);
+++        imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps);
+++
+++        //Get very late values
+++        y = _mm_load_si128((const __m128i*)VL_code_ptr);
+++
+++        CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
+++
+++        real_VL_code_acc = _mm_add_ps (real_VL_code_acc, real_output_ps);
+++        imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps);
+++
+++        input_ptr += 8;
+++        carrier_ptr += 8;
+++        VE_code_ptr += 8;
+++        E_code_ptr += 8;
+++        P_code_ptr += 8;
+++        L_code_ptr += 8;
+++        VL_code_ptr += 8;
+++    }
+++
+++    // The reduction and the output stores run unconditionally: with zero SIMD iterations the
+++    // accumulators are still zero, so the outputs start from 0 before the tail loop adds to them.
+++    __VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4];
+++    __VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4];
+++    __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
+++    __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
+++    __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
+++    __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
+++    __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
+++    __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
+++    __VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4];
+++    __VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4];
+++
+++    // Store the accumulators back into the dot product vectors
+++    _mm_store_ps((float*)real_VE_dotProductVector,real_VE_code_acc);
+++    _mm_store_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc);
+++    _mm_store_ps((float*)real_E_dotProductVector,real_E_code_acc);
+++    _mm_store_ps((float*)imag_E_dotProductVector,imag_E_code_acc);
+++    _mm_store_ps((float*)real_P_dotProductVector,real_P_code_acc);
+++    _mm_store_ps((float*)imag_P_dotProductVector,imag_P_code_acc);
+++    _mm_store_ps((float*)real_L_dotProductVector,real_L_code_acc);
+++    _mm_store_ps((float*)imag_L_dotProductVector,imag_L_code_acc);
+++    _mm_store_ps((float*)real_VL_dotProductVector,real_VL_code_acc);
+++    _mm_store_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc);
+++
+++    // Horizontal sum of the four partial sums of every accumulator
+++    for (i = 0; i < 4; ++i)
+++    {
+++        VE_out_real += real_VE_dotProductVector[i];
+++        VE_out_imag += imag_VE_dotProductVector[i];
+++        E_out_real += real_E_dotProductVector[i];
+++        E_out_imag += imag_E_dotProductVector[i];
+++        P_out_real += real_P_dotProductVector[i];
+++        P_out_imag += imag_P_dotProductVector[i];
+++        L_out_real += real_L_dotProductVector[i];
+++        L_out_imag += imag_L_dotProductVector[i];
+++        VL_out_real += real_VL_dotProductVector[i];
+++        VL_out_imag += imag_VL_dotProductVector[i];
+++    }
+++    *VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag);
+++    *E_out_ptr = lv_cmake(E_out_real, E_out_imag);
+++    *P_out_ptr = lv_cmake(P_out_real, P_out_imag);
+++    *L_out_ptr = lv_cmake(L_out_real, L_out_imag);
+++    *VL_out_ptr = lv_cmake(VL_out_real, VL_out_imag);
+++
+++    // Scalar tail: the num_points % 8 samples that do not fill a whole SIMD iteration
+++    lv_16sc_t bb_signal_sample;
+++    for (i = 0; i < tail_points; ++i)
+++    {
+++        //Perform the carrier wipe-off
+++        bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
+++        // Now get very early, early, prompt, late and very late values for each
+++        *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VE_code_ptr++));
+++        *E_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*E_code_ptr++));
+++        *P_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*P_code_ptr++));
+++        *L_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*L_code_ptr++));
+++        *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VL_code_ptr++));
+++    }
+++}
+++#endif /* LV_HAVE_SSE4_1 */
+++
+++#ifdef LV_HAVE_GENERIC
+++#include
+++#include
+++
+++/*!
+++ \brief Generic (plain C) carrier wipe-off mixing followed by the Very Early, Early, Prompt, Late and Very Late correlation
+++ \param input The input signal input
+++ \param carrier The carrier signal input
+++ \param VE_code Very Early PRN code replica input
+++ \param E_code Early PRN code replica input
+++ \param P_code Prompt PRN code replica input
+++ \param L_code Late PRN code replica input
+++ \param VL_code Very Late PRN code replica input
+++ \param VE_out Very Early correlation output
+++ \param E_out Early correlation output
+++ \param P_out Prompt correlation output
+++ \param L_out Late correlation output
+++ \param VL_out Very Late correlation output
+++ \param num_points The number of complex values in vectors
+++ */
+++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_a_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
+++{
+++    unsigned int n;
+++    lv_16sc_t mixed_sample;
+++
+++    // Every correlator starts from zero; the loop accumulates in place.
+++    *VE_out = 0;
+++    *E_out = 0;
+++    *P_out = 0;
+++    *L_out = 0;
+++    *VL_out = 0;
+++
+++    for (n = 0; n < num_points; n++)
+++    {
+++        // Carrier wipe-off: input sample times carrier sample
+++        mixed_sample = input[n] * carrier[n];
+++
+++        // Correlate the mixed sample against each of the five code replicas
+++        *VE_out += (lv_32fc_t) (mixed_sample * VE_code[n]);
+++        *E_out += (lv_32fc_t) (mixed_sample * E_code[n]);
+++        *P_out += (lv_32fc_t) (mixed_sample * P_code[n]);
+++        *L_out += (lv_32fc_t) (mixed_sample * L_code[n]);
+++        *VL_out += (lv_32fc_t) (mixed_sample * VL_code[n]);
+++    }
+++}
+++#endif /* LV_HAVE_GENERIC */
+++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_a_H */
++\ No newline at end of file
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8u_x2_multiply_8u.h
/Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8u_x2_multiply_8u.h ++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8u_x2_multiply_8u.h 1970-01-01 01:00:00.000000000 +0100 +++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8u_x2_multiply_8u.h 2014-10-15 01:55:08.000000000 +0200 ++@@ -0,0 +1,210 @@ +++/*! +++ * \file volk_gnsssdr_8u_x2_multiply_8u.h +++ * \brief Volk protokernel: multiplies unsigned char values +++ * \authors
    +++ *
  • Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com +++ *
+++ * +++ * Volk protokernel that multiplies unsigned char values (8 bits data) +++ * +++ * ------------------------------------------------------------------------- +++ * +++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) +++ * +++ * GNSS-SDR is a software defined Global Navigation +++ * Satellite Systems receiver +++ * +++ * This file is part of GNSS-SDR. +++ * +++ * GNSS-SDR is free software: you can redistribute it and/or modify +++ * it under the terms of the GNU General Public License as published by +++ * the Free Software Foundation, either version 3 of the License, or +++ * (at your option) any later version. +++ * +++ * GNSS-SDR is distributed in the hope that it will be useful, +++ * but WITHOUT ANY WARRANTY; without even the implied warranty of +++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +++ * GNU General Public License for more details. +++ * +++ * You should have received a copy of the GNU General Public License +++ * along with GNSS-SDR. If not, see https://www.gnu.org/licenses/. +++ * +++ * ------------------------------------------------------------------------- +++ */ +++ +++#ifndef INCLUDED_volk_gnsssdr_8u_x2_multiply_8u_u_H +++#define INCLUDED_volk_gnsssdr_8u_x2_multiply_8u_u_H +++ +++#include +++#include +++ +++#ifdef LV_HAVE_SSE3 +++#include +++#include +++/*! 
+++ \brief Multiplies two unsigned char vectors element by element and stores the low 8 bits of every product in a third unsigned char vector (SSE3, unaligned loads and stores)
+++ \param cChar The unsigned char vector where the results will be stored
+++ \param aChar One of the unsigned char vectors to be multiplied
+++ \param bChar One of the unsigned char vectors to be multiplied
+++ \param num_points The number of unsigned char values in aChar and bChar to be multiplied together and stored into cChar
+++ */
+++static inline void volk_gnsssdr_8u_x2_multiply_8u_u_sse3(unsigned char* cChar, const unsigned char* aChar, const unsigned char* bChar, unsigned int num_points){
+++
+++    const unsigned int sse_iters = num_points / 16;
+++    const unsigned int tail_points = num_points % 16;
+++    unsigned int number;
+++    unsigned int i;
+++
+++    // 0x00FF in every 16-bit lane keeps the low byte of each lane.
+++    // The mask is loop-invariant, so it is built once instead of on every iteration.
+++    const __m128i mult1 = _mm_set1_epi16(0x00FF);
+++
+++    __m128i x, y, x1, x2, y1, y2, x1_mult_y1, x2_mult_y2, tmp, tmp1, tmp2, totalc;
+++    unsigned char* c = cChar;
+++    const unsigned char* a = aChar;
+++    const unsigned char* b = bChar;
+++
+++    for (number = 0; number < sse_iters; number++){
+++        x = _mm_lddqu_si128((const __m128i*)a);
+++        y = _mm_lddqu_si128((const __m128i*)b);
+++
+++        // Split each operand into odd-indexed bytes (x1, y1) and even-indexed bytes (x2, y2),
+++        // each one widened into the low byte of a 16-bit lane
+++        x1 = _mm_srli_si128 (x, 1);
+++        x1 = _mm_and_si128 (x1, mult1);
+++        x2 = _mm_and_si128 (x, mult1);
+++
+++        y1 = _mm_srli_si128 (y, 1);
+++        y1 = _mm_and_si128 (y1, mult1);
+++        y2 = _mm_and_si128 (y, mult1);
+++
+++        x1_mult_y1 = _mm_mullo_epi16 (x1, y1);
+++        x2_mult_y2 = _mm_mullo_epi16 (x2, y2);
+++
+++        // Keep the low byte of every product and interleave odd and even results again
+++        tmp = _mm_and_si128 (x1_mult_y1, mult1);
+++        tmp1 = _mm_slli_si128 (tmp, 1);
+++        tmp2 = _mm_and_si128 (x2_mult_y2, mult1);
+++        totalc = _mm_or_si128 (tmp1, tmp2);
+++
+++        _mm_storeu_si128((__m128i*)c, totalc);
+++
+++        a += 16;
+++        b += 16;
+++        c += 16;
+++    }
+++
+++    // Scalar tail: the num_points % 16 values that do not fill a whole SIMD iteration
+++    for (i = 0; i < tail_points; ++i)
+++    {
+++        *c++ = (*a++) * (*b++);
+++    }
+++}
+++#endif /* LV_HAVE_SSE3 */
+++
+++#ifdef LV_HAVE_GENERIC
+++/*!
+++ \brief Generic (plain C) element-wise product of two unsigned char vectors; every product is truncated to unsigned char when stored
+++ \param cChar The unsigned char vector where the results will be stored
+++ \param aChar One of the unsigned char vectors to be multiplied
+++ \param bChar One of the unsigned char vectors to be multiplied
+++ \param num_points The number of unsigned char values in aChar and bChar to be multiplied together and stored into cChar
+++ */
+++static inline void volk_gnsssdr_8u_x2_multiply_8u_generic(unsigned char* cChar, const unsigned char* aChar, const unsigned char* bChar, unsigned int num_points){
+++    unsigned int idx;
+++
+++    for (idx = 0; idx < num_points; ++idx){
+++        // The product is computed as int and narrowed to unsigned char by the store
+++        cChar[idx] = aChar[idx] * bChar[idx];
+++    }
+++}
+++#endif /* LV_HAVE_GENERIC */
+++
+++#endif /* INCLUDED_volk_gnsssdr_8u_x2_multiply_8u_u_H */
+++
+++
+++#ifndef INCLUDED_volk_gnsssdr_8u_x2_multiply_8u_a_H
+++#define INCLUDED_volk_gnsssdr_8u_x2_multiply_8u_a_H
+++
+++#include
+++#include
+++
+++#ifdef LV_HAVE_SSE3
+++#include
+++#include
+++/*!
+++ \brief Multiplies two unsigned char vectors element by element and stores the low 8 bits of every product in a third unsigned char vector (SSE3, aligned loads and stores)
+++
+++ Aligned 128-bit loads and stores are used, so aChar, bChar and cChar must be 16-byte aligned.
+++ \param cChar The unsigned char vector where the results will be stored
+++ \param aChar One of the unsigned char vectors to be multiplied
+++ \param bChar One of the unsigned char vectors to be multiplied
+++ \param num_points The number of unsigned char values in aChar and bChar to be multiplied together and stored into cChar
+++ */
+++static inline void volk_gnsssdr_8u_x2_multiply_8u_a_sse3(unsigned char* cChar, const unsigned char* aChar, const unsigned char* bChar, unsigned int num_points){
+++
+++    const unsigned int sse_iters = num_points / 16;
+++    const unsigned int tail_points = num_points % 16;
+++    unsigned int number;
+++    unsigned int i;
+++
+++    // 0x00FF in every 16-bit lane keeps the low byte of each lane.
+++    // The mask is loop-invariant, so it is built once instead of on every iteration.
+++    const __m128i mult1 = _mm_set1_epi16(0x00FF);
+++
+++    __m128i x, y, x1, x2, y1, y2, x1_mult_y1, x2_mult_y2, tmp, tmp1, tmp2, totalc;
+++    unsigned char* c = cChar;
+++    const unsigned char* a = aChar;
+++    const unsigned char* b = bChar;
+++
+++    for (number = 0; number < sse_iters; number++){
+++        x = _mm_load_si128((const __m128i*)a);
+++        y = _mm_load_si128((const __m128i*)b);
+++
+++        // Split each operand into odd-indexed bytes (x1, y1) and even-indexed bytes (x2, y2),
+++        // each one widened into the low byte of a 16-bit lane
+++        x1 = _mm_srli_si128 (x, 1);
+++        x1 = _mm_and_si128 (x1, mult1);
+++        x2 = _mm_and_si128 (x, mult1);
+++
+++        y1 = _mm_srli_si128 (y, 1);
+++        y1 = _mm_and_si128 (y1, mult1);
+++        y2 = _mm_and_si128 (y, mult1);
+++
+++        x1_mult_y1 = _mm_mullo_epi16 (x1, y1);
+++        x2_mult_y2 = _mm_mullo_epi16 (x2, y2);
+++
+++        // Keep the low byte of every product and interleave odd and even results again
+++        tmp = _mm_and_si128 (x1_mult_y1, mult1);
+++        tmp1 = _mm_slli_si128 (tmp, 1);
+++        tmp2 = _mm_and_si128 (x2_mult_y2, mult1);
+++        totalc = _mm_or_si128 (tmp1, tmp2);
+++
+++        _mm_store_si128((__m128i*)c, totalc);
+++
+++        a += 16;
+++        b += 16;
+++        c += 16;
+++    }
+++
+++    // Scalar tail: the num_points % 16 values that do not fill a whole SIMD iteration
+++    for (i = 0; i < tail_points; ++i)
+++    {
+++        *c++ = (*a++) * (*b++);
+++    }
+++}
+++#endif /* LV_HAVE_SSE3 */
+++
+++#ifdef LV_HAVE_GENERIC
+++/*!
+++ \brief Generic (plain C) element-wise product of two unsigned char vectors; every product is truncated to unsigned char when stored
+++ \param cChar The unsigned char vector where the results will be stored
+++ \param aChar One of the unsigned char vectors to be multiplied
+++ \param bChar One of the unsigned char vectors to be multiplied
+++ \param num_points The number of unsigned char values in aChar and bChar to be multiplied together and stored into cChar
+++ */
+++static inline void volk_gnsssdr_8u_x2_multiply_8u_a_generic(unsigned char* cChar, const unsigned char* aChar, const unsigned char* bChar, unsigned int num_points){
+++    unsigned int idx;
+++
+++    for (idx = 0; idx < num_points; ++idx){
+++        // The product is computed as int and narrowed to unsigned char by the store
+++        cChar[idx] = aChar[idx] * bChar[idx];
+++    }
+++}
+++#endif /* LV_HAVE_GENERIC */
+++
+++#ifdef LV_HAVE_ORC
+++/*!
+++ \brief Multiplies two unsigned char vectors element by element by forwarding to the externally defined ORC implementation
+++ \param cVector The unsigned char vector where the results will be stored
+++ \param aVector One of the unsigned char vectors to be multiplied
+++ \param bVector One of the unsigned char vectors to be multiplied
+++ \param num_points The number of unsigned char values in aVector and bVector to be multiplied together and stored into cVector
+++ */
+++extern void volk_gnsssdr_8u_x2_multiply_8u_a_orc_impl(unsigned char* cVector, const unsigned char* aVector, const unsigned char* bVector, unsigned int num_points);
+++static inline void volk_gnsssdr_8u_x2_multiply_8u_u_orc(unsigned char* cVector, const unsigned char* aVector, const unsigned char* bVector, unsigned int num_points){
+++    volk_gnsssdr_8u_x2_multiply_8u_a_orc_impl(cVector, aVector, bVector, num_points);
+++}
+++#endif /* LV_HAVE_ORC */
+++
+++#endif /* INCLUDED_volk_gnsssdr_8u_x2_multiply_8u_a_H */
++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_s32f_x2_update_local_carrier_32fc.h
/Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_s32f_x2_update_local_carrier_32fc.h ++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_s32f_x2_update_local_carrier_32fc.h 1970-01-01 01:00:00.000000000 +0100 +++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_s32f_x2_update_local_carrier_32fc.h 2014-10-17 01:53:55.000000000 +0200 ++@@ -0,0 +1,866 @@ +++/*! +++ * \file volk_gnsssdr_32fc_s32f_x2_update_local_carrier_32fc +++ * \brief Volk protokernel: replaces the tracking function for update_local_carrier. Algorithm by Julien Pommier and Giovanni Garberoglio, modified by Andrés Cecilia. +++ * \authors
    +++ *
  • Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com +++ *
+++ * +++ * Volk protokernel that replaces the tracking function for update_local_carrier. Algorithm by Julien Pommier and Giovanni Garberoglio, modified by Andrés Cecilia. +++ * +++ * ------------------------------------------------------------------------- +++ * +++ * Copyright (C) 2007 Julien Pommier +++ * +++ * This software is provided 'as-is', without any express or implied +++ * warranty. In no event will the authors be held liable for any damages +++ * arising from the use of this software. +++ * +++ * Permission is granted to anyone to use this software for any purpose, +++ * including commercial applications, and to alter it and redistribute it +++ * freely, subject to the following restrictions: +++ * +++ * 1. The origin of this software must not be misrepresented; you must not +++ * claim that you wrote the original software. If you use this software +++ * in a product, an acknowledgment in the product documentation would be +++ * appreciated but is not required. +++ * 2. Altered source versions must be plainly marked as such, and must not be +++ * misrepresented as being the original software. +++ * 3. This notice may not be removed or altered from any source distribution. +++ * +++ *(this is the zlib license) +++ * +++ * ------------------------------------------------------------------------- +++ * +++ * Copyright (C) 2012 Giovanni Garberoglio +++ * Interdisciplinary Laboratory for Computational Science (LISC) +++ * Fondazione Bruno Kessler and University of Trento +++ * via Sommarive, 18 +++ * I-38123 Trento (Italy) +++ * +++ * ------------------------------------------------------------------------- +++ * +++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) +++ * +++ * GNSS-SDR is a software defined Global Navigation +++ * Satellite Systems receiver +++ * +++ * This file is part of GNSS-SDR. 
+++ * +++ * GNSS-SDR is free software: you can redistribute it and/or modify +++ * it under the terms of the GNU General Public License as published by +++ * the Free Software Foundation, either version 3 of the License, or +++ * at your option) any later version. +++ * +++ * GNSS-SDR is distributed in the hope that it will be useful, +++ * but WITHOUT ANY WARRANTY; without even the implied warranty of +++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +++ * GNU General Public License for more details. +++ * +++ * You should have received a copy of the GNU General Public License +++ * along with GNSS-SDR. If not, see . +++ * +++ * ------------------------------------------------------------------------- +++ */ +++ +++#ifndef INCLUDED_volk_gnsssdr_32fc_s32f_x2_update_local_carrier_32fc_u_H +++#define INCLUDED_volk_gnsssdr_32fc_s32f_x2_update_local_carrier_32fc_u_H +++ +++#include +++#include +++#include +++ +++#ifdef LV_HAVE_AVX +++#include +++/*! +++ \brief Accumulates the values in the input buffer +++ \param result The accumulated result +++ \param inputBuffer The buffer of data to be accumulated +++ \param num_points The number of values in inputBuffer to be accumulated +++ */ +++static inline void volk_gnsssdr_s32f_x2_update_local_carrier_32fc_u_avx(lv_32fc_t* d_carr_sign, const float phase_rad_init, const float phase_step_rad, unsigned int num_points){ +++ +++// float* pointer1 = (float*)&phase_rad_init; +++// *pointer1 = 0; +++// float* pointer2 = (float*)&phase_step_rad; +++// *pointer2 = 0.5; +++ +++ const unsigned int sse_iters = num_points / 8; +++ +++ __m256 _ps256_minus_cephes_DP1 = _mm256_set1_ps(-0.78515625f); +++ __m256 _ps256_minus_cephes_DP2 = _mm256_set1_ps(-2.4187564849853515625e-4f); +++ __m256 _ps256_minus_cephes_DP3 = _mm256_set1_ps(-3.77489497744594108e-8f); +++ __m256 _ps256_sign_mask = _mm256_set1_ps(-0.f); +++ __m128i _pi32avx_1 = _mm_set1_epi32(1); +++ __m128i _pi32avx_inv1 = _mm_set1_epi32(~1); +++ __m128i _pi32avx_2 = 
_mm_set1_epi32(2); +++ __m128i _pi32avx_4 = _mm_set1_epi32(4); +++ __m256 _ps256_cephes_FOPI = _mm256_set1_ps(1.27323954473516f); // 4 / PI +++ __m256 _ps256_sincof_p0 = _mm256_set1_ps(-1.9515295891E-4f); +++ __m256 _ps256_sincof_p1 = _mm256_set1_ps( 8.3321608736E-3f); +++ __m256 _ps256_sincof_p2 = _mm256_set1_ps(-1.6666654611E-1f); +++ __m256 _ps256_coscof_p0 = _mm256_set1_ps( 2.443315711809948E-005f); +++ __m256 _ps256_coscof_p1 = _mm256_set1_ps(-1.388731625493765E-003f); +++ __m256 _ps256_coscof_p2 = _mm256_set1_ps( 4.166664568298827E-002f); +++ __m256 _ps256_1 = _mm256_set1_ps(1.f); +++ __m256 _ps256_0p5 = _mm256_set1_ps(0.5f); +++ +++ __m256 phase_step_rad_array = _mm256_set1_ps(8*phase_step_rad); +++ +++ __m256 phase_rad_array, x, s, c, swap_sign_bit_sin, sign_bit_cos, poly_mask, z, tmp, y, y2, ysin1, ysin2; +++ __m256 xmm1, xmm2, xmm3, sign_bit_sin; +++ __m256i imm0, imm2, imm4; +++ __m128i imm0_1, imm0_2, imm2_1, imm2_2, imm4_1, imm4_2; +++ __VOLK_ATTR_ALIGNED(32) float sin_value[8]; +++ __VOLK_ATTR_ALIGNED(32) float cos_value[8]; +++ +++ phase_rad_array = _mm256_set_ps (phase_rad_init+7*phase_step_rad, phase_rad_init+6*phase_step_rad, phase_rad_init+5*phase_step_rad, phase_rad_init+4*phase_step_rad, phase_rad_init+3*phase_step_rad, phase_rad_init+2*phase_step_rad, phase_rad_init+phase_step_rad, phase_rad_init); +++ +++ for(unsigned int i = 0; i < sse_iters; i++) +++ { +++ +++ x = phase_rad_array; +++ +++ /* extract the sign bit (upper one) */ +++ sign_bit_sin = _mm256_and_ps(x, _ps256_sign_mask); +++ +++ /* take the absolute value */ +++ x = _mm256_xor_ps(x, sign_bit_sin); +++ +++ /* scale by 4/Pi */ +++ y = _mm256_mul_ps(x, _ps256_cephes_FOPI); +++ +++ /* we use SSE2 routines to perform the integer ops */ +++ +++ /* truncate to integers into an integer vector, then split it into two 128-bit halves */ +++ imm2 = _mm256_cvttps_epi32(y); +++ imm2_1 = _mm256_extractf128_si256 (imm2, 0); +++ imm2_2 = _mm256_extractf128_si256 (imm2, 1); +++ +++ imm2_1 = _mm_add_epi32(imm2_1, _pi32avx_1); +++ imm2_2 = 
_mm_add_epi32(imm2_2, _pi32avx_1); +++ +++ imm2_1 = _mm_and_si128(imm2_1, _pi32avx_inv1); +++ imm2_2 = _mm_and_si128(imm2_2, _pi32avx_inv1); +++ +++ //COPY_XMM_TO_IMM(imm2_1,imm2_2,imm2); +++ //_mm256_set_m128i not defined in some versions of immintrin.h +++ //imm2 = _mm256_set_m128i (imm2_2, imm2_1); +++ imm2 = _mm256_insertf128_si256(_mm256_castsi128_si256(imm2_1),(imm2_2),1); +++ +++ y = _mm256_cvtepi32_ps(imm2); +++ +++ imm4_1 = imm2_1; +++ imm4_2 = imm2_2; +++ +++ imm0_1 = _mm_and_si128(imm2_1, _pi32avx_4); +++ imm0_2 = _mm_and_si128(imm2_2, _pi32avx_4); +++ +++ imm0_1 = _mm_slli_epi32(imm0_1, 29); +++ imm0_2 = _mm_slli_epi32(imm0_2, 29); +++ +++ //COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0); +++ //_mm256_set_m128i not defined in some versions of immintrin.h +++ //imm0 = _mm256_set_m128i (imm0_2, imm0_1); +++ imm0 = _mm256_insertf128_si256(_mm256_castsi128_si256(imm0_1),(imm0_2),1); +++ +++ imm2_1 = _mm_and_si128(imm2_1, _pi32avx_2); +++ imm2_2 = _mm_and_si128(imm2_2, _pi32avx_2); +++ +++ imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128()); +++ imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128()); +++ +++ //COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2); +++ //_mm256_set_m128i not defined in some versions of immintrin.h +++ //imm2 = _mm256_set_m128i (imm2_2, imm2_1); +++ imm2 = _mm256_insertf128_si256(_mm256_castsi128_si256(imm2_1),(imm2_2),1); +++ +++ swap_sign_bit_sin = _mm256_castsi256_ps(imm0); +++ poly_mask = _mm256_castsi256_ps(imm2); +++ +++ /* The magic pass: "Extended precision modular arithmetic" +++ x = ((x - y * DP1) - y * DP2) - y * DP3; */ +++ xmm1 = _ps256_minus_cephes_DP1; +++ xmm2 = _ps256_minus_cephes_DP2; +++ xmm3 = _ps256_minus_cephes_DP3; +++ xmm1 = _mm256_mul_ps(y, xmm1); +++ xmm2 = _mm256_mul_ps(y, xmm2); +++ xmm3 = _mm256_mul_ps(y, xmm3); +++ x = _mm256_add_ps(x, xmm1); +++ x = _mm256_add_ps(x, xmm2); +++ x = _mm256_add_ps(x, xmm3); +++ +++ imm4_1 = _mm_sub_epi32(imm4_1, _pi32avx_2); +++ imm4_2 = _mm_sub_epi32(imm4_2, _pi32avx_2); +++ +++ imm4_1 = 
_mm_andnot_si128(imm4_1, _pi32avx_4); +++ imm4_2 = _mm_andnot_si128(imm4_2, _pi32avx_4); +++ +++ imm4_1 = _mm_slli_epi32(imm4_1, 29); +++ imm4_2 = _mm_slli_epi32(imm4_2, 29); +++ +++ //COPY_XMM_TO_IMM(imm4_1, imm4_2, imm4); +++ //_mm256_set_m128i not defined in some versions of immintrin.h +++ //imm4 = _mm256_set_m128i (imm4_2, imm4_1); +++ imm4 = _mm256_insertf128_si256(_mm256_castsi128_si256(imm4_1),(imm4_2),1); +++ +++ sign_bit_cos = _mm256_castsi256_ps(imm4); +++ +++ sign_bit_sin = _mm256_xor_ps(sign_bit_sin, swap_sign_bit_sin); +++ +++ /* Evaluate the first polynom (0 <= x <= Pi/4) */ +++ z = _mm256_mul_ps(x,x); +++ y = _ps256_coscof_p0; +++ +++ y = _mm256_mul_ps(y, z); +++ y = _mm256_add_ps(y, _ps256_coscof_p1); +++ y = _mm256_mul_ps(y, z); +++ y = _mm256_add_ps(y, _ps256_coscof_p2); +++ y = _mm256_mul_ps(y, z); +++ y = _mm256_mul_ps(y, z); +++ tmp = _mm256_mul_ps(z, _ps256_0p5); +++ y = _mm256_sub_ps(y, tmp); +++ y = _mm256_add_ps(y, _ps256_1); +++ +++ /* Evaluate the second polynom (Pi/4 <= x <= 0) */ +++ +++ y2 = _ps256_sincof_p0; +++ y2 = _mm256_mul_ps(y2, z); +++ y2 = _mm256_add_ps(y2, _ps256_sincof_p1); +++ y2 = _mm256_mul_ps(y2, z); +++ y2 = _mm256_add_ps(y2, _ps256_sincof_p2); +++ y2 = _mm256_mul_ps(y2, z); +++ y2 = _mm256_mul_ps(y2, x); +++ y2 = _mm256_add_ps(y2, x); +++ +++ /* select the correct result from the two polynoms */ +++ xmm3 = poly_mask; +++ ysin2 = _mm256_and_ps(xmm3, y2); +++ ysin1 = _mm256_andnot_ps(xmm3, y); +++ y2 = _mm256_sub_ps(y2,ysin2); +++ y = _mm256_sub_ps(y, ysin1); +++ +++ xmm1 = _mm256_add_ps(ysin1,ysin2); +++ xmm2 = _mm256_add_ps(y,y2); +++ +++ /* update the sign */ +++ s = _mm256_xor_ps(xmm1, sign_bit_sin); +++ c = _mm256_xor_ps(xmm2, sign_bit_cos); +++ +++ //GNSS-SDR needs to return -sin +++ s = _mm256_xor_ps(s, _ps256_sign_mask); +++ +++ _mm256_storeu_ps ((float*)sin_value, s); +++ _mm256_storeu_ps ((float*)cos_value, c); +++ +++ for(int i = 0; i < 8; i++) +++ { +++ d_carr_sign[i] = lv_cmake(cos_value[i], sin_value[i]); 
+++ } +++ d_carr_sign += 8; +++ +++ phase_rad_array = _mm256_add_ps (phase_rad_array, phase_step_rad_array); +++ } +++ +++ if (num_points%8!=0) +++ { +++ __VOLK_ATTR_ALIGNED(32) float phase_rad_store[8]; +++ _mm256_storeu_ps (phase_rad_store, phase_rad_array); /* float store: phase_rad_array is a __m256, not a __m256i */ +++ +++ float phase_rad = phase_rad_store[0]; /* phase of the first sample not yet generated */ +++ +++ for(unsigned int i = 0; i < num_points%8; i++) +++ { +++ *d_carr_sign = lv_cmake(cos(phase_rad), -sin(phase_rad)); +++ d_carr_sign++; +++ phase_rad += phase_step_rad; +++ } +++ } +++} +++#endif /* LV_HAVE_AVX */ +++ +++ +++#ifdef LV_HAVE_SSE2 +++#include <emmintrin.h> +++/*! +++ \brief Generates the local carrier: output sample k is lv_cmake(cos(phase), -sin(phase)) with phase = phase_rad_init + k*phase_step_rad +++ \param d_carr_sign Output buffer that receives num_points carrier samples +++ \param phase_rad_init,phase_step_rad Phase of the first sample and per-sample phase increment, both in radians +++ \param num_points The number of carrier samples to generate +++*/ +++static inline void volk_gnsssdr_s32f_x2_update_local_carrier_32fc_u_sse2(lv_32fc_t* d_carr_sign, const float phase_rad_init, const float phase_step_rad, unsigned int num_points){ +++ +++// float* pointer1 = (float*)&phase_rad_init; +++// *pointer1 = 0; +++// float* pointer2 = (float*)&phase_step_rad; +++// *pointer2 = 0.5; +++ +++ const unsigned int sse_iters = num_points / 4; +++ +++ __m128 _ps_minus_cephes_DP1 = _mm_set1_ps(-0.78515625f); +++ __m128 _ps_minus_cephes_DP2 = _mm_set1_ps(-2.4187564849853515625e-4f); +++ __m128 _ps_minus_cephes_DP3 = _mm_set1_ps(-3.77489497744594108e-8f); +++ __m128 _ps_sign_mask = _mm_set1_ps(-0.f); +++ __m128i _pi32_1 = _mm_set1_epi32(1); +++ __m128i _pi32_inv1 = _mm_set1_epi32(~1); +++ __m128i _pi32_2 = _mm_set1_epi32(2); +++ __m128i _pi32_4 = _mm_set1_epi32(4); +++ __m128 _ps_cephes_FOPI = _mm_set1_ps(1.27323954473516f); // 4 / PI +++ __m128 _ps_sincof_p0 = _mm_set1_ps(-1.9515295891E-4f); +++ __m128 _ps_sincof_p1 = _mm_set1_ps( 8.3321608736E-3f); +++ __m128 _ps_sincof_p2 = _mm_set1_ps(-1.6666654611E-1f); +++ __m128 _ps_coscof_p0 = _mm_set1_ps( 2.443315711809948E-005f); +++ __m128 _ps_coscof_p1 = 
_mm_set1_ps(-1.388731625493765E-003f); +++ __m128 _ps_coscof_p2 = _mm_set1_ps( 4.166664568298827E-002f); +++ __m128 _ps_1 = _mm_set1_ps(1.f); +++ __m128 _ps_0p5 = _mm_set1_ps(0.5f); +++ +++ __m128 phase_step_rad_array = _mm_set1_ps(4*phase_step_rad); +++ +++ __m128 phase_rad_array, x, s, c, swap_sign_bit_sin, sign_bit_cos, poly_mask, z, tmp, y, y2, ysin1, ysin2; +++ __m128 xmm1, xmm2, xmm3, sign_bit_sin; +++ __m128i emm0, emm2, emm4; +++ __VOLK_ATTR_ALIGNED(16) float sin_value[4]; +++ __VOLK_ATTR_ALIGNED(16) float cos_value[4]; +++ +++ phase_rad_array = _mm_set_ps (phase_rad_init+3*phase_step_rad, phase_rad_init+2*phase_step_rad, phase_rad_init+phase_step_rad, phase_rad_init); +++ +++ for(int i = 0; i < sse_iters; i++) +++ { +++ x = phase_rad_array; +++ +++ /* extract the sign bit (upper one) */ +++ sign_bit_sin = _mm_and_ps(x, _ps_sign_mask); +++ +++ /* take the absolute value */ +++ x = _mm_xor_ps(x, sign_bit_sin); +++ +++ /* scale by 4/Pi */ +++ y = _mm_mul_ps(x, _ps_cephes_FOPI); +++ +++ /* store the integer part of y in emm2 */ +++ emm2 = _mm_cvttps_epi32(y); +++ +++ /* j=(j+1) & (~1) (see the cephes sources) */ +++ emm2 = _mm_add_epi32(emm2, _pi32_1); +++ emm2 = _mm_and_si128(emm2, _pi32_inv1); +++ y = _mm_cvtepi32_ps(emm2); +++ +++ emm4 = emm2; +++ +++ /* get the swap sign flag for the sine */ +++ emm0 = _mm_and_si128(emm2, _pi32_4); +++ emm0 = _mm_slli_epi32(emm0, 29); +++ swap_sign_bit_sin = _mm_castsi128_ps(emm0); +++ +++ /* get the polynom selection mask for the sine*/ +++ emm2 = _mm_and_si128(emm2, _pi32_2); +++ emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128()); +++ poly_mask = _mm_castsi128_ps(emm2); +++ +++ /* The magic pass: "Extended precision modular arithmetic" +++ x = ((x - y * DP1) - y * DP2) - y * DP3; */ +++ xmm1 = _mm_mul_ps(y, _ps_minus_cephes_DP1); +++ xmm2 = _mm_mul_ps(y, _ps_minus_cephes_DP2); +++ xmm3 = _mm_mul_ps(y, _ps_minus_cephes_DP3); +++ x = _mm_add_ps(_mm_add_ps(x, xmm1), _mm_add_ps(xmm2, xmm3)); +++ +++ emm4 = _mm_sub_epi32(emm4, 
_pi32_2); +++ emm4 = _mm_andnot_si128(emm4, _pi32_4); +++ emm4 = _mm_slli_epi32(emm4, 29); +++ sign_bit_cos = _mm_castsi128_ps(emm4); +++ +++ sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin); +++ +++ /* Evaluate the first polynom (0 <= x <= Pi/4) */ +++ z = _mm_mul_ps(x,x); +++ y = _ps_coscof_p0; +++ y = _mm_mul_ps(y, z); +++ y = _mm_add_ps(y, _ps_coscof_p1); +++ y = _mm_mul_ps(y, z); +++ y = _mm_add_ps(y, _ps_coscof_p2); +++ y = _mm_mul_ps(y, _mm_mul_ps(z, z)); +++ tmp = _mm_mul_ps(z, _ps_0p5); +++ y = _mm_sub_ps(y, tmp); +++ y = _mm_add_ps(y, _ps_1); +++ +++ /* Evaluate the second polynom (Pi/4 <= x <= 0) */ +++ y2 = _ps_sincof_p0; +++ y2 = _mm_mul_ps(y2, z); +++ y2 = _mm_add_ps(y2, _ps_sincof_p1); +++ y2 = _mm_mul_ps(y2, z); +++ y2 = _mm_add_ps(y2, _ps_sincof_p2); +++ y2 = _mm_mul_ps(y2, _mm_mul_ps(z, x)); +++ y2 = _mm_add_ps(y2, x); +++ +++ /* select the correct result from the two polynoms */ +++ xmm3 = poly_mask; +++ ysin2 = _mm_and_ps(xmm3, y2); +++ ysin1 = _mm_andnot_ps(xmm3, y); +++ y2 = _mm_sub_ps(y2,ysin2); +++ y = _mm_sub_ps(y, ysin1); +++ +++ xmm1 = _mm_add_ps(ysin1,ysin2); +++ xmm2 = _mm_add_ps(y,y2); +++ +++ /* update the sign */ +++ s = _mm_xor_ps(xmm1, sign_bit_sin); +++ c = _mm_xor_ps(xmm2, sign_bit_cos); +++ +++ //GNSS-SDR needs to return -sin +++ s = _mm_xor_ps(s, _ps_sign_mask); +++ +++ _mm_storeu_ps ((float*)sin_value, s); +++ _mm_storeu_ps ((float*)cos_value, c); +++ +++ for(int i = 0; i < 4; i++) +++ { +++ d_carr_sign[i] = lv_cmake(cos_value[i], sin_value[i]); +++ } +++ d_carr_sign += 4; +++ +++ phase_rad_array = _mm_add_ps (phase_rad_array, phase_step_rad_array); +++ } +++ +++ if (num_points%4!=0) +++ { +++ __VOLK_ATTR_ALIGNED(16) float phase_rad_store[4]; +++ _mm_storeu_ps (phase_rad_store, phase_rad_array); /* float store: phase_rad_array is a __m128, not a __m128i */ +++ +++ float phase_rad = phase_rad_store[0]; /* phase of the first sample not yet generated */ +++ +++ for(unsigned int i = 0; i < num_points%4; i++) +++ { +++ *d_carr_sign = lv_cmake(cos(phase_rad), -sin(phase_rad)); +++ d_carr_sign++; +++ phase_rad += phase_step_rad; 
+++ } +++ } +++} +++#endif /* LV_HAVE_SSE2 */ +++ +++#ifdef LV_HAVE_GENERIC +++/*! +++ \brief Accumulates the values in the input buffer +++ \param result The accumulated result +++ \param inputBuffer The buffer of data to be accumulated +++ \param num_points The number of values in inputBuffer to be accumulated +++*/ +++static inline void volk_gnsssdr_s32f_x2_update_local_carrier_32fc_generic(lv_32fc_t* d_carr_sign, const float phase_rad_init, const float phase_step_rad, unsigned int num_points){ +++ +++// float* pointer1 = (float*)&phase_rad_init; +++// *pointer1 = 0; +++// float* pointer2 = (float*)&phase_step_rad; +++// *pointer2 = 0.5; +++ +++ float phase_rad = phase_rad_init; +++ for(int i = 0; i < num_points; i++) +++ { +++ *d_carr_sign = lv_cmake(cos(phase_rad), -sin(phase_rad)); +++ d_carr_sign++; +++ phase_rad += phase_step_rad; +++ } +++} +++#endif /* LV_HAVE_GENERIC */ +++#endif /* INCLUDED_volk_gnsssdr_32fc_s32f_x2_update_local_carrier_32fc_u_H */ +++ +++ +++#ifndef INCLUDED_volk_gnsssdr_32fc_s32f_x2_update_local_carrier_32fc_a_H +++#define INCLUDED_volk_gnsssdr_32fc_s32f_x2_update_local_carrier_32fc_a_H +++ +++#include +++#include +++#include +++ +++#ifdef LV_HAVE_AVX +++#include +++/*! 
+++ \brief Generates the local carrier: output sample k is lv_cmake(cos(phase), -sin(phase)) with phase = phase_rad_init + k*phase_step_rad +++ \param d_carr_sign Output buffer that receives num_points carrier samples +++ \param phase_rad_init,phase_step_rad Phase of the first sample and per-sample phase increment, both in radians +++ \param num_points The number of carrier samples to generate +++ */ +++static inline void volk_gnsssdr_s32f_x2_update_local_carrier_32fc_a_avx(lv_32fc_t* d_carr_sign, const float phase_rad_init, const float phase_step_rad, unsigned int num_points){ +++ +++ // float* pointer1 = (float*)&phase_rad_init; +++ // *pointer1 = 0; +++ // float* pointer2 = (float*)&phase_step_rad; +++ // *pointer2 = 0.5; +++ +++ const unsigned int sse_iters = num_points / 8; +++ +++ __m256 _ps256_minus_cephes_DP1 = _mm256_set1_ps(-0.78515625f); +++ __m256 _ps256_minus_cephes_DP2 = _mm256_set1_ps(-2.4187564849853515625e-4f); +++ __m256 _ps256_minus_cephes_DP3 = _mm256_set1_ps(-3.77489497744594108e-8f); +++ __m256 _ps256_sign_mask = _mm256_set1_ps(-0.f); +++ __m128i _pi32avx_1 = _mm_set1_epi32(1); +++ __m128i _pi32avx_inv1 = _mm_set1_epi32(~1); +++ __m128i _pi32avx_2 = _mm_set1_epi32(2); +++ __m128i _pi32avx_4 = _mm_set1_epi32(4); +++ __m256 _ps256_cephes_FOPI = _mm256_set1_ps(1.27323954473516f); // 4 / PI +++ __m256 _ps256_sincof_p0 = _mm256_set1_ps(-1.9515295891E-4f); +++ __m256 _ps256_sincof_p1 = _mm256_set1_ps( 8.3321608736E-3f); +++ __m256 _ps256_sincof_p2 = _mm256_set1_ps(-1.6666654611E-1f); +++ __m256 _ps256_coscof_p0 = _mm256_set1_ps( 2.443315711809948E-005f); +++ __m256 _ps256_coscof_p1 = _mm256_set1_ps(-1.388731625493765E-003f); +++ __m256 _ps256_coscof_p2 = _mm256_set1_ps( 4.166664568298827E-002f); +++ __m256 _ps256_1 = _mm256_set1_ps(1.f); +++ __m256 _ps256_0p5 = _mm256_set1_ps(0.5f); +++ +++ __m256 phase_step_rad_array = _mm256_set1_ps(8*phase_step_rad); +++ +++ __m256 phase_rad_array, x, s, c, swap_sign_bit_sin, sign_bit_cos, poly_mask, z, tmp, y, y2, ysin1, ysin2; +++ __m256 xmm1, xmm2, xmm3, sign_bit_sin; +++ __m256i imm0, imm2, imm4; +++ __m128i imm0_1, imm0_2, imm2_1, imm2_2, imm4_1, imm4_2; +++ 
__VOLK_ATTR_ALIGNED(32) float sin_value[8]; +++ __VOLK_ATTR_ALIGNED(32) float cos_value[8]; +++ +++ phase_rad_array = _mm256_set_ps (phase_rad_init+7*phase_step_rad, phase_rad_init+6*phase_step_rad, phase_rad_init+5*phase_step_rad, phase_rad_init+4*phase_step_rad, phase_rad_init+3*phase_step_rad, phase_rad_init+2*phase_step_rad, phase_rad_init+phase_step_rad, phase_rad_init); +++ +++ for(unsigned int i = 0; i < sse_iters; i++) +++ { +++ +++ x = phase_rad_array; +++ +++ /* extract the sign bit (upper one) */ +++ sign_bit_sin = _mm256_and_ps(x, _ps256_sign_mask); +++ +++ /* take the absolute value */ +++ x = _mm256_xor_ps(x, sign_bit_sin); +++ +++ /* scale by 4/Pi */ +++ y = _mm256_mul_ps(x, _ps256_cephes_FOPI); +++ +++ /* we use SSE2 routines to perform the integer ops */ +++ +++ /* truncate to integers into an integer vector, then split it into two 128-bit halves */ +++ imm2 = _mm256_cvttps_epi32(y); +++ imm2_1 = _mm256_extractf128_si256 (imm2, 0); +++ imm2_2 = _mm256_extractf128_si256 (imm2, 1); +++ +++ imm2_1 = _mm_add_epi32(imm2_1, _pi32avx_1); +++ imm2_2 = _mm_add_epi32(imm2_2, _pi32avx_1); +++ +++ imm2_1 = _mm_and_si128(imm2_1, _pi32avx_inv1); +++ imm2_2 = _mm_and_si128(imm2_2, _pi32avx_inv1); +++ +++ //COPY_XMM_TO_IMM(imm2_1,imm2_2,imm2); +++ //_mm256_set_m128i not defined in some versions of immintrin.h +++ //imm2 = _mm256_set_m128i (imm2_2, imm2_1); +++ imm2 = _mm256_insertf128_si256(_mm256_castsi128_si256(imm2_1),(imm2_2),1); +++ +++ y = _mm256_cvtepi32_ps(imm2); +++ +++ imm4_1 = imm2_1; +++ imm4_2 = imm2_2; +++ +++ imm0_1 = _mm_and_si128(imm2_1, _pi32avx_4); +++ imm0_2 = _mm_and_si128(imm2_2, _pi32avx_4); +++ +++ imm0_1 = _mm_slli_epi32(imm0_1, 29); +++ imm0_2 = _mm_slli_epi32(imm0_2, 29); +++ +++ //COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0); +++ //_mm256_set_m128i not defined in some versions of immintrin.h +++ //imm0 = _mm256_set_m128i (imm0_2, imm0_1); +++ imm0 = _mm256_insertf128_si256(_mm256_castsi128_si256(imm0_1),(imm0_2),1); +++ +++ imm2_1 = _mm_and_si128(imm2_1, _pi32avx_2); +++ imm2_2 = 
_mm_and_si128(imm2_2, _pi32avx_2); +++ +++ imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128()); +++ imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128()); +++ +++ //COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2); +++ //_mm256_set_m128i not defined in some versions of immintrin.h +++ //imm2 = _mm256_set_m128i (imm2_2, imm2_1); +++ imm2 = _mm256_insertf128_si256(_mm256_castsi128_si256(imm2_1),(imm2_2),1); +++ +++ swap_sign_bit_sin = _mm256_castsi256_ps(imm0); +++ poly_mask = _mm256_castsi256_ps(imm2); +++ +++ /* The magic pass: "Extended precision modular arithmetic" +++ x = ((x - y * DP1) - y * DP2) - y * DP3; */ +++ xmm1 = _ps256_minus_cephes_DP1; +++ xmm2 = _ps256_minus_cephes_DP2; +++ xmm3 = _ps256_minus_cephes_DP3; +++ xmm1 = _mm256_mul_ps(y, xmm1); +++ xmm2 = _mm256_mul_ps(y, xmm2); +++ xmm3 = _mm256_mul_ps(y, xmm3); +++ x = _mm256_add_ps(x, xmm1); +++ x = _mm256_add_ps(x, xmm2); +++ x = _mm256_add_ps(x, xmm3); +++ +++ imm4_1 = _mm_sub_epi32(imm4_1, _pi32avx_2); +++ imm4_2 = _mm_sub_epi32(imm4_2, _pi32avx_2); +++ +++ imm4_1 = _mm_andnot_si128(imm4_1, _pi32avx_4); +++ imm4_2 = _mm_andnot_si128(imm4_2, _pi32avx_4); +++ +++ imm4_1 = _mm_slli_epi32(imm4_1, 29); +++ imm4_2 = _mm_slli_epi32(imm4_2, 29); +++ +++ //COPY_XMM_TO_IMM(imm4_1, imm4_2, imm4); +++ //_mm256_set_m128i not defined in some versions of immintrin.h +++ //imm4 = _mm256_set_m128i (imm4_2, imm4_1); +++ imm4 = _mm256_insertf128_si256(_mm256_castsi128_si256(imm4_1),(imm4_2),1); +++ +++ sign_bit_cos = _mm256_castsi256_ps(imm4); +++ +++ sign_bit_sin = _mm256_xor_ps(sign_bit_sin, swap_sign_bit_sin); +++ +++ /* Evaluate the first polynom (0 <= x <= Pi/4) */ +++ z = _mm256_mul_ps(x,x); +++ y = _ps256_coscof_p0; +++ +++ y = _mm256_mul_ps(y, z); +++ y = _mm256_add_ps(y, _ps256_coscof_p1); +++ y = _mm256_mul_ps(y, z); +++ y = _mm256_add_ps(y, _ps256_coscof_p2); +++ y = _mm256_mul_ps(y, z); +++ y = _mm256_mul_ps(y, z); +++ tmp = _mm256_mul_ps(z, _ps256_0p5); +++ y = _mm256_sub_ps(y, tmp); +++ y = _mm256_add_ps(y, 
_ps256_1); +++ +++ /* Evaluate the second polynom (Pi/4 <= x <= 0) */ +++ +++ y2 = _ps256_sincof_p0; +++ y2 = _mm256_mul_ps(y2, z); +++ y2 = _mm256_add_ps(y2, _ps256_sincof_p1); +++ y2 = _mm256_mul_ps(y2, z); +++ y2 = _mm256_add_ps(y2, _ps256_sincof_p2); +++ y2 = _mm256_mul_ps(y2, z); +++ y2 = _mm256_mul_ps(y2, x); +++ y2 = _mm256_add_ps(y2, x); +++ +++ /* select the correct result from the two polynoms */ +++ xmm3 = poly_mask; +++ ysin2 = _mm256_and_ps(xmm3, y2); +++ ysin1 = _mm256_andnot_ps(xmm3, y); +++ y2 = _mm256_sub_ps(y2,ysin2); +++ y = _mm256_sub_ps(y, ysin1); +++ +++ xmm1 = _mm256_add_ps(ysin1,ysin2); +++ xmm2 = _mm256_add_ps(y,y2); +++ +++ /* update the sign */ +++ s = _mm256_xor_ps(xmm1, sign_bit_sin); +++ c = _mm256_xor_ps(xmm2, sign_bit_cos); +++ +++ //GNSS-SDR needs to return -sin +++ s = _mm256_xor_ps(s, _ps256_sign_mask); +++ +++ _mm256_store_ps ((float*)sin_value, s); +++ _mm256_store_ps ((float*)cos_value, c); +++ +++ for(int i = 0; i < 8; i++) +++ { +++ d_carr_sign[i] = lv_cmake(cos_value[i], sin_value[i]); +++ } +++ d_carr_sign += 8; +++ +++ phase_rad_array = _mm256_add_ps (phase_rad_array, phase_step_rad_array); +++ } +++ +++ if (num_points%8!=0) +++ { +++ __VOLK_ATTR_ALIGNED(32) float phase_rad_store[8]; +++ _mm256_store_ps ((float*)phase_rad_store, phase_rad_array); +++ +++ float phase_rad = phase_rad_store[0]; +++ +++ for(int i = 0; i < num_points%8; i++) +++ { +++ *d_carr_sign = lv_cmake(cos(phase_rad), -sin(phase_rad)); +++ d_carr_sign++; +++ phase_rad += phase_step_rad; +++ } +++ } +++} +++#endif /* LV_HAVE_AVX */ +++ +++#ifdef LV_HAVE_SSE2 +++#include +++/*! 
+++ \brief Generates the local carrier: output sample k is lv_cmake(cos(phase), -sin(phase)) with phase = phase_rad_init + k*phase_step_rad +++ \param d_carr_sign Output buffer that receives num_points carrier samples +++ \param phase_rad_init,phase_step_rad Phase of the first sample and per-sample phase increment, both in radians +++ \param num_points The number of carrier samples to generate +++ */ +++static inline void volk_gnsssdr_s32f_x2_update_local_carrier_32fc_a_sse2(lv_32fc_t* d_carr_sign, const float phase_rad_init, const float phase_step_rad, unsigned int num_points){ +++ +++// float* pointer1 = (float*)&phase_rad_init; +++// *pointer1 = 0; +++// float* pointer2 = (float*)&phase_step_rad; +++// *pointer2 = 0.5; +++ +++ const unsigned int sse_iters = num_points / 4; +++ +++ __m128 _ps_minus_cephes_DP1 = _mm_set1_ps(-0.78515625f); +++ __m128 _ps_minus_cephes_DP2 = _mm_set1_ps(-2.4187564849853515625e-4f); +++ __m128 _ps_minus_cephes_DP3 = _mm_set1_ps(-3.77489497744594108e-8f); +++ __m128 _ps_sign_mask = _mm_set1_ps(-0.f); +++ __m128i _pi32_1 = _mm_set1_epi32(1); +++ __m128i _pi32_inv1 = _mm_set1_epi32(~1); +++ __m128i _pi32_2 = _mm_set1_epi32(2); +++ __m128i _pi32_4 = _mm_set1_epi32(4); +++ __m128 _ps_cephes_FOPI = _mm_set1_ps(1.27323954473516f); // 4 / PI +++ __m128 _ps_sincof_p0 = _mm_set1_ps(-1.9515295891E-4f); +++ __m128 _ps_sincof_p1 = _mm_set1_ps( 8.3321608736E-3f); +++ __m128 _ps_sincof_p2 = _mm_set1_ps(-1.6666654611E-1f); +++ __m128 _ps_coscof_p0 = _mm_set1_ps( 2.443315711809948E-005f); +++ __m128 _ps_coscof_p1 = _mm_set1_ps(-1.388731625493765E-003f); +++ __m128 _ps_coscof_p2 = _mm_set1_ps( 4.166664568298827E-002f); +++ __m128 _ps_1 = _mm_set1_ps(1.f); +++ __m128 _ps_0p5 = _mm_set1_ps(0.5f); +++ +++ __m128 phase_step_rad_array = _mm_set1_ps(4*phase_step_rad); +++ +++ __m128 phase_rad_array, x, s, c, swap_sign_bit_sin, sign_bit_cos, poly_mask, z, tmp, y, y2, ysin1, ysin2; +++ __m128 xmm1, xmm2, xmm3, sign_bit_sin; +++ __m128i emm0, emm2, emm4; +++ __VOLK_ATTR_ALIGNED(16) float sin_value[4]; +++ __VOLK_ATTR_ALIGNED(16) float cos_value[4]; +++ +++ phase_rad_array = _mm_set_ps (phase_rad_init+3*phase_step_rad, 
phase_rad_init+2*phase_step_rad, phase_rad_init+phase_step_rad, phase_rad_init); +++ +++ for(int i = 0; i < sse_iters; i++) +++ { +++ x = phase_rad_array; +++ +++ /* extract the sign bit (upper one) */ +++ sign_bit_sin = _mm_and_ps(x, _ps_sign_mask); +++ +++ /* take the absolute value */ +++ x = _mm_xor_ps(x, sign_bit_sin); +++ +++ /* scale by 4/Pi */ +++ y = _mm_mul_ps(x, _ps_cephes_FOPI); +++ +++ /* store the integer part of y in emm2 */ +++ emm2 = _mm_cvttps_epi32(y); +++ +++ /* j=(j+1) & (~1) (see the cephes sources) */ +++ emm2 = _mm_add_epi32(emm2, _pi32_1); +++ emm2 = _mm_and_si128(emm2, _pi32_inv1); +++ y = _mm_cvtepi32_ps(emm2); +++ +++ emm4 = emm2; +++ +++ /* get the swap sign flag for the sine */ +++ emm0 = _mm_and_si128(emm2, _pi32_4); +++ emm0 = _mm_slli_epi32(emm0, 29); +++ swap_sign_bit_sin = _mm_castsi128_ps(emm0); +++ +++ /* get the polynom selection mask for the sine*/ +++ emm2 = _mm_and_si128(emm2, _pi32_2); +++ emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128()); +++ poly_mask = _mm_castsi128_ps(emm2); +++ +++ /* The magic pass: "Extended precision modular arithmetic" +++ x = ((x - y * DP1) - y * DP2) - y * DP3; */ +++ xmm1 = _mm_mul_ps(y, _ps_minus_cephes_DP1); +++ xmm2 = _mm_mul_ps(y, _ps_minus_cephes_DP2); +++ xmm3 = _mm_mul_ps(y, _ps_minus_cephes_DP3); +++ x = _mm_add_ps(_mm_add_ps(x, xmm1), _mm_add_ps(xmm2, xmm3)); +++ +++ emm4 = _mm_sub_epi32(emm4, _pi32_2); +++ emm4 = _mm_andnot_si128(emm4, _pi32_4); +++ emm4 = _mm_slli_epi32(emm4, 29); +++ sign_bit_cos = _mm_castsi128_ps(emm4); +++ +++ sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin); +++ +++ /* Evaluate the first polynom (0 <= x <= Pi/4) */ +++ z = _mm_mul_ps(x,x); +++ y = _ps_coscof_p0; +++ y = _mm_mul_ps(y, z); +++ y = _mm_add_ps(y, _ps_coscof_p1); +++ y = _mm_mul_ps(y, z); +++ y = _mm_add_ps(y, _ps_coscof_p2); +++ y = _mm_mul_ps(y, _mm_mul_ps(z, z)); +++ tmp = _mm_mul_ps(z, _ps_0p5); +++ y = _mm_sub_ps(y, tmp); +++ y = _mm_add_ps(y, _ps_1); +++ +++ /* Evaluate the second 
polynom (Pi/4 <= x <= 0) */ +++ y2 = _ps_sincof_p0; +++ y2 = _mm_mul_ps(y2, z); +++ y2 = _mm_add_ps(y2, _ps_sincof_p1); +++ y2 = _mm_mul_ps(y2, z); +++ y2 = _mm_add_ps(y2, _ps_sincof_p2); +++ y2 = _mm_mul_ps(y2, _mm_mul_ps(z, x)); +++ y2 = _mm_add_ps(y2, x); +++ +++ /* select the correct result from the two polynoms */ +++ xmm3 = poly_mask; +++ ysin2 = _mm_and_ps(xmm3, y2); +++ ysin1 = _mm_andnot_ps(xmm3, y); +++ y2 = _mm_sub_ps(y2,ysin2); +++ y = _mm_sub_ps(y, ysin1); +++ +++ xmm1 = _mm_add_ps(ysin1,ysin2); +++ xmm2 = _mm_add_ps(y,y2); +++ +++ /* update the sign */ +++ s = _mm_xor_ps(xmm1, sign_bit_sin); +++ c = _mm_xor_ps(xmm2, sign_bit_cos); +++ +++ //GNSS-SDR needs to return -sin +++ s = _mm_xor_ps(s, _ps_sign_mask); +++ +++ _mm_store_ps ((float*)sin_value, s); +++ _mm_store_ps ((float*)cos_value, c); +++ +++ for(int i = 0; i < 4; i++) +++ { +++ d_carr_sign[i] = lv_cmake(cos_value[i], sin_value[i]); +++ } +++ d_carr_sign += 4; +++ +++ phase_rad_array = _mm_add_ps (phase_rad_array, phase_step_rad_array); +++ } +++ +++ if (num_points%4!=0) +++ { +++ __VOLK_ATTR_ALIGNED(16) float phase_rad_store[4]; +++ _mm_store_ps (phase_rad_store, phase_rad_array); /* float store: phase_rad_array is a __m128, not a __m128i */ +++ +++ float phase_rad = phase_rad_store[0]; /* phase of the first sample not yet generated */ +++ +++ for(unsigned int i = 0; i < num_points%4; i++) +++ { +++ *d_carr_sign = lv_cmake(cos(phase_rad), -sin(phase_rad)); +++ d_carr_sign++; +++ phase_rad += phase_step_rad; +++ } +++ } +++} +++#endif /* LV_HAVE_SSE2 */ +++ +++#ifdef LV_HAVE_GENERIC +++/*! 
+++ \brief Generates the local carrier: output sample k is lv_cmake(cos(phase), -sin(phase)) with phase = phase_rad_init + k*phase_step_rad +++ \param d_carr_sign Output buffer that receives num_points carrier samples +++ \param phase_rad_init,phase_step_rad Phase of the first sample and per-sample phase increment, both in radians +++ \param num_points The number of carrier samples to generate +++ */ +++static inline void volk_gnsssdr_s32f_x2_update_local_carrier_32fc_a_generic(lv_32fc_t* d_carr_sign, const float phase_rad_init, const float phase_step_rad, unsigned int num_points){ +++ +++// float* pointer1 = (float*)&phase_rad_init; +++// *pointer1 = 0; +++// float* pointer2 = (float*)&phase_step_rad; +++// *pointer2 = 0.5; +++ +++ float phase_rad = phase_rad_init; +++ for(unsigned int i = 0; i < num_points; i++) +++ { +++ *d_carr_sign = lv_cmake(cos(phase_rad), -sin(phase_rad)); +++ d_carr_sign++; +++ phase_rad += phase_step_rad; +++ } +++} +++#endif /* LV_HAVE_GENERIC */ +++#endif /* INCLUDED_volk_gnsssdr_32fc_s32f_x2_update_local_carrier_32fc_a_H */ +++ ++diff -rupN /Users/andres/Desktop/volk_gnsssdr/lib/CMakeLists.txt /Users/andres/Desktop/volk_gnsssdr_original/lib/CMakeLists.txt ++--- /Users/andres/Desktop/volk_gnsssdr/lib/CMakeLists.txt 2014-10-17 04:26:38.000000000 +0200 +++++ /Users/andres/Desktop/volk_gnsssdr_original/lib/CMakeLists.txt 2014-10-17 04:17:37.000000000 +0200 ++@@ -517,7 +517,19 @@ if(MSVC) ++ endif() ++ ++ #create the volk_gnsssdr runtime library ++-add_library(volk_gnsssdr SHARED ${volk_gnsssdr_sources}) +++ +++#MODIFICATIONS BY GNSS-SDR: paths are relative to this lib/ directory so they also resolve when volk_gnsssdr is built via add_subdirectory() +++file(GLOB orc "${CMAKE_CURRENT_SOURCE_DIR}/../orc/*.orc") +++file(GLOB CommonMacros "${CMAKE_CURRENT_SOURCE_DIR}/../kernels/CommonMacros/*.h" "${CMAKE_CURRENT_SOURCE_DIR}/../kernels/CommonMacros/README.txt") +++ +++#add_library(volk_gnsssdr SHARED ${volk_gnsssdr_sources}) +++add_library(volk_gnsssdr SHARED ${volk_gnsssdr_sources} ${h_files} ${CommonMacros} ${orc}) +++ +++source_group("Kernels" FILES ${h_files}) +++source_group("Common Macros" FILES ${CommonMacros}) +++source_group("ORC Files" FILES ${orc}) +++#END OF MODIFICATIONS +++ ++ target_link_libraries(volk_gnsssdr ${volk_gnsssdr_libraries}) ++ 
set_target_properties(volk_gnsssdr PROPERTIES SOVERSION ${LIBVER}) ++ set_target_properties(volk_gnsssdr PROPERTIES DEFINE_SYMBOL "volk_gnsssdr_EXPORTS") ++diff -rupN /Users/andres/Desktop/volk_gnsssdr/lib/qa_utils.cc /Users/andres/Desktop/volk_gnsssdr_original/lib/qa_utils.cc ++--- /Users/andres/Desktop/volk_gnsssdr/lib/qa_utils.cc 2014-10-17 04:26:39.000000000 +0200 +++++ /Users/andres/Desktop/volk_gnsssdr_original/lib/qa_utils.cc 2014-10-17 04:21:03.000000000 +0200 ++@@ -217,6 +217,72 @@ inline void run_cast_test3_s32fc(volk_gn ++ while(iter--) func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str()); ++ } ++ +++//ADDED BY GNSS-SDR. START +++inline void run_cast_test1_s8i(volk_gnsssdr_fn_1arg_s8i func, std::vector &buffs, char scalar, unsigned int vlen, unsigned int iter, std::string arch) { +++ while(iter--) func(buffs[0], scalar, vlen, arch.c_str()); +++} +++ +++inline void run_cast_test2_s8i(volk_gnsssdr_fn_2arg_s8i func, std::vector &buffs, char scalar, unsigned int vlen, unsigned int iter, std::string arch) { +++ while(iter--) func(buffs[0], buffs[1], scalar, vlen, arch.c_str()); +++} +++ +++inline void run_cast_test3_s8i(volk_gnsssdr_fn_3arg_s8i func, std::vector &buffs, char scalar, unsigned int vlen, unsigned int iter, std::string arch) { +++ while(iter--) func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str()); +++} +++ +++inline void run_cast_test1_s8ic(volk_gnsssdr_fn_1arg_s8ic func, std::vector &buffs, lv_8sc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { +++ while(iter--) func(buffs[0], scalar, vlen, arch.c_str()); +++} +++ +++inline void run_cast_test2_s8ic(volk_gnsssdr_fn_2arg_s8ic func, std::vector &buffs, lv_8sc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { +++ while(iter--) func(buffs[0], buffs[1], scalar, vlen, arch.c_str()); +++} +++ +++inline void run_cast_test3_s8ic(volk_gnsssdr_fn_3arg_s8ic func, std::vector &buffs, lv_8sc_t scalar, unsigned int vlen, unsigned int iter, std::string 
arch) { +++ while(iter--) func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str()); +++} +++ +++inline void run_cast_test8(volk_gnsssdr_fn_8arg func, std::vector &buffs, unsigned int vlen, unsigned int iter, std::string arch) { +++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], vlen, arch.c_str()); +++} +++ +++inline void run_cast_test8_s8i(volk_gnsssdr_fn_8arg_s8i func, std::vector &buffs, char scalar, unsigned int vlen, unsigned int iter, std::string arch) { +++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], scalar, vlen, arch.c_str()); +++} +++ +++inline void run_cast_test8_s8ic(volk_gnsssdr_fn_8arg_s8ic func, std::vector &buffs, lv_8sc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { +++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], scalar, vlen, arch.c_str()); +++} +++ +++inline void run_cast_test8_s32f(volk_gnsssdr_fn_8arg_s32f func, std::vector &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) { +++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], scalar, vlen, arch.c_str()); +++} +++ +++inline void run_cast_test8_s32fc(volk_gnsssdr_fn_8arg_s32fc func, std::vector &buffs, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { +++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], scalar, vlen, arch.c_str()); +++} +++ +++inline void run_cast_test12(volk_gnsssdr_fn_12arg func, std::vector &buffs, unsigned int vlen, unsigned int iter, std::string arch) { +++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], buffs[8], buffs[9], buffs[10], buffs[11], vlen, arch.c_str()); +++} +++ +++inline void run_cast_test12_s8i(volk_gnsssdr_fn_12arg_s8i func, std::vector &buffs, char scalar, unsigned int 
vlen, unsigned int iter, std::string arch) { +++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], buffs[8], buffs[9], buffs[10], buffs[11], scalar, vlen, arch.c_str()); +++} +++ +++inline void run_cast_test12_s8ic(volk_gnsssdr_fn_12arg_s8ic func, std::vector &buffs, lv_8sc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { +++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], buffs[8], buffs[9], buffs[10], buffs[11], scalar, vlen, arch.c_str()); +++} +++ +++inline void run_cast_test12_s32f(volk_gnsssdr_fn_12arg_s32f func, std::vector &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) { +++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], buffs[8], buffs[9], buffs[10], buffs[11], scalar, vlen, arch.c_str()); +++} +++ +++inline void run_cast_test12_s32fc(volk_gnsssdr_fn_12arg_s32fc func, std::vector &buffs, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { +++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], buffs[8], buffs[9], buffs[10], buffs[11], scalar, vlen, arch.c_str()); +++} +++//ADDED BY GNSS-SDR. END +++ ++ // This function is a nop that helps resolve GNU Radio bugs 582 and 583. ++ // Without this the cast in run_volk_gnsssdr_tests for tol_i = static_cast(float tol) ++ // won't happen on armhf (reported on cortex A9 and A15). ++@@ -426,7 +492,17 @@ bool run_volk_gnsssdr_tests(volk_gnsssdr ++ } else { ++ run_cast_test1_s32f((volk_gnsssdr_fn_1arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); ++ } ++- } else throw "unsupported 1 arg function >1 scalars"; +++ } +++ //ADDED BY GNSS-SDR. 
START +++ else if(inputsc.size() == 1 && !inputsc[0].is_float) { +++ if(inputsc[0].is_complex) { +++ run_cast_test1_s8ic((volk_gnsssdr_fn_1arg_s8ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); +++ } else { +++ run_cast_test1_s8i((volk_gnsssdr_fn_1arg_s8i)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); +++ } +++ } +++ //ADDED BY GNSS-SDR. END +++ else throw "unsupported 1 arg function >1 scalars"; ++ break; ++ case 2: ++ if(inputsc.size() == 0) { ++@@ -437,7 +513,17 @@ bool run_volk_gnsssdr_tests(volk_gnsssdr ++ } else { ++ run_cast_test2_s32f((volk_gnsssdr_fn_2arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); ++ } ++- } else throw "unsupported 2 arg function >1 scalars"; +++ } +++ //ADDED BY GNSS-SDR. START +++ else if(inputsc.size() == 1 && !inputsc[0].is_float) { +++ if(inputsc[0].is_complex) { +++ run_cast_test2_s8ic((volk_gnsssdr_fn_2arg_s8ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); +++ } else { +++ run_cast_test2_s8i((volk_gnsssdr_fn_2arg_s8i)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); +++ } +++ } +++ //ADDED BY GNSS-SDR. END +++ else throw "unsupported 2 arg function >1 scalars"; ++ break; ++ case 3: ++ if(inputsc.size() == 0) { ++@@ -448,11 +534,61 @@ bool run_volk_gnsssdr_tests(volk_gnsssdr ++ } else { ++ run_cast_test3_s32f((volk_gnsssdr_fn_3arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); ++ } ++- } else throw "unsupported 3 arg function >1 scalars"; +++ } +++ //ADDED BY GNSS-SDR. START +++ else if(inputsc.size() == 1 && !inputsc[0].is_float) { +++ if(inputsc[0].is_complex) { +++ run_cast_test3_s8ic((volk_gnsssdr_fn_3arg_s8ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); +++ } else { +++ run_cast_test3_s8i((volk_gnsssdr_fn_3arg_s8i)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); +++ } +++ } +++ //ADDED BY GNSS-SDR. 
END +++ else throw "unsupported 3 arg function >1 scalars"; ++ break; ++ case 4: ++ run_cast_test4((volk_gnsssdr_fn_4arg)(manual_func), test_data[i], vlen, iter, arch_list[i]); ++ break; +++ //ADDED BY GNSS-SDR. START +++ case 8: +++ if(inputsc.size() == 0) { +++ run_cast_test8((volk_gnsssdr_fn_8arg)(manual_func), test_data[i], vlen, iter, arch_list[i]); +++ } else if(inputsc.size() == 1 && inputsc[0].is_float) { +++ if(inputsc[0].is_complex) { +++ run_cast_test8_s32fc((volk_gnsssdr_fn_8arg_s32fc)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); +++ } else { +++ run_cast_test8_s32f((volk_gnsssdr_fn_8arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); +++ } +++ } +++ else if(inputsc.size() == 1 && !inputsc[0].is_float) { +++ if(inputsc[0].is_complex) { +++ run_cast_test8_s8ic((volk_gnsssdr_fn_8arg_s8ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); +++ } else { +++ run_cast_test8_s8i((volk_gnsssdr_fn_8arg_s8i)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); +++ } +++ } +++ else throw "unsupported 8 arg function >1 scalars"; +++ break; +++ case 12: +++ if(inputsc.size() == 0) { +++ run_cast_test12((volk_gnsssdr_fn_12arg)(manual_func), test_data[i], vlen, iter, arch_list[i]); +++ } else if(inputsc.size() == 1 && inputsc[0].is_float) { +++ if(inputsc[0].is_complex) { +++ run_cast_test12_s32fc((volk_gnsssdr_fn_12arg_s32fc)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); +++ } else { +++ run_cast_test12_s32f((volk_gnsssdr_fn_12arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); +++ } +++ } +++ else if(inputsc.size() == 1 && !inputsc[0].is_float) { +++ if(inputsc[0].is_complex) { +++ run_cast_test12_s8ic((volk_gnsssdr_fn_12arg_s8ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); +++ } else { +++ run_cast_test12_s8i((volk_gnsssdr_fn_12arg_s8i)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); +++ } +++ } +++ else 
throw "unsupported 12 arg function >1 scalars"; +++ break; +++ //ADDED BY GNSS-SDR. END ++ default: ++ throw "no function handler for this signature"; ++ break; ++diff -rupN /Users/andres/Desktop/volk_gnsssdr/lib/qa_utils.h /Users/andres/Desktop/volk_gnsssdr_original/lib/qa_utils.h ++--- /Users/andres/Desktop/volk_gnsssdr/lib/qa_utils.h 2014-10-17 04:26:39.000000000 +0200 +++++ /Users/andres/Desktop/volk_gnsssdr_original/lib/qa_utils.h 2014-10-17 04:21:51.000000000 +0200 ++@@ -77,4 +77,26 @@ typedef void (*volk_gnsssdr_fn_1arg_s32f ++ typedef void (*volk_gnsssdr_fn_2arg_s32fc)(void *, void *, lv_32fc_t, unsigned int, const char*); ++ typedef void (*volk_gnsssdr_fn_3arg_s32fc)(void *, void *, void *, lv_32fc_t, unsigned int, const char*); ++ +++//ADDED BY GNSS-SDR. START +++typedef void (*volk_gnsssdr_fn_1arg_s8i)(void *, char, unsigned int, const char*); //one input vector, one scalar char input +++typedef void (*volk_gnsssdr_fn_2arg_s8i)(void *, void *, char, unsigned int, const char*); +++typedef void (*volk_gnsssdr_fn_3arg_s8i)(void *, void *, void *, char, unsigned int, const char*); +++typedef void (*volk_gnsssdr_fn_1arg_s8ic)(void *, lv_8sc_t, unsigned int, const char*); //one input vector, one scalar lv_8sc_t vector input +++typedef void (*volk_gnsssdr_fn_2arg_s8ic)(void *, void *, lv_8sc_t, unsigned int, const char*); +++typedef void (*volk_gnsssdr_fn_3arg_s8ic)(void *, void *, void *, lv_8sc_t, unsigned int, const char*); +++ +++typedef void (*volk_gnsssdr_fn_8arg)(void *, void *, void *, void *, void *, void *, void *, void *, unsigned int, const char*); +++typedef void (*volk_gnsssdr_fn_8arg_s32f)(void *, void *, void *, void *, void *, void *, void *, void *, float, unsigned int, const char*); +++typedef void (*volk_gnsssdr_fn_8arg_s32fc)(void *, void *, void *, void *, void *, void *, void *, void *, lv_32fc_t, unsigned int, const char*); +++typedef void (*volk_gnsssdr_fn_8arg_s8i)(void *, void *, void *, void *, void *, void *, void *, void *, char, 
unsigned int, const char*); +++typedef void (*volk_gnsssdr_fn_8arg_s8ic)(void *, void *, void *, void *, void *, void *, void *, void *, lv_8sc_t, unsigned int, const char*); +++ +++typedef void (*volk_gnsssdr_fn_12arg)(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, unsigned int, const char*); +++typedef void (*volk_gnsssdr_fn_12arg_s32f)(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, float, unsigned int, const char*); +++typedef void (*volk_gnsssdr_fn_12arg_s32fc)(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, lv_32fc_t, unsigned int, const char*); +++typedef void (*volk_gnsssdr_fn_12arg_s8i)(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, char, unsigned int, const char*); +++typedef void (*volk_gnsssdr_fn_12arg_s8ic)(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, lv_8sc_t, unsigned int, const char*); +++//ADDED BY GNSS-SDR. 
END +++ +++ ++ #endif //VOLK_QA_UTILS_H ++diff -rupN /Users/andres/Desktop/volk_gnsssdr/lib/testqa.cc /Users/andres/Desktop/volk_gnsssdr_original/lib/testqa.cc ++--- /Users/andres/Desktop/volk_gnsssdr/lib/testqa.cc 2014-10-17 04:26:39.000000000 +0200 +++++ /Users/andres/Desktop/volk_gnsssdr_original/lib/testqa.cc 2014-10-15 01:55:08.000000000 +0200 ++@@ -24,6 +24,58 @@ ++ #include ++ #include ++ +++//VOLK PROTOKERNELS OBTAINED FROM THE GNURADIO BASE +++VOLK_RUN_TESTS(volk_gnsssdr_32fc_x2_multiply_32fc, 1e-4, 0, 20462, 1); +++VOLK_RUN_TESTS(volk_gnsssdr_32fc_x2_dot_prod_32fc, 1e-4, 0, 204603, 1); +++VOLK_RUN_TESTS(volk_gnsssdr_32fc_s32fc_multiply_32fc, 1e-4, 0, 20462, 1); +++VOLK_RUN_TESTS(volk_gnsssdr_32fc_conjugate_32fc, 1e-4, 0, 20462, 1); +++VOLK_RUN_TESTS(volk_gnsssdr_32f_x2_add_32f, 1e-4, 0, 20462, 1); +++VOLK_RUN_TESTS(volk_gnsssdr_32f_index_max_16u, 3, 0, 20462, 1); +++VOLK_RUN_TESTS(volk_gnsssdr_32f_accumulator_s32f, 1e-4, 0, 20462, 1); +++VOLK_RUN_TESTS(volk_gnsssdr_32fc_magnitude_squared_32f, 1e-4, 0, 20462, 1); +++VOLK_RUN_TESTS(volk_gnsssdr_32f_s32f_convert_16i, 3, 0, 20462, 1); +++ +++//GNSS-SDR PROTO-KERNELS +++VOLK_RUN_TESTS(volk_gnsssdr_8ic_x2_multiply_8ic, 1e-4, 0, 20462, 1); +++VOLK_RUN_TESTS(volk_gnsssdr_8u_x2_multiply_8u, 1e-4, 0, 20462, 1); +++VOLK_RUN_TESTS(volk_gnsssdr_8ic_x2_dot_prod_8ic, 1e-4, 0, 204603, 1); +++VOLK_RUN_TESTS(volk_gnsssdr_8ic_s8ic_multiply_8ic, 1e-4, 0, 20462, 1); +++VOLK_RUN_TESTS(volk_gnsssdr_8ic_conjugate_8ic, 1e-4, 0, 20462, 1); +++VOLK_RUN_TESTS(volk_gnsssdr_8i_x2_add_8i, 1e-4, 0, 20462, 1); +++VOLK_RUN_TESTS(volk_gnsssdr_8i_index_max_16u, 3, 0, 20462, 1); +++VOLK_RUN_TESTS(volk_gnsssdr_8i_accumulator_s8i, 1e-4, 0, 20462, 1); +++VOLK_RUN_TESTS(volk_gnsssdr_8ic_magnitude_squared_8i, 1e-4, 0, 20462, 1); +++ +++VOLK_RUN_TESTS(volk_gnsssdr_8i_max_s8i, 3, 0, 20462, 1); +++VOLK_RUN_TESTS(volk_gnsssdr_64f_accumulator_64f, 3, 0, 20462, 1); +++ +++VOLK_RUN_TESTS(volk_gnsssdr_32fc_convert_16ic, 3, 0, 20462, 1); 
+++VOLK_RUN_TESTS(volk_gnsssdr_32fc_s32f_convert_8ic, 3, 0, 20462, 1); +++VOLK_RUN_TESTS(volk_gnsssdr_32fc_convert_8ic, 3, 0, 20462, 1); +++VOLK_RUN_TESTS(volk_gnsssdr_16i_s32f_convert_32f, 3, 0, 20462, 1); +++ +++VOLK_RUN_TESTS(volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3, 1e-4, 0, 20462, 1); +++VOLK_RUN_TESTS(volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3, 1e-4, 0, 20462, 1); +++VOLK_RUN_TESTS(volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3, 1e-4, 0, 20462, 1); +++VOLK_RUN_TESTS(volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3, 1e-4, 0, 20462, 1); +++VOLK_RUN_TESTS(volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3, 1e-4, 0, 20462, 1); +++ +++VOLK_RUN_TESTS(volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5, 1e-4, 0, 20462, 1); +++VOLK_RUN_TESTS(volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5, 1e-4, 0, 20462, 1); +++VOLK_RUN_TESTS(volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5, 1e-4, 0, 20462, 1); +++VOLK_RUN_TESTS(volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5, 1e-4, 0, 20462, 1); +++VOLK_RUN_TESTS(volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5, 1e-4, 0, 20462, 1); +++VOLK_RUN_TESTS(volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5, 1e-4, 0, 20462, 1); +++ +++VOLK_RUN_TESTS(volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc, 1e-4, 0, 20462, 1); +++VOLK_RUN_TESTS(volk_gnsssdr_s32f_x2_update_local_carrier_32fc, 1e-4, 0, 20462, 1); +++ +++ +++ +++ +++ +++ +++ ++ //VOLK_RUN_TESTS(volk_gnsssdr_16i_x5_add_quad_16i_x4, 1e-4, 2046, 10000); ++ //VOLK_RUN_TESTS(volk_gnsssdr_16i_branch_4_state_8, 1e-4, 2046, 10000); ++ //VOLK_RUN_TESTS(volk_gnsssdr_16i_max_star_16i, 0, 0, 20462, 10000); ++diff -rupN /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_32f_x2_add_32f.orc /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_32f_x2_add_32f.orc ++--- /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_32f_x2_add_32f.orc 1970-01-01 01:00:00.000000000 +0100 +++++ /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_32f_x2_add_32f.orc 2014-10-15 01:55:08.000000000 +0200 ++@@ -0,0 +1,5 @@ +++.function 
volk_gnsssdr_32f_x2_add_32f_a_orc_impl +++.dest 4 dst +++.source 4 src1 +++.source 4 src2 +++addf dst, src1, src2 ++diff -rupN /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_32fc_s32fc_multiply_32fc.orc /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_32fc_s32fc_multiply_32fc.orc ++--- /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_32fc_s32fc_multiply_32fc.orc 1970-01-01 01:00:00.000000000 +0100 +++++ /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_32fc_s32fc_multiply_32fc.orc 2014-10-15 01:55:08.000000000 +0200 ++@@ -0,0 +1,18 @@ +++.function volk_gnsssdr_32fc_s32fc_multiply_32fc_a_orc_impl +++.source 8 src1 +++.floatparam 8 scalar +++.dest 8 dst +++.temp 8 iqprod +++.temp 4 real +++.temp 4 imag +++.temp 4 ac +++.temp 4 bd +++.temp 8 swapped +++x2 mulf iqprod, src1, scalar +++splitql bd, ac, iqprod +++subf real, ac, bd +++swaplq swapped, src1 +++x2 mulf iqprod, swapped, scalar +++splitql bd, ac, iqprod +++addf imag, ac, bd +++mergelq dst, real, imag ++diff -rupN /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_32fc_x2_multiply_32fc.orc /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_32fc_x2_multiply_32fc.orc ++--- /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_32fc_x2_multiply_32fc.orc 1970-01-01 01:00:00.000000000 +0100 +++++ /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_32fc_x2_multiply_32fc.orc 2014-10-15 01:55:08.000000000 +0200 ++@@ -0,0 +1,18 @@ +++.function volk_gnsssdr_32fc_x2_multiply_32fc_a_orc_impl +++.source 8 src1 +++.source 8 src2 +++.dest 8 dst +++.temp 8 iqprod +++.temp 4 real +++.temp 4 imag +++.temp 4 ac +++.temp 4 bd +++.temp 8 swapped +++x2 mulf iqprod, src1, src2 +++splitql bd, ac, iqprod +++subf real, ac, bd +++swaplq swapped, src1 +++x2 mulf iqprod, swapped, src2 +++splitql bd, ac, iqprod +++addf imag, ac, bd +++mergelq dst, real, imag ++diff -rupN /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8i_accumulator_s8i.orc 
/Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8i_accumulator_s8i.orc ++--- /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8i_accumulator_s8i.orc 1970-01-01 01:00:00.000000000 +0100 +++++ /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8i_accumulator_s8i.orc 2014-10-15 01:55:08.000000000 +0200 ++@@ -0,0 +1,40 @@ +++#/*! +++# * \file volk_gnsssdr_8i_accumulator_s8i.orc +++# * \brief ORC implementation: 8 bits (char) scalar accumulator +++# * \authors
<ul> +++# *
<li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com +++# * </ul>
+++# * +++# * ORC code that implements an accumulator of char values +++# * +++# * ------------------------------------------------------------------------- +++# * +++# * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) +++# * +++# * GNSS-SDR is a software defined Global Navigation +++# * Satellite Systems receiver +++# * +++# * This file is part of GNSS-SDR. +++# * +++# * GNSS-SDR is free software: you can redistribute it and/or modify +++# * it under the terms of the GNU General Public License as published by +++# * the Free Software Foundation, either version 3 of the License, or +++# * (at your option) any later version. +++# * +++# * GNSS-SDR is distributed in the hope that it will be useful, +++# * but WITHOUT ANY WARRANTY; without even the implied warranty of +++# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +++# * GNU General Public License for more details. +++# * +++# * You should have received a copy of the GNU General Public License +++# * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>. +++# * +++# * ------------------------------------------------------------------------- +++# */ +++ +++.function volk_gnsssdr_8i_accumulator_s8i_a_orc_impl +++.source 1 src1 +++.accumulator 2 acc +++.temp 2 sum +++mergebw sum, 0, src1 +++accw acc, sum ++diff -rupN /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8i_x2_add_8i.orc /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8i_x2_add_8i.orc ++--- /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8i_x2_add_8i.orc 1970-01-01 01:00:00.000000000 +0100 +++++ /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8i_x2_add_8i.orc 2014-10-15 01:55:08.000000000 +0200 ++@@ -0,0 +1,39 @@ +++#/*! +++# * \file volk_gnsssdr_8i_x2_add_8i.orc +++# * \brief ORC implementation: adds pairs of 8 bits (char) scalars +++# * \authors
<ul> +++# *
<li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com +++# * </ul>
+++# * +++# * ORC code that adds pairs of 8 bits (char) scalars +++# * +++# * ------------------------------------------------------------------------- +++# * +++# * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) +++# * +++# * GNSS-SDR is a software defined Global Navigation +++# * Satellite Systems receiver +++# * +++# * This file is part of GNSS-SDR. +++# * +++# * GNSS-SDR is free software: you can redistribute it and/or modify +++# * it under the terms of the GNU General Public License as published by +++# * the Free Software Foundation, either version 3 of the License, or +++# * (at your option) any later version. +++# * +++# * GNSS-SDR is distributed in the hope that it will be useful, +++# * but WITHOUT ANY WARRANTY; without even the implied warranty of +++# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +++# * GNU General Public License for more details. +++# * +++# * You should have received a copy of the GNU General Public License +++# * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>. +++# * +++# * ------------------------------------------------------------------------- +++# */ +++ +++.function volk_gnsssdr_8i_x2_add_8i_a_orc_impl +++.dest 1 dst +++.source 1 src1 +++.source 1 src2 +++addb dst, src1, src2 ++diff -rupN /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8ic_conjugate_8ic.orc /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8ic_conjugate_8ic.orc ++--- /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8ic_conjugate_8ic.orc 1970-01-01 01:00:00.000000000 +0100 +++++ /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8ic_conjugate_8ic.orc 2014-10-15 01:55:08.000000000 +0200 ++@@ -0,0 +1,42 @@ +++#/*! +++# * \file volk_gnsssdr_8ic_conjugate_8ic.orc +++# * \brief ORC implementation: calculates the conjugate of a 16 bits vector +++# * \authors
<ul> +++# *
<li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com +++# * </ul>
+++# * +++# * ORC code that calculates the conjugate of a +++# * 16 bits vector (8 bits the real part and 8 bits the imaginary part) +++# * result = real - j*imag +++# * +++# * ------------------------------------------------------------------------- +++# * +++# * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) +++# * +++# * GNSS-SDR is a software defined Global Navigation +++# * Satellite Systems receiver +++# * +++# * This file is part of GNSS-SDR. +++# * +++# * GNSS-SDR is free software: you can redistribute it and/or modify +++# * it under the terms of the GNU General Public License as published by +++# * the Free Software Foundation, either version 3 of the License, or +++# * (at your option) any later version. +++# * +++# * GNSS-SDR is distributed in the hope that it will be useful, +++# * but WITHOUT ANY WARRANTY; without even the implied warranty of +++# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +++# * GNU General Public License for more details. +++# * +++# * You should have received a copy of the GNU General Public License +++# * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>. +++# * +++# * ------------------------------------------------------------------------- +++# */ +++ +++.function volk_gnsssdr_8ic_conjugate_8ic_a_orc_impl +++.source 2 src1 +++.dest 2 dst +++.temp 2 merged +++mergebw merged, 1, -1 +++x2 mullb dst, merged, src1 ++diff -rupN /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8ic_magnitude_squared_8i.orc /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8ic_magnitude_squared_8i.orc ++--- /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8ic_magnitude_squared_8i.orc 1970-01-01 01:00:00.000000000 +0100 +++++ /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8ic_magnitude_squared_8i.orc 2014-10-15 01:55:08.000000000 +0200 ++@@ -0,0 +1,45 @@ +++#/*! 
+++# * \file volk_gnsssdr_8ic_magnitude_squared_8i.orc +++# * \brief ORC implementation: calculates the magnitude squared of a 16 bits vector +++# * \authors
<ul> +++# *
<li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com +++# * </ul>
+++# * +++# * ORC code that calculates the magnitude squared of a +++# * 16 bits vector (8 bits the real part and 8 bits the imaginary part) +++# * result = (real*real) + (imag*imag) +++# * +++# * ------------------------------------------------------------------------- +++# * +++# * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) +++# * +++# * GNSS-SDR is a software defined Global Navigation +++# * Satellite Systems receiver +++# * +++# * This file is part of GNSS-SDR. +++# * +++# * GNSS-SDR is free software: you can redistribute it and/or modify +++# * it under the terms of the GNU General Public License as published by +++# * the Free Software Foundation, either version 3 of the License, or +++# * (at your option) any later version. +++# * +++# * GNSS-SDR is distributed in the hope that it will be useful, +++# * but WITHOUT ANY WARRANTY; without even the implied warranty of +++# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +++# * GNU General Public License for more details. +++# * +++# * You should have received a copy of the GNU General Public License +++# * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>. +++# * +++# * ------------------------------------------------------------------------- +++# */ +++ +++.function volk_gnsssdr_8ic_magnitude_squared_8i_a_orc_impl +++.source 2 src1 +++.dest 1 dst +++.temp 2 iqprod +++.temp 1 ac +++.temp 1 bd +++x2 mullb iqprod, src1, src1 +++splitwb bd, ac, iqprod +++addb dst, ac, bd ++diff -rupN /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8ic_s8ic_multiply_8ic.orc /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8ic_s8ic_multiply_8ic.orc ++--- /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8ic_s8ic_multiply_8ic.orc 1970-01-01 01:00:00.000000000 +0100 +++++ /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8ic_s8ic_multiply_8ic.orc 2014-10-15 01:55:08.000000000 +0200 ++@@ -0,0 +1,58 @@ +++#/*! 
+++# * \file volk_gnsssdr_8ic_s8ic_multiply_8ic.orc +++# * \brief ORC implementation: multiplies a group of 16 bits vectors by one constant vector +++# * \authors
<ul> +++# *
<li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com +++# * </ul>
+++# * +++# * ORC code that multiplies a group of 16 bits vectors +++# * (8 bits the real part and 8 bits the imaginary part) by one constant vector +++# * +++# * ------------------------------------------------------------------------- +++# * +++# * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) +++# * +++# * GNSS-SDR is a software defined Global Navigation +++# * Satellite Systems receiver +++# * +++# * This file is part of GNSS-SDR. +++# * +++# * GNSS-SDR is free software: you can redistribute it and/or modify +++# * it under the terms of the GNU General Public License as published by +++# * the Free Software Foundation, either version 3 of the License, or +++# * (at your option) any later version. +++# * +++# * GNSS-SDR is distributed in the hope that it will be useful, +++# * but WITHOUT ANY WARRANTY; without even the implied warranty of +++# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +++# * GNU General Public License for more details. +++# * +++# * You should have received a copy of the GNU General Public License +++# * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>. 
+++# * +++# * ------------------------------------------------------------------------- +++# */ +++ +++.function volk_gnsssdr_8ic_s8ic_multiply_8ic_a_orc_impl +++.source 2 src1 +++.param 2 src2real +++.param 2 src2imag +++.dest 2 dst +++.temp 2 iqprod +++.temp 1 real +++.temp 1 imag +++.temp 1 rr +++.temp 1 ii +++.temp 1 ri +++.temp 1 ir +++x2 mullb iqprod, src1, src2real +++splitwb ir, rr, iqprod +++x2 mullb iqprod, src1, src2imag +++splitwb ii, ri, iqprod +++subb real, rr, ii +++addb imag, ri, ir +++mergebw dst, real, imag +++ +++ +++ +++ ++diff -rupN /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8ic_x2_dot_prod_8ic.orc /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8ic_x2_dot_prod_8ic.orc ++--- /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8ic_x2_dot_prod_8ic.orc 1970-01-01 01:00:00.000000000 +0100 +++++ /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8ic_x2_dot_prod_8ic.orc 2014-10-15 01:55:08.000000000 +0200 ++@@ -0,0 +1,59 @@ +++#/*! +++# * \file volk_gnsssdr_8ic_x2_dot_prod_8ic.orc +++# * \brief ORC implementation: multiplies two 16 bits vectors and accumulates them +++# * \authors
<ul> +++# *
<li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com +++# * </ul>
+++# * +++# * ORC code that multiplies two 16 bits vectors (8 bits the real part +++# * and 8 bits the imaginary part) and accumulates them +++# * +++# * ------------------------------------------------------------------------- +++# * +++# * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) +++# * +++# * GNSS-SDR is a software defined Global Navigation +++# * Satellite Systems receiver +++# * +++# * This file is part of GNSS-SDR. +++# * +++# * GNSS-SDR is free software: you can redistribute it and/or modify +++# * it under the terms of the GNU General Public License as published by +++# * the Free Software Foundation, either version 3 of the License, or +++# * (at your option) any later version. +++# * +++# * GNSS-SDR is distributed in the hope that it will be useful, +++# * but WITHOUT ANY WARRANTY; without even the implied warranty of +++# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +++# * GNU General Public License for more details. +++# * +++# * You should have received a copy of the GNU General Public License +++# * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>. 
+++# * +++# * ------------------------------------------------------------------------- +++# */ +++ +++.function volk_gnsssdr_8ic_x2_dot_prod_8ic_a_orc_impl +++.source 2 src1 +++.source 2 src2 +++.accumulator 2 accreal +++.accumulator 2 accimag +++.temp 2 iqprod +++.temp 1 real +++.temp 1 imag +++.temp 2 real2 +++.temp 2 imag2 +++.temp 1 ac +++.temp 1 bd +++.temp 2 swapped +++x2 mullb iqprod, src1, src2 +++splitwb bd, ac, iqprod +++subb real, ac, bd +++swapw swapped, src1 +++x2 mullb iqprod, swapped, src2 +++splitwb bd, ac, iqprod +++addb imag, ac, bd +++mergebw real2, 0, real +++accw accreal, real2 +++mergebw imag2, 0, imag +++accw accimag, imag2 ++diff -rupN /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8ic_x2_multiply_8ic.orc /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8ic_x2_multiply_8ic.orc ++--- /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8ic_x2_multiply_8ic.orc 1970-01-01 01:00:00.000000000 +0100 +++++ /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8ic_x2_multiply_8ic.orc 2014-10-15 01:55:08.000000000 +0200 ++@@ -0,0 +1,57 @@ +++#/*! +++# * \file volk_gnsssdr_8ic_x2_multiply_8ic.orc +++# * \brief ORC implementation: multiplies two 16 bits vectors +++# * \authors
<ul> +++# *
<li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com +++# * </ul>
+++# * +++# * ORC code that multiplies two 16 bits vectors (8 bits the real part +++# * and 8 bits the imaginary part) +++# * +++# * ------------------------------------------------------------------------- +++# * +++# * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) +++# * +++# * GNSS-SDR is a software defined Global Navigation +++# * Satellite Systems receiver +++# * +++# * This file is part of GNSS-SDR. +++# * +++# * GNSS-SDR is free software: you can redistribute it and/or modify +++# * it under the terms of the GNU General Public License as published by +++# * the Free Software Foundation, either version 3 of the License, or +++# * (at your option) any later version. +++# * +++# * GNSS-SDR is distributed in the hope that it will be useful, +++# * but WITHOUT ANY WARRANTY; without even the implied warranty of +++# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +++# * GNU General Public License for more details. +++# * +++# * You should have received a copy of the GNU General Public License +++# * along with GNSS-SDR. If not, see <https://www.gnu.org/licenses/>. 
+++# * +++# * ------------------------------------------------------------------------- +++# */ +++ +++.function volk_gnsssdr_8ic_x2_multiply_8ic_a_orc_impl +++.source 2 src1 +++.source 2 src2 +++.dest 2 dst +++.temp 2 iqprod +++.temp 1 real +++.temp 1 imag +++.temp 1 ac +++.temp 1 bd +++.temp 2 swapped +++x2 mullb iqprod, src1, src2 +++splitwb bd, ac, iqprod +++subb real, ac, bd +++swapw swapped, src1 +++x2 mullb iqprod, swapped, src2 +++splitwb bd, ac, iqprod +++addb imag, ac, bd +++mergebw dst, real, imag +++ +++ +++ +++ ++diff -rupN /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3.orc /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3.orc ++--- /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3.orc 1970-01-01 01:00:00.000000000 +0100 +++++ /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3.orc 2014-10-15 01:55:08.000000000 +0200 ++@@ -0,0 +1,139 @@ +++#/*! +++# * \file volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3.orc +++# * \brief ORC implementation: performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation with 16 bits vectors +++# * \authors
<ul> +++# *
<li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com +++# * </ul>
+++# * +++# * ORC code that performs the carrier wipe-off mixing and the +++# * Early, Prompt, and Late correlation with 16 bits vectors (8 bits the +++# * real part and 8 bits the imaginary part): +++# * - The carrier wipe-off is done by multiplying the input signal by the +++# * carrier (multiplication of 16 bits vectors). It returns the input +++# * signal in base band (BB) +++# * - Early values are calculated by multiplying the input signal in BB by the +++# * early code (multiplication of 16 bits vectors), accumulating the results +++# * - Prompt values are calculated by multiplying the input signal in BB by the +++# * prompt code (multiplication of 16 bits vectors), accumulating the results +++# * - Late values are calculated by multiplying the input signal in BB by the +++# * late code (multiplication of 16 bits vectors), accumulating the results +++# * +++# * ------------------------------------------------------------------------- +++# * +++# * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) +++# * +++# * GNSS-SDR is a software defined Global Navigation +++# * Satellite Systems receiver +++# * +++# * This file is part of GNSS-SDR. +++# * +++# * GNSS-SDR is free software: you can redistribute it and/or modify +++# * it under the terms of the GNU General Public License as published by +++# * the Free Software Foundation, either version 3 of the License, or +++# * (at your option) any later version. +++# * +++# * GNSS-SDR is distributed in the hope that it will be useful, +++# * but WITHOUT ANY WARRANTY; without even the implied warranty of +++# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +++# * GNU General Public License for more details. +++# * +++# * You should have received a copy of the GNU General Public License +++# * along with GNSS-SDR. If not, see <https://www.gnu.org/licenses/>. 
+++# * +++# * ------------------------------------------------------------------------- +++# */ +++ +++.function volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_first_a_orc_impl +++.source 2 input +++.source 2 carrier +++.source 2 E_code +++.source 2 P_code +++.accumulator 2 E_out_real +++.accumulator 2 E_out_imag +++.accumulator 2 P_out_real +++.accumulator 2 P_out_imag +++.temp 2 bb_signal_sample +++.temp 2 iqprod +++.temp 1 real +++.temp 1 imag +++.temp 1 ac +++.temp 1 bd +++.temp 2 swapped +++ +++.temp 2 real2 +++.temp 2 imag2 +++ +++x2 mullb iqprod, input, carrier +++splitwb bd, ac, iqprod +++subb real, ac, bd +++swapw swapped, input +++x2 mullb iqprod, swapped, carrier +++splitwb bd, ac, iqprod +++addb imag, ac, bd +++mergebw bb_signal_sample, real, imag +++ +++swapw swapped, bb_signal_sample +++ +++x2 mullb iqprod, bb_signal_sample, E_code +++splitwb bd, ac, iqprod +++subb real, ac, bd +++x2 mullb iqprod, swapped, E_code +++splitwb bd, ac, iqprod +++addb imag, ac, bd +++mergebw real2, 0, real +++mergebw imag2, 0, imag +++accw E_out_real, real2 +++accw E_out_imag, imag2 +++ +++x2 mullb iqprod, bb_signal_sample, P_code +++splitwb bd, ac, iqprod +++subb real, ac, bd +++x2 mullb iqprod, swapped, P_code +++splitwb bd, ac, iqprod +++addb imag, ac, bd +++mergebw real2, 0, real +++mergebw imag2, 0, imag +++accw P_out_real, real2 +++accw P_out_imag, imag2 +++ +++.function volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_second_a_orc_impl +++.source 2 input +++.source 2 carrier +++.source 2 L_code +++.accumulator 2 L_out_real +++.accumulator 2 L_out_imag +++ +++.temp 2 bb_signal_sample +++.temp 2 iqprod +++.temp 1 real +++.temp 1 imag +++.temp 1 ac +++.temp 1 bd +++.temp 2 swapped +++ +++.temp 2 real2 +++.temp 2 imag2 +++ +++x2 mullb iqprod, input, carrier +++splitwb bd, ac, iqprod +++subb real, ac, bd +++swapw swapped, input +++x2 mullb iqprod, swapped, carrier +++splitwb bd, ac, iqprod +++addb imag, ac, bd +++mergebw bb_signal_sample, real, imag +++ +++swapw swapped, bb_signal_sample 
+++ +++x2 mullb iqprod, bb_signal_sample, L_code +++splitwb bd, ac, iqprod +++subb real, ac, bd +++x2 mullb iqprod, swapped, L_code +++splitwb bd, ac, iqprod +++addb imag, ac, bd +++mergebw real2, 0, real +++mergebw imag2, 0, imag +++accw L_out_real, real2 +++accw L_out_imag, imag2 +++ +++ ++diff -rupN /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8u_x2_multiply_8u.orc /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8u_x2_multiply_8u.orc ++--- /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8u_x2_multiply_8u.orc 1970-01-01 01:00:00.000000000 +0100 +++++ /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8u_x2_multiply_8u.orc 2014-10-15 01:55:08.000000000 +0200 ++@@ -0,0 +1,39 @@ +++#/*! +++# * \file volk_gnsssdr_8u_x2_multiply_8u.orc +++# * \brief ORC implementation: multiplies unsigned char values +++# * \authors
<ul> +++# *
<li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com +++# * </ul>
+++# * +++# * ORC code that multiplies unsigned char values (8 bits data) +++# * +++# * ------------------------------------------------------------------------- +++# * +++# * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) +++# * +++# * GNSS-SDR is a software defined Global Navigation +++# * Satellite Systems receiver +++# * +++# * This file is part of GNSS-SDR. +++# * +++# * GNSS-SDR is free software: you can redistribute it and/or modify +++# * it under the terms of the GNU General Public License as published by +++# * the Free Software Foundation, either version 3 of the License, or +++# * (at your option) any later version. +++# * +++# * GNSS-SDR is distributed in the hope that it will be useful, +++# * but WITHOUT ANY WARRANTY; without even the implied warranty of +++# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +++# * GNU General Public License for more details. +++# * +++# * You should have received a copy of the GNU General Public License +++# * along with GNSS-SDR. If not, see <https://www.gnu.org/licenses/>. 
+++# * +++# * ------------------------------------------------------------------------- +++# */ +++ +++.function volk_gnsssdr_8u_x2_multiply_8u_a_orc_impl +++.source 1 src1 +++.source 1 src2 +++.dest 1 dst +++mullb dst, src1, src2 ++diff -rupN /Users/andres/Desktop/volk_gnsssdr/patches for generating volk_gnsssdr/2014-10-17_Patch.patch /Users/andres/Desktop/volk_gnsssdr_original/patches for generating volk_gnsssdr/2014-10-17_Patch.patch ++--- /Users/andres/Desktop/volk_gnsssdr/patches for generating volk_gnsssdr/2014-10-17_Patch.patch 1970-01-01 01:00:00.000000000 +0100 +++++ /Users/andres/Desktop/volk_gnsssdr_original/patches for generating volk_gnsssdr/2014-10-17_Patch.patch 2014-10-17 03:39:01.000000000 +0200 ++@@ -0,0 +1,471 @@ +++Binary files /Users/andres/Desktop/volk_gnsssdr/.DS_Store and /Users/andres/Desktop/volk_gnsssdr_original/.DS_Store differ +++diff -rupN /Users/andres/Desktop/volk_gnsssdr/lib/CMakeLists.txt /Users/andres/Desktop/volk_gnsssdr_original/lib/CMakeLists.txt +++--- /Users/andres/Desktop/volk_gnsssdr/lib/CMakeLists.txt 2014-10-17 03:00:41.000000000 +0200 ++++++ /Users/andres/Desktop/volk_gnsssdr_original/lib/CMakeLists.txt 2014-10-15 01:55:08.000000000 +0200 +++@@ -406,8 +406,10 @@ if(${CMAKE_VERSION} VERSION_GREATER "2.8 +++ # if we find one that matches our current system architecture +++ # set up the assembler flags and include the source files +++ foreach(ARCH ${ASM_ARCHS_AVAILABLE}) ++++ message(STATUS "--==>> -CFLAGS1: ${FULL_C_FLAGS}") +++ string(REGEX MATCH "${ARCH}" ASM_ARCH "${FULL_C_FLAGS}") +++ if( ASM_ARCH STREQUAL "armv7" ) ++++ set(ASM-ATT $ENV{ASM}) +++ message(STATUS "---- Adding ASM files") # we always use ATT syntax +++ message(STATUS "-- Detected armv7 architecture; enabling ASM") +++ # setup architecture specific assembler flags +++@@ -420,20 +422,13 @@ if(${CMAKE_VERSION} VERSION_GREATER "2.8 +++ message(STATUS "Adding source file: ${asm_file}") +++ endforeach(asm_file) +++ endif() +++- enable_language(ASM) +++- 
set(CMAKE_ASM_FLAGS ${ARCH_ASM_FLAGS}) +++- message(STATUS "c flags: ${FULL_C_FLAGS}") +++- message(STATUS "asm flags: ${CMAKE_ASM_FLAGS}") ++++ set(CMAKE_ASM-ATT_FLAGS_INIT ${ARCH_ASM_FLAGS}) ++++ enable_language(ASM-ATT) # this must be after flags_init ++++ message(STATUS "asm flags: ${CMAKE_ASM-ATT_FLAGS}") +++ endforeach(ARCH) +++ +++ else(${CMAKE_VERSION} VERSION_GREATER "2.8.9") +++ message(STATUS "Not enabling ASM support. CMake >= 2.8.10 required.") +++- foreach(machine_name ${available_machines}) +++- string(REGEX MATCH "neon" NEON_MACHINE ${machine_name}) +++- if( NEON_MACHINE STREQUAL "neon") +++- message(FATAL_ERROR "CMake >= 2.8.10 is required for ARM NEON support") +++- endif() +++- endforeach() +++ endif(${CMAKE_VERSION} VERSION_GREATER "2.8.9") +++ +++ ######################################################################## +++@@ -517,11 +512,24 @@ if(MSVC) +++ endif() +++ +++ #create the volk_gnsssdr runtime library +++-add_library(volk_gnsssdr SHARED ${volk_gnsssdr_sources}) ++++ ++++#MODIFICATIONS BY GNSS-SDR ++++file(GLOB orc ${CMAKE_SOURCE_DIR}/orc/*.orc) ++++file(GLOB CommonMacros ${CMAKE_SOURCE_DIR}/kernels/CommonMacros/*.h ${CMAKE_SOURCE_DIR}/kernels/CommonMacros/README.txt) ++++ ++++#add_library(volk_gnsssdr SHARED ${volk_gnsssdr_sources}) ++++add_library(volk_gnsssdr SHARED ${volk_gnsssdr_sources} ${h_files} ${CommonMacros} ${orc}) ++++ ++++source_group("Kernels" FILES ${h_files}) ++++source_group("Common Macros" FILES ${CommonMacros}) ++++source_group("ORC Files" FILES ${orc}) ++++#END OF MODIFICATIONS ++++ +++ target_link_libraries(volk_gnsssdr ${volk_gnsssdr_libraries}) +++ set_target_properties(volk_gnsssdr PROPERTIES SOVERSION ${LIBVER}) +++ set_target_properties(volk_gnsssdr PROPERTIES DEFINE_SYMBOL "volk_gnsssdr_EXPORTS") +++ ++++ +++ install(TARGETS volk_gnsssdr +++ LIBRARY DESTINATION lib${LIB_SUFFIX} COMPONENT "volk_gnsssdr_runtime" # .so file +++ ARCHIVE DESTINATION lib${LIB_SUFFIX} COMPONENT "volk_gnsssdr_devel" # .lib file 
+++diff -rupN /Users/andres/Desktop/volk_gnsssdr/lib/qa_utils.cc /Users/andres/Desktop/volk_gnsssdr_original/lib/qa_utils.cc +++--- /Users/andres/Desktop/volk_gnsssdr/lib/qa_utils.cc 2014-10-17 03:00:41.000000000 +0200 ++++++ /Users/andres/Desktop/volk_gnsssdr_original/lib/qa_utils.cc 2014-10-17 01:54:35.000000000 +0200 +++@@ -5,9 +5,7 @@ +++ #include +++ #include +++ #include +++-#include +++ #include +++-#include +++ #include +++ #include +++ #include +++@@ -217,6 +215,72 @@ inline void run_cast_test3_s32fc(volk_gn +++ while(iter--) func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str()); +++ } +++ ++++//ADDED BY GNSS-SDR. START ++++inline void run_cast_test1_s8i(volk_gnsssdr_fn_1arg_s8i func, std::vector &buffs, char scalar, unsigned int vlen, unsigned int iter, std::string arch) { ++++ while(iter--) func(buffs[0], scalar, vlen, arch.c_str()); ++++} ++++ ++++inline void run_cast_test2_s8i(volk_gnsssdr_fn_2arg_s8i func, std::vector &buffs, char scalar, unsigned int vlen, unsigned int iter, std::string arch) { ++++ while(iter--) func(buffs[0], buffs[1], scalar, vlen, arch.c_str()); ++++} ++++ ++++inline void run_cast_test3_s8i(volk_gnsssdr_fn_3arg_s8i func, std::vector &buffs, char scalar, unsigned int vlen, unsigned int iter, std::string arch) { ++++ while(iter--) func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str()); ++++} ++++ ++++inline void run_cast_test1_s8ic(volk_gnsssdr_fn_1arg_s8ic func, std::vector &buffs, lv_8sc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { ++++ while(iter--) func(buffs[0], scalar, vlen, arch.c_str()); ++++} ++++ ++++inline void run_cast_test2_s8ic(volk_gnsssdr_fn_2arg_s8ic func, std::vector &buffs, lv_8sc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { ++++ while(iter--) func(buffs[0], buffs[1], scalar, vlen, arch.c_str()); ++++} ++++ ++++inline void run_cast_test3_s8ic(volk_gnsssdr_fn_3arg_s8ic func, std::vector &buffs, lv_8sc_t scalar, unsigned int vlen, unsigned int iter, 
std::string arch) { ++++ while(iter--) func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str()); ++++} ++++ ++++inline void run_cast_test8(volk_gnsssdr_fn_8arg func, std::vector &buffs, unsigned int vlen, unsigned int iter, std::string arch) { ++++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], vlen, arch.c_str()); ++++} ++++ ++++inline void run_cast_test8_s8i(volk_gnsssdr_fn_8arg_s8i func, std::vector &buffs, char scalar, unsigned int vlen, unsigned int iter, std::string arch) { ++++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], scalar, vlen, arch.c_str()); ++++} ++++ ++++inline void run_cast_test8_s8ic(volk_gnsssdr_fn_8arg_s8ic func, std::vector &buffs, lv_8sc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { ++++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], scalar, vlen, arch.c_str()); ++++} ++++ ++++inline void run_cast_test8_s32f(volk_gnsssdr_fn_8arg_s32f func, std::vector &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) { ++++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], scalar, vlen, arch.c_str()); ++++} ++++ ++++inline void run_cast_test8_s32fc(volk_gnsssdr_fn_8arg_s32fc func, std::vector &buffs, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { ++++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], scalar, vlen, arch.c_str()); ++++} ++++ ++++inline void run_cast_test12(volk_gnsssdr_fn_12arg func, std::vector &buffs, unsigned int vlen, unsigned int iter, std::string arch) { ++++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], buffs[8], buffs[9], buffs[10], buffs[11], vlen, arch.c_str()); ++++} ++++ ++++inline void run_cast_test12_s8i(volk_gnsssdr_fn_12arg_s8i func, 
std::vector &buffs, char scalar, unsigned int vlen, unsigned int iter, std::string arch) { ++++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], buffs[8], buffs[9], buffs[10], buffs[11], scalar, vlen, arch.c_str()); ++++} ++++ ++++inline void run_cast_test12_s8ic(volk_gnsssdr_fn_12arg_s8ic func, std::vector &buffs, lv_8sc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { ++++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], buffs[8], buffs[9], buffs[10], buffs[11], scalar, vlen, arch.c_str()); ++++} ++++ ++++inline void run_cast_test12_s32f(volk_gnsssdr_fn_12arg_s32f func, std::vector &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) { ++++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], buffs[8], buffs[9], buffs[10], buffs[11], scalar, vlen, arch.c_str()); ++++} ++++ ++++inline void run_cast_test12_s32fc(volk_gnsssdr_fn_12arg_s32fc func, std::vector &buffs, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { ++++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], buffs[8], buffs[9], buffs[10], buffs[11], scalar, vlen, arch.c_str()); ++++} ++++//ADDED BY GNSS-SDR. END ++++ +++ // This function is a nop that helps resolve GNU Radio bugs 582 and 583. +++ // Without this the cast in run_volk_gnsssdr_tests for tol_i = static_cast(float tol) +++ // won't happen on armhf (reported on cortex A9 and A15). 
+++@@ -330,9 +394,9 @@ bool run_volk_gnsssdr_tests(volk_gnsssdr +++ lv_32fc_t scalar, +++ int vlen, +++ int iter, +++- std::vector *results, +++- std::string puppet_master_name, +++- bool benchmark_mode, ++++ std::vector *best_arch_vector = 0, ++++ std::string puppet_master_name = "NULL", ++++ bool benchmark_mode, +++ std::string kernel_regex +++ ) { +++ boost::xpressive::sregex kernel_expression = boost::xpressive::sregex::compile(kernel_regex); +++@@ -340,12 +404,6 @@ bool run_volk_gnsssdr_tests(volk_gnsssdr +++ // in this case we have a regex and are only looking to test one kernel +++ return false; +++ } +++- if(results) { +++- results->push_back(volk_gnsssdr_test_results_t()); +++- results->back().name = name; +++- results->back().vlen = vlen; +++- results->back().iter = iter; +++- } +++ std::cout << "RUN_VOLK_TESTS: " << name << "(" << vlen << "," << iter << ")" << std::endl; +++ +++ // The multiply and lv_force_cast_hf are work arounds for GNU Radio bugs 582 and 583 +++@@ -426,7 +484,17 @@ bool run_volk_gnsssdr_tests(volk_gnsssdr +++ } else { +++ run_cast_test1_s32f((volk_gnsssdr_fn_1arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); +++ } +++- } else throw "unsupported 1 arg function >1 scalars"; ++++ } ++++ //ADDED BY GNSS-SDR. START ++++ else if(inputsc.size() == 1 && !inputsc[0].is_float) { ++++ if(inputsc[0].is_complex) { ++++ run_cast_test1_s8ic((volk_gnsssdr_fn_1arg_s8ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); ++++ } else { ++++ run_cast_test1_s8i((volk_gnsssdr_fn_1arg_s8i)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); ++++ } ++++ } ++++ //ADDED BY GNSS-SDR. 
END ++++ else throw "unsupported 1 arg function >1 scalars"; +++ break; +++ case 2: +++ if(inputsc.size() == 0) { +++@@ -437,7 +505,17 @@ bool run_volk_gnsssdr_tests(volk_gnsssdr +++ } else { +++ run_cast_test2_s32f((volk_gnsssdr_fn_2arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); +++ } +++- } else throw "unsupported 2 arg function >1 scalars"; ++++ } ++++ //ADDED BY GNSS-SDR. START ++++ else if(inputsc.size() == 1 && !inputsc[0].is_float) { ++++ if(inputsc[0].is_complex) { ++++ run_cast_test2_s8ic((volk_gnsssdr_fn_2arg_s8ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); ++++ } else { ++++ run_cast_test2_s8i((volk_gnsssdr_fn_2arg_s8i)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); ++++ } ++++ } ++++ //ADDED BY GNSS-SDR. END ++++ else throw "unsupported 2 arg function >1 scalars"; +++ break; +++ case 3: +++ if(inputsc.size() == 0) { +++@@ -448,11 +526,61 @@ bool run_volk_gnsssdr_tests(volk_gnsssdr +++ } else { +++ run_cast_test3_s32f((volk_gnsssdr_fn_3arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); +++ } +++- } else throw "unsupported 3 arg function >1 scalars"; ++++ } ++++ //ADDED BY GNSS-SDR. START ++++ else if(inputsc.size() == 1 && !inputsc[0].is_float) { ++++ if(inputsc[0].is_complex) { ++++ run_cast_test3_s8ic((volk_gnsssdr_fn_3arg_s8ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); ++++ } else { ++++ run_cast_test3_s8i((volk_gnsssdr_fn_3arg_s8i)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); ++++ } ++++ } ++++ //ADDED BY GNSS-SDR. END ++++ else throw "unsupported 3 arg function >1 scalars"; +++ break; +++ case 4: +++ run_cast_test4((volk_gnsssdr_fn_4arg)(manual_func), test_data[i], vlen, iter, arch_list[i]); +++ break; ++++ //ADDED BY GNSS-SDR. 
START ++++ case 8: ++++ if(inputsc.size() == 0) { ++++ run_cast_test8((volk_gnsssdr_fn_8arg)(manual_func), test_data[i], vlen, iter, arch_list[i]); ++++ } else if(inputsc.size() == 1 && inputsc[0].is_float) { ++++ if(inputsc[0].is_complex) { ++++ run_cast_test8_s32fc((volk_gnsssdr_fn_8arg_s32fc)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); ++++ } else { ++++ run_cast_test8_s32f((volk_gnsssdr_fn_8arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); ++++ } ++++ } ++++ else if(inputsc.size() == 1 && !inputsc[0].is_float) { ++++ if(inputsc[0].is_complex) { ++++ run_cast_test8_s8ic((volk_gnsssdr_fn_8arg_s8ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); ++++ } else { ++++ run_cast_test8_s8i((volk_gnsssdr_fn_8arg_s8i)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); ++++ } ++++ } ++++ else throw "unsupported 8 arg function >1 scalars"; ++++ break; ++++ case 12: ++++ if(inputsc.size() == 0) { ++++ run_cast_test12((volk_gnsssdr_fn_12arg)(manual_func), test_data[i], vlen, iter, arch_list[i]); ++++ } else if(inputsc.size() == 1 && inputsc[0].is_float) { ++++ if(inputsc[0].is_complex) { ++++ run_cast_test12_s32fc((volk_gnsssdr_fn_12arg_s32fc)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); ++++ } else { ++++ run_cast_test12_s32f((volk_gnsssdr_fn_12arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); ++++ } ++++ } ++++ else if(inputsc.size() == 1 && !inputsc[0].is_float) { ++++ if(inputsc[0].is_complex) { ++++ run_cast_test12_s8ic((volk_gnsssdr_fn_12arg_s8ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); ++++ } else { ++++ run_cast_test12_s8i((volk_gnsssdr_fn_12arg_s8i)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); ++++ } ++++ } ++++ else throw "unsupported 12 arg function >1 scalars"; ++++ break; ++++ //ADDED BY GNSS-SDR. 
END +++ default: +++ throw "no function handler for this signature"; +++ break; +++@@ -461,13 +589,6 @@ bool run_volk_gnsssdr_tests(volk_gnsssdr +++ end = clock(); +++ double arch_time = 1000.0 * (double)(end-start)/(double)CLOCKS_PER_SEC; +++ std::cout << arch_list[i] << " completed in " << arch_time << "ms" << std::endl; +++- if(results) { +++- volk_gnsssdr_test_time_t result; +++- result.name = arch_list[i]; +++- result.time = arch_time; +++- result.units = "ms"; +++- results->back().results[result.name] = result; +++- } +++ +++ profile_times.push_back(arch_time); +++ } +++@@ -568,14 +689,13 @@ bool run_volk_gnsssdr_tests(volk_gnsssdr +++ +++ std::cout << "Best aligned arch: " << best_arch_a << std::endl; +++ std::cout << "Best unaligned arch: " << best_arch_u << std::endl; +++- if(results) { ++++ if(best_arch_vector) { +++ if(puppet_master_name == "NULL") { +++- results->back().config_name = name; +++- } else { +++- results->back().config_name = puppet_master_name; ++++ best_arch_vector->push_back(name + " " + best_arch_a + " " + best_arch_u); ++++ } ++++ else { ++++ best_arch_vector->push_back(puppet_master_name + " " + best_arch_a + " " + best_arch_u); +++ } +++- results->back().best_arch_a = best_arch_a; +++- results->back().best_arch_u = best_arch_u; +++ } +++ +++ return fail_global; +++diff -rupN /Users/andres/Desktop/volk_gnsssdr/lib/qa_utils.h /Users/andres/Desktop/volk_gnsssdr_original/lib/qa_utils.h +++--- /Users/andres/Desktop/volk_gnsssdr/lib/qa_utils.h 2014-10-17 03:00:41.000000000 +0200 ++++++ /Users/andres/Desktop/volk_gnsssdr_original/lib/qa_utils.h 2014-10-15 01:55:08.000000000 +0200 +++@@ -3,10 +3,7 @@ +++ +++ #include +++ #include +++-#include +++-#include +++ #include +++-#include +++ #include +++ #include +++ +++@@ -24,46 +21,10 @@ volk_gnsssdr_type_t volk_gnsssdr_type_fr +++ float uniform(void); +++ void random_floats(float *buf, unsigned n); +++ +++-class volk_gnsssdr_test_time_t { +++- public: +++- std::string name; +++- double time; +++- 
std::string units; +++-}; ++++bool run_volk_gnsssdr_tests(volk_gnsssdr_func_desc_t, void(*)(), std::string, float, lv_32fc_t, int, int, std::vector *, std::string, bool benchmark_mode=false, std::string kernel_regex=""); +++ +++-class volk_gnsssdr_test_results_t { +++- public: +++- std::string name; +++- std::string config_name; +++- int vlen; +++- int iter; +++- std::map results; +++- std::string best_arch_a; +++- std::string best_arch_u; +++-}; +++ +++-bool run_volk_gnsssdr_tests( +++- volk_gnsssdr_func_desc_t, +++- void(*)(), +++- std::string, +++- float, +++- lv_32fc_t, +++- int, +++- int, +++- std::vector *results = NULL, +++- std::string puppet_master_name = "NULL", +++- bool benchmark_mode=false, +++- std::string kernel_regex="" +++- ); +++- +++- +++-#define VOLK_RUN_TESTS(func, tol, scalar, len, iter) \ +++- BOOST_AUTO_TEST_CASE(func##_test) { \ +++- BOOST_CHECK_EQUAL(run_volk_gnsssdr_tests( \ +++- func##_get_func_desc(), (void (*)())func##_manual, \ +++- std::string(#func), tol, scalar, len, iter, 0, "NULL"), \ +++- 0); \ +++- } ++++#define VOLK_RUN_TESTS(func, tol, scalar, len, iter) BOOST_AUTO_TEST_CASE(func##_test) { BOOST_CHECK_EQUAL(run_volk_gnsssdr_tests(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), tol, scalar, len, iter, 0, "NULL"), 0); } +++ #define VOLK_PROFILE(func, tol, scalar, len, iter, results, bnmode, kernel_regex) run_volk_gnsssdr_tests(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), tol, scalar, len, iter, results, "NULL", bnmode, kernel_regex) +++ #define VOLK_PUPPET_PROFILE(func, puppet_master_func, tol, scalar, len, iter, results, bnmode, kernel_regex) run_volk_gnsssdr_tests(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), tol, scalar, len, iter, results, std::string(#puppet_master_func), bnmode, kernel_regex) +++ typedef void (*volk_gnsssdr_fn_1arg)(void *, unsigned int, const char*); //one input, operate in place +++@@ -77,4 +38,25 @@ typedef void 
(*volk_gnsssdr_fn_1arg_s32f +++ typedef void (*volk_gnsssdr_fn_2arg_s32fc)(void *, void *, lv_32fc_t, unsigned int, const char*); +++ typedef void (*volk_gnsssdr_fn_3arg_s32fc)(void *, void *, void *, lv_32fc_t, unsigned int, const char*); +++ ++++//ADDED BY GNSS-SDR. START ++++typedef void (*volk_gnsssdr_fn_1arg_s8i)(void *, char, unsigned int, const char*); //one input vector, one scalar char input ++++typedef void (*volk_gnsssdr_fn_2arg_s8i)(void *, void *, char, unsigned int, const char*); ++++typedef void (*volk_gnsssdr_fn_3arg_s8i)(void *, void *, void *, char, unsigned int, const char*); ++++typedef void (*volk_gnsssdr_fn_1arg_s8ic)(void *, lv_8sc_t, unsigned int, const char*); //one input vector, one scalar lv_8sc_t vector input ++++typedef void (*volk_gnsssdr_fn_2arg_s8ic)(void *, void *, lv_8sc_t, unsigned int, const char*); ++++typedef void (*volk_gnsssdr_fn_3arg_s8ic)(void *, void *, void *, lv_8sc_t, unsigned int, const char*); ++++ ++++typedef void (*volk_gnsssdr_fn_8arg)(void *, void *, void *, void *, void *, void *, void *, void *, unsigned int, const char*); ++++typedef void (*volk_gnsssdr_fn_8arg_s32f)(void *, void *, void *, void *, void *, void *, void *, void *, float, unsigned int, const char*); ++++typedef void (*volk_gnsssdr_fn_8arg_s32fc)(void *, void *, void *, void *, void *, void *, void *, void *, lv_32fc_t, unsigned int, const char*); ++++typedef void (*volk_gnsssdr_fn_8arg_s8i)(void *, void *, void *, void *, void *, void *, void *, void *, char, unsigned int, const char*); ++++typedef void (*volk_gnsssdr_fn_8arg_s8ic)(void *, void *, void *, void *, void *, void *, void *, void *, lv_8sc_t, unsigned int, const char*); ++++ ++++typedef void (*volk_gnsssdr_fn_12arg)(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, unsigned int, const char*); ++++typedef void (*volk_gnsssdr_fn_12arg_s32f)(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, float, 
unsigned int, const char*); ++++typedef void (*volk_gnsssdr_fn_12arg_s32fc)(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, lv_32fc_t, unsigned int, const char*); ++++typedef void (*volk_gnsssdr_fn_12arg_s8i)(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, char, unsigned int, const char*); ++++typedef void (*volk_gnsssdr_fn_12arg_s8ic)(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, lv_8sc_t, unsigned int, const char*); ++++//ADDED BY GNSS-SDR. END ++++ +++ #endif //VOLK_QA_UTILS_H +++diff -rupN /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr.tmpl.h /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr.tmpl.h +++--- /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr.tmpl.h 2014-10-17 03:00:41.000000000 +0200 ++++++ /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr.tmpl.h 2014-10-15 01:55:08.000000000 +0200 +++@@ -19,8 +19,8 @@ +++ * Boston, MA 02110-1301, USA. +++ */ +++ +++-#ifndef INCLUDED_VOLK_RUNTIME +++-#define INCLUDED_VOLK_RUNTIME ++++#ifndef INCLUDED_VOLK_GNSSSDR_RUNTIME ++++#define INCLUDED_VOLK_GNSSSDR_RUNTIME +++ +++ #include +++ #include +++diff -rupN /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_config_fixed.tmpl.h /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_config_fixed.tmpl.h +++--- /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_config_fixed.tmpl.h 2014-10-17 03:00:41.000000000 +0200 ++++++ /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_config_fixed.tmpl.h 2014-10-15 01:55:08.000000000 +0200 +++@@ -19,8 +19,8 @@ +++ * Boston, MA 02110-1301, USA. 
+++ */ +++ +++-#ifndef INCLUDED_VOLK_CONFIG_FIXED_H +++-#define INCLUDED_VOLK_CONFIG_FIXED_H ++++#ifndef INCLUDED_VOLK_GNSSSDR_CONFIG_FIXED_H ++++#define INCLUDED_VOLK_GNSSSDR_CONFIG_FIXED_H +++ +++ #for $i, $arch in enumerate($archs) +++ #define LV_$(arch.name.upper()) $i +++diff -rupN /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_cpu.tmpl.h /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_cpu.tmpl.h +++--- /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_cpu.tmpl.h 2014-10-17 03:00:41.000000000 +0200 ++++++ /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_cpu.tmpl.h 2014-10-15 01:55:08.000000000 +0200 +++@@ -19,8 +19,8 @@ +++ * Boston, MA 02110-1301, USA. +++ */ +++ +++-#ifndef INCLUDED_VOLK_CPU_H +++-#define INCLUDED_VOLK_CPU_H ++++#ifndef INCLUDED_VOLK_GNSSSDR_CPU_H ++++#define INCLUDED_VOLK_GNSSSDR_CPU_H +++ +++ #include +++ +++diff -rupN /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_machines.tmpl.h /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_machines.tmpl.h +++--- /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_machines.tmpl.h 2014-10-17 03:00:41.000000000 +0200 ++++++ /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_machines.tmpl.h 2014-10-15 01:55:08.000000000 +0200 +++@@ -19,8 +19,8 @@ +++ * Boston, MA 02110-1301, USA. +++ */ +++ +++-#ifndef INCLUDED_LIBVOLK_MACHINES_H +++-#define INCLUDED_LIBVOLK_MACHINES_H ++++#ifndef INCLUDED_LIBVOLK_GNSSSDR_MACHINES_H ++++#define INCLUDED_LIBVOLK_GNSSSDR_MACHINES_H +++ +++ #include +++ #include +++diff -rupN /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_typedefs.tmpl.h /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_typedefs.tmpl.h +++--- /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_typedefs.tmpl.h 2014-10-17 03:00:41.000000000 +0200 ++++++ /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_typedefs.tmpl.h 2014-10-15 01:55:08.000000000 +0200 +++@@ -19,8 +19,8 @@ +++ * Boston, MA 02110-1301, USA. 
+++ */ +++ +++-#ifndef INCLUDED_VOLK_TYPEDEFS +++-#define INCLUDED_VOLK_TYPEDEFS ++++#ifndef INCLUDED_VOLK_GNSSSDR_TYPEDEFS ++++#define INCLUDED_VOLK_GNSSSDR_TYPEDEFS +++ +++ #include +++ #include ++diff -rupN /Users/andres/Desktop/volk_gnsssdr/patches for generating volk_gnsssdr/2014-10-17_Patch_with_protokernels.patch /Users/andres/Desktop/volk_gnsssdr_original/patches for generating volk_gnsssdr/2014-10-17_Patch_with_protokernels.patch ++--- /Users/andres/Desktop/volk_gnsssdr/patches for generating volk_gnsssdr/2014-10-17_Patch_with_protokernels.patch 1970-01-01 01:00:00.000000000 +0100 +++++ /Users/andres/Desktop/volk_gnsssdr_original/patches for generating volk_gnsssdr/2014-10-17_Patch_with_protokernels.patch 2014-10-17 03:35:38.000000000 +0200 ++@@ -0,0 +1,19299 @@ +++Binary files /Users/andres/Desktop/volk_gnsssdr/.DS_Store and /Users/andres/Desktop/volk_gnsssdr_original/.DS_Store differ +++diff -rupN /Users/andres/Desktop/volk_gnsssdr/apps/volk_gnsssdr_profile.cc /Users/andres/Desktop/volk_gnsssdr_original/apps/volk_gnsssdr_profile.cc +++--- /Users/andres/Desktop/volk_gnsssdr/apps/volk_gnsssdr_profile.cc 2014-10-17 03:00:41.000000000 +0200 ++++++ /Users/andres/Desktop/volk_gnsssdr_original/apps/volk_gnsssdr_profile.cc 2014-10-17 01:45:18.000000000 +0200 +++@@ -37,49 +37,6 @@ +++ +++ namespace fs = boost::filesystem; +++ +++-void write_json(std::ofstream &json_file, std::vector results) { +++- json_file << "{" << std::endl; +++- json_file << " \"volk_gnsssdr_tests\": [" << std::endl; +++- size_t len = results.size(); +++- size_t i = 0; +++- BOOST_FOREACH(volk_gnsssdr_test_results_t &result, results) { +++- json_file << " {" << std::endl; +++- json_file << " \"name\": \"" << result.name << "\"," << std::endl; +++- json_file << " \"vlen\": " << result.vlen << "," << std::endl; +++- json_file << " \"iter\": " << result.iter << "," << std::endl; +++- json_file << " \"best_arch_a\": \"" << result.best_arch_a +++- << "\"," << std::endl; +++- json_file << " 
\"best_arch_u\": \"" << result.best_arch_u +++- << "\"," << std::endl; +++- json_file << " \"results\": {" << std::endl; +++- size_t results_len = result.results.size(); +++- size_t ri = 0; +++- typedef std::pair tpair; +++- BOOST_FOREACH(tpair pair, result.results) { +++- volk_gnsssdr_test_time_t time = pair.second; +++- json_file << " \"" << time.name << "\": {" << std::endl; +++- json_file << " \"name\": \"" << time.name << "\"," << std::endl; +++- json_file << " \"time\": " << time.time << "," << std::endl; +++- json_file << " \"units\": \"" << time.units << "\"" << std::endl; +++- json_file << " }" ; +++- if(ri+1 != results_len) { +++- json_file << ","; +++- } +++- json_file << std::endl; +++- ri++; +++- } +++- json_file << " }" << std::endl; +++- json_file << " }"; +++- if(i+1 != len) { +++- json_file << ","; +++- } +++- json_file << std::endl; +++- i++; +++- } +++- json_file << " ]" << std::endl; +++- json_file << "}" << std::endl; +++-} +++- +++ int main(int argc, char *argv[]) { +++ // Adding program options +++ boost::program_options::options_description desc("Options"); +++@@ -92,9 +49,6 @@ int main(int argc, char *argv[]) { +++ ("tests-regex,R", +++ boost::program_options::value(), +++ "Run tests matching regular expression.") +++- ("json,j", +++- boost::program_options::value(), +++- "JSON output file") +++ ; +++ +++ // Handle the options that were given +++@@ -102,8 +56,6 @@ int main(int argc, char *argv[]) { +++ bool benchmark_mode; +++ std::string kernel_regex; +++ bool store_results = true; +++- std::ofstream json_file; +++- +++ try { +++ boost::program_options::store(boost::program_options::parse_command_line(argc, argv, desc), vm); +++ boost::program_options::notify(vm); +++@@ -131,14 +83,9 @@ int main(int argc, char *argv[]) { +++ return 0; +++ } +++ +++- if ( vm.count("json") ) +++- { +++- json_file.open( vm["json"].as().c_str() ); +++- } +++- +++ +++ // Run tests +++- std::vector results; ++++ std::vector results; +++ +++ 
//VOLK_PROFILE(volk_gnsssdr_16i_x5_add_quad_16i_x4, 1e-4, 2046, 10000, &results, benchmark_mode, kernel_regex); +++ //VOLK_PROFILE(volk_gnsssdr_16i_branch_4_state_8, 1e-4, 2046, 10000, &results, benchmark_mode, kernel_regex); +++@@ -155,6 +102,55 @@ int main(int argc, char *argv[]) { +++ +++ // Until we can update the config on a kernel by kernel basis +++ // do not overwrite volk_gnsssdr_config when using a regex. ++++ ++++ //GNSS-SDR PROTO-KERNELS ++++ //lv_32fc_t sfv = lv_cmake((float)1, (float)2); ++++ //example: VOLK_PROFILE(volk_gnsssdr_8ic_s8ic_multiply_8ic, 1e-4, sfv, 204602, 1000, &results, benchmark_mode, kernel_regex); ++++ ++++ //CAN NOT BE TESTED YET BECAUSE VOLK MODULE DOES NOT SUPPORT IT: ++++ //VOLK_PROFILE(volk_gnsssdr_s32f_x2_update_local_carrier_32fc, 1e-4, 0, 16007, 1, &results, benchmark_mode, kernel_regex); ++++ //VOLK_PROFILE(volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc, 1e-4, 0, 7, 1, &results, benchmark_mode, kernel_regex); ++++ ++++ VOLK_PROFILE(volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex); ++++ VOLK_PROFILE(volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex); ++++ VOLK_PROFILE(volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex); ++++ VOLK_PROFILE(volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex); ++++ ++++ VOLK_PROFILE(volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex); ++++ VOLK_PROFILE(volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex); ++++ VOLK_PROFILE(volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex); ++++ ++++ VOLK_PROFILE(volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex); ++++ 
VOLK_PROFILE(volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex); ++++ VOLK_PROFILE(volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex); ++++ VOLK_PROFILE(volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex); ++++ ++++ VOLK_PROFILE(volk_gnsssdr_32fc_convert_16ic, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex); ++++ VOLK_PROFILE(volk_gnsssdr_32fc_convert_8ic, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex); ++++ VOLK_PROFILE(volk_gnsssdr_32fc_s32f_convert_8ic, 1e-4, 5, 16000, 250, &results, benchmark_mode, kernel_regex); ++++ ++++ /*VOLK_PROFILE(volk_gnsssdr_32f_accumulator_s32f, 1e-4, 0, 204602, 10000, &results, benchmark_mode, kernel_regex); ++++ VOLK_PROFILE(volk_gnsssdr_8i_accumulator_s8i, 1e-4, 0, 204602, 10000, &results, benchmark_mode, kernel_regex); ++++ VOLK_PROFILE(volk_gnsssdr_32f_index_max_16u, 3, 0, 204602, 5000, &results, benchmark_mode, kernel_regex); ++++ VOLK_PROFILE(volk_gnsssdr_8i_index_max_16u, 3, 0, 204602, 5000, &results, benchmark_mode, kernel_regex); ++++ VOLK_PROFILE(volk_gnsssdr_8i_max_s8i, 3, 0, 204602, 5000, &results, benchmark_mode, kernel_regex); ++++ VOLK_PROFILE(volk_gnsssdr_32f_x2_add_32f, 1e-4, 0, 204602, 10000, &results, benchmark_mode, kernel_regex); ++++ VOLK_PROFILE(volk_gnsssdr_8i_x2_add_8i, 1e-4, 0, 204602, 10000, &results, benchmark_mode, kernel_regex); ++++ VOLK_PROFILE(volk_gnsssdr_32fc_conjugate_32fc, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); ++++ VOLK_PROFILE(volk_gnsssdr_8ic_conjugate_8ic, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); ++++ VOLK_PROFILE(volk_gnsssdr_32fc_magnitude_squared_32f, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); ++++ VOLK_PROFILE(volk_gnsssdr_8ic_magnitude_squared_8i, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); ++++ 
VOLK_PROFILE(volk_gnsssdr_32fc_s32fc_multiply_32fc, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); ++++ VOLK_PROFILE(volk_gnsssdr_8ic_s8ic_multiply_8ic, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); ++++ VOLK_PROFILE(volk_gnsssdr_32fc_x2_dot_prod_32fc, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); ++++ VOLK_PROFILE(volk_gnsssdr_8ic_x2_dot_prod_8ic, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); ++++ VOLK_PROFILE(volk_gnsssdr_32fc_x2_multiply_32fc, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); ++++ VOLK_PROFILE(volk_gnsssdr_8ic_x2_multiply_8ic, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); ++++ VOLK_PROFILE(volk_gnsssdr_8u_x2_multiply_8u, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex); ++++ VOLK_PROFILE(volk_gnsssdr_64f_accumulator_64f, 1e-4, 0, 16000, 1000, &results, benchmark_mode, kernel_regex); ++++ VOLK_PROFILE(volk_gnsssdr_32f_s32f_convert_16i, 1e-4, 1, 204602, 250, &results, benchmark_mode, kernel_regex); ++++ VOLK_PROFILE(volk_gnsssdr_16i_s32f_convert_32f, 1e-4, 1, 204602, 250, &results, benchmark_mode, kernel_regex);*/ ++++ +++ if(store_results) { +++ char path[1024]; +++ volk_gnsssdr_get_config_path(path); +++@@ -178,10 +174,8 @@ int main(int argc, char *argv[]) { +++ #the function name is followed by the preferred architecture.\n\ +++ "; +++ +++- BOOST_FOREACH(volk_gnsssdr_test_results_t result, results) { +++- config << result.config_name << " " +++- << result.best_arch_a << " " +++- << result.best_arch_u << std::endl; ++++ BOOST_FOREACH(std::string result, results) { ++++ config << result << std::endl; +++ } +++ config.close(); +++ } +++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/CommonMacros/CommonMacros.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/CommonMacros/CommonMacros.h +++--- /Users/andres/Desktop/volk_gnsssdr/kernels/CommonMacros/CommonMacros.h 1970-01-01 01:00:00.000000000 +0100 ++++++ 
/Users/andres/Desktop/volk_gnsssdr_original/kernels/CommonMacros/CommonMacros.h 2014-10-15 01:55:08.000000000 +0200 +++@@ -0,0 +1,174 @@ ++++/*! ++++ * \file CommonMacros.h ++++ * \brief Common macros used inside the volk protokernels. ++++ * \authors <ul>
    ++++ *
  <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com ++++ * </ul>
++++ * ++++ * ------------------------------------------------------------------------- ++++ * ++++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) ++++ * ++++ * GNSS-SDR is a software defined Global Navigation ++++ * Satellite Systems receiver ++++ * ++++ * This file is part of GNSS-SDR. ++++ * ++++ * GNSS-SDR is free software: you can redistribute it and/or modify ++++ * it under the terms of the GNU General Public License as published by ++++ * the Free Software Foundation, either version 3 of the License, or ++++ * (at your option) any later version. ++++ * ++++ * GNSS-SDR is distributed in the hope that it will be useful, ++++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++++ * GNU General Public License for more details. ++++ * ++++ * You should have received a copy of the GNU General Public License ++++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>. ++++ * ++++ * ------------------------------------------------------------------------- ++++ */ ++++#ifndef INCLUDED_gnsssdr_CommonMacros_u_H ++++#define INCLUDED_gnsssdr_CommonMacros_u_H ++++ ++++ #ifdef LV_HAVE_SSE4_1 ++++ /*! 
++++ \brief Macros for U_SSE4_1 ++++ */ ++++ ++++ #ifndef CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1 ++++ #define CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(input1, input2, real, imag)\ ++++ imag = _mm_srli_si128 (input1, 2);\ ++++ imag = _mm_blend_epi16 (input2, imag, 85);\ ++++ real = _mm_slli_si128 (input2, 2);\ ++++ real = _mm_blend_epi16 (real, input1, 85); ++++ #endif /* CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1 */ ++++ ++++ #ifndef CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1 ++++ #define CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(input, input_i_1, input_i_2, output_i32, output_ps)\ ++++ input_i_1 = _mm_cvtepi16_epi32(input);\ ++++ input = _mm_srli_si128 (input, 8);\ ++++ input_i_2 = _mm_cvtepi16_epi32(input);\ ++++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2);\ ++++ output_ps = _mm_cvtepi32_ps(output_i32); ++++ #endif /* CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1 */ ++++ ++++ #ifndef CM_8IC_CONVERT_AND_ACC_32FC_U_SSE4_1 ++++ #define CM_8IC_CONVERT_AND_ACC_32FC_U_SSE4_1(input, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)\ ++++ input_i_1 = _mm_cvtepi8_epi32(input);\ ++++ input = _mm_srli_si128 (input, 4);\ ++++ input_i_2 = _mm_cvtepi8_epi32(input);\ ++++ input = _mm_srli_si128 (input, 4);\ ++++ output_i32_1 = _mm_add_epi32 (input_i_1, input_i_2);\ ++++ input_i_1 = _mm_cvtepi8_epi32(input);\ ++++ input = _mm_srli_si128 (input, 4);\ ++++ input_i_2 = _mm_cvtepi8_epi32(input);\ ++++ input = _mm_srli_si128 (input, 4);\ ++++ output_i32_2 = _mm_add_epi32 (input_i_1, input_i_2);\ ++++ output_i32 = _mm_add_epi32 (output_i32_1, output_i32_2);\ ++++ output_ps = _mm_cvtepi32_ps(output_i32); ++++ #endif /* CM_8IC_CONVERT_AND_ACC_32FC_U_SSE4_1 */ ++++ ++++ #endif /* LV_HAVE_SSE4_1 */ ++++ ++++ #ifdef LV_HAVE_SSE2 ++++ /*! ++++ \brief Macros for U_SSE2 ++++ */ ++++ ++++ #ifdef LV_HAVE_SSSE3 ++++ /*! 
++++ \brief Macros for U_SSSE3 ++++ */ ++++ ++++ #ifndef CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3 ++++ #define CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3(y, x, check_sign_sequence, rearrange_sequence, y_aux, x_abs, real_output, imag_output)\ ++++ y_aux = _mm_sign_epi8 (y, x);\ ++++ y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence);\ ++++ real_output = _mm_maddubs_epi16 (x_abs, y_aux);\ ++++ \ ++++ y_aux = _mm_shuffle_epi8 (y, rearrange_sequence);\ ++++ y_aux = _mm_sign_epi8 (y_aux, x);\ ++++ imag_output = _mm_maddubs_epi16 (x_abs, y_aux); ++++ #endif /* CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3 */ ++++ ++++ #endif /* LV_HAVE_SSSE3 */ ++++ ++++ #ifndef CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2 ++++ #define CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output)\ ++++ realx_mult_realy = _mm_mullo_epi16 (realx, realy);\ ++++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);\ ++++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);\ ++++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);\ ++++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);\ ++++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++++ #endif /* CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2 */ ++++ ++++ #ifndef CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2 ++++ #define CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(input, mult1, real, imag)\ ++++ imag = _mm_srli_si128 (input, 1);\ ++++ imag = _mm_and_si128 (imag, mult1);\ ++++ real = _mm_and_si128 (input, mult1); ++++ #endif /* CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2 */ ++++ ++++ #ifndef CM_8IC_CONVERT_AND_ACC_32FC_U_SSE2 ++++ #define CM_8IC_CONVERT_AND_ACC_32FC_U_SSE2(input, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)\ ++++ input_i_1 = _mm_unpacklo_epi8(_mm_setzero_si128(), input);\ ++++ input_i_2 = _mm_unpacklo_epi16(_mm_setzero_si128(), input_i_1);\ ++++ input_i_1 = 
_mm_unpackhi_epi16(_mm_setzero_si128(), input_i_1);\ ++++ input_i_1 = _mm_srai_epi32(input_i_1, 24);\ ++++ input_i_2 = _mm_srai_epi32(input_i_2, 24);\ ++++ output_i32 = _mm_add_epi32(input_i_1, input_i_2);\ ++++ output_ps_1 = _mm_cvtepi32_ps(output_i32);\ ++++ \ ++++ input_i_1 = _mm_unpackhi_epi8(_mm_setzero_si128(), input);\ ++++ input_i_2 = _mm_unpacklo_epi16(_mm_setzero_si128(), input_i_1);\ ++++ input_i_1 = _mm_unpackhi_epi16(_mm_setzero_si128(), input_i_1);\ ++++ input_i_1 = _mm_srai_epi32(input_i_1, 24);\ ++++ input_i_2 = _mm_srai_epi32(input_i_2, 24);\ ++++ output_i32 = _mm_add_epi32(input_i_1, input_i_2);\ ++++ output_ps_2 = _mm_cvtepi32_ps(output_i32); ++++ #endif /* CM_8IC_CONVERT_AND_ACC_32FC_U_SSE2 */ ++++ ++++ #ifndef CM_8IC_CONTROLMINUS128_8IC_U_SSE2 ++++ #define CM_8IC_CONTROLMINUS128_8IC_U_SSE2(y, minus128, minus128control)\ ++++ minus128control = _mm_cmpeq_epi8 (y, minus128);\ ++++ y = _mm_sub_epi8 (y, minus128control); ++++ #endif /* CM_8IC_CONTROLMINUS128_8IC_U_SSE2 */ ++++ ++++ #endif /* LV_HAVE_SSE2 */ ++++ ++++ #ifdef LV_HAVE_GENERIC ++++ /*! ++++ \brief Macros for U_GENERIC ++++ */ ++++ ++++ #endif /* LV_HAVE_GENERIC */ ++++#endif /* INCLUDED_gnsssdr_CommonMacros_u_H */ ++++ ++++ ++++#ifndef INCLUDED_gnsssdr_CommonMacros_a_H ++++#define INCLUDED_gnsssdr_CommonMacros_a_H ++++ ++++ #ifdef LV_HAVE_SSE4_1 ++++ /*! ++++ \brief Macros for A_SSE4_1 ++++ */ ++++ ++++ #endif /* LV_HAVE_SSE4_1 */ ++++ ++++ #ifdef LV_HAVE_SSE2 ++++ /*! ++++ \brief Macros for U_SSE2 ++++ */ ++++ ++++ #endif /* LV_HAVE_SSE2 */ ++++ ++++ #ifdef LV_HAVE_GENERIC ++++ /*! 
++++ \brief Macros for A_GENERIC ++++ */ ++++ ++++ #endif /* LV_HAVE_GENERIC */ ++++#endif /* INCLUDED_gnsssdr_CommonMacros_a_H */ +++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/CommonMacros/CommonMacros_16ic_cw_epl_corr_32fc.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/CommonMacros/CommonMacros_16ic_cw_epl_corr_32fc.h +++--- /Users/andres/Desktop/volk_gnsssdr/kernels/CommonMacros/CommonMacros_16ic_cw_epl_corr_32fc.h 1970-01-01 01:00:00.000000000 +0100 ++++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/CommonMacros/CommonMacros_16ic_cw_epl_corr_32fc.h 2014-10-15 01:55:08.000000000 +0200 +++@@ -0,0 +1,76 @@ ++++/*! ++++ * \file CommonMacros_16ic_cw_epl_corr_32fc.h ++++ * \brief Common macros used inside the 16ic_cw_corr_32fc volk protokernels. ++++ * \authors <ul>
    ++++ *
  <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com ++++ * </ul>
++++ * ++++ * ------------------------------------------------------------------------- ++++ * ++++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) ++++ * ++++ * GNSS-SDR is a software defined Global Navigation ++++ * Satellite Systems receiver ++++ * ++++ * This file is part of GNSS-SDR. ++++ * ++++ * GNSS-SDR is free software: you can redistribute it and/or modify ++++ * it under the terms of the GNU General Public License as published by ++++ * the Free Software Foundation, either version 3 of the License, or ++++ * (at your option) any later version. ++++ * ++++ * GNSS-SDR is distributed in the hope that it will be useful, ++++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++++ * GNU General Public License for more details. ++++ * ++++ * You should have received a copy of the GNU General Public License ++++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>. ++++ * ++++ * ------------------------------------------------------------------------- ++++ */ ++++#ifndef INCLUDED_gnsssdr_CommonMacros_16ic_cw_corr_32fc_u_H ++++#define INCLUDED_gnsssdr_CommonMacros_16ic_cw_corr_32fc_u_H ++++#include "CommonMacros/CommonMacros.h" ++++ ++++ #ifdef LV_HAVE_SSE4_1 ++++ /*! 
++++ \brief Macros for U_SSE4_1 ++++ */ ++++ ++++ #ifndef CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1 ++++ #define CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)\ ++++ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy)\ ++++ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(real_bb_signal_sample, imag_bb_signal_sample, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output)\ ++++ CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(real_output, input_i_1, input_i_2, output_i32, real_output_ps)\ ++++ CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(imag_output, input_i_1, input_i_2, output_i32, imag_output_ps) ++++ #endif /* CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1 */ ++++ ++++ #endif /* LV_HAVE_SSE4_1 */ ++++ ++++ #ifdef LV_HAVE_GENERIC ++++ /*! ++++ \brief Macros for U_GENERIC ++++ */ ++++ ++++ #endif /* LV_HAVE_GENERIC */ ++++#endif /* INCLUDED_gnsssdr_CommonMacros_16ic_cw_corr_32fc_u_H */ ++++ ++++ ++++#ifndef INCLUDED_gnsssdr_CommonMacros_16ic_cw_corr_32fc_a_H ++++#define INCLUDED_gnsssdr_CommonMacros_16ic_cw_corr_32fc_a_H ++++ ++++ #ifdef LV_HAVE_SSE4_1 ++++ /*! ++++ \brief Macros for A_SSE4_1 ++++ */ ++++ ++++ #endif /* LV_HAVE_SSE4_1 */ ++++ ++++ #ifdef LV_HAVE_GENERIC ++++ /*! 
++++ \brief Macros for A_GENERIC ++++ */ ++++ ++++ #endif /* LV_HAVE_GENERIC */ ++++#endif /* INCLUDED_gnsssdr_CommonMacros_16ic_cw_corr_32fc_a_H */ +++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h +++--- /Users/andres/Desktop/volk_gnsssdr/kernels/CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h 1970-01-01 01:00:00.000000000 +0100 ++++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h 2014-10-15 01:55:08.000000000 +0200 +++@@ -0,0 +1,114 @@ ++++/*! ++++ * \file CommonMacros_8ic_cw_epl_corr_32fc.h ++++ * \brief Common macros used inside the 8ic_cw_corr_32fc volk protokernels. ++++ * \authors <ul>
    ++++ *
  <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com ++++ * </ul>
++++ * ++++ * ------------------------------------------------------------------------- ++++ * ++++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) ++++ * ++++ * GNSS-SDR is a software defined Global Navigation ++++ * Satellite Systems receiver ++++ * ++++ * This file is part of GNSS-SDR. ++++ * ++++ * GNSS-SDR is free software: you can redistribute it and/or modify ++++ * it under the terms of the GNU General Public License as published by ++++ * the Free Software Foundation, either version 3 of the License, or ++++ * (at your option) any later version. ++++ * ++++ * GNSS-SDR is distributed in the hope that it will be useful, ++++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++++ * GNU General Public License for more details. ++++ * ++++ * You should have received a copy of the GNU General Public License ++++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>. ++++ * ++++ * ------------------------------------------------------------------------- ++++ */ ++++#ifndef INCLUDED_gnsssdr_CommonMacros_8ic_cw_corr_32fc_u_H ++++#define INCLUDED_gnsssdr_CommonMacros_8ic_cw_corr_32fc_u_H ++++#include "CommonMacros/CommonMacros.h" ++++ ++++ #ifdef LV_HAVE_SSE4_1 ++++ /*! 
++++ \brief Macros for U_SSE4_1 ++++ */ ++++ ++++ #ifndef CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1 ++++ #define CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)\ ++++ CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)\ ++++ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(real_bb_signal_sample, imag_bb_signal_sample, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output)\ ++++ \ ++++ imag_output = _mm_slli_si128 (imag_output, 1);\ ++++ output = _mm_blendv_epi8 (imag_output, real_output, mult1);\ ++++ \ ++++ CM_8IC_CONVERT_AND_ACC_32FC_U_SSE4_1(output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps) ++++ #endif /* CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1 */ ++++ ++++ #ifndef CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1 ++++ #define CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)\ ++++ CM_8IC_CONTROLMINUS128_8IC_U_SSE2(y, minus128, minus128control)\ ++++ CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output)\ ++++ CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(real_output, input_i_1, input_i_2, output_i32, real_output_ps)\ ++++ CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(imag_output, input_i_1, input_i_2, output_i32, imag_output_ps) ++++ #endif /* CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1 */ ++++ ++++ #ifndef CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1 ++++ #define CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, 
rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)\ ++++ CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output)\ ++++ CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(real_output, input_i_1, input_i_2, output_i32, real_output_ps)\ ++++ CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(imag_output, input_i_1, input_i_2, output_i32, imag_output_ps) ++++ #endif /* CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1 */ ++++ ++++ #endif /* LV_HAVE_SSE4_1 */ ++++ ++++ #ifdef LV_HAVE_SSE2 ++++ /*! ++++ \brief Macros for U_SSE2 ++++ */ ++++ ++++ #ifndef CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2 ++++ #define CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)\ ++++ CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)\ ++++ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(real_bb_signal_sample, imag_bb_signal_sample, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output)\ ++++ \ ++++ real_output = _mm_and_si128 (real_output, mult1);\ ++++ imag_output = _mm_and_si128 (imag_output, mult1);\ ++++ imag_output = _mm_slli_si128 (imag_output, 1);\ ++++ output = _mm_or_si128 (real_output, imag_output);\ ++++ \ ++++ CM_8IC_CONVERT_AND_ACC_32FC_U_SSE2(output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2) ++++ #endif /* CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2 */ ++++ ++++ #endif /* LV_HAVE_SSE2 */ ++++ ++++ #ifdef LV_HAVE_GENERIC ++++ /*! 
++++ \brief Macros for U_GENERIC ++++ */ ++++ ++++ #endif /* LV_HAVE_GENERIC */ ++++#endif /* INCLUDED_gnsssdr_CommonMacros_8ic_cw_corr_32fc_u_H */ ++++ ++++ ++++#ifndef INCLUDED_gnsssdr_CommonMacros_8ic_cw_corr_32fc_a_H ++++#define INCLUDED_gnsssdr_CommonMacros_8ic_cw_corr_32fc_a_H ++++ ++++ #ifdef LV_HAVE_SSE4_1 ++++ /*! ++++ \brief Macros for A_SSE4_1 ++++ */ ++++ ++++ #endif /* LV_HAVE_SSE4_1 */ ++++ ++++ #ifdef LV_HAVE_GENERIC ++++ /*! ++++ \brief Macros for A_GENERIC ++++ */ ++++ ++++ #endif /* LV_HAVE_GENERIC */ ++++#endif /* INCLUDED_gnsssdr_CommonMacros_8ic_cw_corr_32fc_a_H */ +++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/CommonMacros/README.txt /Users/andres/Desktop/volk_gnsssdr_original/kernels/CommonMacros/README.txt +++--- /Users/andres/Desktop/volk_gnsssdr/kernels/CommonMacros/README.txt 1970-01-01 01:00:00.000000000 +0100 ++++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/CommonMacros/README.txt 2014-10-15 01:55:08.000000000 +0200 +++@@ -0,0 +1,34 @@ ++++#################################################################### ++++Common Macros inside volk_gnsssdr module ++++#################################################################### ++++ ++++First of all, sorry for making you need to read this: macros are evil, they can not be debugged, you do not know where the errors come from, syntax is annoying.. BUT this is the only way I found that allows to share one piece of code between various proto-kernels without performance penalties. ++++Inline functions have been tested, and they introduce a really small time penalty, but it becomes huge because of long loops, with thousands of samples. ++++ ++++#################################################################### ++++Syntax ++++#################################################################### ++++ ++++In order to allow better understanding of the code I created the macros with an specific syntax. ++++ ++++1) Inside CommonMacros.h you will find macros for common operations. 
I will explain the syntax with an example: ++++ ++++example: CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output) ++++ ++++First of all, you find the characters “CM”, which means CommonMacros. After that come the type and the number of inputs: “_16IC_X4” (16 bits complex integers, four inputs). The syntax for the type is the same as the one used with volk protokernels; refer to the GNU Radio documentation for more help. Then comes the name of the macro (“_SCALAR_PRODUCT”), and after that the type and the number of outputs (“_16IC_X2”). Finally comes the minimum SSE version needed to run (“_U_SSE2”). In the arguments you will find (from left to right) the inputs (four inputs: realx, imagx, realy, imagy), some variables that the macro needs to work (realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy) and finally the outputs (two outputs: real_output, imag_output). ++++The variables that the macro needs are specified when calling it in order to avoid problems after compilation: if you want to use a macro you will need to declare beforehand all the variables it needs, or you will not be able to compile. ++++ ++++2) Inside all the other headers, CommonMacros_XXXXXX.h, you will find macros for a specific group of proto-kernels. The syntax is the same as in CommonMacros.h. ++++ ++++#################################################################### ++++Workflow ++++#################################################################### ++++ ++++In order to use the macros easily, I usually test the code without macros inside a testing proto-kernel, where you are able to test it, debug it and use breakpoints. ++++When it works I place the code inside a macro and I test it again. 
++++ ++++#################################################################### ++++Why macros ++++#################################################################### ++++1) They are the only way I could find for sharing code between proto-kernels without performance penalty. ++++2) It is true that they are really difficult to debug, but if you work with them responsibly it is not so hard. Volk_gnsssdr checks the results of all the SSE proto-kernel implementations against the results of the generic implementation, so if your macro is not working you will notice it after profiling it. +++\ No newline at end of file +++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16i_s32f_convert_32f.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_16i_s32f_convert_32f.h +++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16i_s32f_convert_32f.h 1970-01-01 01:00:00.000000000 +0100 ++++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_16i_s32f_convert_32f.h 2014-10-15 01:55:08.000000000 +0200 +++@@ -0,0 +1,241 @@ ++++#ifndef INCLUDED_volk_gnsssdr_16i_s32f_convert_32f_u_H ++++#define INCLUDED_volk_gnsssdr_16i_s32f_convert_32f_u_H ++++ ++++#include ++++#include ++++ ++++#ifdef LV_HAVE_SSE4_1 ++++#include ++++ ++++ /*! 
++++ \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value ++++ \param inputVector The 16 bit input data buffer ++++ \param outputVector The floating point output data buffer ++++ \param scalar The value divided against each point in the output buffer ++++ \param num_points The number of data values to be converted ++++ \note Output buffer does NOT need to be properly aligned ++++ */ ++++static inline void volk_gnsssdr_16i_s32f_convert_32f_u_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ ++++ unsigned int number = 0; ++++ const unsigned int eighthPoints = num_points / 8; ++++ ++++ float* outputVectorPtr = outputVector; ++++ __m128 invScalar = _mm_set_ps1(1.0/scalar); ++++ int16_t* inputPtr = (int16_t*)inputVector; ++++ __m128i inputVal; ++++ __m128i inputVal2; ++++ __m128 ret; ++++ ++++ for(;number < eighthPoints; number++){ ++++ ++++ // Load the 8 values ++++ inputVal = _mm_loadu_si128((__m128i*)inputPtr); ++++ ++++ // Shift the input data to the right by 64 bits ( 8 bytes ) ++++ inputVal2 = _mm_srli_si128(inputVal, 8); ++++ ++++ // Convert the lower 4 values into 32 bit words ++++ inputVal = _mm_cvtepi16_epi32(inputVal); ++++ inputVal2 = _mm_cvtepi16_epi32(inputVal2); ++++ ++++ ret = _mm_cvtepi32_ps(inputVal); ++++ ret = _mm_mul_ps(ret, invScalar); ++++ _mm_storeu_ps(outputVectorPtr, ret); ++++ outputVectorPtr += 4; ++++ ++++ ret = _mm_cvtepi32_ps(inputVal2); ++++ ret = _mm_mul_ps(ret, invScalar); ++++ _mm_storeu_ps(outputVectorPtr, ret); ++++ ++++ outputVectorPtr += 4; ++++ ++++ inputPtr += 8; ++++ } ++++ ++++ number = eighthPoints * 8; ++++ for(; number < num_points; number++){ ++++ outputVector[number] =((float)(inputVector[number])) / scalar; ++++ } ++++} ++++#endif /* LV_HAVE_SSE4_1 */ ++++ ++++#ifdef LV_HAVE_SSE ++++#include ++++ ++++ /*! 
++++ \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value ++++ \param inputVector The 16 bit input data buffer ++++ \param outputVector The floating point output data buffer ++++ \param scalar The value divided against each point in the output buffer ++++ \param num_points The number of data values to be converted ++++ \note Output buffer does NOT need to be properly aligned ++++ */ ++++static inline void volk_gnsssdr_16i_s32f_convert_32f_u_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ ++++ unsigned int number = 0; ++++ const unsigned int quarterPoints = num_points / 4; ++++ ++++ float* outputVectorPtr = outputVector; ++++ __m128 invScalar = _mm_set_ps1(1.0/scalar); ++++ int16_t* inputPtr = (int16_t*)inputVector; ++++ __m128 ret; ++++ ++++ for(;number < quarterPoints; number++){ ++++ ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0])); ++++ ++++ ret = _mm_mul_ps(ret, invScalar); ++++ _mm_storeu_ps(outputVectorPtr, ret); ++++ ++++ inputPtr += 4; ++++ outputVectorPtr += 4; ++++ } ++++ ++++ number = quarterPoints * 4; ++++ for(; number < num_points; number++){ ++++ outputVector[number] = (float)(inputVector[number]) / scalar; ++++ } ++++} ++++#endif /* LV_HAVE_SSE */ ++++ ++++#ifdef LV_HAVE_GENERIC ++++ /*! 
++++ \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value ++++ \param inputVector The 16 bit input data buffer ++++ \param outputVector The floating point output data buffer ++++ \param scalar The value divided against each point in the output buffer ++++ \param num_points The number of data values to be converted ++++ \note Output buffer does NOT need to be properly aligned ++++ */ ++++static inline void volk_gnsssdr_16i_s32f_convert_32f_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ ++++ float* outputVectorPtr = outputVector; ++++ const int16_t* inputVectorPtr = inputVector; ++++ unsigned int number = 0; ++++ ++++ for(number = 0; number < num_points; number++){ ++++ *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar; ++++ } ++++} ++++#endif /* LV_HAVE_GENERIC */ ++++ ++++ ++++ ++++ ++++#endif /* INCLUDED_volk_gnsssdr_16i_s32f_convert_32f_u_H */ ++++#ifndef INCLUDED_volk_gnsssdr_16i_s32f_convert_32f_a_H ++++#define INCLUDED_volk_gnsssdr_16i_s32f_convert_32f_a_H ++++ ++++#include ++++#include ++++ ++++#ifdef LV_HAVE_SSE4_1 ++++#include ++++ ++++ /*! 
++++ \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value ++++ \param inputVector The 16 bit input data buffer ++++ \param outputVector The floating point output data buffer ++++ \param scalar The value divided against each point in the output buffer ++++ \param num_points The number of data values to be converted ++++ */ ++++static inline void volk_gnsssdr_16i_s32f_convert_32f_a_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ ++++ unsigned int number = 0; ++++ const unsigned int eighthPoints = num_points / 8; ++++ ++++ float* outputVectorPtr = outputVector; ++++ __m128 invScalar = _mm_set_ps1(1.0/scalar); ++++ int16_t* inputPtr = (int16_t*)inputVector; ++++ __m128i inputVal; ++++ __m128i inputVal2; ++++ __m128 ret; ++++ ++++ for(;number < eighthPoints; number++){ ++++ ++++ // Load the 8 values ++++ inputVal = _mm_loadu_si128((__m128i*)inputPtr); ++++ ++++ // Shift the input data to the right by 64 bits ( 8 bytes ) ++++ inputVal2 = _mm_srli_si128(inputVal, 8); ++++ ++++ // Convert the lower 4 values into 32 bit words ++++ inputVal = _mm_cvtepi16_epi32(inputVal); ++++ inputVal2 = _mm_cvtepi16_epi32(inputVal2); ++++ ++++ ret = _mm_cvtepi32_ps(inputVal); ++++ ret = _mm_mul_ps(ret, invScalar); ++++ _mm_storeu_ps(outputVectorPtr, ret); ++++ outputVectorPtr += 4; ++++ ++++ ret = _mm_cvtepi32_ps(inputVal2); ++++ ret = _mm_mul_ps(ret, invScalar); ++++ _mm_storeu_ps(outputVectorPtr, ret); ++++ ++++ outputVectorPtr += 4; ++++ ++++ inputPtr += 8; ++++ } ++++ ++++ number = eighthPoints * 8; ++++ for(; number < num_points; number++){ ++++ outputVector[number] =((float)(inputVector[number])) / scalar; ++++ } ++++} ++++#endif /* LV_HAVE_SSE4_1 */ ++++ ++++#ifdef LV_HAVE_SSE ++++#include ++++ ++++ /*! 
++++ \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value ++++ \param inputVector The 16 bit input data buffer ++++ \param outputVector The floating point output data buffer ++++ \param scalar The value divided against each point in the output buffer ++++ \param num_points The number of data values to be converted ++++ */ ++++static inline void volk_gnsssdr_16i_s32f_convert_32f_a_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ ++++ unsigned int number = 0; ++++ const unsigned int quarterPoints = num_points / 4; ++++ ++++ float* outputVectorPtr = outputVector; ++++ __m128 invScalar = _mm_set_ps1(1.0/scalar); ++++ int16_t* inputPtr = (int16_t*)inputVector; ++++ __m128 ret; ++++ ++++ for(;number < quarterPoints; number++){ ++++ ret = _mm_set_ps((float)(inputPtr[3]), (float)(inputPtr[2]), (float)(inputPtr[1]), (float)(inputPtr[0])); ++++ ++++ ret = _mm_mul_ps(ret, invScalar); ++++ _mm_storeu_ps(outputVectorPtr, ret); ++++ ++++ inputPtr += 4; ++++ outputVectorPtr += 4; ++++ } ++++ ++++ number = quarterPoints * 4; ++++ for(; number < num_points; number++){ ++++ outputVector[number] = (float)(inputVector[number]) / scalar; ++++ } ++++} ++++#endif /* LV_HAVE_SSE */ ++++ ++++#ifdef LV_HAVE_GENERIC ++++ /*! 
++++ \brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value ++++ \param inputVector The 16 bit input data buffer ++++ \param outputVector The floating point output data buffer ++++ \param scalar The value divided against each point in the output buffer ++++ \param num_points The number of data values to be converted ++++ */ ++++static inline void volk_gnsssdr_16i_s32f_convert_32f_a_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){ ++++ float* outputVectorPtr = outputVector; ++++ const int16_t* inputVectorPtr = inputVector; ++++ unsigned int number = 0; ++++ ++++ for(number = 0; number < num_points; number++){ ++++ *outputVectorPtr++ = ((float)(*inputVectorPtr++)) / scalar; ++++ } ++++} ++++#endif /* LV_HAVE_GENERIC */ ++++ ++++ ++++ ++++ ++++#endif /* INCLUDED_volk_gnsssdr_16i_s32f_convert_32f_a_H */ +++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3.h +++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3.h 1970-01-01 01:00:00.000000000 +0100 ++++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3.h 2014-10-15 01:55:08.000000000 +0200 +++@@ -0,0 +1,461 @@ ++++/*! ++++ * \file volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3.h ++++ * \brief Volk protokernel: performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation with 32 bits vectors ++++ * \authors
    ++++ *
  • Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com ++++ *
++++ * ++++ * Volk protokernel that performs the carrier wipe-off mixing and the ++++ * Early, Prompt, and Late correlation with 32 bits vectors (16 bits the ++++ * real part and 16 bits the imaginary part): ++++ * - The carrier wipe-off is done by multiplying the input signal by the ++++ * carrier (multiplication of 32 bits vectors). It returns the input ++++ * signal in base band (BB) ++++ * - Early values are calculated by multiplying the input signal in BB by the ++++ * early code (multiplication of 32 bits vectors), accumulating the results ++++ * - Prompt values are calculated by multiplying the input signal in BB by the ++++ * prompt code (multiplication of 32 bits vectors), accumulating the results ++++ * - Late values are calculated by multiplying the input signal in BB by the ++++ * late code (multiplication of 32 bits vectors), accumulating the results ++++ * ++++ * ------------------------------------------------------------------------- ++++ * ++++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) ++++ * ++++ * GNSS-SDR is a software defined Global Navigation ++++ * Satellite Systems receiver ++++ * ++++ * This file is part of GNSS-SDR. ++++ * ++++ * GNSS-SDR is free software: you can redistribute it and/or modify ++++ * it under the terms of the GNU General Public License as published by ++++ * the Free Software Foundation, either version 3 of the License, or ++++ * (at your option) any later version. ++++ * ++++ * GNSS-SDR is distributed in the hope that it will be useful, ++++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++++ * GNU General Public License for more details. ++++ * ++++ * You should have received a copy of the GNU General Public License ++++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>. 
++++ * ++++ * ------------------------------------------------------------------------- ++++ */ ++++ ++++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_u_H ++++#define INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_u_H ++++ ++++#include ++++#include ++++#include ++++#include ++++#include ++++ ++++#ifdef LV_HAVE_SSE4_1 ++++#include "smmintrin.h" ++++#include "CommonMacros/CommonMacros_16ic_cw_epl_corr_32fc.h" ++++#include "CommonMacros/CommonMacros.h" ++++ /*! ++++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation ++++ \param input The input signal input ++++ \param carrier The carrier signal input ++++ \param E_code Early PRN code replica input ++++ \param P_code Early PRN code replica input ++++ \param L_code Early PRN code replica input ++++ \param E_out Early correlation output ++++ \param P_out Early correlation output ++++ \param L_out Early correlation output ++++ \param num_points The number of complex values in vectors ++++ */ ++++static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_u_sse4_1(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points) ++++{ ++++ const unsigned int sse_iters = num_points / 8; ++++ ++++ __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample; ++++ __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output; ++++ ++++ __m128 real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc; ++++ __m128i input_i_1, input_i_2, output_i32; ++++ __m128 real_output_ps, imag_output_ps; ++++ ++++ float E_out_real = 0; ++++ float E_out_imag = 0; ++++ float P_out_real = 0; ++++ float P_out_imag = 0; ++++ float L_out_real = 0; ++++ float L_out_imag = 0; ++++ ++++ const lv_16sc_t* input_ptr = 
input; ++++ const lv_16sc_t* carrier_ptr = carrier; ++++ ++++ const lv_16sc_t* E_code_ptr = E_code; ++++ lv_32fc_t* E_out_ptr = E_out; ++++ const lv_16sc_t* L_code_ptr = L_code; ++++ lv_32fc_t* L_out_ptr = L_out; ++++ const lv_16sc_t* P_code_ptr = P_code; ++++ lv_32fc_t* P_out_ptr = P_out; ++++ ++++ *E_out_ptr = 0; ++++ *P_out_ptr = 0; ++++ *L_out_ptr = 0; ++++ ++++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); ++++ ++++ real_E_code_acc = _mm_setzero_ps(); ++++ imag_E_code_acc = _mm_setzero_ps(); ++++ real_P_code_acc = _mm_setzero_ps(); ++++ imag_P_code_acc = _mm_setzero_ps(); ++++ real_L_code_acc = _mm_setzero_ps(); ++++ imag_L_code_acc = _mm_setzero_ps(); ++++ ++++ if (sse_iters>0) ++++ { ++++ for(int number = 0;number < sse_iters; number++){ ++++ ++++ //Perform the carrier wipe-off ++++ x1 = _mm_lddqu_si128((__m128i*)input_ptr); ++++ input_ptr += 4; ++++ x2 = _mm_lddqu_si128((__m128i*)input_ptr); ++++ ++++ y1 = _mm_lddqu_si128((__m128i*)carrier_ptr); ++++ carrier_ptr += 4; ++++ y2 = _mm_lddqu_si128((__m128i*)carrier_ptr); ++++ ++++ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(x1, x2, realx, imagx) ++++ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy) ++++ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample) ++++ ++++ //Get early values ++++ y1 = _mm_lddqu_si128((__m128i*)E_code_ptr); ++++ E_code_ptr += 4; ++++ y2 = _mm_lddqu_si128((__m128i*)E_code_ptr); ++++ ++++ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) ++++ ++++ //Adds the float 32 results ++++ real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps); ++++ 
imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps); ++++ ++++ //Get prompt values ++++ y1 = _mm_lddqu_si128((__m128i*)P_code_ptr); ++++ P_code_ptr += 4; ++++ y2 = _mm_lddqu_si128((__m128i*)P_code_ptr); ++++ ++++ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) ++++ ++++ real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps); ++++ imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps); ++++ ++++ //Get late values ++++ y1 = _mm_lddqu_si128((__m128i*)L_code_ptr); ++++ L_code_ptr += 4; ++++ y2 = _mm_lddqu_si128((__m128i*)L_code_ptr); ++++ ++++ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) ++++ ++++ real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps); ++++ imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps); ++++ ++++ input_ptr += 4; ++++ carrier_ptr += 4; ++++ E_code_ptr += 4; ++++ P_code_ptr += 4; ++++ L_code_ptr += 4; ++++ } ++++ ++++ __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4]; ++++ ++++ _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector ++++ _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector ++++ 
_mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector ++++ _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector ++++ _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector ++++ _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector ++++ ++++ for (int i = 0; i<4; ++i) ++++ { ++++ E_out_real += real_E_dotProductVector[i]; ++++ E_out_imag += imag_E_dotProductVector[i]; ++++ P_out_real += real_P_dotProductVector[i]; ++++ P_out_imag += imag_P_dotProductVector[i]; ++++ L_out_real += real_L_dotProductVector[i]; ++++ L_out_imag += imag_L_dotProductVector[i]; ++++ } ++++ *E_out_ptr = lv_cmake(E_out_real, E_out_imag); ++++ *P_out_ptr = lv_cmake(P_out_real, P_out_imag); ++++ *L_out_ptr = lv_cmake(L_out_real, L_out_imag); ++++ } ++++ ++++ lv_16sc_t bb_signal_sample; ++++ for(int i=0; i < num_points%8; ++i) ++++ { ++++ //Perform the carrier wipe-off ++++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); ++++ // Now get early, late, and prompt values for each ++++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); ++++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); ++++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); ++++ } ++++ ++++} ++++#endif /* LV_HAVE_SSE4_1 */ ++++ ++++#ifdef LV_HAVE_GENERIC ++++/*! 
++++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation ++++ \param input The input signal input ++++ \param carrier The carrier signal input ++++ \param E_code Early PRN code replica input ++++ \param P_code Early PRN code replica input ++++ \param L_code Early PRN code replica input ++++ \param E_out Early correlation output ++++ \param P_out Early correlation output ++++ \param L_out Early correlation output ++++ \param num_points The number of complex values in vectors ++++ */ ++++static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_generic(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points) ++++{ ++++ lv_16sc_t bb_signal_sample; ++++ lv_16sc_t tmp1; ++++ lv_16sc_t tmp2; ++++ lv_16sc_t tmp3; ++++ ++++ bb_signal_sample = lv_cmake(0, 0); ++++ ++++ *E_out = 0; ++++ *P_out = 0; ++++ *L_out = 0; ++++ // perform Early, Prompt and Late correlation ++++ ++++ for(int i=0; i < num_points; ++i) ++++ { ++++ //Perform the carrier wipe-off ++++ bb_signal_sample = input[i] * carrier[i]; ++++ ++++ tmp1 = bb_signal_sample * E_code[i]; ++++ tmp2 = bb_signal_sample * P_code[i]; ++++ tmp3 = bb_signal_sample * L_code[i]; ++++ ++++ // Now get early, late, and prompt values for each ++++ *E_out += (lv_32fc_t)tmp1; ++++ *P_out += (lv_32fc_t)tmp2; ++++ *L_out += (lv_32fc_t)tmp3; ++++ } ++++} ++++#endif /* LV_HAVE_GENERIC */ ++++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_u_H */ ++++ ++++ ++++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_a_H ++++#define INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_a_H ++++ ++++#include ++++#include ++++#include ++++#include ++++#include ++++ ++++#ifdef LV_HAVE_SSE4_1 ++++#include "smmintrin.h" ++++#include "CommonMacros/CommonMacros_16ic_cw_epl_corr_32fc.h" ++++#include "CommonMacros/CommonMacros.h" 
++++/*! ++++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation ++++ \param input The input signal input ++++ \param carrier The carrier signal input ++++ \param E_code Early PRN code replica input ++++ \param P_code Early PRN code replica input ++++ \param L_code Early PRN code replica input ++++ \param E_out Early correlation output ++++ \param P_out Early correlation output ++++ \param L_out Early correlation output ++++ \param num_points The number of complex values in vectors ++++ */ ++++static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_a_sse4_1(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points) ++++{ ++++ const unsigned int sse_iters = num_points / 8; ++++ ++++ __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample; ++++ __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output; ++++ ++++ __m128 real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc; ++++ __m128i input_i_1, input_i_2, output_i32; ++++ __m128 real_output_ps, imag_output_ps; ++++ ++++ float E_out_real = 0; ++++ float E_out_imag = 0; ++++ float P_out_real = 0; ++++ float P_out_imag = 0; ++++ float L_out_real = 0; ++++ float L_out_imag = 0; ++++ ++++ const lv_16sc_t* input_ptr = input; ++++ const lv_16sc_t* carrier_ptr = carrier; ++++ ++++ const lv_16sc_t* E_code_ptr = E_code; ++++ lv_32fc_t* E_out_ptr = E_out; ++++ const lv_16sc_t* L_code_ptr = L_code; ++++ lv_32fc_t* L_out_ptr = L_out; ++++ const lv_16sc_t* P_code_ptr = P_code; ++++ lv_32fc_t* P_out_ptr = P_out; ++++ ++++ *E_out_ptr = 0; ++++ *P_out_ptr = 0; ++++ *L_out_ptr = 0; ++++ ++++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); ++++ ++++ real_E_code_acc = 
_mm_setzero_ps(); ++++ imag_E_code_acc = _mm_setzero_ps(); ++++ real_P_code_acc = _mm_setzero_ps(); ++++ imag_P_code_acc = _mm_setzero_ps(); ++++ real_L_code_acc = _mm_setzero_ps(); ++++ imag_L_code_acc = _mm_setzero_ps(); ++++ ++++ if (sse_iters>0) ++++ { ++++ for(int number = 0;number < sse_iters; number++){ ++++ ++++ //Perform the carrier wipe-off ++++ x1 = _mm_load_si128((__m128i*)input_ptr); ++++ input_ptr += 4; ++++ x2 = _mm_load_si128((__m128i*)input_ptr); ++++ ++++ y1 = _mm_load_si128((__m128i*)carrier_ptr); ++++ carrier_ptr += 4; ++++ y2 = _mm_load_si128((__m128i*)carrier_ptr); ++++ ++++ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(x1, x2, realx, imagx) ++++ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy) ++++ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample) ++++ ++++ //Get early values ++++ y1 = _mm_load_si128((__m128i*)E_code_ptr); ++++ E_code_ptr += 4; ++++ y2 = _mm_load_si128((__m128i*)E_code_ptr); ++++ ++++ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) ++++ ++++ //Adds the float 32 results ++++ real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps); ++++ imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps); ++++ ++++ //Get prompt values ++++ y1 = _mm_load_si128((__m128i*)P_code_ptr); ++++ P_code_ptr += 4; ++++ y2 = _mm_load_si128((__m128i*)P_code_ptr); ++++ ++++ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) ++++ ++++ 
real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps); ++++ imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps); ++++ ++++ //Get late values ++++ y1 = _mm_load_si128((__m128i*)L_code_ptr); ++++ L_code_ptr += 4; ++++ y2 = _mm_load_si128((__m128i*)L_code_ptr); ++++ ++++ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) ++++ ++++ real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps); ++++ imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps); ++++ ++++ input_ptr += 4; ++++ carrier_ptr += 4; ++++ E_code_ptr += 4; ++++ P_code_ptr += 4; ++++ L_code_ptr += 4; ++++ } ++++ ++++ __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4]; ++++ ++++ _mm_store_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector ++++ _mm_store_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector ++++ _mm_store_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector ++++ _mm_store_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector ++++ _mm_store_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector ++++ _mm_store_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector ++++ ++++ for (int i = 0; i<4; ++i) ++++ { ++++ E_out_real 
+= real_E_dotProductVector[i]; ++++ E_out_imag += imag_E_dotProductVector[i]; ++++ P_out_real += real_P_dotProductVector[i]; ++++ P_out_imag += imag_P_dotProductVector[i]; ++++ L_out_real += real_L_dotProductVector[i]; ++++ L_out_imag += imag_L_dotProductVector[i]; ++++ } ++++ *E_out_ptr = lv_cmake(E_out_real, E_out_imag); ++++ *P_out_ptr = lv_cmake(P_out_real, P_out_imag); ++++ *L_out_ptr = lv_cmake(L_out_real, L_out_imag); ++++ } ++++ ++++ lv_16sc_t bb_signal_sample; ++++ for(int i=0; i < num_points%8; ++i) ++++ { ++++ //Perform the carrier wipe-off ++++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); ++++ // Now get early, late, and prompt values for each ++++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); ++++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); ++++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); ++++ } ++++ ++++} ++++#endif /* LV_HAVE_SSE4_1 */ ++++ ++++#ifdef LV_HAVE_GENERIC ++++/*! ++++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation ++++ \param input The input signal input ++++ \param carrier The carrier signal input ++++ \param E_code Early PRN code replica input ++++ \param P_code Early PRN code replica input ++++ \param L_code Early PRN code replica input ++++ \param E_out Early correlation output ++++ \param P_out Early correlation output ++++ \param L_out Early correlation output ++++ \param num_points The number of complex values in vectors ++++ */ ++++static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_a_generic(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points) ++++{ ++++ lv_16sc_t bb_signal_sample; ++++ lv_16sc_t tmp1; ++++ lv_16sc_t tmp2; ++++ lv_16sc_t tmp3; ++++ ++++ bb_signal_sample = lv_cmake(0, 0); ++++ ++++ *E_out = 0; ++++ *P_out = 0; ++++ *L_out = 0; ++++ // perform Early, 
Prompt and Late correlation ++++ ++++ for(int i=0; i < num_points; ++i) ++++ { ++++ //Perform the carrier wipe-off ++++ bb_signal_sample = input[i] * carrier[i]; ++++ ++++ tmp1 = bb_signal_sample * E_code[i]; ++++ tmp2 = bb_signal_sample * P_code[i]; ++++ tmp3 = bb_signal_sample * L_code[i]; ++++ ++++ // Now get early, late, and prompt values for each ++++ *E_out += (lv_32fc_t)tmp1; ++++ *P_out += (lv_32fc_t)tmp2; ++++ *L_out += (lv_32fc_t)tmp3; ++++ } ++++} ++++#endif /* LV_HAVE_GENERIC */ ++++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_a_H */ +++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3.h +++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3.h 1970-01-01 01:00:00.000000000 +0100 ++++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3.h 2014-10-15 01:55:08.000000000 +0200 +++@@ -0,0 +1,1568 @@ ++++/*! ++++ * \file volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3.h ++++ * \brief Volk protokernel: performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation with 32 bits vectors using different methods: inside u_sse4_1_first there is one method, inside u_sse4_1_second there is another... This protokernel has been created to test the performance of different methods. ++++ * \authors
    ++++ *
  • Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com ++++ *
++++ * ++++ * Volk protokernel that performs the carrier wipe-off mixing and the ++++ * Early, Prompt, and Late correlation with 32 bits vectors (16 bits the ++++ * real part and 16 bits the imaginary part): ++++ * - The carrier wipe-off is done by multiplying the input signal by the ++++ * carrier (multiplication of 32 bits vectors). It returns the input ++++ * signal in base band (BB) ++++ * - Early values are calculated by multiplying the input signal in BB by the ++++ * early code (multiplication of 32 bits vectors), accumulating the results ++++ * - Prompt values are calculated by multiplying the input signal in BB by the ++++ * prompt code (multiplication of 32 bits vectors), accumulating the results ++++ * - Late values are calculated by multiplying the input signal in BB by the ++++ * late code (multiplication of 32 bits vectors), accumulating the results ++++ * ++++ * ------------------------------------------------------------------------- ++++ * ++++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) ++++ * ++++ * GNSS-SDR is a software defined Global Navigation ++++ * Satellite Systems receiver ++++ * ++++ * This file is part of GNSS-SDR. ++++ * ++++ * GNSS-SDR is free software: you can redistribute it and/or modify ++++ * it under the terms of the GNU General Public License as published by ++++ * the Free Software Foundation, either version 3 of the License, or ++++ * (at your option) any later version. ++++ * ++++ * GNSS-SDR is distributed in the hope that it will be useful, ++++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++++ * GNU General Public License for more details. ++++ * ++++ * You should have received a copy of the GNU General Public License ++++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>. 
++++ * ++++ * ------------------------------------------------------------------------- ++++ */ ++++ ++++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_u_H ++++#define INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_u_H ++++ ++++#include ++++#include ++++#include ++++#include ++++#include ++++ ++++#ifdef LV_HAVE_SSE4_1 ++++#include "smmintrin.h" ++++ /*! ++++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation ++++ \param input The input signal input ++++ \param carrier The carrier signal input ++++ \param E_code Early PRN code replica input ++++ \param P_code Early PRN code replica input ++++ \param L_code Early PRN code replica input ++++ \param E_out Early correlation output ++++ \param P_out Early correlation output ++++ \param L_out Early correlation output ++++ \param num_points The number of complex values in vectors ++++ */ ++++static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_u_sse4_1_first(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points) ++++{ ++++ const unsigned int sse_iters = num_points / 4; ++++ ++++ __m128i x, y, yaux, yl, yh, tmp1, tmp2, z, bb_signal_sample, bb_signal_sample_suffled; ++++ ++++ __m128 z_ps_1, z_ps_2, z_E, z_P, z_L; ++++ __m128i z_i_1, z_i_2; ++++ ++++ lv_32fc_t dotProduct_E; ++++ lv_32fc_t dotProduct_P; ++++ lv_32fc_t dotProduct_L; ++++ ++++ z_E = _mm_setzero_ps(); ++++ z_P = _mm_setzero_ps(); ++++ z_L = _mm_setzero_ps(); ++++ ++++ const lv_16sc_t* _input = input; ++++ const lv_16sc_t* _carrier = carrier; ++++ const lv_16sc_t* _E_code = E_code; ++++ const lv_16sc_t* _P_code = P_code; ++++ const lv_16sc_t* _L_code = L_code; ++++ ++++ if (sse_iters>0) ++++ { ++++ for(int number = 0;number < sse_iters; number++) ++++ { ++++ //Perform the carrier wipe-off ++++ x = 
_mm_lddqu_si128((__m128i*)_input); // Load the ar + ai, br + bi as ar,ai,br,bi ++++ y = _mm_lddqu_si128((__m128i*)_carrier); // Load the cr + ci, dr + di as cr,ci,dr,di ++++ ++++ // Load yl with cr,cr,dr,dr ++++ // Load yh with ci,ci,di,di ++++ yaux = _mm_shuffle_epi8 (y, _mm_set_epi8 (15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0)); ++++ yl = _mm_unpacklo_epi16(yaux, yaux); ++++ yh = _mm_unpackhi_epi16(yaux, yaux); ++++ ++++ tmp1 = _mm_mullo_epi16(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ++++ ++++ x = _mm_shuffle_epi8 (x, _mm_set_epi8 (13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2)); // Re-arrange x to be ai,ar,bi,br ++++ ++++ tmp2 = _mm_mullo_epi16(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++++ ++++ tmp2 = _mm_mullo_epi16(tmp2,_mm_set_epi16 (1, -1, 1, -1, 1, -1, 1, -1)); ++++ bb_signal_sample = _mm_add_epi16(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++++ bb_signal_sample_suffled = _mm_shuffle_epi8 (bb_signal_sample, _mm_set_epi8 (13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2)); // Re-arrange bb_signal_sample to be ai,ar,bi,br ++++ ++++ // correlation E,P,L (3x vector scalar product) ++++ // Early ++++ y = _mm_lddqu_si128((__m128i*)_E_code); // Load the cr + ci, dr + di as cr,ci,dr,di ++++ ++++ yaux = _mm_shuffle_epi8 (y, _mm_set_epi8 (15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0)); ++++ yl = _mm_unpacklo_epi16(yaux, yaux); ++++ yh = _mm_unpackhi_epi16(yaux, yaux); ++++ ++++ tmp1 = _mm_mullo_epi16(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ++++ ++++ tmp2 = _mm_mullo_epi16(bb_signal_sample_suffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++++ ++++ tmp2 = _mm_mullo_epi16(tmp2,_mm_set_epi16 (1, -1, 1, -1, 1, -1, 1, -1)); ++++ z = _mm_add_epi16(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++++ ++++ z_i_1 = _mm_cvtepi16_epi32(z); ++++ z_ps_1 = _mm_cvtepi32_ps(z_i_1); ++++ z = _mm_srli_si128 (z, 8); ++++ z_i_2 = _mm_cvtepi16_epi32(z); ++++ z_ps_2 = _mm_cvtepi32_ps(z_i_2); ++++ ++++ z_E = 
_mm_add_ps(z_E, z_ps_1); // Add the complex multiplication results together ++++ z_E = _mm_add_ps(z_E, z_ps_2); // Add the complex multiplication results together ++++ ++++ // Prompt ++++ y = _mm_lddqu_si128((__m128i*)_P_code); // Load the cr + ci, dr + di as cr,ci,dr,di ++++ ++++ yaux = _mm_shuffle_epi8 (y, _mm_set_epi8 (15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0)); ++++ yl = _mm_unpacklo_epi16(yaux, yaux); ++++ yh = _mm_unpackhi_epi16(yaux, yaux); ++++ ++++ tmp1 = _mm_mullo_epi16(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ++++ ++++ tmp2 = _mm_mullo_epi16(bb_signal_sample_suffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++++ ++++ tmp2 = _mm_mullo_epi16(tmp2,_mm_set_epi16 (1, -1, 1, -1, 1, -1, 1, -1)); ++++ z = _mm_add_epi16(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++++ ++++ z_i_1 = _mm_cvtepi16_epi32(z); ++++ z_ps_1 = _mm_cvtepi32_ps(z_i_1); ++++ z = _mm_srli_si128 (z, 8); ++++ z_i_2 = _mm_cvtepi16_epi32(z); ++++ z_ps_2 = _mm_cvtepi32_ps(z_i_2); ++++ ++++ z_P = _mm_add_ps(z_P, z_ps_1); // Add the complex multiplication results together ++++ z_P = _mm_add_ps(z_P, z_ps_2); // Add the complex multiplication results together ++++ ++++ // Late ++++ y = _mm_lddqu_si128((__m128i*)_L_code); // Load the cr + ci, dr + di as cr,ci,dr,di ++++ ++++ yaux = _mm_shuffle_epi8 (y, _mm_set_epi8 (15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0)); ++++ yl = _mm_unpacklo_epi16(yaux, yaux); ++++ yh = _mm_unpackhi_epi16(yaux, yaux); ++++ ++++ tmp1 = _mm_mullo_epi16(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ++++ ++++ tmp2 = _mm_mullo_epi16(bb_signal_sample_suffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++++ ++++ tmp2 = _mm_mullo_epi16(tmp2,_mm_set_epi16 (1, -1, 1, -1, 1, -1, 1, -1)); ++++ z = _mm_add_epi16(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++++ ++++ z_i_1 = _mm_cvtepi16_epi32(z); ++++ z_ps_1 = _mm_cvtepi32_ps(z_i_1); ++++ z = _mm_srli_si128 (z, 8); ++++ z_i_2 = _mm_cvtepi16_epi32(z); ++++ 
z_ps_2 = _mm_cvtepi32_ps(z_i_2); ++++ ++++ z_L = _mm_add_ps(z_L, z_ps_1); // Add the complex multiplication results together ++++ z_L = _mm_add_ps(z_L, z_ps_2); // Add the complex multiplication results together ++++ ++++ _input += 4; ++++ _carrier += 4; ++++ _E_code += 4; ++++ _L_code += 4; ++++ _P_code += 4; ++++ } ++++ ++++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_E[2]; ++++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_P[2]; ++++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_L[2]; ++++ ++++ _mm_storeu_ps((float*)dotProductVector_E,z_E); // Store the results back into the dot product vector ++++ _mm_storeu_ps((float*)dotProductVector_P,z_P); // Store the results back into the dot product vector ++++ _mm_storeu_ps((float*)dotProductVector_L,z_L); // Store the results back into the dot product vector ++++ ++++ dotProduct_E = ( dotProductVector_E[0] + dotProductVector_E[1] ); ++++ dotProduct_P = ( dotProductVector_P[0] + dotProductVector_P[1] ); ++++ dotProduct_L = ( dotProductVector_L[0] + dotProductVector_L[1] ); ++++ } ++++ ++++ for(int i=0; i < num_points%4; ++i) ++++ { ++++ dotProduct_E += (lv_32fc_t)((*_input) * (*_E_code++)*(*_carrier)); ++++ dotProduct_P += (lv_32fc_t)((*_input) * (*_P_code++)*(*_carrier)); ++++ dotProduct_L += (lv_32fc_t)((*_input++) * (*_L_code++)*(*_carrier++)); ++++ } ++++ ++++ *E_out = dotProduct_E; ++++ *P_out = dotProduct_P; ++++ *L_out = dotProduct_L; ++++ ++++ ++++ ++++} ++++#endif /* LV_HAVE_SSE4_1 */ ++++ ++++#ifdef LV_HAVE_SSE4_1 ++++#include "smmintrin.h" ++++/*! 
++++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation ++++ \param input The input signal input ++++ \param carrier The carrier signal input ++++ \param E_code Early PRN code replica input ++++ \param P_code Early PRN code replica input ++++ \param L_code Early PRN code replica input ++++ \param E_out Early correlation output ++++ \param P_out Early correlation output ++++ \param L_out Early correlation output ++++ \param num_points The number of complex values in vectors ++++ */ ++++static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_u_sse4_1_second(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points) ++++{ ++++ const unsigned int sse_iters = num_points / 8; ++++ ++++ __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample; ++++ __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output; ++++ ++++ __m128 real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc; ++++ __m128i real_output_i_1, real_output_i_2, imag_output_i_1, imag_output_i_2; ++++ __m128 real_output_ps_1, real_output_ps_2, imag_output_ps_1, imag_output_ps_2; ++++ ++++ float E_out_real = 0; ++++ float E_out_imag = 0; ++++ float P_out_real = 0; ++++ float P_out_imag = 0; ++++ float L_out_real = 0; ++++ float L_out_imag = 0; ++++ ++++ const lv_16sc_t* input_ptr = input; ++++ const lv_16sc_t* carrier_ptr = carrier; ++++ ++++ const lv_16sc_t* E_code_ptr = E_code; ++++ lv_32fc_t* E_out_ptr = E_out; ++++ const lv_16sc_t* L_code_ptr = L_code; ++++ lv_32fc_t* L_out_ptr = L_out; ++++ const lv_16sc_t* P_code_ptr = P_code; ++++ lv_32fc_t* P_out_ptr = P_out; ++++ ++++ *E_out_ptr = 0; ++++ *P_out_ptr = 0; ++++ *L_out_ptr = 0; ++++ ++++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 
0, 255, 0, 255, 0, 255, 0, 255, 0, 255); ++++ ++++ real_E_code_acc = _mm_setzero_ps(); ++++ imag_E_code_acc = _mm_setzero_ps(); ++++ real_P_code_acc = _mm_setzero_ps(); ++++ imag_P_code_acc = _mm_setzero_ps(); ++++ real_L_code_acc = _mm_setzero_ps(); ++++ imag_L_code_acc = _mm_setzero_ps(); ++++ ++++ if (sse_iters>0) ++++ { ++++ for(int number = 0;number < sse_iters; number++){ ++++ ++++ //Perform the carrier wipe-off ++++ x1 = _mm_lddqu_si128((__m128i*)input_ptr); ++++ input_ptr += 4; ++++ x2 = _mm_lddqu_si128((__m128i*)input_ptr); ++++ ++++ y1 = _mm_lddqu_si128((__m128i*)carrier_ptr); ++++ carrier_ptr += 4; ++++ y2 = _mm_lddqu_si128((__m128i*)carrier_ptr); ++++ ++++ imagx = _mm_srli_si128 (x1, 2); ++++ imagx = _mm_blend_epi16 (x2, imagx, 85); ++++ realx = _mm_slli_si128 (x2, 2); ++++ realx = _mm_blend_epi16 (realx, x1, 85); ++++ ++++ imagy = _mm_srli_si128 (y1, 2); ++++ imagy = _mm_blend_epi16 (y2, imagy, 85); ++++ realy = _mm_slli_si128 (y2, 2); ++++ realy = _mm_blend_epi16 (realy, y1, 85); ++++ ++++ realx_mult_realy = _mm_mullo_epi16 (realx, realy); ++++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); ++++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); ++++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); ++++ ++++ real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++++ imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++++ ++++ //Get early values ++++ y1 = _mm_lddqu_si128((__m128i*)E_code_ptr); ++++ E_code_ptr += 4; ++++ y2 = _mm_lddqu_si128((__m128i*)E_code_ptr); ++++ ++++ imagy = _mm_srli_si128 (y1, 2); ++++ imagy = _mm_blend_epi16 (y2, imagy, 85); ++++ realy = _mm_slli_si128 (y2, 2); ++++ realy = _mm_blend_epi16 (realy, y1, 85); ++++ ++++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); ++++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); ++++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); ++++ imagx_mult_realy = _mm_mullo_epi16 
(imag_bb_signal_sample, realy); ++++ ++++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++++ ++++ real_output_i_1 = _mm_cvtepi16_epi32(real_output); ++++ real_output_ps_1 = _mm_cvtepi32_ps(real_output_i_1); ++++ real_output = _mm_srli_si128 (real_output, 8); ++++ real_output_i_2 = _mm_cvtepi16_epi32(real_output); ++++ real_output_ps_2 = _mm_cvtepi32_ps(real_output_i_2); ++++ ++++ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); ++++ imag_output_ps_1 = _mm_cvtepi32_ps(imag_output_i_1); ++++ imag_output = _mm_srli_si128 (imag_output, 8); ++++ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); ++++ imag_output_ps_2 = _mm_cvtepi32_ps(imag_output_i_2); ++++ ++++ real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps_1); ++++ real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps_2); ++++ imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps_1); ++++ imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps_2); ++++ ++++ //Get prompt values ++++ y1 = _mm_lddqu_si128((__m128i*)P_code_ptr); ++++ P_code_ptr += 4; ++++ y2 = _mm_lddqu_si128((__m128i*)P_code_ptr); ++++ ++++ imagy = _mm_srli_si128 (y1, 2); ++++ imagy = _mm_blend_epi16 (y2, imagy, 85); ++++ realy = _mm_slli_si128 (y2, 2); ++++ realy = _mm_blend_epi16 (realy, y1, 85); ++++ ++++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); ++++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); ++++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); ++++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); ++++ ++++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++++ ++++ real_output_i_1 = _mm_cvtepi16_epi32(real_output); ++++ real_output_ps_1 = _mm_cvtepi32_ps(real_output_i_1); ++++ real_output = _mm_srli_si128 (real_output, 8); ++++ real_output_i_2 = 
_mm_cvtepi16_epi32(real_output); ++++ real_output_ps_2 = _mm_cvtepi32_ps(real_output_i_2); ++++ ++++ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); ++++ imag_output_ps_1 = _mm_cvtepi32_ps(imag_output_i_1); ++++ imag_output = _mm_srli_si128 (imag_output, 8); ++++ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); ++++ imag_output_ps_2 = _mm_cvtepi32_ps(imag_output_i_2); ++++ ++++ real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps_1); ++++ real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps_2); ++++ imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps_1); ++++ imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps_2); ++++ ++++ //Get late values ++++ y1 = _mm_lddqu_si128((__m128i*)L_code_ptr); ++++ L_code_ptr += 4; ++++ y2 = _mm_lddqu_si128((__m128i*)L_code_ptr); ++++ ++++ imagy = _mm_srli_si128 (y1, 2); ++++ imagy = _mm_blend_epi16 (y2, imagy, 85); ++++ realy = _mm_slli_si128 (y2, 2); ++++ realy = _mm_blend_epi16 (realy, y1, 85); ++++ ++++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); ++++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); ++++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); ++++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); ++++ ++++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++++ ++++ real_output_i_1 = _mm_cvtepi16_epi32(real_output); ++++ real_output_ps_1 = _mm_cvtepi32_ps(real_output_i_1); ++++ real_output = _mm_srli_si128 (real_output, 8); ++++ real_output_i_2 = _mm_cvtepi16_epi32(real_output); ++++ real_output_ps_2 = _mm_cvtepi32_ps(real_output_i_2); ++++ ++++ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); ++++ imag_output_ps_1 = _mm_cvtepi32_ps(imag_output_i_1); ++++ imag_output = _mm_srli_si128 (imag_output, 8); ++++ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); ++++ imag_output_ps_2 = _mm_cvtepi32_ps(imag_output_i_2); ++++ 
++++ real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps_1); ++++ real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps_2); ++++ imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps_1); ++++ imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps_2); ++++ ++++ input_ptr += 4; ++++ carrier_ptr += 4; ++++ E_code_ptr += 4; ++++ L_code_ptr += 4; ++++ P_code_ptr += 4; ++++ } ++++ ++++ __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4]; ++++ ++++ _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector ++++ _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector ++++ _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector ++++ _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector ++++ _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector ++++ _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector ++++ ++++ for (int i = 0; i<4; ++i) ++++ { ++++ E_out_real += real_E_dotProductVector[i]; ++++ E_out_imag += imag_E_dotProductVector[i]; ++++ P_out_real += real_P_dotProductVector[i]; ++++ P_out_imag += imag_P_dotProductVector[i]; ++++ L_out_real += real_L_dotProductVector[i]; ++++ L_out_imag += imag_L_dotProductVector[i]; ++++ } ++++ *E_out_ptr = lv_cmake(E_out_real, E_out_imag); ++++ *P_out_ptr = lv_cmake(P_out_real, P_out_imag); ++++ *L_out_ptr = 
lv_cmake(L_out_real, L_out_imag); ++++ } ++++ ++++ lv_16sc_t bb_signal_sample; ++++ for(int i=0; i < num_points%8; ++i) ++++ { ++++ //Perform the carrier wipe-off ++++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); ++++ // Now get early, late, and prompt values for each ++++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); ++++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); ++++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); ++++ } ++++} ++++#endif /* LV_HAVE_SSE4_1 */ ++++ ++++#ifdef LV_HAVE_SSE4_1 ++++#include "smmintrin.h" ++++/*! ++++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation ++++ \param input The input signal input ++++ \param carrier The carrier signal input ++++ \param E_code Early PRN code replica input ++++ \param P_code Early PRN code replica input ++++ \param L_code Early PRN code replica input ++++ \param E_out Early correlation output ++++ \param P_out Early correlation output ++++ \param L_out Early correlation output ++++ \param num_points The number of complex values in vectors ++++ */ ++++static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_u_sse4_1_third(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points) ++++{ ++++ const unsigned int sse_iters = num_points / 8; ++++ unsigned int index = 0; ++++ unsigned int indexPlus4 = 0; ++++ ++++ __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample; ++++ __m128i realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, real_output_i32, imag_output_i32; ++++ ++++ __m128 real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc; ++++ __m128i real_output_i_1, real_output_i_2, imag_output_i_1, imag_output_i_2; ++++ __m128 
real_output_ps, imag_output_ps;
++++
++++    float E_out_real = 0;
++++    float E_out_imag = 0;
++++    float P_out_real = 0;
++++    float P_out_imag = 0;
++++    float L_out_real = 0;
++++    float L_out_imag = 0;
++++
++++    const lv_16sc_t* input_ptr = input;
++++    const lv_16sc_t* carrier_ptr = carrier;
++++
++++    const lv_16sc_t* E_code_ptr = E_code;
++++    lv_32fc_t* E_out_ptr = E_out;
++++    const lv_16sc_t* L_code_ptr = L_code;
++++    lv_32fc_t* L_out_ptr = L_out;
++++    const lv_16sc_t* P_code_ptr = P_code;
++++    lv_32fc_t* P_out_ptr = P_out;
++++
++++    *E_out_ptr = 0;
++++    *P_out_ptr = 0;
++++    *L_out_ptr = 0;
++++
++++    real_E_code_acc = _mm_setzero_ps();
++++    imag_E_code_acc = _mm_setzero_ps();
++++    real_P_code_acc = _mm_setzero_ps();
++++    imag_P_code_acc = _mm_setzero_ps();
++++    real_L_code_acc = _mm_setzero_ps();
++++    imag_L_code_acc = _mm_setzero_ps();
++++
++++    if (sse_iters>0)
++++    {
++++        for(index = 0;index < 8*sse_iters; index+=8){ /* 8 complex samples per iteration; the pointers stay fixed and index advances */
++++            indexPlus4 = index + 4;
++++            //Perform the carrier wipe-off
++++            x1 = _mm_lddqu_si128((__m128i*)&input_ptr[index]);
++++            x2 = _mm_lddqu_si128((__m128i*)&input_ptr[indexPlus4]);
++++
++++            y1 = _mm_lddqu_si128((__m128i*)&carrier_ptr[index]);
++++            y2 = _mm_lddqu_si128((__m128i*)&carrier_ptr[indexPlus4]);
++++
++++            imagx = _mm_srli_si128 (x1, 2); /* shift right one 16-bit lane: imaginary parts of x1 land on the even lanes */
++++            imagx = _mm_blend_epi16 (x2, imagx, 85); /* mask 85 = 0b01010101: even lanes from shifted x1, odd lanes from x2 */
++++            realx = _mm_slli_si128 (x2, 2); /* shift left one 16-bit lane: real parts of x2 land on the odd lanes */
++++            realx = _mm_blend_epi16 (realx, x1, 85); /* even lanes from x1, odd lanes from shifted x2 */
++++
++++            imagy = _mm_srli_si128 (y1, 2);
++++            imagy = _mm_blend_epi16 (y2, imagy, 85);
++++            realy = _mm_slli_si128 (y2, 2);
++++            realy = _mm_blend_epi16 (realy, y1, 85);
++++
++++            realx_mult_realy = _mm_mullo_epi16 (realx, realy); /* _mm_mullo_epi16 keeps only the low 16 bits of each product */
++++            imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
++++            realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
++++            imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);
++++
++++            real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); /* Re = xr*yr - xi*yi */
++++            imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); /* Im = xr*yi + xi*yr */
++++
++++            //Get early values
++++            y1 = _mm_lddqu_si128((__m128i*)&E_code_ptr[index]);
++++            y2 = _mm_lddqu_si128((__m128i*)&E_code_ptr[indexPlus4]);
++++
++++            imagy = _mm_srli_si128 (y1, 2);
++++            imagy = _mm_blend_epi16 (y2, imagy, 85);
++++            realy = _mm_slli_si128 (y2, 2);
++++            realy = _mm_blend_epi16 (realy, y1, 85);
++++
++++            realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
++++            imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
++++            realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
++++            imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
++++
++++            real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
++++            imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
++++
++++            real_output_i_1 = _mm_cvtepi16_epi32(real_output); /* sign-extend the low four 16-bit lanes to 32 bits */
++++            real_output = _mm_srli_si128 (real_output, 8); /* bring the high four 16-bit lanes down */
++++            real_output_i_2 = _mm_cvtepi16_epi32(real_output);
++++            real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); /* add both halves as 32-bit integers, so only one float conversion is needed */
++++            real_output_ps = _mm_cvtepi32_ps(real_output_i32);
++++
++++            imag_output_i_1 = _mm_cvtepi16_epi32(imag_output);
++++            imag_output = _mm_srli_si128 (imag_output, 8);
++++            imag_output_i_2 = _mm_cvtepi16_epi32(imag_output);
++++            imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2);
++++            imag_output_ps = _mm_cvtepi32_ps(imag_output_i32);
++++
++++            real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps);
++++            imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps);
++++
++++            //Get prompt values
++++            y1 = _mm_lddqu_si128((__m128i*)&P_code_ptr[index]);
++++            y2 = _mm_lddqu_si128((__m128i*)&P_code_ptr[indexPlus4]);
++++
++++            imagy = _mm_srli_si128 (y1, 2);
++++            imagy = _mm_blend_epi16 (y2, imagy, 85);
++++            realy = _mm_slli_si128 (y2, 2);
++++            realy = _mm_blend_epi16 (realy, y1, 85);
++++
++++            realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
++++            imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
++++            realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
++++            imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
++++
++++            real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
++++            imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
++++
++++            real_output_i_1 = _mm_cvtepi16_epi32(real_output);
++++            real_output = _mm_srli_si128 (real_output, 8);
++++            real_output_i_2 = _mm_cvtepi16_epi32(real_output);
++++            real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2);
++++            real_output_ps = _mm_cvtepi32_ps(real_output_i32);
++++
++++            imag_output_i_1 = _mm_cvtepi16_epi32(imag_output);
++++            imag_output = _mm_srli_si128 (imag_output, 8);
++++            imag_output_i_2 = _mm_cvtepi16_epi32(imag_output);
++++            imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2);
++++            imag_output_ps = _mm_cvtepi32_ps(imag_output_i32);
++++
++++            real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps);
++++            imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps);
++++
++++            //Get late values
++++            y1 = _mm_lddqu_si128((__m128i*)&L_code_ptr[index]);
++++            y2 = _mm_lddqu_si128((__m128i*)&L_code_ptr[indexPlus4]);
++++
++++            imagy = _mm_srli_si128 (y1, 2);
++++            imagy = _mm_blend_epi16 (y2, imagy, 85);
++++            realy = _mm_slli_si128 (y2, 2);
++++            realy = _mm_blend_epi16 (realy, y1, 85);
++++
++++            realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
++++            imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
++++            realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
++++            imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);
++++
++++            real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
++++            imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
++++
++++            real_output_i_1 = _mm_cvtepi16_epi32(real_output);
++++            real_output = _mm_srli_si128 (real_output, 8);
++++            real_output_i_2 = _mm_cvtepi16_epi32(real_output);
++++            real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2);
++++            real_output_ps = _mm_cvtepi32_ps(real_output_i32);
++++
++++            imag_output_i_1 = _mm_cvtepi16_epi32(imag_output);
++++            imag_output = _mm_srli_si128 (imag_output, 8);
++++            imag_output_i_2 = _mm_cvtepi16_epi32(imag_output);
++++            imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2);
++++            imag_output_ps = _mm_cvtepi32_ps(imag_output_i32);
++++
++++            real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps);
++++            imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps);
++++        }
++++
++++        __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
++++        __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
++++        __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
++++        __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
++++        __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
++++        __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
++++
++++        _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector
++++        _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector
++++        _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector
++++        _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector
++++        _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector
++++        _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector
++++
++++        for (int i = 0; i<4; ++i) /* horizontal sum of the four float lanes of each accumulator */
++++        {
++++            E_out_real += real_E_dotProductVector[i];
++++            E_out_imag += imag_E_dotProductVector[i];
++++            P_out_real += real_P_dotProductVector[i];
++++            P_out_imag += imag_P_dotProductVector[i];
++++            L_out_real += real_L_dotProductVector[i];
++++            L_out_imag += imag_L_dotProductVector[i];
++++        }
++++        *E_out_ptr = lv_cmake(E_out_real, E_out_imag);
++++        *P_out_ptr = lv_cmake(P_out_real, P_out_imag);
++++        *L_out_ptr = lv_cmake(L_out_real, L_out_imag);
++++    }
++++
++++    lv_16sc_t bb_signal_sample;
++++    for(; index < num_points; index++) /* index continues where the SIMD loop stopped (0 if it never ran), so this covers the remaining samples */
++++    {
++++        //Perform the carrier wipe-off
++++        bb_signal_sample = input_ptr[index] * carrier_ptr[index];
++++        // Now accumulate the early, prompt, and late values
++++        *E_out_ptr += (lv_32fc_t) (bb_signal_sample * E_code_ptr[index]);
++++        *P_out_ptr += (lv_32fc_t) (bb_signal_sample * P_code_ptr[index]);
++++        *L_out_ptr += (lv_32fc_t) (bb_signal_sample * L_code_ptr[index]);
++++    }
++++}
++++#endif /* LV_HAVE_SSE4_1 */
++++
++++#ifdef LV_HAVE_SSE4_1
++++#include "smmintrin.h"
++++/*!
++++    \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation
++++    \param input The input signal input
++++    \param carrier The carrier signal input
++++    \param E_code Early PRN code replica input
++++    \param P_code Prompt PRN code replica input
++++    \param L_code Late PRN code replica input
++++    \param E_out Early correlation output
++++    \param P_out Prompt correlation output
++++    \param L_out Late correlation output
++++    \param num_points The number of complex values in vectors
++++ */
++++static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_u_sse4_1_fourth(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points)
++++{
++++    const unsigned int sse_iters = num_points / 8;
++++
++++    __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample;
++++    __m128i realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, real_output_i32, imag_output_i32;
++++
++++    __m128 real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc;
++++    __m128i real_output_i_1, real_output_i_2, imag_output_i_1,
imag_output_i_2; ++++ __m128 real_output_ps, imag_output_ps; ++++ ++++ float E_out_real = 0; ++++ float E_out_imag = 0; ++++ float P_out_real = 0; ++++ float P_out_imag = 0; ++++ float L_out_real = 0; ++++ float L_out_imag = 0; ++++ ++++ const lv_16sc_t* input_ptr = input; ++++ const lv_16sc_t* carrier_ptr = carrier; ++++ ++++ const lv_16sc_t* E_code_ptr = E_code; ++++ lv_32fc_t* E_out_ptr = E_out; ++++ const lv_16sc_t* L_code_ptr = L_code; ++++ lv_32fc_t* L_out_ptr = L_out; ++++ const lv_16sc_t* P_code_ptr = P_code; ++++ lv_32fc_t* P_out_ptr = P_out; ++++ ++++ *E_out_ptr = 0; ++++ *P_out_ptr = 0; ++++ *L_out_ptr = 0; ++++ ++++ real_E_code_acc = _mm_setzero_ps(); ++++ imag_E_code_acc = _mm_setzero_ps(); ++++ real_P_code_acc = _mm_setzero_ps(); ++++ imag_P_code_acc = _mm_setzero_ps(); ++++ real_L_code_acc = _mm_setzero_ps(); ++++ imag_L_code_acc = _mm_setzero_ps(); ++++ ++++ if (sse_iters>0) ++++ { ++++ for(int number = 0;number < sse_iters; number++){ ++++ ++++ //Perform the carrier wipe-off ++++ x1 = _mm_lddqu_si128((__m128i*)input_ptr); ++++ input_ptr += 4; ++++ x2 = _mm_lddqu_si128((__m128i*)input_ptr); ++++ ++++ y1 = _mm_lddqu_si128((__m128i*)carrier_ptr); ++++ carrier_ptr += 4; ++++ y2 = _mm_lddqu_si128((__m128i*)carrier_ptr); ++++ ++++ imagx = _mm_srli_si128 (x1, 2); ++++ imagx = _mm_blend_epi16 (x2, imagx, 85); ++++ realx = _mm_slli_si128 (x2, 2); ++++ realx = _mm_blend_epi16 (realx, x1, 85); ++++ ++++ imagy = _mm_srli_si128 (y1, 2); ++++ imagy = _mm_blend_epi16 (y2, imagy, 85); ++++ realy = _mm_slli_si128 (y2, 2); ++++ realy = _mm_blend_epi16 (realy, y1, 85); ++++ ++++ realx_mult_realy = _mm_mullo_epi16 (realx, realy); ++++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); ++++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); ++++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); ++++ ++++ real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++++ imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++++ ++++ 
//Get early values ++++ y1 = _mm_lddqu_si128((__m128i*)E_code_ptr); ++++ E_code_ptr += 4; ++++ y2 = _mm_lddqu_si128((__m128i*)E_code_ptr); ++++ ++++ imagy = _mm_srli_si128 (y1, 2); ++++ imagy = _mm_blend_epi16 (y2, imagy, 85); ++++ realy = _mm_slli_si128 (y2, 2); ++++ realy = _mm_blend_epi16 (realy, y1, 85); ++++ ++++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); ++++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); ++++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); ++++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); ++++ ++++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++++ ++++ real_output_i_1 = _mm_cvtepi16_epi32(real_output); ++++ real_output = _mm_srli_si128 (real_output, 8); ++++ real_output_i_2 = _mm_cvtepi16_epi32(real_output); ++++ real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); ++++ real_output_ps = _mm_cvtepi32_ps(real_output_i32); ++++ ++++ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); ++++ imag_output = _mm_srli_si128 (imag_output, 8); ++++ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); ++++ imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); ++++ imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); ++++ ++++ real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps); ++++ imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps); ++++ ++++ //Get prompt values ++++ y1 = _mm_lddqu_si128((__m128i*)P_code_ptr); ++++ P_code_ptr += 4; ++++ y2 = _mm_lddqu_si128((__m128i*)P_code_ptr); ++++ ++++ imagy = _mm_srli_si128 (y1, 2); ++++ imagy = _mm_blend_epi16 (y2, imagy, 85); ++++ realy = _mm_slli_si128 (y2, 2); ++++ realy = _mm_blend_epi16 (realy, y1, 85); ++++ ++++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); ++++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); ++++ realx_mult_imagy = _mm_mullo_epi16 
(real_bb_signal_sample, imagy); ++++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); ++++ ++++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++++ ++++ real_output_i_1 = _mm_cvtepi16_epi32(real_output); ++++ real_output = _mm_srli_si128 (real_output, 8); ++++ real_output_i_2 = _mm_cvtepi16_epi32(real_output); ++++ real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); ++++ real_output_ps = _mm_cvtepi32_ps(real_output_i32); ++++ ++++ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); ++++ imag_output = _mm_srli_si128 (imag_output, 8); ++++ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); ++++ imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); ++++ imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); ++++ ++++ real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps); ++++ imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps); ++++ ++++ //Get late values ++++ y1 = _mm_lddqu_si128((__m128i*)L_code_ptr); ++++ L_code_ptr += 4; ++++ y2 = _mm_lddqu_si128((__m128i*)L_code_ptr); ++++ ++++ imagy = _mm_srli_si128 (y1, 2); ++++ imagy = _mm_blend_epi16 (y2, imagy, 85); ++++ realy = _mm_slli_si128 (y2, 2); ++++ realy = _mm_blend_epi16 (realy, y1, 85); ++++ ++++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); ++++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); ++++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); ++++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); ++++ ++++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++++ ++++ real_output_i_1 = _mm_cvtepi16_epi32(real_output); ++++ real_output = _mm_srli_si128 (real_output, 8); ++++ real_output_i_2 = _mm_cvtepi16_epi32(real_output); ++++ real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); ++++ 
real_output_ps = _mm_cvtepi32_ps(real_output_i32); ++++ ++++ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); ++++ imag_output = _mm_srli_si128 (imag_output, 8); ++++ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); ++++ imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); ++++ imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); ++++ ++++ real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps); ++++ imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps); ++++ ++++ input_ptr += 4; ++++ carrier_ptr += 4; ++++ E_code_ptr += 4; ++++ L_code_ptr += 4; ++++ P_code_ptr += 4; ++++ } ++++ ++++ __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4]; ++++ ++++ _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector ++++ _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector ++++ _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector ++++ _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector ++++ _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector ++++ _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector ++++ ++++ for (int i = 0; i<4; ++i) ++++ { ++++ E_out_real += real_E_dotProductVector[i]; ++++ E_out_imag += imag_E_dotProductVector[i]; ++++ P_out_real += real_P_dotProductVector[i]; ++++ P_out_imag += imag_P_dotProductVector[i]; ++++ L_out_real += 
real_L_dotProductVector[i]; ++++ L_out_imag += imag_L_dotProductVector[i]; ++++ } ++++ *E_out_ptr = lv_cmake(E_out_real, E_out_imag); ++++ *P_out_ptr = lv_cmake(P_out_real, P_out_imag); ++++ *L_out_ptr = lv_cmake(L_out_real, L_out_imag); ++++ } ++++ ++++ lv_16sc_t bb_signal_sample; ++++ for(int i=0; i < num_points%8; ++i) ++++ { ++++ //Perform the carrier wipe-off ++++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); ++++ // Now get early, late, and prompt values for each ++++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); ++++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); ++++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); ++++ } ++++} ++++#endif /* LV_HAVE_SSE4_1 */ ++++ ++++#ifdef LV_HAVE_SSE4_1 ++++#include "smmintrin.h" ++++#include "CommonMacros/CommonMacros.h" ++++/*! ++++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation ++++ \param input The input signal input ++++ \param carrier The carrier signal input ++++ \param E_code Early PRN code replica input ++++ \param P_code Early PRN code replica input ++++ \param L_code Early PRN code replica input ++++ \param E_out Early correlation output ++++ \param P_out Early correlation output ++++ \param L_out Early correlation output ++++ \param num_points The number of complex values in vectors ++++ */ ++++ ++++static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_u_sse4_1_fifth(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points) ++++{ ++++ const unsigned int sse_iters = num_points / 8; ++++ ++++ __m128i realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy; ++++ __m128i input_i_1, input_i_2, output_i32; ++++ ++++ __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample; ++++ __m128i realx, imagx, realy, imagy, real_output, 
imag_output; ++++ ++++ __m128 real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc; ++++ __m128 real_output_ps, imag_output_ps; ++++ ++++ float E_out_real = 0; ++++ float E_out_imag = 0; ++++ float P_out_real = 0; ++++ float P_out_imag = 0; ++++ float L_out_real = 0; ++++ float L_out_imag = 0; ++++ ++++ const lv_16sc_t* input_ptr = input; ++++ const lv_16sc_t* carrier_ptr = carrier; ++++ ++++ const lv_16sc_t* E_code_ptr = E_code; ++++ lv_32fc_t* E_out_ptr = E_out; ++++ const lv_16sc_t* L_code_ptr = L_code; ++++ lv_32fc_t* L_out_ptr = L_out; ++++ const lv_16sc_t* P_code_ptr = P_code; ++++ lv_32fc_t* P_out_ptr = P_out; ++++ ++++ *E_out_ptr = 0; ++++ *P_out_ptr = 0; ++++ *L_out_ptr = 0; ++++ ++++ real_E_code_acc = _mm_setzero_ps(); ++++ imag_E_code_acc = _mm_setzero_ps(); ++++ real_P_code_acc = _mm_setzero_ps(); ++++ imag_P_code_acc = _mm_setzero_ps(); ++++ real_L_code_acc = _mm_setzero_ps(); ++++ imag_L_code_acc = _mm_setzero_ps(); ++++ ++++ if (sse_iters>0) ++++ { ++++ for(int number = 0;number < sse_iters; number++){ ++++ ++++ //Perform the carrier wipe-off ++++ x1 = _mm_lddqu_si128((__m128i*)input_ptr); ++++ input_ptr += 4; ++++ x2 = _mm_lddqu_si128((__m128i*)input_ptr); ++++ ++++ y1 = _mm_lddqu_si128((__m128i*)carrier_ptr); ++++ carrier_ptr += 4; ++++ y2 = _mm_lddqu_si128((__m128i*)carrier_ptr); ++++ ++++ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(x1, x2, realx, imagx) ++++ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy) ++++ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample) ++++ ++++ //Get early values ++++ y1 = _mm_lddqu_si128((__m128i*)E_code_ptr); ++++ E_code_ptr += 4; ++++ y2 = _mm_lddqu_si128((__m128i*)E_code_ptr); ++++ ++++ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy) ++++ 
CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(real_bb_signal_sample, imag_bb_signal_sample, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output) ++++ ++++ CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(real_output, input_i_1, input_i_2, output_i32, real_output_ps) ++++ CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(imag_output, input_i_1, input_i_2, output_i32, imag_output_ps) ++++ ++++ real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps); ++++ imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps); ++++ ++++ //Get prompt values ++++ y1 = _mm_lddqu_si128((__m128i*)P_code_ptr); ++++ P_code_ptr += 4; ++++ y2 = _mm_lddqu_si128((__m128i*)P_code_ptr); ++++ ++++ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy) ++++ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(real_bb_signal_sample, imag_bb_signal_sample, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output) ++++ ++++ CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(real_output, input_i_1, input_i_2, output_i32, real_output_ps) ++++ CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(imag_output, input_i_1, input_i_2, output_i32, imag_output_ps) ++++ ++++ real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps); ++++ imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps); ++++ ++++ //Get late values ++++ y1 = _mm_lddqu_si128((__m128i*)L_code_ptr); ++++ L_code_ptr += 4; ++++ y2 = _mm_lddqu_si128((__m128i*)L_code_ptr); ++++ ++++ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy) ++++ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(real_bb_signal_sample, imag_bb_signal_sample, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output) ++++ ++++ CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(real_output, input_i_1, input_i_2, output_i32, real_output_ps) ++++ CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(imag_output, input_i_1, input_i_2, 
output_i32, imag_output_ps) ++++ ++++ real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps); ++++ imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps); ++++ ++++ input_ptr += 4; ++++ carrier_ptr += 4; ++++ E_code_ptr += 4; ++++ L_code_ptr += 4; ++++ P_code_ptr += 4; ++++ } ++++ ++++ __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4]; ++++ ++++ _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector ++++ _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector ++++ _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector ++++ _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector ++++ _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector ++++ _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector ++++ ++++ for (int i = 0; i<4; ++i) ++++ { ++++ E_out_real += real_E_dotProductVector[i]; ++++ E_out_imag += imag_E_dotProductVector[i]; ++++ P_out_real += real_P_dotProductVector[i]; ++++ P_out_imag += imag_P_dotProductVector[i]; ++++ L_out_real += real_L_dotProductVector[i]; ++++ L_out_imag += imag_L_dotProductVector[i]; ++++ } ++++ *E_out_ptr = lv_cmake(E_out_real, E_out_imag); ++++ *P_out_ptr = lv_cmake(P_out_real, P_out_imag); ++++ *L_out_ptr = lv_cmake(L_out_real, L_out_imag); ++++ } ++++ ++++ lv_16sc_t bb_signal_sample; ++++ for(int i=0; i < num_points%8; ++i) ++++ { 
++++ //Perform the carrier wipe-off ++++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); ++++ // Now get early, late, and prompt values for each ++++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); ++++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); ++++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); ++++ } ++++} ++++#endif /* LV_HAVE_SSE4_1 */ ++++ ++++#ifdef LV_HAVE_SSE4_1 ++++#include "smmintrin.h" ++++#include "CommonMacros/CommonMacros_16ic_cw_epl_corr_32fc.h" ++++#include "CommonMacros/CommonMacros.h" ++++/*! ++++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation ++++ \param input The input signal input ++++ \param carrier The carrier signal input ++++ \param E_code Early PRN code replica input ++++ \param P_code Early PRN code replica input ++++ \param L_code Early PRN code replica input ++++ \param E_out Early correlation output ++++ \param P_out Early correlation output ++++ \param L_out Early correlation output ++++ \param num_points The number of complex values in vectors ++++ */ ++++ ++++static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_u_sse4_1_sixth(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points) ++++{ ++++ const unsigned int sse_iters = num_points / 8; ++++ ++++ __m128i realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy; ++++ __m128i input_i_1, input_i_2, output_i32; ++++ ++++ __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample; ++++ __m128i realx, imagx, realy, imagy, real_output, imag_output; ++++ ++++ __m128 real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc; ++++ __m128 real_output_ps, imag_output_ps; ++++ ++++ float E_out_real = 0; ++++ float E_out_imag = 0; ++++ float P_out_real = 0; ++++ float 
P_out_imag = 0; ++++ float L_out_real = 0; ++++ float L_out_imag = 0; ++++ ++++ const lv_16sc_t* input_ptr = input; ++++ const lv_16sc_t* carrier_ptr = carrier; ++++ ++++ const lv_16sc_t* E_code_ptr = E_code; ++++ lv_32fc_t* E_out_ptr = E_out; ++++ const lv_16sc_t* L_code_ptr = L_code; ++++ lv_32fc_t* L_out_ptr = L_out; ++++ const lv_16sc_t* P_code_ptr = P_code; ++++ lv_32fc_t* P_out_ptr = P_out; ++++ ++++ *E_out_ptr = 0; ++++ *P_out_ptr = 0; ++++ *L_out_ptr = 0; ++++ ++++ real_E_code_acc = _mm_setzero_ps(); ++++ imag_E_code_acc = _mm_setzero_ps(); ++++ real_P_code_acc = _mm_setzero_ps(); ++++ imag_P_code_acc = _mm_setzero_ps(); ++++ real_L_code_acc = _mm_setzero_ps(); ++++ imag_L_code_acc = _mm_setzero_ps(); ++++ ++++ if (sse_iters>0) ++++ { ++++ for(int number = 0;number < sse_iters; number++){ ++++ ++++ //Perform the carrier wipe-off ++++ x1 = _mm_lddqu_si128((__m128i*)input_ptr); ++++ input_ptr += 4; ++++ x2 = _mm_lddqu_si128((__m128i*)input_ptr); ++++ ++++ y1 = _mm_lddqu_si128((__m128i*)carrier_ptr); ++++ carrier_ptr += 4; ++++ y2 = _mm_lddqu_si128((__m128i*)carrier_ptr); ++++ ++++ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(x1, x2, realx, imagx) ++++ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy) ++++ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample) ++++ ++++ //Get early values ++++ y1 = _mm_lddqu_si128((__m128i*)E_code_ptr); ++++ E_code_ptr += 4; ++++ y2 = _mm_lddqu_si128((__m128i*)E_code_ptr); ++++ ++++ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) ++++ ++++ real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps); ++++ imag_E_code_acc = _mm_add_ps 
(imag_E_code_acc, imag_output_ps); ++++ ++++ //Get prompt values ++++ y1 = _mm_lddqu_si128((__m128i*)P_code_ptr); ++++ P_code_ptr += 4; ++++ y2 = _mm_lddqu_si128((__m128i*)P_code_ptr); ++++ ++++ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) ++++ ++++ real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps); ++++ imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps); ++++ ++++ //Get late values ++++ y1 = _mm_lddqu_si128((__m128i*)L_code_ptr); ++++ L_code_ptr += 4; ++++ y2 = _mm_lddqu_si128((__m128i*)L_code_ptr); ++++ ++++ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) ++++ ++++ real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps); ++++ imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps); ++++ ++++ input_ptr += 4; ++++ carrier_ptr += 4; ++++ E_code_ptr += 4; ++++ L_code_ptr += 4; ++++ P_code_ptr += 4; ++++ } ++++ ++++ __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4]; ++++ ++++ _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector ++++ _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector ++++ _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); 
// Store the results back into the dot product vector ++++ _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector ++++ _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector ++++ _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector ++++ ++++ for (int i = 0; i<4; ++i) ++++ { ++++ E_out_real += real_E_dotProductVector[i]; ++++ E_out_imag += imag_E_dotProductVector[i]; ++++ P_out_real += real_P_dotProductVector[i]; ++++ P_out_imag += imag_P_dotProductVector[i]; ++++ L_out_real += real_L_dotProductVector[i]; ++++ L_out_imag += imag_L_dotProductVector[i]; ++++ } ++++ *E_out_ptr = lv_cmake(E_out_real, E_out_imag); ++++ *P_out_ptr = lv_cmake(P_out_real, P_out_imag); ++++ *L_out_ptr = lv_cmake(L_out_real, L_out_imag); ++++ } ++++ ++++ lv_16sc_t bb_signal_sample; ++++ for(int i=0; i < num_points%8; ++i) ++++ { ++++ //Perform the carrier wipe-off ++++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); ++++ // Now get early, late, and prompt values for each ++++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); ++++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); ++++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); ++++ } ++++} ++++#endif /* LV_HAVE_SSE4_1 */ ++++ ++++#ifdef LV_HAVE_GENERIC ++++/*! 
++++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation ++++ \param input The input signal input ++++ \param carrier The carrier signal input ++++ \param E_code Early PRN code replica input ++++ \param P_code Early PRN code replica input ++++ \param L_code Early PRN code replica input ++++ \param E_out Early correlation output ++++ \param P_out Early correlation output ++++ \param L_out Early correlation output ++++ \param num_points The number of complex values in vectors ++++ */ ++++static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_generic(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points) ++++{ ++++ lv_16sc_t bb_signal_sample; ++++ lv_16sc_t tmp1; ++++ lv_16sc_t tmp2; ++++ lv_16sc_t tmp3; ++++ ++++ bb_signal_sample = lv_cmake(0, 0); ++++ ++++ *E_out = 0; ++++ *P_out = 0; ++++ *L_out = 0; ++++ // perform Early, Prompt and Late correlation ++++ ++++ for(int i=0; i < num_points; ++i) ++++ { ++++ //Perform the carrier wipe-off ++++ bb_signal_sample = input[i] * carrier[i]; ++++ ++++ tmp1 = bb_signal_sample * E_code[i]; ++++ tmp2 = bb_signal_sample * P_code[i]; ++++ tmp3 = bb_signal_sample * L_code[i]; ++++ ++++ // Now get early, late, and prompt values for each ++++ *E_out += (lv_32fc_t)tmp1; ++++ *P_out += (lv_32fc_t)tmp2; ++++ *L_out += (lv_32fc_t)tmp3; ++++ } ++++} ++++#endif /* LV_HAVE_GENERIC */ ++++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_u_H */ ++++ ++++ ++++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_a_H ++++#define INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_a_H ++++ ++++#include ++++#include ++++#include ++++#include ++++#include ++++// ++++//#ifdef LV_HAVE_SSE4_1 ++++//#include "smmintrin.h" ++++///*! 
++++// \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation ++++// \param input The input signal input ++++// \param carrier The carrier signal input ++++// \param E_code Early PRN code replica input ++++// \param P_code Early PRN code replica input ++++// \param L_code Early PRN code replica input ++++// \param E_out Early correlation output ++++// \param P_out Early correlation output ++++// \param L_out Early correlation output ++++// \param num_points The number of complex values in vectors ++++// */ ++++//static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_a_sse4_1(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points) ++++//{ ++++// const unsigned int sse_iters = num_points / 8; ++++// ++++// __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample; ++++// __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output; ++++// ++++// __m128 real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc; ++++// __m128i real_output_i_1, real_output_i_2, imag_output_i_1, imag_output_i_2; ++++// __m128 real_output_ps_1, real_output_ps_2, imag_output_ps_1, imag_output_ps_2; ++++// ++++// float E_out_real = 0; ++++// float E_out_imag = 0; ++++// float P_out_real = 0; ++++// float P_out_imag = 0; ++++// float L_out_real = 0; ++++// float L_out_imag = 0; ++++// ++++// const lv_16sc_t* input_ptr = input; ++++// const lv_16sc_t* carrier_ptr = carrier; ++++// ++++// const lv_16sc_t* E_code_ptr = E_code; ++++// lv_32fc_t* E_out_ptr = E_out; ++++// const lv_16sc_t* L_code_ptr = L_code; ++++// lv_32fc_t* L_out_ptr = L_out; ++++// const lv_16sc_t* P_code_ptr = P_code; ++++// lv_32fc_t* P_out_ptr = P_out; ++++// ++++// *E_out_ptr = 0; ++++// *P_out_ptr = 0; 
++++// *L_out_ptr = 0; ++++// ++++// mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); ++++// ++++// real_E_code_acc = _mm_setzero_ps(); ++++// imag_E_code_acc = _mm_setzero_ps(); ++++// real_P_code_acc = _mm_setzero_ps(); ++++// imag_P_code_acc = _mm_setzero_ps(); ++++// real_L_code_acc = _mm_setzero_ps(); ++++// imag_L_code_acc = _mm_setzero_ps(); ++++// ++++// if (sse_iters>0) ++++// { ++++// for(int number = 0;number < sse_iters; number++){ ++++// ++++// //Perform the carrier wipe-off ++++// x1 = _mm_lddqu_si128((__m128i*)input_ptr); ++++// input_ptr += 4; ++++// x2 = _mm_lddqu_si128((__m128i*)input_ptr); ++++// ++++// y1 = _mm_lddqu_si128((__m128i*)carrier_ptr); ++++// carrier_ptr += 4; ++++// y2 = _mm_lddqu_si128((__m128i*)carrier_ptr); ++++// ++++// imagx = _mm_srli_si128 (x1, 2); ++++// imagx = _mm_blend_epi16 (x2, imagx, 85); ++++// realx = _mm_slli_si128 (x2, 2); ++++// realx = _mm_blend_epi16 (realx, x1, 85); ++++// ++++// imagy = _mm_srli_si128 (y1, 2); ++++// imagy = _mm_blend_epi16 (y2, imagy, 85); ++++// realy = _mm_slli_si128 (y2, 2); ++++// realy = _mm_blend_epi16 (realy, y1, 85); ++++// ++++// realx_mult_realy = _mm_mullo_epi16 (realx, realy); ++++// imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); ++++// realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); ++++// imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); ++++// ++++// real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++++// imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++++// ++++// //Get early values ++++// y1 = _mm_lddqu_si128((__m128i*)E_code_ptr); ++++// E_code_ptr += 4; ++++// y2 = _mm_lddqu_si128((__m128i*)E_code_ptr); ++++// ++++// imagy = _mm_srli_si128 (y1, 2); ++++// imagy = _mm_blend_epi16 (y2, imagy, 85); ++++// realy = _mm_slli_si128 (y2, 2); ++++// realy = _mm_blend_epi16 (realy, y1, 85); ++++// ++++// realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); ++++// 
imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); ++++// realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); ++++// imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); ++++// ++++// real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++++// imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++++// ++++// real_output_i_1 = _mm_cvtepi16_epi32(real_output); ++++// real_output_ps_1 = _mm_cvtepi32_ps(real_output_i_1); ++++// real_output = _mm_srli_si128 (real_output, 8); ++++// real_output_i_2 = _mm_cvtepi16_epi32(real_output); ++++// real_output_ps_2 = _mm_cvtepi32_ps(real_output_i_2); ++++// ++++// imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); ++++// imag_output_ps_1 = _mm_cvtepi32_ps(imag_output_i_1); ++++// imag_output = _mm_srli_si128 (imag_output, 8); ++++// imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); ++++// imag_output_ps_2 = _mm_cvtepi32_ps(imag_output_i_2); ++++// ++++// real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps_1); ++++// real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps_2); ++++// imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps_1); ++++// imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps_2); ++++// ++++// //Get prompt values ++++// y1 = _mm_lddqu_si128((__m128i*)P_code_ptr); ++++// P_code_ptr += 4; ++++// y2 = _mm_lddqu_si128((__m128i*)P_code_ptr); ++++// ++++// imagy = _mm_srli_si128 (y1, 2); ++++// imagy = _mm_blend_epi16 (y2, imagy, 85); ++++// realy = _mm_slli_si128 (y2, 2); ++++// realy = _mm_blend_epi16 (realy, y1, 85); ++++// ++++// realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); ++++// imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); ++++// realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); ++++// imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); ++++// ++++// real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++++// 
imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++++// ++++// real_output_i_1 = _mm_cvtepi16_epi32(real_output); ++++// real_output_ps_1 = _mm_cvtepi32_ps(real_output_i_1); ++++// real_output = _mm_srli_si128 (real_output, 8); ++++// real_output_i_2 = _mm_cvtepi16_epi32(real_output); ++++// real_output_ps_2 = _mm_cvtepi32_ps(real_output_i_2); ++++// ++++// imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); ++++// imag_output_ps_1 = _mm_cvtepi32_ps(imag_output_i_1); ++++// imag_output = _mm_srli_si128 (imag_output, 8); ++++// imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); ++++// imag_output_ps_2 = _mm_cvtepi32_ps(imag_output_i_2); ++++// ++++// real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps_1); ++++// real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps_2); ++++// imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps_1); ++++// imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps_2); ++++// ++++// //Get late values ++++// y1 = _mm_lddqu_si128((__m128i*)L_code_ptr); ++++// L_code_ptr += 4; ++++// y2 = _mm_lddqu_si128((__m128i*)L_code_ptr); ++++// ++++// imagy = _mm_srli_si128 (y1, 2); ++++// imagy = _mm_blend_epi16 (y2, imagy, 85); ++++// realy = _mm_slli_si128 (y2, 2); ++++// realy = _mm_blend_epi16 (realy, y1, 85); ++++// ++++// realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); ++++// imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); ++++// realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); ++++// imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); ++++// ++++// real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++++// imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++++// ++++// real_output_i_1 = _mm_cvtepi16_epi32(real_output); ++++// real_output_ps_1 = _mm_cvtepi32_ps(real_output_i_1); ++++// real_output = _mm_srli_si128 (real_output, 8); ++++// real_output_i_2 = _mm_cvtepi16_epi32(real_output); ++++// 
real_output_ps_2 = _mm_cvtepi32_ps(real_output_i_2); ++++// ++++// imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); ++++// imag_output_ps_1 = _mm_cvtepi32_ps(imag_output_i_1); ++++// imag_output = _mm_srli_si128 (imag_output, 8); ++++// imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); ++++// imag_output_ps_2 = _mm_cvtepi32_ps(imag_output_i_2); ++++// ++++// real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps_1); ++++// real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps_2); ++++// imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps_1); ++++// imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps_2); ++++// ++++// input_ptr += 4; ++++// carrier_ptr += 4; ++++// E_code_ptr += 4; ++++// L_code_ptr += 4; ++++// P_code_ptr += 4; ++++// } ++++// ++++// __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; ++++// __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; ++++// __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; ++++// __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; ++++// __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; ++++// __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4]; ++++// ++++// _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector ++++// _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector ++++// _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector ++++// _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector ++++// _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector ++++// _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector ++++// ++++// for (int i = 0; i<4; ++i) ++++// { ++++// 
E_out_real += real_E_dotProductVector[i]; ++++// E_out_imag += imag_E_dotProductVector[i]; ++++// P_out_real += real_P_dotProductVector[i]; ++++// P_out_imag += imag_P_dotProductVector[i]; ++++// L_out_real += real_L_dotProductVector[i]; ++++// L_out_imag += imag_L_dotProductVector[i]; ++++// } ++++// *E_out_ptr = lv_cmake(E_out_real, E_out_imag); ++++// *P_out_ptr = lv_cmake(P_out_real, P_out_imag); ++++// *L_out_ptr = lv_cmake(L_out_real, L_out_imag); ++++// } ++++// ++++// lv_16sc_t bb_signal_sample; ++++// for(int i=0; i < num_points%8; ++i) ++++// { ++++// //Perform the carrier wipe-off ++++// bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); ++++// // Now get early, late, and prompt values for each ++++// *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); ++++// *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); ++++// *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); ++++// } ++++//} ++++//#endif /* LV_HAVE_SSE4_1 */ ++++// ++++#ifdef LV_HAVE_GENERIC ++++/*! 
++++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation ++++ \param input The input signal input ++++ \param carrier The carrier signal input ++++ \param E_code Early PRN code replica input ++++ \param P_code Prompt PRN code replica input ++++ \param L_code Late PRN code replica input ++++ \param E_out Early correlation output ++++ \param P_out Prompt correlation output ++++ \param L_out Late correlation output ++++ \param num_points The number of complex values in vectors ++++ */ ++++static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_a_generic(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points) ++++{ ++++ lv_16sc_t bb_signal_sample; ++++ lv_16sc_t tmp1; ++++ lv_16sc_t tmp2; ++++ lv_16sc_t tmp3; ++++ ++++ bb_signal_sample = lv_cmake(0, 0); ++++ ++++ *E_out = 0; ++++ *P_out = 0; ++++ *L_out = 0; ++++ // perform Early, Prompt and Late correlation ++++ ++++ for(int i=0; i < num_points; ++i) ++++ { ++++ //Perform the carrier wipe-off ++++ bb_signal_sample = input[i] * carrier[i]; ++++ ++++ tmp1 = bb_signal_sample * E_code[i]; ++++ tmp2 = bb_signal_sample * P_code[i]; ++++ tmp3 = bb_signal_sample * L_code[i]; ++++ ++++ // Accumulate the early, prompt and late products into the float32 outputs ++++ *E_out += (lv_32fc_t)tmp1; ++++ *P_out += (lv_32fc_t)tmp2; ++++ *L_out += (lv_32fc_t)tmp3; ++++ } ++++} ++++#endif /* LV_HAVE_GENERIC */ ++++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3_a_H */ +++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5.h +++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5.h 1970-01-01 01:00:00.000000000 +0100 ++++++ 
/Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5.h 2014-10-15 01:55:08.000000000 +0200 +++@@ -0,0 +1,595 @@ ++++/*! ++++ * \file volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5.h ++++ * \brief Volk protokernel: performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation with 32 bits vectors and returns float32 values. ++++ * \authors <ul>
++++ *          <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
++++ *          </ul>
++++ * ++++ * Volk protokernel that performs the carrier wipe-off mixing and the ++++ * Very Early, Early, Prompt, Late and Very Late correlation with 32 bits vectors (16 bits the ++++ * real part and 16 bits the imaginary part) and accumulates into float32 values, returning them: ++++ * - The carrier wipe-off is done by multiplying the input signal by the ++++ * carrier (multiplication of 32 bits vectors). It returns the input ++++ * signal in base band (BB) ++++ * - Very Early values are calculated by multiplying the input signal in BB by the ++++ * very early code (multiplication of 32 bits vectors), converting that to float32 and accumulating the results ++++ * - Early values are calculated by multiplying the input signal in BB by the ++++ * early code (multiplication of 32 bits vectors), converting that to float32 and accumulating the results ++++ * - Prompt values are calculated by multiplying the input signal in BB by the ++++ * prompt code (multiplication of 32 bits vectors), converting that to float32 and accumulating the results ++++ * - Late values are calculated by multiplying the input signal in BB by the ++++ * late code (multiplication of 32 bits vectors), converting that to float32 and accumulating the results ++++ * - Very Late values are calculated by multiplying the input signal in BB by the ++++ * very late code (multiplication of 32 bits vectors), converting that to float32 and accumulating the results ++++ * ++++ * ------------------------------------------------------------------------- ++++ * ++++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) ++++ * ++++ * GNSS-SDR is a software defined Global Navigation ++++ * Satellite Systems receiver ++++ * ++++ * This file is part of GNSS-SDR. 
++++ * ++++ * GNSS-SDR is free software: you can redistribute it and/or modify ++++ * it under the terms of the GNU General Public License as published by ++++ * the Free Software Foundation, either version 3 of the License, or ++++ * (at your option) any later version. ++++ * ++++ * GNSS-SDR is distributed in the hope that it will be useful, ++++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++++ * GNU General Public License for more details. ++++ * ++++ * You should have received a copy of the GNU General Public License ++++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>. ++++ * ++++ * ------------------------------------------------------------------------- ++++ */ ++++ ++++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_u_H ++++#define INCLUDED_gnsssdr_volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_u_H ++++ ++++#include ++++#include ++++#include ++++#include ++++#include ++++ ++++#ifdef LV_HAVE_SSE4_1 ++++#include "smmintrin.h" ++++#include "CommonMacros/CommonMacros_16ic_cw_epl_corr_32fc.h" ++++#include "CommonMacros/CommonMacros.h" ++++ /*! 
++++ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation (unaligned SSE4.1 version: inputs are read with _mm_lddqu_si128) ++++ \param input The input signal input ++++ \param carrier The carrier signal input ++++ \param VE_code Very Early PRN code replica input ++++ \param E_code Early PRN code replica input ++++ \param P_code Prompt PRN code replica input ++++ \param L_code Late PRN code replica input ++++ \param VL_code Very Late PRN code replica input ++++ \param VE_out Very Early correlation output ++++ \param E_out Early correlation output ++++ \param P_out Prompt correlation output ++++ \param L_out Late correlation output ++++ \param VL_out Very Late correlation output ++++ \param num_points The number of complex values in vectors ++++ */ ++++static inline void volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_u_sse4_1(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* VE_code, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, const lv_16sc_t* VL_code, unsigned int num_points) ++++{ ++++ const unsigned int sse_iters = num_points / 8; // each SSE iteration consumes 8 complex samples ++++ ++++ __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample; ++++ __m128i realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output; ++++ ++++ __m128 real_VE_code_acc, imag_VE_code_acc, real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc, real_VL_code_acc, imag_VL_code_acc; ++++ __m128i input_i_1, input_i_2, output_i32; ++++ __m128 real_output_ps, imag_output_ps; ++++ ++++ float VE_out_real = 0; ++++ float VE_out_imag = 0; ++++ float E_out_real = 0; ++++ float E_out_imag = 0; ++++ float P_out_real = 0; ++++ float P_out_imag = 0; ++++ float L_out_real = 0; ++++ float L_out_imag = 0; ++++ float VL_out_real = 0; ++++ float VL_out_imag = 0; ++++ ++++ const lv_16sc_t* input_ptr = 
input; ++++ const lv_16sc_t* carrier_ptr = carrier; ++++ ++++ const lv_16sc_t* VE_code_ptr = VE_code; ++++ lv_32fc_t* VE_out_ptr = VE_out; ++++ const lv_16sc_t* E_code_ptr = E_code; ++++ lv_32fc_t* E_out_ptr = E_out; ++++ const lv_16sc_t* L_code_ptr = L_code; ++++ lv_32fc_t* L_out_ptr = L_out; ++++ const lv_16sc_t* P_code_ptr = P_code; ++++ lv_32fc_t* P_out_ptr = P_out; ++++ const lv_16sc_t* VL_code_ptr = VL_code; ++++ lv_32fc_t* VL_out_ptr = VL_out; ++++ ++++ *VE_out_ptr = 0; ++++ *E_out_ptr = 0; ++++ *P_out_ptr = 0; ++++ *L_out_ptr = 0; ++++ *VL_out_ptr = 0; ++++ ++++ real_VE_code_acc = _mm_setzero_ps(); ++++ imag_VE_code_acc = _mm_setzero_ps(); ++++ real_E_code_acc = _mm_setzero_ps(); ++++ imag_E_code_acc = _mm_setzero_ps(); ++++ real_P_code_acc = _mm_setzero_ps(); ++++ imag_P_code_acc = _mm_setzero_ps(); ++++ real_L_code_acc = _mm_setzero_ps(); ++++ imag_L_code_acc = _mm_setzero_ps(); ++++ real_VL_code_acc = _mm_setzero_ps(); ++++ imag_VL_code_acc = _mm_setzero_ps(); ++++ ++++ if (sse_iters>0) ++++ { ++++ for(int number = 0;number < sse_iters; number++){ ++++ ++++ //Perform the carrier wipe-off: two 128-bit loads per vector fetch 8 samples (pointers advance 4 samples per load) ++++ x1 = _mm_lddqu_si128((__m128i*)input_ptr); ++++ input_ptr += 4; ++++ x2 = _mm_lddqu_si128((__m128i*)input_ptr); ++++ ++++ y1 = _mm_lddqu_si128((__m128i*)carrier_ptr); ++++ carrier_ptr += 4; ++++ y2 = _mm_lddqu_si128((__m128i*)carrier_ptr); ++++ ++++ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(x1, x2, realx, imagx) ++++ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy) ++++ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample) ++++ ++++ //Get very early values ++++ y1 = _mm_lddqu_si128((__m128i*)VE_code_ptr); ++++ VE_code_ptr += 4; ++++ y2 = _mm_lddqu_si128((__m128i*)VE_code_ptr); ++++ ++++ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, 
imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) ++++ ++++ real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps); ++++ imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps); ++++ ++++ //Get early values ++++ y1 = _mm_lddqu_si128((__m128i*)E_code_ptr); ++++ E_code_ptr += 4; ++++ y2 = _mm_lddqu_si128((__m128i*)E_code_ptr); ++++ ++++ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) ++++ ++++ real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps); ++++ imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps); ++++ ++++ //Get prompt values ++++ y1 = _mm_lddqu_si128((__m128i*)P_code_ptr); ++++ P_code_ptr += 4; ++++ y2 = _mm_lddqu_si128((__m128i*)P_code_ptr); ++++ ++++ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) ++++ ++++ real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps); ++++ imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps); ++++ ++++ //Get late values ++++ y1 = _mm_lddqu_si128((__m128i*)L_code_ptr); ++++ L_code_ptr += 4; ++++ y2 = _mm_lddqu_si128((__m128i*)L_code_ptr); ++++ ++++ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) ++++ ++++ real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps); ++++ imag_L_code_acc = 
_mm_add_ps (imag_L_code_acc, imag_output_ps); ++++ ++++ //Get very late values ++++ y1 = _mm_lddqu_si128((__m128i*)VL_code_ptr); ++++ VL_code_ptr += 4; ++++ y2 = _mm_lddqu_si128((__m128i*)VL_code_ptr); ++++ ++++ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) ++++ ++++ real_VL_code_acc = _mm_add_ps (real_VL_code_acc, real_output_ps); ++++ imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps); ++++ ++++ input_ptr += 4; // every pointer already moved 4 samples after its first load; skip the second block of 4 ++++ carrier_ptr += 4; ++++ VE_code_ptr += 4; ++++ E_code_ptr += 4; ++++ P_code_ptr += 4; ++++ L_code_ptr += 4; ++++ VL_code_ptr += 4; ++++ } ++++ ++++ __VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4]; ++++ ++++ _mm_storeu_ps((float*)real_VE_dotProductVector,real_VE_code_acc); // Store the results back into the dot product vector ++++ _mm_storeu_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc); // Store the results back into the dot product vector ++++ _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector ++++ _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector ++++ 
_mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector ++++ _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector ++++ _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector ++++ _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector ++++ _mm_storeu_ps((float*)real_VL_dotProductVector,real_VL_code_acc); // Store the results back into the dot product vector ++++ _mm_storeu_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc); // Store the results back into the dot product vector ++++ ++++ for (int i = 0; i<4; ++i) // sum the 4 float lanes of every accumulator ++++ { ++++ VE_out_real += real_VE_dotProductVector[i]; ++++ VE_out_imag += imag_VE_dotProductVector[i]; ++++ E_out_real += real_E_dotProductVector[i]; ++++ E_out_imag += imag_E_dotProductVector[i]; ++++ P_out_real += real_P_dotProductVector[i]; ++++ P_out_imag += imag_P_dotProductVector[i]; ++++ L_out_real += real_L_dotProductVector[i]; ++++ L_out_imag += imag_L_dotProductVector[i]; ++++ VL_out_real += real_VL_dotProductVector[i]; ++++ VL_out_imag += imag_VL_dotProductVector[i]; ++++ } ++++ *VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag); ++++ *E_out_ptr = lv_cmake(E_out_real, E_out_imag); ++++ *P_out_ptr = lv_cmake(P_out_real, P_out_imag); ++++ *L_out_ptr = lv_cmake(L_out_real, L_out_imag); ++++ *VL_out_ptr = lv_cmake(VL_out_real, VL_out_imag); ++++ } ++++ ++++ lv_16sc_t bb_signal_sample; ++++ for(int i=0; i < num_points%8; ++i) // scalar tail: the num_points % 8 samples not covered by the SSE loop ++++ { ++++ //Perform the carrier wipe-off ++++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); ++++ // Now get very early, early, prompt, late and very late values for each ++++ *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VE_code_ptr++)); ++++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); ++++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); ++++ *L_out_ptr 
+= (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); ++++ *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VL_code_ptr++)); ++++ } ++++ ++++} ++++#endif /* LV_HAVE_SSE4_1 */ ++++ ++++#ifdef LV_HAVE_GENERIC ++++/*! ++++ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation (generic scalar version) ++++ \param input The input signal input ++++ \param carrier The carrier signal input ++++ \param VE_code Very Early PRN code replica input ++++ \param E_code Early PRN code replica input ++++ \param P_code Prompt PRN code replica input ++++ \param L_code Late PRN code replica input ++++ \param VL_code Very Late PRN code replica input ++++ \param VE_out Very Early correlation output ++++ \param E_out Early correlation output ++++ \param P_out Prompt correlation output ++++ \param L_out Late correlation output ++++ \param VL_out Very Late correlation output ++++ \param num_points The number of complex values in vectors ++++ */ ++++static inline void volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* VE_code, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, const lv_16sc_t* VL_code, unsigned int num_points) ++++{ ++++ lv_16sc_t bb_signal_sample; ++++ lv_16sc_t tmp1; ++++ lv_16sc_t tmp2; ++++ lv_16sc_t tmp3; ++++ lv_16sc_t tmp4; ++++ lv_16sc_t tmp5; ++++ ++++ bb_signal_sample = lv_cmake(0, 0); ++++ ++++ *VE_out = 0; ++++ *E_out = 0; ++++ *P_out = 0; ++++ *L_out = 0; ++++ *VL_out = 0; ++++ // perform Very Early, Early, Prompt, Late and Very Late correlation ++++ ++++ for(int i=0; i < num_points; ++i) ++++ { ++++ //Perform the carrier wipe-off ++++ bb_signal_sample = input[i] * carrier[i]; ++++ ++++ tmp1 = bb_signal_sample * VE_code[i]; ++++ tmp2 = bb_signal_sample * E_code[i]; ++++ tmp3 = bb_signal_sample * P_code[i]; ++++ tmp4 = bb_signal_sample * L_code[i]; ++++ tmp5 = bb_signal_sample 
* VL_code[i]; ++++ ++++ // Now get very early, early, prompt, late and very late values for each ++++ *VE_out += (lv_32fc_t)tmp1; ++++ *E_out += (lv_32fc_t)tmp2; ++++ *P_out += (lv_32fc_t)tmp3; ++++ *L_out += (lv_32fc_t)tmp4; ++++ *VL_out += (lv_32fc_t)tmp5; ++++ } ++++} ++++#endif /* LV_HAVE_GENERIC */ ++++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_u_H */ ++++ ++++ ++++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_a_H ++++#define INCLUDED_gnsssdr_volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_a_H ++++ ++++#include ++++#include ++++#include ++++#include ++++#include ++++ ++++#ifdef LV_HAVE_SSE4_1 ++++#include "smmintrin.h" ++++#include "CommonMacros/CommonMacros_16ic_cw_epl_corr_32fc.h" ++++#include "CommonMacros/CommonMacros.h" ++++/*! ++++ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation (aligned SSE4.1 version: inputs are read with _mm_load_si128, so the input vectors are expected to be 16-byte aligned) ++++ \param input The input signal input ++++ \param carrier The carrier signal input ++++ \param VE_code Very Early PRN code replica input ++++ \param E_code Early PRN code replica input ++++ \param P_code Prompt PRN code replica input ++++ \param L_code Late PRN code replica input ++++ \param VL_code Very Late PRN code replica input ++++ \param VE_out Very Early correlation output ++++ \param E_out Early correlation output ++++ \param P_out Prompt correlation output ++++ \param L_out Late correlation output ++++ \param VL_out Very Late correlation output ++++ \param num_points The number of complex values in vectors ++++ */ ++++static inline void volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_a_sse4_1(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* VE_code, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, const lv_16sc_t* VL_code, unsigned int num_points) ++++{ ++++ const unsigned int sse_iters = num_points / 8; // each SSE iteration consumes 8 complex samples ++++ ++++ __m128i x1, x2, y1, y2, real_bb_signal_sample, 
imag_bb_signal_sample; ++++ __m128i realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output; ++++ ++++ __m128 real_VE_code_acc, imag_VE_code_acc, real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc, real_VL_code_acc, imag_VL_code_acc; ++++ __m128i input_i_1, input_i_2, output_i32; ++++ __m128 real_output_ps, imag_output_ps; ++++ ++++ float VE_out_real = 0; ++++ float VE_out_imag = 0; ++++ float E_out_real = 0; ++++ float E_out_imag = 0; ++++ float P_out_real = 0; ++++ float P_out_imag = 0; ++++ float L_out_real = 0; ++++ float L_out_imag = 0; ++++ float VL_out_real = 0; ++++ float VL_out_imag = 0; ++++ ++++ const lv_16sc_t* input_ptr = input; ++++ const lv_16sc_t* carrier_ptr = carrier; ++++ ++++ const lv_16sc_t* VE_code_ptr = VE_code; ++++ lv_32fc_t* VE_out_ptr = VE_out; ++++ const lv_16sc_t* E_code_ptr = E_code; ++++ lv_32fc_t* E_out_ptr = E_out; ++++ const lv_16sc_t* L_code_ptr = L_code; ++++ lv_32fc_t* L_out_ptr = L_out; ++++ const lv_16sc_t* P_code_ptr = P_code; ++++ lv_32fc_t* P_out_ptr = P_out; ++++ const lv_16sc_t* VL_code_ptr = VL_code; ++++ lv_32fc_t* VL_out_ptr = VL_out; ++++ ++++ *VE_out_ptr = 0; ++++ *E_out_ptr = 0; ++++ *P_out_ptr = 0; ++++ *L_out_ptr = 0; ++++ *VL_out_ptr = 0; ++++ ++++ real_VE_code_acc = _mm_setzero_ps(); ++++ imag_VE_code_acc = _mm_setzero_ps(); ++++ real_E_code_acc = _mm_setzero_ps(); ++++ imag_E_code_acc = _mm_setzero_ps(); ++++ real_P_code_acc = _mm_setzero_ps(); ++++ imag_P_code_acc = _mm_setzero_ps(); ++++ real_L_code_acc = _mm_setzero_ps(); ++++ imag_L_code_acc = _mm_setzero_ps(); ++++ real_VL_code_acc = _mm_setzero_ps(); ++++ imag_VL_code_acc = _mm_setzero_ps(); ++++ ++++ if (sse_iters>0) ++++ { ++++ for(int number = 0;number < sse_iters; number++){ ++++ ++++ //Perform the carrier wipe-off: two 128-bit loads per vector fetch 8 samples (pointers advance 4 samples per load) ++++ x1 = _mm_load_si128((__m128i*)input_ptr); ++++ input_ptr += 4; ++++ x2 = _mm_load_si128((__m128i*)input_ptr); ++++ 
++++ y1 = _mm_load_si128((__m128i*)carrier_ptr); ++++ carrier_ptr += 4; ++++ y2 = _mm_load_si128((__m128i*)carrier_ptr); ++++ ++++ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(x1, x2, realx, imagx) ++++ CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy) ++++ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample) ++++ ++++ //Get very early values ++++ y1 = _mm_load_si128((__m128i*)VE_code_ptr); ++++ VE_code_ptr += 4; ++++ y2 = _mm_load_si128((__m128i*)VE_code_ptr); ++++ ++++ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) ++++ ++++ real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps); ++++ imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps); ++++ ++++ //Get early values ++++ y1 = _mm_load_si128((__m128i*)E_code_ptr); ++++ E_code_ptr += 4; ++++ y2 = _mm_load_si128((__m128i*)E_code_ptr); ++++ ++++ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) ++++ ++++ real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps); ++++ imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps); ++++ ++++ //Get prompt values ++++ y1 = _mm_load_si128((__m128i*)P_code_ptr); ++++ P_code_ptr += 4; ++++ y2 = _mm_load_si128((__m128i*)P_code_ptr); ++++ ++++ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, 
input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) ++++ ++++ real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps); ++++ imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps); ++++ ++++ //Get late values ++++ y1 = _mm_load_si128((__m128i*)L_code_ptr); ++++ L_code_ptr += 4; ++++ y2 = _mm_load_si128((__m128i*)L_code_ptr); ++++ ++++ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) ++++ ++++ real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps); ++++ imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps); ++++ ++++ //Get very late values ++++ y1 = _mm_load_si128((__m128i*)VL_code_ptr); ++++ VL_code_ptr += 4; ++++ y2 = _mm_load_si128((__m128i*)VL_code_ptr); ++++ ++++ CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) ++++ ++++ real_VL_code_acc = _mm_add_ps (real_VL_code_acc, real_output_ps); ++++ imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps); ++++ ++++ input_ptr += 4; // every pointer already moved 4 samples after its first load; skip the second block of 4 ++++ carrier_ptr += 4; ++++ VE_code_ptr += 4; ++++ E_code_ptr += 4; ++++ P_code_ptr += 4; ++++ L_code_ptr += 4; ++++ VL_code_ptr += 4; ++++ } ++++ ++++ __VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) 
float imag_L_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4]; ++++ ++++ _mm_store_ps((float*)real_VE_dotProductVector,real_VE_code_acc); // Store the results back into the dot product vector ++++ _mm_store_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc); // Store the results back into the dot product vector ++++ _mm_store_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector ++++ _mm_store_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector ++++ _mm_store_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector ++++ _mm_store_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector ++++ _mm_store_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector ++++ _mm_store_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector ++++ _mm_store_ps((float*)real_VL_dotProductVector,real_VL_code_acc); // Store the results back into the dot product vector ++++ _mm_store_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc); // Store the results back into the dot product vector ++++ ++++ for (int i = 0; i<4; ++i) // sum the 4 float lanes of every accumulator ++++ { ++++ VE_out_real += real_VE_dotProductVector[i]; ++++ VE_out_imag += imag_VE_dotProductVector[i]; ++++ E_out_real += real_E_dotProductVector[i]; ++++ E_out_imag += imag_E_dotProductVector[i]; ++++ P_out_real += real_P_dotProductVector[i]; ++++ P_out_imag += imag_P_dotProductVector[i]; ++++ L_out_real += real_L_dotProductVector[i]; ++++ L_out_imag += imag_L_dotProductVector[i]; ++++ VL_out_real += real_VL_dotProductVector[i]; ++++ VL_out_imag += imag_VL_dotProductVector[i]; ++++ } ++++ *VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag); ++++ *E_out_ptr = 
lv_cmake(E_out_real, E_out_imag); ++++ *P_out_ptr = lv_cmake(P_out_real, P_out_imag); ++++ *L_out_ptr = lv_cmake(L_out_real, L_out_imag); ++++ *VL_out_ptr = lv_cmake(VL_out_real, VL_out_imag); ++++ } ++++ ++++ lv_16sc_t bb_signal_sample; ++++ for(int i=0; i < num_points%8; ++i) // scalar tail: the num_points % 8 samples not covered by the SSE loop ++++ { ++++ //Perform the carrier wipe-off ++++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); ++++ // Now get very early, early, prompt, late and very late values for each ++++ *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VE_code_ptr++)); ++++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); ++++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); ++++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); ++++ *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VL_code_ptr++)); ++++ } ++++ ++++} ++++#endif /* LV_HAVE_SSE4_1 */ ++++ ++++#ifdef LV_HAVE_GENERIC ++++/*! ++++ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation ++++ \param input The input signal input ++++ \param carrier The carrier signal input ++++ \param VE_code Very Early PRN code replica input ++++ \param E_code Early PRN code replica input ++++ \param P_code Prompt PRN code replica input ++++ \param L_code Late PRN code replica input ++++ \param VL_code Very Late PRN code replica input ++++ \param VE_out Very Early correlation output ++++ \param E_out Early correlation output ++++ \param P_out Prompt correlation output ++++ \param L_out Late correlation output ++++ \param VL_out Very Late correlation output ++++ \param num_points The number of complex values in vectors ++++ */ ++++static inline void volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_a_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* VE_code, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, const lv_16sc_t* VL_code, unsigned int num_points) ++++{ 
++++ lv_16sc_t bb_signal_sample; ++++ lv_16sc_t tmp1; ++++ lv_16sc_t tmp2; ++++ lv_16sc_t tmp3; ++++ lv_16sc_t tmp4; ++++ lv_16sc_t tmp5; ++++ ++++ bb_signal_sample = lv_cmake(0, 0); ++++ ++++ *VE_out = 0; ++++ *E_out = 0; ++++ *P_out = 0; ++++ *L_out = 0; ++++ *VL_out = 0; ++++ // perform Early, Prompt and Late correlation ++++ ++++ for(int i=0; i < num_points; ++i) ++++ { ++++ //Perform the carrier wipe-off ++++ bb_signal_sample = input[i] * carrier[i]; ++++ ++++ tmp1 = bb_signal_sample * VE_code[i]; ++++ tmp2 = bb_signal_sample * E_code[i]; ++++ tmp3 = bb_signal_sample * P_code[i]; ++++ tmp4 = bb_signal_sample * L_code[i]; ++++ tmp5 = bb_signal_sample * VL_code[i]; ++++ ++++ // Now get early, late, and prompt values for each ++++ *VE_out += (lv_32fc_t)tmp1; ++++ *E_out += (lv_32fc_t)tmp2; ++++ *P_out += (lv_32fc_t)tmp3; ++++ *L_out += (lv_32fc_t)tmp4; ++++ *VL_out += (lv_32fc_t)tmp5; ++++ } ++++} ++++#endif /* LV_HAVE_GENERIC */ ++++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_a_H */ +++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_accumulator_s32f.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32f_accumulator_s32f.h +++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_accumulator_s32f.h 1970-01-01 01:00:00.000000000 +0100 ++++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32f_accumulator_s32f.h 2014-10-15 01:55:08.000000000 +0200 +++@@ -0,0 +1,68 @@ ++++#ifndef INCLUDED_volk_gnsssdr_32f_accumulator_s32f_a_H ++++#define INCLUDED_volk_gnsssdr_32f_accumulator_s32f_a_H ++++ ++++#include ++++#include ++++#include ++++ ++++#ifdef LV_HAVE_SSE ++++#include ++++/*! 
++++ \brief Accumulates the values in the input buffer ++++ \param result The accumulated result ++++ \param inputBuffer The buffer of data to be accumulated ++++ \param num_points The number of values in inputBuffer to be accumulated ++++*/ ++++static inline void volk_gnsssdr_32f_accumulator_s32f_a_sse(float* result, const float* inputBuffer, unsigned int num_points){ ++++ float returnValue = 0; ++++ unsigned int number = 0; ++++ const unsigned int quarterPoints = num_points / 4; ++++ ++++ const float* aPtr = inputBuffer; ++++ __VOLK_ATTR_ALIGNED(16) float tempBuffer[4]; ++++ ++++ __m128 accumulator = _mm_setzero_ps(); ++++ __m128 aVal = _mm_setzero_ps(); ++++ ++++ for(;number < quarterPoints; number++){ ++++ aVal = _mm_load_ps(aPtr); ++++ accumulator = _mm_add_ps(accumulator, aVal); ++++ aPtr += 4; ++++ } ++++ _mm_store_ps(tempBuffer,accumulator); // Store the results back into the C container ++++ returnValue = tempBuffer[0]; ++++ returnValue += tempBuffer[1]; ++++ returnValue += tempBuffer[2]; ++++ returnValue += tempBuffer[3]; ++++ ++++ number = quarterPoints * 4; ++++ for(;number < num_points; number++){ ++++ returnValue += (*aPtr++); ++++ } ++++ *result = returnValue; ++++} ++++#endif /* LV_HAVE_SSE */ ++++ ++++#ifdef LV_HAVE_GENERIC ++++/*! 
++++ \brief Accumulates the values in the input buffer ++++ \param result The accumulated result ++++ \param inputBuffer The buffer of data to be accumulated ++++ \param num_points The number of values in inputBuffer to be accumulated ++++*/ ++++static inline void volk_gnsssdr_32f_accumulator_s32f_generic(float* result, const float* inputBuffer, unsigned int num_points){ ++++ const float* aPtr = inputBuffer; ++++ unsigned int number = 0; ++++ float returnValue = 0; ++++ ++++ for(;number < num_points; number++){ ++++ returnValue += (*aPtr++); ++++ } ++++ *result = returnValue; ++++} ++++#endif /* LV_HAVE_GENERIC */ ++++ ++++ ++++ ++++ ++++#endif /* INCLUDED_volk_gnsssdr_32f_accumulator_s32f_a_H */ +++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_index_max_16u.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32f_index_max_16u.h +++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_index_max_16u.h 1970-01-01 01:00:00.000000000 +0100 ++++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32f_index_max_16u.h 2014-10-15 01:55:08.000000000 +0200 +++@@ -0,0 +1,149 @@ ++++#ifndef INCLUDED_volk_gnsssdr_32f_index_max_16u_a_H ++++#define INCLUDED_volk_gnsssdr_32f_index_max_16u_a_H ++++ ++++#include ++++#include ++++#include ++++#include ++++ ++++#ifdef LV_HAVE_SSE4_1 ++++#include ++++ ++++static inline void volk_gnsssdr_32f_index_max_16u_a_sse4_1(unsigned int* target, const float* src0, unsigned int num_points) { ++++ if(num_points > 0){ ++++ unsigned int number = 0; ++++ const unsigned int quarterPoints = num_points / 4; ++++ ++++ float* inputPtr = (float*)src0; ++++ ++++ __m128 indexIncrementValues = _mm_set1_ps(4); ++++ __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4); ++++ ++++ float max = src0[0]; ++++ float index = 0; ++++ __m128 maxValues = _mm_set1_ps(max); ++++ __m128 maxValuesIndex = _mm_setzero_ps(); ++++ __m128 compareResults; ++++ __m128 
currentValues; ++++ ++++ __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; ++++ __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; ++++ ++++ for(;number < quarterPoints; number++){ ++++ ++++ currentValues = _mm_load_ps(inputPtr); inputPtr += 4; ++++ currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); ++++ ++++ compareResults = _mm_cmpgt_ps(maxValues, currentValues); ++++ ++++ maxValuesIndex = _mm_blendv_ps(currentIndexes, maxValuesIndex, compareResults); ++++ maxValues = _mm_blendv_ps(currentValues, maxValues, compareResults); ++++ } ++++ ++++ // Calculate the largest value from the remaining 4 points ++++ _mm_store_ps(maxValuesBuffer, maxValues); ++++ _mm_store_ps(maxIndexesBuffer, maxValuesIndex); ++++ ++++ for(number = 0; number < 4; number++){ ++++ if(maxValuesBuffer[number] > max){ ++++ index = maxIndexesBuffer[number]; ++++ max = maxValuesBuffer[number]; ++++ } ++++ } ++++ ++++ number = quarterPoints * 4; ++++ for(;number < num_points; number++){ ++++ if(src0[number] > max){ ++++ index = number; ++++ max = src0[number]; ++++ } ++++ } ++++ target[0] = (unsigned int)index; ++++ } ++++} ++++ ++++#endif /*LV_HAVE_SSE4_1*/ ++++ ++++#ifdef LV_HAVE_SSE ++++#include ++++ ++++static inline void volk_gnsssdr_32f_index_max_16u_a_sse(unsigned int* target, const float* src0, unsigned int num_points) { ++++ if(num_points > 0){ ++++ unsigned int number = 0; ++++ const unsigned int quarterPoints = num_points / 4; ++++ ++++ float* inputPtr = (float*)src0; ++++ ++++ __m128 indexIncrementValues = _mm_set1_ps(4); ++++ __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4); ++++ ++++ float max = src0[0]; ++++ float index = 0; ++++ __m128 maxValues = _mm_set1_ps(max); ++++ __m128 maxValuesIndex = _mm_setzero_ps(); ++++ __m128 compareResults; ++++ __m128 currentValues; ++++ ++++ __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; ++++ __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; ++++ ++++ for(;number < quarterPoints; number++){ ++++ ++++ currentValues = 
_mm_load_ps(inputPtr); inputPtr += 4; ++++ currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); ++++ ++++ compareResults = _mm_cmpgt_ps(maxValues, currentValues); ++++ ++++ maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, maxValuesIndex) , _mm_andnot_ps(compareResults, currentIndexes)); ++++ ++++ maxValues = _mm_or_ps(_mm_and_ps(compareResults, maxValues) , _mm_andnot_ps(compareResults, currentValues)); ++++ } ++++ ++++ // Calculate the largest value from the remaining 4 points ++++ _mm_store_ps(maxValuesBuffer, maxValues); ++++ _mm_store_ps(maxIndexesBuffer, maxValuesIndex); ++++ ++++ for(number = 0; number < 4; number++){ ++++ if(maxValuesBuffer[number] > max){ ++++ index = maxIndexesBuffer[number]; ++++ max = maxValuesBuffer[number]; ++++ } ++++ } ++++ ++++ number = quarterPoints * 4; ++++ for(;number < num_points; number++){ ++++ if(src0[number] > max){ ++++ index = number; ++++ max = src0[number]; ++++ } ++++ } ++++ target[0] = (unsigned int)index; ++++ } ++++} ++++ ++++#endif /*LV_HAVE_SSE*/ ++++ ++++#ifdef LV_HAVE_GENERIC ++++static inline void volk_gnsssdr_32f_index_max_16u_generic(unsigned int* target, const float* src0, unsigned int num_points) { ++++ if(num_points > 0){ ++++ float max = src0[0]; ++++ unsigned int index = 0; ++++ ++++ unsigned int i = 1; ++++ ++++ for(; i < num_points; ++i) { ++++ ++++ if(src0[i] > max){ ++++ index = i; ++++ max = src0[i]; ++++ } ++++ ++++ } ++++ target[0] = index; ++++ } ++++} ++++ ++++#endif /*LV_HAVE_GENERIC*/ ++++ ++++ ++++#endif /*INCLUDED_volk_gnsssdr_32f_index_max_16u_a_H*/ +++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_s32f_convert_16i.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32f_s32f_convert_16i.h +++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_s32f_convert_16i.h 1970-01-01 01:00:00.000000000 +0100 ++++++ 
/Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32f_s32f_convert_16i.h 2014-10-15 01:55:08.000000000 +0200 +++@@ -0,0 +1,302 @@ ++++#ifndef INCLUDED_volk_gnsssdr_32f_s32f_convert_16i_u_H ++++#define INCLUDED_volk_gnsssdr_32f_s32f_convert_16i_u_H ++++ ++++#include ++++#include ++++#include ++++ ++++#ifdef LV_HAVE_SSE2 ++++#include ++++ /*! ++++ \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value ++++ \param inputVector The floating point input data buffer ++++ \param outputVector The 16 bit output data buffer ++++ \param scalar The value multiplied against each point in the input buffer ++++ \param num_points The number of data values to be converted ++++ \note Input buffer does NOT need to be properly aligned ++++ */ ++++static inline void volk_gnsssdr_32f_s32f_convert_16i_u_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ ++++ unsigned int number = 0; ++++ ++++ const unsigned int eighthPoints = num_points / 8; ++++ ++++ const float* inputVectorPtr = (const float*)inputVector; ++++ int16_t* outputVectorPtr = outputVector; ++++ ++++ float min_val = -32768; ++++ float max_val = 32767; ++++ float r; ++++ ++++ __m128 vScalar = _mm_set_ps1(scalar); ++++ __m128 inputVal1, inputVal2; ++++ __m128i intInputVal1, intInputVal2; ++++ __m128 ret1, ret2; ++++ __m128 vmin_val = _mm_set_ps1(min_val); ++++ __m128 vmax_val = _mm_set_ps1(max_val); ++++ ++++ for(;number < eighthPoints; number++){ ++++ inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; ++++ inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4; ++++ ++++ // Scale and clip ++++ ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); ++++ ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val); ++++ ++++ intInputVal1 = _mm_cvtps_epi32(ret1); ++++ intInputVal2 = _mm_cvtps_epi32(ret2); ++++ ++++ 
intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); ++++ ++++ _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); ++++ outputVectorPtr += 8; ++++ } ++++ ++++ number = eighthPoints * 8; ++++ for(; number < num_points; number++){ ++++ r = inputVector[number] * scalar; ++++ if(r > max_val) ++++ r = max_val; ++++ else if(r < min_val) ++++ r = min_val; ++++ outputVector[number] = (int16_t)rintf(r); ++++ } ++++} ++++#endif /* LV_HAVE_SSE2 */ ++++ ++++#ifdef LV_HAVE_SSE ++++#include ++++ /*! ++++ \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value ++++ \param inputVector The floating point input data buffer ++++ \param outputVector The 16 bit output data buffer ++++ \param scalar The value multiplied against each point in the input buffer ++++ \param num_points The number of data values to be converted ++++ \note Input buffer does NOT need to be properly aligned ++++ */ ++++static inline void volk_gnsssdr_32f_s32f_convert_16i_u_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ ++++ unsigned int number = 0; ++++ ++++ const unsigned int quarterPoints = num_points / 4; ++++ ++++ const float* inputVectorPtr = (const float*)inputVector; ++++ int16_t* outputVectorPtr = outputVector; ++++ ++++ float min_val = -32768; ++++ float max_val = 32767; ++++ float r; ++++ ++++ __m128 vScalar = _mm_set_ps1(scalar); ++++ __m128 ret; ++++ __m128 vmin_val = _mm_set_ps1(min_val); ++++ __m128 vmax_val = _mm_set_ps1(max_val); ++++ ++++ __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; ++++ ++++ for(;number < quarterPoints; number++){ ++++ ret = _mm_loadu_ps(inputVectorPtr); ++++ inputVectorPtr += 4; ++++ ++++ // Scale and clip ++++ ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); ++++ ++++ _mm_store_ps(outputFloatBuffer, ret); ++++ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]); ++++ *outputVectorPtr++ = 
(int16_t)rintf(outputFloatBuffer[1]); ++++ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]); ++++ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]); ++++ } ++++ ++++ number = quarterPoints * 4; ++++ for(; number < num_points; number++){ ++++ r = inputVector[number] * scalar; ++++ if(r > max_val) ++++ r = max_val; ++++ else if(r < min_val) ++++ r = min_val; ++++ outputVector[number] = (int16_t)rintf(r); ++++ } ++++} ++++#endif /* LV_HAVE_SSE */ ++++ ++++#ifdef LV_HAVE_GENERIC ++++ /*! ++++ \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value ++++ \param inputVector The floating point input data buffer ++++ \param outputVector The 16 bit output data buffer ++++ \param scalar The value multiplied against each point in the input buffer ++++ \param num_points The number of data values to be converted ++++ \note Input buffer does NOT need to be properly aligned ++++ */ ++++static inline void volk_gnsssdr_32f_s32f_convert_16i_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ ++++ int16_t* outputVectorPtr = outputVector; ++++ const float* inputVectorPtr = inputVector; ++++ unsigned int number = 0; ++++ float min_val = -32768; ++++ float max_val = 32767; ++++ float r; ++++ ++++ for(number = 0; number < num_points; number++){ ++++ r = *inputVectorPtr++ * scalar; ++++ if(r > max_val) ++++ r = max_val; ++++ else if(r < min_val) ++++ r = min_val; ++++ *outputVectorPtr++ = (int16_t)rintf(r); ++++ } ++++} ++++#endif /* LV_HAVE_GENERIC */ ++++ ++++ ++++ ++++ ++++#endif /* INCLUDED_volk_gnsssdr_32f_s32f_convert_16i_u_H */ ++++#ifndef INCLUDED_volk_gnsssdr_32f_s32f_convert_16i_a_H ++++#define INCLUDED_volk_gnsssdr_32f_s32f_convert_16i_a_H ++++ ++++#include ++++#include ++++#include ++++#include ++++ ++++#ifdef LV_HAVE_SSE2 ++++#include ++++ /*! 
++++ \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value ++++ \param inputVector The floating point input data buffer ++++ \param outputVector The 16 bit output data buffer ++++ \param scalar The value multiplied against each point in the input buffer ++++ \param num_points The number of data values to be converted ++++ */ ++++static inline void volk_gnsssdr_32f_s32f_convert_16i_a_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ ++++ unsigned int number = 0; ++++ ++++ const unsigned int eighthPoints = num_points / 8; ++++ ++++ const float* inputVectorPtr = (const float*)inputVector; ++++ int16_t* outputVectorPtr = outputVector; ++++ ++++ float min_val = -32768; ++++ float max_val = 32767; ++++ float r; ++++ ++++ __m128 vScalar = _mm_set_ps1(scalar); ++++ __m128 inputVal1, inputVal2; ++++ __m128i intInputVal1, intInputVal2; ++++ __m128 ret1, ret2; ++++ __m128 vmin_val = _mm_set_ps1(min_val); ++++ __m128 vmax_val = _mm_set_ps1(max_val); ++++ ++++ for(;number < eighthPoints; number++){ ++++ inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; ++++ inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4; ++++ ++++ // Scale and clip ++++ ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val); ++++ ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val); ++++ ++++ intInputVal1 = _mm_cvtps_epi32(ret1); ++++ intInputVal2 = _mm_cvtps_epi32(ret2); ++++ ++++ intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); ++++ ++++ _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); ++++ outputVectorPtr += 8; ++++ } ++++ ++++ number = eighthPoints * 8; ++++ for(; number < num_points; number++){ ++++ r = inputVector[number] * scalar; ++++ if(r > max_val) ++++ r = max_val; ++++ else if(r < min_val) ++++ r = min_val; ++++ outputVector[number] = (int16_t)rintf(r); ++++ } ++++} ++++#endif /* 
LV_HAVE_SSE2 */ ++++ ++++#ifdef LV_HAVE_SSE ++++#include ++++ /*! ++++ \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value ++++ \param inputVector The floating point input data buffer ++++ \param outputVector The 16 bit output data buffer ++++ \param scalar The value multiplied against each point in the input buffer ++++ \param num_points The number of data values to be converted ++++ */ ++++static inline void volk_gnsssdr_32f_s32f_convert_16i_a_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ ++++ unsigned int number = 0; ++++ ++++ const unsigned int quarterPoints = num_points / 4; ++++ ++++ const float* inputVectorPtr = (const float*)inputVector; ++++ int16_t* outputVectorPtr = outputVector; ++++ ++++ float min_val = -32768; ++++ float max_val = 32767; ++++ float r; ++++ ++++ __m128 vScalar = _mm_set_ps1(scalar); ++++ __m128 ret; ++++ __m128 vmin_val = _mm_set_ps1(min_val); ++++ __m128 vmax_val = _mm_set_ps1(max_val); ++++ ++++ __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4]; ++++ ++++ for(;number < quarterPoints; number++){ ++++ ret = _mm_load_ps(inputVectorPtr); ++++ inputVectorPtr += 4; ++++ ++++ // Scale and clip ++++ ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val); ++++ ++++ _mm_store_ps(outputFloatBuffer, ret); ++++ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]); ++++ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]); ++++ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]); ++++ *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]); ++++ } ++++ ++++ number = quarterPoints * 4; ++++ for(; number < num_points; number++){ ++++ r = inputVector[number] * scalar; ++++ if(r > max_val) ++++ r = max_val; ++++ else if(r < min_val) ++++ r = min_val; ++++ outputVector[number] = (int16_t)rintf(r); ++++ } ++++} ++++#endif /* LV_HAVE_SSE */ ++++ ++++#ifdef LV_HAVE_GENERIC ++++ /*! 
++++ \brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value ++++ \param inputVector The floating point input data buffer ++++ \param outputVector The 16 bit output data buffer ++++ \param scalar The value multiplied against each point in the input buffer ++++ \param num_points The number of data values to be converted ++++ */ ++++static inline void volk_gnsssdr_32f_s32f_convert_16i_a_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){ ++++ int16_t* outputVectorPtr = outputVector; ++++ const float* inputVectorPtr = inputVector; ++++ unsigned int number = 0; ++++ float min_val = -32768; ++++ float max_val = 32767; ++++ float r; ++++ ++++ for(number = 0; number < num_points; number++){ ++++ r = *inputVectorPtr++ * scalar; ++++ if(r < min_val) ++++ r = min_val; ++++ else if(r > max_val) ++++ r = max_val; ++++ *outputVectorPtr++ = (int16_t)rintf(r); ++++ } ++++} ++++#endif /* LV_HAVE_GENERIC */ ++++ ++++ ++++ ++++ ++++#endif /* INCLUDED_volk_gnsssdr_32f_s32f_convert_16i_a_H */ +++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_x2_add_32f.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32f_x2_add_32f.h +++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_x2_add_32f.h 1970-01-01 01:00:00.000000000 +0100 ++++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32f_x2_add_32f.h 2014-10-15 01:55:08.000000000 +0200 +++@@ -0,0 +1,147 @@ ++++#ifndef INCLUDED_volk_gnsssdr_32f_x2_add_32f_u_H ++++#define INCLUDED_volk_gnsssdr_32f_x2_add_32f_u_H ++++ ++++#include ++++#include ++++ ++++#ifdef LV_HAVE_SSE ++++#include ++++/*! 
++++ \brief Adds the two input vectors and store their results in the third vector ++++ \param cVector The vector where the results will be stored ++++ \param aVector One of the vectors to be added ++++ \param bVector One of the vectors to be added ++++ \param num_points The number of values in aVector and bVector to be added together and stored into cVector ++++*/ ++++static inline void volk_gnsssdr_32f_x2_add_32f_u_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ ++++ unsigned int number = 0; ++++ const unsigned int quarterPoints = num_points / 4; ++++ ++++ float* cPtr = cVector; ++++ const float* aPtr = aVector; ++++ const float* bPtr= bVector; ++++ ++++ __m128 aVal, bVal, cVal; ++++ for(;number < quarterPoints; number++){ ++++ ++++ aVal = _mm_loadu_ps(aPtr); ++++ bVal = _mm_loadu_ps(bPtr); ++++ ++++ cVal = _mm_add_ps(aVal, bVal); ++++ ++++ _mm_storeu_ps(cPtr,cVal); // Store the results back into the C container ++++ ++++ aPtr += 4; ++++ bPtr += 4; ++++ cPtr += 4; ++++ } ++++ ++++ number = quarterPoints * 4; ++++ for(;number < num_points; number++){ ++++ *cPtr++ = (*aPtr++) + (*bPtr++); ++++ } ++++} ++++#endif /* LV_HAVE_SSE */ ++++ ++++#ifdef LV_HAVE_GENERIC ++++/*! 
++++ \brief Adds the two input vectors and store their results in the third vector ++++ \param cVector The vector where the results will be stored ++++ \param aVector One of the vectors to be added ++++ \param bVector One of the vectors to be added ++++ \param num_points The number of values in aVector and bVector to be added together and stored into cVector ++++*/ ++++static inline void volk_gnsssdr_32f_x2_add_32f_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ ++++ float* cPtr = cVector; ++++ const float* aPtr = aVector; ++++ const float* bPtr= bVector; ++++ unsigned int number = 0; ++++ ++++ for(number = 0; number < num_points; number++){ ++++ *cPtr++ = (*aPtr++) + (*bPtr++); ++++ } ++++} ++++#endif /* LV_HAVE_GENERIC */ ++++ ++++#endif /* INCLUDED_volk_gnsssdr_32f_x2_add_32f_u_H */ ++++#ifndef INCLUDED_volk_gnsssdr_32f_x2_add_32f_a_H ++++#define INCLUDED_volk_gnsssdr_32f_x2_add_32f_a_H ++++ ++++#include ++++#include ++++ ++++#ifdef LV_HAVE_SSE ++++#include ++++/*! 
++++ \brief Adds the two input vectors and store their results in the third vector ++++ \param cVector The vector where the results will be stored ++++ \param aVector One of the vectors to be added ++++ \param bVector One of the vectors to be added ++++ \param num_points The number of values in aVector and bVector to be added together and stored into cVector ++++*/ ++++static inline void volk_gnsssdr_32f_x2_add_32f_a_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ ++++ unsigned int number = 0; ++++ const unsigned int quarterPoints = num_points / 4; ++++ ++++ float* cPtr = cVector; ++++ const float* aPtr = aVector; ++++ const float* bPtr= bVector; ++++ ++++ __m128 aVal, bVal, cVal; ++++ for(;number < quarterPoints; number++){ ++++ ++++ aVal = _mm_load_ps(aPtr); ++++ bVal = _mm_load_ps(bPtr); ++++ ++++ cVal = _mm_add_ps(aVal, bVal); ++++ ++++ _mm_store_ps(cPtr,cVal); // Store the results back into the C container ++++ ++++ aPtr += 4; ++++ bPtr += 4; ++++ cPtr += 4; ++++ } ++++ ++++ number = quarterPoints * 4; ++++ for(;number < num_points; number++){ ++++ *cPtr++ = (*aPtr++) + (*bPtr++); ++++ } ++++} ++++#endif /* LV_HAVE_SSE */ ++++ ++++#ifdef LV_HAVE_GENERIC ++++/*! 
++++ \brief Adds the two input vectors and store their results in the third vector ++++ \param cVector The vector where the results will be stored ++++ \param aVector One of the vectors to be added ++++ \param bVector One of the vectors to be added ++++ \param num_points The number of values in aVector and bVector to be added together and stored into cVector ++++*/ ++++static inline void volk_gnsssdr_32f_x2_add_32f_a_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ ++++ float* cPtr = cVector; ++++ const float* aPtr = aVector; ++++ const float* bPtr= bVector; ++++ unsigned int number = 0; ++++ ++++ for(number = 0; number < num_points; number++){ ++++ *cPtr++ = (*aPtr++) + (*bPtr++); ++++ } ++++} ++++#endif /* LV_HAVE_GENERIC */ ++++ ++++#ifdef LV_HAVE_ORC ++++/*! ++++ \brief Adds the two input vectors and store their results in the third vector ++++ \param cVector The vector where the results will be stored ++++ \param aVector One of the vectors to be added ++++ \param bVector One of the vectors to be added ++++ \param num_points The number of values in aVector and bVector to be added together and stored into cVector ++++*/ ++++extern void volk_gnsssdr_32f_x2_add_32f_a_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points); ++++static inline void volk_gnsssdr_32f_x2_add_32f_u_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){ ++++ volk_gnsssdr_32f_x2_add_32f_a_orc_impl(cVector, aVector, bVector, num_points); ++++} ++++#endif /* LV_HAVE_ORC */ ++++ ++++ ++++#endif /* INCLUDED_volk_gnsssdr_32f_x2_add_32f_a_H */ +++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_conjugate_32fc.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_conjugate_32fc.h +++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_conjugate_32fc.h 1970-01-01 01:00:00.000000000 +0100 
++++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_conjugate_32fc.h 2014-10-15 01:55:08.000000000 +0200 +++@@ -0,0 +1,127 @@ ++++#ifndef INCLUDED_volk_gnsssdr_32fc_conjugate_32fc_u_H ++++#define INCLUDED_volk_gnsssdr_32fc_conjugate_32fc_u_H ++++ ++++#include ++++#include ++++#include ++++#include ++++ ++++#ifdef LV_HAVE_SSE3 ++++#include ++++ /*! ++++ \brief Takes the conjugate of a complex vector. ++++ \param cVector The vector where the results will be stored ++++ \param aVector Vector to be conjugated ++++ \param num_points The number of complex values in aVector to be conjugated and stored into cVector ++++ */ ++++static inline void volk_gnsssdr_32fc_conjugate_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){ ++++ unsigned int number = 0; ++++ const unsigned int halfPoints = num_points / 2; ++++ ++++ __m128 x; ++++ lv_32fc_t* c = cVector; ++++ const lv_32fc_t* a = aVector; ++++ ++++ __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f); ++++ ++++ for(;number < halfPoints; number++){ ++++ ++++ x = _mm_loadu_ps((float*)a); // Load the complex data as ar,ai,br,bi ++++ ++++ x = _mm_xor_ps(x, conjugator); // conjugate register ++++ ++++ _mm_storeu_ps((float*)c,x); // Store the results back into the C container ++++ ++++ a += 2; ++++ c += 2; ++++ } ++++ ++++ if((num_points % 2) != 0) { ++++ *c = lv_conj(*a); ++++ } ++++} ++++#endif /* LV_HAVE_SSE3 */ ++++ ++++#ifdef LV_HAVE_GENERIC ++++ /*! ++++ \brief Takes the conjugate of a complex vector. 
++++ \param cVector The vector where the results will be stored ++++ \param aVector Vector to be conjugated ++++ \param num_points The number of complex values in aVector to be conjugated and stored into cVector ++++ */ ++++static inline void volk_gnsssdr_32fc_conjugate_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){ ++++ lv_32fc_t* cPtr = cVector; ++++ const lv_32fc_t* aPtr = aVector; ++++ unsigned int number = 0; ++++ ++++ for(number = 0; number < num_points; number++){ ++++ *cPtr++ = lv_conj(*aPtr++); ++++ } ++++} ++++#endif /* LV_HAVE_GENERIC */ ++++ ++++ ++++#endif /* INCLUDED_volk_gnsssdr_32fc_conjugate_32fc_u_H */ ++++#ifndef INCLUDED_volk_gnsssdr_32fc_conjugate_32fc_a_H ++++#define INCLUDED_volk_gnsssdr_32fc_conjugate_32fc_a_H ++++ ++++#include ++++#include ++++#include ++++#include ++++ ++++#ifdef LV_HAVE_SSE3 ++++#include ++++ /*! ++++ \brief Takes the conjugate of a complex vector. ++++ \param cVector The vector where the results will be stored ++++ \param aVector Vector to be conjugated ++++ \param num_points The number of complex values in aVector to be conjugated and stored into cVector ++++ */ ++++static inline void volk_gnsssdr_32fc_conjugate_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){ ++++ unsigned int number = 0; ++++ const unsigned int halfPoints = num_points / 2; ++++ ++++ __m128 x; ++++ lv_32fc_t* c = cVector; ++++ const lv_32fc_t* a = aVector; ++++ ++++ __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f); ++++ ++++ for(;number < halfPoints; number++){ ++++ ++++ x = _mm_load_ps((float*)a); // Load the complex data as ar,ai,br,bi ++++ ++++ x = _mm_xor_ps(x, conjugator); // conjugate register ++++ ++++ _mm_store_ps((float*)c,x); // Store the results back into the C container ++++ ++++ a += 2; ++++ c += 2; ++++ } ++++ ++++ if((num_points % 2) != 0) { ++++ *c = lv_conj(*a); ++++ } ++++} ++++#endif /* LV_HAVE_SSE3 */ ++++ ++++#ifdef LV_HAVE_GENERIC ++++ /*! 
++++ \brief Takes the conjugate of a complex vector. ++++ \param cVector The vector where the results will be stored ++++ \param aVector Vector to be conjugated ++++ \param num_points The number of complex values in aVector to be conjugated and stored into cVector ++++ */ ++++static inline void volk_gnsssdr_32fc_conjugate_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){ ++++ lv_32fc_t* cPtr = cVector; ++++ const lv_32fc_t* aPtr = aVector; ++++ unsigned int number = 0; ++++ ++++ for(number = 0; number < num_points; number++){ ++++ *cPtr++ = lv_conj(*aPtr++); ++++ } ++++} ++++#endif /* LV_HAVE_GENERIC */ ++++ ++++#endif /* INCLUDED_volk_gnsssdr_32fc_conjugate_32fc_a_H */ +++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_16ic.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_16ic.h +++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_16ic.h 1970-01-01 01:00:00.000000000 +0100 ++++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_16ic.h 2014-10-15 01:55:08.000000000 +0200 +++@@ -0,0 +1,295 @@ ++++/*! ++++ * \file volk_gnsssdr_32fc_convert_16ic.h ++++ * \brief Volk protokernel: converts float32 complex values to 16 integer complex values taking care of overflow ++++ * \authors
++++ *          <ul>
++++ *          <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
++++ *          </ul>
++++ *
++++ * -------------------------------------------------------------------------
++++ *
++++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
++++ *
++++ * GNSS-SDR is a software defined Global Navigation
++++ * Satellite Systems receiver
++++ *
++++ * This file is part of GNSS-SDR.
++++ *
++++ * GNSS-SDR is free software: you can redistribute it and/or modify
++++ * it under the terms of the GNU General Public License as published by
++++ * the Free Software Foundation, either version 3 of the License, or
++++ * (at your option) any later version.
++++ *
++++ * GNSS-SDR is distributed in the hope that it will be useful,
++++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++++ * GNU General Public License for more details.
++++ *
++++ * You should have received a copy of the GNU General Public License
++++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
++++ *
++++ * -------------------------------------------------------------------------
++++ */
++++
++++#ifndef INCLUDED_volk_gnsssdr_32fc_convert_16ic_u_H
++++#define INCLUDED_volk_gnsssdr_32fc_convert_16ic_u_H
++++
++++#include
++++#include
++++#include
++++
++++#ifdef LV_HAVE_SSE2
++++#include
++++/*!
++++ \brief Converts a float vector of 64 bits (32 bits each part) into a 32 integer vector (16 bits each part) ++++ \param inputVector The floating point input data buffer ++++ \param outputVector The 16 bit output data buffer ++++ \param num_points The number of data values to be converted ++++ */ ++++static inline void volk_gnsssdr_32fc_convert_16ic_u_sse2(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){ ++++ const unsigned int sse_iters = num_points/4; ++++ ++++ float* inputVectorPtr = (float*)inputVector; ++++ int16_t* outputVectorPtr = (int16_t*)outputVector; ++++ ++++ float min_val = -32768; ++++ float max_val = 32767; ++++ ++++ __m128 inputVal1, inputVal2; ++++ __m128i intInputVal1, intInputVal2; ++++ __m128 ret1, ret2; ++++ __m128 vmin_val = _mm_set_ps1(min_val); ++++ __m128 vmax_val = _mm_set_ps1(max_val); ++++ ++++ for(unsigned int i = 0;i < sse_iters; i++){ ++++ inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; ++++ inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; ++++ ++++ // Clip ++++ ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val); ++++ ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val); ++++ ++++ intInputVal1 = _mm_cvtps_epi32(ret1); ++++ intInputVal2 = _mm_cvtps_epi32(ret2); ++++ ++++ intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); ++++ ++++ _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); ++++ outputVectorPtr += 8; ++++ } ++++ ++++ for(unsigned int i = 0; i < (num_points%4)*2; i++){ ++++ if(inputVectorPtr[i] > max_val) ++++ inputVectorPtr[i] = max_val; ++++ else if(inputVectorPtr[i] < min_val) ++++ inputVectorPtr[i] = min_val; ++++ outputVectorPtr[i] = (int16_t)rintf(inputVectorPtr[i]); ++++ } ++++} ++++#endif /* LV_HAVE_SSE2 */ ++++ ++++#ifdef LV_HAVE_SSE ++++#include ++++/*! 
++++ \brief Converts a float vector of 64 bits (32 bits each part) into a 32 integer vector (16 bits each part) ++++ \param inputVector The floating point input data buffer ++++ \param outputVector The 16 bit output data buffer ++++ \param num_points The number of data values to be converted ++++ */ ++++static inline void volk_gnsssdr_32fc_convert_16ic_u_sse(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){ ++++ const unsigned int sse_iters = num_points/4; ++++ ++++ float* inputVectorPtr = (float*)inputVector; ++++ int16_t* outputVectorPtr = (int16_t*)outputVector; ++++ ++++ float min_val = -32768; ++++ float max_val = 32767; ++++ ++++ __m128 inputVal1, inputVal2; ++++ __m128i intInputVal1, intInputVal2; ++++ __m128 ret1, ret2; ++++ __m128 vmin_val = _mm_set_ps1(min_val); ++++ __m128 vmax_val = _mm_set_ps1(max_val); ++++ ++++ for(unsigned int i = 0;i < sse_iters; i++){ ++++ inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; ++++ inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; ++++ ++++ // Clip ++++ ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val); ++++ ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val); ++++ ++++ intInputVal1 = _mm_cvtps_epi32(ret1); ++++ intInputVal2 = _mm_cvtps_epi32(ret2); ++++ ++++ intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); ++++ ++++ _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1); ++++ outputVectorPtr += 8; ++++ } ++++ ++++ for(unsigned int i = 0; i < (num_points%4)*2; i++){ ++++ if(inputVectorPtr[i] > max_val) ++++ inputVectorPtr[i] = max_val; ++++ else if(inputVectorPtr[i] < min_val) ++++ inputVectorPtr[i] = min_val; ++++ outputVectorPtr[i] = (int16_t)rintf(inputVectorPtr[i]); ++++ } ++++} ++++#endif /* LV_HAVE_SSE */ ++++ ++++#ifdef LV_HAVE_GENERIC ++++/*! 
++++ \brief Converts a float vector of 64 bits (32 bits each part) into a 32 integer vector (16 bits each part) ++++ \param inputVector The floating point input data buffer ++++ \param outputVector The 16 bit output data buffer ++++ \param num_points The number of data values to be converted ++++ */ ++++static inline void volk_gnsssdr_32fc_convert_16ic_generic(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){ ++++ float* inputVectorPtr = (float*)inputVector; ++++ int16_t* outputVectorPtr = (int16_t*)outputVector; ++++ float min_val = -32768; ++++ float max_val = 32767; ++++ ++++ for(unsigned int i = 0; i < num_points*2; i++){ ++++ if(inputVectorPtr[i] > max_val) ++++ inputVectorPtr[i] = max_val; ++++ else if(inputVectorPtr[i] < min_val) ++++ inputVectorPtr[i] = min_val; ++++ outputVectorPtr[i] = (int16_t)rintf(inputVectorPtr[i]); ++++ } ++++} ++++#endif /* LV_HAVE_GENERIC */ ++++#endif /* INCLUDED_volk_gnsssdr_32fc_convert_16ic_u_H */ ++++ ++++ ++++#ifndef INCLUDED_volk_gnsssdr_32fc_convert_16ic_a_H ++++#define INCLUDED_volk_gnsssdr_32fc_convert_16ic_a_H ++++ ++++#include ++++#include ++++#include ++++#include ++++ ++++#ifdef LV_HAVE_SSE2 ++++#include ++++/*! 
++++ \brief Converts a float vector of 64 bits (32 bits each part) into a 32 integer vector (16 bits each part) ++++ \param inputVector The floating point input data buffer ++++ \param outputVector The 16 bit output data buffer ++++ \param num_points The number of data values to be converted ++++ */ ++++static inline void volk_gnsssdr_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){ ++++ const unsigned int sse_iters = num_points/4; ++++ ++++ float* inputVectorPtr = (float*)inputVector; ++++ int16_t* outputVectorPtr = (int16_t*)outputVector; ++++ ++++ float min_val = -32768; ++++ float max_val = 32767; ++++ ++++ __m128 inputVal1, inputVal2; ++++ __m128i intInputVal1, intInputVal2; ++++ __m128 ret1, ret2; ++++ __m128 vmin_val = _mm_set_ps1(min_val); ++++ __m128 vmax_val = _mm_set_ps1(max_val); ++++ ++++ for(unsigned int i = 0;i < sse_iters; i++){ ++++ inputVal1 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; ++++ inputVal2 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; ++++ ++++ // Clip ++++ ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val); ++++ ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val); ++++ ++++ intInputVal1 = _mm_cvtps_epi32(ret1); ++++ intInputVal2 = _mm_cvtps_epi32(ret2); ++++ ++++ intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); ++++ ++++ _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); ++++ outputVectorPtr += 8; ++++ } ++++ ++++ for(unsigned int i = 0; i < (num_points%4)*2; i++){ ++++ if(inputVectorPtr[i] > max_val) ++++ inputVectorPtr[i] = max_val; ++++ else if(inputVectorPtr[i] < min_val) ++++ inputVectorPtr[i] = min_val; ++++ outputVectorPtr[i] = (int16_t)rintf(inputVectorPtr[i]); ++++ } ++++} ++++#endif /* LV_HAVE_SSE2 */ ++++ ++++#ifdef LV_HAVE_SSE ++++#include ++++/*! 
++++ \brief Converts a float vector of 64 bits (32 bits each part) into a 32 integer vector (16 bits each part) ++++ \param inputVector The floating point input data buffer ++++ \param outputVector The 16 bit output data buffer ++++ \param num_points The number of data values to be converted ++++ */ ++++static inline void volk_gnsssdr_32fc_convert_16ic_a_sse(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){ ++++ const unsigned int sse_iters = num_points/4; ++++ ++++ float* inputVectorPtr = (float*)inputVector; ++++ int16_t* outputVectorPtr = (int16_t*)outputVector; ++++ ++++ float min_val = -32768; ++++ float max_val = 32767; ++++ ++++ __m128 inputVal1, inputVal2; ++++ __m128i intInputVal1, intInputVal2; ++++ __m128 ret1, ret2; ++++ __m128 vmin_val = _mm_set_ps1(min_val); ++++ __m128 vmax_val = _mm_set_ps1(max_val); ++++ ++++ for(unsigned int i = 0;i < sse_iters; i++){ ++++ inputVal1 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; ++++ inputVal2 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; ++++ ++++ // Clip ++++ ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val); ++++ ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val); ++++ ++++ intInputVal1 = _mm_cvtps_epi32(ret1); ++++ intInputVal2 = _mm_cvtps_epi32(ret2); ++++ ++++ intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); ++++ ++++ _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1); ++++ outputVectorPtr += 8; ++++ } ++++ ++++ for(unsigned int i = 0; i < (num_points%4)*2; i++){ ++++ if(inputVectorPtr[i] > max_val) ++++ inputVectorPtr[i] = max_val; ++++ else if(inputVectorPtr[i] < min_val) ++++ inputVectorPtr[i] = min_val; ++++ outputVectorPtr[i] = (int16_t)rintf(inputVectorPtr[i]); ++++ } ++++} ++++#endif /* LV_HAVE_SSE */ ++++ ++++#ifdef LV_HAVE_GENERIC ++++/*! 
++++ \brief Converts a float vector of 64 bits (32 bits each part) into a 32 integer vector (16 bits each part) ++++ \param inputVector The floating point input data buffer ++++ \param outputVector The 16 bit output data buffer ++++ \param num_points The number of data values to be converted ++++ */ ++++static inline void volk_gnsssdr_32fc_convert_16ic_a_generic(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){ ++++ float* inputVectorPtr = (float*)inputVector; ++++ int16_t* outputVectorPtr = (int16_t*)outputVector; ++++ float min_val = -32768; ++++ float max_val = 32767; ++++ ++++ for(unsigned int i = 0; i < num_points*2; i++){ ++++ if(inputVectorPtr[i] > max_val) ++++ inputVectorPtr[i] = max_val; ++++ else if(inputVectorPtr[i] < min_val) ++++ inputVectorPtr[i] = min_val; ++++ outputVectorPtr[i] = (int16_t)rintf(inputVectorPtr[i]); ++++ } ++++} ++++#endif /* LV_HAVE_GENERIC */ ++++#endif /* INCLUDED_volk_gnsssdr_32fc_convert_16ic_a_H */ +++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_8ic.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_8ic.h +++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_8ic.h 1970-01-01 01:00:00.000000000 +0100 ++++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_8ic.h 2014-10-15 01:55:08.000000000 +0200 +++@@ -0,0 +1,213 @@ ++++/*! ++++ * \file volk_gnsssdr_32fc_convert_8ic.h ++++ * \brief Volk protokernel: converts float32 complex values to 8 integer complex values taking care of overflow ++++ * \authors
++++ *          <ul>
++++ *          <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
++++ *          </ul>
++++ *
++++ * -------------------------------------------------------------------------
++++ *
++++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
++++ *
++++ * GNSS-SDR is a software defined Global Navigation
++++ * Satellite Systems receiver
++++ *
++++ * This file is part of GNSS-SDR.
++++ *
++++ * GNSS-SDR is free software: you can redistribute it and/or modify
++++ * it under the terms of the GNU General Public License as published by
++++ * the Free Software Foundation, either version 3 of the License, or
++++ * (at your option) any later version.
++++ *
++++ * GNSS-SDR is distributed in the hope that it will be useful,
++++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++++ * GNU General Public License for more details.
++++ *
++++ * You should have received a copy of the GNU General Public License
++++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
++++ *
++++ * -------------------------------------------------------------------------
++++ */
++++
++++#ifndef INCLUDED_volk_gnsssdr_32fc_convert_8ic_u_H
++++#define INCLUDED_volk_gnsssdr_32fc_convert_8ic_u_H
++++
++++#include
++++#include
++++#include
++++
++++#ifdef LV_HAVE_SSE2
++++#include
++++/*!
++++ \brief Converts a float vector of 64 bits (32 bits each part) into a 16 integer vector (8 bits each part) ++++ \param inputVector The floating point input data buffer ++++ \param outputVector The 16 bit output data buffer ++++ \param num_points The number of data values to be converted ++++ */ ++++static inline void volk_gnsssdr_32fc_convert_8ic_u_sse2(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){ ++++ const unsigned int sse_iters = num_points/8; ++++ ++++ float* inputVectorPtr = (float*)inputVector; ++++ int8_t* outputVectorPtr = (int8_t*)outputVector; ++++ ++++ float min_val = -128; ++++ float max_val = 127; ++++ ++++ __m128 inputVal1, inputVal2, inputVal3, inputVal4; ++++ __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4; ++++ __m128i int8InputVal; ++++ __m128 ret1, ret2, ret3, ret4; ++++ __m128 vmin_val = _mm_set_ps1(min_val); ++++ __m128 vmax_val = _mm_set_ps1(max_val); ++++ ++++ for(unsigned int i = 0;i < sse_iters; i++){ ++++ inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; ++++ inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; ++++ inputVal3 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; ++++ inputVal4 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; ++++ ++++ // Clip ++++ ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val); ++++ ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val); ++++ ret3 = _mm_max_ps(_mm_min_ps(inputVal3, vmax_val), vmin_val); ++++ ret4 = _mm_max_ps(_mm_min_ps(inputVal4, vmax_val), vmin_val); ++++ ++++ intInputVal1 = _mm_cvtps_epi32(ret1); ++++ intInputVal2 = _mm_cvtps_epi32(ret2); ++++ intInputVal3 = _mm_cvtps_epi32(ret3); ++++ intInputVal4 = _mm_cvtps_epi32(ret4); ++++ ++++ intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); ++++ intInputVal2 = _mm_packs_epi32(intInputVal3, intInputVal4); ++++ int8InputVal = _mm_packs_epi16(intInputVal1, intInputVal2); ++++ ++++ 
_mm_storeu_si128((__m128i*)outputVectorPtr, int8InputVal); ++++ outputVectorPtr += 16; ++++ } ++++ ++++ for(unsigned int i = 0; i < (num_points%4)*4; i++){ ++++ if(inputVectorPtr[i] > max_val) ++++ inputVectorPtr[i] = max_val; ++++ else if(inputVectorPtr[i] < min_val) ++++ inputVectorPtr[i] = min_val; ++++ outputVectorPtr[i] = (int8_t)rintf(inputVectorPtr[i]); ++++ } ++++} ++++#endif /* LV_HAVE_SSE2 */ ++++ ++++#ifdef LV_HAVE_GENERIC ++++/*! ++++ \brief Converts a float vector of 64 bits (32 bits each part) into a 16 integer vector (8 bits each part) ++++ \param inputVector The floating point input data buffer ++++ \param outputVector The 16 bit output data buffer ++++ \param num_points The number of data values to be converted ++++ */ ++++static inline void volk_gnsssdr_32fc_convert_8ic_generic(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){ ++++ float* inputVectorPtr = (float*)inputVector; ++++ int8_t* outputVectorPtr = (int8_t*)outputVector; ++++ float min_val = -128; ++++ float max_val = 127; ++++ ++++ for(unsigned int i = 0; i < num_points*2; i++){ ++++ if(inputVectorPtr[i] > max_val) ++++ inputVectorPtr[i] = max_val; ++++ else if(inputVectorPtr[i] < min_val) ++++ inputVectorPtr[i] = min_val; ++++ outputVectorPtr[i] = (int8_t)rintf(inputVectorPtr[i]); ++++ } ++++} ++++#endif /* LV_HAVE_GENERIC */ ++++#endif /* INCLUDED_volk_gnsssdr_32fc_convert_8ic_u_H */ ++++ ++++ ++++#ifndef INCLUDED_volk_gnsssdr_32fc_convert_8ic_a_H ++++#define INCLUDED_volk_gnsssdr_32fc_convert_8ic_a_H ++++ ++++#include ++++#include ++++#include ++++#include ++++ ++++#ifdef LV_HAVE_SSE2 ++++#include ++++/*! 
++++ \brief Converts a float vector of 64 bits (32 bits each part) into a 16 integer vector (8 bits each part) ++++ \param inputVector The floating point input data buffer ++++ \param outputVector The 16 bit output data buffer ++++ \param num_points The number of data values to be converted ++++ */ ++++static inline void volk_gnsssdr_32fc_convert_8ic_a_sse2(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){ ++++ const unsigned int sse_iters = num_points/8; ++++ ++++ float* inputVectorPtr = (float*)inputVector; ++++ int8_t* outputVectorPtr = (int8_t*)outputVector; ++++ ++++ float min_val = -128; ++++ float max_val = 127; ++++ ++++ __m128 inputVal1, inputVal2, inputVal3, inputVal4; ++++ __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4; ++++ __m128i int8InputVal; ++++ __m128 ret1, ret2, ret3, ret4; ++++ __m128 vmin_val = _mm_set_ps1(min_val); ++++ __m128 vmax_val = _mm_set_ps1(max_val); ++++ ++++ for(unsigned int i = 0;i < sse_iters; i++){ ++++ inputVal1 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; ++++ inputVal2 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; ++++ inputVal3 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; ++++ inputVal4 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; ++++ ++++ // Clip ++++ ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val); ++++ ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val); ++++ ret3 = _mm_max_ps(_mm_min_ps(inputVal3, vmax_val), vmin_val); ++++ ret4 = _mm_max_ps(_mm_min_ps(inputVal4, vmax_val), vmin_val); ++++ ++++ intInputVal1 = _mm_cvtps_epi32(ret1); ++++ intInputVal2 = _mm_cvtps_epi32(ret2); ++++ intInputVal3 = _mm_cvtps_epi32(ret3); ++++ intInputVal4 = _mm_cvtps_epi32(ret4); ++++ ++++ intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); ++++ intInputVal2 = _mm_packs_epi32(intInputVal3, intInputVal4); ++++ int8InputVal = _mm_packs_epi16(intInputVal1, intInputVal2); ++++ ++++ 
_mm_store_si128((__m128i*)outputVectorPtr, int8InputVal); ++++ outputVectorPtr += 16; ++++ } ++++ ++++ for(unsigned int i = 0; i < (num_points%4)*4; i++){ ++++ if(inputVectorPtr[i] > max_val) ++++ inputVectorPtr[i] = max_val; ++++ else if(inputVectorPtr[i] < min_val) ++++ inputVectorPtr[i] = min_val; ++++ outputVectorPtr[i] = (int8_t)rintf(inputVectorPtr[i]); ++++ } ++++} ++++#endif /* LV_HAVE_SSE2 */ ++++ ++++#ifdef LV_HAVE_GENERIC ++++/*! ++++ \brief Converts a float vector of 64 bits (32 bits each part) into a 16 integer vector (8 bits each part) ++++ \param inputVector The floating point input data buffer ++++ \param outputVector The 16 bit output data buffer ++++ \param num_points The number of data values to be converted ++++ */ ++++static inline void volk_gnsssdr_32fc_convert_8ic_a_generic(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){ ++++ float* inputVectorPtr = (float*)inputVector; ++++ int8_t* outputVectorPtr = (int8_t*)outputVector; ++++ float min_val = -128; ++++ float max_val = 127; ++++ ++++ for(unsigned int i = 0; i < num_points*2; i++){ ++++ if(inputVectorPtr[i] > max_val) ++++ inputVectorPtr[i] = max_val; ++++ else if(inputVectorPtr[i] < min_val) ++++ inputVectorPtr[i] = min_val; ++++ outputVectorPtr[i] = (int8_t)rintf(inputVectorPtr[i]); ++++ } ++++} ++++#endif /* LV_HAVE_GENERIC */ ++++#endif /* INCLUDED_volk_gnsssdr_32fc_convert_8ic_a_H */ +++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_magnitude_squared_32f.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_magnitude_squared_32f.h +++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_magnitude_squared_32f.h 1970-01-01 01:00:00.000000000 +0100 ++++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_magnitude_squared_32f.h 2014-10-15 01:55:08.000000000 +0200 +++@@ -0,0 +1,228 @@ ++++#ifndef 
INCLUDED_volk_gnsssdr_32fc_magnitude_squared_32f_u_H ++++#define INCLUDED_volk_gnsssdr_32fc_magnitude_squared_32f_u_H ++++ ++++#include ++++#include ++++#include ++++ ++++#ifdef LV_HAVE_SSE3 ++++#include ++++ /*! ++++ \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector ++++ \param complexVector The vector containing the complex input values ++++ \param magnitudeVector The vector containing the real output values ++++ \param num_points The number of complex values in complexVector to be calculated and stored into cVector ++++ */ ++++static inline void volk_gnsssdr_32fc_magnitude_squared_32f_u_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ ++++ unsigned int number = 0; ++++ const unsigned int quarterPoints = num_points / 4; ++++ ++++ const float* complexVectorPtr = (float*)complexVector; ++++ float* magnitudeVectorPtr = magnitudeVector; ++++ ++++ __m128 cplxValue1, cplxValue2, result; ++++ for(;number < quarterPoints; number++){ ++++ cplxValue1 = _mm_loadu_ps(complexVectorPtr); ++++ complexVectorPtr += 4; ++++ ++++ cplxValue2 = _mm_loadu_ps(complexVectorPtr); ++++ complexVectorPtr += 4; ++++ ++++ cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values ++++ cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values ++++ ++++ result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values ++++ ++++ _mm_storeu_ps(magnitudeVectorPtr, result); ++++ magnitudeVectorPtr += 4; ++++ } ++++ ++++ number = quarterPoints * 4; ++++ for(; number < num_points; number++){ ++++ float val1Real = *complexVectorPtr++; ++++ float val1Imag = *complexVectorPtr++; ++++ *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); ++++ } ++++} ++++#endif /* LV_HAVE_SSE3 */ ++++ ++++#ifdef LV_HAVE_SSE ++++#include ++++ /*! 
++++ \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector ++++ \param complexVector The vector containing the complex input values ++++ \param magnitudeVector The vector containing the real output values ++++ \param num_points The number of complex values in complexVector to be calculated and stored into cVector ++++ */ ++++static inline void volk_gnsssdr_32fc_magnitude_squared_32f_u_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ ++++ unsigned int number = 0; ++++ const unsigned int quarterPoints = num_points / 4; ++++ ++++ const float* complexVectorPtr = (float*)complexVector; ++++ float* magnitudeVectorPtr = magnitudeVector; ++++ ++++ __m128 cplxValue1, cplxValue2, iValue, qValue, result; ++++ for(;number < quarterPoints; number++){ ++++ cplxValue1 = _mm_loadu_ps(complexVectorPtr); ++++ complexVectorPtr += 4; ++++ ++++ cplxValue2 = _mm_loadu_ps(complexVectorPtr); ++++ complexVectorPtr += 4; ++++ ++++ // Arrange in i1i2i3i4 format ++++ iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); ++++ // Arrange in q1q2q3q4 format ++++ qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); ++++ ++++ iValue = _mm_mul_ps(iValue, iValue); // Square the I values ++++ qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values ++++ ++++ result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values ++++ ++++ _mm_storeu_ps(magnitudeVectorPtr, result); ++++ magnitudeVectorPtr += 4; ++++ } ++++ ++++ number = quarterPoints * 4; ++++ for(; number < num_points; number++){ ++++ float val1Real = *complexVectorPtr++; ++++ float val1Imag = *complexVectorPtr++; ++++ *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); ++++ } ++++} ++++#endif /* LV_HAVE_SSE */ ++++ ++++#ifdef LV_HAVE_GENERIC ++++ /*! 
++++ \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector ++++ \param complexVector The vector containing the complex input values ++++ \param magnitudeVector The vector containing the real output values ++++ \param num_points The number of complex values in complexVector to be calculated and stored into cVector ++++ */ ++++static inline void volk_gnsssdr_32fc_magnitude_squared_32f_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ ++++ const float* complexVectorPtr = (float*)complexVector; ++++ float* magnitudeVectorPtr = magnitudeVector; ++++ unsigned int number = 0; ++++ for(number = 0; number < num_points; number++){ ++++ const float real = *complexVectorPtr++; ++++ const float imag = *complexVectorPtr++; ++++ *magnitudeVectorPtr++ = (real*real) + (imag*imag); ++++ } ++++} ++++#endif /* LV_HAVE_GENERIC */ ++++ ++++#endif /* INCLUDED_volk_gnsssdr_32fc_magnitude_32f_u_H */ ++++#ifndef INCLUDED_volk_gnsssdr_32fc_magnitude_squared_32f_a_H ++++#define INCLUDED_volk_gnsssdr_32fc_magnitude_squared_32f_a_H ++++ ++++#include ++++#include ++++#include ++++ ++++#ifdef LV_HAVE_SSE3 ++++#include ++++ /*! 
++++ \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector ++++ \param complexVector The vector containing the complex input values ++++ \param magnitudeVector The vector containing the real output values ++++ \param num_points The number of complex values in complexVector to be calculated and stored into cVector ++++ */ ++++static inline void volk_gnsssdr_32fc_magnitude_squared_32f_a_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ ++++ unsigned int number = 0; ++++ const unsigned int quarterPoints = num_points / 4; ++++ ++++ const float* complexVectorPtr = (float*)complexVector; ++++ float* magnitudeVectorPtr = magnitudeVector; ++++ ++++ __m128 cplxValue1, cplxValue2, result; ++++ for(;number < quarterPoints; number++){ ++++ cplxValue1 = _mm_load_ps(complexVectorPtr); ++++ complexVectorPtr += 4; ++++ ++++ cplxValue2 = _mm_load_ps(complexVectorPtr); ++++ complexVectorPtr += 4; ++++ ++++ cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values ++++ cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values ++++ ++++ result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values ++++ ++++ _mm_store_ps(magnitudeVectorPtr, result); ++++ magnitudeVectorPtr += 4; ++++ } ++++ ++++ number = quarterPoints * 4; ++++ for(; number < num_points; number++){ ++++ float val1Real = *complexVectorPtr++; ++++ float val1Imag = *complexVectorPtr++; ++++ *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); ++++ } ++++} ++++#endif /* LV_HAVE_SSE3 */ ++++ ++++#ifdef LV_HAVE_SSE ++++#include ++++ /*! 
++++ \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector ++++ \param complexVector The vector containing the complex input values ++++ \param magnitudeVector The vector containing the real output values ++++ \param num_points The number of complex values in complexVector to be calculated and stored into cVector ++++ */ ++++static inline void volk_gnsssdr_32fc_magnitude_squared_32f_a_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ ++++ unsigned int number = 0; ++++ const unsigned int quarterPoints = num_points / 4; ++++ ++++ const float* complexVectorPtr = (float*)complexVector; ++++ float* magnitudeVectorPtr = magnitudeVector; ++++ ++++ __m128 cplxValue1, cplxValue2, iValue, qValue, result; ++++ for(;number < quarterPoints; number++){ ++++ cplxValue1 = _mm_load_ps(complexVectorPtr); ++++ complexVectorPtr += 4; ++++ ++++ cplxValue2 = _mm_load_ps(complexVectorPtr); ++++ complexVectorPtr += 4; ++++ ++++ // Arrange in i1i2i3i4 format ++++ iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); ++++ // Arrange in q1q2q3q4 format ++++ qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); ++++ ++++ iValue = _mm_mul_ps(iValue, iValue); // Square the I values ++++ qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values ++++ ++++ result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values ++++ ++++ _mm_store_ps(magnitudeVectorPtr, result); ++++ magnitudeVectorPtr += 4; ++++ } ++++ ++++ number = quarterPoints * 4; ++++ for(; number < num_points; number++){ ++++ float val1Real = *complexVectorPtr++; ++++ float val1Imag = *complexVectorPtr++; ++++ *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); ++++ } ++++} ++++#endif /* LV_HAVE_SSE */ ++++ ++++#ifdef LV_HAVE_GENERIC ++++ /*! 
++++ \brief Calculates the magnitude squared of the complexVector and stores the results in the magnitudeVector ++++ \param complexVector The vector containing the complex input values ++++ \param magnitudeVector The vector containing the real output values ++++ \param num_points The number of complex values in complexVector to be calculated and stored into cVector ++++ */ ++++static inline void volk_gnsssdr_32fc_magnitude_squared_32f_a_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ ++++ const float* complexVectorPtr = (float*)complexVector; ++++ float* magnitudeVectorPtr = magnitudeVector; ++++ unsigned int number = 0; ++++ for(number = 0; number < num_points; number++){ ++++ const float real = *complexVectorPtr++; ++++ const float imag = *complexVectorPtr++; ++++ *magnitudeVectorPtr++ = (real*real) + (imag*imag); ++++ } ++++} ++++#endif /* LV_HAVE_GENERIC */ ++++ ++++#endif /* INCLUDED_volk_gnsssdr_32fc_magnitude_32f_a_H */ +++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32f_convert_8ic.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32f_convert_8ic.h +++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32f_convert_8ic.h 1970-01-01 01:00:00.000000000 +0100 ++++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32f_convert_8ic.h 2014-10-15 01:55:08.000000000 +0200 +++@@ -0,0 +1,231 @@ ++++/*! ++++ * \file volk_gnsssdr_32fc_s32f_convert_8ic.h ++++ * \brief Volk protokernel: converts float32 complex values to 8 integer complex values taking care of overflow ++++ * \authors
++++ *          <ul>
++++ *          <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
++++ *          </ul>
++++ * ++++ * ------------------------------------------------------------------------- ++++ * ++++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) ++++ * ++++ * GNSS-SDR is a software defined Global Navigation ++++ * Satellite Systems receiver ++++ * ++++ * This file is part of GNSS-SDR. ++++ * ++++ * GNSS-SDR is free software: you can redistribute it and/or modify ++++ * it under the terms of the GNU General Public License as published by ++++ * the Free Software Foundation, either version 3 of the License, or ++++ * at your option) any later version. ++++ * ++++ * GNSS-SDR is distributed in the hope that it will be useful, ++++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++++ * GNU General Public License for more details. ++++ * ++++ * You should have received a copy of the GNU General Public License ++++ * along with GNSS-SDR. If not, see . ++++ * ++++ * ------------------------------------------------------------------------- ++++ */ ++++ ++++#ifndef INCLUDED_volk_gnsssdr_32fc_s32f_convert_8ic_u_H ++++#define INCLUDED_volk_gnsssdr_32fc_s32f_convert_8ic_u_H ++++ ++++#include ++++#include ++++#include ++++ ++++#ifdef LV_HAVE_SSE2 ++++#include ++++/*! 
++++ \brief Converts a float vector of 64 bits (32 bits each part) into a 16 integer vector (8 bits each part) ++++ \param inputVector The floating point input data buffer ++++ \param outputVector The 16 bit output data buffer ++++ \param num_points The number of data values to be converted ++++ */ ++++static inline void volk_gnsssdr_32fc_s32f_convert_8ic_u_sse2(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, const float scalar, unsigned int num_points){ ++++ const unsigned int sse_iters = num_points/8; ++++ ++++ float* inputVectorPtr = (float*)inputVector; ++++ int8_t* outputVectorPtr = (int8_t*)outputVector; ++++ __m128 invScalar = _mm_set_ps1(1.0/scalar); ++++ ++++ float min_val = -128; ++++ float max_val = 127; ++++ ++++ __m128 inputVal1, inputVal2, inputVal3, inputVal4; ++++ __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4; ++++ __m128i int8InputVal; ++++ __m128 ret1, ret2, ret3, ret4; ++++ __m128 vmin_val = _mm_set_ps1(min_val); ++++ __m128 vmax_val = _mm_set_ps1(max_val); ++++ ++++ for(unsigned int i = 0;i < sse_iters; i++){ ++++ inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; ++++ inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; ++++ inputVal3 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; ++++ inputVal4 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; ++++ ++++ inputVal1 = _mm_mul_ps(inputVal1, invScalar); ++++ inputVal2 = _mm_mul_ps(inputVal2, invScalar); ++++ inputVal3 = _mm_mul_ps(inputVal3, invScalar); ++++ inputVal4 = _mm_mul_ps(inputVal4, invScalar); ++++ // Clip ++++ ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val); ++++ ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val); ++++ ret3 = _mm_max_ps(_mm_min_ps(inputVal3, vmax_val), vmin_val); ++++ ret4 = _mm_max_ps(_mm_min_ps(inputVal4, vmax_val), vmin_val); ++++ ++++ intInputVal1 = _mm_cvtps_epi32(ret1); ++++ intInputVal2 = _mm_cvtps_epi32(ret2); ++++ intInputVal3 = _mm_cvtps_epi32(ret3); 
++++ intInputVal4 = _mm_cvtps_epi32(ret4); ++++ ++++ intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); ++++ intInputVal2 = _mm_packs_epi32(intInputVal3, intInputVal4); ++++ int8InputVal = _mm_packs_epi16(intInputVal1, intInputVal2); ++++ ++++ _mm_storeu_si128((__m128i*)outputVectorPtr, int8InputVal); ++++ outputVectorPtr += 16; ++++ } ++++ ++++ float scaled = 0; ++++ for(unsigned int i = 0; i < (num_points%4)*4; i++){ ++++ scaled = inputVectorPtr[i]/scalar; ++++ if(scaled > max_val) ++++ scaled = max_val; ++++ else if(scaled < min_val) ++++ scaled = min_val; ++++ outputVectorPtr[i] = (int8_t)rintf(scaled); ++++ } ++++} ++++#endif /* LV_HAVE_SSE2 */ ++++ ++++#ifdef LV_HAVE_GENERIC ++++/*! ++++ \brief Converts a float vector of 64 bits (32 bits each part) into a 16 integer vector (8 bits each part) ++++ \param inputVector The floating point input data buffer ++++ \param outputVector The 16 bit output data buffer ++++ \param num_points The number of data values to be converted ++++ */ ++++static inline void volk_gnsssdr_32fc_s32f_convert_8ic_generic(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, const float scalar, unsigned int num_points){ ++++ float* inputVectorPtr = (float*)inputVector; ++++ int8_t* outputVectorPtr = (int8_t*)outputVector; ++++ float scaled = 0; ++++ float min_val = -128; ++++ float max_val = 127; ++++ ++++ for(unsigned int i = 0; i < num_points*2; i++){ ++++ scaled = (inputVectorPtr[i])/scalar; ++++ if(scaled > max_val) ++++ scaled = max_val; ++++ else if(scaled < min_val) ++++ scaled = min_val; ++++ outputVectorPtr[i] = (int8_t)rintf(scaled); ++++ } ++++} ++++#endif /* LV_HAVE_GENERIC */ ++++#endif /* INCLUDED_volk_gnsssdr_32fc_s32f_convert_8ic_u_H */ ++++ ++++ ++++#ifndef INCLUDED_volk_gnsssdr_32fc_s32f_convert_8ic_a_H ++++#define INCLUDED_volk_gnsssdr_32fc_s32f_convert_8ic_a_H ++++ ++++#include ++++#include ++++#include ++++#include ++++ ++++#ifdef LV_HAVE_SSE2 ++++#include ++++/*! 
++++ \brief Converts a float vector of 64 bits (32 bits each part) into a 16 integer vector (8 bits each part) ++++ \param inputVector The floating point input data buffer ++++ \param outputVector The 16 bit output data buffer ++++ \param num_points The number of data values to be converted ++++ */ ++++static inline void volk_gnsssdr_32fc_s32f_convert_8ic_a_sse2(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, const float scalar, unsigned int num_points){ ++++ const unsigned int sse_iters = num_points/8; ++++ ++++ float* inputVectorPtr = (float*)inputVector; ++++ int8_t* outputVectorPtr = (int8_t*)outputVector; ++++ __m128 invScalar = _mm_set_ps1(1.0/scalar); ++++ ++++ float min_val = -128; ++++ float max_val = 127; ++++ ++++ __m128 inputVal1, inputVal2, inputVal3, inputVal4; ++++ __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4; ++++ __m128i int8InputVal; ++++ __m128 ret1, ret2, ret3, ret4; ++++ __m128 vmin_val = _mm_set_ps1(min_val); ++++ __m128 vmax_val = _mm_set_ps1(max_val); ++++ ++++ for(unsigned int i = 0;i < sse_iters; i++){ ++++ inputVal1 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; ++++ inputVal2 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; ++++ inputVal3 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; ++++ inputVal4 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; ++++ ++++ inputVal1 = _mm_mul_ps(inputVal1, invScalar); ++++ inputVal2 = _mm_mul_ps(inputVal2, invScalar); ++++ inputVal3 = _mm_mul_ps(inputVal3, invScalar); ++++ inputVal4 = _mm_mul_ps(inputVal4, invScalar); ++++ // Clip ++++ ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val); ++++ ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val); ++++ ret3 = _mm_max_ps(_mm_min_ps(inputVal3, vmax_val), vmin_val); ++++ ret4 = _mm_max_ps(_mm_min_ps(inputVal4, vmax_val), vmin_val); ++++ ++++ intInputVal1 = _mm_cvtps_epi32(ret1); ++++ intInputVal2 = _mm_cvtps_epi32(ret2); ++++ intInputVal3 = _mm_cvtps_epi32(ret3); ++++ 
intInputVal4 = _mm_cvtps_epi32(ret4); ++++ ++++ intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2); ++++ intInputVal2 = _mm_packs_epi32(intInputVal3, intInputVal4); ++++ int8InputVal = _mm_packs_epi16(intInputVal1, intInputVal2); ++++ ++++ _mm_store_si128((__m128i*)outputVectorPtr, int8InputVal); ++++ outputVectorPtr += 16; ++++ } ++++ ++++ float scaled = 0; ++++ for(unsigned int i = 0; i < (num_points%4)*4; i++){ ++++ scaled = inputVectorPtr[i]/scalar; ++++ if(scaled > max_val) ++++ scaled = max_val; ++++ else if(scaled < min_val) ++++ scaled = min_val; ++++ outputVectorPtr[i] = (int8_t)rintf(scaled); ++++ } ++++} ++++#endif /* LV_HAVE_SSE2 */ ++++ ++++#ifdef LV_HAVE_GENERIC ++++/*! ++++ \brief Converts a float vector of 64 bits (32 bits each part) into a 16 integer vector (8 bits each part) ++++ \param inputVector The floating point input data buffer ++++ \param outputVector The 16 bit output data buffer ++++ \param num_points The number of data values to be converted ++++ */ ++++static inline void volk_gnsssdr_32fc_s32f_convert_8ic_a_generic(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, const float scalar, unsigned int num_points){ ++++ float* inputVectorPtr = (float*)inputVector; ++++ int8_t* outputVectorPtr = (int8_t*)outputVector; ++++ float scaled = 0; ++++ float min_val = -128; ++++ float max_val = 127; ++++ ++++ for(unsigned int i = 0; i < num_points*2; i++){ ++++ scaled = inputVectorPtr[i]/scalar; ++++ if(scaled > max_val) ++++ scaled = max_val; ++++ else if(scaled < min_val) ++++ scaled = min_val; ++++ outputVectorPtr[i] = (int8_t)rintf(scaled); ++++ } ++++} ++++#endif /* LV_HAVE_GENERIC */ ++++#endif /* INCLUDED_volk_gnsssdr_32fc_s32f_convert_8ic_a_H */ +++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc.h +++--- 
/Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc.h 1970-01-01 01:00:00.000000000 +0100 ++++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc.h 2014-10-15 01:55:08.000000000 +0200 +++@@ -0,0 +1,266 @@ ++++/*! ++++ * \file volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc ++++ * \brief Volk protokernel: replaces the tracking function for update_local_code ++++ * \authors
    ++++ *
  • Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com ++++ *
++++ * ++++ * Volk protokernel that replaces the tracking function for update_local_code ++++ * ++++ * ------------------------------------------------------------------------- ++++ * ++++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) ++++ * ++++ * GNSS-SDR is a software defined Global Navigation ++++ * Satellite Systems receiver ++++ * ++++ * This file is part of GNSS-SDR. ++++ * ++++ * GNSS-SDR is free software: you can redistribute it and/or modify ++++ * it under the terms of the GNU General Public License as published by ++++ * the Free Software Foundation, either version 3 of the License, or ++++ * at your option) any later version. ++++ * ++++ * GNSS-SDR is distributed in the hope that it will be useful, ++++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++++ * GNU General Public License for more details. ++++ * ++++ * You should have received a copy of the GNU General Public License ++++ * along with GNSS-SDR. If not, see . ++++ * ++++ * ------------------------------------------------------------------------- ++++ */ ++++ ++++#ifndef INCLUDED_volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_u_H ++++#define INCLUDED_volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_u_H ++++ ++++#include ++++#include ++++#include ++++#include ++++ ++++#ifdef LV_HAVE_SSE4_1 ++++#include ++++ /*! ++++ \brief Takes the conjugate of a complex vector. 
++++ \param cVector The vector where the results will be stored ++++ \param aVector Vector to be conjugated ++++ \param num_points The number of complex values in aVector to be conjugated and stored into cVector ++++ */ ++++static inline void volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_u_sse4_1(lv_32fc_t* d_very_early_code, const float d_very_early_late_spc_chips, const float code_length_half_chips, const float code_phase_step_half_chips, const float tcode_half_chips_input, const lv_32fc_t* d_ca_code, unsigned int num_points){ ++++ ++++// float* pointer1 = (float*)&d_very_early_late_spc_chips; ++++// *pointer1 = 1; ++++// float* pointer2 = (float*)&code_length_half_chips; ++++// *pointer2 = 6; ++++// float* pointer3 = (float*)&code_phase_step_half_chips; ++++// *pointer3 = 7; ++++// float* pointer4 = (float*)&tcode_half_chips_input; ++++// *pointer4 = 8; ++++ ++++ const unsigned int sse_iters = num_points / 4; ++++ ++++ __m128 tquot, fmod_num, fmod_result, associated_chip_index_array; ++++ ++++ __m128 tcode_half_chips_array = _mm_set_ps (tcode_half_chips_input+3*code_phase_step_half_chips, tcode_half_chips_input+2*code_phase_step_half_chips, tcode_half_chips_input+code_phase_step_half_chips, tcode_half_chips_input); ++++ __m128 code_phase_step_half_chips_array = _mm_set1_ps (code_phase_step_half_chips*4); ++++ __m128 d_very_early_late_spc_chips_Multiplied_by_2 = _mm_set1_ps (2*d_very_early_late_spc_chips); ++++ __m128 code_length_half_chips_array = _mm_set1_ps (code_length_half_chips); ++++ __m128 twos = _mm_set1_ps (2); ++++ __m128i associated_chip_index_array_int; ++++ ++++ __VOLK_ATTR_ALIGNED(16) int32_t output[4]; ++++ ++++ for (unsigned int i = 0; i < sse_iters; i++) ++++ { ++++ //fmod = numer - tquot * denom; tquot = numer/denom truncated ++++ //associated_chip_index = 2 + round(fmod(tcode_half_chips - 2*d_very_early_late_spc_chips, code_length_half_chips)); ++++ fmod_num = _mm_sub_ps (tcode_half_chips_array, d_very_early_late_spc_chips_Multiplied_by_2); 
++++ tquot = _mm_div_ps (fmod_num, code_length_half_chips_array); ++++ tquot = _mm_round_ps (tquot, (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) ); ++++ fmod_result = _mm_sub_ps (fmod_num, _mm_mul_ps (tquot, code_length_half_chips_array)); ++++ ++++ associated_chip_index_array = _mm_round_ps (fmod_result, (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC)); ++++ associated_chip_index_array = _mm_add_ps(twos, associated_chip_index_array); ++++ associated_chip_index_array_int = _mm_cvtps_epi32 (associated_chip_index_array); ++++ _mm_storeu_si128 ((__m128i*)output, associated_chip_index_array_int); ++++ ++++ //d_very_early_code[i] = d_ca_code[associated_chip_index]; ++++ *d_very_early_code++ = d_ca_code[output[0]]; ++++ *d_very_early_code++ = d_ca_code[output[1]]; ++++ *d_very_early_code++ = d_ca_code[output[2]]; ++++ *d_very_early_code++ = d_ca_code[output[3]]; ++++ ++++ //tcode_half_chips = tcode_half_chips + code_phase_step_half_chips; ++++ tcode_half_chips_array = _mm_add_ps (tcode_half_chips_array, code_phase_step_half_chips_array); ++++ } ++++ ++++ if (num_points%4!=0) ++++ { ++++ __VOLK_ATTR_ALIGNED(16) float tcode_half_chips_stored[4]; ++++ _mm_storeu_si128 ((__m128i*)tcode_half_chips_stored, tcode_half_chips_array); ++++ ++++ int associated_chip_index; ++++ float tcode_half_chips = tcode_half_chips_stored[0]; ++++ float d_very_early_late_spc_chips_multiplied_by_2 = 2*d_very_early_late_spc_chips; ++++ ++++ for (unsigned int i = 0; i < num_points%4; i++) ++++ { ++++ associated_chip_index = 2 + round(fmod(tcode_half_chips - d_very_early_late_spc_chips_multiplied_by_2, code_length_half_chips)); ++++ d_very_early_code[i] = d_ca_code[associated_chip_index]; ++++ tcode_half_chips = tcode_half_chips + code_phase_step_half_chips; ++++ } ++++ } ++++} ++++#endif /* LV_HAVE_SSE4_1 */ ++++ ++++#ifdef LV_HAVE_GENERIC ++++ /*! ++++ \brief Takes the conjugate of a complex vector. 
++++ \param cVector The vector where the results will be stored ++++ \param aVector Vector to be conjugated ++++ \param num_points The number of complex values in aVector to be conjugated and stored into cVector ++++ */ ++++static inline void volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_generic(lv_32fc_t* d_very_early_code, const float d_very_early_late_spc_chips, const float code_length_half_chips, const float code_phase_step_half_chips, const float tcode_half_chips_input, const lv_32fc_t* d_ca_code, unsigned int num_points){ ++++ ++++ float* pointer1 = (float*)&d_very_early_late_spc_chips; ++++ *pointer1 = 1; ++++ float* pointer2 = (float*)&code_length_half_chips; ++++ *pointer2 = 6; ++++ float* pointer3 = (float*)&code_phase_step_half_chips; ++++ *pointer3 = 7; ++++ float* pointer4 = (float*)&tcode_half_chips_input; ++++ *pointer4 = 8; ++++ ++++ int associated_chip_index; ++++ float tcode_half_chips = tcode_half_chips_input; ++++ float d_very_early_late_spc_chips_multiplied_by_2 = 2*d_very_early_late_spc_chips; ++++ ++++ for (unsigned int i = 0; i < num_points; i++) ++++ { ++++ associated_chip_index = 2 + round(fmod(tcode_half_chips - d_very_early_late_spc_chips_multiplied_by_2, code_length_half_chips)); ++++ d_very_early_code[i] = d_ca_code[associated_chip_index]; ++++ tcode_half_chips = tcode_half_chips + code_phase_step_half_chips; ++++ } ++++} ++++#endif /* LV_HAVE_GENERIC */ ++++ ++++ ++++#endif /* INCLUDED_volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_u_H */ ++++#ifndef INCLUDED_volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_a_H ++++#define INCLUDED_volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_a_H ++++ ++++#include ++++#include ++++#include ++++#include ++++ ++++#ifdef LV_HAVE_SSE4_1 ++++#include ++++ /*! ++++ \brief Takes the conjugate of a complex vector. 
++++ \param cVector The vector where the results will be stored ++++ \param aVector Vector to be conjugated ++++ \param num_points The number of complex values in aVector to be conjugated and stored into cVector ++++ */ ++++static inline void volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_a_sse4_1(lv_32fc_t* d_very_early_code, const float d_very_early_late_spc_chips, const float code_length_half_chips, const float code_phase_step_half_chips, const float tcode_half_chips_input, const lv_32fc_t* d_ca_code, unsigned int num_points){ ++++ ++++ // float* pointer1 = (float*)&d_very_early_late_spc_chips; ++++ // *pointer1 = 1; ++++ // float* pointer2 = (float*)&code_length_half_chips; ++++ // *pointer2 = 6; ++++ // float* pointer3 = (float*)&code_phase_step_half_chips; ++++ // *pointer3 = 7; ++++ // float* pointer4 = (float*)&tcode_half_chips_input; ++++ // *pointer4 = 8; ++++ ++++ const unsigned int sse_iters = num_points / 4; ++++ ++++ __m128 tquot, fmod_num, fmod_result, associated_chip_index_array; ++++ ++++ __m128 tcode_half_chips_array = _mm_set_ps (tcode_half_chips_input+3*code_phase_step_half_chips, tcode_half_chips_input+2*code_phase_step_half_chips, tcode_half_chips_input+code_phase_step_half_chips, tcode_half_chips_input); ++++ __m128 code_phase_step_half_chips_array = _mm_set1_ps (code_phase_step_half_chips*4); ++++ __m128 d_very_early_late_spc_chips_Multiplied_by_2 = _mm_set1_ps (2*d_very_early_late_spc_chips); ++++ __m128 code_length_half_chips_array = _mm_set1_ps (code_length_half_chips); ++++ __m128 twos = _mm_set1_ps (2); ++++ __m128i associated_chip_index_array_int; ++++ ++++ __VOLK_ATTR_ALIGNED(16) int32_t output[4]; ++++ ++++ for (unsigned int i = 0; i < sse_iters; i++) ++++ { ++++ //fmod = numer - tquot * denom; tquot = numer/denom truncated ++++ //associated_chip_index = 2 + round(fmod(tcode_half_chips - 2*d_very_early_late_spc_chips, code_length_half_chips)); ++++ fmod_num = _mm_sub_ps (tcode_half_chips_array, 
d_very_early_late_spc_chips_Multiplied_by_2); ++++ tquot = _mm_div_ps (fmod_num, code_length_half_chips_array); ++++ tquot = _mm_round_ps (tquot, (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) ); ++++ fmod_result = _mm_sub_ps (fmod_num, _mm_mul_ps (tquot, code_length_half_chips_array)); ++++ ++++ associated_chip_index_array = _mm_round_ps (fmod_result, (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC)); ++++ associated_chip_index_array = _mm_add_ps(twos, associated_chip_index_array); ++++ associated_chip_index_array_int = _mm_cvtps_epi32 (associated_chip_index_array); ++++ _mm_store_si128 ((__m128i*)output, associated_chip_index_array_int); ++++ ++++ //d_very_early_code[i] = d_ca_code[associated_chip_index]; ++++ *d_very_early_code++ = d_ca_code[output[0]]; ++++ *d_very_early_code++ = d_ca_code[output[1]]; ++++ *d_very_early_code++ = d_ca_code[output[2]]; ++++ *d_very_early_code++ = d_ca_code[output[3]]; ++++ ++++ //tcode_half_chips = tcode_half_chips + code_phase_step_half_chips; ++++ tcode_half_chips_array = _mm_add_ps (tcode_half_chips_array, code_phase_step_half_chips_array); ++++ } ++++ ++++ if (num_points%4!=0) ++++ { ++++ __VOLK_ATTR_ALIGNED(16) float tcode_half_chips_stored[4]; ++++ _mm_store_si128 ((__m128i*)tcode_half_chips_stored, tcode_half_chips_array); ++++ ++++ int associated_chip_index; ++++ float tcode_half_chips = tcode_half_chips_stored[0]; ++++ float d_very_early_late_spc_chips_multiplied_by_2 = 2*d_very_early_late_spc_chips; ++++ ++++ for (unsigned int i = 0; i < num_points%4; i++) ++++ { ++++ associated_chip_index = 2 + round(fmod(tcode_half_chips - d_very_early_late_spc_chips_multiplied_by_2, code_length_half_chips)); ++++ d_very_early_code[i] = d_ca_code[associated_chip_index]; ++++ tcode_half_chips = tcode_half_chips + code_phase_step_half_chips; ++++ } ++++ } ++++ ++++} ++++#endif /* LV_HAVE_SSE4_1 */ ++++ ++++#ifdef LV_HAVE_GENERIC ++++ /*! ++++ \brief Takes the conjugate of a complex vector. 
++++ \param cVector The vector where the results will be stored ++++ \param aVector Vector to be conjugated ++++ \param num_points The number of complex values in aVector to be conjugated and stored into cVector ++++ */ ++++static inline void volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_a_generic(lv_32fc_t* d_very_early_code, const float d_very_early_late_spc_chips, const float code_length_half_chips, const float code_phase_step_half_chips, const float tcode_half_chips_input, const lv_32fc_t* d_ca_code, unsigned int num_points){ ++++ ++++ // float* pointer1 = (float*)&d_very_early_late_spc_chips; ++++ // *pointer1 = 1; ++++ // float* pointer2 = (float*)&code_length_half_chips; ++++ // *pointer2 = 6; ++++ // float* pointer3 = (float*)&code_phase_step_half_chips; ++++ // *pointer3 = 7; ++++ // float* pointer4 = (float*)&tcode_half_chips_input; ++++ // *pointer4 = 8; ++++ ++++ int associated_chip_index; ++++ float tcode_half_chips = tcode_half_chips_input; ++++ float d_very_early_late_spc_chips_multiplied_by_2 = 2*d_very_early_late_spc_chips; ++++ ++++ for (unsigned int i = 0; i < num_points; i++) ++++ { ++++ associated_chip_index = 2 + round(fmod(tcode_half_chips - d_very_early_late_spc_chips_multiplied_by_2, code_length_half_chips)); ++++ d_very_early_code[i] = d_ca_code[associated_chip_index]; ++++ tcode_half_chips = tcode_half_chips + code_phase_step_half_chips; ++++ } ++++} ++++#endif /* LV_HAVE_GENERIC */ ++++ ++++#endif /* INCLUDED_volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_a_H */ +++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32fc_multiply_32fc.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32fc_multiply_32fc.h +++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32fc_multiply_32fc.h 1970-01-01 01:00:00.000000000 +0100 ++++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_s32fc_multiply_32fc.h 2014-10-15 
01:55:08.000000000 +0200 +++@@ -0,0 +1,178 @@ ++++#ifndef INCLUDED_volk_gnsssdr_32fc_s32fc_multiply_32fc_u_H ++++#define INCLUDED_volk_gnsssdr_32fc_s32fc_multiply_32fc_u_H ++++ ++++#include ++++#include ++++#include ++++#include ++++ ++++#ifdef LV_HAVE_SSE3 ++++#include ++++/*! ++++ \brief Multiplies the input vector by a scalar and stores the results in the third vector ++++ \param cVector The vector where the results will be stored ++++ \param aVector The vector to be multiplied ++++ \param scalar The complex scalar to multiply aVector ++++ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector ++++*/ ++++static inline void volk_gnsssdr_32fc_s32fc_multiply_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ ++++ unsigned int number = 0; ++++ const unsigned int halfPoints = num_points / 2; ++++ ++++ __m128 x, yl, yh, z, tmp1, tmp2; ++++ lv_32fc_t* c = cVector; ++++ const lv_32fc_t* a = aVector; ++++ ++++ // Set up constant scalar vector ++++ yl = _mm_set_ps1(lv_creal(scalar)); ++++ yh = _mm_set_ps1(lv_cimag(scalar)); ++++ ++++ for(;number < halfPoints; number++){ ++++ ++++ x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi ++++ ++++ tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ++++ ++++ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br ++++ ++++ tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++++ ++++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++++ ++++ _mm_storeu_ps((float*)c,z); // Store the results back into the C container ++++ ++++ a += 2; ++++ c += 2; ++++ } ++++ ++++ if((num_points % 2) != 0) { ++++ *c = (*a) * scalar; ++++ } ++++} ++++#endif /* LV_HAVE_SSE */ ++++ ++++#ifdef LV_HAVE_GENERIC ++++/*! 
++++ \brief Multiplies the input vector by a scalar and stores the results in the third vector ++++ \param cVector The vector where the results will be stored ++++ \param aVector The vector to be multiplied ++++ \param scalar The complex scalar to multiply aVector ++++ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector ++++*/ ++++static inline void volk_gnsssdr_32fc_s32fc_multiply_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ ++++ lv_32fc_t* cPtr = cVector; ++++ const lv_32fc_t* aPtr = aVector; ++++ unsigned int number = num_points; ++++ ++++ // unwrap loop ++++ while (number >= 8){ ++++ *cPtr++ = (*aPtr++) * scalar; ++++ *cPtr++ = (*aPtr++) * scalar; ++++ *cPtr++ = (*aPtr++) * scalar; ++++ *cPtr++ = (*aPtr++) * scalar; ++++ *cPtr++ = (*aPtr++) * scalar; ++++ *cPtr++ = (*aPtr++) * scalar; ++++ *cPtr++ = (*aPtr++) * scalar; ++++ *cPtr++ = (*aPtr++) * scalar; ++++ number -= 8; ++++ } ++++ ++++ // clean up any remaining ++++ while (number-- > 0) ++++ *cPtr++ = *aPtr++ * scalar; ++++} ++++#endif /* LV_HAVE_GENERIC */ ++++ ++++ ++++#endif /* INCLUDED_volk_gnsssdr_32fc_x2_multiply_32fc_u_H */ ++++#ifndef INCLUDED_volk_gnsssdr_32fc_s32fc_multiply_32fc_a_H ++++#define INCLUDED_volk_gnsssdr_32fc_s32fc_multiply_32fc_a_H ++++ ++++#include ++++#include ++++#include ++++#include ++++ ++++#ifdef LV_HAVE_SSE3 ++++#include ++++ /*! 
++++ \brief Multiplies the two input complex vectors and stores their results in the third vector ++++ \param cVector The vector where the results will be stored ++++ \param aVector One of the vectors to be multiplied ++++ \param bVector One of the vectors to be multiplied ++++ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector ++++ */ ++++static inline void volk_gnsssdr_32fc_s32fc_multiply_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ ++++ unsigned int number = 0; ++++ const unsigned int halfPoints = num_points / 2; ++++ ++++ __m128 x, yl, yh, z, tmp1, tmp2; ++++ lv_32fc_t* c = cVector; ++++ const lv_32fc_t* a = aVector; ++++ ++++ // Set up constant scalar vector ++++ yl = _mm_set_ps1(lv_creal(scalar)); ++++ yh = _mm_set_ps1(lv_cimag(scalar)); ++++ ++++ for(;number < halfPoints; number++){ ++++ ++++ x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi ++++ ++++ tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ++++ ++++ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br ++++ ++++ tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++++ ++++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++++ ++++ _mm_store_ps((float*)c,z); // Store the results back into the C container ++++ ++++ a += 2; ++++ c += 2; ++++ } ++++ ++++ if((num_points % 2) != 0) { ++++ *c = (*a) * scalar; ++++ } ++++} ++++#endif /* LV_HAVE_SSE */ ++++ ++++ ++++#ifdef LV_HAVE_GENERIC ++++ /*! 
++++ \brief Multiplies the two input complex vectors and stores their results in the third vector ++++ \param cVector The vector where the results will be stored ++++ \param aVector One of the vectors to be multiplied ++++ \param bVector One of the vectors to be multiplied ++++ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector ++++ */ ++++static inline void volk_gnsssdr_32fc_s32fc_multiply_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){ ++++ lv_32fc_t* cPtr = cVector; ++++ const lv_32fc_t* aPtr = aVector; ++++ unsigned int number = num_points; ++++ ++++ // unwrap loop ++++ while (number >= 8){ ++++ *cPtr++ = (*aPtr++) * scalar; ++++ *cPtr++ = (*aPtr++) * scalar; ++++ *cPtr++ = (*aPtr++) * scalar; ++++ *cPtr++ = (*aPtr++) * scalar; ++++ *cPtr++ = (*aPtr++) * scalar; ++++ *cPtr++ = (*aPtr++) * scalar; ++++ *cPtr++ = (*aPtr++) * scalar; ++++ *cPtr++ = (*aPtr++) * scalar; ++++ number -= 8; ++++ } ++++ ++++ // clean up any remaining ++++ while (number-- > 0) ++++ *cPtr++ = *aPtr++ * scalar; ++++} ++++#endif /* LV_HAVE_GENERIC */ ++++ ++++ ++++ ++++ ++++ ++++#endif /* INCLUDED_volk_gnsssdr_32fc_x2_multiply_32fc_a_H */ +++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_dot_prod_32fc.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_dot_prod_32fc.h +++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_dot_prod_32fc.h 1970-01-01 01:00:00.000000000 +0100 ++++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_dot_prod_32fc.h 2014-10-15 01:55:08.000000000 +0200 +++@@ -0,0 +1,763 @@ ++++#ifndef INCLUDED_volk_gnsssdr_32fc_x2_dot_prod_32fc_u_H ++++#define INCLUDED_volk_gnsssdr_32fc_x2_dot_prod_32fc_u_H ++++ ++++#include ++++#include ++++#include ++++#include ++++ ++++ ++++#ifdef LV_HAVE_GENERIC ++++ ++++ 
++++static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { ++++ ++++ float * res = (float*) result; ++++ float * in = (float*) input; ++++ float * tp = (float*) taps; ++++ unsigned int n_2_ccomplex_blocks = num_points/2; ++++ unsigned int isodd = num_points & 1; ++++ ++++ float sum0[2] = {0,0}; ++++ float sum1[2] = {0,0}; ++++ unsigned int i = 0; ++++ ++++ for(i = 0; i < n_2_ccomplex_blocks; ++i) { ++++ sum0[0] += in[0] * tp[0] - in[1] * tp[1]; ++++ sum0[1] += in[0] * tp[1] + in[1] * tp[0]; ++++ sum1[0] += in[2] * tp[2] - in[3] * tp[3]; ++++ sum1[1] += in[2] * tp[3] + in[3] * tp[2]; ++++ ++++ in += 4; ++++ tp += 4; ++++ } ++++ ++++ res[0] = sum0[0] + sum1[0]; ++++ res[1] = sum0[1] + sum1[1]; ++++ ++++ // Cleanup if we had an odd number of points ++++ for(i = 0; i < isodd; ++i) { ++++ *result += input[num_points - 1] * taps[num_points - 1]; ++++ } ++++} ++++ ++++#endif /*LV_HAVE_GENERIC*/ ++++ ++++ ++++ ++++#if LV_HAVE_SSE && LV_HAVE_64 ++++ ++++static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_u_sse_64(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { ++++ ++++ const unsigned int num_bytes = num_points*8; ++++ unsigned int isodd = num_points & 1; ++++ ++++ asm ++++ ( ++++ "# ccomplex_dotprod_generic (float* result, const float *input,\n\t" ++++ "# const float *taps, unsigned num_bytes)\n\t" ++++ "# float sum0 = 0;\n\t" ++++ "# float sum1 = 0;\n\t" ++++ "# float sum2 = 0;\n\t" ++++ "# float sum3 = 0;\n\t" ++++ "# do {\n\t" ++++ "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t" ++++ "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t" ++++ "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t" ++++ "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t" ++++ "# input += 4;\n\t" ++++ "# taps += 4; \n\t" ++++ "# } while (--n_2_ccomplex_blocks != 0);\n\t" ++++ "# result[0] = sum0 + sum2;\n\t" ++++ "# result[1] = 
sum1 + sum3;\n\t" ++++ "# TODO: prefetch and better scheduling\n\t" ++++ " xor %%r9, %%r9\n\t" ++++ " xor %%r10, %%r10\n\t" ++++ " movq %%rcx, %%rax\n\t" ++++ " movq %%rcx, %%r8\n\t" ++++ " movq %[rsi], %%r9\n\t" ++++ " movq %[rdx], %%r10\n\t" ++++ " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" ++++ " movups 0(%%r9), %%xmm0\n\t" ++++ " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" ++++ " movups 0(%%r10), %%xmm2\n\t" ++++ " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t" ++++ " shr $4, %%r8\n\t" ++++ " jmp .%=L1_test\n\t" ++++ " # 4 taps / loop\n\t" ++++ " # something like ?? cycles / loop\n\t" ++++ ".%=Loop1: \n\t" ++++ "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" ++++ "# movups (%%r9), %%xmmA\n\t" ++++ "# movups (%%r10), %%xmmB\n\t" ++++ "# movups %%xmmA, %%xmmZ\n\t" ++++ "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" ++++ "# mulps %%xmmB, %%xmmA\n\t" ++++ "# mulps %%xmmZ, %%xmmB\n\t" ++++ "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" ++++ "# xorps %%xmmPN, %%xmmA\n\t" ++++ "# movups %%xmmA, %%xmmZ\n\t" ++++ "# unpcklps %%xmmB, %%xmmA\n\t" ++++ "# unpckhps %%xmmB, %%xmmZ\n\t" ++++ "# movups %%xmmZ, %%xmmY\n\t" ++++ "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" ++++ "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" ++++ "# addps %%xmmZ, %%xmmA\n\t" ++++ "# addps %%xmmA, %%xmmC\n\t" ++++ "# A=xmm0, B=xmm2, Z=xmm4\n\t" ++++ "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" ++++ " movups 16(%%r9), %%xmm1\n\t" ++++ " movups %%xmm0, %%xmm4\n\t" ++++ " mulps %%xmm2, %%xmm0\n\t" ++++ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" ++++ " movups 16(%%r10), %%xmm3\n\t" ++++ " movups %%xmm1, %%xmm5\n\t" ++++ " addps %%xmm0, %%xmm6\n\t" ++++ " mulps %%xmm3, %%xmm1\n\t" ++++ " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" ++++ " addps %%xmm1, %%xmm6\n\t" ++++ " mulps %%xmm4, %%xmm2\n\t" ++++ " movups 32(%%r9), %%xmm0\n\t" ++++ " addps %%xmm2, %%xmm7\n\t" ++++ " mulps %%xmm5, %%xmm3\n\t" ++++ " add $32, %%r9\n\t" ++++ " movups 
32(%%r10), %%xmm2\n\t" ++++ " addps %%xmm3, %%xmm7\n\t" ++++ " add $32, %%r10\n\t" ++++ ".%=L1_test:\n\t" ++++ " dec %%rax\n\t" ++++ " jge .%=Loop1\n\t" ++++ " # We've handled the bulk of multiplies up to here.\n\t" ++++ " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" ++++ " # If so, we've got 2 more taps to do.\n\t" ++++ " and $1, %%r8\n\t" ++++ " je .%=Leven\n\t" ++++ " # The count was odd, do 2 more taps.\n\t" ++++ " # Note that we've already got mm0/mm2 preloaded\n\t" ++++ " # from the main loop.\n\t" ++++ " movups %%xmm0, %%xmm4\n\t" ++++ " mulps %%xmm2, %%xmm0\n\t" ++++ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" ++++ " addps %%xmm0, %%xmm6\n\t" ++++ " mulps %%xmm4, %%xmm2\n\t" ++++ " addps %%xmm2, %%xmm7\n\t" ++++ ".%=Leven:\n\t" ++++ " # neg inversor\n\t" ++++ " xorps %%xmm1, %%xmm1\n\t" ++++ " mov $0x80000000, %%r9\n\t" ++++ " movd %%r9, %%xmm1\n\t" ++++ " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" ++++ " # pfpnacc\n\t" ++++ " xorps %%xmm1, %%xmm6\n\t" ++++ " movups %%xmm6, %%xmm2\n\t" ++++ " unpcklps %%xmm7, %%xmm6\n\t" ++++ " unpckhps %%xmm7, %%xmm2\n\t" ++++ " movups %%xmm2, %%xmm3\n\t" ++++ " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" ++++ " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" ++++ " addps %%xmm2, %%xmm6\n\t" ++++ " # xmm6 = r1 i2 r3 i4\n\t" ++++ " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" ++++ " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? 
??\n\t" ++++ " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) to memory\n\t" ++++ : ++++ :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result) ++++ :"rax", "r8", "r9", "r10" ++++ ); ++++ ++++ ++++ if(isodd) { ++++ *result += input[num_points - 1] * taps[num_points - 1]; ++++ } ++++ ++++ return; ++++ ++++} ++++ ++++#endif /* LV_HAVE_SSE && LV_HAVE_64 */ ++++ ++++ ++++ ++++ ++++#ifdef LV_HAVE_SSE3 ++++ ++++#include ++++ ++++static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_u_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { ++++ ++++ lv_32fc_t dotProduct; ++++ memset(&dotProduct, 0x0, 2*sizeof(float)); ++++ ++++ unsigned int number = 0; ++++ const unsigned int halfPoints = num_points/2; ++++ unsigned int isodd = num_points & 1; ++++ ++++ __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal; ++++ ++++ const lv_32fc_t* a = input; ++++ const lv_32fc_t* b = taps; ++++ ++++ dotProdVal = _mm_setzero_ps(); ++++ ++++ for(;number < halfPoints; number++){ ++++ ++++ x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi ++++ y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di ++++ ++++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr ++++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di ++++ ++++ tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ++++ ++++ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br ++++ ++++ tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++++ ++++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++++ ++++ dotProdVal = _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together ++++ ++++ a += 2; ++++ b += 2; ++++ } ++++ ++++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2]; ++++ ++++ _mm_storeu_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector ++++ ++++ dotProduct += ( dotProductVector[0] + 
dotProductVector[1] ); ++++ ++++ if(isodd) { ++++ dotProduct += input[num_points - 1] * taps[num_points - 1]; ++++ } ++++ ++++ *result = dotProduct; ++++} ++++ ++++#endif /*LV_HAVE_SSE3*/ ++++ ++++#ifdef LV_HAVE_SSE4_1 ++++ ++++#include ++++ ++++static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_u_sse4_1(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { ++++ ++++ unsigned int i = 0; ++++ const unsigned int qtr_points = num_points/4; ++++ const unsigned int isodd = num_points & 3; ++++ ++++ __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1; ++++ float *p_input, *p_taps; ++++ __m64 *p_result; ++++ ++++ p_result = (__m64*)result; ++++ p_input = (float*)input; ++++ p_taps = (float*)taps; ++++ ++++ static const __m128i neg = {0x000000000000000080000000}; ++++ ++++ real0 = _mm_setzero_ps(); ++++ real1 = _mm_setzero_ps(); ++++ im0 = _mm_setzero_ps(); ++++ im1 = _mm_setzero_ps(); ++++ ++++ for(; i < qtr_points; ++i) { ++++ xmm0 = _mm_loadu_ps(p_input); ++++ xmm1 = _mm_loadu_ps(p_taps); ++++ ++++ p_input += 4; ++++ p_taps += 4; ++++ ++++ xmm2 = _mm_loadu_ps(p_input); ++++ xmm3 = _mm_loadu_ps(p_taps); ++++ ++++ p_input += 4; ++++ p_taps += 4; ++++ ++++ xmm4 = _mm_unpackhi_ps(xmm0, xmm2); ++++ xmm5 = _mm_unpackhi_ps(xmm1, xmm3); ++++ xmm0 = _mm_unpacklo_ps(xmm0, xmm2); ++++ xmm2 = _mm_unpacklo_ps(xmm1, xmm3); ++++ ++++ //imaginary vector from input ++++ xmm1 = _mm_unpackhi_ps(xmm0, xmm4); ++++ //real vector from input ++++ xmm3 = _mm_unpacklo_ps(xmm0, xmm4); ++++ //imaginary vector from taps ++++ xmm0 = _mm_unpackhi_ps(xmm2, xmm5); ++++ //real vector from taps ++++ xmm2 = _mm_unpacklo_ps(xmm2, xmm5); ++++ ++++ xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1); ++++ xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1); ++++ ++++ xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2); ++++ xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2); ++++ ++++ real0 = _mm_add_ps(xmm4, real0); ++++ real1 = _mm_add_ps(xmm5, real1); ++++ im0 = _mm_add_ps(xmm6, im0); ++++ im1 = 
_mm_add_ps(xmm7, im1); ++++ } ++++ ++++ real1 = _mm_xor_ps(real1, bit128_p(&neg)->float_vec); ++++ ++++ im0 = _mm_add_ps(im0, im1); ++++ real0 = _mm_add_ps(real0, real1); ++++ ++++ im0 = _mm_add_ps(im0, real0); ++++ ++++ _mm_storel_pi(p_result, im0); ++++ ++++ for(i = num_points-isodd; i < num_points; i++) { ++++ *result += input[i] * taps[i]; ++++ } ++++} ++++ ++++#endif /*LV_HAVE_SSE4_1*/ ++++ ++++ ++++ ++++ ++++#endif /*INCLUDED_volk_gnsssdr_32fc_x2_dot_prod_32fc_u_H*/ ++++#ifndef INCLUDED_volk_gnsssdr_32fc_x2_dot_prod_32fc_a_H ++++#define INCLUDED_volk_gnsssdr_32fc_x2_dot_prod_32fc_a_H ++++ ++++#include ++++#include ++++#include ++++#include ++++ ++++ ++++#ifdef LV_HAVE_GENERIC ++++ ++++ ++++static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_a_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { ++++ ++++ const unsigned int num_bytes = num_points*8; ++++ ++++ float * res = (float*) result; ++++ float * in = (float*) input; ++++ float * tp = (float*) taps; ++++ unsigned int n_2_ccomplex_blocks = num_bytes >> 4; ++++ unsigned int isodd = num_points & 1; ++++ ++++ float sum0[2] = {0,0}; ++++ float sum1[2] = {0,0}; ++++ unsigned int i = 0; ++++ ++++ for(i = 0; i < n_2_ccomplex_blocks; ++i) { ++++ sum0[0] += in[0] * tp[0] - in[1] * tp[1]; ++++ sum0[1] += in[0] * tp[1] + in[1] * tp[0]; ++++ sum1[0] += in[2] * tp[2] - in[3] * tp[3]; ++++ sum1[1] += in[2] * tp[3] + in[3] * tp[2]; ++++ ++++ in += 4; ++++ tp += 4; ++++ } ++++ ++++ res[0] = sum0[0] + sum1[0]; ++++ res[1] = sum0[1] + sum1[1]; ++++ ++++ for(i = 0; i < isodd; ++i) { ++++ *result += input[num_points - 1] * taps[num_points - 1]; ++++ } ++++} ++++ ++++#endif /*LV_HAVE_GENERIC*/ ++++ ++++ ++++#if LV_HAVE_SSE && LV_HAVE_64 ++++ ++++ ++++static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_a_sse_64(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { ++++ ++++ const unsigned int num_bytes = num_points*8; ++++ unsigned int 
isodd = num_points & 1; ++++ ++++ asm ++++ ( ++++ "# ccomplex_dotprod_generic (float* result, const float *input,\n\t" ++++ "# const float *taps, unsigned num_bytes)\n\t" ++++ "# float sum0 = 0;\n\t" ++++ "# float sum1 = 0;\n\t" ++++ "# float sum2 = 0;\n\t" ++++ "# float sum3 = 0;\n\t" ++++ "# do {\n\t" ++++ "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t" ++++ "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t" ++++ "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t" ++++ "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t" ++++ "# input += 4;\n\t" ++++ "# taps += 4; \n\t" ++++ "# } while (--n_2_ccomplex_blocks != 0);\n\t" ++++ "# result[0] = sum0 + sum2;\n\t" ++++ "# result[1] = sum1 + sum3;\n\t" ++++ "# TODO: prefetch and better scheduling\n\t" ++++ " xor %%r9, %%r9\n\t" ++++ " xor %%r10, %%r10\n\t" ++++ " movq %%rcx, %%rax\n\t" ++++ " movq %%rcx, %%r8\n\t" ++++ " movq %[rsi], %%r9\n\t" ++++ " movq %[rdx], %%r10\n\t" ++++ " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" ++++ " movaps 0(%%r9), %%xmm0\n\t" ++++ " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" ++++ " movaps 0(%%r10), %%xmm2\n\t" ++++ " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t" ++++ " shr $4, %%r8\n\t" ++++ " jmp .%=L1_test\n\t" ++++ " # 4 taps / loop\n\t" ++++ " # something like ?? 
cycles / loop\n\t" ++++ ".%=Loop1: \n\t" ++++ "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" ++++ "# movaps (%%r9), %%xmmA\n\t" ++++ "# movaps (%%r10), %%xmmB\n\t" ++++ "# movaps %%xmmA, %%xmmZ\n\t" ++++ "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" ++++ "# mulps %%xmmB, %%xmmA\n\t" ++++ "# mulps %%xmmZ, %%xmmB\n\t" ++++ "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" ++++ "# xorps %%xmmPN, %%xmmA\n\t" ++++ "# movaps %%xmmA, %%xmmZ\n\t" ++++ "# unpcklps %%xmmB, %%xmmA\n\t" ++++ "# unpckhps %%xmmB, %%xmmZ\n\t" ++++ "# movaps %%xmmZ, %%xmmY\n\t" ++++ "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" ++++ "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" ++++ "# addps %%xmmZ, %%xmmA\n\t" ++++ "# addps %%xmmA, %%xmmC\n\t" ++++ "# A=xmm0, B=xmm2, Z=xmm4\n\t" ++++ "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" ++++ " movaps 16(%%r9), %%xmm1\n\t" ++++ " movaps %%xmm0, %%xmm4\n\t" ++++ " mulps %%xmm2, %%xmm0\n\t" ++++ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" ++++ " movaps 16(%%r10), %%xmm3\n\t" ++++ " movaps %%xmm1, %%xmm5\n\t" ++++ " addps %%xmm0, %%xmm6\n\t" ++++ " mulps %%xmm3, %%xmm1\n\t" ++++ " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" ++++ " addps %%xmm1, %%xmm6\n\t" ++++ " mulps %%xmm4, %%xmm2\n\t" ++++ " movaps 32(%%r9), %%xmm0\n\t" ++++ " addps %%xmm2, %%xmm7\n\t" ++++ " mulps %%xmm5, %%xmm3\n\t" ++++ " add $32, %%r9\n\t" ++++ " movaps 32(%%r10), %%xmm2\n\t" ++++ " addps %%xmm3, %%xmm7\n\t" ++++ " add $32, %%r10\n\t" ++++ ".%=L1_test:\n\t" ++++ " dec %%rax\n\t" ++++ " jge .%=Loop1\n\t" ++++ " # We've handled the bulk of multiplies up to here.\n\t" ++++ " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" ++++ " # If so, we've got 2 more taps to do.\n\t" ++++ " and $1, %%r8\n\t" ++++ " je .%=Leven\n\t" ++++ " # The count was odd, do 2 more taps.\n\t" ++++ " # Note that we've already got mm0/mm2 preloaded\n\t" ++++ " # from the main loop.\n\t" ++++ " movaps %%xmm0, %%xmm4\n\t" ++++ " mulps %%xmm2, 
%%xmm0\n\t" ++++ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" ++++ " addps %%xmm0, %%xmm6\n\t" ++++ " mulps %%xmm4, %%xmm2\n\t" ++++ " addps %%xmm2, %%xmm7\n\t" ++++ ".%=Leven:\n\t" ++++ " # neg inversor\n\t" ++++ " xorps %%xmm1, %%xmm1\n\t" ++++ " mov $0x80000000, %%r9\n\t" ++++ " movd %%r9, %%xmm1\n\t" ++++ " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" ++++ " # pfpnacc\n\t" ++++ " xorps %%xmm1, %%xmm6\n\t" ++++ " movaps %%xmm6, %%xmm2\n\t" ++++ " unpcklps %%xmm7, %%xmm6\n\t" ++++ " unpckhps %%xmm7, %%xmm2\n\t" ++++ " movaps %%xmm2, %%xmm3\n\t" ++++ " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" ++++ " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" ++++ " addps %%xmm2, %%xmm6\n\t" ++++ " # xmm6 = r1 i2 r3 i4\n\t" ++++ " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" ++++ " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t" ++++ " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) to memory\n\t" ++++ : ++++ :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result) ++++ :"rax", "r8", "r9", "r10" ++++ ); ++++ ++++ ++++ if(isodd) { ++++ *result += input[num_points - 1] * taps[num_points - 1]; ++++ } ++++ ++++ return; ++++ ++++} ++++ ++++#endif ++++ ++++#if LV_HAVE_SSE && LV_HAVE_32 ++++ ++++static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_a_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { ++++ ++++ volk_gnsssdr_32fc_x2_dot_prod_32fc_a_generic(result, input, taps, num_points); ++++ ++++#if 0 ++++ const unsigned int num_bytes = num_points*8; ++++ unsigned int isodd = num_points & 1; ++++ ++++ asm volatile ++++ ( ++++ " #pushl %%ebp\n\t" ++++ " #movl %%esp, %%ebp\n\t" ++++ " movl 12(%%ebp), %%eax # input\n\t" ++++ " movl 16(%%ebp), %%edx # taps\n\t" ++++ " movl 20(%%ebp), %%ecx # n_bytes\n\t" ++++ " xorps %%xmm6, %%xmm6 # zero accumulators\n\t" ++++ " movaps 0(%%eax), %%xmm0\n\t" ++++ " xorps %%xmm7, %%xmm7 # zero accumulators\n\t" ++++ " movaps 0(%%edx), %%xmm2\n\t" 
++++ " shrl $5, %%ecx # ecx = n_2_ccomplex_blocks / 2\n\t" ++++ " jmp .%=L1_test\n\t" ++++ " # 4 taps / loop\n\t" ++++ " # something like ?? cycles / loop\n\t" ++++ ".%=Loop1: \n\t" ++++ "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t" ++++ "# movaps (%%eax), %%xmmA\n\t" ++++ "# movaps (%%edx), %%xmmB\n\t" ++++ "# movaps %%xmmA, %%xmmZ\n\t" ++++ "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t" ++++ "# mulps %%xmmB, %%xmmA\n\t" ++++ "# mulps %%xmmZ, %%xmmB\n\t" ++++ "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t" ++++ "# xorps %%xmmPN, %%xmmA\n\t" ++++ "# movaps %%xmmA, %%xmmZ\n\t" ++++ "# unpcklps %%xmmB, %%xmmA\n\t" ++++ "# unpckhps %%xmmB, %%xmmZ\n\t" ++++ "# movaps %%xmmZ, %%xmmY\n\t" ++++ "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t" ++++ "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t" ++++ "# addps %%xmmZ, %%xmmA\n\t" ++++ "# addps %%xmmA, %%xmmC\n\t" ++++ "# A=xmm0, B=xmm2, Z=xmm4\n\t" ++++ "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t" ++++ " movaps 16(%%eax), %%xmm1\n\t" ++++ " movaps %%xmm0, %%xmm4\n\t" ++++ " mulps %%xmm2, %%xmm0\n\t" ++++ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" ++++ " movaps 16(%%edx), %%xmm3\n\t" ++++ " movaps %%xmm1, %%xmm5\n\t" ++++ " addps %%xmm0, %%xmm6\n\t" ++++ " mulps %%xmm3, %%xmm1\n\t" ++++ " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t" ++++ " addps %%xmm1, %%xmm6\n\t" ++++ " mulps %%xmm4, %%xmm2\n\t" ++++ " movaps 32(%%eax), %%xmm0\n\t" ++++ " addps %%xmm2, %%xmm7\n\t" ++++ " mulps %%xmm5, %%xmm3\n\t" ++++ " addl $32, %%eax\n\t" ++++ " movaps 32(%%edx), %%xmm2\n\t" ++++ " addps %%xmm3, %%xmm7\n\t" ++++ " addl $32, %%edx\n\t" ++++ ".%=L1_test:\n\t" ++++ " decl %%ecx\n\t" ++++ " jge .%=Loop1\n\t" ++++ " # We've handled the bulk of multiplies up to here.\n\t" ++++ " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t" ++++ " # If so, we've got 2 more taps to do.\n\t" ++++ " movl 20(%%ebp), %%ecx # n_2_ccomplex_blocks\n\t" ++++ " shrl $4, %%ecx\n\t" ++++ " andl $1, 
%%ecx\n\t" ++++ " je .%=Leven\n\t" ++++ " # The count was odd, do 2 more taps.\n\t" ++++ " # Note that we've already got mm0/mm2 preloaded\n\t" ++++ " # from the main loop.\n\t" ++++ " movaps %%xmm0, %%xmm4\n\t" ++++ " mulps %%xmm2, %%xmm0\n\t" ++++ " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t" ++++ " addps %%xmm0, %%xmm6\n\t" ++++ " mulps %%xmm4, %%xmm2\n\t" ++++ " addps %%xmm2, %%xmm7\n\t" ++++ ".%=Leven:\n\t" ++++ " # neg inversor\n\t" ++++ " movl 8(%%ebp), %%eax \n\t" ++++ " xorps %%xmm1, %%xmm1\n\t" ++++ " movl $0x80000000, (%%eax)\n\t" ++++ " movss (%%eax), %%xmm1\n\t" ++++ " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t" ++++ " # pfpnacc\n\t" ++++ " xorps %%xmm1, %%xmm6\n\t" ++++ " movaps %%xmm6, %%xmm2\n\t" ++++ " unpcklps %%xmm7, %%xmm6\n\t" ++++ " unpckhps %%xmm7, %%xmm2\n\t" ++++ " movaps %%xmm2, %%xmm3\n\t" ++++ " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t" ++++ " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t" ++++ " addps %%xmm2, %%xmm6\n\t" ++++ " # xmm6 = r1 i2 r3 i4\n\t" ++++ " #movl 8(%%ebp), %%eax # @result\n\t" ++++ " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t" ++++ " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? 
??\n\t" ++++ " movlps %%xmm6, (%%eax) # store low 2x32 bits (complex) to memory\n\t" ++++ " #popl %%ebp\n\t" ++++ : ++++ : ++++ : "eax", "ecx", "edx" ++++ ); ++++ ++++ ++++ int getem = num_bytes % 16; ++++ ++++ if(isodd) { ++++ *result += (input[num_points - 1] * taps[num_points - 1]); ++++ } ++++ ++++ return; ++++#endif ++++} ++++ ++++#endif /*LV_HAVE_SSE*/ ++++ ++++#ifdef LV_HAVE_SSE3 ++++ ++++#include ++++ ++++static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_a_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { ++++ ++++ const unsigned int num_bytes = num_points*8; ++++ unsigned int isodd = num_points & 1; ++++ ++++ lv_32fc_t dotProduct; ++++ memset(&dotProduct, 0x0, 2*sizeof(float)); ++++ ++++ unsigned int number = 0; ++++ const unsigned int halfPoints = num_bytes >> 4; ++++ ++++ __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal; ++++ ++++ const lv_32fc_t* a = input; ++++ const lv_32fc_t* b = taps; ++++ ++++ dotProdVal = _mm_setzero_ps(); ++++ ++++ for(;number < halfPoints; number++){ ++++ ++++ x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi ++++ y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di ++++ ++++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr ++++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di ++++ ++++ tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ++++ ++++ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br ++++ ++++ tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++++ ++++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++++ ++++ dotProdVal = _mm_add_ps(dotProdVal, z); // Add the complex multiplication results together ++++ ++++ a += 2; ++++ b += 2; ++++ } ++++ ++++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2]; ++++ ++++ _mm_store_ps((float*)dotProductVector,dotProdVal); // Store the results back into the dot product vector ++++ ++++ dotProduct += ( 
dotProductVector[0] + dotProductVector[1] ); ++++ ++++ if(isodd) { ++++ dotProduct += input[num_points - 1] * taps[num_points - 1]; ++++ } ++++ ++++ *result = dotProduct; ++++} ++++ ++++#endif /*LV_HAVE_SSE3*/ ++++ ++++#ifdef LV_HAVE_SSE4_1 ++++ ++++#include ++++ ++++static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) { ++++ ++++ unsigned int i = 0; ++++ const unsigned int qtr_points = num_points/4; ++++ const unsigned int isodd = num_points & 3; ++++ ++++ __m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, real0, real1, im0, im1; ++++ float *p_input, *p_taps; ++++ __m64 *p_result; ++++ ++++ static const __m128i neg = {0x000000000000000080000000}; ++++ ++++ p_result = (__m64*)result; ++++ p_input = (float*)input; ++++ p_taps = (float*)taps; ++++ ++++ real0 = _mm_setzero_ps(); ++++ real1 = _mm_setzero_ps(); ++++ im0 = _mm_setzero_ps(); ++++ im1 = _mm_setzero_ps(); ++++ ++++ for(; i < qtr_points; ++i) { ++++ xmm0 = _mm_load_ps(p_input); ++++ xmm1 = _mm_load_ps(p_taps); ++++ ++++ p_input += 4; ++++ p_taps += 4; ++++ ++++ xmm2 = _mm_load_ps(p_input); ++++ xmm3 = _mm_load_ps(p_taps); ++++ ++++ p_input += 4; ++++ p_taps += 4; ++++ ++++ xmm4 = _mm_unpackhi_ps(xmm0, xmm2); ++++ xmm5 = _mm_unpackhi_ps(xmm1, xmm3); ++++ xmm0 = _mm_unpacklo_ps(xmm0, xmm2); ++++ xmm2 = _mm_unpacklo_ps(xmm1, xmm3); ++++ ++++ //imaginary vector from input ++++ xmm1 = _mm_unpackhi_ps(xmm0, xmm4); ++++ //real vector from input ++++ xmm3 = _mm_unpacklo_ps(xmm0, xmm4); ++++ //imaginary vector from taps ++++ xmm0 = _mm_unpackhi_ps(xmm2, xmm5); ++++ //real vector from taps ++++ xmm2 = _mm_unpacklo_ps(xmm2, xmm5); ++++ ++++ xmm4 = _mm_dp_ps(xmm3, xmm2, 0xf1); ++++ xmm5 = _mm_dp_ps(xmm1, xmm0, 0xf1); ++++ ++++ xmm6 = _mm_dp_ps(xmm3, xmm0, 0xf2); ++++ xmm7 = _mm_dp_ps(xmm1, xmm2, 0xf2); ++++ ++++ real0 = _mm_add_ps(xmm4, real0); ++++ real1 = _mm_add_ps(xmm5, real1); ++++ im0 = _mm_add_ps(xmm6, im0); 
++++ im1 = _mm_add_ps(xmm7, im1); ++++ } ++++ ++++ real1 = _mm_xor_ps(real1, bit128_p(&neg)->float_vec); ++++ ++++ im0 = _mm_add_ps(im0, im1); ++++ real0 = _mm_add_ps(real0, real1); ++++ ++++ im0 = _mm_add_ps(im0, real0); ++++ ++++ _mm_storel_pi(p_result, im0); ++++ ++++ for(i = num_points-isodd; i < num_points; i++) { ++++ *result += input[i] * taps[i]; ++++ } ++++} ++++ ++++#endif /*LV_HAVE_SSE4_1*/ ++++ ++++#endif /*INCLUDED_volk_gnsssdr_32fc_x2_dot_prod_32fc_a_H*/ +++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_multiply_32fc.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_multiply_32fc.h +++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_multiply_32fc.h 1970-01-01 01:00:00.000000000 +0100 ++++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_multiply_32fc.h 2014-10-15 01:55:08.000000000 +0200 +++@@ -0,0 +1,170 @@ ++++#ifndef INCLUDED_volk_gnsssdr_32fc_x2_multiply_32fc_u_H ++++#define INCLUDED_volk_gnsssdr_32fc_x2_multiply_32fc_u_H ++++ ++++#include ++++#include ++++#include ++++#include ++++ ++++#ifdef LV_HAVE_SSE3 ++++#include ++++ /*! 
++++ \brief Multiplies the two input complex vectors and stores their results in the third vector ++++ \param cVector The vector where the results will be stored ++++ \param aVector One of the vectors to be multiplied ++++ \param bVector One of the vectors to be multiplied ++++ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector ++++ */ ++++static inline void volk_gnsssdr_32fc_x2_multiply_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ ++++ unsigned int number = 0; ++++ const unsigned int halfPoints = num_points / 2; ++++ ++++ __m128 x, y, yl, yh, z, tmp1, tmp2; ++++ lv_32fc_t* c = cVector; ++++ const lv_32fc_t* a = aVector; ++++ const lv_32fc_t* b = bVector; ++++ ++++ for(;number < halfPoints; number++){ ++++ ++++ x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi ++++ y = _mm_loadu_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di ++++ ++++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr ++++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di ++++ ++++ tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ++++ ++++ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br ++++ ++++ tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++++ ++++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++++ ++++ _mm_storeu_ps((float*)c,z); // Store the results back into the C container ++++ ++++ a += 2; ++++ b += 2; ++++ c += 2; ++++ } ++++ ++++ if((num_points % 2) != 0) { ++++ *c = (*a) * (*b); ++++ } ++++} ++++#endif /* LV_HAVE_SSE */ ++++ ++++#ifdef LV_HAVE_GENERIC ++++ /*! 
++++ \brief Multiplies the two input complex vectors and stores their results in the third vector ++++ \param cVector The vector where the results will be stored ++++ \param aVector One of the vectors to be multiplied ++++ \param bVector One of the vectors to be multiplied ++++ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector ++++ */ ++++static inline void volk_gnsssdr_32fc_x2_multiply_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ ++++ lv_32fc_t* cPtr = cVector; ++++ const lv_32fc_t* aPtr = aVector; ++++ const lv_32fc_t* bPtr= bVector; ++++ unsigned int number = 0; ++++ ++++ for(number = 0; number < num_points; number++){ ++++ *cPtr++ = (*aPtr++) * (*bPtr++); ++++ } ++++} ++++#endif /* LV_HAVE_GENERIC */ ++++ ++++ ++++#endif /* INCLUDED_volk_gnsssdr_32fc_x2_multiply_32fc_u_H */ ++++#ifndef INCLUDED_volk_gnsssdr_32fc_x2_multiply_32fc_a_H ++++#define INCLUDED_volk_gnsssdr_32fc_x2_multiply_32fc_a_H ++++ ++++#include ++++#include ++++#include ++++#include ++++ ++++#ifdef LV_HAVE_SSE3 ++++#include ++++ /*! 
++++ \brief Multiplies the two input complex vectors and stores their results in the third vector ++++ \param cVector The vector where the results will be stored ++++ \param aVector One of the vectors to be multiplied ++++ \param bVector One of the vectors to be multiplied ++++ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector ++++ */ ++++static inline void volk_gnsssdr_32fc_x2_multiply_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ ++++ unsigned int number = 0; ++++ const unsigned int halfPoints = num_points / 2; ++++ ++++ __m128 x, y, yl, yh, z, tmp1, tmp2; ++++ lv_32fc_t* c = cVector; ++++ const lv_32fc_t* a = aVector; ++++ const lv_32fc_t* b = bVector; ++++ for(;number < halfPoints; number++){ ++++ ++++ x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi ++++ y = _mm_load_ps((float*)b); // Load the cr + ci, dr + di as cr,ci,dr,di ++++ ++++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr ++++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di ++++ ++++ tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ++++ ++++ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br ++++ ++++ tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++++ ++++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++++ ++++ _mm_store_ps((float*)c,z); // Store the results back into the C container ++++ ++++ a += 2; ++++ b += 2; ++++ c += 2; ++++ } ++++ ++++ if((num_points % 2) != 0) { ++++ *c = (*a) * (*b); ++++ } ++++} ++++#endif /* LV_HAVE_SSE */ ++++ ++++#ifdef LV_HAVE_GENERIC ++++ /*! 
++++ \brief Multiplies the two input complex vectors and stores their results in the third vector ++++ \param cVector The vector where the results will be stored ++++ \param aVector One of the vectors to be multiplied ++++ \param bVector One of the vectors to be multiplied ++++ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector ++++ */ ++++static inline void volk_gnsssdr_32fc_x2_multiply_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ ++++ lv_32fc_t* cPtr = cVector; ++++ const lv_32fc_t* aPtr = aVector; ++++ const lv_32fc_t* bPtr= bVector; ++++ unsigned int number = 0; ++++ ++++ for(number = 0; number < num_points; number++){ ++++ *cPtr++ = (*aPtr++) * (*bPtr++); ++++ } ++++} ++++#endif /* LV_HAVE_GENERIC */ ++++ ++++#ifdef LV_HAVE_ORC ++++ /*! ++++ \brief Multiplies the two input complex vectors and stores their results in the third vector ++++ \param cVector The vector where the results will be stored ++++ \param aVector One of the vectors to be multiplied ++++ \param bVector One of the vectors to be multiplied ++++ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector ++++ */ ++++extern void volk_gnsssdr_32fc_x2_multiply_32fc_a_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points); ++++static inline void volk_gnsssdr_32fc_x2_multiply_32fc_u_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){ ++++ volk_gnsssdr_32fc_x2_multiply_32fc_a_orc_impl(cVector, aVector, bVector, num_points); ++++} ++++#endif /* LV_HAVE_ORC */ ++++ ++++ ++++ ++++ ++++ ++++#endif /* INCLUDED_volk_gnsssdr_32fc_x2_multiply_32fc_a_H */ +++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3.h 
/Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3.h +++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3.h 1970-01-01 01:00:00.000000000 +0100 ++++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3.h 2014-10-15 01:55:08.000000000 +0200 +++@@ -0,0 +1,409 @@ ++++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_u_H ++++#define INCLUDED_gnsssdr_volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_u_H ++++ ++++#include ++++#include ++++#include ++++#include ++++#include ++++ ++++/*! ++++ * TODO: Code the SSE4 version and benchmark it ++++ */ ++++#ifdef LV_HAVE_SSE3 ++++#include ++++ ++++ ++++ /*! ++++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation ++++ \param input The input signal input ++++ \param carrier The carrier signal input ++++ \param E_code Early PRN code replica input ++++ \param P_code Early PRN code replica input ++++ \param L_code Early PRN code replica input ++++ \param E_out Early correlation output ++++ \param P_out Early correlation output ++++ \param L_out Early correlation output ++++ \param num_points The number of complex values in vectors ++++ */ ++++static inline void volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_u_sse3(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, unsigned int num_points) ++++{ ++++ unsigned int number = 0; ++++ const unsigned int halfPoints = num_points / 2; ++++ ++++ lv_32fc_t dotProduct_E; ++++ memset(&dotProduct_E, 0x0, 2*sizeof(float)); ++++ lv_32fc_t dotProduct_P; ++++ memset(&dotProduct_P, 0x0, 2*sizeof(float)); ++++ lv_32fc_t dotProduct_L; ++++ memset(&dotProduct_L, 0x0, 2*sizeof(float)); ++++ ++++ // Aux vars ++++ __m128 x, y, yl, yh, z, tmp1, tmp2, z_E, z_P, z_L; ++++ ++++ z_E = 
_mm_setzero_ps(); ++++ z_P = _mm_setzero_ps(); ++++ z_L = _mm_setzero_ps(); ++++ ++++ //input and output vectors ++++ //lv_32fc_t* _input_BB = input_BB; ++++ const lv_32fc_t* _input = input; ++++ const lv_32fc_t* _carrier = carrier; ++++ const lv_32fc_t* _E_code = E_code; ++++ const lv_32fc_t* _P_code = P_code; ++++ const lv_32fc_t* _L_code = L_code; ++++ ++++ for(;number < halfPoints; number++) ++++ { ++++ // carrier wipe-off (vector point-to-point product) ++++ x = _mm_loadu_ps((float*)_input); // Load the ar + ai, br + bi as ar,ai,br,bi ++++ y = _mm_loadu_ps((float*)_carrier); // Load the cr + ci, dr + di as cr,ci,dr,di ++++ ++++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr ++++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di ++++ ++++ tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ++++ ++++ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br ++++ ++++ tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++++ ++++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++++ ++++ //_mm_storeu_ps((float*)_input_BB,z); // Store the results back into the _input_BB container ++++ ++++ // correlation E,P,L (3x vector scalar product) ++++ // Early ++++ //x = _mm_load_ps((float*)_input_BB); // Load the ar + ai, br + bi as ar,ai,br,bi ++++ x = z; ++++ ++++ y = _mm_load_ps((float*)_E_code); // Load the cr + ci, dr + di as cr,ci,dr,di ++++ ++++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr ++++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di ++++ ++++ tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ++++ ++++ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br ++++ ++++ tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++++ ++++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++++ ++++ z_E = _mm_add_ps(z_E, z); // Add the complex multiplication results together ++++ ++++ // Prompt ++++ //x = _mm_load_ps((float*)_input_BB); // 
Load the ar + ai, br + bi as ar,ai,br,bi ++++ y = _mm_load_ps((float*)_P_code); // Load the cr + ci, dr + di as cr,ci,dr,di ++++ ++++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr ++++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di ++++ ++++ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br ++++ ++++ tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ++++ ++++ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br ++++ ++++ tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++++ ++++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++++ ++++ z_P = _mm_add_ps(z_P, z); // Add the complex multiplication results together ++++ ++++ // Late ++++ //x = _mm_load_ps((float*)_input_BB); // Load the ar + ai, br + bi as ar,ai,br,bi ++++ y = _mm_load_ps((float*)_L_code); // Load the cr + ci, dr + di as cr,ci,dr,di ++++ ++++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr ++++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di ++++ ++++ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br ++++ ++++ tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ++++ ++++ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br ++++ ++++ tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++++ ++++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++++ ++++ z_L = _mm_add_ps(z_L, z); // Add the complex multiplication results together ++++ ++++ /*pointer increment*/ ++++ _carrier += 2; ++++ _input += 2; ++++ //_input_BB += 2; ++++ _E_code += 2; ++++ _P_code += 2; ++++ _L_code +=2; ++++ } ++++ ++++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_E[2]; ++++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_P[2]; ++++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_L[2]; ++++ //__VOLK_ATTR_ALIGNED(16) lv_32fc_t _input_BB; ++++ ++++ _mm_store_ps((float*)dotProductVector_E,z_E); // Store the results back into the dot product vector 
++++ _mm_store_ps((float*)dotProductVector_P,z_P); // Store the results back into the dot product vector ++++ _mm_store_ps((float*)dotProductVector_L,z_L); // Store the results back into the dot product vector ++++ ++++ dotProduct_E += ( dotProductVector_E[0] + dotProductVector_E[1] ); ++++ dotProduct_P += ( dotProductVector_P[0] + dotProductVector_P[1] ); ++++ dotProduct_L += ( dotProductVector_L[0] + dotProductVector_L[1] ); ++++ ++++ if((num_points % 2) != 0) ++++ { ++++ //_input_BB = (*_input) * (*_carrier); ++++ dotProduct_E += (*_input) * (*_E_code)*(*_carrier); ++++ dotProduct_P += (*_input) * (*_P_code)*(*_carrier); ++++ dotProduct_L += (*_input) * (*_L_code)*(*_carrier); ++++ } ++++ ++++ *E_out = dotProduct_E; ++++ *P_out = dotProduct_P; ++++ *L_out = dotProduct_L; ++++} ++++ ++++#endif /* LV_HAVE_SSE3 */ ++++ ++++#ifdef LV_HAVE_GENERIC ++++/*! ++++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (plain C loop, no SIMD intrinsics) ++++ \param input The input signal input ++++ \param carrier The carrier signal input ++++ \param E_code Early PRN code replica input ++++ \param P_code Prompt PRN code replica input ++++ \param L_code Late PRN code replica input ++++ \param E_out Early correlation output (zeroed, then accumulated over num_points samples) ++++ \param P_out Prompt correlation output (zeroed, then accumulated over num_points samples) ++++ \param L_out Late correlation output (zeroed, then accumulated over num_points samples) ++++ \param num_points The number of complex values in vectors ++++ */ ++++static inline void volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_generic(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, unsigned int num_points) ++++{ ++++ lv_32fc_t bb_signal_sample; ++++ ++++ bb_signal_sample = lv_cmake(0, 0); ++++ ++++ *E_out = 0; ++++ *P_out = 0; ++++ *L_out = 0; ++++ // perform Early, Prompt and Late correlation ++++ for(int i=0; i < num_points; ++i) ++++ { ++++ //Perform the carrier wipe-off ++++ bb_signal_sample = input[i] * carrier[i]; ++++ // Now
get early, late, and prompt values for each ++++ *E_out += bb_signal_sample * E_code[i]; ++++ *P_out += bb_signal_sample * P_code[i]; ++++ *L_out += bb_signal_sample * L_code[i]; ++++ } ++++} ++++ ++++#endif /* LV_HAVE_GENERIC */ ++++ ++++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_u_H */ ++++ ++++ ++++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_a_H ++++#define INCLUDED_gnsssdr_volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_a_H ++++ ++++#include ++++#include ++++#include ++++#include ++++#include ++++ ++++#ifdef LV_HAVE_SSE3 ++++#include ++++/*! ++++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (SSE3; every input array is read with _mm_load_ps and must therefore be 16-byte aligned) ++++ \param input The input signal input ++++ \param carrier The carrier signal input ++++ \param E_code Early PRN code replica input ++++ \param P_code Prompt PRN code replica input ++++ \param L_code Late PRN code replica input ++++ \param E_out Early correlation output ++++ \param P_out Prompt correlation output ++++ \param L_out Late correlation output ++++ \param num_points The number of complex values in vectors ++++ */ ++++static inline void volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_a_sse3(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, unsigned int num_points) ++++{ ++++ unsigned int number = 0; ++++ const unsigned int halfPoints = num_points / 2; ++++ ++++ lv_32fc_t dotProduct_E; ++++ memset(&dotProduct_E, 0x0, 2*sizeof(float)); ++++ lv_32fc_t dotProduct_P; ++++ memset(&dotProduct_P, 0x0, 2*sizeof(float)); ++++ lv_32fc_t dotProduct_L; ++++ memset(&dotProduct_L, 0x0, 2*sizeof(float)); ++++ ++++ // Aux vars ++++ __m128 x, y, yl, yh, z, tmp1, tmp2, z_E, z_P, z_L; ++++ ++++ z_E = _mm_setzero_ps(); ++++ z_P = _mm_setzero_ps(); ++++ z_L = _mm_setzero_ps(); ++++ ++++ //input and output vectors ++++ //lv_32fc_t* _input_BB = input_BB; ++++ const lv_32fc_t* _input =
input; ++++ const lv_32fc_t* _carrier = carrier; ++++ const lv_32fc_t* _E_code = E_code; ++++ const lv_32fc_t* _P_code = P_code; ++++ const lv_32fc_t* _L_code = L_code; ++++ ++++ for(;number < halfPoints; number++) ++++ { ++++ // carrier wipe-off (vector point-to-point product) ++++ x = _mm_load_ps((float*)_input); // Load the ar + ai, br + bi as ar,ai,br,bi ++++ y = _mm_load_ps((float*)_carrier); // Load the cr + ci, dr + di as cr,ci,dr,di ++++ ++++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr ++++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di ++++ ++++ tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ++++ ++++ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br ++++ ++++ tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++++ ++++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++++ ++++ //_mm_storeu_ps((float*)_input_BB,z); // Store the results back into the _input_BB container ++++ ++++ // correlation E,P,L (3x vector scalar product) ++++ // Early ++++ //x = _mm_load_ps((float*)_input_BB); // Load the ar + ai, br + bi as ar,ai,br,bi ++++ x = z; // Reuse the wiped-off (baseband) samples as the left operand of all three correlations ++++ ++++ y = _mm_load_ps((float*)_E_code); // Load the cr + ci, dr + di as cr,ci,dr,di ++++ ++++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr ++++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di ++++ ++++ tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ++++ ++++ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br ++++ ++++ tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++++ ++++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++++ ++++ z_E = _mm_add_ps(z_E, z); // Add the complex multiplication results together ++++ ++++ // Prompt ++++ //x = _mm_load_ps((float*)_input_BB); // Load the ar + ai, br + bi as ar,ai,br,bi ++++ y = _mm_load_ps((float*)_P_code); // Load the cr + ci, dr + di as cr,ci,dr,di ++++ ++++ yl = _mm_moveldup_ps(y); // Load yl with
cr,cr,dr,dr ++++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di ++++ ++++ x = _mm_shuffle_ps(x,x,0xB1); // Swap back: x was left as ai,ar,bi,br by the Early step, so it is ar,ai,br,bi again ++++ ++++ tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ++++ ++++ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br ++++ ++++ tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++++ ++++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++++ ++++ z_P = _mm_add_ps(z_P, z); // Add the complex multiplication results together ++++ ++++ // Late ++++ //x = _mm_load_ps((float*)_input_BB); // Load the ar + ai, br + bi as ar,ai,br,bi ++++ y = _mm_load_ps((float*)_L_code); // Load the cr + ci, dr + di as cr,ci,dr,di ++++ ++++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr ++++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di ++++ ++++ x = _mm_shuffle_ps(x,x,0xB1); // Swap back: x was left as ai,ar,bi,br by the Prompt step, so it is ar,ai,br,bi again ++++ ++++ tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ++++ ++++ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br ++++ ++++ tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++++ ++++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++++ ++++ z_L = _mm_add_ps(z_L, z); // Add the complex multiplication results together ++++ ++++ /*pointer increment*/ ++++ _carrier += 2; ++++ _input += 2; ++++ //_input_BB += 2; ++++ _E_code += 2; ++++ _P_code += 2; ++++ _L_code +=2; ++++ } ++++ ++++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_E[2]; ++++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_P[2]; ++++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_L[2]; ++++ //__VOLK_ATTR_ALIGNED(16) lv_32fc_t _input_BB; ++++ ++++ _mm_store_ps((float*)dotProductVector_E,z_E); // Store the results back into the dot product vector ++++ _mm_store_ps((float*)dotProductVector_P,z_P); // Store the results back into the dot product vector ++++ _mm_store_ps((float*)dotProductVector_L,z_L); // Store the results
back into the dot product vector ++++ ++++ dotProduct_E += ( dotProductVector_E[0] + dotProductVector_E[1] ); ++++ dotProduct_P += ( dotProductVector_P[0] + dotProductVector_P[1] ); ++++ dotProduct_L += ( dotProductVector_L[0] + dotProductVector_L[1] ); ++++ ++++ if((num_points % 2) != 0) ++++ { ++++ //_input_BB = (*_input) * (*_carrier); ++++ dotProduct_E += (*_input) * (*_E_code)*(*_carrier); ++++ dotProduct_P += (*_input) * (*_P_code)*(*_carrier); ++++ dotProduct_L += (*_input) * (*_L_code)*(*_carrier); ++++ } ++++ ++++ *E_out = dotProduct_E; ++++ *P_out = dotProduct_P; ++++ *L_out = dotProduct_L; ++++} ++++ ++++#endif /* LV_HAVE_SSE3 */ ++++ ++++#ifdef LV_HAVE_GENERIC ++++/*! ++++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (plain C loop, no SIMD intrinsics) ++++ \param input The input signal input ++++ \param carrier The carrier signal input ++++ \param E_code Early PRN code replica input ++++ \param P_code Prompt PRN code replica input ++++ \param L_code Late PRN code replica input ++++ \param E_out Early correlation output (zeroed, then accumulated over num_points samples) ++++ \param P_out Prompt correlation output (zeroed, then accumulated over num_points samples) ++++ \param L_out Late correlation output (zeroed, then accumulated over num_points samples) ++++ \param num_points The number of complex values in vectors ++++ */ ++++static inline void volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_a_generic(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, unsigned int num_points) ++++{ ++++ lv_32fc_t bb_signal_sample; ++++ ++++ bb_signal_sample = lv_cmake(0, 0); ++++ ++++ *E_out = 0; ++++ *P_out = 0; ++++ *L_out = 0; ++++ // perform Early, Prompt and Late correlation ++++ for(int i=0; i < num_points; ++i) ++++ { ++++ //Perform the carrier wipe-off ++++ bb_signal_sample = input[i] * carrier[i]; ++++ // Now get early, late, and prompt values for each ++++ *E_out += bb_signal_sample * E_code[i]; ++++ *P_out += bb_signal_sample * P_code[i]; ++++ *L_out += bb_signal_sample *
L_code[i]; ++++ } ++++} ++++ ++++#endif /* LV_HAVE_GENERIC */ ++++ ++++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_a_H */ +++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5.h +++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5.h 1970-01-01 01:00:00.000000000 +0100 ++++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5.h 2014-10-15 01:55:08.000000000 +0200 +++@@ -0,0 +1,848 @@ ++++/*! ++++ * \file volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5 ++++ * \brief Volk protokernel: performs the carrier wipe-off mixing and the VE, Early, Prompt, Late and VL correlation with 64 bits vectors ++++ * \authors
<ul> ++++ *
<li> Javier Arribas, 2011. jarribas(at)cttc.es ++++ *
<li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com ++++ * </ul>
++++ * ++++ * Volk protokernel that performs the carrier wipe-off mixing and the ++++ * VE, Early, Prompt, Late and VL correlation with 64 bits vectors (32 bits the ++++ * real part and 32 bits the imaginary part): ++++ * - The carrier wipe-off is done by multiplying the input signal by the ++++ * carrier (multiplication of 64 bits vectors). It returns the input ++++ * signal in base band (BB) ++++ * - VE values are calculated by multiplying the input signal in BB by the ++++ * VE code (multiplication of 64 bits vectors), accumulating the results ++++ * - Early values are calculated by multiplying the input signal in BB by the ++++ * early code (multiplication of 64 bits vectors), accumulating the results ++++ * - Prompt values are calculated by multiplying the input signal in BB by the ++++ * prompt code (multiplication of 64 bits vectors), accumulating the results ++++ * - Late values are calculated by multiplying the input signal in BB by the ++++ * late code (multiplication of 64 bits vectors), accumulating the results ++++ * - VL values are calculated by multiplying the input signal in BB by the ++++ * VL code (multiplication of 64 bits vectors), accumulating the results ++++ * ++++ * ------------------------------------------------------------------------- ++++ * ++++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) ++++ * ++++ * GNSS-SDR is a software defined Global Navigation ++++ * Satellite Systems receiver ++++ * ++++ * This file is part of GNSS-SDR. ++++ * ++++ * GNSS-SDR is free software: you can redistribute it and/or modify ++++ * it under the terms of the GNU General Public License as published by ++++ * the Free Software Foundation, either version 3 of the License, or ++++ * (at your option) any later version. ++++ * ++++ * GNSS-SDR is distributed in the hope that it will be useful, ++++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the ++++ * GNU General Public License for more details. ++++ * ++++ * You should have received a copy of the GNU General Public License ++++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>. ++++ * ++++ * ------------------------------------------------------------------------- ++++ */ ++++ ++++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_u_H ++++#define INCLUDED_gnsssdr_volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_u_H ++++ ++++#include ++++#include ++++#include ++++#include ++++#include ++++ ++++#ifdef LV_HAVE_AVX ++++#include ++++/*! ++++ \brief Performs the carrier wipe-off mixing and the VE, Early, Prompt, Late and VL correlation (AVX, unaligned loads, 4 complex samples per loop iteration) ++++ \param input The input signal input ++++ \param carrier The carrier signal input ++++ \param VE_code VE PRN code replica input ++++ \param E_code Early PRN code replica input ++++ \param P_code Prompt PRN code replica input ++++ \param L_code Late PRN code replica input ++++ \param VL_code VL PRN code replica input ++++ \param VE_out VE correlation output ++++ \param E_out Early correlation output ++++ \param P_out Prompt correlation output ++++ \param L_out Late correlation output ++++ \param VL_out VL correlation output ++++ \param num_points The number of complex values in vectors ++++ */ ++++static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_u_avx(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* VE_code, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, const lv_32fc_t* VL_code, unsigned int num_points) ++++{ ++++ unsigned int number = 0; ++++ const unsigned int halfPoints = num_points / 4; // despite the name: number of 4-sample (256-bit) iterations ++++ ++++ lv_32fc_t dotProduct_VE; ++++ lv_32fc_t dotProduct_E; ++++ lv_32fc_t dotProduct_P; ++++ lv_32fc_t dotProduct_L; ++++ lv_32fc_t dotProduct_VL; ++++ ++++ // Aux vars ++++ __m256 x, y, yl, yh, z, tmp1, tmp2, z_VE, z_E, z_P, z_L, z_VL; ++++ __m256 bb_signal_sample,
bb_signal_sample_shuffled; ++++ ++++ z_VE = _mm256_setzero_ps(); ++++ z_E = _mm256_setzero_ps(); ++++ z_P = _mm256_setzero_ps(); ++++ z_L = _mm256_setzero_ps(); ++++ z_VL = _mm256_setzero_ps(); ++++ ++++ //input and output vectors ++++ const lv_32fc_t* _input = input; ++++ const lv_32fc_t* _carrier = carrier; ++++ const lv_32fc_t* _VE_code = VE_code; ++++ const lv_32fc_t* _E_code = E_code; ++++ const lv_32fc_t* _P_code = P_code; ++++ const lv_32fc_t* _L_code = L_code; ++++ const lv_32fc_t* _VL_code = VL_code; ++++ ++++ for(;number < halfPoints; number++) ++++ { ++++ // carrier wipe-off (vector point-to-point product) ++++ x = _mm256_loadu_ps((float*)_input); // Load the ar + ai, br + bi as ar,ai,br,bi ++++ y = _mm256_loadu_ps((float*)_carrier); // Load the cr + ci, dr + di as cr,ci,dr,di ++++ ++++ yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr ++++ yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di ++++ ++++ tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ++++ ++++ x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br ++++ ++++ tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++++ ++++ bb_signal_sample = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++++ bb_signal_sample_shuffled = _mm256_shuffle_ps(bb_signal_sample,bb_signal_sample,0xB1); // Re-arrange bb_signal_sample to be ai,ar,bi,br ++++ ++++ // correlation VE,E,P,L,VL (5x vector scalar product) ++++ // VE ++++ y = _mm256_loadu_ps((float*)_VE_code); // Load the cr + ci, dr + di as cr,ci,dr,di ++++ ++++ yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr ++++ yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di ++++ ++++ tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ++++ tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++++ ++++ z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++++ z_VE = _mm256_add_ps(z_VE, 
z); // Add the complex multiplication results together ++++ ++++ // Early ++++ y = _mm256_loadu_ps((float*)_E_code); // Load the cr + ci, dr + di as cr,ci,dr,di ++++ ++++ yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr ++++ yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di ++++ ++++ tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ++++ tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++++ ++++ z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++++ z_E = _mm256_add_ps(z_E, z); // Add the complex multiplication results together ++++ ++++ // Prompt ++++ y = _mm256_loadu_ps((float*)_P_code); // Load the cr + ci, dr + di as cr,ci,dr,di ++++ ++++ yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr ++++ yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di ++++ ++++ tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ++++ tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++++ ++++ z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++++ z_P = _mm256_add_ps(z_P, z); // Add the complex multiplication results together ++++ ++++ // Late ++++ y = _mm256_loadu_ps((float*)_L_code); // Load the cr + ci, dr + di as cr,ci,dr,di ++++ ++++ yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr ++++ yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di ++++ ++++ tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ++++ tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++++ ++++ z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++++ z_L = _mm256_add_ps(z_L, z); // Add the complex multiplication results together ++++ ++++ // VL ++++ y = _mm256_loadu_ps((float*)_VL_code); // Load the cr + ci, dr + di as cr,ci,dr,di ++++ ++++ yl = _mm256_moveldup_ps(y); // Load yl with 
cr,cr,dr,dr ++++ yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di ++++ ++++ tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ++++ tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++++ ++++ z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++++ z_VL = _mm256_add_ps(z_VL, z); // Add the complex multiplication results together ++++ ++++ /*pointer increment*/ ++++ _carrier += 4; ++++ _input += 4; ++++ _VE_code += 4; ++++ _E_code += 4; ++++ _P_code += 4; ++++ _L_code += 4; ++++ _VL_code += 4; ++++ } ++++ ++++ __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_VE[4]; ++++ __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_E[4]; ++++ __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_P[4]; ++++ __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_L[4]; ++++ __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_VL[4]; ++++ ++++ _mm256_storeu_ps((float*)dotProductVector_VE,z_VE); // Store the results back into the dot product vector ++++ _mm256_storeu_ps((float*)dotProductVector_E,z_E); // Store the results back into the dot product vector ++++ _mm256_storeu_ps((float*)dotProductVector_P,z_P); // Store the results back into the dot product vector ++++ _mm256_storeu_ps((float*)dotProductVector_L,z_L); // Store the results back into the dot product vector ++++ _mm256_storeu_ps((float*)dotProductVector_VL,z_VL); // Store the results back into the dot product vector ++++ ++++ dotProduct_VE = ( dotProductVector_VE[0] + dotProductVector_VE[1] + dotProductVector_VE[2] + dotProductVector_VE[3] ); ++++ dotProduct_E = ( dotProductVector_E[0] + dotProductVector_E[1] + dotProductVector_E[2] + dotProductVector_E[3] ); ++++ dotProduct_P = ( dotProductVector_P[0] + dotProductVector_P[1] + dotProductVector_P[2] + dotProductVector_P[3] ); ++++ dotProduct_L = ( dotProductVector_L[0] + dotProductVector_L[1] + dotProductVector_L[2] + dotProductVector_L[3] ); ++++ dotProduct_VL = ( 
dotProductVector_VL[0] + dotProductVector_VL[1] + dotProductVector_VL[2] + dotProductVector_VL[3] ); ++++ ++++ for (int i = 0; i<(num_points % 4); ++i) ++++ { ++++ dotProduct_VE += (*_input) * (*_VE_code++) * (*_carrier); ++++ dotProduct_E += (*_input) * (*_E_code++) * (*_carrier); ++++ dotProduct_P += (*_input) * (*_P_code++) * (*_carrier); ++++ dotProduct_L += (*_input) * (*_L_code++) * (*_carrier); ++++ dotProduct_VL += (*_input++) * (*_VL_code++) * (*_carrier++); ++++ } ++++ ++++ *VE_out = dotProduct_VE; ++++ *E_out = dotProduct_E; ++++ *P_out = dotProduct_P; ++++ *L_out = dotProduct_L; ++++ *VL_out = dotProduct_VL; ++++} ++++#endif /* LV_HAVE_AVX */ ++++ ++++#ifdef LV_HAVE_SSE3 ++++#include ++++ /*! ++++ \brief Performs the carrier wipe-off mixing and the VE, Early, Prompt, Late and VL correlation (SSE3, 2 complex samples per loop iteration) ++++ \param input The input signal input ++++ \param carrier The carrier signal input ++++ \param VE_code VE PRN code replica input ++++ \param E_code Early PRN code replica input ++++ \param P_code Prompt PRN code replica input ++++ \param L_code Late PRN code replica input ++++ \param VL_code VL PRN code replica input ++++ \param VE_out VE correlation output ++++ \param E_out Early correlation output ++++ \param P_out Prompt correlation output ++++ \param L_out Late correlation output ++++ \param VL_out VL correlation output ++++ \param num_points The number of complex values in vectors ++++ */ ++++static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_u_sse3(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* VE_code, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, const lv_32fc_t* VL_code, unsigned int num_points) ++++{ ++++ unsigned int number = 0; ++++ const unsigned int halfPoints = num_points / 2; ++++ ++++ lv_32fc_t dotProduct_VE; ++++ lv_32fc_t dotProduct_E; ++++ lv_32fc_t dotProduct_P; ++++ lv_32fc_t dotProduct_L; ++++
lv_32fc_t dotProduct_VL; ++++ ++++ // Aux vars ++++ __m128 x, y, yl, yh, z, tmp1, tmp2, z_VE, z_E, z_P, z_L, z_VL; ++++ __m128 bb_signal_sample, bb_signal_sample_shuffled; ++++ ++++ z_VE = _mm_setzero_ps(); ++++ z_E = _mm_setzero_ps(); ++++ z_P = _mm_setzero_ps(); ++++ z_L = _mm_setzero_ps(); ++++ z_VL = _mm_setzero_ps(); ++++ ++++ //input and output vectors ++++ const lv_32fc_t* _input = input; ++++ const lv_32fc_t* _carrier = carrier; ++++ const lv_32fc_t* _VE_code = VE_code; ++++ const lv_32fc_t* _E_code = E_code; ++++ const lv_32fc_t* _P_code = P_code; ++++ const lv_32fc_t* _L_code = L_code; ++++ const lv_32fc_t* _VL_code = VL_code; ++++ ++++ for(;number < halfPoints; number++) ++++ { ++++ // carrier wipe-off (vector point-to-point product) ++++ x = _mm_loadu_ps((float*)_input); // Load the ar + ai, br + bi as ar,ai,br,bi ++++ y = _mm_loadu_ps((float*)_carrier); // Load the cr + ci, dr + di as cr,ci,dr,di ++++ ++++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr ++++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di ++++ ++++ tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ++++ ++++ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br ++++ ++++ tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++++ ++++ bb_signal_sample = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++++ bb_signal_sample_shuffled = _mm_shuffle_ps(bb_signal_sample,bb_signal_sample,0xB1); // Re-arrange bb_signal_sample to be ai,ar,bi,br ++++ ++++ // correlation VE,E,P,L,VL (5x vector scalar product) ++++ // VE ++++ y = _mm_loadu_ps((float*)_VE_code); // Load the cr + ci, dr + di as cr,ci,dr,di ++++ ++++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr ++++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di ++++ ++++ tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ++++ tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++++ ++++ z = _mm_addsub_ps(tmp1,tmp2); // 
ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++++ z_VE = _mm_add_ps(z_VE, z); // Add the complex multiplication results together ++++ ++++ // Early ++++ y = _mm_loadu_ps((float*)_E_code); // Load the cr + ci, dr + di as cr,ci,dr,di ++++ ++++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr ++++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di ++++ ++++ tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ++++ tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++++ ++++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++++ z_E = _mm_add_ps(z_E, z); // Add the complex multiplication results together ++++ ++++ // Prompt ++++ y = _mm_loadu_ps((float*)_P_code); // Load the cr + ci, dr + di as cr,ci,dr,di ++++ ++++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr ++++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di ++++ ++++ tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ++++ tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++++ ++++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++++ z_P = _mm_add_ps(z_P, z); // Add the complex multiplication results together ++++ ++++ // Late ++++ y = _mm_loadu_ps((float*)_L_code); // Load the cr + ci, dr + di as cr,ci,dr,di ++++ ++++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr ++++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di ++++ ++++ tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ++++ tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++++ ++++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++++ z_L = _mm_add_ps(z_L, z); // Add the complex multiplication results together ++++ ++++ // VL ++++ //x = _mm_load_ps((float*)_input_BB); // Load the ar + ai, br + bi as ar,ai,br,bi ++++ y = _mm_loadu_ps((float*)_VL_code); // 
Load the cr + ci, dr + di as cr,ci,dr,di ++++ ++++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr ++++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di ++++ ++++ tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ++++ tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++++ ++++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++++ z_VL = _mm_add_ps(z_VL, z); // Add the complex multiplication results together ++++ ++++ /*pointer increment*/ ++++ _carrier += 2; ++++ _input += 2; ++++ _VE_code += 2; ++++ _E_code += 2; ++++ _P_code += 2; ++++ _L_code +=2; ++++ _VL_code +=2; ++++ } ++++ ++++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_VE[2]; ++++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_E[2]; ++++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_P[2]; ++++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_L[2]; ++++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_VL[2]; ++++ ++++ _mm_storeu_ps((float*)dotProductVector_VE,z_VE); // Store the results back into the dot product vector ++++ _mm_storeu_ps((float*)dotProductVector_E,z_E); // Store the results back into the dot product vector ++++ _mm_storeu_ps((float*)dotProductVector_P,z_P); // Store the results back into the dot product vector ++++ _mm_storeu_ps((float*)dotProductVector_L,z_L); // Store the results back into the dot product vector ++++ _mm_storeu_ps((float*)dotProductVector_VL,z_VL); // Store the results back into the dot product vector ++++ ++++ dotProduct_VE = ( dotProductVector_VE[0] + dotProductVector_VE[1] ); ++++ dotProduct_E = ( dotProductVector_E[0] + dotProductVector_E[1] ); ++++ dotProduct_P = ( dotProductVector_P[0] + dotProductVector_P[1] ); ++++ dotProduct_L = ( dotProductVector_L[0] + dotProductVector_L[1] ); ++++ dotProduct_VL = ( dotProductVector_VL[0] + dotProductVector_VL[1] ); ++++ ++++ if((num_points % 2) != 0) ++++ { ++++ dotProduct_VE += (*_input) * 
(*_VE_code)*(*_carrier); ++++ dotProduct_E += (*_input) * (*_E_code)*(*_carrier); ++++ dotProduct_P += (*_input) * (*_P_code)*(*_carrier); ++++ dotProduct_L += (*_input) * (*_L_code)*(*_carrier); ++++ dotProduct_VL += (*_input) * (*_VL_code)*(*_carrier); ++++ } ++++ ++++ *VE_out = dotProduct_VE; ++++ *E_out = dotProduct_E; ++++ *P_out = dotProduct_P; ++++ *L_out = dotProduct_L; ++++ *VL_out = dotProduct_VL; ++++} ++++#endif /* LV_HAVE_SSE3 */ ++++ ++++#ifdef LV_HAVE_GENERIC ++++/*! ++++ \brief Performs the carrier wipe-off mixing and the VE, Early, Prompt, Late and VL correlation (plain C loop, no SIMD intrinsics) ++++ \param input The input signal input ++++ \param carrier The carrier signal input ++++ \param VE_code VE PRN code replica input ++++ \param E_code Early PRN code replica input ++++ \param P_code Prompt PRN code replica input ++++ \param L_code Late PRN code replica input ++++ \param VL_code VL PRN code replica input ++++ \param VE_out VE correlation output ++++ \param E_out Early correlation output ++++ \param P_out Prompt correlation output ++++ \param L_out Late correlation output ++++ \param VL_out VL correlation output ++++ \param num_points The number of complex values in vectors ++++ */ ++++static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* VE_code, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, const lv_32fc_t* VL_code, unsigned int num_points) ++++{ ++++ lv_32fc_t bb_signal_sample; ++++ ++++ bb_signal_sample = lv_cmake(0, 0); ++++ ++++ *VE_out = 0; ++++ *E_out = 0; ++++ *P_out = 0; ++++ *L_out = 0; ++++ *VL_out = 0; ++++ // perform VE, Early, Prompt, Late and VL correlation ++++ for(int i=0; i < num_points; ++i) ++++ { ++++ //Perform the carrier wipe-off ++++ bb_signal_sample = input[i] * carrier[i]; ++++ // Now get VE, early, prompt, late, and VL values for each ++++ *VE_out +=
bb_signal_sample * VE_code[i]; ++++ *E_out += bb_signal_sample * E_code[i]; ++++ *P_out += bb_signal_sample * P_code[i]; ++++ *L_out += bb_signal_sample * L_code[i]; ++++ *VL_out += bb_signal_sample * VL_code[i]; ++++ } ++++} ++++ ++++#endif /* LV_HAVE_GENERIC */ ++++ ++++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_u_H */ ++++ ++++ ++++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_a_H ++++#define INCLUDED_gnsssdr_volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_a_H ++++ ++++#include ++++#include ++++#include ++++#include ++++#include ++++ ++++#ifdef LV_HAVE_AVX ++++#include ++++/*! ++++ \brief Performs the carrier wipe-off mixing and the VE, Early, Prompt, Late and VL correlation ++++ \param input The input signal input ++++ \param carrier The carrier signal input ++++ \param VE_code VE PRN code replica input ++++ \param E_code Early PRN code replica input ++++ \param P_code Early PRN code replica input ++++ \param L_code Early PRN code replica input ++++ \param VL_code VL PRN code replica input ++++ \param VE_out VE correlation output ++++ \param E_out Early correlation output ++++ \param P_out Early correlation output ++++ \param L_out Early correlation output ++++ \param VL_out VL correlation output ++++ \param num_points The number of complex values in vectors ++++ */ ++++static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_a_avx(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* VE_code, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, const lv_32fc_t* VL_code, unsigned int num_points) ++++{ ++++ unsigned int number = 0; ++++ const unsigned int halfPoints = num_points / 4; ++++ ++++ lv_32fc_t dotProduct_VE; ++++ lv_32fc_t dotProduct_E; ++++ lv_32fc_t dotProduct_P; ++++ lv_32fc_t dotProduct_L; ++++ lv_32fc_t dotProduct_VL; ++++ ++++ // Aux vars ++++ __m256 x, y, yl, yh, z, tmp1, 
tmp2, z_VE, z_E, z_P, z_L, z_VL; ++++ __m256 bb_signal_sample, bb_signal_sample_shuffled; ++++ ++++ z_VE = _mm256_setzero_ps(); ++++ z_E = _mm256_setzero_ps(); ++++ z_P = _mm256_setzero_ps(); ++++ z_L = _mm256_setzero_ps(); ++++ z_VL = _mm256_setzero_ps(); ++++ ++++ //input and output vectors ++++ const lv_32fc_t* _input = input; ++++ const lv_32fc_t* _carrier = carrier; ++++ const lv_32fc_t* _VE_code = VE_code; ++++ const lv_32fc_t* _E_code = E_code; ++++ const lv_32fc_t* _P_code = P_code; ++++ const lv_32fc_t* _L_code = L_code; ++++ const lv_32fc_t* _VL_code = VL_code; ++++ ++++ for(;number < halfPoints; number++) ++++ { ++++ // carrier wipe-off (vector point-to-point product) ++++ x = _mm256_load_ps((float*)_input); // Load the ar + ai, br + bi as ar,ai,br,bi ++++ y = _mm256_load_ps((float*)_carrier); // Load the cr + ci, dr + di as cr,ci,dr,di ++++ ++++ yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr ++++ yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di ++++ ++++ tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ++++ ++++ x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br ++++ ++++ tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++++ ++++ bb_signal_sample = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++++ bb_signal_sample_shuffled = _mm256_shuffle_ps(bb_signal_sample,bb_signal_sample,0xB1); // Re-arrange bb_signal_sample to be ai,ar,bi,br ++++ ++++ // correlation VE,E,P,L,VL (5x vector scalar product) ++++ // VE ++++ y = _mm256_load_ps((float*)_VE_code); // Load the cr + ci, dr + di as cr,ci,dr,di ++++ ++++ yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr ++++ yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di ++++ ++++ tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ++++ tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++++ ++++ z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, 
br*dr-bi*di, bi*dr+br*di ++++ z_VE = _mm256_add_ps(z_VE, z); // Add the complex multiplication results together ++++ ++++ // Early ++++ y = _mm256_load_ps((float*)_E_code); // Load the cr + ci, dr + di as cr,ci,dr,di ++++ ++++ yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr ++++ yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di ++++ ++++ tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ++++ tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++++ ++++ z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++++ z_E = _mm256_add_ps(z_E, z); // Add the complex multiplication results together ++++ ++++ // Prompt ++++ y = _mm256_load_ps((float*)_P_code); // Load the cr + ci, dr + di as cr,ci,dr,di ++++ ++++ yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr ++++ yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di ++++ ++++ tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ++++ tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++++ ++++ z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++++ z_P = _mm256_add_ps(z_P, z); // Add the complex multiplication results together ++++ ++++ // Late ++++ y = _mm256_load_ps((float*)_L_code); // Load the cr + ci, dr + di as cr,ci,dr,di ++++ ++++ yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr ++++ yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di ++++ ++++ tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ++++ tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++++ ++++ z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++++ z_L = _mm256_add_ps(z_L, z); // Add the complex multiplication results together ++++ ++++ // VL ++++ y = _mm256_load_ps((float*)_VL_code); // Load the cr + ci, dr + di as cr,ci,dr,di ++++ 
++++ yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr ++++ yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di ++++ ++++ tmp1 = _mm256_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ++++ tmp2 = _mm256_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++++ ++++ z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++++ z_VL = _mm256_add_ps(z_VL, z); // Add the complex multiplication results together ++++ ++++ /*pointer increment*/ ++++ _carrier += 4; ++++ _input += 4; ++++ _VE_code += 4; ++++ _E_code += 4; ++++ _P_code += 4; ++++ _L_code += 4; ++++ _VL_code += 4; ++++ } ++++ ++++ __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_VE[4]; ++++ __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_E[4]; ++++ __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_P[4]; ++++ __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_L[4]; ++++ __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector_VL[4]; ++++ ++++ _mm256_store_ps((float*)dotProductVector_VE,z_VE); // Store the results back into the dot product vector ++++ _mm256_store_ps((float*)dotProductVector_E,z_E); // Store the results back into the dot product vector ++++ _mm256_store_ps((float*)dotProductVector_P,z_P); // Store the results back into the dot product vector ++++ _mm256_store_ps((float*)dotProductVector_L,z_L); // Store the results back into the dot product vector ++++ _mm256_store_ps((float*)dotProductVector_VL,z_VL); // Store the results back into the dot product vector ++++ ++++ dotProduct_VE = ( dotProductVector_VE[0] + dotProductVector_VE[1] + dotProductVector_VE[2] + dotProductVector_VE[3] ); ++++ dotProduct_E = ( dotProductVector_E[0] + dotProductVector_E[1] + dotProductVector_E[2] + dotProductVector_E[3] ); ++++ dotProduct_P = ( dotProductVector_P[0] + dotProductVector_P[1] + dotProductVector_P[2] + dotProductVector_P[3] ); ++++ dotProduct_L = ( dotProductVector_L[0] + dotProductVector_L[1] + dotProductVector_L[2] + 
dotProductVector_L[3] ); ++++ dotProduct_VL = ( dotProductVector_VL[0] + dotProductVector_VL[1] + dotProductVector_VL[2] + dotProductVector_VL[3] ); ++++ ++++ for (int i = 0; i<(num_points % 4); ++i) ++++ { ++++ dotProduct_VE += (*_input) * (*_VE_code++) * (*_carrier); ++++ dotProduct_E += (*_input) * (*_E_code++) * (*_carrier); ++++ dotProduct_P += (*_input) * (*_P_code++) * (*_carrier); ++++ dotProduct_L += (*_input) * (*_L_code++) * (*_carrier); ++++ dotProduct_VL += (*_input++) * (*_VL_code++) * (*_carrier++); ++++ } ++++ ++++ *VE_out = dotProduct_VE; ++++ *E_out = dotProduct_E; ++++ *P_out = dotProduct_P; ++++ *L_out = dotProduct_L; ++++ *VL_out = dotProduct_VL; ++++} ++++#endif /* LV_HAVE_AVX */ ++++ ++++#ifdef LV_HAVE_SSE3 ++++#include ++++/*! ++++ \brief Performs the carrier wipe-off mixing and the VE, Early, Prompt, Late and VL correlation ++++ \param input The input signal input ++++ \param carrier The carrier signal input ++++ \param VE_code VE PRN code replica input ++++ \param E_code Early PRN code replica input ++++ \param P_code Early PRN code replica input ++++ \param L_code Early PRN code replica input ++++ \param VL_code VL PRN code replica input ++++ \param VE_out VE correlation output ++++ \param E_out Early correlation output ++++ \param P_out Early correlation output ++++ \param L_out Early correlation output ++++ \param VL_out VL correlation output ++++ \param num_points The number of complex values in vectors ++++ */ ++++static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_a_sse3(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* VE_code, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, const lv_32fc_t* VL_code, unsigned int num_points) ++++{ ++++ unsigned int number = 0; ++++ const unsigned int halfPoints = num_points / 2; ++++ ++++ lv_32fc_t dotProduct_VE; ++++ lv_32fc_t dotProduct_E; ++++ lv_32fc_t 
dotProduct_P; ++++ lv_32fc_t dotProduct_L; ++++ lv_32fc_t dotProduct_VL; ++++ ++++ // Aux vars ++++ __m128 x, y, yl, yh, z, tmp1, tmp2, z_VE, z_E, z_P, z_L, z_VL; ++++ __m128 bb_signal_sample, bb_signal_sample_shuffled; ++++ ++++ z_VE = _mm_setzero_ps(); ++++ z_E = _mm_setzero_ps(); ++++ z_P = _mm_setzero_ps(); ++++ z_L = _mm_setzero_ps(); ++++ z_VL = _mm_setzero_ps(); ++++ ++++ //input and output vectors ++++ const lv_32fc_t* _input = input; ++++ const lv_32fc_t* _carrier = carrier; ++++ const lv_32fc_t* _VE_code = VE_code; ++++ const lv_32fc_t* _E_code = E_code; ++++ const lv_32fc_t* _P_code = P_code; ++++ const lv_32fc_t* _L_code = L_code; ++++ const lv_32fc_t* _VL_code = VL_code; ++++ ++++ for(;number < halfPoints; number++) ++++ { ++++ // carrier wipe-off (vector point-to-point product) ++++ x = _mm_load_ps((float*)_input); // Load the ar + ai, br + bi as ar,ai,br,bi ++++ y = _mm_load_ps((float*)_carrier); // Load the cr + ci, dr + di as cr,ci,dr,di ++++ ++++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr ++++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di ++++ ++++ tmp1 = _mm_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ++++ ++++ x = _mm_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br ++++ ++++ tmp2 = _mm_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++++ ++++ bb_signal_sample = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++++ bb_signal_sample_shuffled = _mm_shuffle_ps(bb_signal_sample,bb_signal_sample,0xB1); // Re-arrange bb_signal_sample to be ai,ar,bi,br ++++ ++++ // correlation VE,E,P,L,VL (5x vector scalar product) ++++ // VE ++++ y = _mm_load_ps((float*)_VE_code); // Load the cr + ci, dr + di as cr,ci,dr,di ++++ ++++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr ++++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di ++++ ++++ tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ++++ tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = 
ai*ci,ar*ci,bi*di,br*di ++++ ++++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++++ z_VE = _mm_add_ps(z_VE, z); // Add the complex multiplication results together ++++ ++++ // Early ++++ y = _mm_load_ps((float*)_E_code); // Load the cr + ci, dr + di as cr,ci,dr,di ++++ ++++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr ++++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di ++++ ++++ tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ++++ tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++++ ++++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++++ z_E = _mm_add_ps(z_E, z); // Add the complex multiplication results together ++++ ++++ // Prompt ++++ y = _mm_load_ps((float*)_P_code); // Load the cr + ci, dr + di as cr,ci,dr,di ++++ ++++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr ++++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di ++++ ++++ tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ++++ tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++++ ++++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++++ z_P = _mm_add_ps(z_P, z); // Add the complex multiplication results together ++++ ++++ // Late ++++ y = _mm_load_ps((float*)_L_code); // Load the cr + ci, dr + di as cr,ci,dr,di ++++ ++++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr ++++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di ++++ ++++ tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ++++ tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++++ ++++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++++ z_L = _mm_add_ps(z_L, z); // Add the complex multiplication results together ++++ ++++ // VL ++++ //x = _mm_load_ps((float*)_input_BB); // Load the ar + ai, br + 
bi as ar,ai,br,bi ++++ y = _mm_load_ps((float*)_VL_code); // Load the cr + ci, dr + di as cr,ci,dr,di ++++ ++++ yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr ++++ yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di ++++ ++++ tmp1 = _mm_mul_ps(bb_signal_sample,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ++++ tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ++++ ++++ z = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di ++++ z_VL = _mm_add_ps(z_VL, z); // Add the complex multiplication results together ++++ ++++ /*pointer increment*/ ++++ _carrier += 2; ++++ _input += 2; ++++ _VE_code += 2; ++++ _E_code += 2; ++++ _P_code += 2; ++++ _L_code +=2; ++++ _VL_code +=2; ++++ } ++++ ++++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_VE[2]; ++++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_E[2]; ++++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_P[2]; ++++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_L[2]; ++++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_VL[2]; ++++ ++++ _mm_store_ps((float*)dotProductVector_VE,z_VE); // Store the results back into the dot product vector ++++ _mm_store_ps((float*)dotProductVector_E,z_E); // Store the results back into the dot product vector ++++ _mm_store_ps((float*)dotProductVector_P,z_P); // Store the results back into the dot product vector ++++ _mm_store_ps((float*)dotProductVector_L,z_L); // Store the results back into the dot product vector ++++ _mm_store_ps((float*)dotProductVector_VL,z_VL); // Store the results back into the dot product vector ++++ ++++ dotProduct_VE = ( dotProductVector_VE[0] + dotProductVector_VE[1] ); ++++ dotProduct_E = ( dotProductVector_E[0] + dotProductVector_E[1] ); ++++ dotProduct_P = ( dotProductVector_P[0] + dotProductVector_P[1] ); ++++ dotProduct_L = ( dotProductVector_L[0] + dotProductVector_L[1] ); ++++ dotProduct_VL = ( dotProductVector_VL[0] + dotProductVector_VL[1] ); ++++ ++++ if((num_points % 2) != 0) 
++++ { ++++ dotProduct_VE += (*_input) * (*_VE_code)*(*_carrier); ++++ dotProduct_E += (*_input) * (*_E_code)*(*_carrier); ++++ dotProduct_P += (*_input) * (*_P_code)*(*_carrier); ++++ dotProduct_L += (*_input) * (*_L_code)*(*_carrier); ++++ dotProduct_VL += (*_input) * (*_VL_code)*(*_carrier); ++++ } ++++ ++++ *VE_out = dotProduct_VE; ++++ *E_out = dotProduct_E; ++++ *P_out = dotProduct_P; ++++ *L_out = dotProduct_L; ++++ *VL_out = dotProduct_VL; ++++} ++++#endif /* LV_HAVE_SSE3 */ ++++ ++++#ifdef LV_HAVE_GENERIC ++++/*! ++++ \brief Performs the carrier wipe-off mixing and the VE, Early, Prompt, Late and VL correlation ++++ \param input The input signal input ++++ \param carrier The carrier signal input ++++ \param VE_code VE PRN code replica input ++++ \param E_code Early PRN code replica input ++++ \param P_code Early PRN code replica input ++++ \param L_code Early PRN code replica input ++++ \param VL_code VL PRN code replica input ++++ \param VE_out VE correlation output ++++ \param E_out Early correlation output ++++ \param P_out Early correlation output ++++ \param L_out Early correlation output ++++ \param VL_out VL correlation output ++++ \param num_points The number of complex values in vectors ++++ */ ++++static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_a_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* VE_code, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, const lv_32fc_t* VL_code, unsigned int num_points) ++++{ ++++ lv_32fc_t bb_signal_sample; ++++ ++++ bb_signal_sample = lv_cmake(0, 0); ++++ ++++ *VE_out = 0; ++++ *E_out = 0; ++++ *P_out = 0; ++++ *L_out = 0; ++++ *VL_out = 0; ++++ // perform Early, Prompt and Late correlation ++++ for(int i=0; i < num_points; ++i) ++++ { ++++ //Perform the carrier wipe-off ++++ bb_signal_sample = input[i] * carrier[i]; ++++ // Now get early, late, and prompt 
values for each ++++ *VE_out += bb_signal_sample * VE_code[i]; ++++ *E_out += bb_signal_sample * E_code[i]; ++++ *P_out += bb_signal_sample * P_code[i]; ++++ *L_out += bb_signal_sample * L_code[i]; ++++ *VL_out += bb_signal_sample * VL_code[i]; ++++ } ++++} ++++#endif /* LV_HAVE_GENERIC */ ++++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_a_H */ +++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_64f_accumulator_64f.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_64f_accumulator_64f.h +++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_64f_accumulator_64f.h 1970-01-01 01:00:00.000000000 +0100 ++++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_64f_accumulator_64f.h 2014-10-15 01:55:08.000000000 +0200 +++@@ -0,0 +1,243 @@ ++++/*! ++++ * \file volk_gnsssdr_64f_accumulator_64f.h ++++ * \brief Volk protokernel: 64 bits (double) scalar accumulator ++++ * \authors
    ++++ *
  • Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com ++++ *
++++ * ++++ * Volk protokernel that implements an accumulator of double values ++++ * ++++ * ------------------------------------------------------------------------- ++++ * ++++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) ++++ * ++++ * GNSS-SDR is a software defined Global Navigation ++++ * Satellite Systems receiver ++++ * ++++ * This file is part of GNSS-SDR. ++++ * ++++ * GNSS-SDR is free software: you can redistribute it and/or modify ++++ * it under the terms of the GNU General Public License as published by ++++ * the Free Software Foundation, either version 3 of the License, or ++++ * (at your option) any later version. ++++ * ++++ * GNSS-SDR is distributed in the hope that it will be useful, ++++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++++ * GNU General Public License for more details. ++++ * ++++ * You should have received a copy of the GNU General Public License ++++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>. ++++ * ++++ * ------------------------------------------------------------------------- ++++ */ ++++ ++++#ifndef INCLUDED_volk_gnsssdr_64f_accumulator_64f_u_H ++++#define INCLUDED_volk_gnsssdr_64f_accumulator_64f_u_H ++++ ++++#include ++++#include ++++#include ++++ ++++#ifdef LV_HAVE_AVX ++++#include ++++/*!
++++ \brief Accumulates the values in the input buffer ++++ \param result The accumulated result ++++ \param inputBuffer The buffer of data to be accumulated ++++ \param num_points The number of values in inputBuffer to be accumulated ++++ */ ++++static inline void volk_gnsssdr_64f_accumulator_64f_u_avx(double* result,const double* inputBuffer, unsigned int num_points){ ++++ double returnValue = 0; ++++ const unsigned int sse_iters = num_points / 4; ++++ ++++ const double* aPtr = inputBuffer; ++++ ++++ __VOLK_ATTR_ALIGNED(32) double tempBuffer[4]; ++++ __m256d accumulator = _mm256_setzero_pd(); ++++ __m256d aVal = _mm256_setzero_pd(); ++++ ++++ for(unsigned int number = 0; number < sse_iters; number++) ++++ { ++++ aVal = _mm256_loadu_pd(aPtr); ++++ accumulator = _mm256_add_pd(accumulator, aVal); ++++ aPtr += 4; ++++ } ++++ ++++ _mm256_storeu_pd((double*)tempBuffer,accumulator); ++++ ++++ for(int i = 0; i<4; ++i){ ++++ returnValue += tempBuffer[i]; ++++ } ++++ ++++ for(int i = 0; i<(num_points % 4); ++i){ ++++ returnValue += (*aPtr++); ++++ } ++++ ++++ *result = returnValue; ++++} ++++#endif /* LV_HAVE_AVX */ ++++ ++++#ifdef LV_HAVE_SSE3 ++++#include ++++/*! 
++++ \brief Accumulates the values in the input buffer ++++ \param result The accumulated result ++++ \param inputBuffer The buffer of data to be accumulated ++++ \param num_points The number of values in inputBuffer to be accumulated ++++ */ ++++static inline void volk_gnsssdr_64f_accumulator_64f_u_sse3(double* result,const double* inputBuffer, unsigned int num_points){ ++++ double returnValue = 0; ++++ const unsigned int sse_iters = num_points / 2; ++++ ++++ const double* aPtr = inputBuffer; ++++ ++++ __VOLK_ATTR_ALIGNED(16) double tempBuffer[2]; ++++ __m128d accumulator = _mm_setzero_pd(); ++++ __m128d aVal = _mm_setzero_pd(); ++++ ++++ for(unsigned int number = 0; number < sse_iters; number++) ++++ { ++++ aVal = _mm_loadu_pd(aPtr); ++++ accumulator = _mm_add_pd(accumulator, aVal); ++++ aPtr += 2; ++++ } ++++ ++++ _mm_storeu_pd((double*)tempBuffer,accumulator); ++++ ++++ for(int i = 0; i<2; ++i){ ++++ returnValue += tempBuffer[i]; ++++ } ++++ ++++ for(int i = 0; i<(num_points % 2); ++i){ ++++ returnValue += (*aPtr++); ++++ } ++++ ++++ *result = returnValue; ++++} ++++#endif /* LV_HAVE_SSE3 */ ++++ ++++#ifdef LV_HAVE_GENERIC ++++/*! 
++++ \brief Accumulates the values in the input buffer ++++ \param result The accumulated result ++++ \param inputBuffer The buffer of data to be accumulated ++++ \param num_points The number of values in inputBuffer to be accumulated ++++ */ ++++static inline void volk_gnsssdr_64f_accumulator_64f_generic(double* result,const double* inputBuffer, unsigned int num_points){ ++++ const double* aPtr = inputBuffer; ++++ double returnValue = 0; ++++ ++++ for(unsigned int number = 0;number < num_points; number++){ ++++ returnValue += (*aPtr++); ++++ } ++++ *result = returnValue; ++++} ++++#endif /* LV_HAVE_GENERIC */ ++++ ++++#endif /* INCLUDED_volk_gnsssdr_64f_accumulator_64f_u_H */ ++++ ++++ ++++#ifndef INCLUDED_volk_gnsssdr_64f_accumulator_64f_a_H ++++#define INCLUDED_volk_gnsssdr_64f_accumulator_64f_a_H ++++ ++++#include ++++#include ++++#include ++++ ++++#ifdef LV_HAVE_AVX ++++#include ++++/*! ++++ \brief Accumulates the values in the input buffer ++++ \param result The accumulated result ++++ \param inputBuffer The buffer of data to be accumulated ++++ \param num_points The number of values in inputBuffer to be accumulated ++++ */ ++++static inline void volk_gnsssdr_64f_accumulator_64f_a_avx(double* result,const double* inputBuffer, unsigned int num_points){ ++++ double returnValue = 0; ++++ const unsigned int sse_iters = num_points / 4; ++++ ++++ const double* aPtr = inputBuffer; ++++ ++++ __VOLK_ATTR_ALIGNED(32) double tempBuffer[4]; ++++ __m256d accumulator = _mm256_setzero_pd(); ++++ __m256d aVal = _mm256_setzero_pd(); ++++ ++++ for(unsigned int number = 0; number < sse_iters; number++) ++++ { ++++ aVal = _mm256_load_pd(aPtr); ++++ accumulator = _mm256_add_pd(accumulator, aVal); ++++ aPtr += 4; ++++ } ++++ ++++ _mm256_store_pd((double*)tempBuffer,accumulator); ++++ ++++ for(int i = 0; i<4; ++i){ ++++ returnValue += tempBuffer[i]; ++++ } ++++ ++++ for(int i = 0; i<(num_points % 4); ++i){ ++++ returnValue += (*aPtr++); ++++ } ++++ ++++ *result = returnValue; ++++} 
++++#endif /* LV_HAVE_AVX */ ++++ ++++#ifdef LV_HAVE_SSE3 ++++#include ++++/*! ++++ \brief Accumulates the values in the input buffer ++++ \param result The accumulated result ++++ \param inputBuffer The buffer of data to be accumulated ++++ \param num_points The number of values in inputBuffer to be accumulated ++++ */ ++++static inline void volk_gnsssdr_64f_accumulator_64f_a_sse3(double* result,const double* inputBuffer, unsigned int num_points){ ++++ double returnValue = 0; ++++ const unsigned int sse_iters = num_points / 2; ++++ ++++ const double* aPtr = inputBuffer; ++++ ++++ __VOLK_ATTR_ALIGNED(16) double tempBuffer[2]; ++++ __m128d accumulator = _mm_setzero_pd(); ++++ __m128d aVal = _mm_setzero_pd(); ++++ ++++ for(unsigned int number = 0; number < sse_iters; number++) ++++ { ++++ aVal = _mm_load_pd(aPtr); ++++ accumulator = _mm_add_pd(accumulator, aVal); ++++ aPtr += 2; ++++ } ++++ ++++ _mm_store_pd((double*)tempBuffer,accumulator); ++++ ++++ for(int i = 0; i<2; ++i){ ++++ returnValue += tempBuffer[i]; ++++ } ++++ ++++ for(int i = 0; i<(num_points % 2); ++i){ ++++ returnValue += (*aPtr++); ++++ } ++++ ++++ *result = returnValue; ++++} ++++#endif /* LV_HAVE_SSE3 */ ++++ ++++#ifdef LV_HAVE_GENERIC ++++/*! 
++++ \brief Accumulates the values in the input buffer ++++ \param result The accumulated result ++++ \param inputBuffer The buffer of data to be accumulated ++++ \param num_points The number of values in inputBuffer to be accumulated ++++ */ ++++static inline void volk_gnsssdr_64f_accumulator_64f_a_generic(double* result,const double* inputBuffer, unsigned int num_points){ ++++ const double* aPtr = inputBuffer; ++++ double returnValue = 0; ++++ ++++ for(unsigned int number = 0;number < num_points; number++){ ++++ returnValue += (*aPtr++); ++++ } ++++ *result = returnValue; ++++} ++++#endif /* LV_HAVE_GENERIC */ ++++#endif /* INCLUDED_volk_gnsssdr_64f_accumulator_64f_a_H */ +++\ No newline at end of file +++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_accumulator_s8i.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8i_accumulator_s8i.h +++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_accumulator_s8i.h 1970-01-01 01:00:00.000000000 +0100 ++++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8i_accumulator_s8i.h 2014-10-15 01:55:08.000000000 +0200 +++@@ -0,0 +1,183 @@ ++++/*! ++++ * \file volk_gnsssdr_8i_accumulator_s8i.h ++++ * \brief Volk protokernel: 8 bits (char) scalar accumulator ++++ * \authors
    ++++ *
  • Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com ++++ *
++++ * ++++ * Volk protokernel that implements an accumulator of char values ++++ * ++++ * ------------------------------------------------------------------------- ++++ * ++++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) ++++ * ++++ * GNSS-SDR is a software defined Global Navigation ++++ * Satellite Systems receiver ++++ * ++++ * This file is part of GNSS-SDR. ++++ * ++++ * GNSS-SDR is free software: you can redistribute it and/or modify ++++ * it under the terms of the GNU General Public License as published by ++++ * the Free Software Foundation, either version 3 of the License, or ++++ * (at your option) any later version. ++++ * ++++ * GNSS-SDR is distributed in the hope that it will be useful, ++++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++++ * GNU General Public License for more details. ++++ * ++++ * You should have received a copy of the GNU General Public License ++++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>. ++++ * ++++ * ------------------------------------------------------------------------- ++++ */ ++++ ++++#ifndef INCLUDED_volk_gnsssdr_8i_accumulator_s8i_u_H ++++#define INCLUDED_volk_gnsssdr_8i_accumulator_s8i_u_H ++++ ++++#include ++++#include ++++#include ++++ ++++#ifdef LV_HAVE_SSE3 ++++#include ++++/*!
++++ \brief Accumulates the values in the input buffer ++++ \param result The accumulated result ++++ \param inputBuffer The buffer of data to be accumulated ++++ \param num_points The number of values in inputBuffer to be accumulated ++++ */ ++++static inline void volk_gnsssdr_8i_accumulator_s8i_u_sse3(char* result, const char* inputBuffer, unsigned int num_points){ ++++ char returnValue = 0; ++++ const unsigned int sse_iters = num_points / 16; ++++ ++++ const char* aPtr = inputBuffer; ++++ ++++ __VOLK_ATTR_ALIGNED(16) char tempBuffer[16]; ++++ __m128i accumulator = _mm_setzero_si128(); ++++ __m128i aVal = _mm_setzero_si128(); ++++ ++++ for(unsigned int number = 0; number < sse_iters; number++){ ++++ aVal = _mm_lddqu_si128((__m128i*)aPtr); ++++ accumulator = _mm_add_epi8(accumulator, aVal); ++++ aPtr += 16; ++++ } ++++ _mm_storeu_si128((__m128i*)tempBuffer,accumulator); ++++ ++++ for(int i = 0; i<16; ++i){ ++++ returnValue += tempBuffer[i]; ++++ } ++++ ++++ for(int i = 0; i<(num_points % 16); ++i){ ++++ returnValue += (*aPtr++); ++++ } ++++ ++++ *result = returnValue; ++++} ++++#endif /* LV_HAVE_SSE3 */ ++++ ++++#ifdef LV_HAVE_GENERIC ++++/*! 
++++ \brief Accumulates the values in the input buffer ++++ \param result The accumulated result ++++ \param inputBuffer The buffer of data to be accumulated ++++ \param num_points The number of values in inputBuffer to be accumulated ++++ */ ++++static inline void volk_gnsssdr_8i_accumulator_s8i_generic(char* result, const char* inputBuffer, unsigned int num_points){ ++++ const char* aPtr = inputBuffer; ++++ char returnValue = 0; ++++ ++++ for(unsigned int number = 0;number < num_points; number++){ ++++ returnValue += (*aPtr++); ++++ } ++++ *result = returnValue; ++++} ++++#endif /* LV_HAVE_GENERIC */ ++++ ++++#endif /* INCLUDED_volk_gnsssdr_8i_accumulator_s8i_u_H */ ++++ ++++ ++++#ifndef INCLUDED_volk_gnsssdr_8i_accumulator_s8i_a_H ++++#define INCLUDED_volk_gnsssdr_8i_accumulator_s8i_a_H ++++ ++++#include ++++#include ++++#include ++++ ++++#ifdef LV_HAVE_SSE3 ++++#include ++++/*! ++++ \brief Accumulates the values in the input buffer ++++ \param result The accumulated result ++++ \param inputBuffer The buffer of data to be accumulated ++++ \param num_points The number of values in inputBuffer to be accumulated ++++ */ ++++static inline void volk_gnsssdr_8i_accumulator_s8i_a_sse3(char* result, const char* inputBuffer, unsigned int num_points){ ++++ char returnValue = 0; ++++ const unsigned int sse_iters = num_points / 16; ++++ ++++ const char* aPtr = inputBuffer; ++++ ++++ __VOLK_ATTR_ALIGNED(16) char tempBuffer[16]; ++++ __m128i accumulator = _mm_setzero_si128(); ++++ __m128i aVal = _mm_setzero_si128(); ++++ ++++ for(unsigned int number = 0; number < sse_iters; number++){ ++++ aVal = _mm_load_si128((__m128i*)aPtr); ++++ accumulator = _mm_add_epi8(accumulator, aVal); ++++ aPtr += 16; ++++ } ++++ _mm_store_si128((__m128i*)tempBuffer,accumulator); ++++ ++++ for(int i = 0; i<16; ++i){ ++++ returnValue += tempBuffer[i]; ++++ } ++++ ++++ for(int i = 0; i<(num_points % 16); ++i){ ++++ returnValue += (*aPtr++); ++++ } ++++ ++++ *result = returnValue; ++++} ++++#endif /* 
LV_HAVE_SSE3 */ ++++ ++++#ifdef LV_HAVE_GENERIC ++++/*! ++++ \brief Accumulates the values in the input buffer ++++ \param result The accumulated result ++++ \param inputBuffer The buffer of data to be accumulated ++++ \param num_points The number of values in inputBuffer to be accumulated ++++ */ ++++static inline void volk_gnsssdr_8i_accumulator_s8i_a_generic(char* result, const char* inputBuffer, unsigned int num_points){ ++++ const char* aPtr = inputBuffer; ++++ char returnValue = 0; ++++ ++++ for(unsigned int number = 0;number < num_points; number++){ ++++ returnValue += (*aPtr++); ++++ } ++++ *result = returnValue; ++++} ++++#endif /* LV_HAVE_GENERIC */ ++++ ++++#ifdef LV_HAVE_ORC ++++/*! ++++ \brief Accumulates the values in the input buffer ++++ \param result The accumulated result ++++ \param inputBuffer The buffer of data to be accumulated ++++ \param num_points The number of values in inputBuffer to be accumulated ++++ */ ++++extern void volk_gnsssdr_8i_accumulator_s8i_a_orc_impl(short* result, const char* inputBuffer, unsigned int num_points); ++++static inline void volk_gnsssdr_8i_accumulator_s8i_u_orc(char* result, const char* inputBuffer, unsigned int num_points){ ++++ ++++ short res = 0; ++++ char* resc = (char*)&res; ++++ resc++; ++++ ++++ volk_gnsssdr_8i_accumulator_s8i_a_orc_impl(&res, inputBuffer, num_points); ++++ ++++ *result = *resc; ++++} ++++#endif /* LV_HAVE_ORC */ ++++ ++++#endif /* INCLUDED_volk_gnsssdr_8i_accumulator_s8i_a_H */ ++++ +++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_index_max_16u.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8i_index_max_16u.h +++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_index_max_16u.h 1970-01-01 01:00:00.000000000 +0100 ++++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8i_index_max_16u.h 2014-10-15 01:55:08.000000000 +0200 +++@@ -0,0 +1,493 @@ ++++/*! 
++++ * \file volk_gnsssdr_8i_index_max_16u.h ++++ * \brief Volk protokernel: calculates the index of the maximum value in a group of 8 bits (char) scalars ++++ * \authors
++++ *          <ul> <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
++++ *          </ul>
++++ * ++++ * Volk protokernel that returns the index of the maximum value of a group of 8 bits (char) scalars ++++ * ++++ * ------------------------------------------------------------------------- ++++ * ++++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) ++++ * ++++ * GNSS-SDR is a software defined Global Navigation ++++ * Satellite Systems receiver ++++ * ++++ * This file is part of GNSS-SDR. ++++ * ++++ * GNSS-SDR is free software: you can redistribute it and/or modify ++++ * it under the terms of the GNU General Public License as published by ++++ * the Free Software Foundation, either version 3 of the License, or ++++ * at your option) any later version. ++++ * ++++ * GNSS-SDR is distributed in the hope that it will be useful, ++++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++++ * GNU General Public License for more details. ++++ * ++++ * You should have received a copy of the GNU General Public License ++++ * along with GNSS-SDR. If not, see . ++++ * ++++ * ------------------------------------------------------------------------- ++++ */ ++++ ++++#ifndef INCLUDED_volk_gnsssdr_8i_index_max_16u_u_H ++++#define INCLUDED_volk_gnsssdr_8i_index_max_16u_u_H ++++ ++++#include ++++#include ++++#include ++++ ++++#ifdef LV_HAVE_AVX ++++#include "immintrin.h" ++++/*! 
++++ \brief Returns the index of the max value in src0
++++ \param target The index of the max value in src0
++++ \param src0 The buffer of data to be analysed
++++ \param num_points The number of values in src0 to be analysed
++++
++++ Unaligned AVX variant. When several elements share the maximum, the
++++ lowest index is reported. target is left untouched when num_points is 0.
++++ */
++++static inline void volk_gnsssdr_8i_index_max_16u_u_avx(unsigned int* target, const char* src0, unsigned int num_points) {
++++    if(num_points > 0){
++++        const unsigned int sse_iters = num_points / 32;
++++
++++        char* basePtr = (char*)src0;
++++        char* inputPtr = (char*)src0;
++++        char max = src0[0];
++++        unsigned int index = 0;
++++        __VOLK_ATTR_ALIGNED(32) char currentValuesBuffer[32];
++++        __m256i ones, compareResults, currentValues;
++++        __m128i compareResultslo, compareResultshi, maxValues, lo, hi;
++++
++++        ones = _mm256_set1_epi8(0xFF);
++++        maxValues = _mm_set1_epi8(max);
++++
++++        for(unsigned int number = 0; number < sse_iters; number++)
++++        {
++++            currentValues = _mm256_lddqu_si256((__m256i*)inputPtr);
++++
++++            // The 8-bit compare is done on the two 128-bit halves
++++            lo = _mm256_castsi256_si128(currentValues);
++++            hi = _mm256_extractf128_si256(currentValues,1);
++++
++++            // A lane is all ones where the running max is strictly greater
++++            compareResultslo = _mm_cmpgt_epi8(maxValues, lo);
++++            compareResultshi = _mm_cmpgt_epi8(maxValues, hi);
++++
++++            // _mm256_set_m128i is not defined in some versions of immintrin.h,
++++            // so the halves are recombined with _mm256_insertf128_si256
++++            compareResults = _mm256_insertf128_si256(_mm256_castsi128_si256(compareResultslo),(compareResultshi),1);
++++
++++            // Scan the block only if some lane is not below the running max
++++            if (!_mm256_testc_si256(compareResults, ones))
++++            {
++++                _mm256_storeu_si256((__m256i*)currentValuesBuffer, currentValues);
++++
++++                for(int i = 0; i < 32; i++)
++++                {
++++                    if(currentValuesBuffer[i] > max)
++++                    {
++++                        index = inputPtr - basePtr + i;
++++                        max = currentValuesBuffer[i];
++++                    }
++++                }
++++                maxValues = _mm_set1_epi8(max);
++++            }
++++
++++            inputPtr += 32;
++++        }
++++
++++        // Remaining points: continue after the last full 32-byte block
++++        // (the previous code rescanned the first elements of src0 instead)
++++        for(unsigned int i = sse_iters * 32; i < num_points; ++i)
++++        {
++++            if(src0[i] > max)
++++            {
++++                index = i;
++++                max = src0[i];
++++            }
++++        }
++++        target[0] = index;
++++    }
++++}
++++
++++#endif
/*LV_HAVE_AVX*/
++++
++++#ifdef LV_HAVE_SSE4_1
++++#include <smmintrin.h>
++++/*!
++++ \brief Returns the index of the max value in src0
++++ \param target The index of the max value in src0
++++ \param src0 The buffer of data to be analysed
++++ \param num_points The number of values in src0 to be analysed
++++
++++ Unaligned SSE4.1 variant. When several elements share the maximum, the
++++ lowest index is reported. target is left untouched when num_points is 0.
++++ */
++++static inline void volk_gnsssdr_8i_index_max_16u_u_sse4_1(unsigned int* target, const char* src0, unsigned int num_points) {
++++    if(num_points > 0){
++++        const unsigned int sse_iters = num_points / 16;
++++
++++        char* basePtr = (char*)src0;
++++        char* inputPtr = (char*)src0;
++++        char max = src0[0];
++++        unsigned int index = 0;
++++        __VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16];
++++        __m128i maxValues, compareResults, currentValues;
++++
++++        maxValues = _mm_set1_epi8(max);
++++
++++        for(unsigned int number = 0; number < sse_iters; number++)
++++        {
++++            currentValues = _mm_lddqu_si128((__m128i*)inputPtr);
++++
++++            // A lane is all ones where the running max is strictly greater
++++            compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
++++
++++            // Scan the block only if some lane is not below the running max
++++            if (!_mm_test_all_ones(compareResults))
++++            {
++++                _mm_storeu_si128((__m128i*)currentValuesBuffer, currentValues);
++++
++++                for(int i = 0; i < 16; i++)
++++                {
++++                    if(currentValuesBuffer[i] > max)
++++                    {
++++                        index = inputPtr - basePtr + i;
++++                        max = currentValuesBuffer[i];
++++                    }
++++                }
++++                maxValues = _mm_set1_epi8(max);
++++            }
++++
++++            inputPtr += 16;
++++        }
++++
++++        // Remaining points: continue after the last full 16-byte block
++++        // (the previous code rescanned the first elements of src0 instead)
++++        for(unsigned int i = sse_iters * 16; i < num_points; ++i)
++++        {
++++            if(src0[i] > max)
++++            {
++++                index = i;
++++                max = src0[i];
++++            }
++++        }
++++        target[0] = index;
++++    }
++++}
++++
++++#endif /*LV_HAVE_SSE4_1*/
++++
++++#ifdef LV_HAVE_SSE2
++++#include <emmintrin.h>
++++/*!
++++ \brief Returns the index of the max value in src0
++++ \param target The index of the max value in src0
++++ \param src0 The buffer of data to be analysed
++++ \param num_points The number of values in src0 to be analysed
++++
++++ Unaligned SSE2 variant. When several elements share the maximum, the
++++ lowest index is reported. target is left untouched when num_points is 0.
++++ */
++++static inline void volk_gnsssdr_8i_index_max_16u_u_sse2(unsigned int* target, const char* src0, unsigned int num_points) {
++++    if(num_points > 0){
++++        const unsigned int sse_iters = num_points / 16;
++++
++++        char* basePtr = (char*)src0;
++++        char* inputPtr = (char*)src0;
++++        char max = src0[0];
++++        unsigned int index = 0;
++++        unsigned short mask;
++++        __VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16];
++++        __m128i maxValues, compareResults, currentValues;
++++
++++        maxValues = _mm_set1_epi8(max);
++++
++++        for(unsigned int number = 0; number < sse_iters; number++)
++++        {
++++            currentValues = _mm_loadu_si128((__m128i*)inputPtr);
++++            // One mask bit per lane, set where the running max is strictly greater
++++            compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
++++            mask = _mm_movemask_epi8(compareResults);
++++
++++            if (mask != 0xFFFF)
++++            {
++++                _mm_storeu_si128((__m128i*)currentValuesBuffer, currentValues);
++++                // After inversion the set bits mark the candidate lanes
++++                mask = ~mask;
++++                int i = 0;
++++                while (mask > 0)
++++                {
++++                    if ((mask & 1) == 1)
++++                    {
++++                        if(currentValuesBuffer[i] > max)
++++                        {
++++                            index = inputPtr - basePtr + i;
++++                            max = currentValuesBuffer[i];
++++                        }
++++                    }
++++                    i++;
++++                    mask >>= 1;
++++                }
++++                maxValues = _mm_set1_epi8(max);
++++            }
++++            inputPtr += 16;
++++        }
++++
++++        // Remaining points: continue after the last full 16-byte block
++++        // (the previous code rescanned the first elements of src0 instead)
++++        for(unsigned int i = sse_iters * 16; i < num_points; ++i)
++++        {
++++            if(src0[i] > max)
++++            {
++++                index = i;
++++                max = src0[i];
++++            }
++++        }
++++        target[0] = index;
++++    }
++++}
++++
++++#endif /*LV_HAVE_SSE2*/
++++
++++#ifdef LV_HAVE_GENERIC
++++/*!
++++ \brief Returns the index of the max value in src0
++++ \param target The index of the max value in src0
++++ \param src0 The buffer of data to be analysed
++++ \param num_points The number of values in src0 to be analysed
++++
++++ Plain C variant. When several elements share the maximum, the lowest
++++ index is reported. target is left untouched when num_points is 0.
++++ */
++++static inline void volk_gnsssdr_8i_index_max_16u_generic(unsigned int* target, const char* src0, unsigned int num_points) {
++++
++++    if(num_points > 0)
++++    {
++++        char max = src0[0];
++++        unsigned int index = 0;
++++
++++        for(unsigned int i = 1; i < num_points; ++i)
++++        {
++++            if(src0[i] > max)
++++            {
++++                index = i;
++++                max = src0[i];
++++            }
++++        }
++++        target[0] = index;
++++    }
++++}
++++
++++#endif /*LV_HAVE_GENERIC*/
++++
++++#endif /*INCLUDED_volk_gnsssdr_8i_index_max_16u_u_H*/
++++
++++
++++#ifndef INCLUDED_volk_gnsssdr_8i_index_max_16u_a_H
++++#define INCLUDED_volk_gnsssdr_8i_index_max_16u_a_H
++++
++++/* NOTE(review): the header names of the three #include lines below are missing in this copy of the patch - restore them from the original file. */
++++#include
++++#include
++++#include
++++
++++#ifdef LV_HAVE_AVX
++++#include "immintrin.h"
++++/*!
++++ \brief Returns the index of the max value in src0
++++ \param target The index of the max value in src0
++++ \param src0 The buffer of data to be analysed
++++ \param num_points The number of values in src0 to be analysed
++++
++++ Aligned AVX variant: src0 is read with _mm256_load_si256. When several
++++ elements share the maximum, the lowest index is reported. target is left
++++ untouched when num_points is 0.
++++ */
++++static inline void volk_gnsssdr_8i_index_max_16u_a_avx(unsigned int* target, const char* src0, unsigned int num_points) {
++++    if(num_points > 0){
++++        const unsigned int sse_iters = num_points / 32;
++++
++++        char* basePtr = (char*)src0;
++++        char* inputPtr = (char*)src0;
++++        char max = src0[0];
++++        unsigned int index = 0;
++++        __VOLK_ATTR_ALIGNED(32) char currentValuesBuffer[32];
++++        __m256i ones, compareResults, currentValues;
++++        __m128i compareResultslo, compareResultshi, maxValues, lo, hi;
++++
++++        ones = _mm256_set1_epi8(0xFF);
++++        maxValues = _mm_set1_epi8(max);
++++
++++        for(unsigned int number = 0; number < sse_iters; number++)
++++        {
++++            currentValues = _mm256_load_si256((__m256i*)inputPtr);
++++
++++            // The 8-bit compare is done on the two 128-bit halves
++++            lo = _mm256_castsi256_si128(currentValues);
++++            hi = _mm256_extractf128_si256(currentValues,1);
++++
++++            // A lane is all ones where the running max is strictly greater
++++            compareResultslo = _mm_cmpgt_epi8(maxValues, lo);
++++            compareResultshi = _mm_cmpgt_epi8(maxValues, hi);
++++
++++            // _mm256_set_m128i is not defined in some versions of immintrin.h,
++++            // so the halves are recombined with _mm256_insertf128_si256
++++            compareResults = _mm256_insertf128_si256(_mm256_castsi128_si256(compareResultslo),(compareResultshi),1);
++++
++++            // Scan the block only if some lane is not below the running max
++++            if (!_mm256_testc_si256(compareResults, ones))
++++            {
++++                _mm256_store_si256((__m256i*)currentValuesBuffer, currentValues);
++++
++++                for(int i = 0; i < 32; i++)
++++                {
++++                    if(currentValuesBuffer[i] > max)
++++                    {
++++                        index = inputPtr - basePtr + i;
++++                        max = currentValuesBuffer[i];
++++                    }
++++                }
++++                maxValues = _mm_set1_epi8(max);
++++            }
++++
++++            inputPtr += 32;
++++        }
++++
++++        // Remaining points: continue after the last full 32-byte block
++++        // (the previous code rescanned the first elements of src0 instead)
++++        for(unsigned int i = sse_iters * 32; i < num_points; ++i)
++++        {
++++            if(src0[i] > max)
++++            {
++++                index = i;
++++                max = src0[i];
++++            }
++++        }
++++        target[0] = index;
++++    }
++++}
++++
++++#endif /*LV_HAVE_AVX*/
++++
++++#ifdef LV_HAVE_SSE4_1
++++#include "smmintrin.h"
++++#include "emmintrin.h"
++++/*!
++++ \brief Returns the index of the max value in src0
++++ \param target The index of the max value in src0
++++ \param src0 The buffer of data to be analysed
++++ \param num_points The number of values in src0 to be analysed
++++
++++ Aligned SSE4.1 variant: src0 is read with _mm_load_si128. When several
++++ elements share the maximum, the lowest index is reported. target is left
++++ untouched when num_points is 0.
++++ */
++++static inline void volk_gnsssdr_8i_index_max_16u_a_sse4_1(unsigned int* target, const char* src0, unsigned int num_points) {
++++    if(num_points > 0){
++++        const unsigned int sse_iters = num_points / 16;
++++
++++        char* basePtr = (char*)src0;
++++        char* inputPtr = (char*)src0;
++++        char max = src0[0];
++++        unsigned int index = 0;
++++        __VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16];
++++        __m128i maxValues, compareResults, currentValues;
++++
++++        maxValues = _mm_set1_epi8(max);
++++
++++        for(unsigned int number = 0; number < sse_iters; number++)
++++        {
++++            currentValues = _mm_load_si128((__m128i*)inputPtr);
++++
++++            // A lane is all ones where the running max is strictly greater
++++            compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
++++
++++            // Scan the block only if some lane is not below the running max
++++            if (!_mm_test_all_ones(compareResults))
++++            {
++++                _mm_store_si128((__m128i*)currentValuesBuffer, currentValues);
++++
++++                for(int i = 0; i < 16; i++)
++++                {
++++                    if(currentValuesBuffer[i] > max)
++++                    {
++++                        index = inputPtr - basePtr + i;
++++                        max = currentValuesBuffer[i];
++++                    }
++++                }
++++                maxValues = _mm_set1_epi8(max);
++++            }
++++
++++            inputPtr += 16;
++++        }
++++
++++        // Remaining points: continue after the last full 16-byte block
++++        // (the previous code rescanned the first elements of src0 instead)
++++        for(unsigned int i = sse_iters * 16; i < num_points; ++i)
++++        {
++++            if(src0[i] > max)
++++            {
++++                index = i;
++++                max = src0[i];
++++            }
++++        }
++++        target[0] = index;
++++    }
++++}
++++
++++#endif /*LV_HAVE_SSE4_1*/
++++
++++#ifdef LV_HAVE_SSE2
++++#include "emmintrin.h"
++++/*!
++++ \brief Returns the index of the max value in src0
++++ \param target The index of the max value in src0
++++ \param src0 The buffer of data to be analysed
++++ \param num_points The number of values in src0 to be analysed
++++
++++ Aligned SSE2 variant: src0 is read with _mm_load_si128. When several
++++ elements share the maximum, the lowest index is reported. target is left
++++ untouched when num_points is 0.
++++ */
++++static inline void volk_gnsssdr_8i_index_max_16u_a_sse2(unsigned int* target, const char* src0, unsigned int num_points) {
++++    if(num_points > 0){
++++        const unsigned int sse_iters = num_points / 16;
++++
++++        char* basePtr = (char*)src0;
++++        char* inputPtr = (char*)src0;
++++        char max = src0[0];
++++        unsigned int index = 0;
++++        unsigned short mask;
++++        __VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16];
++++        __m128i maxValues, compareResults, currentValues;
++++
++++        maxValues = _mm_set1_epi8(max);
++++
++++        for(unsigned int number = 0; number < sse_iters; number++)
++++        {
++++            currentValues = _mm_load_si128((__m128i*)inputPtr);
++++            // One mask bit per lane, set where the running max is strictly greater
++++            compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
++++            mask = _mm_movemask_epi8(compareResults);
++++
++++            if (mask != 0xFFFF)
++++            {
++++                _mm_store_si128((__m128i*)currentValuesBuffer, currentValues);
++++                // After inversion the set bits mark the candidate lanes
++++                mask = ~mask;
++++                int i = 0;
++++                while (mask > 0)
++++                {
++++                    if ((mask & 1) == 1)
++++                    {
++++                        if(currentValuesBuffer[i] > max)
++++                        {
++++                            index = inputPtr - basePtr + i;
++++                            max = currentValuesBuffer[i];
++++                        }
++++                    }
++++                    i++;
++++                    mask >>= 1;
++++                }
++++                maxValues = _mm_set1_epi8(max);
++++            }
++++            inputPtr += 16;
++++        }
++++
++++        // Remaining points: continue after the last full 16-byte block
++++        // (the previous code rescanned the first elements of src0 instead)
++++        for(unsigned int i = sse_iters * 16; i < num_points; ++i)
++++        {
++++            if(src0[i] > max)
++++            {
++++                index = i;
++++                max = src0[i];
++++            }
++++        }
++++        target[0] = index;
++++    }
++++}
++++
++++#endif /*LV_HAVE_SSE2*/
++++
++++#ifdef LV_HAVE_GENERIC
++++/*!
++++ \brief Returns the index of the max value in src0
++++ \param target The index of the max value in src0
++++ \param src0 The buffer of data to be analysed
++++ \param num_points The number of values in src0 to be analysed
++++
++++ Plain C variant. When several elements share the maximum, the lowest
++++ index is reported. target is left untouched when num_points is 0.
++++ */
++++static inline void volk_gnsssdr_8i_index_max_16u_a_generic(unsigned int* target, const char* src0, unsigned int num_points) {
++++
++++    // Nothing to analyse: leave target as it is
++++    if(num_points == 0)
++++    {
++++        return;
++++    }
++++
++++    unsigned int bestIndex = 0;
++++    char bestValue = src0[0];
++++    unsigned int i;
++++
++++    for(i = 1; i < num_points; ++i)
++++    {
++++        if(src0[i] > bestValue)
++++        {
++++            bestValue = src0[i];
++++            bestIndex = i;
++++        }
++++    }
++++    target[0] = bestIndex;
++++}
++++
++++#endif /*LV_HAVE_GENERIC*/
++++
++++#endif /*INCLUDED_volk_gnsssdr_8i_index_max_16u_a_H*/
+++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_max_s8i.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8i_max_s8i.h
+++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_max_s8i.h 1970-01-01 01:00:00.000000000 +0100
++++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8i_max_s8i.h 2014-10-15 01:55:08.000000000 +0200
+++@@ -0,0 +1,327 @@
++++/*!
++++ * \file volk_gnsssdr_8i_max_s8i.h
++++ * \brief Volk protokernel: calculates the maximum value in a group of 8 bits (char) scalars
++++ * \authors
++++ *          <ul> <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
++++ *          </ul>
++++ * ++++ * Volk protokernel that returns the maximum value of a group of 8 bits (char) scalars ++++ * ++++ * ------------------------------------------------------------------------- ++++ * ++++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) ++++ * ++++ * GNSS-SDR is a software defined Global Navigation ++++ * Satellite Systems receiver ++++ * ++++ * This file is part of GNSS-SDR. ++++ * ++++ * GNSS-SDR is free software: you can redistribute it and/or modify ++++ * it under the terms of the GNU General Public License as published by ++++ * the Free Software Foundation, either version 3 of the License, or ++++ * at your option) any later version. ++++ * ++++ * GNSS-SDR is distributed in the hope that it will be useful, ++++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++++ * GNU General Public License for more details. ++++ * ++++ * You should have received a copy of the GNU General Public License ++++ * along with GNSS-SDR. If not, see . ++++ * ++++ * ------------------------------------------------------------------------- ++++ */ ++++ ++++#ifndef INCLUDED_volk_gnsssdr_8i_max_s8i_u_H ++++#define INCLUDED_volk_gnsssdr_8i_max_s8i_u_H ++++ ++++#include ++++#include ++++#include ++++ ++++#ifdef LV_HAVE_SSE4_1 ++++#include ++++/*! 
++++ \brief Returns the max value in src0
++++ \param target The max value in src0
++++ \param src0 The buffer of data to be analysed
++++ \param num_points The number of values in src0 to be analysed
++++
++++ Unaligned SSE4.1 variant.
++++ NOTE(review): target is passed by value, so the maximum computed here is
++++ never visible to the caller. The signature is kept unchanged for
++++ compatibility; it most likely should be a char* output parameter.
++++ */
++++static inline void volk_gnsssdr_8i_max_s8i_u_sse4_1(char target, const char* src0, unsigned int num_points) {
++++    if(num_points > 0){
++++        const unsigned int sse_iters = num_points / 16;
++++
++++        char* inputPtr = (char*)src0;
++++        char max = src0[0];
++++        __VOLK_ATTR_ALIGNED(16) char maxValuesBuffer[16];
++++        __m128i maxValues, compareResults, currentValues;
++++
++++        maxValues = _mm_set1_epi8(max);
++++
++++        // Keep a running per-lane maximum over all full 16-byte blocks
++++        for(unsigned int number = 0; number < sse_iters; number++)
++++        {
++++            currentValues = _mm_loadu_si128((__m128i*)inputPtr);
++++            compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
++++            // Take maxValues where it is greater, currentValues elsewhere
++++            maxValues = _mm_blendv_epi8(currentValues, maxValues, compareResults);
++++            inputPtr += 16;
++++        }
++++
++++        _mm_storeu_si128((__m128i*)maxValuesBuffer, maxValues);
++++
++++        // Reduce the 16 per-lane maxima to a single value
++++        for(int i = 0; i<16; ++i)
++++        {
++++            if(maxValuesBuffer[i] > max)
++++            {
++++                max = maxValuesBuffer[i];
++++            }
++++        }
++++
++++        // Remaining points: continue after the last full 16-byte block
++++        // (the previous code rescanned the first elements of src0 instead)
++++        for(unsigned int i = sse_iters * 16; i < num_points; ++i)
++++        {
++++            if(src0[i] > max)
++++            {
++++                max = src0[i];
++++            }
++++        }
++++        target = max;
++++    }
++++}
++++
++++#endif /*LV_HAVE_SSE4_1*/
++++
++++#ifdef LV_HAVE_SSE2
++++#include <emmintrin.h>
++++/*!
++++ \brief Returns the max value in src0
++++ \param target The max value in src0
++++ \param src0 The buffer of data to be analysed
++++ \param num_points The number of values in src0 to be analysed
++++
++++ Unaligned SSE2 variant.
++++ NOTE(review): target is passed by value, so the maximum computed here is
++++ never visible to the caller. The signature is kept unchanged for
++++ compatibility; it most likely should be a char* output parameter.
++++ */
++++static inline void volk_gnsssdr_8i_max_s8i_u_sse2(char target, const char* src0, unsigned int num_points) {
++++    if(num_points > 0){
++++        const unsigned int sse_iters = num_points / 16;
++++
++++        char* inputPtr = (char*)src0;
++++        char max = src0[0];
++++        unsigned short mask;
++++        __VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16];
++++        __m128i maxValues, compareResults, currentValues;
++++
++++        maxValues = _mm_set1_epi8(max);
++++
++++        for(unsigned int number = 0; number < sse_iters; number++)
++++        {
++++            currentValues = _mm_loadu_si128((__m128i*)inputPtr);
++++            // One mask bit per lane, set where the running max is strictly greater
++++            compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
++++            mask = _mm_movemask_epi8(compareResults);
++++
++++            if (mask != 0xFFFF)
++++            {
++++                _mm_storeu_si128((__m128i*)currentValuesBuffer, currentValues);
++++                // After inversion the set bits mark the candidate lanes
++++                mask = ~mask;
++++                int i = 0;
++++                while (mask > 0)
++++                {
++++                    if ((mask & 1) == 1)
++++                    {
++++                        if(currentValuesBuffer[i] > max)
++++                        {
++++                            max = currentValuesBuffer[i];
++++                        }
++++                    }
++++                    i++;
++++                    mask >>= 1;
++++                }
++++                maxValues = _mm_set1_epi8(max);
++++            }
++++            inputPtr += 16;
++++        }
++++
++++        // Remaining points: continue after the last full 16-byte block
++++        // (the previous code rescanned the first elements of src0 instead)
++++        for(unsigned int i = sse_iters * 16; i < num_points; ++i)
++++        {
++++            if(src0[i] > max)
++++            {
++++                max = src0[i];
++++            }
++++        }
++++        target = max;
++++    }
++++}
++++
++++#endif /*LV_HAVE_SSE2*/
++++
++++#ifdef LV_HAVE_GENERIC
++++/*!
++++ \brief Returns the max value in src0
++++ \param target The max value in src0
++++ \param src0 The buffer of data to be analysed
++++ \param num_points The number of values in src0 to be analysed
++++
++++ Plain C variant.
++++ NOTE(review): target is passed by value, so the maximum computed here is
++++ never visible to the caller. The signature is kept unchanged for
++++ compatibility; it most likely should be a char* output parameter.
++++ */
++++static inline void volk_gnsssdr_8i_max_s8i_generic(char target, const char* src0, unsigned int num_points) {
++++    if(num_points > 0)
++++    {
++++        char max = src0[0];
++++
++++        for(unsigned int i = 1; i < num_points; ++i)
++++        {
++++            if(src0[i] > max)
++++            {
++++                max = src0[i];
++++            }
++++        }
++++        target = max;
++++    }
++++}
++++
++++#endif /*LV_HAVE_GENERIC*/
++++
++++#endif /*INCLUDED_volk_gnsssdr_8i_max_s8i_u_H*/
++++
++++
++++#ifndef INCLUDED_volk_gnsssdr_8i_max_s8i_a_H
++++#define INCLUDED_volk_gnsssdr_8i_max_s8i_a_H
++++
++++/* NOTE(review): the header names of the three #include lines below are missing in this copy of the patch - restore them from the original file. */
++++#include
++++#include
++++#include
++++
++++#ifdef LV_HAVE_SSE4_1
++++#include "smmintrin.h"
++++/*!
++++ \brief Returns the max value in src0
++++ \param target The max value in src0
++++ \param src0 The buffer of data to be analysed
++++ \param num_points The number of values in src0 to be analysed
++++
++++ Aligned SSE4.1 variant: src0 is read with _mm_load_si128.
++++ NOTE(review): target is passed by value, so the maximum computed here is
++++ never visible to the caller. The signature is kept unchanged for
++++ compatibility; it most likely should be a char* output parameter.
++++ */
++++static inline void volk_gnsssdr_8i_max_s8i_a_sse4_1(char target, const char* src0, unsigned int num_points) {
++++    if(num_points > 0){
++++        const unsigned int sse_iters = num_points / 16;
++++
++++        char* inputPtr = (char*)src0;
++++        char max = src0[0];
++++        __VOLK_ATTR_ALIGNED(16) char maxValuesBuffer[16];
++++        __m128i maxValues, compareResults, currentValues;
++++
++++        maxValues = _mm_set1_epi8(max);
++++
++++        // Keep a running per-lane maximum over all full 16-byte blocks
++++        for(unsigned int number = 0; number < sse_iters; number++)
++++        {
++++            currentValues = _mm_load_si128((__m128i*)inputPtr);
++++            compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
++++            // Take maxValues where it is greater, currentValues elsewhere
++++            maxValues = _mm_blendv_epi8(currentValues, maxValues, compareResults);
++++            inputPtr += 16;
++++        }
++++
++++        _mm_store_si128((__m128i*)maxValuesBuffer, maxValues);
++++
++++        // Reduce the 16 per-lane maxima to a single value
++++        for(int i = 0; i<16; ++i)
++++        {
++++            if(maxValuesBuffer[i] > max)
++++            {
++++                max = maxValuesBuffer[i];
++++            }
++++        }
++++
++++        // Remaining points: continue after the last full 16-byte block
++++        // (the previous code rescanned the first elements of src0 instead)
++++        for(unsigned int i = sse_iters * 16; i < num_points; ++i)
++++        {
++++            if(src0[i] > max)
++++            {
++++                max = src0[i];
++++            }
++++        }
++++        target = max;
++++    }
++++}
++++
++++#endif /*LV_HAVE_SSE4_1*/
++++
++++#ifdef LV_HAVE_SSE2
++++#include "emmintrin.h"
++++/*!
++++ \brief Returns the max value in src0
++++ \param target The max value in src0
++++ \param src0 The buffer of data to be analysed
++++ \param num_points The number of values in src0 to be analysed
++++
++++ Aligned SSE2 variant: src0 is read with _mm_load_si128.
++++ NOTE(review): target is passed by value, so the maximum computed here is
++++ never visible to the caller. The signature is kept unchanged for
++++ compatibility; it most likely should be a char* output parameter.
++++ */
++++static inline void volk_gnsssdr_8i_max_s8i_a_sse2(char target, const char* src0, unsigned int num_points) {
++++    if(num_points > 0){
++++        const unsigned int sse_iters = num_points / 16;
++++
++++        char* inputPtr = (char*)src0;
++++        char max = src0[0];
++++        unsigned short mask;
++++        __VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16];
++++        __m128i maxValues, compareResults, currentValues;
++++
++++        maxValues = _mm_set1_epi8(max);
++++
++++        for(unsigned int number = 0; number < sse_iters; number++)
++++        {
++++            currentValues = _mm_load_si128((__m128i*)inputPtr);
++++            // One mask bit per lane, set where the running max is strictly greater
++++            compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
++++            mask = _mm_movemask_epi8(compareResults);
++++
++++            if (mask != 0xFFFF)
++++            {
++++                _mm_store_si128((__m128i*)currentValuesBuffer, currentValues);
++++                // After inversion the set bits mark the candidate lanes
++++                mask = ~mask;
++++                int i = 0;
++++                while (mask > 0)
++++                {
++++                    if ((mask & 1) == 1)
++++                    {
++++                        if(currentValuesBuffer[i] > max)
++++                        {
++++                            max = currentValuesBuffer[i];
++++                        }
++++                    }
++++                    i++;
++++                    mask >>= 1;
++++                }
++++                maxValues = _mm_set1_epi8(max);
++++            }
++++            inputPtr += 16;
++++        }
++++
++++        // Remaining points: continue after the last full 16-byte block
++++        // (the previous code rescanned the first elements of src0 instead)
++++        for(unsigned int i = sse_iters * 16; i < num_points; ++i)
++++        {
++++            if(src0[i] > max)
++++            {
++++                max = src0[i];
++++            }
++++        }
++++        target = max;
++++    }
++++}
++++
++++#endif /*LV_HAVE_SSE2*/
++++
++++#ifdef LV_HAVE_GENERIC
++++/*!
++++ \brief Returns the max value in src0
++++ \param target The max value in src0
++++ \param src0 The buffer of data to be analysed
++++ \param num_points The number of values in src0 to be analysed
++++
++++ Plain C variant.
++++ NOTE(review): target is passed by value, so the maximum computed here is
++++ never visible to the caller - it most likely should be a char* output
++++ parameter.
++++ */
++++static inline void volk_gnsssdr_8i_max_s8i_a_generic(char target, const char* src0, unsigned int num_points) {
++++    // Nothing to analyse
++++    if(num_points == 0)
++++    {
++++        return;
++++    }
++++
++++    char largest = src0[0];
++++    unsigned int i;
++++
++++    for(i = 1; i < num_points; ++i)
++++    {
++++        if(src0[i] > largest)
++++        {
++++            largest = src0[i];
++++        }
++++    }
++++    target = largest;
++++}
++++
++++#endif /*LV_HAVE_GENERIC*/
++++
++++#endif /*INCLUDED_volk_gnsssdr_8i_max_s8i_a_H*/
+++\ No newline at end of file
+++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_x2_add_8i.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8i_x2_add_8i.h
+++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_x2_add_8i.h 1970-01-01 01:00:00.000000000 +0100
++++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8i_x2_add_8i.h 2014-10-15 01:55:08.000000000 +0200
+++@@ -0,0 +1,184 @@
++++/*!
++++ * \file volk_gnsssdr_8i_x2_add_8i.h
++++ * \brief Volk protokernel: adds pairs of 8 bits (char) scalars
++++ * \authors
++++ *          <ul> <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
++++ *          </ul>
++++ * ++++ * Volk protokernel that adds pairs of 8 bits (char) scalars ++++ * ++++ * ------------------------------------------------------------------------- ++++ * ++++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) ++++ * ++++ * GNSS-SDR is a software defined Global Navigation ++++ * Satellite Systems receiver ++++ * ++++ * This file is part of GNSS-SDR. ++++ * ++++ * GNSS-SDR is free software: you can redistribute it and/or modify ++++ * it under the terms of the GNU General Public License as published by ++++ * the Free Software Foundation, either version 3 of the License, or ++++ * at your option) any later version. ++++ * ++++ * GNSS-SDR is distributed in the hope that it will be useful, ++++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++++ * GNU General Public License for more details. ++++ * ++++ * You should have received a copy of the GNU General Public License ++++ * along with GNSS-SDR. If not, see . ++++ * ++++ * ------------------------------------------------------------------------- ++++ */ ++++ ++++#ifndef INCLUDED_volk_gnsssdr_8i_x2_add_8i_u_H ++++#define INCLUDED_volk_gnsssdr_8i_x2_add_8i_u_H ++++ ++++#include ++++#include ++++ ++++#ifdef LV_HAVE_SSE2 ++++#include "pmmintrin.h" ++++/*! 
++++ \brief Adds the two input vectors and store their results in the third vector
++++ \param cVector The vector where the results will be stored
++++ \param aVector One of the vectors to be added
++++ \param bVector One of the vectors to be added
++++ \param num_points The number of values in aVector and bVector to be added together and stored into cVector
++++
++++ Unaligned SSE2 variant. Each sum is truncated to 8 bits.
++++ */
++++static inline void volk_gnsssdr_8i_x2_add_8i_u_sse2(char* cVector, const char* aVector, const char* bVector, unsigned int num_points){
++++
++++    const unsigned int sse_iters = num_points / 16;
++++
++++    char* cPtr = cVector;
++++    const char* aPtr = aVector;
++++    const char* bPtr= bVector;
++++
++++    __m128i aVal, bVal, cVal;
++++
++++    for(unsigned int number = 0; number < sse_iters; number++){
++++
++++        // _mm_loadu_si128 is the SSE2 unaligned load; the previously used
++++        // _mm_lddqu_si128 is an SSE3 intrinsic and is not covered by LV_HAVE_SSE2
++++        aVal = _mm_loadu_si128((__m128i*)aPtr);
++++        bVal = _mm_loadu_si128((__m128i*)bPtr);
++++
++++        cVal = _mm_add_epi8(aVal, bVal);
++++
++++        _mm_storeu_si128((__m128i*)cPtr,cVal); // Store the results back into the C container
++++
++++        aPtr += 16;
++++        bPtr += 16;
++++        cPtr += 16;
++++    }
++++
++++    // Remaining points that do not fill a whole 16-byte block
++++    for(unsigned int i = 0; i<(num_points % 16); ++i)
++++    {
++++        *cPtr++ = (*aPtr++) + (*bPtr++);
++++    }
++++}
++++#endif /* LV_HAVE_SSE2 */
++++
++++#ifdef LV_HAVE_GENERIC
++++/*!
++++ \brief Adds the two input vectors and store their results in the third vector
++++ \param cVector The vector where the results will be stored
++++ \param aVector One of the vectors to be added
++++ \param bVector One of the vectors to be added
++++ \param num_points The number of values in aVector and bVector to be added together and stored into cVector
++++
++++ Plain C variant. Each sum is truncated to 8 bits.
++++ */
++++static inline void volk_gnsssdr_8i_x2_add_8i_generic(char* cVector, const char* aVector, const char* bVector, unsigned int num_points){
++++    unsigned int i;
++++
++++    // Element-wise sum, one point per iteration
++++    for(i = 0; i < num_points; ++i){
++++        cVector[i] = aVector[i] + bVector[i];
++++    }
++++}
++++#endif /* LV_HAVE_GENERIC */
++++
++++#endif /* INCLUDED_volk_gnsssdr_8i_x2_add_8i_u_H */
++++
++++
++++#ifndef INCLUDED_volk_gnsssdr_8i_x2_add_8i_a_H
++++#define INCLUDED_volk_gnsssdr_8i_x2_add_8i_a_H
++++
++++/* NOTE(review): the header names of the two #include lines below are missing in this copy of the patch - restore them from the original file. */
++++#include
++++#include
++++
++++#ifdef LV_HAVE_SSE2
++++#include "pmmintrin.h"
++++/*!
++++ \brief Adds the two input vectors and store their results in the third vector
++++ \param cVector The vector where the results will be stored
++++ \param aVector One of the vectors to be added
++++ \param bVector One of the vectors to be added
++++ \param num_points The number of values in aVector and bVector to be added together and stored into cVector
++++
++++ Aligned SSE2 variant: the three vectors are accessed with
++++ _mm_load_si128 / _mm_store_si128. Each sum is truncated to 8 bits.
++++ */
++++static inline void volk_gnsssdr_8i_x2_add_8i_a_sse2(char* cVector, const char* aVector, const char* bVector, unsigned int num_points){
++++
++++    const unsigned int fullBlocks = num_points / 16;
++++    const unsigned int tailPoints = num_points % 16;
++++
++++    const char* aPtr = aVector;
++++    const char* bPtr = bVector;
++++    char* cPtr = cVector;
++++    unsigned int block;
++++    unsigned int k;
++++
++++    // 16 sums per iteration
++++    for(block = 0; block < fullBlocks; ++block){
++++        const __m128i aVal = _mm_load_si128((__m128i*)aPtr);
++++        const __m128i bVal = _mm_load_si128((__m128i*)bPtr);
++++
++++        // Store the results back into the C container
++++        _mm_store_si128((__m128i*)cPtr, _mm_add_epi8(aVal, bVal));
++++
++++        aPtr += 16;
++++        bPtr += 16;
++++        cPtr += 16;
++++    }
++++
++++    // Remaining points that do not fill a whole 16-byte block
++++    for(k = 0; k < tailPoints; ++k)
++++    {
++++        *cPtr++ = (*aPtr++) + (*bPtr++);
++++    }
++++}
++++#endif /* LV_HAVE_SSE2 */
++++
++++#ifdef LV_HAVE_GENERIC
++++/*!
++++ \brief Adds the two input vectors and store their results in the third vector
++++ \param cVector The vector where the results will be stored
++++ \param aVector One of the vectors to be added
++++ \param bVector One of the vectors to be added
++++ \param num_points The number of values in aVector and bVector to be added together and stored into cVector
++++
++++ Plain C variant. Each sum is truncated to 8 bits.
++++ */
++++static inline void volk_gnsssdr_8i_x2_add_8i_a_generic(char* cVector, const char* aVector, const char* bVector, unsigned int num_points){
++++    unsigned int i;
++++
++++    // Element-wise sum, one point per iteration
++++    for(i = 0; i < num_points; ++i){
++++        cVector[i] = aVector[i] + bVector[i];
++++    }
++++}
++++#endif /* LV_HAVE_GENERIC */
++++
++++#ifdef LV_HAVE_ORC
++++/*!
++++ \brief Adds the two input vectors and store their results in the third vector
++++ \param cVector The vector where the results will be stored
++++ \param aVector One of the vectors to be added
++++ \param bVector One of the vectors to be added
++++ \param num_points The number of values in aVector and bVector to be added together and stored into cVector
++++
++++ Forwards all arguments unchanged to the ORC implementation.
++++ */
++++extern void volk_gnsssdr_8i_x2_add_8i_a_orc_impl(char* cVector, const char* aVector, const char* bVector, unsigned int num_points);
++++static inline void volk_gnsssdr_8i_x2_add_8i_u_orc(char* cVector, const char* aVector, const char* bVector, unsigned int num_points){
++++    volk_gnsssdr_8i_x2_add_8i_a_orc_impl(cVector, aVector, bVector, num_points);
++++}
++++#endif /* LV_HAVE_ORC */
++++
++++#endif /* INCLUDED_volk_gnsssdr_8i_x2_add_8i_a_H */
+++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_conjugate_8ic.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_conjugate_8ic.h
+++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_conjugate_8ic.h 1970-01-01 01:00:00.000000000 +0100
++++++
/Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_conjugate_8ic.h 2014-10-15 01:55:08.000000000 +0200 +++@@ -0,0 +1,326 @@ ++++/*! ++++ * \file volk_gnsssdr_8ic_conjugate_8ic.h ++++ * \brief Volk protokernel: calculates the conjugate of a 16 bits vector ++++ * \authors
++++ *          <ul> <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
++++ *          </ul>
++++ * ++++ * Volk protokernel that calculates the conjugate of a ++++ * 16 bits vector (8 bits the real part and 8 bits the imaginary part) ++++ * ++++ * ------------------------------------------------------------------------- ++++ * ++++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) ++++ * ++++ * GNSS-SDR is a software defined Global Navigation ++++ * Satellite Systems receiver ++++ * ++++ * This file is part of GNSS-SDR. ++++ * ++++ * GNSS-SDR is free software: you can redistribute it and/or modify ++++ * it under the terms of the GNU General Public License as published by ++++ * the Free Software Foundation, either version 3 of the License, or ++++ * at your option) any later version. ++++ * ++++ * GNSS-SDR is distributed in the hope that it will be useful, ++++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++++ * GNU General Public License for more details. ++++ * ++++ * You should have received a copy of the GNU General Public License ++++ * along with GNSS-SDR. If not, see . ++++ * ++++ * ------------------------------------------------------------------------- ++++ */ ++++ ++++#ifndef INCLUDED_volk_gnsssdr_8ic_conjugate_8ic_u_H ++++#define INCLUDED_volk_gnsssdr_8ic_conjugate_8ic_u_H ++++ ++++#include ++++#include ++++#include ++++ ++++#ifdef LV_HAVE_AVX ++++#include "immintrin.h" ++++/*! ++++ \brief Takes the conjugate of an unsigned char vector. 
++++ \param cVector The vector where the results will be stored ++++ \param aVector Vector to be conjugated ++++ \param num_points The number of unsigned char values in aVector to be conjugated and stored into cVector ++++ */ ++++static inline void volk_gnsssdr_8ic_conjugate_8ic_u_avx(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){ ++++ const unsigned int sse_iters = num_points / 16; ++++ ++++ lv_8sc_t* c = cVector; ++++ const lv_8sc_t* a = aVector; ++++ ++++ __m256 tmp; ++++ __m128i tmp128lo, tmp128hi; ++++ __m256 conjugator1 = _mm256_castsi256_ps(_mm256_setr_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255)); ++++ __m128i conjugator2 = _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1); ++++ ++++ for (int i = 0; i < sse_iters; ++i) ++++ { ++++ tmp = _mm256_loadu_ps((float*)a); ++++ tmp = _mm256_xor_ps(tmp, conjugator1); ++++ tmp128lo = _mm256_castsi256_si128(_mm256_castps_si256(tmp)); ++++ tmp128lo = _mm_add_epi8(tmp128lo, conjugator2); ++++ tmp128hi = _mm256_extractf128_si256(_mm256_castps_si256(tmp),1); ++++ tmp128hi = _mm_add_epi8(tmp128hi, conjugator2); ++++ //tmp = _mm256_set_m128i(tmp128hi , tmp128lo); //not defined in some versions of immintrin.h ++++ tmp = _mm256_insertf128_si256(_mm256_castsi128_si256(tmp128lo),(tmp128hi),1); ++++ _mm256_storeu_ps((float*)c, tmp); ++++ ++++ a += 16; ++++ c += 16; ++++ } ++++ ++++ for (int i = 0; i<(num_points % 16); ++i) ++++ { ++++ *c++ = lv_conj(*a++); ++++ } ++++} ++++#endif /* LV_HAVE_AVX */ ++++ ++++#ifdef LV_HAVE_SSSE3 ++++#include "tmmintrin.h" ++++/*! ++++ \brief Takes the conjugate of an unsigned char vector. 
++++ \param cVector The vector where the results will be stored ++++ \param aVector Vector to be conjugated ++++ \param num_points The number of unsigned char values in aVector to be conjugated and stored into cVector ++++ */ ++++static inline void volk_gnsssdr_8ic_conjugate_8ic_u_ssse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){ ++++ const unsigned int sse_iters = num_points / 8; ++++ ++++ lv_8sc_t* c = cVector; ++++ const lv_8sc_t* a = aVector; ++++ __m128i tmp; ++++ ++++ __m128i conjugator = _mm_setr_epi8(1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1); ++++ ++++ for (int i = 0; i < sse_iters; ++i) ++++ { ++++ tmp = _mm_lddqu_si128((__m128i*)a); ++++ tmp = _mm_sign_epi8(tmp, conjugator); ++++ _mm_storeu_si128((__m128i*)c, tmp); ++++ a += 8; ++++ c += 8; ++++ } ++++ ++++ for (int i = 0; i<(num_points % 8); ++i) ++++ { ++++ *c++ = lv_conj(*a++); ++++ } ++++ ++++} ++++#endif /* LV_HAVE_SSSE3 */ ++++ ++++#ifdef LV_HAVE_SSE3 ++++#include ++++/*! ++++ \brief Takes the conjugate of an unsigned char vector. 
++++ \param cVector The vector where the results will be stored ++++ \param aVector Vector to be conjugated ++++ \param num_points The number of unsigned char values in aVector to be conjugated and stored into cVector ++++ */ ++++static inline void volk_gnsssdr_8ic_conjugate_8ic_u_sse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){ ++++ const unsigned int sse_iters = num_points / 8; ++++ ++++ lv_8sc_t* c = cVector; ++++ const lv_8sc_t* a = aVector; ++++ __m128i tmp; ++++ ++++ __m128i conjugator1 = _mm_setr_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); ++++ __m128i conjugator2 = _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1); ++++ ++++ for (int i = 0; i < sse_iters; ++i) ++++ { ++++ tmp = _mm_lddqu_si128((__m128i*)a); ++++ tmp = _mm_xor_si128(tmp, conjugator1); ++++ tmp = _mm_add_epi8(tmp, conjugator2); ++++ _mm_storeu_si128((__m128i*)c, tmp); ++++ a += 8; ++++ c += 8; ++++ } ++++ ++++ for (int i = 0; i<(num_points % 8); ++i) ++++ { ++++ *c++ = lv_conj(*a++); ++++ } ++++ ++++} ++++#endif /* LV_HAVE_SSE3 */ ++++ ++++#ifdef LV_HAVE_GENERIC ++++/*! ++++ \brief Takes the conjugate of an unsigned char vector. 
++++ \param cVector The vector where the results will be stored ++++ \param aVector Vector to be conjugated ++++ \param num_points The number of unsigned char values in aVector to be conjugated and stored into cVector ++++ */ ++++static inline void volk_gnsssdr_8ic_conjugate_8ic_generic(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){ ++++ lv_8sc_t* cPtr = cVector; ++++ const lv_8sc_t* aPtr = aVector; ++++ unsigned int number = 0; ++++ ++++ for(number = 0; number < num_points; number++){ ++++ *cPtr++ = lv_conj(*aPtr++); ++++ } ++++} ++++#endif /* LV_HAVE_GENERIC */ ++++ ++++#endif /* INCLUDED_volk_gnsssdr_8ic_conjugate_8ic_u_H */ ++++ ++++ ++++#ifndef INCLUDED_volk_gnsssdr_8ic_conjugate_8ic_a_H ++++#define INCLUDED_volk_gnsssdr_8ic_conjugate_8ic_a_H ++++ ++++#include ++++#include ++++#include ++++ ++++#ifdef LV_HAVE_AVX ++++#include "immintrin.h" ++++/*! ++++ \brief Takes the conjugate of an unsigned char vector. ++++ \param cVector The vector where the results will be stored ++++ \param aVector Vector to be conjugated ++++ \param num_points The number of unsigned char values in aVector to be conjugated and stored into cVector ++++ */ ++++static inline void volk_gnsssdr_8ic_conjugate_8ic_a_avx(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){ ++++ const unsigned int sse_iters = num_points / 16; ++++ ++++ lv_8sc_t* c = cVector; ++++ const lv_8sc_t* a = aVector; ++++ ++++ __m256 tmp; ++++ __m128i tmp128lo, tmp128hi; ++++ __m256 conjugator1 = _mm256_castsi256_ps(_mm256_setr_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255)); ++++ __m128i conjugator2 = _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1); ++++ ++++ for (int i = 0; i < sse_iters; ++i) ++++ { ++++ tmp = _mm256_load_ps((float*)a); ++++ tmp = _mm256_xor_ps(tmp, conjugator1); ++++ tmp128lo = _mm256_castsi256_si128(_mm256_castps_si256(tmp)); ++++ tmp128lo = 
_mm_add_epi8(tmp128lo, conjugator2); ++++ tmp128hi = _mm256_extractf128_si256(_mm256_castps_si256(tmp),1); ++++ tmp128hi = _mm_add_epi8(tmp128hi, conjugator2); ++++ //tmp = _mm256_set_m128i(tmp128hi , tmp128lo); //not defined in some versions of immintrin.h ++++ tmp = _mm256_insertf128_si256(_mm256_castsi128_si256(tmp128lo),(tmp128hi),1); ++++ _mm256_store_ps((float*)c, tmp); ++++ ++++ a += 16; ++++ c += 16; ++++ } ++++ ++++ for (int i = 0; i<(num_points % 16); ++i) ++++ { ++++ *c++ = lv_conj(*a++); ++++ } ++++} ++++#endif /* LV_HAVE_AVX */ ++++ ++++#ifdef LV_HAVE_SSSE3 ++++#include "tmmintrin.h" ++++/*! ++++ \brief Takes the conjugate of an unsigned char vector. ++++ \param cVector The vector where the results will be stored ++++ \param aVector Vector to be conjugated ++++ \param num_points The number of unsigned char values in aVector to be conjugated and stored into cVector ++++ */ ++++static inline void volk_gnsssdr_8ic_conjugate_8ic_a_ssse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){ ++++ const unsigned int sse_iters = num_points / 8; ++++ ++++ lv_8sc_t* c = cVector; ++++ const lv_8sc_t* a = aVector; ++++ __m128i tmp; ++++ ++++ __m128i conjugator = _mm_setr_epi8(1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1); ++++ ++++ for (int i = 0; i < sse_iters; ++i) ++++ { ++++ tmp = _mm_load_si128((__m128i*)a); ++++ tmp = _mm_sign_epi8(tmp, conjugator); ++++ _mm_store_si128((__m128i*)c, tmp); ++++ a += 8; ++++ c += 8; ++++ } ++++ ++++ for (int i = 0; i<(num_points % 8); ++i) ++++ { ++++ *c++ = lv_conj(*a++); ++++ } ++++ ++++} ++++#endif /* LV_HAVE_SSSE3 */ ++++ ++++#ifdef LV_HAVE_SSE3 ++++#include ++++/*! ++++ \brief Takes the conjugate of an unsigned char vector. 
++++ \param cVector The vector where the results will be stored ++++ \param aVector Vector to be conjugated ++++ \param num_points The number of unsigned char values in aVector to be conjugated and stored into cVector ++++ */ ++++static inline void volk_gnsssdr_8ic_conjugate_8ic_a_sse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){ ++++ const unsigned int sse_iters = num_points / 8; ++++ ++++ lv_8sc_t* c = cVector; ++++ const lv_8sc_t* a = aVector; ++++ __m128i tmp; ++++ ++++ __m128i conjugator1 = _mm_setr_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); ++++ __m128i conjugator2 = _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1); ++++ ++++ for (int i = 0; i < sse_iters; ++i) ++++ { ++++ tmp = _mm_load_si128((__m128i*)a); ++++ tmp = _mm_xor_si128(tmp, conjugator1); ++++ tmp = _mm_add_epi8(tmp, conjugator2); ++++ _mm_store_si128((__m128i*)c, tmp); ++++ a += 8; ++++ c += 8; ++++ } ++++ ++++ for (int i = 0; i<(num_points % 8); ++i) ++++ { ++++ *c++ = lv_conj(*a++); ++++ } ++++ ++++} ++++#endif /* LV_HAVE_SSE3 */ ++++ ++++#ifdef LV_HAVE_GENERIC ++++/*! ++++ \brief Takes the conjugate of an unsigned char vector. ++++ \param cVector The vector where the results will be stored ++++ \param aVector Vector to be conjugated ++++ \param num_points The number of unsigned char values in aVector to be conjugated and stored into cVector ++++ */ ++++static inline void volk_gnsssdr_8ic_conjugate_8ic_a_generic(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){ ++++ lv_8sc_t* cPtr = cVector; ++++ const lv_8sc_t* aPtr = aVector; ++++ unsigned int number = 0; ++++ ++++ for(number = 0; number < num_points; number++){ ++++ *cPtr++ = lv_conj(*aPtr++); ++++ } ++++} ++++#endif /* LV_HAVE_GENERIC */ ++++ ++++#ifdef LV_HAVE_ORC ++++/*! ++++ \brief Takes the conjugate of an unsigned char vector. 
++++ \param cVector Output vector that receives the conjugated samples
++++ \param aVector Input vector of complex samples (8-bit real part, 8-bit imaginary part)
++++ \param num_points Number of complex samples in aVector to be conjugated and stored into cVector; the call is forwarded unchanged to the ORC implementation
++++ */
++++extern void volk_gnsssdr_8ic_conjugate_8ic_a_orc_impl(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points);
++++static inline void volk_gnsssdr_8ic_conjugate_8ic_u_orc(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){
++++    volk_gnsssdr_8ic_conjugate_8ic_a_orc_impl(cVector, aVector, num_points);
++++}
++++#endif /* LV_HAVE_ORC */
++++
++++#endif /* INCLUDED_volk_gnsssdr_8ic_conjugate_8ic_a_H */
+++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_magnitude_squared_8i.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_magnitude_squared_8i.h
+++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_magnitude_squared_8i.h 1970-01-01 01:00:00.000000000 +0100
++++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_magnitude_squared_8i.h 2014-10-15 01:55:08.000000000 +0200
+++@@ -0,0 +1,320 @@
++++/*!
++++ * \file volk_gnsssdr_8ic_magnitude_squared_8i.h
++++ * \brief Volk protokernel: calculates the magnitude squared of a 16 bits vector
++++ * \authors
    ++++ *
  • Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com ++++ *
++++ * ++++ * Volk protokernel that calculates the magnitude squared of a ++++ * 16 bits vector (8 bits the real part and 8 bits the imaginary part) ++++ * result = (real*real) + (imag*imag) ++++ * ++++ * ------------------------------------------------------------------------- ++++ * ++++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) ++++ * ++++ * GNSS-SDR is a software defined Global Navigation ++++ * Satellite Systems receiver ++++ * ++++ * This file is part of GNSS-SDR. ++++ * ++++ * GNSS-SDR is free software: you can redistribute it and/or modify ++++ * it under the terms of the GNU General Public License as published by ++++ * the Free Software Foundation, either version 3 of the License, or ++++ * at your option) any later version. ++++ * ++++ * GNSS-SDR is distributed in the hope that it will be useful, ++++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++++ * GNU General Public License for more details. ++++ * ++++ * You should have received a copy of the GNU General Public License ++++ * along with GNSS-SDR. If not, see . ++++ * ++++ * ------------------------------------------------------------------------- ++++ */ ++++ ++++#ifndef INCLUDED_volk_gnsssdr_8ic_magnitude_squared_8i_u_H ++++#define INCLUDED_volk_gnsssdr_8ic_magnitude_squared_8i_u_H ++++ ++++#include ++++#include ++++#include ++++ ++++#ifdef LV_HAVE_SSE3 ++++#include ++++#include "tmmintrin.h" ++++/*! 
++++ \brief Calculates the magnitude squared of complexVector and stores the results in magnitudeVector ++++ \param complexVector The vector containing the complex input values ++++ \param magnitudeVector The vector containing the real output values ++++ \param num_points The number of complex values in complexVector to be calculated and stored into cVector ++++ */ ++++static inline void volk_gnsssdr_8ic_magnitude_squared_8i_u_sse3(char* magnitudeVector, const lv_8sc_t* complexVector, unsigned int num_points){ ++++ ++++ const unsigned int sse_iters = num_points / 16; ++++ ++++ const char* complexVectorPtr = (char*)complexVector; ++++ char* magnitudeVectorPtr = magnitudeVector; ++++ ++++ __m128i zero, result8; ++++ __m128i avector, avectorhi, avectorlo, avectorlomult, avectorhimult, aadded, maska; ++++ __m128i bvector, bvectorhi, bvectorlo, bvectorlomult, bvectorhimult, badded, maskb; ++++ ++++ zero = _mm_setzero_si128(); ++++ maska = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); ++++ maskb = _mm_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); ++++ ++++ for(int number = 0;number < sse_iters; number++) ++++ { ++++ avector = _mm_lddqu_si128((__m128i*)complexVectorPtr); ++++ avectorlo = _mm_unpacklo_epi8 (avector, zero); ++++ avectorhi = _mm_unpackhi_epi8 (avector, zero); ++++ avectorlomult = _mm_mullo_epi16 (avectorlo, avectorlo); ++++ avectorhimult = _mm_mullo_epi16 (avectorhi, avectorhi); ++++ aadded = _mm_hadd_epi16 (avectorlomult, avectorhimult); ++++ ++++ complexVectorPtr += 16; ++++ ++++ bvector = _mm_lddqu_si128((__m128i*)complexVectorPtr); ++++ bvectorlo = _mm_unpacklo_epi8 (bvector, zero); ++++ bvectorhi = _mm_unpackhi_epi8 (bvector, zero); ++++ bvectorlomult = _mm_mullo_epi16 (bvectorlo, bvectorlo); ++++ bvectorhimult = _mm_mullo_epi16 (bvectorhi, bvectorhi); ++++ badded = _mm_hadd_epi16 (bvectorlomult, bvectorhimult); ++++ ++++ complexVectorPtr += 16; ++++ ++++ result8 = 
_mm_or_si128(_mm_shuffle_epi8(aadded, maska), _mm_shuffle_epi8(badded, maskb)); ++++ ++++ _mm_storeu_si128((__m128i*)magnitudeVectorPtr, result8); ++++ ++++ magnitudeVectorPtr += 16; ++++ ++++ ++++ } ++++ ++++ for (int i = 0; i<(num_points % 16); ++i) ++++ { ++++ const char valReal = *complexVectorPtr++; ++++ const char valImag = *complexVectorPtr++; ++++ *magnitudeVectorPtr++ = (valReal * valReal) + (valImag * valImag); ++++ } ++++} ++++#endif /* LV_HAVE_SSE3 */ ++++ ++++//#ifdef LV_HAVE_SSE ++++//#include ++++///*! ++++// \brief Calculates the magnitude squared of complexVector and stores the results in magnitudeVector ++++// \param complexVector The vector containing the complex input values ++++// \param magnitudeVector The vector containing the real output values ++++// \param num_points The number of complex values in complexVector to be calculated and stored into cVector ++++// */ ++++//static inline void volk_gnsssdr_8ic_magnitude_squared_8i_u_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ ++++// unsigned int number = 0; ++++// const unsigned int quarterPoints = num_points / 4; ++++// ++++// const float* complexVectorPtr = (float*)complexVector; ++++// float* magnitudeVectorPtr = magnitudeVector; ++++// ++++// __m128 cplxValue1, cplxValue2, iValue, qValue, result; ++++// for(;number < quarterPoints; number++){ ++++// cplxValue1 = _mm_loadu_ps(complexVectorPtr); ++++// complexVectorPtr += 4; ++++// ++++// cplxValue2 = _mm_loadu_ps(complexVectorPtr); ++++// complexVectorPtr += 4; ++++// ++++// // Arrange in i1i2i3i4 format ++++// iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); ++++// // Arrange in q1q2q3q4 format ++++// qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); ++++// ++++// iValue = _mm_mul_ps(iValue, iValue); // Square the I values ++++// qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values ++++// ++++// result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 
values ++++// ++++// _mm_storeu_ps(magnitudeVectorPtr, result); ++++// magnitudeVectorPtr += 4; ++++// } ++++// ++++// number = quarterPoints * 4; ++++// for(; number < num_points; number++){ ++++// float val1Real = *complexVectorPtr++; ++++// float val1Imag = *complexVectorPtr++; ++++// *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); ++++// } ++++//} ++++//#endif /* LV_HAVE_SSE */ ++++ ++++#ifdef LV_HAVE_GENERIC ++++/*! ++++ \brief Calculates the magnitude squared of complexVector and stores the results in magnitudeVector ++++ \param complexVector The vector containing the complex input values ++++ \param magnitudeVector The vector containing the real output values ++++ \param num_points The number of complex values in complexVector to be calculated and stored into cVector ++++ */ ++++static inline void volk_gnsssdr_8ic_magnitude_squared_8i_generic(char* magnitudeVector, const lv_8sc_t* complexVector, unsigned int num_points){ ++++ const char* complexVectorPtr = (char*)complexVector; ++++ char* magnitudeVectorPtr = magnitudeVector; ++++ ++++ for(int number = 0; number < num_points; number++){ ++++ const char real = *complexVectorPtr++; ++++ const char imag = *complexVectorPtr++; ++++ *magnitudeVectorPtr++ = (real*real) + (imag*imag); ++++ } ++++} ++++#endif /* LV_HAVE_GENERIC */ ++++ ++++#endif /* INCLUDED_volk_gnsssdr_32fc_magnitude_32f_u_H */ ++++ ++++ ++++#ifndef INCLUDED_volk_gnsssdr_8ic_magnitude_squared_8i_a_H ++++#define INCLUDED_volk_gnsssdr_8ic_magnitude_squared_8i_a_H ++++ ++++#include ++++#include ++++#include ++++ ++++#ifdef LV_HAVE_SSE3 ++++#include ++++/*! 
++++ \brief Calculates the magnitude squared of complexVector and stores the results in magnitudeVector ++++ \param complexVector The vector containing the complex input values ++++ \param magnitudeVector The vector containing the real output values ++++ \param num_points The number of complex values in complexVector to be calculated and stored into cVector ++++ */ ++++static inline void volk_gnsssdr_8ic_magnitude_squared_8i_a_sse3(char* magnitudeVector, const lv_8sc_t* complexVector, unsigned int num_points){ ++++ ++++ const unsigned int sse_iters = num_points / 16; ++++ ++++ const char* complexVectorPtr = (char*)complexVector; ++++ char* magnitudeVectorPtr = magnitudeVector; ++++ ++++ __m128i zero, result8; ++++ __m128i avector, avectorhi, avectorlo, avectorlomult, avectorhimult, aadded, maska; ++++ __m128i bvector, bvectorhi, bvectorlo, bvectorlomult, bvectorhimult, badded, maskb; ++++ ++++ zero = _mm_setzero_si128(); ++++ maska = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); ++++ maskb = _mm_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); ++++ ++++ for(int number = 0;number < sse_iters; number++) ++++ { ++++ avector = _mm_load_si128((__m128i*)complexVectorPtr); ++++ avectorlo = _mm_unpacklo_epi8 (avector, zero); ++++ avectorhi = _mm_unpackhi_epi8 (avector, zero); ++++ avectorlomult = _mm_mullo_epi16 (avectorlo, avectorlo); ++++ avectorhimult = _mm_mullo_epi16 (avectorhi, avectorhi); ++++ aadded = _mm_hadd_epi16 (avectorlomult, avectorhimult); ++++ ++++ complexVectorPtr += 16; ++++ ++++ bvector = _mm_load_si128((__m128i*)complexVectorPtr); ++++ bvectorlo = _mm_unpacklo_epi8 (bvector, zero); ++++ bvectorhi = _mm_unpackhi_epi8 (bvector, zero); ++++ bvectorlomult = _mm_mullo_epi16 (bvectorlo, bvectorlo); ++++ bvectorhimult = _mm_mullo_epi16 (bvectorhi, bvectorhi); ++++ badded = _mm_hadd_epi16 (bvectorlomult, bvectorhimult); ++++ ++++ complexVectorPtr += 16; ++++ ++++ result8 = 
_mm_or_si128(_mm_shuffle_epi8(aadded, maska), _mm_shuffle_epi8(badded, maskb)); ++++ ++++ _mm_store_si128((__m128i*)magnitudeVectorPtr, result8); ++++ ++++ magnitudeVectorPtr += 16; ++++ ++++ ++++ } ++++ ++++ for (int i = 0; i<(num_points % 16); ++i) ++++ { ++++ const char valReal = *complexVectorPtr++; ++++ const char valImag = *complexVectorPtr++; ++++ *magnitudeVectorPtr++ = (valReal * valReal) + (valImag * valImag); ++++ } ++++} ++++#endif /* LV_HAVE_SSE3 */ ++++ ++++//#ifdef LV_HAVE_SSE ++++//#include ++++///*! ++++// \brief Calculates the magnitude squared of complexVector and stores the results in magnitudeVector ++++// \param complexVector The vector containing the complex input values ++++// \param magnitudeVector The vector containing the real output values ++++// \param num_points The number of complex values in complexVector to be calculated and stored into cVector ++++// */ ++++//static inline void volk_gnsssdr_8ic_magnitude_squared_8i_a_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){ ++++// unsigned int number = 0; ++++// const unsigned int quarterPoints = num_points / 4; ++++// ++++// const float* complexVectorPtr = (float*)complexVector; ++++// float* magnitudeVectorPtr = magnitudeVector; ++++// ++++// __m128 cplxValue1, cplxValue2, iValue, qValue, result; ++++// for(;number < quarterPoints; number++){ ++++// cplxValue1 = _mm_load_ps(complexVectorPtr); ++++// complexVectorPtr += 4; ++++// ++++// cplxValue2 = _mm_load_ps(complexVectorPtr); ++++// complexVectorPtr += 4; ++++// ++++// // Arrange in i1i2i3i4 format ++++// iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); ++++// // Arrange in q1q2q3q4 format ++++// qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); ++++// ++++// iValue = _mm_mul_ps(iValue, iValue); // Square the I values ++++// qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values ++++// ++++// result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 
values ++++// ++++// _mm_store_ps(magnitudeVectorPtr, result); ++++// magnitudeVectorPtr += 4; ++++// } ++++// ++++// number = quarterPoints * 4; ++++// for(; number < num_points; number++){ ++++// float val1Real = *complexVectorPtr++; ++++// float val1Imag = *complexVectorPtr++; ++++// *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag); ++++// } ++++//} ++++//#endif /* LV_HAVE_SSE */ ++++ ++++#ifdef LV_HAVE_GENERIC ++++/*! ++++ \brief Calculates the magnitude squared of complexVector and stores the results in magnitudeVector ++++ \param complexVector The vector containing the complex input values ++++ \param magnitudeVector The vector containing the real output values ++++ \param num_points The number of complex values in complexVector to be calculated and stored into cVector ++++ */ ++++static inline void volk_gnsssdr_8ic_magnitude_squared_8i_a_generic(char* magnitudeVector, const lv_8sc_t* complexVector, unsigned int num_points){ ++++ const char* complexVectorPtr = (char*)complexVector; ++++ char* magnitudeVectorPtr = magnitudeVector; ++++ ++++ for(int number = 0; number < num_points; number++){ ++++ const char real = *complexVectorPtr++; ++++ const char imag = *complexVectorPtr++; ++++ *magnitudeVectorPtr++ = (real*real) + (imag*imag); ++++ } ++++} ++++#endif /* LV_HAVE_GENERIC */ ++++ ++++#ifdef LV_HAVE_ORC ++++/*! 
++++ \brief Calculates the magnitude squared of complexVector and stores the results in magnitudeVector by forwarding the call to the ORC implementation
++++ \param magnitudeVector The vector that receives the real output values
++++ \param complexVector The vector containing the complex input values
++++ \param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
++++ */
++++extern void volk_gnsssdr_8ic_magnitude_squared_8i_a_orc_impl(char* magnitudeVector, const lv_8sc_t* complexVector, unsigned int num_points);
++++static inline void volk_gnsssdr_8ic_magnitude_squared_8i_u_orc(char* magnitudeVector, const lv_8sc_t* complexVector, unsigned int num_points){
++++    volk_gnsssdr_8ic_magnitude_squared_8i_a_orc_impl(magnitudeVector, complexVector, num_points);
++++}
++++#endif /* LV_HAVE_ORC */
++++
++++#endif /* INCLUDED_volk_gnsssdr_8ic_magnitude_squared_8i_a_H */
+++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_s8ic_multiply_8ic.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_s8ic_multiply_8ic.h
+++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_s8ic_multiply_8ic.h 1970-01-01 01:00:00.000000000 +0100
++++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_s8ic_multiply_8ic.h 2014-10-15 01:55:08.000000000 +0200
+++@@ -0,0 +1,271 @@
++++/*!
++++ * \file volk_gnsssdr_8ic_s8ic_multiply_8ic.h
++++ * \brief Volk protokernel: multiplies a group of 16 bits vectors by one constant vector
++++ * \authors
    ++++ *
  • Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com ++++ *
++++ * ++++ * Volk protokernel that multiplies a group of 16 bits vectors ++++ * (8 bits the real part and 8 bits the imaginary part) by one constant vector ++++ * ++++ * ------------------------------------------------------------------------- ++++ * ++++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) ++++ * ++++ * GNSS-SDR is a software defined Global Navigation ++++ * Satellite Systems receiver ++++ * ++++ * This file is part of GNSS-SDR. ++++ * ++++ * GNSS-SDR is free software: you can redistribute it and/or modify ++++ * it under the terms of the GNU General Public License as published by ++++ * the Free Software Foundation, either version 3 of the License, or ++++ * at your option) any later version. ++++ * ++++ * GNSS-SDR is distributed in the hope that it will be useful, ++++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++++ * GNU General Public License for more details. ++++ * ++++ * You should have received a copy of the GNU General Public License ++++ * along with GNSS-SDR. If not, see . ++++ * ++++ * ------------------------------------------------------------------------- ++++ */ ++++ ++++#ifndef INCLUDED_volk_gnsssdr_8ic_s8ic_multiply_8ic_u_H ++++#define INCLUDED_volk_gnsssdr_8ic_s8ic_multiply_8ic_u_H ++++ ++++#include ++++#include ++++#include ++++#include ++++ ++++#ifdef LV_HAVE_SSE3 ++++#include ++++/*! 
++++ \brief Multiplies the input vector by a scalar and stores the results in the third vector ++++ \param cVector The vector where the results will be stored ++++ \param aVector The vector to be multiplied ++++ \param scalar The complex scalar to multiply aVector ++++ \param num_points The number of complex values in aVector to be multiplied by sacalar and stored into cVector ++++ */ ++++static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_u_sse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t scalar, unsigned int num_points){ ++++ ++++ const unsigned int sse_iters = num_points / 8; ++++ ++++ __m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc; ++++ ++++ lv_8sc_t* c = cVector; ++++ const lv_8sc_t* a = aVector; ++++ ++++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); ++++ ++++ y = _mm_set1_epi16 (*(short*)&scalar); ++++ imagy = _mm_srli_si128 (y, 1); ++++ imagy = _mm_and_si128 (imagy, mult1); ++++ realy = _mm_and_si128 (y, mult1); ++++ ++++ for(int number = 0;number < sse_iters; number++){ ++++ ++++ x = _mm_lddqu_si128((__m128i*)a); ++++ ++++ imagx = _mm_srli_si128 (x, 1); ++++ imagx = _mm_and_si128 (imagx, mult1); ++++ realx = _mm_and_si128 (x, mult1); ++++ ++++ realx_mult_realy = _mm_mullo_epi16 (realx, realy); ++++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); ++++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); ++++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); ++++ ++++ realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++++ realc = _mm_and_si128 (realc, mult1); ++++ imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++++ imagc = _mm_and_si128 (imagc, mult1); ++++ imagc = _mm_slli_si128 (imagc, 1); ++++ ++++ totalc = _mm_or_si128 (realc, imagc); ++++ ++++ _mm_storeu_si128((__m128i*)c, totalc); ++++ ++++ a += 8; ++++ c += 8; ++++ } ++++ ++++ for (int i = 0; i<(num_points % 8); ++i) ++++ { 
++++ *c++ = (*a++) * scalar; ++++ } ++++ ++++} ++++#endif /* LV_HAVE_SSE3 */ ++++ ++++#ifdef LV_HAVE_GENERIC ++++/*! ++++ \brief Multiplies the input vector by a scalar and stores the results in the third vector ++++ \param cVector The vector where the results will be stored ++++ \param aVector The vector to be multiplied ++++ \param scalar The complex scalar to multiply aVector ++++ \param num_points The number of complex values in aVector to be multiplied by sacalar and stored into cVector ++++ */ ++++static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_generic(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t scalar, unsigned int num_points){ ++++ ++++ /*lv_8sc_t* cPtr = cVector; ++++ const lv_8sc_t* aPtr = aVector; ++++ ++++ for (int i = 0; i= 8){ ++++ *cPtr++ = (*aPtr++) * scalar; ++++ *cPtr++ = (*aPtr++) * scalar; ++++ *cPtr++ = (*aPtr++) * scalar; ++++ *cPtr++ = (*aPtr++) * scalar; ++++ *cPtr++ = (*aPtr++) * scalar; ++++ *cPtr++ = (*aPtr++) * scalar; ++++ *cPtr++ = (*aPtr++) * scalar; ++++ *cPtr++ = (*aPtr++) * scalar; ++++ number -= 8; ++++ } ++++ ++++ // clean up any remaining ++++ while (number-- > 0) ++++ *cPtr++ = *aPtr++ * scalar; ++++} ++++#endif /* LV_HAVE_GENERIC */ ++++ ++++#endif /* INCLUDED_volk_gnsssdr_32fc_x2_multiply_32fc_u_H */ ++++ ++++ ++++#ifndef INCLUDED_volk_gnsssdr_8ic_s8ic_multiply_8ic_a_H ++++#define INCLUDED_volk_gnsssdr_8ic_s8ic_multiply_8ic_a_H ++++ ++++#include ++++#include ++++#include ++++#include ++++ ++++#ifdef LV_HAVE_SSE3 ++++#include ++++/*! 
++++ \brief Multiplies the input vector by a scalar and stores the results in the third vector ++++ \param cVector The vector where the results will be stored ++++ \param aVector The vector to be multiplied ++++ \param scalar The complex scalar to multiply aVector ++++ \param num_points The number of complex values in aVector to be multiplied by sacalar and stored into cVector ++++ */ ++++static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_a_sse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t scalar, unsigned int num_points){ ++++ ++++ const unsigned int sse_iters = num_points / 8; ++++ ++++ __m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc; ++++ ++++ lv_8sc_t* c = cVector; ++++ const lv_8sc_t* a = aVector; ++++ ++++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); ++++ ++++ y = _mm_set1_epi16 (*(short*)&scalar); ++++ imagy = _mm_srli_si128 (y, 1); ++++ imagy = _mm_and_si128 (imagy, mult1); ++++ realy = _mm_and_si128 (y, mult1); ++++ ++++ for(int number = 0;number < sse_iters; number++){ ++++ ++++ x = _mm_load_si128((__m128i*)a); ++++ ++++ imagx = _mm_srli_si128 (x, 1); ++++ imagx = _mm_and_si128 (imagx, mult1); ++++ realx = _mm_and_si128 (x, mult1); ++++ ++++ realx_mult_realy = _mm_mullo_epi16 (realx, realy); ++++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); ++++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); ++++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); ++++ ++++ realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++++ realc = _mm_and_si128 (realc, mult1); ++++ imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++++ imagc = _mm_and_si128 (imagc, mult1); ++++ imagc = _mm_slli_si128 (imagc, 1); ++++ ++++ totalc = _mm_or_si128 (realc, imagc); ++++ ++++ _mm_store_si128((__m128i*)c, totalc); ++++ ++++ a += 8; ++++ c += 8; ++++ } ++++ ++++ for (int i = 0; i<(num_points % 8); ++i) ++++ { ++++ 
*c++ = (*a++) * scalar; ++++ } ++++ ++++} ++++#endif /* LV_HAVE_SSE3 */ ++++ ++++#ifdef LV_HAVE_GENERIC ++++/*! ++++ \brief Multiplies the input vector by a scalar and stores the results in the third vector ++++ \param cVector The vector where the results will be stored ++++ \param aVector The vector to be multiplied ++++ \param scalar The complex scalar to multiply aVector ++++ \param num_points The number of complex values in aVector to be multiplied by sacalar and stored into cVector ++++ */ ++++static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_a_generic(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t scalar, unsigned int num_points){ ++++ ++++ /*lv_8sc_t* cPtr = cVector; ++++ const lv_8sc_t* aPtr = aVector; ++++ ++++ for (int i = 0; i= 8){ ++++ *cPtr++ = (*aPtr++) * scalar; ++++ *cPtr++ = (*aPtr++) * scalar; ++++ *cPtr++ = (*aPtr++) * scalar; ++++ *cPtr++ = (*aPtr++) * scalar; ++++ *cPtr++ = (*aPtr++) * scalar; ++++ *cPtr++ = (*aPtr++) * scalar; ++++ *cPtr++ = (*aPtr++) * scalar; ++++ *cPtr++ = (*aPtr++) * scalar; ++++ number -= 8; ++++ } ++++ ++++ // clean up any remaining ++++ while (number-- > 0) ++++ *cPtr++ = *aPtr++ * scalar; ++++} ++++#endif /* LV_HAVE_GENERIC */ ++++ ++++#ifdef LV_HAVE_ORC ++++/*! 
++++ \brief Multiplies the input vector by a scalar and stores the results in cVector (forwards the real and imaginary parts of the scalar to the ORC implementation) ++++ \param cVector The vector where the results will be stored ++++ \param aVector The vector to be multiplied ++++ \param scalar The complex scalar to multiply aVector ++++ \param num_points The number of complex values in aVector to be multiplied by scalar and stored into cVector ++++ */ ++++extern void volk_gnsssdr_8ic_s8ic_multiply_8ic_a_orc_impl(lv_8sc_t* cVector, const lv_8sc_t* aVector, const char scalarreal, const char scalarimag, unsigned int num_points); ++++static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_u_orc(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t scalar, unsigned int num_points){ ++++ volk_gnsssdr_8ic_s8ic_multiply_8ic_a_orc_impl(cVector, aVector, lv_creal(scalar), lv_cimag(scalar), num_points); ++++} ++++#endif /* LV_HAVE_ORC */ ++++ ++++#endif /* INCLUDED_volk_gnsssdr_32fc_x2_multiply_32fc_a_H */ +++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_dot_prod_8ic.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_dot_prod_8ic.h +++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_dot_prod_8ic.h 1970-01-01 01:00:00.000000000 +0100 ++++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_dot_prod_8ic.h 2014-10-15 01:55:08.000000000 +0200 +++@@ -0,0 +1,499 @@ ++++/*! ++++ * \file volk_gnsssdr_8ic_x2_dot_prod_8ic.h ++++ * \brief Volk protokernel: multiplies two 16 bits vectors and accumulates them ++++ * \authors
    ++++ *
  • Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com ++++ *
++++ * ++++ * Volk protokernel that multiplies two 16 bits vectors (8 bits the real part ++++ * and 8 bits the imaginary part) and accumulates them ++++ * ++++ * ------------------------------------------------------------------------- ++++ * ++++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) ++++ * ++++ * GNSS-SDR is a software defined Global Navigation ++++ * Satellite Systems receiver ++++ * ++++ * This file is part of GNSS-SDR. ++++ * ++++ * GNSS-SDR is free software: you can redistribute it and/or modify ++++ * it under the terms of the GNU General Public License as published by ++++ * the Free Software Foundation, either version 3 of the License, or ++++ * (at your option) any later version. ++++ * ++++ * GNSS-SDR is distributed in the hope that it will be useful, ++++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++++ * GNU General Public License for more details. ++++ * ++++ * You should have received a copy of the GNU General Public License ++++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>. ++++ * ++++ * ------------------------------------------------------------------------- ++++ */ ++++ ++++#ifndef INCLUDED_volk_gnsssdr_8ic_x2_dot_prod_8ic_u_H ++++#define INCLUDED_volk_gnsssdr_8ic_x2_dot_prod_8ic_u_H ++++ ++++#include ++++#include ++++#include ++++#include ++++ ++++#ifdef LV_HAVE_GENERIC ++++/*!
++++ \brief Multiplies the two input complex vectors element by element and accumulates the products, storing the sum in result ++++ \param result Pointer to the single complex value where the accumulated result will be stored ++++ \param input One of the vectors to be multiplied and accumulated ++++ \param taps One of the vectors to be multiplied and accumulated ++++ \param num_points The number of complex values in input and taps to be multiplied together, accumulated and stored into result ++++ */ ++++static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_generic(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points) { ++++ ++++ /*lv_8sc_t* cPtr = result; ++++ const lv_8sc_t* aPtr = input; ++++ const lv_8sc_t* bPtr = taps; ++++ ++++ for(int number = 0; number < num_points; number++){ ++++ *cPtr += (*aPtr++) * (*bPtr++); ++++ }*/ ++++ ++++ char * res = (char*) result; ++++ char * in = (char*) input; ++++ char * tp = (char*) taps; ++++ unsigned int n_2_ccomplex_blocks = num_points/2; ++++ unsigned int isodd = num_points & 1; ++++ ++++ char sum0[2] = {0,0}; ++++ char sum1[2] = {0,0}; ++++ unsigned int i = 0; ++++ ++++ for(i = 0; i < n_2_ccomplex_blocks; ++i) { ++++ sum0[0] += in[0] * tp[0] - in[1] * tp[1]; ++++ sum0[1] += in[0] * tp[1] + in[1] * tp[0]; ++++ sum1[0] += in[2] * tp[2] - in[3] * tp[3]; ++++ sum1[1] += in[2] * tp[3] + in[3] * tp[2]; ++++ ++++ in += 4; ++++ tp += 4; ++++ } ++++ ++++ res[0] = sum0[0] + sum1[0]; ++++ res[1] = sum0[1] + sum1[1]; ++++ ++++ // Cleanup if we had an odd number of points ++++ for(i = 0; i < isodd; ++i) { ++++ *result += input[num_points - 1] * taps[num_points - 1]; ++++ } ++++} ++++ ++++#endif /*LV_HAVE_GENERIC*/ ++++ ++++#ifdef LV_HAVE_SSE2 ++++#include "emmintrin.h" ++++/*!
++++ \brief Multiplies the two input complex vectors element by element and accumulates the products, storing the sum in result ++++ \param result Pointer to the single complex value where the accumulated result will be stored ++++ \param input One of the vectors to be multiplied and accumulated ++++ \param taps One of the vectors to be multiplied and accumulated ++++ \param num_points The number of complex values in input and taps to be multiplied together, accumulated and stored into result ++++ */ ++++static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse2(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points) { ++++ ++++ lv_8sc_t dotProduct; ++++ memset(&dotProduct, 0x0, 2*sizeof(char)); ++++ ++++ const lv_8sc_t* a = input; ++++ const lv_8sc_t* b = taps; ++++ ++++ const unsigned int sse_iters = num_points/8; ++++ ++++ if (sse_iters>0) ++++ { ++++ __m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc, realcacc, imagcacc; ++++ ++++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); /* mask keeping the low byte of every 16-bit lane */ ++++ realcacc = _mm_setzero_si128(); ++++ imagcacc = _mm_setzero_si128(); ++++ ++++ for(unsigned int number = 0; number < sse_iters; number++){ ++++ ++++ x = _mm_loadu_si128((__m128i*)a); /* SSE2 unaligned load; _mm_lddqu_si128 would require SSE3 */ ++++ y = _mm_loadu_si128((__m128i*)b); ++++ ++++ imagx = _mm_srli_si128 (x, 1); ++++ imagx = _mm_and_si128 (imagx, mult1); ++++ realx = _mm_and_si128 (x, mult1); ++++ ++++ imagy = _mm_srli_si128 (y, 1); ++++ imagy = _mm_and_si128 (imagy, mult1); ++++ realy = _mm_and_si128 (y, mult1); ++++ ++++ realx_mult_realy = _mm_mullo_epi16 (realx, realy); ++++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); ++++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); ++++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); ++++ ++++ realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++++ imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++++ ++++ realcacc = _mm_add_epi16
(realcacc, realc); ++++ imagcacc = _mm_add_epi16 (imagcacc, imagc); ++++ ++++ a += 8; ++++ b += 8; ++++ } ++++ ++++ realcacc = _mm_and_si128 (realcacc, mult1); ++++ imagcacc = _mm_and_si128 (imagcacc, mult1); ++++ imagcacc = _mm_slli_si128 (imagcacc, 1); ++++ ++++ totalc = _mm_or_si128 (realcacc, imagcacc); ++++ ++++ __VOLK_ATTR_ALIGNED(16) lv_8sc_t dotProductVector[8]; ++++ ++++ _mm_storeu_si128((__m128i*)dotProductVector,totalc); // Store the results back into the dot product vector ++++ ++++ for (int i = 0; i<8; ++i) ++++ { ++++ dotProduct += dotProductVector[i]; ++++ } ++++ } ++++ ++++ for (unsigned int i = 0; i<(num_points % 8); ++i) ++++ { ++++ dotProduct += (*a++) * (*b++); ++++ } ++++ ++++ *result = dotProduct; ++++} ++++ ++++#endif /*LV_HAVE_SSE2*/ ++++ ++++#ifdef LV_HAVE_SSE4_1 ++++#include "smmintrin.h" ++++/*! ++++ \brief Multiplies the two input complex vectors element by element and accumulates the products, storing the sum in result ++++ \param result Pointer to the single complex value where the accumulated result will be stored ++++ \param input One of the vectors to be multiplied and accumulated ++++ \param taps One of the vectors to be multiplied and accumulated ++++ \param num_points The number of complex values in input and taps to be multiplied together, accumulated and stored into result ++++ */ ++++static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse4_1(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points) { ++++ ++++ lv_8sc_t dotProduct; ++++ memset(&dotProduct, 0x0, 2*sizeof(char)); ++++ ++++ const lv_8sc_t* a = input; ++++ const lv_8sc_t* b = taps; ++++ ++++ const unsigned int sse_iters = num_points/8; ++++ ++++ if (sse_iters>0) ++++ { ++++ __m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc, realcacc, imagcacc; ++++ ++++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); /* mask keeping the low byte of every 16-bit lane */ ++++ realcacc = _mm_setzero_si128(); ++++
imagcacc = _mm_setzero_si128(); ++++ ++++ for(unsigned int number = 0; number < sse_iters; number++){ ++++ ++++ x = _mm_lddqu_si128((__m128i*)a); ++++ y = _mm_lddqu_si128((__m128i*)b); ++++ ++++ imagx = _mm_srli_si128 (x, 1); ++++ imagx = _mm_and_si128 (imagx, mult1); ++++ realx = _mm_and_si128 (x, mult1); ++++ ++++ imagy = _mm_srli_si128 (y, 1); ++++ imagy = _mm_and_si128 (imagy, mult1); ++++ realy = _mm_and_si128 (y, mult1); ++++ ++++ realx_mult_realy = _mm_mullo_epi16 (realx, realy); ++++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); ++++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); ++++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); ++++ ++++ realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++++ imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++++ ++++ realcacc = _mm_add_epi16 (realcacc, realc); ++++ imagcacc = _mm_add_epi16 (imagcacc, imagc); ++++ ++++ a += 8; ++++ b += 8; ++++ } ++++ ++++ imagcacc = _mm_slli_si128 (imagcacc, 1); ++++ ++++ totalc = _mm_blendv_epi8 (imagcacc, realcacc, mult1); ++++ ++++ __VOLK_ATTR_ALIGNED(16) lv_8sc_t dotProductVector[8]; ++++ ++++ _mm_storeu_si128((__m128i*)dotProductVector,totalc); // Store the results back into the dot product vector ++++ ++++ for (int i = 0; i<8; ++i) ++++ { ++++ dotProduct += dotProductVector[i]; ++++ } ++++ } ++++ ++++ for (unsigned int i = 0; i<(num_points % 8); ++i) ++++ { ++++ dotProduct += (*a++) * (*b++); ++++ } ++++ ++++ *result = dotProduct; ++++} ++++ ++++#endif /*LV_HAVE_SSE4_1*/ ++++ ++++#endif /*INCLUDED_volk_gnsssdr_8ic_x2_dot_prod_8ic_u_H*/ ++++ ++++ ++++#ifndef INCLUDED_volk_gnsssdr_8ic_x2_dot_prod_8ic_a_H ++++#define INCLUDED_volk_gnsssdr_8ic_x2_dot_prod_8ic_a_H ++++ ++++#include ++++#include ++++#include ++++#include ++++ ++++ ++++#ifdef LV_HAVE_GENERIC ++++/*!
++++ \brief Multiplies the two input complex vectors element by element and accumulates the products, storing the sum in result ++++ \param result Pointer to the single complex value where the accumulated result will be stored ++++ \param input One of the vectors to be multiplied and accumulated ++++ \param taps One of the vectors to be multiplied and accumulated ++++ \param num_points The number of complex values in input and taps to be multiplied together, accumulated and stored into result ++++ */ ++++static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_generic(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points) { ++++ ++++ /*lv_8sc_t* cPtr = result; ++++ const lv_8sc_t* aPtr = input; ++++ const lv_8sc_t* bPtr = taps; ++++ ++++ for(int number = 0; number < num_points; number++){ ++++ *cPtr += (*aPtr++) * (*bPtr++); ++++ }*/ ++++ ++++ char * res = (char*) result; ++++ char * in = (char*) input; ++++ char * tp = (char*) taps; ++++ unsigned int n_2_ccomplex_blocks = num_points/2; ++++ unsigned int isodd = num_points & 1; ++++ ++++ char sum0[2] = {0,0}; ++++ char sum1[2] = {0,0}; ++++ unsigned int i = 0; ++++ ++++ for(i = 0; i < n_2_ccomplex_blocks; ++i) { ++++ sum0[0] += in[0] * tp[0] - in[1] * tp[1]; ++++ sum0[1] += in[0] * tp[1] + in[1] * tp[0]; ++++ sum1[0] += in[2] * tp[2] - in[3] * tp[3]; ++++ sum1[1] += in[2] * tp[3] + in[3] * tp[2]; ++++ ++++ in += 4; ++++ tp += 4; ++++ } ++++ ++++ res[0] = sum0[0] + sum1[0]; ++++ res[1] = sum0[1] + sum1[1]; ++++ ++++ // Cleanup if we had an odd number of points ++++ for(i = 0; i < isodd; ++i) { ++++ *result += input[num_points - 1] * taps[num_points - 1]; ++++ } ++++} ++++ ++++#endif /*LV_HAVE_GENERIC*/ ++++ ++++#ifdef LV_HAVE_SSE2 ++++#include "emmintrin.h" ++++/*!
++++ \brief Multiplies the two input complex vectors element by element and accumulates the products, storing the sum in result ++++ \param result Pointer to the single complex value where the accumulated result will be stored ++++ \param input One of the vectors to be multiplied and accumulated (read with _mm_load_si128, which requires 16-byte alignment) ++++ \param taps One of the vectors to be multiplied and accumulated (read with _mm_load_si128, which requires 16-byte alignment) ++++ \param num_points The number of complex values in input and taps to be multiplied together, accumulated and stored into result ++++ */ ++++static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_sse2(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points) { ++++ ++++ lv_8sc_t dotProduct; ++++ memset(&dotProduct, 0x0, 2*sizeof(char)); ++++ ++++ const lv_8sc_t* a = input; ++++ const lv_8sc_t* b = taps; ++++ ++++ const unsigned int sse_iters = num_points/8; ++++ ++++ if (sse_iters>0) ++++ { ++++ __m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc, realcacc, imagcacc; ++++ ++++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); ++++ realcacc = _mm_setzero_si128(); ++++ imagcacc = _mm_setzero_si128(); ++++ ++++ for(int number = 0; number < sse_iters; number++){ ++++ ++++ x = _mm_load_si128((__m128i*)a); ++++ y = _mm_load_si128((__m128i*)b); ++++ ++++ imagx = _mm_srli_si128 (x, 1); ++++ imagx = _mm_and_si128 (imagx, mult1); ++++ realx = _mm_and_si128 (x, mult1); ++++ ++++ imagy = _mm_srli_si128 (y, 1); ++++ imagy = _mm_and_si128 (imagy, mult1); ++++ realy = _mm_and_si128 (y, mult1); ++++ ++++ realx_mult_realy = _mm_mullo_epi16 (realx, realy); ++++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); ++++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); ++++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); ++++ ++++ realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++++ imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++++ ++++ realcacc = _mm_add_epi16
(realcacc, realc); ++++ imagcacc = _mm_add_epi16 (imagcacc, imagc); ++++ ++++ a += 8; ++++ b += 8; ++++ } ++++ ++++ realcacc = _mm_and_si128 (realcacc, mult1); ++++ imagcacc = _mm_and_si128 (imagcacc, mult1); ++++ imagcacc = _mm_slli_si128 (imagcacc, 1); ++++ ++++ totalc = _mm_or_si128 (realcacc, imagcacc); ++++ ++++ __VOLK_ATTR_ALIGNED(16) lv_8sc_t dotProductVector[8]; ++++ ++++ _mm_store_si128((__m128i*)dotProductVector,totalc); // Store the results back into the dot product vector ++++ ++++ for (int i = 0; i<8; ++i) ++++ { ++++ dotProduct += dotProductVector[i]; ++++ } ++++ } ++++ ++++ for (int i = 0; i<(num_points % 8); ++i) ++++ { ++++ dotProduct += (*a++) * (*b++); ++++ } ++++ ++++ *result = dotProduct; ++++} ++++ ++++#endif /*LV_HAVE_SSE2*/ ++++ ++++#ifdef LV_HAVE_SSE4_1 ++++#include "smmintrin.h" ++++/*! ++++ \brief Multiplies the two input complex vectors element by element and accumulates the products, storing the sum in result ++++ \param result Pointer to the single complex value where the accumulated result will be stored ++++ \param input One of the vectors to be multiplied and accumulated (read with _mm_load_si128, which requires 16-byte alignment) ++++ \param taps One of the vectors to be multiplied and accumulated (read with _mm_load_si128, which requires 16-byte alignment) ++++ \param num_points The number of complex values in input and taps to be multiplied together, accumulated and stored into result ++++ */ ++++static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_sse4_1(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points) { ++++ ++++ lv_8sc_t dotProduct; ++++ memset(&dotProduct, 0x0, 2*sizeof(char)); ++++ ++++ const lv_8sc_t* a = input; ++++ const lv_8sc_t* b = taps; ++++ ++++ const unsigned int sse_iters = num_points/8; ++++ ++++ if (sse_iters>0) ++++ { ++++ __m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc, realcacc, imagcacc; ++++ ++++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); ++++ realcacc = _mm_setzero_si128(); ++++
imagcacc = _mm_setzero_si128(); ++++ ++++ for(int number = 0; number < sse_iters; number++){ ++++ ++++ x = _mm_load_si128((__m128i*)a); ++++ y = _mm_load_si128((__m128i*)b); ++++ ++++ imagx = _mm_srli_si128 (x, 1); ++++ imagx = _mm_and_si128 (imagx, mult1); ++++ realx = _mm_and_si128 (x, mult1); ++++ ++++ imagy = _mm_srli_si128 (y, 1); ++++ imagy = _mm_and_si128 (imagy, mult1); ++++ realy = _mm_and_si128 (y, mult1); ++++ ++++ realx_mult_realy = _mm_mullo_epi16 (realx, realy); ++++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); ++++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); ++++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); ++++ ++++ realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++++ imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++++ ++++ realcacc = _mm_add_epi16 (realcacc, realc); ++++ imagcacc = _mm_add_epi16 (imagcacc, imagc); ++++ ++++ a += 8; ++++ b += 8; ++++ } ++++ ++++ imagcacc = _mm_slli_si128 (imagcacc, 1); ++++ ++++ totalc = _mm_blendv_epi8 (imagcacc, realcacc, mult1); ++++ ++++ __VOLK_ATTR_ALIGNED(16) lv_8sc_t dotProductVector[8]; ++++ ++++ _mm_store_si128((__m128i*)dotProductVector,totalc); // Store the results back into the dot product vector ++++ ++++ for (int i = 0; i<8; ++i) ++++ { ++++ dotProduct += dotProductVector[i]; ++++ } ++++ } ++++ ++++ for (int i = 0; i<(num_points % 8); ++i) ++++ { ++++ dotProduct += (*a++) * (*b++); ++++ } ++++ ++++ *result = dotProduct; ++++} ++++ ++++#endif /*LV_HAVE_SSE4_1*/ ++++ ++++#ifdef LV_HAVE_ORC ++++/*!
++++ \brief Multiplies the two input complex vectors element by element and accumulates the products, storing the sum in result (the accumulation itself is delegated to the ORC implementation) ++++ \param result Pointer to the single complex value where the accumulated result will be stored ++++ \param input One of the vectors to be multiplied and accumulated ++++ \param taps One of the vectors to be multiplied and accumulated ++++ \param num_points The number of complex values in input and taps to be multiplied together, accumulated and stored into result ++++ */ ++++extern void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_orc_impl(short* resRealShort, short* resImagShort, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points); ++++static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_orc(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points){ ++++ ++++ short resReal = 0; ++++ char* resRealChar = (char*)&resReal; ++++ resRealChar++; /* NOTE(review): now points at the second byte in memory of resReal (the high byte on little-endian hosts) - confirm this is the byte the ORC kernel leaves the 8-bit result in */ ++++ ++++ short resImag = 0; ++++ char* resImagChar = (char*)&resImag; ++++ resImagChar++; /* same second-byte selection for the imaginary accumulator */ ++++ ++++ volk_gnsssdr_8ic_x2_dot_prod_8ic_a_orc_impl(&resReal, &resImag, input, taps, num_points); ++++ ++++ *result = lv_cmake(*resRealChar, *resImagChar); ++++} ++++#endif /* LV_HAVE_ORC */ ++++ ++++#endif /*INCLUDED_volk_gnsssdr_8ic_x2_dot_prod_8ic_a_H*/ +++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_multiply_8ic.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_multiply_8ic.h +++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_multiply_8ic.h 1970-01-01 01:00:00.000000000 +0100 ++++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_multiply_8ic.h 2014-10-15 01:55:08.000000000 +0200 +++@@ -0,0 +1,346 @@ ++++/*! ++++ * \file volk_gnsssdr_8ic_x2_multiply_8ic.h ++++ * \brief Volk protokernel: multiplies two 16 bits vectors ++++ * \authors
    ++++ *
  • Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com ++++ *
++++ * ++++ * Volk protokernel that multiplies two 16 bits vectors (8 bits the real part ++++ * and 8 bits the imaginary part) ++++ * ++++ * ------------------------------------------------------------------------- ++++ * ++++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) ++++ * ++++ * GNSS-SDR is a software defined Global Navigation ++++ * Satellite Systems receiver ++++ * ++++ * This file is part of GNSS-SDR. ++++ * ++++ * GNSS-SDR is free software: you can redistribute it and/or modify ++++ * it under the terms of the GNU General Public License as published by ++++ * the Free Software Foundation, either version 3 of the License, or ++++ * (at your option) any later version. ++++ * ++++ * GNSS-SDR is distributed in the hope that it will be useful, ++++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++++ * GNU General Public License for more details. ++++ * ++++ * You should have received a copy of the GNU General Public License ++++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>. ++++ * ++++ * ------------------------------------------------------------------------- ++++ */ ++++ ++++#ifndef INCLUDED_volk_gnsssdr_8ic_x2_multiply_8ic_u_H ++++#define INCLUDED_volk_gnsssdr_8ic_x2_multiply_8ic_u_H ++++ ++++#include ++++#include ++++#include ++++ ++++#ifdef LV_HAVE_SSE2 ++++#include "emmintrin.h" ++++/*!
++++ \brief Multiplies the two input complex vectors element by element and stores the results in cVector ++++ \param cVector The vector where the results will be stored ++++ \param aVector One of the vectors to be multiplied ++++ \param bVector One of the vectors to be multiplied ++++ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector ++++ */ ++++static inline void volk_gnsssdr_8ic_x2_multiply_8ic_u_sse2(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ ++++ ++++ const unsigned int sse_iters = num_points / 8; ++++ ++++ __m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc; ++++ lv_8sc_t* c = cVector; ++++ const lv_8sc_t* a = aVector; ++++ const lv_8sc_t* b = bVector; ++++ ++++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); /* mask keeping the low byte of every 16-bit lane */ ++++ ++++ for(unsigned int number = 0;number < sse_iters; number++){ ++++ ++++ x = _mm_loadu_si128((__m128i*)a); /* SSE2 unaligned load; _mm_lddqu_si128 would require SSE3 */ ++++ y = _mm_loadu_si128((__m128i*)b); ++++ ++++ imagx = _mm_srli_si128 (x, 1); ++++ imagx = _mm_and_si128 (imagx, mult1); ++++ realx = _mm_and_si128 (x, mult1); ++++ ++++ imagy = _mm_srli_si128 (y, 1); ++++ imagy = _mm_and_si128 (imagy, mult1); ++++ realy = _mm_and_si128 (y, mult1); ++++ ++++ realx_mult_realy = _mm_mullo_epi16 (realx, realy); ++++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); ++++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); ++++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); ++++ ++++ realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++++ realc = _mm_and_si128 (realc, mult1); ++++ imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++++ imagc = _mm_and_si128 (imagc, mult1); ++++ imagc = _mm_slli_si128 (imagc, 1); ++++ ++++ totalc = _mm_or_si128 (realc, imagc); ++++ ++++ _mm_storeu_si128((__m128i*)c, totalc); ++++ ++++ a += 8; ++++ b += 8; ++++ c += 8; ++++ } ++++
++++ for (unsigned int i = 0; i<(num_points % 8); ++i) ++++ { ++++ *c++ = (*a++) * (*b++); ++++ } ++++} ++++#endif /* LV_HAVE_SSE2 */ ++++ ++++#ifdef LV_HAVE_SSE4_1 ++++#include "smmintrin.h" ++++/*! ++++ \brief Multiplies the two input complex vectors element by element and stores the results in cVector ++++ \param cVector The vector where the results will be stored ++++ \param aVector One of the vectors to be multiplied ++++ \param bVector One of the vectors to be multiplied ++++ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector ++++ */ ++++static inline void volk_gnsssdr_8ic_x2_multiply_8ic_u_sse4_1(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ ++++ ++++ const unsigned int sse_iters = num_points / 8; ++++ ++++ __m128i x, y; ++++ __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc; ++++ lv_8sc_t* c = cVector; ++++ const lv_8sc_t* a = aVector; ++++ const lv_8sc_t* b = bVector; ++++ ++++ /* mult1 selects the low (real) byte of every 16-bit lane, both as AND mask and as blend selector */ ++++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); ++++ ++++ for(unsigned int number = 0;number < sse_iters; number++){ ++++ ++++ x = _mm_lddqu_si128((__m128i*)a); ++++ y = _mm_lddqu_si128((__m128i*)b); ++++ ++++ imagx = _mm_srli_si128 (x, 1); ++++ imagx = _mm_and_si128 (imagx, mult1); ++++ realx = _mm_and_si128 (x, mult1); ++++ ++++ imagy = _mm_srli_si128 (y, 1); ++++ imagy = _mm_and_si128 (imagy, mult1); ++++ realy = _mm_and_si128 (y, mult1); ++++ ++++ realx_mult_realy = _mm_mullo_epi16 (realx, realy); ++++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); ++++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); ++++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); ++++ ++++ realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++++ imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++++ imagc = _mm_slli_si128
(imagc, 1); ++++ ++++ totalc = _mm_blendv_epi8 (imagc, realc, mult1); ++++ ++++ _mm_storeu_si128((__m128i*)c, totalc); ++++ ++++ a += 8; ++++ b += 8; ++++ c += 8; ++++ } ++++ ++++ for (unsigned int i = 0; i<(num_points % 8); ++i) ++++ { ++++ *c++ = (*a++) * (*b++); ++++ } ++++} ++++#endif /* LV_HAVE_SSE4_1 */ ++++ ++++#ifdef LV_HAVE_GENERIC ++++/*! ++++ \brief Multiplies the two input complex vectors element by element and stores the results in cVector ++++ \param cVector The vector where the results will be stored ++++ \param aVector One of the vectors to be multiplied ++++ \param bVector One of the vectors to be multiplied ++++ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector ++++ */ ++++static inline void volk_gnsssdr_8ic_x2_multiply_8ic_generic(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ ++++ lv_8sc_t* cPtr = cVector; ++++ const lv_8sc_t* aPtr = aVector; ++++ const lv_8sc_t* bPtr = bVector; ++++ ++++ for(unsigned int number = 0; number < num_points; number++){ ++++ *cPtr++ = (*aPtr++) * (*bPtr++); ++++ } ++++} ++++#endif /* LV_HAVE_GENERIC */ ++++ ++++#endif /* INCLUDED_volk_gnsssdr_8ic_x2_multiply_8ic_u_H */ ++++ ++++ ++++#ifndef INCLUDED_volk_gnsssdr_8ic_x2_multiply_8ic_a_H ++++#define INCLUDED_volk_gnsssdr_8ic_x2_multiply_8ic_a_H ++++ ++++#include ++++#include ++++#include ++++ ++++#ifdef LV_HAVE_SSE2 ++++#include "emmintrin.h" ++++/*!
++++ \brief Multiplies the two input complex vectors element by element and stores the results in cVector ++++ \param cVector The vector where the results will be stored (written with _mm_store_si128, which requires 16-byte alignment) ++++ \param aVector One of the vectors to be multiplied (read with _mm_load_si128, which requires 16-byte alignment) ++++ \param bVector One of the vectors to be multiplied (read with _mm_load_si128, which requires 16-byte alignment) ++++ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector ++++ */ ++++static inline void volk_gnsssdr_8ic_x2_multiply_8ic_a_sse2(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ ++++ ++++ const unsigned int sse_iters = num_points / 8; ++++ ++++ __m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc; ++++ lv_8sc_t* c = cVector; ++++ const lv_8sc_t* a = aVector; ++++ const lv_8sc_t* b = bVector; ++++ ++++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); /* mask keeping the low byte of every 16-bit lane */ ++++ ++++ for(unsigned int number = 0;number < sse_iters; number++){ ++++ ++++ x = _mm_load_si128((__m128i*)a); ++++ y = _mm_load_si128((__m128i*)b); ++++ ++++ imagx = _mm_srli_si128 (x, 1); ++++ imagx = _mm_and_si128 (imagx, mult1); ++++ realx = _mm_and_si128 (x, mult1); ++++ ++++ imagy = _mm_srli_si128 (y, 1); ++++ imagy = _mm_and_si128 (imagy, mult1); ++++ realy = _mm_and_si128 (y, mult1); ++++ ++++ realx_mult_realy = _mm_mullo_epi16 (realx, realy); ++++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); ++++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); ++++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); ++++ ++++ realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++++ realc = _mm_and_si128 (realc, mult1); ++++ imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++++ imagc = _mm_and_si128 (imagc, mult1); ++++ imagc = _mm_slli_si128 (imagc, 1); ++++ ++++ totalc = _mm_or_si128 (realc, imagc); ++++ ++++ _mm_store_si128((__m128i*)c, totalc); ++++ ++++ a += 8; ++++ b += 8; ++++ c += 8; ++++ } ++++
++++ for (unsigned int i = 0; i<(num_points % 8); ++i) ++++ { ++++ *c++ = (*a++) * (*b++); ++++ } ++++} ++++#endif /* LV_HAVE_SSE2 */ ++++ ++++#ifdef LV_HAVE_SSE4_1 ++++#include "smmintrin.h" ++++/*! ++++ \brief Multiplies the two input complex vectors element by element and stores the results in cVector ++++ \param cVector The vector where the results will be stored (written with _mm_store_si128, which requires 16-byte alignment) ++++ \param aVector One of the vectors to be multiplied (read with _mm_load_si128, which requires 16-byte alignment) ++++ \param bVector One of the vectors to be multiplied (read with _mm_load_si128, which requires 16-byte alignment) ++++ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector ++++ */ ++++static inline void volk_gnsssdr_8ic_x2_multiply_8ic_a_sse4_1(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ ++++ ++++ const unsigned int sse_iters = num_points / 8; ++++ ++++ __m128i x, y; ++++ __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc; ++++ lv_8sc_t* c = cVector; ++++ const lv_8sc_t* a = aVector; ++++ const lv_8sc_t* b = bVector; ++++ ++++ /* mult1 selects the low (real) byte of every 16-bit lane, both as AND mask and as blend selector */ ++++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); ++++ ++++ for(unsigned int number = 0;number < sse_iters; number++){ ++++ ++++ x = _mm_load_si128((__m128i*)a); ++++ y = _mm_load_si128((__m128i*)b); ++++ ++++ imagx = _mm_srli_si128 (x, 1); ++++ imagx = _mm_and_si128 (imagx, mult1); ++++ realx = _mm_and_si128 (x, mult1); ++++ ++++ imagy = _mm_srli_si128 (y, 1); ++++ imagy = _mm_and_si128 (imagy, mult1); ++++ realy = _mm_and_si128 (y, mult1); ++++ ++++ realx_mult_realy = _mm_mullo_epi16 (realx, realy); ++++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); ++++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); ++++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); ++++ ++++ realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++++ imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++++ imagc = _mm_slli_si128 (imagc,
1); ++++ ++++ totalc = _mm_blendv_epi8 (imagc, realc, mult1); ++++ ++++ _mm_store_si128((__m128i*)c, totalc); ++++ ++++ a += 8; ++++ b += 8; ++++ c += 8; ++++ } ++++ ++++ for (unsigned int i = 0; i<(num_points % 8); ++i) ++++ { ++++ *c++ = (*a++) * (*b++); ++++ } ++++} ++++#endif /* LV_HAVE_SSE4_1 */ ++++ ++++#ifdef LV_HAVE_GENERIC ++++/*! ++++ \brief Multiplies the two input complex vectors element by element and stores the results in cVector ++++ \param cVector The vector where the results will be stored ++++ \param aVector One of the vectors to be multiplied ++++ \param bVector One of the vectors to be multiplied ++++ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector ++++ */ ++++static inline void volk_gnsssdr_8ic_x2_multiply_8ic_a_generic(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){ ++++ lv_8sc_t* cPtr = cVector; ++++ const lv_8sc_t* aPtr = aVector; ++++ const lv_8sc_t* bPtr = bVector; ++++ ++++ for(unsigned int number = 0; number < num_points; number++){ ++++ *cPtr++ = (*aPtr++) * (*bPtr++); ++++ } ++++ ++++} ++++#endif /* LV_HAVE_GENERIC */ ++++ ++++#ifdef LV_HAVE_ORC ++++/*!
++++ \brief Multiplies the two input complex vectors and stores their results in the third vector (thin wrapper: forwards every argument unchanged to the externally defined volk_gnsssdr_8ic_x2_multiply_8ic_a_orc_impl)
++++ \param cVector The vector where the results will be stored
++++ \param aVector One of the vectors to be multiplied
++++ \param bVector One of the vectors to be multiplied
++++ \param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
++++ */
++++extern void volk_gnsssdr_8ic_x2_multiply_8ic_a_orc_impl(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points);
++++static inline void volk_gnsssdr_8ic_x2_multiply_8ic_u_orc(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){
++++    volk_gnsssdr_8ic_x2_multiply_8ic_a_orc_impl(cVector, aVector, bVector, num_points); // NOTE(review): a "_u_" entry point delegating to an "_a_" implementation -- confirm the impl tolerates unaligned buffers
++++}
++++#endif /* LV_HAVE_ORC */
++++
++++#endif /* INCLUDED_volk_gnsssdr_8ic_x2_multiply_8ic_a_H */
+++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3.h
+++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3.h 1970-01-01 01:00:00.000000000 +0100
++++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3.h 2014-10-15 01:55:08.000000000 +0200
+++@@ -0,0 +1,613 @@
++++/*!
++++ * \file volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3.h
++++ * \brief Volk protokernel: performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation with 16 bits vectors, and accumulates the results into float32.
++++ * \authors
++++ *          <ul><li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com </li>
++++ *          </ul>
++++ *
++++ * Volk protokernel that performs the carrier wipe-off mixing and the
++++ * Early, Prompt, and Late correlation with 16 bits vectors (8 bits the
++++ * real part and 8 bits the imaginary part), and accumulates the result
++++ * in 32-bit single-precision floating point values, returning float32 values:
++++ * - The carrier wipe-off is done by multiplying the input signal by the
++++ * carrier (multiplication of 16 bits vectors). It returns the input
++++ * signal in base band (BB)
++++ * - Early values are calculated by multiplying the input signal in BB by the
++++ * early code (multiplication of 16 bits vectors), accumulating the results into float32 values
++++ * - Prompt values are calculated by multiplying the input signal in BB by the
++++ * prompt code (multiplication of 16 bits vectors), accumulating the results into float32 values
++++ * - Late values are calculated by multiplying the input signal in BB by the
++++ * late code (multiplication of 16 bits vectors), accumulating the results into float32 values
++++ *
++++ * -------------------------------------------------------------------------
++++ *
++++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
++++ *
++++ * GNSS-SDR is a software defined Global Navigation
++++ * Satellite Systems receiver
++++ *
++++ * This file is part of GNSS-SDR.
++++ *
++++ * GNSS-SDR is free software: you can redistribute it and/or modify
++++ * it under the terms of the GNU General Public License as published by
++++ * the Free Software Foundation, either version 3 of the License, or
++++ * (at your option) any later version.
++++ *
++++ * GNSS-SDR is distributed in the hope that it will be useful,
++++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++++ * GNU General Public License for more details.
++++ *
++++ * You should have received a copy of the GNU General Public License
++++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
++++ *
++++ * -------------------------------------------------------------------------
++++ */
++++
++++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_u_H
++++#define INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_u_H
++++
++++#include
++++#include
++++#include
++++#include
++++#include
++++
++++#ifdef LV_HAVE_SSE4_1
++++#include "smmintrin.h"
++++#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
++++#include "CommonMacros/CommonMacros.h"
++++/*!
++++    \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (SSE4.1 variant, unaligned loads; 8 complex samples per vector iteration, scalar loop for the remaining num_points % 8)
++++    \param input The input signal input
++++    \param carrier The carrier signal input
++++    \param E_code Early PRN code replica input
++++    \param P_code Prompt PRN code replica input
++++    \param L_code Late PRN code replica input
++++    \param E_out Early correlation output (single accumulated value, zeroed on entry)
++++    \param P_out Prompt correlation output (single accumulated value, zeroed on entry)
++++    \param L_out Late correlation output (single accumulated value, zeroed on entry)
++++    \param num_points The number of complex values in vectors
++++ */
++++static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_u_sse4_1(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points)
++++{
++++    const unsigned int sse_iters = num_points / 8; // the input pointers advance by 8 samples per vector iteration
++++
++++    __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample;
++++    __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output; // scratch registers handed to the CM_* macros; 'output' is not passed to any macro in this function
++++
++++    __m128 E_code_acc, P_code_acc, L_code_acc;
++++    __m128i input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2;
++++    __m128 output_ps;
++++
++++    const lv_8sc_t* input_ptr = input;
++++    const lv_8sc_t* carrier_ptr = carrier;
++++
++++    const lv_8sc_t* E_code_ptr = E_code;
++++    lv_32fc_t* E_out_ptr = E_out;
++++    const lv_8sc_t* L_code_ptr = L_code;
++++    lv_32fc_t* L_out_ptr = L_out;
++++    const lv_8sc_t* P_code_ptr = P_code;
++++    lv_32fc_t* P_out_ptr = P_out;
++++
++++    *E_out_ptr = 0; // outputs are overwritten, not accumulated onto the caller's previous value
++++    *P_out_ptr = 0;
++++    *L_out_ptr = 0;
++++
++++    mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); // 0x00FF in every 16-bit lane: byte mask handed to the CM_* macros
++++
++++    E_code_acc = _mm_setzero_ps();
++++    L_code_acc = _mm_setzero_ps();
++++    P_code_acc = _mm_setzero_ps();
++++
++++    if (sse_iters>0)
++++    {
++++        for(int number = 0;number < sse_iters; number++){
++++
++++            //Perform the carrier wipe-off
++++            x = _mm_lddqu_si128((__m128i*)input_ptr);
++++            y = _mm_lddqu_si128((__m128i*)carrier_ptr);
++++
++++            CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx)
++++            CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)
++++
++++            CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
++++
++++            //Get early values
++++            y = _mm_lddqu_si128((__m128i*)E_code_ptr);
++++
++++            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
++++
++++            E_code_acc = _mm_add_ps (E_code_acc, output_ps);
++++
++++            //Get prompt values
++++            y = _mm_lddqu_si128((__m128i*)P_code_ptr);
++++
++++            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
++++
++++            P_code_acc = _mm_add_ps (P_code_acc, output_ps);
++++
++++            //Get late values
++++            y = _mm_lddqu_si128((__m128i*)L_code_ptr);
++++
++++            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
++++
++++            L_code_acc = _mm_add_ps (L_code_acc, output_ps);
++++
++++            input_ptr += 8;
++++            carrier_ptr += 8;
++++            E_code_ptr += 8;
++++            P_code_ptr += 8;
++++            L_code_ptr += 8;
++++        }
++++
++++        __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2]; // each __m128 accumulator is spilled into two lv_32fc_t partial sums
++++        __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2];
++++        __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2];
++++
++++        _mm_storeu_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector
++++        _mm_storeu_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector
++++        _mm_storeu_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector
++++
++++        for (int i = 0; i<2; ++i)
++++        {
++++            *E_out_ptr += E_dotProductVector[i];
++++            *P_out_ptr += P_dotProductVector[i];
++++            *L_out_ptr += L_dotProductVector[i];
++++        }
++++    }
++++
++++    lv_8sc_t bb_signal_sample; // NOTE(review): the tail keeps the wiped-off sample in lv_8sc_t; the CM_* macros are not visible here -- confirm both paths use the same intermediate width
++++    for(int i=0; i < num_points%8; ++i)
++++    {
++++        //Perform the carrier wipe-off
++++        bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
++++        // Now get early, late, and prompt values for each
++++        *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
++++        *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
++++        *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
++++    }
++++}
++++#endif /* LV_HAVE_SSE4_1 */
++++
++++#ifdef LV_HAVE_SSE2
++++#include "emmintrin.h"
++++#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
++++#include "CommonMacros/CommonMacros.h"
++++/*!
++++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation ++++ \param input The input signal input ++++ \param carrier The carrier signal input ++++ \param E_code Early PRN code replica input ++++ \param P_code Early PRN code replica input ++++ \param L_code Early PRN code replica input ++++ \param E_out Early correlation output ++++ \param P_out Early correlation output ++++ \param L_out Early correlation output ++++ \param num_points The number of complex values in vectors ++++ */ ++++static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_u_sse2(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points) ++++{ ++++ const unsigned int sse_iters = num_points / 8; ++++ ++++ __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample; ++++ __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output; ++++ ++++ __m128 E_code_acc, P_code_acc, L_code_acc; ++++ __m128i input_i_1, input_i_2, output_i32; ++++ __m128 output_ps_1, output_ps_2; ++++ ++++ const lv_8sc_t* input_ptr = input; ++++ const lv_8sc_t* carrier_ptr = carrier; ++++ ++++ const lv_8sc_t* E_code_ptr = E_code; ++++ lv_32fc_t* E_out_ptr = E_out; ++++ const lv_8sc_t* L_code_ptr = L_code; ++++ lv_32fc_t* L_out_ptr = L_out; ++++ const lv_8sc_t* P_code_ptr = P_code; ++++ lv_32fc_t* P_out_ptr = P_out; ++++ ++++ *E_out_ptr = 0; ++++ *P_out_ptr = 0; ++++ *L_out_ptr = 0; ++++ ++++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); ++++ ++++ E_code_acc = _mm_setzero_ps(); ++++ L_code_acc = _mm_setzero_ps(); ++++ P_code_acc = _mm_setzero_ps(); ++++ ++++ if (sse_iters>0) ++++ { ++++ for(int number = 0;number < sse_iters; number++){ ++++ ++++ //Perform the carrier wipe-off ++++ x = _mm_lddqu_si128((__m128i*)input_ptr); ++++ y = 
_mm_lddqu_si128((__m128i*)carrier_ptr); ++++ ++++ CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx) ++++ CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy) ++++ ++++ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample) ++++ ++++ //Get early values ++++ y = _mm_lddqu_si128((__m128i*)E_code_ptr); ++++ ++++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2) ++++ ++++ E_code_acc = _mm_add_ps (E_code_acc, output_ps_1); ++++ E_code_acc = _mm_add_ps (E_code_acc, output_ps_2); ++++ ++++ //Get prompt values ++++ y = _mm_lddqu_si128((__m128i*)P_code_ptr); ++++ ++++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2) ++++ ++++ P_code_acc = _mm_add_ps (P_code_acc, output_ps_1); ++++ P_code_acc = _mm_add_ps (P_code_acc, output_ps_2); ++++ ++++ //Get late values ++++ y = _mm_lddqu_si128((__m128i*)L_code_ptr); ++++ ++++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2) ++++ ++++ L_code_acc = _mm_add_ps (L_code_acc, output_ps_1); ++++ L_code_acc = _mm_add_ps (L_code_acc, output_ps_2); ++++ ++++ input_ptr += 8; ++++ carrier_ptr += 8; ++++ E_code_ptr += 8; ++++ P_code_ptr += 8; ++++ L_code_ptr += 8; ++++ } ++++ ++++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2]; ++++ 
__VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2]; ++++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2]; ++++ ++++ _mm_storeu_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector ++++ _mm_storeu_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector ++++ _mm_storeu_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector ++++ ++++ for (int i = 0; i<2; ++i) ++++ { ++++ *E_out_ptr += E_dotProductVector[i]; ++++ *P_out_ptr += P_dotProductVector[i]; ++++ *L_out_ptr += L_dotProductVector[i]; ++++ } ++++ } ++++ ++++ lv_8sc_t bb_signal_sample; ++++ for(int i=0; i < num_points%8; ++i) ++++ { ++++ //Perform the carrier wipe-off ++++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); ++++ // Now get early, late, and prompt values for each ++++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); ++++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); ++++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); ++++ } ++++} ++++#endif /* LV_HAVE_SSE2 */ ++++ ++++#ifdef LV_HAVE_GENERIC ++++/*! 
++++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation ++++ \param input The input signal input ++++ \param carrier The carrier signal input ++++ \param E_code Early PRN code replica input ++++ \param P_code Early PRN code replica input ++++ \param L_code Early PRN code replica input ++++ \param E_out Early correlation output ++++ \param P_out Early correlation output ++++ \param L_out Early correlation output ++++ \param num_points The number of complex values in vectors ++++ */ ++++static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_generic(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points) ++++{ ++++ lv_8sc_t bb_signal_sample; ++++ ++++ bb_signal_sample = lv_cmake(0, 0); ++++ ++++ *E_out = 0; ++++ *P_out = 0; ++++ *L_out = 0; ++++ // perform Early, Prompt and Late correlation ++++ for(int i=0; i < num_points; ++i) ++++ { ++++ //Perform the carrier wipe-off ++++ bb_signal_sample = input[i] * carrier[i]; ++++ // Now get early, late, and prompt values for each ++++ *E_out += (lv_32fc_t) (bb_signal_sample * E_code[i]); ++++ *P_out += (lv_32fc_t) (bb_signal_sample * P_code[i]); ++++ *L_out += (lv_32fc_t) (bb_signal_sample * L_code[i]); ++++ } ++++} ++++ ++++#endif /* LV_HAVE_GENERIC */ ++++ ++++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_u_H */ ++++ ++++ ++++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_a_H ++++#define INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_a_H ++++ ++++#include ++++#include ++++#include ++++#include ++++#include ++++ ++++#ifdef LV_HAVE_SSE4_1 ++++#include "smmintrin.h" ++++#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h" ++++#include "CommonMacros/CommonMacros.h" ++++/*! 
++++    \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (SSE4.1 variant, aligned loads via _mm_load_si128; 8 complex samples per vector iteration, scalar loop for the remaining num_points % 8)
++++    \param input The input signal input
++++    \param carrier The carrier signal input
++++    \param E_code Early PRN code replica input
++++    \param P_code Prompt PRN code replica input
++++    \param L_code Late PRN code replica input
++++    \param E_out Early correlation output (single accumulated value, zeroed on entry)
++++    \param P_out Prompt correlation output (single accumulated value, zeroed on entry)
++++    \param L_out Late correlation output (single accumulated value, zeroed on entry)
++++    \param num_points The number of complex values in vectors
++++ */
++++static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_a_sse4_1(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points)
++++{
++++    const unsigned int sse_iters = num_points / 8; // the input pointers advance by 8 samples per vector iteration
++++
++++    __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample;
++++    __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output; // scratch registers handed to the CM_* macros; 'output' is not passed to any macro in this function
++++
++++    __m128 E_code_acc, P_code_acc, L_code_acc;
++++    __m128i input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2;
++++    __m128 output_ps;
++++
++++    const lv_8sc_t* input_ptr = input;
++++    const lv_8sc_t* carrier_ptr = carrier;
++++
++++    const lv_8sc_t* E_code_ptr = E_code;
++++    lv_32fc_t* E_out_ptr = E_out;
++++    const lv_8sc_t* L_code_ptr = L_code;
++++    lv_32fc_t* L_out_ptr = L_out;
++++    const lv_8sc_t* P_code_ptr = P_code;
++++    lv_32fc_t* P_out_ptr = P_out;
++++
++++    *E_out_ptr = 0; // outputs are overwritten, not accumulated onto the caller's previous value
++++    *P_out_ptr = 0;
++++    *L_out_ptr = 0;
++++
++++    mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); // 0x00FF in every 16-bit lane: byte mask handed to the CM_* macros
++++
++++    E_code_acc = _mm_setzero_ps();
++++    L_code_acc = _mm_setzero_ps();
++++    P_code_acc = _mm_setzero_ps();
++++
++++    if (sse_iters>0)
++++    {
++++        for(int number = 0;number < sse_iters; number++){
++++
++++            //Perform the carrier wipe-off
++++            x = _mm_load_si128((__m128i*)input_ptr); // aligned load: the five input buffers must be 16-byte aligned
++++            y = _mm_load_si128((__m128i*)carrier_ptr);
++++
++++            CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx)
++++            CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)
++++
++++            CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
++++
++++            //Get early values
++++            y = _mm_load_si128((__m128i*)E_code_ptr);
++++
++++            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
++++
++++            E_code_acc = _mm_add_ps (E_code_acc, output_ps);
++++
++++            //Get prompt values
++++            y = _mm_load_si128((__m128i*)P_code_ptr);
++++
++++            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
++++
++++            P_code_acc = _mm_add_ps (P_code_acc, output_ps);
++++
++++            //Get late values
++++            y = _mm_load_si128((__m128i*)L_code_ptr);
++++
++++            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
++++
++++            L_code_acc = _mm_add_ps (L_code_acc, output_ps);
++++
++++            input_ptr += 8;
++++            carrier_ptr += 8;
++++            E_code_ptr += 8;
++++            P_code_ptr += 8;
++++            L_code_ptr += 8;
++++        }
++++
++++        __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2]; // each __m128 accumulator is spilled into two lv_32fc_t partial sums
++++        __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2];
++++        __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2];
++++
++++        _mm_store_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector
++++        _mm_store_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector
++++        _mm_store_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector
++++
++++        for (int i = 0; i<2; ++i)
++++        {
++++            *E_out_ptr += E_dotProductVector[i];
++++            *P_out_ptr += P_dotProductVector[i];
++++            *L_out_ptr += L_dotProductVector[i];
++++        }
++++    }
++++
++++    lv_8sc_t bb_signal_sample; // NOTE(review): the tail keeps the wiped-off sample in lv_8sc_t; the CM_* macros are not visible here -- confirm both paths use the same intermediate width
++++    for(int i=0; i < num_points%8; ++i)
++++    {
++++        //Perform the carrier wipe-off
++++        bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
++++        // Now get early, late, and prompt values for each
++++        *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
++++        *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
++++        *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
++++    }
++++}
++++#endif /* LV_HAVE_SSE4_1 */
++++
++++#ifdef LV_HAVE_SSE2
++++#include "emmintrin.h"
++++#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
++++#include "CommonMacros/CommonMacros.h"
++++/*!
++++    \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (SSE2 variant, aligned loads via _mm_load_si128; 8 complex samples per vector iteration, scalar loop for the remaining num_points % 8)
++++    \param input The input signal input
++++    \param carrier The carrier signal input
++++    \param E_code Early PRN code replica input
++++    \param P_code Prompt PRN code replica input
++++    \param L_code Late PRN code replica input
++++    \param E_out Early correlation output (single accumulated value, zeroed on entry)
++++    \param P_out Prompt correlation output (single accumulated value, zeroed on entry)
++++    \param L_out Late correlation output (single accumulated value, zeroed on entry)
++++    \param num_points The number of complex values in vectors
++++ */
++++static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_a_sse2(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points)
++++{
++++    const unsigned int sse_iters = num_points / 8; // the input pointers advance by 8 samples per vector iteration
++++
++++    __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample;
++++    __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output; // scratch registers handed to the CM_* macros; 'output' is not passed to any macro in this function
++++
++++    __m128 E_code_acc, P_code_acc, L_code_acc;
++++    __m128i input_i_1, input_i_2, output_i32;
++++    __m128 output_ps_1, output_ps_2;
++++
++++    const lv_8sc_t* input_ptr = input;
++++    const lv_8sc_t* carrier_ptr = carrier;
++++
++++    const lv_8sc_t* E_code_ptr = E_code;
++++    lv_32fc_t* E_out_ptr = E_out;
++++    const lv_8sc_t* L_code_ptr = L_code;
++++    lv_32fc_t* L_out_ptr = L_out;
++++    const lv_8sc_t* P_code_ptr = P_code;
++++    lv_32fc_t* P_out_ptr = P_out;
++++
++++    *E_out_ptr = 0; // outputs are overwritten, not accumulated onto the caller's previous value
++++    *P_out_ptr = 0;
++++    *L_out_ptr = 0;
++++
++++    mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); // 0x00FF in every 16-bit lane: byte mask handed to the CM_* macros
++++
++++    E_code_acc = _mm_setzero_ps();
++++    L_code_acc = _mm_setzero_ps();
++++    P_code_acc = _mm_setzero_ps();
++++
++++    if (sse_iters>0)
++++    {
++++        for(int number = 0;number < sse_iters; number++){
++++
++++            //Perform the carrier wipe-off
++++            x = _mm_load_si128((__m128i*)input_ptr); // aligned load: the five input buffers must be 16-byte aligned
++++            y = _mm_load_si128((__m128i*)carrier_ptr);
++++
++++            CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx)
++++            CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)
++++
++++            CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
++++
++++            //Get early values
++++            y = _mm_load_si128((__m128i*)E_code_ptr);
++++
++++            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
++++
++++            E_code_acc = _mm_add_ps (E_code_acc, output_ps_1);
++++            E_code_acc = _mm_add_ps (E_code_acc, output_ps_2);
++++
++++            //Get prompt values
++++            y = _mm_load_si128((__m128i*)P_code_ptr);
++++
++++            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
++++
++++            P_code_acc = _mm_add_ps (P_code_acc, output_ps_1);
++++            P_code_acc = _mm_add_ps (P_code_acc, output_ps_2);
++++
++++            //Get late values
++++            y = _mm_load_si128((__m128i*)L_code_ptr);
++++
++++            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
++++
++++            L_code_acc = _mm_add_ps (L_code_acc, output_ps_1);
++++            L_code_acc = _mm_add_ps (L_code_acc, output_ps_2);
++++
++++            input_ptr += 8;
++++            carrier_ptr += 8;
++++            E_code_ptr += 8;
++++            P_code_ptr += 8;
++++            L_code_ptr += 8;
++++        }
++++
++++        __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2]; // each __m128 accumulator is spilled into two lv_32fc_t partial sums
++++        __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2];
++++        __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2];
++++
++++        _mm_store_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector
++++        _mm_store_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector
++++        _mm_store_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector
++++
++++        for (int i = 0; i<2; ++i)
++++        {
++++            *E_out_ptr += E_dotProductVector[i];
++++            *P_out_ptr += P_dotProductVector[i];
++++            *L_out_ptr += L_dotProductVector[i];
++++        }
++++    }
++++
++++    lv_8sc_t bb_signal_sample; // NOTE(review): the tail keeps the wiped-off sample in lv_8sc_t; the CM_* macros are not visible here -- confirm both paths use the same intermediate width
++++    for(int i=0; i < num_points%8; ++i)
++++    {
++++        //Perform the carrier wipe-off
++++        bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
++++        // Now get early, late, and prompt values for each
++++        *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
++++        *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
++++        *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
++++    }
++++}
++++#endif /* LV_HAVE_SSE2 */
++++
++++#ifdef LV_HAVE_GENERIC
++++/*!
++++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation ++++ \param input The input signal input ++++ \param carrier The carrier signal input ++++ \param E_code Early PRN code replica input ++++ \param P_code Early PRN code replica input ++++ \param L_code Early PRN code replica input ++++ \param E_out Early correlation output ++++ \param P_out Early correlation output ++++ \param L_out Early correlation output ++++ \param num_points The number of complex values in vectors ++++ */ ++++static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_a_generic(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points) ++++{ ++++ lv_8sc_t bb_signal_sample; ++++ ++++ bb_signal_sample = lv_cmake(0, 0); ++++ ++++ *E_out = 0; ++++ *P_out = 0; ++++ *L_out = 0; ++++ // perform Early, Prompt and Late correlation ++++ for(int i=0; i < num_points; ++i) ++++ { ++++ //Perform the carrier wipe-off ++++ bb_signal_sample = input[i] * carrier[i]; ++++ // Now get early, late, and prompt values for each ++++ *E_out += (lv_32fc_t) (bb_signal_sample * E_code[i]); ++++ *P_out += (lv_32fc_t) (bb_signal_sample * P_code[i]); ++++ *L_out += (lv_32fc_t) (bb_signal_sample * L_code[i]); ++++ } ++++} ++++ ++++#endif /* LV_HAVE_GENERIC */ ++++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_a_H */ +++\ No newline at end of file +++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3.h +++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3.h 1970-01-01 01:00:00.000000000 +0100 ++++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3.h 2014-10-15 
01:55:08.000000000 +0200 +++@@ -0,0 +1,874 @@ ++++/*! ++++ * \file volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3.h ++++ * \brief Volk protokernel: performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation with 16 bits vectors ++++ * \authors
++++ *          <ul><li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com </li>
++++ *          </ul>
++++ *
++++ * Volk protokernel that performs the carrier wipe-off mixing and the
++++ * Early, Prompt, and Late correlation with 16 bits vectors (8 bits the
++++ * real part and 8 bits the imaginary part):
++++ * - The carrier wipe-off is done by multiplying the input signal by the
++++ * carrier (multiplication of 16 bits vectors). It returns the input
++++ * signal in base band (BB)
++++ * - Early values are calculated by multiplying the input signal in BB by the
++++ * early code (multiplication of 16 bits vectors), accumulating the results
++++ * - Prompt values are calculated by multiplying the input signal in BB by the
++++ * prompt code (multiplication of 16 bits vectors), accumulating the results
++++ * - Late values are calculated by multiplying the input signal in BB by the
++++ * late code (multiplication of 16 bits vectors), accumulating the results
++++ *
++++ * -------------------------------------------------------------------------
++++ *
++++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
++++ *
++++ * GNSS-SDR is a software defined Global Navigation
++++ * Satellite Systems receiver
++++ *
++++ * This file is part of GNSS-SDR.
++++ *
++++ * GNSS-SDR is free software: you can redistribute it and/or modify
++++ * it under the terms of the GNU General Public License as published by
++++ * the Free Software Foundation, either version 3 of the License, or
++++ * (at your option) any later version.
++++ *
++++ * GNSS-SDR is distributed in the hope that it will be useful,
++++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++++ * GNU General Public License for more details.
++++ *
++++ * You should have received a copy of the GNU General Public License
++++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
++++ * ++++ * ------------------------------------------------------------------------- ++++ */ ++++ ++++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_u_H ++++#define INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_u_H ++++ ++++#include ++++#include ++++#include ++++#include ++++#include ++++ ++++#ifdef LV_HAVE_SSE4_1 ++++#include "smmintrin.h" ++++ /*! ++++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation ++++ \param input The input signal input ++++ \param carrier The carrier signal input ++++ \param E_code Early PRN code replica input ++++ \param P_code Early PRN code replica input ++++ \param L_code Early PRN code replica input ++++ \param E_out Early correlation output ++++ \param P_out Early correlation output ++++ \param L_out Early correlation output ++++ \param num_points The number of complex values in vectors ++++ */ ++++static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_u_sse4_1(lv_8sc_t* E_out, lv_8sc_t* P_out, lv_8sc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points) ++++{ ++++ const unsigned int sse_iters = num_points / 8; ++++ ++++ __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample, real_E_code_acc, imag_E_code_acc, real_L_code_acc, imag_L_code_acc, real_P_code_acc, imag_P_code_acc; ++++ __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output; ++++ ++++ const lv_8sc_t* input_ptr = input; ++++ const lv_8sc_t* carrier_ptr = carrier; ++++ ++++ const lv_8sc_t* E_code_ptr = E_code; ++++ lv_8sc_t* E_out_ptr = E_out; ++++ const lv_8sc_t* L_code_ptr = L_code; ++++ lv_8sc_t* L_out_ptr = L_out; ++++ const lv_8sc_t* P_code_ptr = P_code; ++++ lv_8sc_t* P_out_ptr = P_out; ++++ ++++ *E_out_ptr = 0; ++++ *P_out_ptr = 0; ++++ *L_out_ptr = 0; ++++ ++++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 
255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); ++++ ++++ real_E_code_acc = _mm_setzero_si128(); ++++ imag_E_code_acc = _mm_setzero_si128(); ++++ real_L_code_acc = _mm_setzero_si128(); ++++ imag_L_code_acc = _mm_setzero_si128(); ++++ real_P_code_acc = _mm_setzero_si128(); ++++ imag_P_code_acc = _mm_setzero_si128(); ++++ ++++ if (sse_iters>0) ++++ { ++++ for(int number = 0;number < sse_iters; number++){ ++++ ++++ //Perform the carrier wipe-off ++++ x = _mm_lddqu_si128((__m128i*)input_ptr); ++++ y = _mm_lddqu_si128((__m128i*)carrier_ptr); ++++ ++++ imagx = _mm_srli_si128 (x, 1); ++++ imagx = _mm_and_si128 (imagx, mult1); ++++ realx = _mm_and_si128 (x, mult1); ++++ ++++ imagy = _mm_srli_si128 (y, 1); ++++ imagy = _mm_and_si128 (imagy, mult1); ++++ realy = _mm_and_si128 (y, mult1); ++++ ++++ realx_mult_realy = _mm_mullo_epi16 (realx, realy); ++++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); ++++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); ++++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); ++++ ++++ real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++++ imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++++ ++++ //Get early values ++++ y = _mm_lddqu_si128((__m128i*)E_code_ptr); ++++ ++++ imagy = _mm_srli_si128 (y, 1); ++++ imagy = _mm_and_si128 (imagy, mult1); ++++ realy = _mm_and_si128 (y, mult1); ++++ ++++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); ++++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); ++++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); ++++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); ++++ ++++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++++ ++++ real_E_code_acc = _mm_add_epi16 (real_E_code_acc, real_output); ++++ imag_E_code_acc = _mm_add_epi16 (imag_E_code_acc, imag_output); ++++ ++++ //Get late values ++++ 
y = _mm_lddqu_si128((__m128i*)L_code_ptr); ++++ ++++ imagy = _mm_srli_si128 (y, 1); ++++ imagy = _mm_and_si128 (imagy, mult1); ++++ realy = _mm_and_si128 (y, mult1); ++++ ++++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); ++++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); ++++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); ++++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); ++++ ++++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++++ ++++ real_L_code_acc = _mm_add_epi16 (real_L_code_acc, real_output); ++++ imag_L_code_acc = _mm_add_epi16 (imag_L_code_acc, imag_output); ++++ ++++ //Get prompt values ++++ y = _mm_lddqu_si128((__m128i*)P_code_ptr); ++++ ++++ imagy = _mm_srli_si128 (y, 1); ++++ imagy = _mm_and_si128 (imagy, mult1); ++++ realy = _mm_and_si128 (y, mult1); ++++ ++++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); ++++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); ++++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); ++++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); ++++ ++++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++++ ++++ real_P_code_acc = _mm_add_epi16 (real_P_code_acc, real_output); ++++ imag_P_code_acc = _mm_add_epi16 (imag_P_code_acc, imag_output); ++++ ++++ input_ptr += 8; ++++ carrier_ptr += 8; ++++ E_code_ptr += 8; ++++ L_code_ptr += 8; ++++ P_code_ptr += 8; ++++ } ++++ ++++ __VOLK_ATTR_ALIGNED(16) lv_8sc_t E_dotProductVector[8]; ++++ __VOLK_ATTR_ALIGNED(16) lv_8sc_t L_dotProductVector[8]; ++++ __VOLK_ATTR_ALIGNED(16) lv_8sc_t P_dotProductVector[8]; ++++ ++++ imag_E_code_acc = _mm_slli_si128 (imag_E_code_acc, 1); ++++ output = _mm_blendv_epi8 (imag_E_code_acc, real_E_code_acc, mult1); ++++ 
_mm_storeu_si128((__m128i*)E_dotProductVector, output); ++++ ++++ imag_L_code_acc = _mm_slli_si128 (imag_L_code_acc, 1); ++++ output = _mm_blendv_epi8 (imag_L_code_acc, real_L_code_acc, mult1); ++++ _mm_storeu_si128((__m128i*)L_dotProductVector, output); ++++ ++++ imag_P_code_acc = _mm_slli_si128 (imag_P_code_acc, 1); ++++ output = _mm_blendv_epi8 (imag_P_code_acc, real_P_code_acc, mult1); ++++ _mm_storeu_si128((__m128i*)P_dotProductVector, output); ++++ ++++ for (int i = 0; i<8; ++i) ++++ { ++++ *E_out_ptr += E_dotProductVector[i]; ++++ *L_out_ptr += L_dotProductVector[i]; ++++ *P_out_ptr += P_dotProductVector[i]; ++++ } ++++ } ++++ ++++ lv_8sc_t bb_signal_sample; ++++ for(int i=0; i < num_points%8; ++i) ++++ { ++++ //Perform the carrier wipe-off ++++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); ++++ // Now get early, late, and prompt values for each ++++ *E_out_ptr += bb_signal_sample * (*E_code_ptr++); ++++ *P_out_ptr += bb_signal_sample * (*P_code_ptr++); ++++ *L_out_ptr += bb_signal_sample * (*L_code_ptr++); ++++ } ++++} ++++ ++++#endif /* LV_HAVE_SSE4_1 */ ++++ ++++#ifdef LV_HAVE_SSE2 ++++#include "emmintrin.h" ++++/*! 
++++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation ++++ \param input The input signal input ++++ \param carrier The carrier signal input ++++ \param E_code Early PRN code replica input ++++ \param P_code Early PRN code replica input ++++ \param L_code Early PRN code replica input ++++ \param E_out Early correlation output ++++ \param P_out Early correlation output ++++ \param L_out Early correlation output ++++ \param num_points The number of complex values in vectors ++++ */ ++++static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_u_sse2(lv_8sc_t* E_out, lv_8sc_t* P_out, lv_8sc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points) ++++{ ++++ const unsigned int sse_iters = num_points / 8; ++++ ++++ __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample, real_E_code_acc, imag_E_code_acc, real_L_code_acc, imag_L_code_acc, real_P_code_acc, imag_P_code_acc; ++++ __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output; ++++ ++++ const lv_8sc_t* input_ptr = input; ++++ const lv_8sc_t* carrier_ptr = carrier; ++++ ++++ const lv_8sc_t* E_code_ptr = E_code; ++++ lv_8sc_t* E_out_ptr = E_out; ++++ const lv_8sc_t* L_code_ptr = L_code; ++++ lv_8sc_t* L_out_ptr = L_out; ++++ const lv_8sc_t* P_code_ptr = P_code; ++++ lv_8sc_t* P_out_ptr = P_out; ++++ ++++ *E_out_ptr = 0; ++++ *P_out_ptr = 0; ++++ *L_out_ptr = 0; ++++ ++++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); ++++ ++++ real_E_code_acc = _mm_setzero_si128(); ++++ imag_E_code_acc = _mm_setzero_si128(); ++++ real_L_code_acc = _mm_setzero_si128(); ++++ imag_L_code_acc = _mm_setzero_si128(); ++++ real_P_code_acc = _mm_setzero_si128(); ++++ imag_P_code_acc = _mm_setzero_si128(); ++++ ++++ if (sse_iters>0) ++++ { ++++ for(int number = 0;number < 
sse_iters; number++){ ++++ ++++ //Perform the carrier wipe-off ++++ x = _mm_lddqu_si128((__m128i*)input_ptr); ++++ y = _mm_lddqu_si128((__m128i*)carrier_ptr); ++++ ++++ imagx = _mm_srli_si128 (x, 1); ++++ imagx = _mm_and_si128 (imagx, mult1); ++++ realx = _mm_and_si128 (x, mult1); ++++ ++++ imagy = _mm_srli_si128 (y, 1); ++++ imagy = _mm_and_si128 (imagy, mult1); ++++ realy = _mm_and_si128 (y, mult1); ++++ ++++ realx_mult_realy = _mm_mullo_epi16 (realx, realy); ++++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); ++++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); ++++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); ++++ ++++ real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++++ imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++++ ++++ //Get early values ++++ y = _mm_lddqu_si128((__m128i*)E_code_ptr); ++++ ++++ imagy = _mm_srli_si128 (y, 1); ++++ imagy = _mm_and_si128 (imagy, mult1); ++++ realy = _mm_and_si128 (y, mult1); ++++ ++++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); ++++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); ++++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); ++++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); ++++ ++++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++++ ++++ real_E_code_acc = _mm_add_epi16 (real_E_code_acc, real_output); ++++ imag_E_code_acc = _mm_add_epi16 (imag_E_code_acc, imag_output); ++++ ++++ //Get late values ++++ y = _mm_lddqu_si128((__m128i*)L_code_ptr); ++++ ++++ imagy = _mm_srli_si128 (y, 1); ++++ imagy = _mm_and_si128 (imagy, mult1); ++++ realy = _mm_and_si128 (y, mult1); ++++ ++++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); ++++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); ++++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, 
imagy); ++++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); ++++ ++++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++++ ++++ real_L_code_acc = _mm_add_epi16 (real_L_code_acc, real_output); ++++ imag_L_code_acc = _mm_add_epi16 (imag_L_code_acc, imag_output); ++++ ++++ //Get prompt values ++++ y = _mm_lddqu_si128((__m128i*)P_code_ptr); ++++ ++++ imagy = _mm_srli_si128 (y, 1); ++++ imagy = _mm_and_si128 (imagy, mult1); ++++ realy = _mm_and_si128 (y, mult1); ++++ ++++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); ++++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); ++++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); ++++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); ++++ ++++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++++ ++++ real_P_code_acc = _mm_add_epi16 (real_P_code_acc, real_output); ++++ imag_P_code_acc = _mm_add_epi16 (imag_P_code_acc, imag_output); ++++ ++++ input_ptr += 8; ++++ carrier_ptr += 8; ++++ E_code_ptr += 8; ++++ L_code_ptr += 8; ++++ P_code_ptr += 8; ++++ } ++++ ++++ __VOLK_ATTR_ALIGNED(16) lv_8sc_t E_dotProductVector[8]; ++++ __VOLK_ATTR_ALIGNED(16) lv_8sc_t L_dotProductVector[8]; ++++ __VOLK_ATTR_ALIGNED(16) lv_8sc_t P_dotProductVector[8]; ++++ ++++ real_E_code_acc = _mm_and_si128 (real_E_code_acc, mult1); ++++ imag_E_code_acc = _mm_and_si128 (imag_E_code_acc, mult1); ++++ imag_E_code_acc = _mm_slli_si128 (imag_E_code_acc, 1); ++++ output = _mm_or_si128 (real_E_code_acc, imag_E_code_acc); ++++ _mm_storeu_si128((__m128i*)E_dotProductVector, output); ++++ ++++ real_L_code_acc = _mm_and_si128 (real_L_code_acc, mult1); ++++ imag_L_code_acc = _mm_and_si128 (imag_L_code_acc, mult1); ++++ imag_L_code_acc = _mm_slli_si128 (imag_L_code_acc, 1); ++++ output = _mm_or_si128 
(real_L_code_acc, imag_L_code_acc); ++++ _mm_storeu_si128((__m128i*)L_dotProductVector, output); ++++ ++++ real_P_code_acc = _mm_and_si128 (real_P_code_acc, mult1); ++++ imag_P_code_acc = _mm_and_si128 (imag_P_code_acc, mult1); ++++ imag_P_code_acc = _mm_slli_si128 (imag_P_code_acc, 1); ++++ output = _mm_or_si128 (real_P_code_acc, imag_P_code_acc); ++++ _mm_storeu_si128((__m128i*)P_dotProductVector, output); ++++ ++++ for (int i = 0; i<8; ++i) ++++ { ++++ *E_out_ptr += E_dotProductVector[i]; ++++ *L_out_ptr += L_dotProductVector[i]; ++++ *P_out_ptr += P_dotProductVector[i]; ++++ } ++++ } ++++ ++++ lv_8sc_t bb_signal_sample; ++++ for(int i=0; i < num_points%8; ++i) ++++ { ++++ //Perform the carrier wipe-off ++++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); ++++ // Now get early, late, and prompt values for each ++++ *E_out_ptr += bb_signal_sample * (*E_code_ptr++); ++++ *P_out_ptr += bb_signal_sample * (*P_code_ptr++); ++++ *L_out_ptr += bb_signal_sample * (*L_code_ptr++); ++++ } ++++} ++++ ++++#endif /* LV_HAVE_SSE2 */ ++++ ++++#ifdef LV_HAVE_GENERIC ++++/*! 
++++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation ++++ \param input The input signal input ++++ \param carrier The carrier signal input ++++ \param E_code Early PRN code replica input ++++ \param P_code Early PRN code replica input ++++ \param L_code Early PRN code replica input ++++ \param E_out Early correlation output ++++ \param P_out Early correlation output ++++ \param L_out Early correlation output ++++ \param num_points The number of complex values in vectors ++++ */ ++++static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_generic(lv_8sc_t* E_out, lv_8sc_t* P_out, lv_8sc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points) ++++{ ++++ lv_8sc_t bb_signal_sample; ++++ ++++ bb_signal_sample = lv_cmake(0, 0); ++++ ++++ *E_out = 0; ++++ *P_out = 0; ++++ *L_out = 0; ++++ // perform Early, Prompt and Late correlation ++++ for(int i=0; i < num_points; ++i) ++++ { ++++ //Perform the carrier wipe-off ++++ bb_signal_sample = input[i] * carrier[i]; ++++ // Now get early, late, and prompt values for each ++++ *E_out += bb_signal_sample * E_code[i]; ++++ *P_out += bb_signal_sample * P_code[i]; ++++ *L_out += bb_signal_sample * L_code[i]; ++++ } ++++} ++++ ++++#endif /* LV_HAVE_GENERIC */ ++++ ++++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_u_H */ ++++ ++++ ++++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_a_H ++++#define INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_a_H ++++ ++++#include ++++#include ++++#include ++++#include ++++#include ++++ ++++#ifdef LV_HAVE_SSE4_1 ++++#include "smmintrin.h" ++++/*! 
++++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation ++++ \param input The input signal input ++++ \param carrier The carrier signal input ++++ \param E_code Early PRN code replica input ++++ \param P_code Early PRN code replica input ++++ \param L_code Early PRN code replica input ++++ \param E_out Early correlation output ++++ \param P_out Early correlation output ++++ \param L_out Early correlation output ++++ \param num_points The number of complex values in vectors ++++ */ ++++static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_a_sse4_1(lv_8sc_t* E_out, lv_8sc_t* P_out, lv_8sc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points) ++++{ ++++ const unsigned int sse_iters = num_points / 8; ++++ ++++ __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample, real_E_code_acc, imag_E_code_acc, real_L_code_acc, imag_L_code_acc, real_P_code_acc, imag_P_code_acc; ++++ __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output; ++++ ++++ const lv_8sc_t* input_ptr = input; ++++ const lv_8sc_t* carrier_ptr = carrier; ++++ ++++ const lv_8sc_t* E_code_ptr = E_code; ++++ lv_8sc_t* E_out_ptr = E_out; ++++ const lv_8sc_t* L_code_ptr = L_code; ++++ lv_8sc_t* L_out_ptr = L_out; ++++ const lv_8sc_t* P_code_ptr = P_code; ++++ lv_8sc_t* P_out_ptr = P_out; ++++ ++++ *E_out_ptr = 0; ++++ *P_out_ptr = 0; ++++ *L_out_ptr = 0; ++++ ++++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); ++++ ++++ real_E_code_acc = _mm_setzero_si128(); ++++ imag_E_code_acc = _mm_setzero_si128(); ++++ real_L_code_acc = _mm_setzero_si128(); ++++ imag_L_code_acc = _mm_setzero_si128(); ++++ real_P_code_acc = _mm_setzero_si128(); ++++ imag_P_code_acc = _mm_setzero_si128(); ++++ ++++ if (sse_iters>0) ++++ { ++++ for(int number = 0;number < 
sse_iters; number++){ ++++ ++++ //Perform the carrier wipe-off ++++ x = _mm_load_si128((__m128i*)input_ptr); ++++ y = _mm_load_si128((__m128i*)carrier_ptr); ++++ ++++ imagx = _mm_srli_si128 (x, 1); ++++ imagx = _mm_and_si128 (imagx, mult1); ++++ realx = _mm_and_si128 (x, mult1); ++++ ++++ imagy = _mm_srli_si128 (y, 1); ++++ imagy = _mm_and_si128 (imagy, mult1); ++++ realy = _mm_and_si128 (y, mult1); ++++ ++++ realx_mult_realy = _mm_mullo_epi16 (realx, realy); ++++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); ++++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); ++++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); ++++ ++++ real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++++ imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++++ ++++ //Get early values ++++ y = _mm_load_si128((__m128i*)E_code_ptr); ++++ ++++ imagy = _mm_srli_si128 (y, 1); ++++ imagy = _mm_and_si128 (imagy, mult1); ++++ realy = _mm_and_si128 (y, mult1); ++++ ++++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); ++++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); ++++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); ++++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); ++++ ++++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++++ ++++ real_E_code_acc = _mm_add_epi16 (real_E_code_acc, real_output); ++++ imag_E_code_acc = _mm_add_epi16 (imag_E_code_acc, imag_output); ++++ ++++ //Get late values ++++ y = _mm_load_si128((__m128i*)L_code_ptr); ++++ ++++ imagy = _mm_srli_si128 (y, 1); ++++ imagy = _mm_and_si128 (imagy, mult1); ++++ realy = _mm_and_si128 (y, mult1); ++++ ++++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); ++++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); ++++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); 
++++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); ++++ ++++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++++ ++++ real_L_code_acc = _mm_add_epi16 (real_L_code_acc, real_output); ++++ imag_L_code_acc = _mm_add_epi16 (imag_L_code_acc, imag_output); ++++ ++++ //Get prompt values ++++ y = _mm_load_si128((__m128i*)P_code_ptr); ++++ ++++ imagy = _mm_srli_si128 (y, 1); ++++ imagy = _mm_and_si128 (imagy, mult1); ++++ realy = _mm_and_si128 (y, mult1); ++++ ++++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); ++++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); ++++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); ++++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); ++++ ++++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++++ ++++ real_P_code_acc = _mm_add_epi16 (real_P_code_acc, real_output); ++++ imag_P_code_acc = _mm_add_epi16 (imag_P_code_acc, imag_output); ++++ ++++ input_ptr += 8; ++++ carrier_ptr += 8; ++++ E_code_ptr += 8; ++++ L_code_ptr += 8; ++++ P_code_ptr += 8; ++++ } ++++ ++++ __VOLK_ATTR_ALIGNED(16) lv_8sc_t E_dotProductVector[8]; ++++ __VOLK_ATTR_ALIGNED(16) lv_8sc_t L_dotProductVector[8]; ++++ __VOLK_ATTR_ALIGNED(16) lv_8sc_t P_dotProductVector[8]; ++++ ++++ imag_E_code_acc = _mm_slli_si128 (imag_E_code_acc, 1); ++++ output = _mm_blendv_epi8 (imag_E_code_acc, real_E_code_acc, mult1); ++++ _mm_store_si128((__m128i*)E_dotProductVector, output); ++++ ++++ imag_L_code_acc = _mm_slli_si128 (imag_L_code_acc, 1); ++++ output = _mm_blendv_epi8 (imag_L_code_acc, real_L_code_acc, mult1); ++++ _mm_store_si128((__m128i*)L_dotProductVector, output); ++++ ++++ imag_P_code_acc = _mm_slli_si128 (imag_P_code_acc, 1); ++++ output = _mm_blendv_epi8 (imag_P_code_acc, real_P_code_acc, mult1); ++++ 
_mm_store_si128((__m128i*)P_dotProductVector, output); ++++ ++++ for (int i = 0; i<8; ++i) ++++ { ++++ *E_out_ptr += E_dotProductVector[i]; ++++ *L_out_ptr += L_dotProductVector[i]; ++++ *P_out_ptr += P_dotProductVector[i]; ++++ } ++++ } ++++ ++++ lv_8sc_t bb_signal_sample; ++++ for(int i=0; i < num_points%8; ++i) ++++ { ++++ //Perform the carrier wipe-off ++++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); ++++ // Now get early, late, and prompt values for each ++++ *E_out_ptr += bb_signal_sample * (*E_code_ptr++); ++++ *P_out_ptr += bb_signal_sample * (*P_code_ptr++); ++++ *L_out_ptr += bb_signal_sample * (*L_code_ptr++); ++++ } ++++} ++++ ++++#endif /* LV_HAVE_SSE4_1 */ ++++ ++++#ifdef LV_HAVE_SSE2 ++++#include "emmintrin.h" ++++/*! ++++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation ++++ \param input The input signal input ++++ \param carrier The carrier signal input ++++ \param E_code Early PRN code replica input ++++ \param P_code Early PRN code replica input ++++ \param L_code Early PRN code replica input ++++ \param E_out Early correlation output ++++ \param P_out Early correlation output ++++ \param L_out Early correlation output ++++ \param num_points The number of complex values in vectors ++++ */ ++++static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_a_sse2(lv_8sc_t* E_out, lv_8sc_t* P_out, lv_8sc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points) ++++{ ++++ const unsigned int sse_iters = num_points / 8; ++++ ++++ __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample, real_E_code_acc, imag_E_code_acc, real_L_code_acc, imag_L_code_acc, real_P_code_acc, imag_P_code_acc; ++++ __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output; ++++ ++++ const lv_8sc_t* input_ptr = input; ++++ const lv_8sc_t* 
carrier_ptr = carrier; ++++ ++++ const lv_8sc_t* E_code_ptr = E_code; ++++ lv_8sc_t* E_out_ptr = E_out; ++++ const lv_8sc_t* L_code_ptr = L_code; ++++ lv_8sc_t* L_out_ptr = L_out; ++++ const lv_8sc_t* P_code_ptr = P_code; ++++ lv_8sc_t* P_out_ptr = P_out; ++++ ++++ *E_out_ptr = 0; ++++ *P_out_ptr = 0; ++++ *L_out_ptr = 0; ++++ ++++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); ++++ ++++ real_E_code_acc = _mm_setzero_si128(); ++++ imag_E_code_acc = _mm_setzero_si128(); ++++ real_L_code_acc = _mm_setzero_si128(); ++++ imag_L_code_acc = _mm_setzero_si128(); ++++ real_P_code_acc = _mm_setzero_si128(); ++++ imag_P_code_acc = _mm_setzero_si128(); ++++ ++++ if (sse_iters>0) ++++ { ++++ for(int number = 0;number < sse_iters; number++){ ++++ ++++ //Perform the carrier wipe-off ++++ x = _mm_load_si128((__m128i*)input_ptr); ++++ y = _mm_load_si128((__m128i*)carrier_ptr); ++++ ++++ imagx = _mm_srli_si128 (x, 1); ++++ imagx = _mm_and_si128 (imagx, mult1); ++++ realx = _mm_and_si128 (x, mult1); ++++ ++++ imagy = _mm_srli_si128 (y, 1); ++++ imagy = _mm_and_si128 (imagy, mult1); ++++ realy = _mm_and_si128 (y, mult1); ++++ ++++ realx_mult_realy = _mm_mullo_epi16 (realx, realy); ++++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); ++++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); ++++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); ++++ ++++ real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++++ imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++++ ++++ //Get early values ++++ y = _mm_load_si128((__m128i*)E_code_ptr); ++++ ++++ imagy = _mm_srli_si128 (y, 1); ++++ imagy = _mm_and_si128 (imagy, mult1); ++++ realy = _mm_and_si128 (y, mult1); ++++ ++++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); ++++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); ++++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); ++++ imagx_mult_realy = 
_mm_mullo_epi16 (imag_bb_signal_sample, realy); ++++ ++++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++++ ++++ real_E_code_acc = _mm_add_epi16 (real_E_code_acc, real_output); ++++ imag_E_code_acc = _mm_add_epi16 (imag_E_code_acc, imag_output); ++++ ++++ //Get late values ++++ y = _mm_load_si128((__m128i*)L_code_ptr); ++++ ++++ imagy = _mm_srli_si128 (y, 1); ++++ imagy = _mm_and_si128 (imagy, mult1); ++++ realy = _mm_and_si128 (y, mult1); ++++ ++++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); ++++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); ++++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); ++++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); ++++ ++++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++++ ++++ real_L_code_acc = _mm_add_epi16 (real_L_code_acc, real_output); ++++ imag_L_code_acc = _mm_add_epi16 (imag_L_code_acc, imag_output); ++++ ++++ //Get prompt values ++++ y = _mm_load_si128((__m128i*)P_code_ptr); ++++ ++++ imagy = _mm_srli_si128 (y, 1); ++++ imagy = _mm_and_si128 (imagy, mult1); ++++ realy = _mm_and_si128 (y, mult1); ++++ ++++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); ++++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); ++++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); ++++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); ++++ ++++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++++ ++++ real_P_code_acc = _mm_add_epi16 (real_P_code_acc, real_output); ++++ imag_P_code_acc = _mm_add_epi16 (imag_P_code_acc, imag_output); ++++ ++++ input_ptr += 8; ++++ carrier_ptr += 8; ++++ E_code_ptr += 8; ++++ L_code_ptr += 8; ++++ 
P_code_ptr += 8; ++++ } ++++ ++++ __VOLK_ATTR_ALIGNED(16) lv_8sc_t E_dotProductVector[8]; ++++ __VOLK_ATTR_ALIGNED(16) lv_8sc_t L_dotProductVector[8]; ++++ __VOLK_ATTR_ALIGNED(16) lv_8sc_t P_dotProductVector[8]; ++++ ++++ real_E_code_acc = _mm_and_si128 (real_E_code_acc, mult1); ++++ imag_E_code_acc = _mm_and_si128 (imag_E_code_acc, mult1); ++++ imag_E_code_acc = _mm_slli_si128 (imag_E_code_acc, 1); ++++ output = _mm_or_si128 (real_E_code_acc, imag_E_code_acc); ++++ _mm_store_si128((__m128i*)E_dotProductVector, output); ++++ ++++ real_L_code_acc = _mm_and_si128 (real_L_code_acc, mult1); ++++ imag_L_code_acc = _mm_and_si128 (imag_L_code_acc, mult1); ++++ imag_L_code_acc = _mm_slli_si128 (imag_L_code_acc, 1); ++++ output = _mm_or_si128 (real_L_code_acc, imag_L_code_acc); ++++ _mm_store_si128((__m128i*)L_dotProductVector, output); ++++ ++++ real_P_code_acc = _mm_and_si128 (real_P_code_acc, mult1); ++++ imag_P_code_acc = _mm_and_si128 (imag_P_code_acc, mult1); ++++ imag_P_code_acc = _mm_slli_si128 (imag_P_code_acc, 1); ++++ output = _mm_or_si128 (real_P_code_acc, imag_P_code_acc); ++++ _mm_store_si128((__m128i*)P_dotProductVector, output); ++++ ++++ for (int i = 0; i<8; ++i) ++++ { ++++ *E_out_ptr += E_dotProductVector[i]; ++++ *L_out_ptr += L_dotProductVector[i]; ++++ *P_out_ptr += P_dotProductVector[i]; ++++ } ++++ } ++++ ++++ lv_8sc_t bb_signal_sample; ++++ for(int i=0; i < num_points%8; ++i) ++++ { ++++ //Perform the carrier wipe-off ++++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); ++++ // Now get early, late, and prompt values for each ++++ *E_out_ptr += bb_signal_sample * (*E_code_ptr++); ++++ *P_out_ptr += bb_signal_sample * (*P_code_ptr++); ++++ *L_out_ptr += bb_signal_sample * (*L_code_ptr++); ++++ } ++++} ++++ ++++#endif /* LV_HAVE_SSE2 */ ++++ ++++#ifdef LV_HAVE_GENERIC ++++/*! 
++++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation ++++ \param input The input signal input ++++ \param carrier The carrier signal input ++++ \param E_code Early PRN code replica input ++++ \param P_code Early PRN code replica input ++++ \param L_code Early PRN code replica input ++++ \param E_out Early correlation output ++++ \param P_out Early correlation output ++++ \param L_out Early correlation output ++++ \param num_points The number of complex values in vectors ++++ */ ++++static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_a_generic(lv_8sc_t* E_out, lv_8sc_t* P_out, lv_8sc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points) ++++{ ++++ lv_8sc_t bb_signal_sample; ++++ ++++ bb_signal_sample = lv_cmake(0, 0); ++++ ++++ *E_out = 0; ++++ *P_out = 0; ++++ *L_out = 0; ++++ // perform Early, Prompt and Late correlation ++++ for(int i=0; i < num_points; ++i) ++++ { ++++ //Perform the carrier wipe-off ++++ bb_signal_sample = input[i] * carrier[i]; ++++ // Now get early, late, and prompt values for each ++++ *E_out += bb_signal_sample * E_code[i]; ++++ *P_out += bb_signal_sample * P_code[i]; ++++ *L_out += bb_signal_sample * L_code[i]; ++++ } ++++} ++++ ++++#endif /* LV_HAVE_GENERIC */ ++++ ++++#ifdef LV_HAVE_ORC ++++/*! 
++++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation ++++ \param input The input signal input ++++ \param carrier The carrier signal input ++++ \param E_code Early PRN code replica input ++++ \param P_code Early PRN code replica input ++++ \param L_code Early PRN code replica input ++++ \param E_out Early correlation output ++++ \param P_out Early correlation output ++++ \param L_out Early correlation output ++++ \param num_points The number of complex values in vectors ++++ */ ++++ ++++extern void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_first_a_orc_impl(short* E_out_real, short* E_out_imag, short* P_out_real, short* P_out_imag, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, unsigned int num_points); ++++extern void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_second_a_orc_impl(short* L_out_real, short* L_out_imag, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* L_code, unsigned int num_points); ++++static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_u_orc(lv_8sc_t* E_out, lv_8sc_t* P_out, lv_8sc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points){ ++++ ++++ short E_out_real = 0; ++++ short E_out_imag = 0; ++++ char* E_out_real_c = (char*)&E_out_real; ++++ E_out_real_c++; ++++ char* E_out_imag_c = (char*)&E_out_imag; ++++ E_out_imag_c++; ++++ ++++ short P_out_real = 0; ++++ short P_out_imag = 0; ++++ char* P_out_real_c = (char*)&P_out_real; ++++ P_out_real_c++; ++++ char* P_out_imag_c = (char*)&P_out_imag; ++++ P_out_imag_c++; ++++ ++++ short L_out_real = 0; ++++ short L_out_imag = 0; ++++ char* L_out_real_c = (char*)&L_out_real; ++++ L_out_real_c++; ++++ char* L_out_imag_c = (char*)&L_out_imag; ++++ L_out_imag_c++; ++++ ++++ volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_first_a_orc_impl( &E_out_real, &E_out_imag, &P_out_real, &P_out_imag, input, carrier, E_code, 
P_code, num_points); ++++ volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_second_a_orc_impl( &L_out_real, &L_out_imag, input, carrier, L_code, num_points); ++++ ++++ //ORC implementation of 8ic_x5_cw_epl_corr_8ic_x3 is done in two different functions because it seems that ++++ //in one function the length of the code gives memory problems (bad access, segmentation fault). ++++ //Also, the maximum number of accumulators that can be used is 4 (and we need 6). ++++ //The "carrier wipe-off" step is done two times: one in the first function and another one in the second. ++++ //Joining all the ORC code in one function would be quicker because the "carrier wipe-off" step would be done just ++++ //one time. ++++ ++++ *E_out = lv_cmake(*E_out_real_c, *E_out_imag_c); ++++ *P_out = lv_cmake(*P_out_real_c, *P_out_imag_c); ++++ *L_out = lv_cmake(*L_out_real_c, *L_out_imag_c); ++++} ++++#endif /* LV_HAVE_ORC */ ++++ ++++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_a_H */ +++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5.h +++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5.h 1970-01-01 01:00:00.000000000 +0100 ++++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5.h 2014-10-15 01:55:08.000000000 +0200 +++@@ -0,0 +1,797 @@ ++++/*! ++++ * \file volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5.h ++++ * \brief Volk protokernel: performs the carrier wipe-off mixing and the Very early, Early, Prompt, Late and very late correlation with 16 bits vectors, and accumulates the results into float32. In order to avoid overflow, If input, carrier and XX_code have the same number of bits, they must be values between —3 and 3 (2 bits). ++++ * \authors
++++ *          Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
++++ *
++++ *
++++ * Volk protokernel that performs the carrier wipe-off mixing and the
++++ * Very Early, Early, Prompt, Late and Very Late correlation with 16-bit vectors (8 bits for the
++++ * real part and 8 bits for the imaginary part), and accumulates the result
++++ * in 32-bit single-precision floating-point values, returning float32 values:
++++ * - The carrier wipe-off is done by multiplying the input signal by the
++++ * carrier (multiplication of 16-bit vectors). It returns the input
++++ * signal in baseband (BB).
++++ * - Very Early values are calculated by multiplying the input signal in BB by the
++++ * very early code (multiplication of 16-bit vectors), accumulating the results into float32 values.
++++ * - Early values are calculated by multiplying the input signal in BB by the
++++ * early code (multiplication of 16-bit vectors), accumulating the results into float32 values.
++++ * - Prompt values are calculated by multiplying the input signal in BB by the
++++ * prompt code (multiplication of 16-bit vectors), accumulating the results into float32 values.
++++ * - Late values are calculated by multiplying the input signal in BB by the
++++ * late code (multiplication of 16-bit vectors), accumulating the results into float32 values.
++++ * - Very Late values are calculated by multiplying the input signal in BB by the
++++ * very late code (multiplication of 16-bit vectors), accumulating the results into float32 values.
++++ *
++++ * -------------------------------------------------------------------------
++++ * Bits analysis
++++ *
++++ * input = 8 bits
++++ * carrier = 8 bits
++++ * XX_code = 8 bits
++++ * XX_out = 8 bits
++++ * bb_signal_sample = 8 bits
++++ *
++++ * bb_signal_sample = input*carrier -> 17 bits limited to 8 bits = input and carrier must be values between -7 and 7 to avoid overflow (3 bits)
++++ *
++++ * XX_out16 = XX_code*bb_signal_sample -> 17 bits limited to 8 bits = XX_code and bb_signal_sample must be values between -7 and 7 to avoid overflow (3 bits)
++++ *
++++ * conclusion = input and carrier must be values between -1 and 1 (1 bit) and XX_code must be values between -7 and 7 to avoid overflow (3 bits)
++++ * If input, carrier and XX_code have the same number of bits, they must be values between -3 and 3 to avoid overflow (2 bits).
++++ * -------------------------------------------------------------------------
++++ *
++++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
++++ *
++++ * GNSS-SDR is a software defined Global Navigation
++++ * Satellite Systems receiver
++++ *
++++ * This file is part of GNSS-SDR.
++++ *
++++ * GNSS-SDR is free software: you can redistribute it and/or modify
++++ * it under the terms of the GNU General Public License as published by
++++ * the Free Software Foundation, either version 3 of the License, or
++++ * (at your option) any later version.
++++ *
++++ * GNSS-SDR is distributed in the hope that it will be useful,
++++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++++ * GNU General Public License for more details.
++++ *
++++ * You should have received a copy of the GNU General Public License
++++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
++++ *
++++ * -------------------------------------------------------------------------
++++ */
++++
++++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_u_H
++++#define INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_u_H
++++
++++#include
++++#include
++++#include
++++#include
++++#include
++++
++++#ifdef LV_HAVE_SSE4_1
++++#include "smmintrin.h"
++++#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
++++#include "CommonMacros/CommonMacros.h"
++++/*!
++++ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation (unaligned SSE4.1 version)
++++
++++ All five outputs are first set to zero. The input is then consumed in groups of 8 complex
++++ samples using unaligned 128-bit loads; the per-group results delivered by the CM_* macros are
++++ added to one __m128 accumulator per output. After the vector loop each accumulator is stored
++++ as two lv_32fc_t partial sums that are added to the corresponding output. The remaining
++++ num_points % 8 samples are processed one by one with the same arithmetic as the generic kernel.
++++
++++ \param input The input signal input
++++ \param carrier The carrier signal input
++++ \param VE_code Very Early PRN code replica input
++++ \param E_code Early PRN code replica input
++++ \param P_code Prompt PRN code replica input
++++ \param L_code Late PRN code replica input
++++ \param VL_code Very Late PRN code replica input
++++ \param VE_out Very Early correlation output
++++ \param E_out Early correlation output
++++ \param P_out Prompt correlation output
++++ \param L_out Late correlation output
++++ \param VL_out Very Late correlation output
++++ \param num_points The number of complex values in vectors
++++ */
++++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_u_sse4_1(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
++++{
++++    // Number of full groups of 8 complex samples handled by the vector loop.
++++    const unsigned int sse_iters = num_points / 8;
++++
++++    // Scratch registers handed to the CM_* macros.
++++    // NOTE(review): `output` is not referenced in this function body; it may be used by name
++++    // inside the macros -- confirm before removing it.
++++    __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample;
++++    __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output;
++++
++++    __m128 VE_code_acc, E_code_acc, P_code_acc, L_code_acc, VL_code_acc;
++++    __m128i input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2;
++++    __m128 output_ps;
++++
++++    const lv_8sc_t* input_ptr = input;
++++    const lv_8sc_t* carrier_ptr = carrier;
++++
++++    const lv_8sc_t* VE_code_ptr = VE_code;
++++    lv_32fc_t* VE_out_ptr = VE_out;
++++    const lv_8sc_t* E_code_ptr = E_code;
++++    lv_32fc_t* E_out_ptr = E_out;
++++    const lv_8sc_t* P_code_ptr = P_code;
++++    lv_32fc_t* P_out_ptr = P_out;
++++    const lv_8sc_t* L_code_ptr = L_code;
++++    lv_32fc_t* L_out_ptr = L_out;
++++    const lv_8sc_t* VL_code_ptr = VL_code;
++++    lv_32fc_t* VL_out_ptr = VL_out;
++++
++++    // The outputs are overwritten, not accumulated into.
++++    *VE_out_ptr = 0;
++++    *E_out_ptr = 0;
++++    *P_out_ptr = 0;
++++    *L_out_ptr = 0;
++++    *VL_out_ptr = 0;
++++
++++    // Byte mask with 0xFF in every even-numbered byte and 0x00 in every odd-numbered one;
++++    // it is only passed on to the macros (presumably to split real and imaginary bytes -- confirm).
++++    mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
++++
++++    VE_code_acc = _mm_setzero_ps();
++++    E_code_acc = _mm_setzero_ps();
++++    P_code_acc = _mm_setzero_ps();
++++    L_code_acc = _mm_setzero_ps();
++++    VL_code_acc = _mm_setzero_ps();
++++
++++    if (sse_iters>0)
++++    {
++++        for(int number = 0;number < sse_iters; number++){
++++
++++            //Perform the carrier wipe-off
++++            x = _mm_lddqu_si128((__m128i*)input_ptr);
++++            y = _mm_lddqu_si128((__m128i*)carrier_ptr);
++++
++++            CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx)
++++            CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)
++++
++++            CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
++++
++++            //Get very early values
++++            y = _mm_lddqu_si128((__m128i*)VE_code_ptr);
++++
++++            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
++++
++++            VE_code_acc = _mm_add_ps (VE_code_acc, output_ps);
++++
++++            //Get early values
++++            y = _mm_lddqu_si128((__m128i*)E_code_ptr);
++++
++++            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
++++
++++            E_code_acc = _mm_add_ps (E_code_acc, output_ps);
++++
++++            //Get prompt values
++++            y = _mm_lddqu_si128((__m128i*)P_code_ptr);
++++
++++            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
++++
++++            P_code_acc = _mm_add_ps (P_code_acc, output_ps);
++++
++++            //Get late values
++++            y = _mm_lddqu_si128((__m128i*)L_code_ptr);
++++
++++            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
++++
++++            L_code_acc = _mm_add_ps (L_code_acc, output_ps);
++++
++++            //Get very late values
++++            y = _mm_lddqu_si128((__m128i*)VL_code_ptr);
++++
++++            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
++++
++++            VL_code_acc = _mm_add_ps (VL_code_acc, output_ps);
++++
++++            // Advance every input stream by the 8 complex samples just consumed.
++++            input_ptr += 8;
++++            carrier_ptr += 8;
++++            VE_code_ptr += 8;
++++            E_code_ptr += 8;
++++            P_code_ptr += 8;
++++            L_code_ptr += 8;
++++            VL_code_ptr += 8;
++++        }
++++
++++        // Each __m128 accumulator is spilled as two complex float partial sums.
++++        __VOLK_ATTR_ALIGNED(16) lv_32fc_t VE_dotProductVector[2];
++++        __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2];
++++        __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2];
++++        __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2];
++++        __VOLK_ATTR_ALIGNED(16) lv_32fc_t VL_dotProductVector[2];
++++
++++        _mm_storeu_ps((float*)VE_dotProductVector,VE_code_acc); // Store the results back into the dot product vector
++++        _mm_storeu_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector
++++        _mm_storeu_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector
++++        _mm_storeu_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector
++++        _mm_storeu_ps((float*)VL_dotProductVector,VL_code_acc); // Store the results back into the dot product vector
++++
++++        // Fold the two partial sums of every accumulator into its output.
++++        for (int i = 0; i<2; ++i)
++++        {
++++            *VE_out_ptr += VE_dotProductVector[i];
++++            *E_out_ptr += E_dotProductVector[i];
++++            *P_out_ptr += P_dotProductVector[i];
++++            *L_out_ptr += L_dotProductVector[i];
++++            *VL_out_ptr += VL_dotProductVector[i];
++++        }
++++    }
++++
++++    // Scalar tail: the num_points % 8 samples that did not fill a whole vector.
++++    lv_8sc_t bb_signal_sample;
++++    for(int i=0; i < num_points%8; ++i)
++++    {
++++        //Perform the carrier wipe-off
++++        bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
++++        // Now get very early, early, prompt, late and very late values for each
++++        *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VE_code_ptr++));
++++        *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
++++        *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
++++        *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
++++        *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VL_code_ptr++));
++++    }
++++}
++++#endif /* LV_HAVE_SSE4_1 */
++++
++++#ifdef LV_HAVE_SSE2
++++#include "emmintrin.h"
++++#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
++++#include "CommonMacros/CommonMacros.h"
++++/*!
++++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation ++++ \param input The input signal input ++++ \param carrier The carrier signal input ++++ \param VE_code Very Early PRN code replica input ++++ \param E_code Early PRN code replica input ++++ \param P_code Prompt PRN code replica input ++++ \param L_code Late PRN code replica input ++++ \param VL_code Very Late PRN code replica input ++++ \param VE_out Very Early correlation output ++++ \param E_out Early correlation output ++++ \param P_out Prompt correlation output ++++ \param L_out Late correlation output ++++ \param VL_out Very Late correlation output ++++ \param num_points The number of complex values in vectors ++++ */ ++++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_u_sse2(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points) ++++{ ++++ const unsigned int sse_iters = num_points / 8; ++++ ++++ __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample; ++++ __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output; ++++ ++++ __m128 VE_code_acc, E_code_acc, P_code_acc, L_code_acc, VL_code_acc; ++++ __m128i input_i_1, input_i_2, output_i32; ++++ __m128 output_ps_1, output_ps_2; ++++ ++++ const lv_8sc_t* input_ptr = input; ++++ const lv_8sc_t* carrier_ptr = carrier; ++++ ++++ const lv_8sc_t* VE_code_ptr = VE_code; ++++ lv_32fc_t* VE_out_ptr = VE_out; ++++ const lv_8sc_t* E_code_ptr = E_code; ++++ lv_32fc_t* E_out_ptr = E_out; ++++ const lv_8sc_t* P_code_ptr = P_code; ++++ lv_32fc_t* P_out_ptr = P_out; ++++ const lv_8sc_t* L_code_ptr = L_code; ++++ lv_32fc_t* L_out_ptr = L_out; ++++ const lv_8sc_t* VL_code_ptr = VL_code; ++++ lv_32fc_t* 
VL_out_ptr = VL_out; ++++ ++++ *VE_out_ptr = 0; ++++ *E_out_ptr = 0; ++++ *P_out_ptr = 0; ++++ *L_out_ptr = 0; ++++ *VL_out_ptr = 0; ++++ ++++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); ++++ ++++ VE_code_acc = _mm_setzero_ps(); ++++ E_code_acc = _mm_setzero_ps(); ++++ P_code_acc = _mm_setzero_ps(); ++++ L_code_acc = _mm_setzero_ps(); ++++ VL_code_acc = _mm_setzero_ps(); ++++ ++++ if (sse_iters>0) ++++ { ++++ for(int number = 0;number < sse_iters; number++){ ++++ ++++ //Perform the carrier wipe-off ++++ x = _mm_lddqu_si128((__m128i*)input_ptr); ++++ y = _mm_lddqu_si128((__m128i*)carrier_ptr); ++++ ++++ CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx) ++++ CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy) ++++ ++++ CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample) ++++ ++++ //Get very early values ++++ y = _mm_lddqu_si128((__m128i*)VE_code_ptr); ++++ ++++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2) ++++ ++++ VE_code_acc = _mm_add_ps (VE_code_acc, output_ps_1); ++++ VE_code_acc = _mm_add_ps (VE_code_acc, output_ps_2); ++++ ++++ //Get early values ++++ y = _mm_lddqu_si128((__m128i*)E_code_ptr); ++++ ++++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2) ++++ ++++ E_code_acc = _mm_add_ps (E_code_acc, output_ps_1); ++++ E_code_acc = _mm_add_ps (E_code_acc, output_ps_2); ++++ ++++ //Get prompt values ++++ y = 
_mm_lddqu_si128((__m128i*)P_code_ptr); ++++ ++++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2) ++++ ++++ P_code_acc = _mm_add_ps (P_code_acc, output_ps_1); ++++ P_code_acc = _mm_add_ps (P_code_acc, output_ps_2); ++++ ++++ //Get late values ++++ y = _mm_lddqu_si128((__m128i*)L_code_ptr); ++++ ++++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2) ++++ ++++ L_code_acc = _mm_add_ps (L_code_acc, output_ps_1); ++++ L_code_acc = _mm_add_ps (L_code_acc, output_ps_2); ++++ ++++ //Get very late values ++++ y = _mm_lddqu_si128((__m128i*)VL_code_ptr); ++++ ++++ CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2) ++++ ++++ VL_code_acc = _mm_add_ps (VL_code_acc, output_ps_1); ++++ VL_code_acc = _mm_add_ps (VL_code_acc, output_ps_2); ++++ ++++ input_ptr += 8; ++++ carrier_ptr += 8; ++++ VE_code_ptr += 8; ++++ E_code_ptr += 8; ++++ P_code_ptr += 8; ++++ L_code_ptr += 8; ++++ VL_code_ptr += 8; ++++ } ++++ ++++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t VE_dotProductVector[2]; ++++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2]; ++++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2]; ++++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2]; ++++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t VL_dotProductVector[2]; ++++ ++++ _mm_storeu_ps((float*)VE_dotProductVector,VE_code_acc); // Store the results back into the dot product vector ++++ 
_mm_storeu_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector ++++ _mm_storeu_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector ++++ _mm_storeu_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector ++++ _mm_storeu_ps((float*)VL_dotProductVector,VL_code_acc); // Store the results back into the dot product vector ++++ ++++ for (int i = 0; i<2; ++i) ++++ { ++++ *VE_out_ptr += VE_dotProductVector[i]; ++++ *E_out_ptr += E_dotProductVector[i]; ++++ *P_out_ptr += P_dotProductVector[i]; ++++ *L_out_ptr += L_dotProductVector[i]; ++++ *VL_out_ptr += VL_dotProductVector[i]; ++++ } ++++ } ++++ ++++ lv_8sc_t bb_signal_sample; ++++ for(int i=0; i < num_points%8; ++i) ++++ { ++++ //Perform the carrier wipe-off ++++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); ++++ // Now get very early, early, prompt, late and very late values for each ++++ *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VE_code_ptr++)); ++++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); ++++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); ++++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); ++++ *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VL_code_ptr++)); ++++ } ++++} ++++#endif /* LV_HAVE_SSE2 */ ++++ ++++#ifdef LV_HAVE_GENERIC ++++/*! 
++++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation ++++ \param input The input signal input ++++ \param carrier The carrier signal input ++++ \param VE_code Very Early PRN code replica input ++++ \param E_code Early PRN code replica input ++++ \param P_code Prompt PRN code replica input ++++ \param L_code Late PRN code replica input ++++ \param VL_code Very Late PRN code replica input ++++ \param VE_out Very Early correlation output ++++ \param E_out Early correlation output ++++ \param P_out Prompt correlation output ++++ \param L_out Late correlation output ++++ \param VL_out Very Late correlation output ++++ \param num_points The number of complex values in vectors ++++ */ ++++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points) ++++{ ++++ lv_8sc_t bb_signal_sample; ++++ ++++ bb_signal_sample = lv_cmake(0, 0); ++++ ++++ *VE_out = 0; ++++ *E_out = 0; ++++ *P_out = 0; ++++ *L_out = 0; ++++ *VL_out = 0; ++++ // perform very early, Early, Prompt, Late and very late correlation ++++ for(int i=0; i < num_points; ++i) ++++ { ++++ //Perform the carrier wipe-off ++++ bb_signal_sample = input[i] * carrier[i]; ++++ ++++ *VE_out += (lv_32fc_t) (bb_signal_sample * VE_code[i]); ++++ *E_out += (lv_32fc_t) (bb_signal_sample * E_code[i]); ++++ *P_out += (lv_32fc_t) (bb_signal_sample * P_code[i]); ++++ *L_out += (lv_32fc_t) (bb_signal_sample * L_code[i]); ++++ *VL_out += (lv_32fc_t) (bb_signal_sample * VL_code[i]); ++++ } ++++} ++++ ++++#endif /* LV_HAVE_GENERIC */ ++++ ++++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_u_H */ ++++ ++++ ++++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_a_H ++++#define 
INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_a_H
++++
++++#include
++++#include
++++#include
++++#include
++++#include
++++
++++#ifdef LV_HAVE_SSE4_1
++++#include "smmintrin.h"
++++#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
++++#include "CommonMacros/CommonMacros.h"
++++/*!
++++ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation (aligned SSE4.1 version)
++++
++++ Same procedure as the unaligned SSE4.1 kernel, but the vector loop reads with
++++ _mm_load_si128, so input, carrier and the five code pointers must be 16-byte aligned.
++++ All five outputs are first set to zero, the input is consumed in groups of 8 complex
++++ samples, and the remaining num_points % 8 samples are processed one by one.
++++
++++ \param input The input signal input
++++ \param carrier The carrier signal input
++++ \param VE_code Very Early PRN code replica input
++++ \param E_code Early PRN code replica input
++++ \param P_code Prompt PRN code replica input
++++ \param L_code Late PRN code replica input
++++ \param VL_code Very Late PRN code replica input
++++ \param VE_out Very Early correlation output
++++ \param E_out Early correlation output
++++ \param P_out Prompt correlation output
++++ \param L_out Late correlation output
++++ \param VL_out Very Late correlation output
++++ \param num_points The number of complex values in vectors
++++ */
++++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_a_sse4_1(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
++++{
++++    // Number of full groups of 8 complex samples handled by the vector loop.
++++    const unsigned int sse_iters = num_points / 8;
++++
++++    // Scratch registers handed to the CM_* macros.
++++    // NOTE(review): `output` is not referenced in this function body; it may be used by name
++++    // inside the macros -- confirm before removing it.
++++    __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample;
++++    __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output;
++++
++++    __m128 VE_code_acc, E_code_acc, P_code_acc, L_code_acc, VL_code_acc;
++++    __m128i input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2;
++++    __m128 output_ps;
++++
++++    const lv_8sc_t* input_ptr = input;
++++    const lv_8sc_t* carrier_ptr = carrier;
++++
++++    const lv_8sc_t* VE_code_ptr = VE_code;
++++    lv_32fc_t* VE_out_ptr = VE_out;
++++    const lv_8sc_t* E_code_ptr = E_code;
++++    lv_32fc_t* E_out_ptr = E_out;
++++    const lv_8sc_t* P_code_ptr = P_code;
++++    lv_32fc_t* P_out_ptr = P_out;
++++    const lv_8sc_t* L_code_ptr = L_code;
++++    lv_32fc_t* L_out_ptr = L_out;
++++    const lv_8sc_t* VL_code_ptr = VL_code;
++++    lv_32fc_t* VL_out_ptr = VL_out;
++++
++++    // The outputs are overwritten, not accumulated into.
++++    *VE_out_ptr = 0;
++++    *E_out_ptr = 0;
++++    *P_out_ptr = 0;
++++    *L_out_ptr = 0;
++++    *VL_out_ptr = 0;
++++
++++    // Byte mask with 0xFF in every even-numbered byte and 0x00 in every odd-numbered one;
++++    // it is only passed on to the macros (presumably to split real and imaginary bytes -- confirm).
++++    mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
++++
++++    VE_code_acc = _mm_setzero_ps();
++++    E_code_acc = _mm_setzero_ps();
++++    P_code_acc = _mm_setzero_ps();
++++    L_code_acc = _mm_setzero_ps();
++++    VL_code_acc = _mm_setzero_ps();
++++
++++    if (sse_iters>0)
++++    {
++++        for(int number = 0;number < sse_iters; number++){
++++
++++            //Perform the carrier wipe-off
++++            x = _mm_load_si128((__m128i*)input_ptr);
++++            y = _mm_load_si128((__m128i*)carrier_ptr);
++++
++++            // NOTE(review): the macros invoked here carry "_U_" in their names although this is
++++            // the aligned kernel; they only receive registers, not pointers.
++++            CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx)
++++            CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)
++++
++++            CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
++++
++++            //Get very early values
++++            y = _mm_load_si128((__m128i*)VE_code_ptr);
++++
++++            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
++++
++++            VE_code_acc = _mm_add_ps (VE_code_acc, output_ps);
++++
++++            //Get early values
++++            y = _mm_load_si128((__m128i*)E_code_ptr);
++++
++++            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
++++
++++            E_code_acc = _mm_add_ps (E_code_acc, output_ps);
++++
++++            //Get prompt values
++++            y = _mm_load_si128((__m128i*)P_code_ptr);
++++
++++            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
++++
++++            P_code_acc = _mm_add_ps (P_code_acc, output_ps);
++++
++++            //Get late values
++++            y = _mm_load_si128((__m128i*)L_code_ptr);
++++
++++            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
++++
++++            L_code_acc = _mm_add_ps (L_code_acc, output_ps);
++++
++++            //Get very late values
++++            y = _mm_load_si128((__m128i*)VL_code_ptr);
++++
++++            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
++++
++++            VL_code_acc = _mm_add_ps (VL_code_acc, output_ps);
++++
++++            // Advance every input stream by the 8 complex samples just consumed.
++++            input_ptr += 8;
++++            carrier_ptr += 8;
++++            VE_code_ptr += 8;
++++            E_code_ptr += 8;
++++            P_code_ptr += 8;
++++            L_code_ptr += 8;
++++            VL_code_ptr += 8;
++++        }
++++
++++        // Each __m128 accumulator is spilled as two complex float partial sums; the arrays are
++++        // declared 16-byte aligned, as the aligned store below requires.
++++        __VOLK_ATTR_ALIGNED(16) lv_32fc_t VE_dotProductVector[2];
++++        __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2];
++++        __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2];
++++        __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2];
++++        __VOLK_ATTR_ALIGNED(16) lv_32fc_t VL_dotProductVector[2];
++++
++++        _mm_store_ps((float*)VE_dotProductVector,VE_code_acc); // Store the results back into the dot product vector
++++        _mm_store_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector
++++        _mm_store_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector
++++        _mm_store_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector
++++        _mm_store_ps((float*)VL_dotProductVector,VL_code_acc); // Store the results back into the dot product vector
++++
++++        // Fold the two partial sums of every accumulator into its output.
++++        for (int i = 0; i<2; ++i)
++++        {
++++            *VE_out_ptr += VE_dotProductVector[i];
++++            *E_out_ptr += E_dotProductVector[i];
++++            *P_out_ptr += P_dotProductVector[i];
++++            *L_out_ptr += L_dotProductVector[i];
++++            *VL_out_ptr += VL_dotProductVector[i];
++++        }
++++    }
++++
++++    // Scalar tail: the num_points % 8 samples that did not fill a whole vector.
++++    lv_8sc_t bb_signal_sample;
++++    for(int i=0; i < num_points%8; ++i)
++++    {
++++        //Perform the carrier wipe-off
++++        bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
++++        // Now get very early, early, prompt, late and very late values for each
++++        *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VE_code_ptr++));
++++        *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
++++        *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
++++        *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
++++        *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VL_code_ptr++));
++++    }
++++}
++++#endif /* LV_HAVE_SSE4_1 */
++++
++++#ifdef LV_HAVE_SSE2
++++#include "emmintrin.h"
++++#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
++++#include "CommonMacros/CommonMacros.h"
++++/*!
++++ \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation (aligned SSE2 version)
++++
++++ Same procedure as the unaligned SSE2 kernel, but the vector loop reads with
++++ _mm_load_si128, so input, carrier and the five code pointers must be 16-byte aligned.
++++ All five outputs are first set to zero, the input is consumed in groups of 8 complex
++++ samples (two float vectors per group are added to each accumulator), and the remaining
++++ num_points % 8 samples are processed one by one.
++++
++++ \param input The input signal input
++++ \param carrier The carrier signal input
++++ \param VE_code Very Early PRN code replica input
++++ \param E_code Early PRN code replica input
++++ \param P_code Prompt PRN code replica input
++++ \param L_code Late PRN code replica input
++++ \param VL_code Very Late PRN code replica input
++++ \param VE_out Very Early correlation output
++++ \param E_out Early correlation output
++++ \param P_out Prompt correlation output
++++ \param L_out Late correlation output
++++ \param VL_out Very Late correlation output
++++ \param num_points The number of complex values in vectors
++++ */
++++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_a_sse2(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
++++{
++++    // Number of full groups of 8 complex samples handled by the vector loop.
++++    const unsigned int sse_iters = num_points / 8;
++++
++++    // Scratch registers handed to the CM_* macros.
++++    // NOTE(review): `output` is not referenced in this function body; it may be used by name
++++    // inside the macros -- confirm before removing it.
++++    __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample;
++++    __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output;
++++
++++    __m128 VE_code_acc, E_code_acc, P_code_acc, L_code_acc, VL_code_acc;
++++    __m128i input_i_1, input_i_2, output_i32;
++++    __m128 output_ps_1, output_ps_2;
++++
++++    const lv_8sc_t* input_ptr = input;
++++    const lv_8sc_t* carrier_ptr = carrier;
++++
++++    const lv_8sc_t* VE_code_ptr = VE_code;
++++    lv_32fc_t* VE_out_ptr = VE_out;
++++    const lv_8sc_t* E_code_ptr = E_code;
++++    lv_32fc_t* E_out_ptr = E_out;
++++    const lv_8sc_t* P_code_ptr = P_code;
++++    lv_32fc_t* P_out_ptr = P_out;
++++    const lv_8sc_t* L_code_ptr = L_code;
++++    lv_32fc_t* L_out_ptr = L_out;
++++    const lv_8sc_t* VL_code_ptr = VL_code;
++++    lv_32fc_t* VL_out_ptr = VL_out;
++++
++++    // The outputs are overwritten, not accumulated into.
++++    *VE_out_ptr = 0;
++++    *E_out_ptr = 0;
++++    *P_out_ptr = 0;
++++    *L_out_ptr = 0;
++++    *VL_out_ptr = 0;
++++
++++    // Byte mask with 0xFF in every even-numbered byte and 0x00 in every odd-numbered one;
++++    // it is only passed on to the macros (presumably to split real and imaginary bytes -- confirm).
++++    mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
++++
++++    VE_code_acc = _mm_setzero_ps();
++++    E_code_acc = _mm_setzero_ps();
++++    P_code_acc = _mm_setzero_ps();
++++    L_code_acc = _mm_setzero_ps();
++++    VL_code_acc = _mm_setzero_ps();
++++
++++    if (sse_iters>0)
++++    {
++++        for(int number = 0;number < sse_iters; number++){
++++
++++            //Perform the carrier wipe-off
++++            x = _mm_load_si128((__m128i*)input_ptr);
++++            y = _mm_load_si128((__m128i*)carrier_ptr);
++++
++++            // NOTE(review): the macros invoked here carry "_U_" in their names although this is
++++            // the aligned kernel; they only receive registers, not pointers.
++++            CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx)
++++            CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)
++++
++++            CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
++++
++++            //Get very early values
++++            y = _mm_load_si128((__m128i*)VE_code_ptr);
++++
++++            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
++++
++++            VE_code_acc = _mm_add_ps (VE_code_acc, output_ps_1);
++++            VE_code_acc = _mm_add_ps (VE_code_acc, output_ps_2);
++++
++++            //Get early values
++++            y = _mm_load_si128((__m128i*)E_code_ptr);
++++
++++            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
++++
++++            E_code_acc = _mm_add_ps (E_code_acc, output_ps_1);
++++            E_code_acc = _mm_add_ps (E_code_acc, output_ps_2);
++++
++++            //Get prompt values
++++            y = _mm_load_si128((__m128i*)P_code_ptr);
++++
++++            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
++++
++++            P_code_acc = _mm_add_ps (P_code_acc, output_ps_1);
++++            P_code_acc = _mm_add_ps (P_code_acc, output_ps_2);
++++
++++            //Get late values
++++            y = _mm_load_si128((__m128i*)L_code_ptr);
++++
++++            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
++++
++++            L_code_acc = _mm_add_ps (L_code_acc, output_ps_1);
++++            L_code_acc = _mm_add_ps (L_code_acc, output_ps_2);
++++
++++            //Get very late values
++++            y = _mm_load_si128((__m128i*)VL_code_ptr);
++++
++++            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
++++
++++            VL_code_acc = _mm_add_ps (VL_code_acc, output_ps_1);
++++            VL_code_acc = _mm_add_ps (VL_code_acc, output_ps_2);
++++
++++            // Advance every input stream by the 8 complex samples just consumed.
++++            input_ptr += 8;
++++            carrier_ptr += 8;
++++            VE_code_ptr += 8;
++++            E_code_ptr += 8;
++++            P_code_ptr += 8;
++++            L_code_ptr += 8;
++++            VL_code_ptr += 8;
++++        }
++++
++++        // Each __m128 accumulator is spilled as two complex float partial sums; the arrays are
++++        // declared 16-byte aligned, as the aligned store below requires.
++++        __VOLK_ATTR_ALIGNED(16) lv_32fc_t VE_dotProductVector[2];
++++        __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2];
++++        __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2];
++++        __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2];
++++        __VOLK_ATTR_ALIGNED(16) lv_32fc_t VL_dotProductVector[2];
++++
++++        _mm_store_ps((float*)VE_dotProductVector,VE_code_acc); // Store the results back into the dot product vector
++++        _mm_store_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector
++++        _mm_store_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector
++++        _mm_store_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector
++++        _mm_store_ps((float*)VL_dotProductVector,VL_code_acc); // Store the results back into the dot product vector
++++
++++        // Fold the two partial sums of every accumulator into its output.
++++        for (int i = 0; i<2; ++i)
++++        {
++++            *VE_out_ptr += VE_dotProductVector[i];
++++            *E_out_ptr += E_dotProductVector[i];
++++            *P_out_ptr += P_dotProductVector[i];
++++            *L_out_ptr += L_dotProductVector[i];
++++            *VL_out_ptr += VL_dotProductVector[i];
++++        }
++++    }
++++
++++    // Scalar tail: the num_points % 8 samples that did not fill a whole vector.
++++    lv_8sc_t bb_signal_sample;
++++    for(int i=0; i < num_points%8; ++i)
++++    {
++++        //Perform the carrier wipe-off
++++        bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
++++        // Now get very early, early, prompt, late and very late values for each
++++        *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VE_code_ptr++));
++++        *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
++++        *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
++++        *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
++++        *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VL_code_ptr++));
++++    }
++++}
++++#endif /* LV_HAVE_SSE2 */
++++
++++#ifdef LV_HAVE_GENERIC
++++/*!
++++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation ++++ \param input The input signal input ++++ \param carrier The carrier signal input ++++ \param VE_code Very Early PRN code replica input ++++ \param E_code Early PRN code replica input ++++ \param P_code Prompt PRN code replica input ++++ \param L_code Late PRN code replica input ++++ \param VL_code Very Late PRN code replica input ++++ \param VE_out Very Early correlation output ++++ \param E_out Early correlation output ++++ \param P_out Prompt correlation output ++++ \param L_out Late correlation output ++++ \param VL_out Very Late correlation output ++++ \param num_points The number of complex values in vectors ++++ */ ++++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_a_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points) ++++{ ++++ lv_8sc_t bb_signal_sample; ++++ ++++ bb_signal_sample = lv_cmake(0, 0); ++++ ++++ *VE_out = 0; ++++ *E_out = 0; ++++ *P_out = 0; ++++ *L_out = 0; ++++ *VL_out = 0; ++++ // perform very early, Early, Prompt, Late and very late correlation ++++ for(int i=0; i < num_points; ++i) ++++ { ++++ //Perform the carrier wipe-off ++++ bb_signal_sample = input[i] * carrier[i]; ++++ ++++ *VE_out += (lv_32fc_t) (bb_signal_sample * VE_code[i]); ++++ *E_out += (lv_32fc_t) (bb_signal_sample * E_code[i]); ++++ *P_out += (lv_32fc_t) (bb_signal_sample * P_code[i]); ++++ *L_out += (lv_32fc_t) (bb_signal_sample * L_code[i]); ++++ *VL_out += (lv_32fc_t) (bb_signal_sample * VL_code[i]); ++++ } ++++} ++++ ++++#endif /* LV_HAVE_GENERIC */ ++++ ++++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_a_H */ +++\ No newline at end of file +++diff -rupN 
/Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5.h +++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5.h 1970-01-01 01:00:00.000000000 +0100 ++++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5.h 2014-10-15 01:55:08.000000000 +0200 +++@@ -0,0 +1,1520 @@ ++++/*! ++++ * \file volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5.h ++++ * \brief Volk protokernel: performs the carrier wipe-off mixing and the Very early, Early, Prompt, Late and very late correlation with 16 bits vectors using different methods: inside u_sse4_1_first there is one method, inside u_sse4_1_second there is another... This protokernel has been created to test the performance of different methods. ++++ * \authors
    ++++ *
  • Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com ++++ *
++++ * ++++ * Volk protokernel that performs the carrier wipe-off mixing and the ++++ * Very Early, Early, Prompt, Late and Very Late correlation with 16-bit vectors (8 bits the ++++ * real part and 8 bits the imaginary part), and accumulates the result ++++ * in 32-bit single-precision floating-point values, returning float32 values: ++++ * - The carrier wipe-off is done by multiplying the input signal by the ++++ * carrier (multiplication of 16-bit vectors). It returns the input ++++ * signal in base band (BB) ++++ * - Very Early values are calculated by multiplying the input signal in BB by the ++++ * very early code (multiplication of 16-bit vectors), accumulating the results into float32 values ++++ * - Early values are calculated by multiplying the input signal in BB by the ++++ * early code (multiplication of 16-bit vectors), accumulating the results into float32 values ++++ * - Prompt values are calculated by multiplying the input signal in BB by the ++++ * prompt code (multiplication of 16-bit vectors), accumulating the results into float32 values ++++ * - Late values are calculated by multiplying the input signal in BB by the ++++ * late code (multiplication of 16-bit vectors), accumulating the results into float32 values ++++ * - Very Late values are calculated by multiplying the input signal in BB by the ++++ * very late code (multiplication of 16-bit vectors), accumulating the results into float32 values ++++ * ++++ * ------------------------------------------------------------------------- ++++ * ++++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) ++++ * ++++ * GNSS-SDR is a software defined Global Navigation ++++ * Satellite Systems receiver ++++ * ++++ * This file is part of GNSS-SDR. 
++++ * ++++ * GNSS-SDR is free software: you can redistribute it and/or modify ++++ * it under the terms of the GNU General Public License as published by ++++ * the Free Software Foundation, either version 3 of the License, or ++++ * (at your option) any later version. ++++ * ++++ * GNSS-SDR is distributed in the hope that it will be useful, ++++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++++ * GNU General Public License for more details. ++++ * ++++ * You should have received a copy of the GNU General Public License ++++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>. ++++ * ++++ * ------------------------------------------------------------------------- ++++ */ ++++ ++++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_u_H ++++#define INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_u_H ++++ ++++#include ++++#include ++++#include ++++#include ++++#include ++++ ++++#ifdef LV_HAVE_SSE4_1 ++++#include "smmintrin.h" ++++#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h" ++++#include "CommonMacros/CommonMacros.h" ++++/*! 
++++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation ++++ \param input The input signal input ++++ \param carrier The carrier signal input ++++ \param VE_code Very Early PRN code replica input ++++ \param E_code Early PRN code replica input ++++ \param P_code Prompt PRN code replica input ++++ \param L_code Late PRN code replica input ++++ \param VL_code Very Late PRN code replica input ++++ \param VE_out Very Early correlation output ++++ \param E_out Early correlation output ++++ \param P_out Prompt correlation output ++++ \param L_out Late correlation output ++++ \param VL_out Very Late correlation output ++++ \param num_points The number of complex values in vectors ++++ */ ++++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_u_sse4_1_first(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points) ++++{ ++++ const unsigned int sse_iters = num_points / 8; ++++ ++++ __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample; ++++ __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output; ++++ ++++ __m128 VE_code_acc, E_code_acc, P_code_acc, L_code_acc, VL_code_acc; ++++ __m128i input_i_1, input_i_2, output_i32; ++++ __m128 output_ps_1, output_ps_2; ++++ ++++ const lv_8sc_t* input_ptr = input; ++++ const lv_8sc_t* carrier_ptr = carrier; ++++ ++++ const lv_8sc_t* VE_code_ptr = VE_code; ++++ lv_32fc_t* VE_out_ptr = VE_out; ++++ const lv_8sc_t* E_code_ptr = E_code; ++++ lv_32fc_t* E_out_ptr = E_out; ++++ const lv_8sc_t* P_code_ptr = P_code; ++++ lv_32fc_t* P_out_ptr = P_out; ++++ const lv_8sc_t* L_code_ptr = L_code; ++++ lv_32fc_t* L_out_ptr = L_out; ++++ const lv_8sc_t* VL_code_ptr = VL_code; ++++ 
lv_32fc_t* VL_out_ptr = VL_out; ++++ ++++ *VE_out_ptr = 0; ++++ *E_out_ptr = 0; ++++ *P_out_ptr = 0; ++++ *L_out_ptr = 0; ++++ *VL_out_ptr = 0; ++++ ++++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); ++++ ++++ VE_code_acc = _mm_setzero_ps(); ++++ E_code_acc = _mm_setzero_ps(); ++++ P_code_acc = _mm_setzero_ps(); ++++ L_code_acc = _mm_setzero_ps(); ++++ VL_code_acc = _mm_setzero_ps(); ++++ ++++ if (sse_iters>0) ++++ { ++++ for(int number = 0;number < sse_iters; number++){ ++++ ++++ //Perform the carrier wipe-off ++++ x = _mm_lddqu_si128((__m128i*)input_ptr); ++++ y = _mm_lddqu_si128((__m128i*)carrier_ptr); ++++ ++++ imagx = _mm_srli_si128 (x, 1); ++++ imagx = _mm_and_si128 (imagx, mult1); ++++ realx = _mm_and_si128 (x, mult1); ++++ ++++ imagy = _mm_srli_si128 (y, 1); ++++ imagy = _mm_and_si128 (imagy, mult1); ++++ realy = _mm_and_si128 (y, mult1); ++++ ++++ realx_mult_realy = _mm_mullo_epi16 (realx, realy); ++++ imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy); ++++ realx_mult_imagy = _mm_mullo_epi16 (realx, imagy); ++++ imagx_mult_realy = _mm_mullo_epi16 (imagx, realy); ++++ ++++ real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++++ imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++++ ++++ //Get very early values ++++ y = _mm_lddqu_si128((__m128i*)VE_code_ptr); ++++ ++++ imagy = _mm_srli_si128 (y, 1); ++++ imagy = _mm_and_si128 (imagy, mult1); ++++ realy = _mm_and_si128 (y, mult1); ++++ ++++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); ++++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); ++++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); ++++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); ++++ ++++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++++ ++++ imag_output = _mm_slli_si128 (imag_output, 1); 
++++ output = _mm_blendv_epi8 (imag_output, real_output, mult1); ++++ ++++ input_i_1 = _mm_cvtepi8_epi32(output); ++++ output = _mm_srli_si128 (output, 4); ++++ input_i_2 = _mm_cvtepi8_epi32(output); ++++ output = _mm_srli_si128 (output, 4); ++++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2); ++++ output_ps_1 = _mm_cvtepi32_ps(output_i32); ++++ ++++ input_i_1 = _mm_cvtepi8_epi32(output); ++++ output = _mm_srli_si128 (output, 4); ++++ input_i_2 = _mm_cvtepi8_epi32(output); ++++ output = _mm_srli_si128 (output, 4); ++++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2); ++++ output_ps_2 = _mm_cvtepi32_ps(output_i32); ++++ ++++ VE_code_acc = _mm_add_ps (VE_code_acc, output_ps_1); ++++ VE_code_acc = _mm_add_ps (VE_code_acc, output_ps_2); ++++ ++++ //Get early values ++++ y = _mm_lddqu_si128((__m128i*)E_code_ptr); ++++ ++++ imagy = _mm_srli_si128 (y, 1); ++++ imagy = _mm_and_si128 (imagy, mult1); ++++ realy = _mm_and_si128 (y, mult1); ++++ ++++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); ++++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); ++++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); ++++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); ++++ ++++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++++ ++++ imag_output = _mm_slli_si128 (imag_output, 1); ++++ output = _mm_blendv_epi8 (imag_output, real_output, mult1); ++++ ++++ input_i_1 = _mm_cvtepi8_epi32(output); ++++ output = _mm_srli_si128 (output, 4); ++++ input_i_2 = _mm_cvtepi8_epi32(output); ++++ output = _mm_srli_si128 (output, 4); ++++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2); ++++ output_ps_1 = _mm_cvtepi32_ps(output_i32); ++++ ++++ input_i_1 = _mm_cvtepi8_epi32(output); ++++ output = _mm_srli_si128 (output, 4); ++++ input_i_2 = _mm_cvtepi8_epi32(output); ++++ output = _mm_srli_si128 (output, 4); ++++ output_i32 = _mm_add_epi32 
(input_i_1, input_i_2); ++++ output_ps_2 = _mm_cvtepi32_ps(output_i32); ++++ ++++ E_code_acc = _mm_add_ps (E_code_acc, output_ps_1); ++++ E_code_acc = _mm_add_ps (E_code_acc, output_ps_2); ++++ ++++ //Get prompt values ++++ y = _mm_lddqu_si128((__m128i*)P_code_ptr); ++++ ++++ imagy = _mm_srli_si128 (y, 1); ++++ imagy = _mm_and_si128 (imagy, mult1); ++++ realy = _mm_and_si128 (y, mult1); ++++ ++++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); ++++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); ++++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); ++++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); ++++ ++++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++++ ++++ imag_output = _mm_slli_si128 (imag_output, 1); ++++ output = _mm_blendv_epi8 (imag_output, real_output, mult1); ++++ ++++ input_i_1 = _mm_cvtepi8_epi32(output); ++++ output = _mm_srli_si128 (output, 4); ++++ input_i_2 = _mm_cvtepi8_epi32(output); ++++ output = _mm_srli_si128 (output, 4); ++++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2); ++++ output_ps_1 = _mm_cvtepi32_ps(output_i32); ++++ ++++ input_i_1 = _mm_cvtepi8_epi32(output); ++++ output = _mm_srli_si128 (output, 4); ++++ input_i_2 = _mm_cvtepi8_epi32(output); ++++ output = _mm_srli_si128 (output, 4); ++++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2); ++++ output_ps_2 = _mm_cvtepi32_ps(output_i32); ++++ ++++ P_code_acc = _mm_add_ps (P_code_acc, output_ps_1); ++++ P_code_acc = _mm_add_ps (P_code_acc, output_ps_2); ++++ ++++ //Get late values ++++ y = _mm_lddqu_si128((__m128i*)L_code_ptr); ++++ ++++ imagy = _mm_srli_si128 (y, 1); ++++ imagy = _mm_and_si128 (imagy, mult1); ++++ realy = _mm_and_si128 (y, mult1); ++++ ++++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); ++++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); ++++ 
realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); ++++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); ++++ ++++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++++ ++++ imag_output = _mm_slli_si128 (imag_output, 1); ++++ output = _mm_blendv_epi8 (imag_output, real_output, mult1); ++++ ++++ input_i_1 = _mm_cvtepi8_epi32(output); ++++ output = _mm_srli_si128 (output, 4); ++++ input_i_2 = _mm_cvtepi8_epi32(output); ++++ output = _mm_srli_si128 (output, 4); ++++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2); ++++ output_ps_1 = _mm_cvtepi32_ps(output_i32); ++++ ++++ input_i_1 = _mm_cvtepi8_epi32(output); ++++ output = _mm_srli_si128 (output, 4); ++++ input_i_2 = _mm_cvtepi8_epi32(output); ++++ output = _mm_srli_si128 (output, 4); ++++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2); ++++ output_ps_2 = _mm_cvtepi32_ps(output_i32); ++++ ++++ L_code_acc = _mm_add_ps (L_code_acc, output_ps_1); ++++ L_code_acc = _mm_add_ps (L_code_acc, output_ps_2); ++++ ++++ //Get very late values ++++ y = _mm_lddqu_si128((__m128i*)VL_code_ptr); ++++ ++++ imagy = _mm_srli_si128 (y, 1); ++++ imagy = _mm_and_si128 (imagy, mult1); ++++ realy = _mm_and_si128 (y, mult1); ++++ ++++ realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy); ++++ imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy); ++++ realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy); ++++ imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy); ++++ ++++ real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy); ++++ imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy); ++++ ++++ imag_output = _mm_slli_si128 (imag_output, 1); ++++ output = _mm_blendv_epi8 (imag_output, real_output, mult1); ++++ ++++ input_i_1 = _mm_cvtepi8_epi32(output); ++++ output = _mm_srli_si128 (output, 4); ++++ input_i_2 = _mm_cvtepi8_epi32(output); ++++ 
output = _mm_srli_si128 (output, 4); ++++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2); ++++ output_ps_1 = _mm_cvtepi32_ps(output_i32); ++++ ++++ input_i_1 = _mm_cvtepi8_epi32(output); ++++ output = _mm_srli_si128 (output, 4); ++++ input_i_2 = _mm_cvtepi8_epi32(output); ++++ output = _mm_srli_si128 (output, 4); ++++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2); ++++ output_ps_2 = _mm_cvtepi32_ps(output_i32); ++++ ++++ VL_code_acc = _mm_add_ps (VL_code_acc, output_ps_1); ++++ VL_code_acc = _mm_add_ps (VL_code_acc, output_ps_2); ++++ ++++ input_ptr += 8; ++++ carrier_ptr += 8; ++++ VE_code_ptr += 8; ++++ E_code_ptr += 8; ++++ P_code_ptr += 8; ++++ L_code_ptr += 8; ++++ VL_code_ptr += 8; ++++ } ++++ ++++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t VE_dotProductVector[2]; ++++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2]; ++++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2]; ++++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2]; ++++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t VL_dotProductVector[2]; ++++ ++++ _mm_storeu_ps((float*)VE_dotProductVector,VE_code_acc); // Store the results back into the dot product vector ++++ _mm_storeu_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector ++++ _mm_storeu_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector ++++ _mm_storeu_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector ++++ _mm_storeu_ps((float*)VL_dotProductVector,VL_code_acc); // Store the results back into the dot product vector ++++ ++++ for (int i = 0; i<2; ++i) ++++ { ++++ *VE_out_ptr += VE_dotProductVector[i]; ++++ *E_out_ptr += E_dotProductVector[i]; ++++ *P_out_ptr += P_dotProductVector[i]; ++++ *L_out_ptr += L_dotProductVector[i]; ++++ *VL_out_ptr += VL_dotProductVector[i]; ++++ } ++++ } ++++ ++++ lv_8sc_t bb_signal_sample; ++++ for(int i=0; i < num_points%8; ++i) ++++ { ++++ //Perform the carrier wipe-off ++++ 
bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); ++++ // Now get very early, early, prompt, late and very late values for each ++++ *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VE_code_ptr++)); ++++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); ++++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); ++++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); ++++ *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VL_code_ptr++)); ++++ } ++++} ++++#endif /* LV_HAVE_SSE4_1 */ ++++ ++++#ifdef LV_HAVE_SSE4_1 ++++#include "smmintrin.h" ++++#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h" ++++#include "CommonMacros/CommonMacros.h" ++++/*! ++++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation ++++ \param input The input signal input ++++ \param carrier The carrier signal input ++++ \param VE_code Very Early PRN code replica input ++++ \param E_code Early PRN code replica input ++++ \param P_code Prompt PRN code replica input ++++ \param L_code Late PRN code replica input ++++ \param VL_code Very Late PRN code replica input ++++ \param VE_out Very Early correlation output ++++ \param E_out Early correlation output ++++ \param P_out Prompt correlation output ++++ \param L_out Late correlation output ++++ \param VL_out Very Late correlation output ++++ \param num_points The number of complex values in vectors ++++ */ ++++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_u_sse4_1_second(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points) ++++{ ++++ const unsigned int sse_iters = num_points / 8; ++++ ++++ __m128i x, x_abs, y, y_aux, bb_signal_sample_aux, bb_signal_sample_aux_abs;; ++++ __m128i mult1, output, real_output, imag_output; 
++++ ++++ __m128 VE_code_acc, E_code_acc, P_code_acc, L_code_acc, VL_code_acc; ++++ __m128i input_i_1, input_i_2, output_i32; ++++ __m128 output_ps_1, output_ps_2; ++++ ++++ __m128i check_sign_sequence = _mm_set_epi8 (255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1); ++++ ++++ const lv_8sc_t* input_ptr = input; ++++ const lv_8sc_t* carrier_ptr = carrier; ++++ ++++ const lv_8sc_t* VE_code_ptr = VE_code; ++++ lv_32fc_t* VE_out_ptr = VE_out; ++++ const lv_8sc_t* E_code_ptr = E_code; ++++ lv_32fc_t* E_out_ptr = E_out; ++++ const lv_8sc_t* P_code_ptr = P_code; ++++ lv_32fc_t* P_out_ptr = P_out; ++++ const lv_8sc_t* L_code_ptr = L_code; ++++ lv_32fc_t* L_out_ptr = L_out; ++++ const lv_8sc_t* VL_code_ptr = VL_code; ++++ lv_32fc_t* VL_out_ptr = VL_out; ++++ ++++ *VE_out_ptr = 0; ++++ *E_out_ptr = 0; ++++ *P_out_ptr = 0; ++++ *L_out_ptr = 0; ++++ *VL_out_ptr = 0; ++++ ++++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); ++++ ++++ VE_code_acc = _mm_setzero_ps(); ++++ E_code_acc = _mm_setzero_ps(); ++++ P_code_acc = _mm_setzero_ps(); ++++ L_code_acc = _mm_setzero_ps(); ++++ VL_code_acc = _mm_setzero_ps(); ++++ ++++ if (sse_iters>0) ++++ { ++++ for(int number = 0;number < sse_iters; number++){ ++++ ++++ //Perform the carrier wipe-off ++++ x = _mm_lddqu_si128((__m128i*)input_ptr); ++++ y = _mm_lddqu_si128((__m128i*)carrier_ptr); ++++ ++++ x_abs = _mm_abs_epi8 (x); ++++ ++++ y_aux = _mm_sign_epi8 (y, x); ++++ y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); ++++ real_output = _mm_maddubs_epi16 (x_abs, y_aux); ++++ ++++ y_aux = _mm_shuffle_epi8 (y, _mm_set_epi8 (14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1)); ++++ y_aux = _mm_sign_epi8 (y_aux, x); ++++ imag_output = _mm_maddubs_epi16 (x_abs, y_aux); ++++ ++++ imag_output = _mm_slli_si128 (imag_output, 1); ++++ bb_signal_sample_aux = _mm_blendv_epi8 (imag_output, real_output, mult1); ++++ ++++ bb_signal_sample_aux_abs = _mm_abs_epi8 (bb_signal_sample_aux); ++++ ++++ 
//Get very early values ++++ y = _mm_lddqu_si128((__m128i*)VE_code_ptr); ++++ ++++ y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux); ++++ y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); ++++ real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); ++++ ++++ y_aux = _mm_shuffle_epi8 (y, _mm_set_epi8 (14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1)); ++++ y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux); ++++ imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); ++++ ++++ imag_output = _mm_slli_si128 (imag_output, 1); ++++ output = _mm_blendv_epi8 (imag_output, real_output, mult1); ++++ ++++ input_i_1 = _mm_cvtepi8_epi32(output); ++++ output = _mm_srli_si128 (output, 4); ++++ input_i_2 = _mm_cvtepi8_epi32(output); ++++ output = _mm_srli_si128 (output, 4); ++++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2); ++++ output_ps_1 = _mm_cvtepi32_ps(output_i32); ++++ ++++ input_i_1 = _mm_cvtepi8_epi32(output); ++++ output = _mm_srli_si128 (output, 4); ++++ input_i_2 = _mm_cvtepi8_epi32(output); ++++ output = _mm_srli_si128 (output, 4); ++++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2); ++++ output_ps_2 = _mm_cvtepi32_ps(output_i32); ++++ ++++ VE_code_acc = _mm_add_ps (VE_code_acc, output_ps_1); ++++ VE_code_acc = _mm_add_ps (VE_code_acc, output_ps_2); ++++ ++++ //Get early values ++++ y = _mm_lddqu_si128((__m128i*)E_code_ptr); ++++ ++++ y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux); ++++ y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); ++++ real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); ++++ ++++ y_aux = _mm_shuffle_epi8 (y, _mm_set_epi8 (14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1)); ++++ y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux); ++++ imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); ++++ ++++ imag_output = _mm_slli_si128 (imag_output, 1); ++++ output = _mm_blendv_epi8 (imag_output, real_output, mult1); ++++ ++++ input_i_1 = _mm_cvtepi8_epi32(output); ++++ output = 
_mm_srli_si128 (output, 4); ++++ input_i_2 = _mm_cvtepi8_epi32(output); ++++ output = _mm_srli_si128 (output, 4); ++++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2); ++++ output_ps_1 = _mm_cvtepi32_ps(output_i32); ++++ ++++ input_i_1 = _mm_cvtepi8_epi32(output); ++++ output = _mm_srli_si128 (output, 4); ++++ input_i_2 = _mm_cvtepi8_epi32(output); ++++ output = _mm_srli_si128 (output, 4); ++++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2); ++++ output_ps_2 = _mm_cvtepi32_ps(output_i32); ++++ ++++ E_code_acc = _mm_add_ps (E_code_acc, output_ps_1); ++++ E_code_acc = _mm_add_ps (E_code_acc, output_ps_2); ++++ ++++ //Get prompt values ++++ y = _mm_lddqu_si128((__m128i*)P_code_ptr); ++++ ++++ y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux); ++++ y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); ++++ real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); ++++ ++++ y_aux = _mm_shuffle_epi8 (y, _mm_set_epi8 (14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1)); ++++ y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux); ++++ imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); ++++ ++++ imag_output = _mm_slli_si128 (imag_output, 1); ++++ output = _mm_blendv_epi8 (imag_output, real_output, mult1); ++++ ++++ input_i_1 = _mm_cvtepi8_epi32(output); ++++ output = _mm_srli_si128 (output, 4); ++++ input_i_2 = _mm_cvtepi8_epi32(output); ++++ output = _mm_srli_si128 (output, 4); ++++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2); ++++ output_ps_1 = _mm_cvtepi32_ps(output_i32); ++++ ++++ input_i_1 = _mm_cvtepi8_epi32(output); ++++ output = _mm_srli_si128 (output, 4); ++++ input_i_2 = _mm_cvtepi8_epi32(output); ++++ output = _mm_srli_si128 (output, 4); ++++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2); ++++ output_ps_2 = _mm_cvtepi32_ps(output_i32); ++++ ++++ P_code_acc = _mm_add_ps (P_code_acc, output_ps_1); ++++ P_code_acc = _mm_add_ps (P_code_acc, output_ps_2); ++++ ++++ //Get late values ++++ y = _mm_lddqu_si128((__m128i*)L_code_ptr); 
++++ ++++ y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux); ++++ y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); ++++ real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); ++++ ++++ y_aux = _mm_shuffle_epi8 (y, _mm_set_epi8 (14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1)); ++++ y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux); ++++ imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); ++++ ++++ imag_output = _mm_slli_si128 (imag_output, 1); ++++ output = _mm_blendv_epi8 (imag_output, real_output, mult1); ++++ ++++ input_i_1 = _mm_cvtepi8_epi32(output); ++++ output = _mm_srli_si128 (output, 4); ++++ input_i_2 = _mm_cvtepi8_epi32(output); ++++ output = _mm_srli_si128 (output, 4); ++++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2); ++++ output_ps_1 = _mm_cvtepi32_ps(output_i32); ++++ ++++ input_i_1 = _mm_cvtepi8_epi32(output); ++++ output = _mm_srli_si128 (output, 4); ++++ input_i_2 = _mm_cvtepi8_epi32(output); ++++ output = _mm_srli_si128 (output, 4); ++++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2); ++++ output_ps_2 = _mm_cvtepi32_ps(output_i32); ++++ ++++ L_code_acc = _mm_add_ps (L_code_acc, output_ps_1); ++++ L_code_acc = _mm_add_ps (L_code_acc, output_ps_2); ++++ ++++ //Get very late values ++++ y = _mm_lddqu_si128((__m128i*)VL_code_ptr); ++++ ++++ y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux); ++++ y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); ++++ real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); ++++ ++++ y_aux = _mm_shuffle_epi8 (y, _mm_set_epi8 (14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1)); ++++ y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux); ++++ imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); ++++ ++++ imag_output = _mm_slli_si128 (imag_output, 1); ++++ output = _mm_blendv_epi8 (imag_output, real_output, mult1); ++++ ++++ input_i_1 = _mm_cvtepi8_epi32(output); ++++ output = _mm_srli_si128 (output, 4); ++++ input_i_2 = _mm_cvtepi8_epi32(output); ++++ 
output = _mm_srli_si128 (output, 4); ++++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2); ++++ output_ps_1 = _mm_cvtepi32_ps(output_i32); ++++ ++++ input_i_1 = _mm_cvtepi8_epi32(output); ++++ output = _mm_srli_si128 (output, 4); ++++ input_i_2 = _mm_cvtepi8_epi32(output); ++++ output = _mm_srli_si128 (output, 4); ++++ output_i32 = _mm_add_epi32 (input_i_1, input_i_2); ++++ output_ps_2 = _mm_cvtepi32_ps(output_i32); ++++ ++++ VL_code_acc = _mm_add_ps (VL_code_acc, output_ps_1); ++++ VL_code_acc = _mm_add_ps (VL_code_acc, output_ps_2); ++++ ++++ input_ptr += 8; ++++ carrier_ptr += 8; ++++ VE_code_ptr += 8; ++++ E_code_ptr += 8; ++++ P_code_ptr += 8; ++++ L_code_ptr += 8; ++++ VL_code_ptr += 8; ++++ } ++++ ++++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t VE_dotProductVector[2]; ++++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2]; ++++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2]; ++++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2]; ++++ __VOLK_ATTR_ALIGNED(16) lv_32fc_t VL_dotProductVector[2]; ++++ ++++ _mm_storeu_ps((float*)VE_dotProductVector,VE_code_acc); // Store the results back into the dot product vector ++++ _mm_storeu_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector ++++ _mm_storeu_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector ++++ _mm_storeu_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector ++++ _mm_storeu_ps((float*)VL_dotProductVector,VL_code_acc); // Store the results back into the dot product vector ++++ ++++ for (int i = 0; i<2; ++i) ++++ { ++++ *VE_out_ptr += VE_dotProductVector[i]; ++++ *E_out_ptr += E_dotProductVector[i]; ++++ *P_out_ptr += P_dotProductVector[i]; ++++ *L_out_ptr += L_dotProductVector[i]; ++++ *VL_out_ptr += VL_dotProductVector[i]; ++++ } ++++ } ++++ ++++ lv_8sc_t bb_signal_sample; ++++ for(int i=0; i < num_points%8; ++i) ++++ { ++++ //Perform the carrier wipe-off ++++ 
bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); ++++ // Now get very early, early, prompt, late and very late values for each ++++ *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VE_code_ptr++)); ++++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++)); ++++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++)); ++++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++)); ++++ *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VL_code_ptr++)); ++++ } ++++} ++++#endif /* LV_HAVE_SSE4_1 */ ++++ ++++#ifdef LV_HAVE_SSE4_1 ++++#include "smmintrin.h" ++++#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h" ++++#include "CommonMacros/CommonMacros.h" ++++/*! ++++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation ++++ \param input The input signal input ++++ \param carrier The carrier signal input ++++ \param VE_code Very Early PRN code replica input ++++ \param E_code Early PRN code replica input ++++ \param P_code Prompt PRN code replica input ++++ \param L_code Late PRN code replica input ++++ \param VL_code Very Late PRN code replica input ++++ \param VE_out Very Early correlation output ++++ \param E_out Early correlation output ++++ \param P_out Prompt correlation output ++++ \param L_out Late correlation output ++++ \param VL_out Very Late correlation output ++++ \param num_points The number of complex values in vectors ++++ */ ++++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_u_sse4_1_third(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points) ++++{ ++++ const unsigned int sse_iters = num_points / 8; ++++ ++++ __m128i x, x_abs, y, y_aux, bb_signal_sample_aux, bb_signal_sample_aux_abs;; ++++ __m128i mult1, real_output, imag_output; ++++ ++++ 
__m128 real_VE_code_acc, imag_VE_code_acc, real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc, real_VL_code_acc, imag_VL_code_acc; ++++ __m128i real_output_i_1, real_output_i_2, imag_output_i_1, imag_output_i_2, real_output_i32, imag_output_i32; ++++ __m128 real_output_ps, imag_output_ps; ++++ ++++ __m128i check_sign_sequence = _mm_set_epi8 (255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1); ++++ __m128i rearrange_sequence = _mm_set_epi8 (14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1); ++++ ++++ const lv_8sc_t* input_ptr = input; ++++ const lv_8sc_t* carrier_ptr = carrier; ++++ ++++ const lv_8sc_t* VE_code_ptr = VE_code; ++++ lv_32fc_t* VE_out_ptr = VE_out; ++++ const lv_8sc_t* E_code_ptr = E_code; ++++ lv_32fc_t* E_out_ptr = E_out; ++++ const lv_8sc_t* P_code_ptr = P_code; ++++ lv_32fc_t* P_out_ptr = P_out; ++++ const lv_8sc_t* L_code_ptr = L_code; ++++ lv_32fc_t* L_out_ptr = L_out; ++++ const lv_8sc_t* VL_code_ptr = VL_code; ++++ lv_32fc_t* VL_out_ptr = VL_out; ++++ ++++ float VE_out_real = 0; ++++ float VE_out_imag = 0; ++++ float E_out_real = 0; ++++ float E_out_imag = 0; ++++ float P_out_real = 0; ++++ float P_out_imag = 0; ++++ float L_out_real = 0; ++++ float L_out_imag = 0; ++++ float VL_out_real = 0; ++++ float VL_out_imag = 0; ++++ ++++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); ++++ ++++ real_VE_code_acc = _mm_setzero_ps(); ++++ imag_VE_code_acc = _mm_setzero_ps(); ++++ real_E_code_acc = _mm_setzero_ps(); ++++ imag_E_code_acc = _mm_setzero_ps(); ++++ real_P_code_acc = _mm_setzero_ps(); ++++ imag_P_code_acc = _mm_setzero_ps(); ++++ real_L_code_acc = _mm_setzero_ps(); ++++ imag_L_code_acc = _mm_setzero_ps(); ++++ real_VL_code_acc = _mm_setzero_ps(); ++++ imag_VL_code_acc = _mm_setzero_ps(); ++++ ++++ if (sse_iters>0) ++++ { ++++ for(int number = 0;number < sse_iters; number++){ ++++ ++++ //Perform the carrier wipe-off ++++ x = 
_mm_lddqu_si128((__m128i*)input_ptr); ++++ y = _mm_lddqu_si128((__m128i*)carrier_ptr); ++++ ++++ x_abs = _mm_abs_epi8 (x); ++++ ++++ y_aux = _mm_sign_epi8 (y, x); ++++ y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); ++++ real_output = _mm_maddubs_epi16 (x_abs, y_aux); ++++ ++++ y_aux = _mm_shuffle_epi8 (y, rearrange_sequence); ++++ y_aux = _mm_sign_epi8 (y_aux, x); ++++ imag_output = _mm_maddubs_epi16 (x_abs, y_aux); ++++ ++++ imag_output = _mm_slli_si128 (imag_output, 1); ++++ bb_signal_sample_aux = _mm_blendv_epi8 (imag_output, real_output, mult1); ++++ bb_signal_sample_aux_abs = _mm_abs_epi8 (bb_signal_sample_aux); ++++ ++++ //Get very early values ++++ y = _mm_lddqu_si128((__m128i*)VE_code_ptr); ++++ ++++ y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux); ++++ y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); ++++ real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); ++++ ++++ y_aux = _mm_shuffle_epi8 (y, rearrange_sequence); ++++ y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux); ++++ imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); ++++ ++++ real_output_i_1 = _mm_cvtepi16_epi32(real_output); ++++ real_output = _mm_srli_si128 (real_output, 8); ++++ real_output_i_2 = _mm_cvtepi16_epi32(real_output); ++++ real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); ++++ real_output_ps = _mm_cvtepi32_ps(real_output_i32); ++++ ++++ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); ++++ imag_output = _mm_srli_si128 (imag_output, 8); ++++ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); ++++ imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); ++++ imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); ++++ ++++ real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps); ++++ imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps); ++++ ++++ //Get early values ++++ y = _mm_lddqu_si128((__m128i*)E_code_ptr); ++++ ++++ y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux); ++++ y_aux = _mm_sign_epi8 
(y_aux, check_sign_sequence); ++++ real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); ++++ ++++ y_aux = _mm_shuffle_epi8 (y, rearrange_sequence); ++++ y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux); ++++ imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); ++++ ++++ real_output_i_1 = _mm_cvtepi16_epi32(real_output); ++++ real_output = _mm_srli_si128 (real_output, 8); ++++ real_output_i_2 = _mm_cvtepi16_epi32(real_output); ++++ real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); ++++ real_output_ps = _mm_cvtepi32_ps(real_output_i32); ++++ ++++ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); ++++ imag_output = _mm_srli_si128 (imag_output, 8); ++++ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); ++++ imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); ++++ imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); ++++ ++++ real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps); ++++ imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps); ++++ ++++ //Get prompt values ++++ y = _mm_lddqu_si128((__m128i*)P_code_ptr); ++++ ++++ y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux); ++++ y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); ++++ real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); ++++ ++++ y_aux = _mm_shuffle_epi8 (y, rearrange_sequence); ++++ y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux); ++++ imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); ++++ ++++ real_output_i_1 = _mm_cvtepi16_epi32(real_output); ++++ real_output = _mm_srli_si128 (real_output, 8); ++++ real_output_i_2 = _mm_cvtepi16_epi32(real_output); ++++ real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); ++++ real_output_ps = _mm_cvtepi32_ps(real_output_i32); ++++ ++++ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); ++++ imag_output = _mm_srli_si128 (imag_output, 8); ++++ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); ++++ imag_output_i32 = _mm_add_epi32 
(imag_output_i_1, imag_output_i_2); ++++ imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); ++++ ++++ real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps); ++++ imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps); ++++ ++++ //Get late values ++++ y = _mm_lddqu_si128((__m128i*)L_code_ptr); ++++ ++++ y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux); ++++ y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); ++++ real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); ++++ ++++ y_aux = _mm_shuffle_epi8 (y, rearrange_sequence); ++++ y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux); ++++ imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); ++++ ++++ real_output_i_1 = _mm_cvtepi16_epi32(real_output); ++++ real_output = _mm_srli_si128 (real_output, 8); ++++ real_output_i_2 = _mm_cvtepi16_epi32(real_output); ++++ real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); ++++ real_output_ps = _mm_cvtepi32_ps(real_output_i32); ++++ ++++ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); ++++ imag_output = _mm_srli_si128 (imag_output, 8); ++++ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); ++++ imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); ++++ imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); ++++ ++++ real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps); ++++ imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps); ++++ ++++ //Get very late values ++++ y = _mm_lddqu_si128((__m128i*)VL_code_ptr); ++++ ++++ y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux); ++++ y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); ++++ real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); ++++ ++++ y_aux = _mm_shuffle_epi8 (y, _mm_set_epi8 (14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1)); ++++ y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux); ++++ imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); ++++ ++++ real_output_i_1 = _mm_cvtepi16_epi32(real_output); 
++++ real_output = _mm_srli_si128 (real_output, 8); ++++ real_output_i_2 = _mm_cvtepi16_epi32(real_output); ++++ real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); ++++ real_output_ps = _mm_cvtepi32_ps(real_output_i32); ++++ ++++ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); ++++ imag_output = _mm_srli_si128 (imag_output, 8); ++++ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); ++++ imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); ++++ imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); ++++ ++++ real_VL_code_acc = _mm_add_ps (real_VL_code_acc, real_output_ps); ++++ imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps); ++++ ++++ input_ptr += 8; ++++ carrier_ptr += 8; ++++ VE_code_ptr += 8; ++++ E_code_ptr += 8; ++++ P_code_ptr += 8; ++++ L_code_ptr += 8; ++++ VL_code_ptr += 8; ++++ } ++++ ++++ __VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4]; ++++ ++++ _mm_storeu_ps((float*)real_VE_dotProductVector,real_VE_code_acc); // Store the results back into the dot product vector ++++ _mm_storeu_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc); // Store the results back into the dot product vector ++++ _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector ++++ _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector ++++ 
_mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector ++++ _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector ++++ _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector ++++ _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector ++++ _mm_storeu_ps((float*)real_VL_dotProductVector,real_VL_code_acc); // Store the results back into the dot product vector ++++ _mm_storeu_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc); // Store the results back into the dot product vector ++++ ++++ for (int i = 0; i<4; ++i) ++++ { ++++ VE_out_real += real_VE_dotProductVector[i]; ++++ VE_out_imag += imag_VE_dotProductVector[i]; ++++ E_out_real += real_E_dotProductVector[i]; ++++ E_out_imag += imag_E_dotProductVector[i]; ++++ P_out_real += real_P_dotProductVector[i]; ++++ P_out_imag += imag_P_dotProductVector[i]; ++++ L_out_real += real_L_dotProductVector[i]; ++++ L_out_imag += imag_L_dotProductVector[i]; ++++ VL_out_real += real_VL_dotProductVector[i]; ++++ VL_out_imag += imag_VL_dotProductVector[i]; ++++ } ++++ *VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag); ++++ *E_out_ptr = lv_cmake(E_out_real, E_out_imag); ++++ *P_out_ptr = lv_cmake(P_out_real, P_out_imag); ++++ *L_out_ptr = lv_cmake(L_out_real, L_out_imag); ++++ *VL_out_ptr = lv_cmake(VL_out_real, VL_out_imag); ++++ } ++++ ++++ lv_16sc_t bb_signal_sample; ++++ for(int i=0; i < num_points%8; ++i) ++++ { ++++ //Perform the carrier wipe-off ++++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); ++++ // Now get very early, early, prompt, late and very late values for each ++++ *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VE_code_ptr++)); ++++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*E_code_ptr++)); ++++ *P_out_ptr += (lv_32fc_t) 
(bb_signal_sample * ((lv_16sc_t)*P_code_ptr++)); ++++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*L_code_ptr++)); ++++ *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VL_code_ptr++)); ++++ } ++++} ++++#endif /* LV_HAVE_SSE4_1 */ ++++ ++++#ifdef LV_HAVE_SSE4_1 ++++#include "smmintrin.h" ++++#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h" ++++#include "CommonMacros/CommonMacros.h" ++++/*! ++++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation ++++ \param input The input signal input ++++ \param carrier The carrier signal input ++++ \param VE_code Very Early PRN code replica input ++++ \param E_code Early PRN code replica input ++++ \param P_code Prompt PRN code replica input ++++ \param L_code Late PRN code replica input ++++ \param VL_code Very Late PRN code replica input ++++ \param VE_out Very Early correlation output ++++ \param E_out Early correlation output ++++ \param P_out Prompt correlation output ++++ \param L_out Late correlation output ++++ \param VL_out Very Late correlation output ++++ \param num_points The number of complex values in vectors ++++ */ ++++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_u_sse4_1_fourth(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points) ++++{ ++++ const unsigned int sse_iters = num_points / 8; ++++ ++++ __m128i x, x_abs, y, y_aux, bb_signal_sample_aux, bb_signal_sample_aux_abs;; ++++ __m128i real_output, imag_output; ++++ __m128 real_VE_code_acc, imag_VE_code_acc, real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc, real_VL_code_acc, imag_VL_code_acc; ++++ __m128i real_output_i_1, real_output_i_2, imag_output_i_1, imag_output_i_2, real_output_i32, 
imag_output_i32; ++++ __m128 real_output_ps, imag_output_ps; ++++ __m128i minus128control; ++++ ++++ __m128i minus128 = _mm_set1_epi8 (-128); ++++ __m128i check_sign_sequence = _mm_set_epi8 (255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1); ++++ __m128i rearrange_sequence = _mm_set_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1); ++++ __m128i mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); ++++ ++++ const lv_8sc_t* input_ptr = input; ++++ const lv_8sc_t* carrier_ptr = carrier; ++++ ++++ const lv_8sc_t* VE_code_ptr = VE_code; ++++ lv_32fc_t* VE_out_ptr = VE_out; ++++ const lv_8sc_t* E_code_ptr = E_code; ++++ lv_32fc_t* E_out_ptr = E_out; ++++ const lv_8sc_t* P_code_ptr = P_code; ++++ lv_32fc_t* P_out_ptr = P_out; ++++ const lv_8sc_t* L_code_ptr = L_code; ++++ lv_32fc_t* L_out_ptr = L_out; ++++ const lv_8sc_t* VL_code_ptr = VL_code; ++++ lv_32fc_t* VL_out_ptr = VL_out; ++++ ++++ float VE_out_real = 0; ++++ float VE_out_imag = 0; ++++ float E_out_real = 0; ++++ float E_out_imag = 0; ++++ float P_out_real = 0; ++++ float P_out_imag = 0; ++++ float L_out_real = 0; ++++ float L_out_imag = 0; ++++ float VL_out_real = 0; ++++ float VL_out_imag = 0; ++++ ++++ real_VE_code_acc = _mm_setzero_ps(); ++++ imag_VE_code_acc = _mm_setzero_ps(); ++++ real_E_code_acc = _mm_setzero_ps(); ++++ imag_E_code_acc = _mm_setzero_ps(); ++++ real_P_code_acc = _mm_setzero_ps(); ++++ imag_P_code_acc = _mm_setzero_ps(); ++++ real_L_code_acc = _mm_setzero_ps(); ++++ imag_L_code_acc = _mm_setzero_ps(); ++++ real_VL_code_acc = _mm_setzero_ps(); ++++ imag_VL_code_acc = _mm_setzero_ps(); ++++ ++++ if (sse_iters>0) ++++ { ++++ for(int number = 0;number < sse_iters; number++){ ++++ ++++ //Perform the carrier wipe-off ++++ x = _mm_lddqu_si128((__m128i*)input_ptr); ++++ y = _mm_lddqu_si128((__m128i*)carrier_ptr); ++++ ++++ x_abs = _mm_abs_epi8 (x); ++++ ++++ y_aux = _mm_sign_epi8 (y, x); ++++ y_aux = _mm_sign_epi8 (y_aux, 
check_sign_sequence); ++++ real_output = _mm_maddubs_epi16 (x_abs, y_aux); ++++ ++++ y_aux = _mm_shuffle_epi8 (y, rearrange_sequence); ++++ y_aux = _mm_sign_epi8 (y_aux, x); ++++ imag_output = _mm_maddubs_epi16 (x_abs, y_aux); ++++ ++++ imag_output = _mm_slli_si128 (imag_output, 1); ++++ bb_signal_sample_aux = _mm_blendv_epi8 (imag_output, real_output, mult1); ++++ bb_signal_sample_aux_abs = _mm_abs_epi8 (bb_signal_sample_aux); ++++ ++++ //Get very early values ++++ y = _mm_lddqu_si128((__m128i*)VE_code_ptr); ++++ minus128control = _mm_cmpeq_epi8 (y, minus128); ++++ y = _mm_sub_epi8 (y, minus128control); ++++ ++++ y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux); ++++ y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); ++++ real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); ++++ ++++ y_aux = _mm_shuffle_epi8 (y, rearrange_sequence); ++++ y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux); ++++ imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); ++++ ++++ real_output_i_1 = _mm_cvtepi16_epi32(real_output); ++++ real_output = _mm_srli_si128 (real_output, 8); ++++ real_output_i_2 = _mm_cvtepi16_epi32(real_output); ++++ real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); ++++ real_output_ps = _mm_cvtepi32_ps(real_output_i32); ++++ ++++ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); ++++ imag_output = _mm_srli_si128 (imag_output, 8); ++++ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); ++++ imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); ++++ imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); ++++ ++++ real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps); ++++ imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps); ++++ ++++ //Get early values ++++ y = _mm_lddqu_si128((__m128i*)E_code_ptr); ++++ minus128control = _mm_cmpeq_epi8 (y, minus128); ++++ y = _mm_sub_epi8 (y, minus128control); ++++ ++++ y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux); ++++ y_aux = _mm_sign_epi8 
(y_aux, check_sign_sequence); ++++ real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); ++++ ++++ y_aux = _mm_shuffle_epi8 (y, rearrange_sequence); ++++ y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux); ++++ imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); ++++ ++++ real_output_i_1 = _mm_cvtepi16_epi32(real_output); ++++ real_output = _mm_srli_si128 (real_output, 8); ++++ real_output_i_2 = _mm_cvtepi16_epi32(real_output); ++++ real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); ++++ real_output_ps = _mm_cvtepi32_ps(real_output_i32); ++++ ++++ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); ++++ imag_output = _mm_srli_si128 (imag_output, 8); ++++ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); ++++ imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); ++++ imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); ++++ ++++ real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps); ++++ imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps); ++++ ++++ //Get prompt values ++++ y = _mm_lddqu_si128((__m128i*)P_code_ptr); ++++ minus128control = _mm_cmpeq_epi8 (y, minus128); ++++ y = _mm_sub_epi8 (y, minus128control); ++++ ++++ y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux); ++++ y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); ++++ real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); ++++ ++++ y_aux = _mm_shuffle_epi8 (y, rearrange_sequence); ++++ y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux); ++++ imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); ++++ ++++ real_output_i_1 = _mm_cvtepi16_epi32(real_output); ++++ real_output = _mm_srli_si128 (real_output, 8); ++++ real_output_i_2 = _mm_cvtepi16_epi32(real_output); ++++ real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); ++++ real_output_ps = _mm_cvtepi32_ps(real_output_i32); ++++ ++++ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); ++++ imag_output = _mm_srli_si128 (imag_output, 8); ++++ 
imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); ++++ imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); ++++ imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); ++++ ++++ real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps); ++++ imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps); ++++ ++++ //Get late values ++++ y = _mm_lddqu_si128((__m128i*)L_code_ptr); ++++ minus128control = _mm_cmpeq_epi8 (y, minus128); ++++ y = _mm_sub_epi8 (y, minus128control); ++++ ++++ y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux); ++++ y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); ++++ real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); ++++ ++++ y_aux = _mm_shuffle_epi8 (y, rearrange_sequence); ++++ y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux); ++++ imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); ++++ ++++ real_output_i_1 = _mm_cvtepi16_epi32(real_output); ++++ real_output = _mm_srli_si128 (real_output, 8); ++++ real_output_i_2 = _mm_cvtepi16_epi32(real_output); ++++ real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); ++++ real_output_ps = _mm_cvtepi32_ps(real_output_i32); ++++ ++++ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); ++++ imag_output = _mm_srli_si128 (imag_output, 8); ++++ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); ++++ imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); ++++ imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); ++++ ++++ real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps); ++++ imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps); ++++ ++++ //Get very late values ++++ y = _mm_lddqu_si128((__m128i*)VL_code_ptr); ++++ minus128control = _mm_cmpeq_epi8 (y, minus128); ++++ y = _mm_sub_epi8 (y, minus128control); ++++ ++++ ++++ y_aux = _mm_sign_epi8 (y, bb_signal_sample_aux); ++++ y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence); ++++ real_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); ++++ ++++ 
y_aux = _mm_shuffle_epi8 (y, _mm_set_epi8 (14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1)); ++++ y_aux = _mm_sign_epi8 (y_aux, bb_signal_sample_aux); ++++ imag_output = _mm_maddubs_epi16 (bb_signal_sample_aux_abs, y_aux); ++++ ++++ real_output_i_1 = _mm_cvtepi16_epi32(real_output); ++++ real_output = _mm_srli_si128 (real_output, 8); ++++ real_output_i_2 = _mm_cvtepi16_epi32(real_output); ++++ real_output_i32 = _mm_add_epi32 (real_output_i_1, real_output_i_2); ++++ real_output_ps = _mm_cvtepi32_ps(real_output_i32); ++++ ++++ imag_output_i_1 = _mm_cvtepi16_epi32(imag_output); ++++ imag_output = _mm_srli_si128 (imag_output, 8); ++++ imag_output_i_2 = _mm_cvtepi16_epi32(imag_output); ++++ imag_output_i32 = _mm_add_epi32 (imag_output_i_1, imag_output_i_2); ++++ imag_output_ps = _mm_cvtepi32_ps(imag_output_i32); ++++ ++++ real_VL_code_acc = _mm_add_ps (real_VL_code_acc, real_output_ps); ++++ imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps); ++++ ++++ input_ptr += 8; ++++ carrier_ptr += 8; ++++ VE_code_ptr += 8; ++++ E_code_ptr += 8; ++++ P_code_ptr += 8; ++++ L_code_ptr += 8; ++++ VL_code_ptr += 8; ++++ } ++++ ++++ __VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4]; ++++ ++++ _mm_storeu_ps((float*)real_VE_dotProductVector,real_VE_code_acc); // Store the results back into the dot product vector ++++ _mm_storeu_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc); // Store the 
results back into the dot product vector ++++ _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector ++++ _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector ++++ _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector ++++ _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector ++++ _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector ++++ _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector ++++ _mm_storeu_ps((float*)real_VL_dotProductVector,real_VL_code_acc); // Store the results back into the dot product vector ++++ _mm_storeu_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc); // Store the results back into the dot product vector ++++ ++++ for (int i = 0; i<4; ++i) ++++ { ++++ VE_out_real += real_VE_dotProductVector[i]; ++++ VE_out_imag += imag_VE_dotProductVector[i]; ++++ E_out_real += real_E_dotProductVector[i]; ++++ E_out_imag += imag_E_dotProductVector[i]; ++++ P_out_real += real_P_dotProductVector[i]; ++++ P_out_imag += imag_P_dotProductVector[i]; ++++ L_out_real += real_L_dotProductVector[i]; ++++ L_out_imag += imag_L_dotProductVector[i]; ++++ VL_out_real += real_VL_dotProductVector[i]; ++++ VL_out_imag += imag_VL_dotProductVector[i]; ++++ } ++++ *VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag); ++++ *E_out_ptr = lv_cmake(E_out_real, E_out_imag); ++++ *P_out_ptr = lv_cmake(P_out_real, P_out_imag); ++++ *L_out_ptr = lv_cmake(L_out_real, L_out_imag); ++++ *VL_out_ptr = lv_cmake(VL_out_real, VL_out_imag); ++++ } ++++ ++++ lv_16sc_t bb_signal_sample; ++++ for(int i=0; i < num_points%8; ++i) ++++ { ++++ //Perform the carrier wipe-off ++++ bb_signal_sample = (*input_ptr++) * 
(*carrier_ptr++); ++++ // Now get very early, early, prompt, late and very late values for each ++++ *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VE_code_ptr++)); ++++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*E_code_ptr++)); ++++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*P_code_ptr++)); ++++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*L_code_ptr++)); ++++ *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VL_code_ptr++)); ++++ } ++++} ++++#endif /* LV_HAVE_SSE4_1 */ ++++ ++++ ++++#ifdef LV_HAVE_GENERIC ++++#include ++++#include ++++ ++++/*! ++++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation ++++ \param input The input signal input ++++ \param carrier The carrier signal input ++++ \param VE_code Very Early PRN code replica input ++++ \param E_code Early PRN code replica input ++++ \param P_code Prompt PRN code replica input ++++ \param L_code Late PRN code replica input ++++ \param VL_code Very Late PRN code replica input ++++ \param VE_out Very Early correlation output ++++ \param E_out Early correlation output ++++ \param P_out Prompt correlation output ++++ \param L_out Late correlation output ++++ \param VL_out Very Late correlation output ++++ \param num_points The number of complex values in vectors ++++ */ ++++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points) ++++{ ++++ *VE_out = 0; ++++ *E_out = 0; ++++ *P_out = 0; ++++ *L_out = 0; ++++ *VL_out = 0; ++++ ++++ ++++ lv_16sc_t VE_code_value; ++++ lv_16sc_t E_code_value; ++++ lv_16sc_t P_code_value; ++++ lv_16sc_t L_code_value; ++++ lv_16sc_t VL_code_value; ++++ lv_16sc_t bb_signal_sample; ++++ 
++++ for(int i=0; i < num_points; ++i) ++++ { ++++ VE_code_value = VE_code[i]; ++++ E_code_value = E_code[i]; ++++ P_code_value = P_code[i]; ++++ L_code_value = L_code[i]; ++++ VL_code_value = VL_code[i]; ++++ ++++ if(lv_creal(VE_code_value) == -128) ++++ { ++++ VE_code_value = lv_cmake(-127, lv_cimag(VE_code_value)); ++++ } ++++ if(lv_cimag(VE_code_value) == -128) ++++ { ++++ VE_code_value = lv_cmake(lv_creal(VE_code_value), -127); ++++ } ++++ ++++ if(lv_creal(E_code_value) == -128) ++++ { ++++ E_code_value = lv_cmake(-127, lv_cimag(E_code_value)); ++++ } ++++ if(lv_cimag(E_code_value) == -128) ++++ { ++++ E_code_value = lv_cmake(lv_creal(E_code_value), -127); ++++ } ++++ ++++ if(lv_creal(P_code_value) == -128) ++++ { ++++ P_code_value = lv_cmake(-127, lv_cimag(P_code_value)); ++++ } ++++ if(lv_cimag(P_code_value) == -128) ++++ { ++++ P_code_value = lv_cmake(lv_creal(P_code_value), -127); ++++ } ++++ ++++ if(lv_creal(L_code_value) == -128) ++++ { ++++ L_code_value = lv_cmake(-127, lv_cimag(L_code_value)); ++++ } ++++ if(lv_cimag(L_code_value) == -128) ++++ { ++++ L_code_value = lv_cmake(lv_creal(L_code_value), -127); ++++ } ++++ ++++ if(lv_creal(VL_code_value) == -128) ++++ { ++++ VL_code_value = lv_cmake(-127, lv_cimag(VL_code_value)); ++++ } ++++ if(lv_cimag(VL_code_value) == -128) ++++ { ++++ VL_code_value = lv_cmake(lv_creal(VL_code_value), -127); ++++ } ++++ ++++ //Perform the carrier wipe-off ++++ bb_signal_sample = input[i] * carrier[i]; ++++ // Now get very early, early, prompt, late and very late values for each ++++ *VE_out += (lv_32fc_t) (bb_signal_sample * VE_code_value); ++++ *E_out += (lv_32fc_t) (bb_signal_sample * E_code_value); ++++ *P_out += (lv_32fc_t) (bb_signal_sample * P_code_value); ++++ *L_out += (lv_32fc_t) (bb_signal_sample * L_code_value); ++++ *VL_out += (lv_32fc_t) (bb_signal_sample * VL_code_value); ++++ } ++++} ++++ ++++#endif /* LV_HAVE_GENERIC */ ++++ ++++//#ifdef LV_HAVE_GENERIC ++++//#include ++++//#include ++++//#include ++++// 
++++//#ifndef MAX ++++//#define MAX(a,b) ((a) > (b) ? a : b) ++++//#endif ++++// ++++//#ifndef MIN ++++//#define MIN(a,b) ((a) < (b) ? a : b) ++++//#endif ++++// ++++///*! ++++// \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation ++++// \param input The input signal input ++++// \param carrier The carrier signal input ++++// \param VE_code Very Early PRN code replica input ++++// \param E_code Early PRN code replica input ++++// \param P_code Prompt PRN code replica input ++++// \param L_code Late PRN code replica input ++++// \param VL_code Very Late PRN code replica input ++++// \param VE_out Very Early correlation output ++++// \param E_out Early correlation output ++++// \param P_out Prompt correlation output ++++// \param L_out Late correlation output ++++// \param VL_out Very Late correlation output ++++// \param num_points The number of complex values in vectors ++++// */ ++++//static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points) ++++//{ ++++// *VE_out = 0; ++++// *E_out = 0; ++++// *P_out = 0; ++++// *L_out = 0; ++++// *VL_out = 0; ++++// ++++// lv_16sc_t VE_out16; ++++// lv_16sc_t E_out16; ++++// lv_16sc_t P_out16; ++++// lv_16sc_t L_out16; ++++// lv_16sc_t VL_out16; ++++// ++++// int32_t max = 32767; ++++// int32_t min = -32768; ++++// ++++// int16_t real_real; ++++// int16_t imag_imag; ++++// int16_t real_imag; ++++// int16_t imag_real; ++++// int32_t out_real_32; ++++// int32_t out_imag_32; ++++// int16_t out_real_16; ++++// int16_t out_imag_16; ++++// int16_t aux1; ++++// int16_t aux2; ++++// ++++// ++++// lv_8sc_t bb_signal_sample = lv_cmake(0, 0); ++++// ++++// // perform very early, Early, Prompt, Late and very 
late correlation ++++// for(int i=0; i < num_points; ++i) ++++// { ++++// //Perform the carrier wipe-off ++++// bb_signal_sample = input[i] * carrier[i]; ++++// ++++// aux1 = (int16_t)lv_creal(bb_signal_sample); ++++// aux2 = (int16_t)lv_creal(VE_code[i]); ++++// real_real = aux1*aux2; ++++// aux1 = (int16_t)lv_cimag(bb_signal_sample); ++++// aux2 = (int16_t)lv_cimag(VE_code[i]); ++++// imag_imag = aux1*aux2; ++++// aux1 = (int16_t)lv_creal(bb_signal_sample); ++++// aux2 = (int16_t)lv_cimag(VE_code[i]); ++++// real_imag = aux1*aux2; ++++// aux1 = (int16_t)lv_cimag(bb_signal_sample); ++++// aux2 = (int16_t)lv_creal(VE_code[i]); ++++// imag_real = aux1*aux2; ++++// out_real_32 = (int32_t)real_real - (int32_t)imag_imag; ++++// out_imag_32 = (int32_t)real_imag + (int32_t)imag_real; ++++// out_real_16 = MIN(MAX(out_real_32, min), max); ++++// out_imag_16 = MIN(MAX(out_imag_32, min), max); ++++// VE_out16 = lv_cmake(out_real_16, out_imag_16); ++++// ++++// ++++// ++++// if(lv_creal(L_code[i]) == -128) ++++// { ++++// int8_t* L_pointer = (int8_t*)&L_code[i]; ++++// *L_pointer = -127; ++++// } ++++// if(lv_cimag(L_code[i]) == -128) ++++// { ++++// int8_t* L_pointer = (int8_t*)&L_code[i]; ++++// L_pointer++; ++++// *L_pointer = -127; ++++// } ++++// aux1 = (int16_t)lv_creal(bb_signal_sample); ++++// aux2 = (int16_t)lv_creal(L_code[i]); ++++// real_real = aux1*aux2; ++++// aux1 = (int16_t)lv_cimag(bb_signal_sample); ++++// aux2 = (int16_t)lv_cimag(L_code[i]); ++++// imag_imag = aux1*aux2; ++++// aux1 = (int16_t)lv_creal(bb_signal_sample); ++++// aux2 = (int16_t)lv_cimag(L_code[i]); ++++// real_imag = aux1*aux2; ++++// aux1 = (int16_t)lv_cimag(bb_signal_sample); ++++// aux2 = (int16_t)lv_creal(L_code[i]); ++++// imag_real = aux1*aux2; ++++// out_real_32 = (int32_t)real_real - (int32_t)imag_imag; ++++// out_imag_32 = (int32_t)real_imag + (int32_t)imag_real; ++++// out_real_16 = MIN(MAX(out_real_32, min), max); ++++// out_imag_16 = MIN(MAX(out_imag_32, min), max); ++++// 
L_out16 = lv_cmake(out_real_16, out_imag_16); ++++// ++++// E_out16 = (lv_16sc_t)bb_signal_sample * (lv_16sc_t)E_code[i]; ++++// P_out16 = (lv_16sc_t)bb_signal_sample * (lv_16sc_t)P_code[i]; ++++// VL_out16 = (lv_16sc_t)bb_signal_sample * (lv_16sc_t)VL_code[i]; ++++// ++++// ++++// *VE_out += (lv_32fc_t) VE_out16; ++++// *E_out += (lv_32fc_t) E_out16; ++++// *P_out += (lv_32fc_t) P_out16; ++++// *L_out += (lv_32fc_t) L_out16; ++++// *VL_out += (lv_32fc_t) VL_out16; ++++// ++++// //error en la parte real de L con 32 muestras ++++// //*L_out = lv_cmake(12, 12); ++++// } ++++//} ++++// ++++//#endif /* LV_HAVE_GENERIC */ ++++ ++++//#ifdef LV_HAVE_GENERIC ++++///*! ++++// \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation ++++// \param input The input signal input ++++// \param carrier The carrier signal input ++++// \param VE_code Very Early PRN code replica input ++++// \param E_code Early PRN code replica input ++++// \param P_code Prompt PRN code replica input ++++// \param L_code Late PRN code replica input ++++// \param VL_code Very Late PRN code replica input ++++// \param VE_out Very Early correlation output ++++// \param E_out Early correlation output ++++// \param P_out Prompt correlation output ++++// \param L_out Late correlation output ++++// \param VL_out Very Late correlation output ++++// \param num_points The number of complex values in vectors ++++// */ ++++//static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points) ++++//{ ++++// lv_8sc_t bb_signal_sample; ++++// ++++// bb_signal_sample = lv_cmake(0, 0); ++++// ++++// *VE_out = 0; ++++// *E_out = 0; ++++// *P_out = 0; ++++// *L_out = 0; ++++// *VL_out = 0; ++++// // 
perform very early, Early, Prompt, Late and very late correlation ++++// for(int i=0; i < num_points; ++i) ++++// { ++++// //Perform the carrier wipe-off ++++// bb_signal_sample = input[i] * carrier[i]; ++++// ++++// *VE_out += (lv_32fc_t) (bb_signal_sample * VE_code[i]); ++++// *E_out += (lv_32fc_t) (bb_signal_sample * E_code[i]); ++++// *P_out += (lv_32fc_t) (bb_signal_sample * P_code[i]); ++++// *L_out += (lv_32fc_t) (bb_signal_sample * L_code[i]); ++++// *VL_out += (lv_32fc_t) (bb_signal_sample * VL_code[i]); ++++// } ++++//} ++++// ++++//#endif /* LV_HAVE_GENERIC */ ++++ ++++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5_u_H */ +++\ No newline at end of file +++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5.h +++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5.h 1970-01-01 01:00:00.000000000 +0100 ++++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5.h 2014-10-15 01:55:08.000000000 +0200 +++@@ -0,0 +1,772 @@ ++++/*! ++++ * \file volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5.h ++++ * \brief Volk protokernel: performs the carrier wipe-off mixing and the Very early, Early, Prompt, Late and very late correlation with 16 bits vectors, and accumulates the results into float32. This protokernel is called "safe" because it checks when the inputs have a -128 value, and replaces it with a -127 value. By doing this it avoids malfunctioning, but it lasts more time that the "unsafe" implementation. In order to avoid overflow, "input" and "carrier" must be values between —7 and 7 and "XX_code inputs" must be values between —127 and 127. ++++ * \authors
 <ul> ++++ *
 <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com ++++ * </ul>
++++ * ++++ * Volk protokernel that performs the carrier wipe-off mixing and the ++++ * Very early, Early, Prompt, Late and very late correlation with 16-bit vectors (8 bits for the ++++ * real part and 8 bits for the imaginary part), and accumulates the result ++++ * in 32-bit single-precision floating-point values, returning float32 values: ++++ * - The carrier wipe-off is done by multiplying the input signal by the ++++ * carrier (multiplication of 16-bit vectors). It returns the input ++++ * signal in baseband (BB) ++++ * - Very Early values are calculated by multiplying the input signal in BB by the ++++ * very early code (multiplication of 16-bit vectors), accumulating the results into float32 values ++++ * - Early values are calculated by multiplying the input signal in BB by the ++++ * early code (multiplication of 16-bit vectors), accumulating the results into float32 values ++++ * - Prompt values are calculated by multiplying the input signal in BB by the ++++ * prompt code (multiplication of 16-bit vectors), accumulating the results into float32 values ++++ * - Late values are calculated by multiplying the input signal in BB by the ++++ * late code (multiplication of 16-bit vectors), accumulating the results into float32 values ++++ * - Very Late values are calculated by multiplying the input signal in BB by the ++++ * very late code (multiplication of 16-bit vectors), accumulating the results into float32 values ++++ * ++++ * ------------------------------------------------------------------------- ++++ * Bits analysis ++++ * ++++ * input = 8 bits ++++ * carrier = 8 bits ++++ * XX_code = 8 bits ++++ * XX_out16 = 16 bits ++++ * bb_signal_sample = 8 bits ++++ * ++++ * bb_signal_sample = input*carrier -> 17 bits limited to 8 bits = input and carrier must be values between -7 and 7 to avoid overflow (3 bits) ++++ * ++++ * XX_out16 = XX_code*bb_signal_sample -> 17 bits limited to 16 bits = XX_code must be values between -127 and 127 to avoid overflow (7 bits) ++++ * 
------------------------------------------------------------------------- ++++ * ++++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) ++++ * ++++ * GNSS-SDR is a software defined Global Navigation ++++ * Satellite Systems receiver ++++ * ++++ * This file is part of GNSS-SDR. ++++ * ++++ * GNSS-SDR is free software: you can redistribute it and/or modify ++++ * it under the terms of the GNU General Public License as published by ++++ * the Free Software Foundation, either version 3 of the License, or ++++ * (at your option) any later version. ++++ * ++++ * GNSS-SDR is distributed in the hope that it will be useful, ++++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++++ * GNU General Public License for more details. ++++ * ++++ * You should have received a copy of the GNU General Public License ++++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>. ++++ * ++++ * ------------------------------------------------------------------------- ++++ */ ++++ ++++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_u_H ++++#define INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_u_H ++++ ++++#include ++++#include ++++#include ++++#include ++++#include ++++ ++++#ifdef LV_HAVE_SSE4_1 ++++#include "smmintrin.h" ++++#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h" ++++#include "CommonMacros/CommonMacros.h" ++++/*! 
++++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation ++++ \param input The input signal input ++++ \param carrier The carrier signal input ++++ \param VE_code Very Early PRN code replica input ++++ \param E_code Early PRN code replica input ++++ \param P_code Prompt PRN code replica input ++++ \param L_code Late PRN code replica input ++++ \param VL_code Very Late PRN code replica input ++++ \param VE_out Very Early correlation output ++++ \param E_out Early correlation output ++++ \param P_out Prompt correlation output ++++ \param L_out Late correlation output ++++ \param VL_out Very Late correlation output ++++ \param num_points The number of complex values in vectors ++++ */ ++++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_u_sse4_1(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points) ++++{ ++++ const unsigned int sse_iters = num_points / 8; ++++ ++++ __m128i x, x_abs, y, y_aux, bb_signal_sample_aux, bb_signal_sample_aux_abs;; ++++ __m128i real_output, imag_output; ++++ __m128 real_VE_code_acc, imag_VE_code_acc, real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc, real_VL_code_acc, imag_VL_code_acc; ++++ __m128i input_i_1, input_i_2, output_i32; ++++ __m128 real_output_ps, imag_output_ps; ++++ __m128i minus128control; ++++ ++++ __m128i minus128 = _mm_set1_epi8 (-128); ++++ __m128i check_sign_sequence = _mm_set_epi8 (255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1); ++++ __m128i rearrange_sequence = _mm_set_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1); ++++ __m128i mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); ++++ ++++ const lv_8sc_t* input_ptr = input; ++++ 
const lv_8sc_t* carrier_ptr = carrier; ++++ ++++ const lv_8sc_t* VE_code_ptr = VE_code; ++++ lv_32fc_t* VE_out_ptr = VE_out; ++++ const lv_8sc_t* E_code_ptr = E_code; ++++ lv_32fc_t* E_out_ptr = E_out; ++++ const lv_8sc_t* P_code_ptr = P_code; ++++ lv_32fc_t* P_out_ptr = P_out; ++++ const lv_8sc_t* L_code_ptr = L_code; ++++ lv_32fc_t* L_out_ptr = L_out; ++++ const lv_8sc_t* VL_code_ptr = VL_code; ++++ lv_32fc_t* VL_out_ptr = VL_out; ++++ ++++ float VE_out_real = 0; ++++ float VE_out_imag = 0; ++++ float E_out_real = 0; ++++ float E_out_imag = 0; ++++ float P_out_real = 0; ++++ float P_out_imag = 0; ++++ float L_out_real = 0; ++++ float L_out_imag = 0; ++++ float VL_out_real = 0; ++++ float VL_out_imag = 0; ++++ ++++ real_VE_code_acc = _mm_setzero_ps(); ++++ imag_VE_code_acc = _mm_setzero_ps(); ++++ real_E_code_acc = _mm_setzero_ps(); ++++ imag_E_code_acc = _mm_setzero_ps(); ++++ real_P_code_acc = _mm_setzero_ps(); ++++ imag_P_code_acc = _mm_setzero_ps(); ++++ real_L_code_acc = _mm_setzero_ps(); ++++ imag_L_code_acc = _mm_setzero_ps(); ++++ real_VL_code_acc = _mm_setzero_ps(); ++++ imag_VL_code_acc = _mm_setzero_ps(); ++++ ++++ if (sse_iters>0) ++++ { ++++ for(int number = 0;number < sse_iters; number++){ ++++ ++++ //Perform the carrier wipe-off ++++ x = _mm_lddqu_si128((__m128i*)input_ptr); ++++ y = _mm_lddqu_si128((__m128i*)carrier_ptr); ++++ ++++ x_abs = _mm_abs_epi8 (x); ++++ ++++ CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3(y, x, check_sign_sequence, rearrange_sequence, y_aux, x_abs, real_output, imag_output) ++++ ++++ imag_output = _mm_slli_si128 (imag_output, 1); ++++ bb_signal_sample_aux = _mm_blendv_epi8 (imag_output, real_output, mult1); ++++ bb_signal_sample_aux_abs = _mm_abs_epi8 (bb_signal_sample_aux); ++++ ++++ //Get very early values ++++ y = _mm_lddqu_si128((__m128i*)VE_code_ptr); ++++ ++++ CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, 
real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) ++++ ++++ real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps); ++++ imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps); ++++ ++++ //Get early values ++++ y = _mm_lddqu_si128((__m128i*)E_code_ptr); ++++ ++++ CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) ++++ ++++ real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps); ++++ imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps); ++++ ++++ //Get prompt values ++++ y = _mm_lddqu_si128((__m128i*)P_code_ptr); ++++ ++++ CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) ++++ ++++ real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps); ++++ imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps); ++++ ++++ //Get late values ++++ y = _mm_lddqu_si128((__m128i*)L_code_ptr); ++++ ++++ CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) ++++ ++++ real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps); ++++ imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps); ++++ ++++ //Get very late values ++++ y = _mm_lddqu_si128((__m128i*)VL_code_ptr); ++++ ++++ CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, 
input_i_2, output_i32, real_output_ps, imag_output_ps) ++++ ++++ real_VL_code_acc = _mm_add_ps (real_VL_code_acc, real_output_ps); ++++ imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps); ++++ ++++ input_ptr += 8; ++++ carrier_ptr += 8; ++++ VE_code_ptr += 8; ++++ E_code_ptr += 8; ++++ P_code_ptr += 8; ++++ L_code_ptr += 8; ++++ VL_code_ptr += 8; ++++ } ++++ ++++ __VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4]; ++++ ++++ _mm_storeu_ps((float*)real_VE_dotProductVector,real_VE_code_acc); // Store the results back into the dot product vector ++++ _mm_storeu_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc); // Store the results back into the dot product vector ++++ _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector ++++ _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector ++++ _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector ++++ _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector ++++ _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector ++++ _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product 
vector ++++ _mm_storeu_ps((float*)real_VL_dotProductVector,real_VL_code_acc); // Store the results back into the dot product vector ++++ _mm_storeu_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc); // Store the results back into the dot product vector ++++ ++++ for (int i = 0; i<4; ++i) ++++ { ++++ VE_out_real += real_VE_dotProductVector[i]; ++++ VE_out_imag += imag_VE_dotProductVector[i]; ++++ E_out_real += real_E_dotProductVector[i]; ++++ E_out_imag += imag_E_dotProductVector[i]; ++++ P_out_real += real_P_dotProductVector[i]; ++++ P_out_imag += imag_P_dotProductVector[i]; ++++ L_out_real += real_L_dotProductVector[i]; ++++ L_out_imag += imag_L_dotProductVector[i]; ++++ VL_out_real += real_VL_dotProductVector[i]; ++++ VL_out_imag += imag_VL_dotProductVector[i]; ++++ } ++++ *VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag); ++++ *E_out_ptr = lv_cmake(E_out_real, E_out_imag); ++++ *P_out_ptr = lv_cmake(P_out_real, P_out_imag); ++++ *L_out_ptr = lv_cmake(L_out_real, L_out_imag); ++++ *VL_out_ptr = lv_cmake(VL_out_real, VL_out_imag); ++++ } ++++ ++++ if(num_points%8!=0) ++++ { ++++ lv_16sc_t bb_signal_sample; ++++ lv_16sc_t VE_code_value; ++++ lv_16sc_t E_code_value; ++++ lv_16sc_t P_code_value; ++++ lv_16sc_t L_code_value; ++++ lv_16sc_t VL_code_value; ++++ ++++ for(int i=0; i < num_points%8; ++i) ++++ { ++++ VE_code_value = *VE_code_ptr++; ++++ E_code_value = *E_code_ptr++; ++++ P_code_value = *P_code_ptr++; ++++ L_code_value = *L_code_ptr++; ++++ VL_code_value = *VL_code_ptr++; ++++ ++++ if(lv_creal(VE_code_value) == -128) ++++ { ++++ VE_code_value = lv_cmake(-127, lv_cimag(VE_code_value)); ++++ } ++++ if(lv_cimag(VE_code_value) == -128) ++++ { ++++ VE_code_value = lv_cmake(lv_creal(VE_code_value), -127); ++++ } ++++ ++++ if(lv_creal(E_code_value) == -128) ++++ { ++++ E_code_value = lv_cmake(-127, lv_cimag(E_code_value)); ++++ } ++++ if(lv_cimag(E_code_value) == -128) ++++ { ++++ E_code_value = lv_cmake(lv_creal(E_code_value), -127); ++++ } ++++ ++++ 
if(lv_creal(P_code_value) == -128) ++++ { ++++ P_code_value = lv_cmake(-127, lv_cimag(P_code_value)); ++++ } ++++ if(lv_cimag(P_code_value) == -128) ++++ { ++++ P_code_value = lv_cmake(lv_creal(P_code_value), -127); ++++ } ++++ ++++ if(lv_creal(L_code_value) == -128) ++++ { ++++ L_code_value = lv_cmake(-127, lv_cimag(L_code_value)); ++++ } ++++ if(lv_cimag(L_code_value) == -128) ++++ { ++++ L_code_value = lv_cmake(lv_creal(L_code_value), -127); ++++ } ++++ ++++ //Perform the carrier wipe-off ++++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); ++++ // Now get very early, early, prompt, late and very late values for each ++++ *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * VE_code_value); ++++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * E_code_value); ++++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * P_code_value); ++++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * L_code_value); ++++ *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * VL_code_value); ++++ } ++++ } ++++} ++++#endif /* LV_HAVE_SSE4_1 */ ++++ ++++#ifdef LV_HAVE_GENERIC ++++#include ++++#include ++++ ++++/*! 
++++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation ++++ \param input The input signal input ++++ \param carrier The carrier signal input ++++ \param VE_code Very Early PRN code replica input ++++ \param E_code Early PRN code replica input ++++ \param P_code Prompt PRN code replica input ++++ \param L_code Late PRN code replica input ++++ \param VL_code Very Late PRN code replica input ++++ \param VE_out Very Early correlation output ++++ \param E_out Early correlation output ++++ \param P_out Prompt correlation output ++++ \param L_out Late correlation output ++++ \param VL_out Very Late correlation output ++++ \param num_points The number of complex values in vectors ++++ */ ++++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points) ++++{ ++++ *VE_out = 0; ++++ *E_out = 0; ++++ *P_out = 0; ++++ *L_out = 0; ++++ *VL_out = 0; ++++ ++++ lv_16sc_t VE_code_value; ++++ lv_16sc_t E_code_value; ++++ lv_16sc_t P_code_value; ++++ lv_16sc_t L_code_value; ++++ lv_16sc_t VL_code_value; ++++ lv_16sc_t bb_signal_sample; ++++ ++++ for(int i=0; i < num_points; ++i) ++++ { ++++ VE_code_value = VE_code[i]; ++++ E_code_value = E_code[i]; ++++ P_code_value = P_code[i]; ++++ L_code_value = L_code[i]; ++++ VL_code_value = VL_code[i]; ++++ ++++ if(lv_creal(VE_code_value) == -128) ++++ { ++++ VE_code_value = lv_cmake(-127, lv_cimag(VE_code_value)); ++++ } ++++ if(lv_cimag(VE_code_value) == -128) ++++ { ++++ VE_code_value = lv_cmake(lv_creal(VE_code_value), -127); ++++ } ++++ ++++ if(lv_creal(E_code_value) == -128) ++++ { ++++ E_code_value = lv_cmake(-127, lv_cimag(E_code_value)); ++++ } ++++ if(lv_cimag(E_code_value) == -128) ++++ { ++++ 
E_code_value = lv_cmake(lv_creal(E_code_value), -127); ++++ } ++++ ++++ if(lv_creal(P_code_value) == -128) ++++ { ++++ P_code_value = lv_cmake(-127, lv_cimag(P_code_value)); ++++ } ++++ if(lv_cimag(P_code_value) == -128) ++++ { ++++ P_code_value = lv_cmake(lv_creal(P_code_value), -127); ++++ } ++++ ++++ if(lv_creal(L_code_value) == -128) ++++ { ++++ L_code_value = lv_cmake(-127, lv_cimag(L_code_value)); ++++ } ++++ if(lv_cimag(L_code_value) == -128) ++++ { ++++ L_code_value = lv_cmake(lv_creal(L_code_value), -127); ++++ } ++++ ++++ if(lv_creal(VL_code_value) == -128) ++++ { ++++ VL_code_value = lv_cmake(-127, lv_cimag(VL_code_value)); ++++ } ++++ if(lv_cimag(VL_code_value) == -128) ++++ { ++++ VL_code_value = lv_cmake(lv_creal(VL_code_value), -127); ++++ } ++++ ++++ //Perform the carrier wipe-off ++++ bb_signal_sample = input[i] * carrier[i]; ++++ // Now get very early, early, prompt, late and very late values for each ++++ *VE_out += (lv_32fc_t) (bb_signal_sample * VE_code_value); ++++ *E_out += (lv_32fc_t) (bb_signal_sample * E_code_value); ++++ *P_out += (lv_32fc_t) (bb_signal_sample * P_code_value); ++++ *L_out += (lv_32fc_t) (bb_signal_sample * L_code_value); ++++ *VL_out += (lv_32fc_t) (bb_signal_sample * VL_code_value); ++++ } ++++} ++++#endif /* LV_HAVE_GENERIC */ ++++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_u_H */ ++++ ++++ ++++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_a_H ++++#define INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_a_H ++++ ++++#include ++++#include ++++#include ++++#include ++++#include ++++ ++++#ifdef LV_HAVE_SSE4_1 ++++#include "smmintrin.h" ++++#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h" ++++#include "CommonMacros/CommonMacros.h" ++++/*! 
++++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation ++++ \param input The input signal input ++++ \param carrier The carrier signal input ++++ \param VE_code Very Early PRN code replica input ++++ \param E_code Early PRN code replica input ++++ \param P_code Prompt PRN code replica input ++++ \param L_code Late PRN code replica input ++++ \param VL_code Very Late PRN code replica input ++++ \param VE_out Very Early correlation output ++++ \param E_out Early correlation output ++++ \param P_out Prompt correlation output ++++ \param L_out Late correlation output ++++ \param VL_out Very Late correlation output ++++ \param num_points The number of complex values in vectors ++++ */ ++++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_a_sse4_1(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points) ++++{ ++++ const unsigned int sse_iters = num_points / 8; ++++ ++++ __m128i x, x_abs, y, y_aux, bb_signal_sample_aux, bb_signal_sample_aux_abs;; ++++ __m128i real_output, imag_output; ++++ __m128 real_VE_code_acc, imag_VE_code_acc, real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc, real_VL_code_acc, imag_VL_code_acc; ++++ __m128i input_i_1, input_i_2, output_i32; ++++ __m128 real_output_ps, imag_output_ps; ++++ __m128i minus128control; ++++ ++++ __m128i minus128 = _mm_set1_epi8 (-128); ++++ __m128i check_sign_sequence = _mm_set_epi8 (255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1); ++++ __m128i rearrange_sequence = _mm_set_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1); ++++ __m128i mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); ++++ ++++ const lv_8sc_t* input_ptr = input; ++++ 
const lv_8sc_t* carrier_ptr = carrier; ++++ ++++ const lv_8sc_t* VE_code_ptr = VE_code; ++++ lv_32fc_t* VE_out_ptr = VE_out; ++++ const lv_8sc_t* E_code_ptr = E_code; ++++ lv_32fc_t* E_out_ptr = E_out; ++++ const lv_8sc_t* P_code_ptr = P_code; ++++ lv_32fc_t* P_out_ptr = P_out; ++++ const lv_8sc_t* L_code_ptr = L_code; ++++ lv_32fc_t* L_out_ptr = L_out; ++++ const lv_8sc_t* VL_code_ptr = VL_code; ++++ lv_32fc_t* VL_out_ptr = VL_out; ++++ ++++ float VE_out_real = 0; ++++ float VE_out_imag = 0; ++++ float E_out_real = 0; ++++ float E_out_imag = 0; ++++ float P_out_real = 0; ++++ float P_out_imag = 0; ++++ float L_out_real = 0; ++++ float L_out_imag = 0; ++++ float VL_out_real = 0; ++++ float VL_out_imag = 0; ++++ ++++ real_VE_code_acc = _mm_setzero_ps(); ++++ imag_VE_code_acc = _mm_setzero_ps(); ++++ real_E_code_acc = _mm_setzero_ps(); ++++ imag_E_code_acc = _mm_setzero_ps(); ++++ real_P_code_acc = _mm_setzero_ps(); ++++ imag_P_code_acc = _mm_setzero_ps(); ++++ real_L_code_acc = _mm_setzero_ps(); ++++ imag_L_code_acc = _mm_setzero_ps(); ++++ real_VL_code_acc = _mm_setzero_ps(); ++++ imag_VL_code_acc = _mm_setzero_ps(); ++++ ++++ if (sse_iters>0) ++++ { ++++ for(int number = 0;number < sse_iters; number++){ ++++ ++++ //Perform the carrier wipe-off ++++ x = _mm_load_si128((__m128i*)input_ptr); ++++ y = _mm_load_si128((__m128i*)carrier_ptr); ++++ ++++ x_abs = _mm_abs_epi8 (x); ++++ ++++ CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3(y, x, check_sign_sequence, rearrange_sequence, y_aux, x_abs, real_output, imag_output) ++++ ++++ imag_output = _mm_slli_si128 (imag_output, 1); ++++ bb_signal_sample_aux = _mm_blendv_epi8 (imag_output, real_output, mult1); ++++ bb_signal_sample_aux_abs = _mm_abs_epi8 (bb_signal_sample_aux); ++++ ++++ //Get very early values ++++ y = _mm_load_si128((__m128i*)VE_code_ptr); ++++ ++++ CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, 
real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) ++++ ++++ real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps); ++++ imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps); ++++ ++++ //Get early values ++++ y = _mm_load_si128((__m128i*)E_code_ptr); ++++ ++++ CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) ++++ ++++ real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps); ++++ imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps); ++++ ++++ //Get prompt values ++++ y = _mm_load_si128((__m128i*)P_code_ptr); ++++ ++++ CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) ++++ ++++ real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps); ++++ imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps); ++++ ++++ //Get late values ++++ y = _mm_load_si128((__m128i*)L_code_ptr); ++++ ++++ CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) ++++ ++++ real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps); ++++ imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps); ++++ ++++ //Get very late values ++++ y = _mm_load_si128((__m128i*)VL_code_ptr); ++++ ++++ CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, 
input_i_2, output_i32, real_output_ps, imag_output_ps) ++++ ++++ real_VL_code_acc = _mm_add_ps (real_VL_code_acc, real_output_ps); ++++ imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps); ++++ ++++ input_ptr += 8; ++++ carrier_ptr += 8; ++++ VE_code_ptr += 8; ++++ E_code_ptr += 8; ++++ P_code_ptr += 8; ++++ L_code_ptr += 8; ++++ VL_code_ptr += 8; ++++ } ++++ ++++ __VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4]; ++++ ++++ _mm_store_ps((float*)real_VE_dotProductVector,real_VE_code_acc); // Store the results back into the dot product vector ++++ _mm_store_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc); // Store the results back into the dot product vector ++++ _mm_store_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector ++++ _mm_store_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector ++++ _mm_store_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector ++++ _mm_store_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector ++++ _mm_store_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector ++++ _mm_store_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector ++++ 
_mm_store_ps((float*)real_VL_dotProductVector,real_VL_code_acc); // Store the results back into the dot product vector ++++ _mm_store_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc); // Store the results back into the dot product vector ++++ ++++ for (int i = 0; i<4; ++i) ++++ { ++++ VE_out_real += real_VE_dotProductVector[i]; ++++ VE_out_imag += imag_VE_dotProductVector[i]; ++++ E_out_real += real_E_dotProductVector[i]; ++++ E_out_imag += imag_E_dotProductVector[i]; ++++ P_out_real += real_P_dotProductVector[i]; ++++ P_out_imag += imag_P_dotProductVector[i]; ++++ L_out_real += real_L_dotProductVector[i]; ++++ L_out_imag += imag_L_dotProductVector[i]; ++++ VL_out_real += real_VL_dotProductVector[i]; ++++ VL_out_imag += imag_VL_dotProductVector[i]; ++++ } ++++ *VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag); ++++ *E_out_ptr = lv_cmake(E_out_real, E_out_imag); ++++ *P_out_ptr = lv_cmake(P_out_real, P_out_imag); ++++ *L_out_ptr = lv_cmake(L_out_real, L_out_imag); ++++ *VL_out_ptr = lv_cmake(VL_out_real, VL_out_imag); ++++ } ++++ ++++ if(num_points%8!=0) ++++ { ++++ lv_16sc_t bb_signal_sample; ++++ lv_16sc_t VE_code_value; ++++ lv_16sc_t E_code_value; ++++ lv_16sc_t P_code_value; ++++ lv_16sc_t L_code_value; ++++ lv_16sc_t VL_code_value; ++++ ++++ for(int i=0; i < num_points%8; ++i) ++++ { ++++ VE_code_value = *VE_code_ptr++; ++++ E_code_value = *E_code_ptr++; ++++ P_code_value = *P_code_ptr++; ++++ L_code_value = *L_code_ptr++; ++++ VL_code_value = *VL_code_ptr++; ++++ ++++ if(lv_creal(VE_code_value) == -128) ++++ { ++++ VE_code_value = lv_cmake(-127, lv_cimag(VE_code_value)); ++++ } ++++ if(lv_cimag(VE_code_value) == -128) ++++ { ++++ VE_code_value = lv_cmake(lv_creal(VE_code_value), -127); ++++ } ++++ ++++ if(lv_creal(E_code_value) == -128) ++++ { ++++ E_code_value = lv_cmake(-127, lv_cimag(E_code_value)); ++++ } ++++ if(lv_cimag(E_code_value) == -128) ++++ { ++++ E_code_value = lv_cmake(lv_creal(E_code_value), -127); ++++ } ++++ ++++ 
if(lv_creal(P_code_value) == -128) ++++ { ++++ P_code_value = lv_cmake(-127, lv_cimag(P_code_value)); ++++ } ++++ if(lv_cimag(P_code_value) == -128) ++++ { ++++ P_code_value = lv_cmake(lv_creal(P_code_value), -127); ++++ } ++++ ++++ if(lv_creal(L_code_value) == -128) ++++ { ++++ L_code_value = lv_cmake(-127, lv_cimag(L_code_value)); ++++ } ++++ if(lv_cimag(L_code_value) == -128) ++++ { ++++ L_code_value = lv_cmake(lv_creal(L_code_value), -127); ++++ } ++++ ++++ //Perform the carrier wipe-off ++++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); ++++ // Now get very early, early, prompt, late and very late values for each ++++ *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * VE_code_value); ++++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * E_code_value); ++++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * P_code_value); ++++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * L_code_value); ++++ *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * VL_code_value); ++++ } ++++ } ++++} ++++#endif /* LV_HAVE_SSE4_1 */ ++++ ++++#ifdef LV_HAVE_GENERIC ++++#include ++++#include ++++ ++++/*! 
++++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation ++++ \param input The input signal input ++++ \param carrier The carrier signal input ++++ \param VE_code Very Early PRN code replica input ++++ \param E_code Early PRN code replica input ++++ \param P_code Prompt PRN code replica input ++++ \param L_code Late PRN code replica input ++++ \param VL_code Very Late PRN code replica input ++++ \param VE_out Very Early correlation output ++++ \param E_out Early correlation output ++++ \param P_out Prompt correlation output ++++ \param L_out Late correlation output ++++ \param VL_out Very Late correlation output ++++ \param num_points The number of complex values in vectors ++++ */ ++++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_a_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points) ++++{ ++++ *VE_out = 0; ++++ *E_out = 0; ++++ *P_out = 0; ++++ *L_out = 0; ++++ *VL_out = 0; ++++ ++++ lv_16sc_t VE_code_value; ++++ lv_16sc_t E_code_value; ++++ lv_16sc_t P_code_value; ++++ lv_16sc_t L_code_value; ++++ lv_16sc_t VL_code_value; ++++ lv_16sc_t bb_signal_sample; ++++ ++++ for(int i=0; i < num_points; ++i) ++++ { ++++ VE_code_value = VE_code[i]; ++++ E_code_value = E_code[i]; ++++ P_code_value = P_code[i]; ++++ L_code_value = L_code[i]; ++++ VL_code_value = VL_code[i]; ++++ ++++ if(lv_creal(VE_code_value) == -128) ++++ { ++++ VE_code_value = lv_cmake(-127, lv_cimag(VE_code_value)); ++++ } ++++ if(lv_cimag(VE_code_value) == -128) ++++ { ++++ VE_code_value = lv_cmake(lv_creal(VE_code_value), -127); ++++ } ++++ ++++ if(lv_creal(E_code_value) == -128) ++++ { ++++ E_code_value = lv_cmake(-127, lv_cimag(E_code_value)); ++++ } ++++ if(lv_cimag(E_code_value) == -128) ++++ { ++++ 
E_code_value = lv_cmake(lv_creal(E_code_value), -127); ++++ } ++++ ++++ if(lv_creal(P_code_value) == -128) ++++ { ++++ P_code_value = lv_cmake(-127, lv_cimag(P_code_value)); ++++ } ++++ if(lv_cimag(P_code_value) == -128) ++++ { ++++ P_code_value = lv_cmake(lv_creal(P_code_value), -127); ++++ } ++++ ++++ if(lv_creal(L_code_value) == -128) ++++ { ++++ L_code_value = lv_cmake(-127, lv_cimag(L_code_value)); ++++ } ++++ if(lv_cimag(L_code_value) == -128) ++++ { ++++ L_code_value = lv_cmake(lv_creal(L_code_value), -127); ++++ } ++++ ++++ if(lv_creal(VL_code_value) == -128) ++++ { ++++ VL_code_value = lv_cmake(-127, lv_cimag(VL_code_value)); ++++ } ++++ if(lv_cimag(VL_code_value) == -128) ++++ { ++++ VL_code_value = lv_cmake(lv_creal(VL_code_value), -127); ++++ } ++++ ++++ //Perform the carrier wipe-off ++++ bb_signal_sample = input[i] * carrier[i]; ++++ // Now get very early, early, prompt, late and very late values for each ++++ *VE_out += (lv_32fc_t) (bb_signal_sample * VE_code_value); ++++ *E_out += (lv_32fc_t) (bb_signal_sample * E_code_value); ++++ *P_out += (lv_32fc_t) (bb_signal_sample * P_code_value); ++++ *L_out += (lv_32fc_t) (bb_signal_sample * L_code_value); ++++ *VL_out += (lv_32fc_t) (bb_signal_sample * VL_code_value); ++++ } ++++} ++++#endif /* LV_HAVE_GENERIC */ ++++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_a_H */ +++\ No newline at end of file +++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5.h +++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5.h 1970-01-01 01:00:00.000000000 +0100 ++++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5.h 2014-10-15 01:55:08.000000000 +0200 +++@@ -0,0 +1,554 @@ ++++/*! 
++++ * \file volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5.h ++++ * \brief Volk protokernel: performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation with 16-bit vectors, and accumulates the results into float32. This protokernel is called "unsafe" because it does NOT check whether the inputs contain a -128 value. If you introduce a -128 value the protokernel will NOT operate properly (the generic implementation will give different results than the volk implementation). In order to avoid overflow, "input" and "carrier" must be values between -7 and 7 and the "XX_code" inputs must be values between -127 and 127. ++++ * \authors <ul>
++++ *          <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
++++ *          </ul>
++++ * ++++ * Volk protokernel that performs the carrier wipe-off mixing and the ++++ * Very Early, Early, Prompt, Late and Very Late correlation with 16-bit vectors (8 bits the ++++ * real part and 8 bits the imaginary part), and accumulates the result ++++ * in 32-bit single-precision floating-point values, returning float32 values: ++++ * - The carrier wipe-off is done by multiplying the input signal by the ++++ * carrier (multiplication of 16-bit vectors). It returns the input ++++ * signal in base band (BB) ++++ * - Very Early values are calculated by multiplying the input signal in BB by the ++++ * very early code (multiplication of 16-bit vectors), accumulating the results into float32 values ++++ * - Early values are calculated by multiplying the input signal in BB by the ++++ * early code (multiplication of 16-bit vectors), accumulating the results into float32 values ++++ * - Prompt values are calculated by multiplying the input signal in BB by the ++++ * prompt code (multiplication of 16-bit vectors), accumulating the results into float32 values ++++ * - Late values are calculated by multiplying the input signal in BB by the ++++ * late code (multiplication of 16-bit vectors), accumulating the results into float32 values ++++ * - Very Late values are calculated by multiplying the input signal in BB by the ++++ * very late code (multiplication of 16-bit vectors), accumulating the results into float32 values ++++ * ++++ * ------------------------------------------------------------------------- ++++ * Bits analysis ++++ * ++++ * input = 8 bits ++++ * carrier = 8 bits ++++ * XX_code = 8 bits ++++ * XX_out16 = 16 bits ++++ * bb_signal_sample = 8 bits ++++ * ++++ * bb_signal_sample = input*carrier -> 17 bits limited to 8 bits, so input and carrier must be values between -7 and 7 to avoid overflow (3 bits) ++++ * ++++ * XX_out16 = XX_code*bb_signal_sample -> 17 bits limited to 16 bits, so XX_code must be values between -127 and 127 to avoid overflow (7 bits) ++++ *
------------------------------------------------------------------------- ++++ * ++++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) ++++ * ++++ * GNSS-SDR is a software defined Global Navigation ++++ * Satellite Systems receiver ++++ * ++++ * This file is part of GNSS-SDR. ++++ * ++++ * GNSS-SDR is free software: you can redistribute it and/or modify ++++ * it under the terms of the GNU General Public License as published by ++++ * the Free Software Foundation, either version 3 of the License, or ++++ * (at your option) any later version. ++++ * ++++ * GNSS-SDR is distributed in the hope that it will be useful, ++++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++++ * GNU General Public License for more details. ++++ * ++++ * You should have received a copy of the GNU General Public License ++++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>. ++++ * ++++ * ------------------------------------------------------------------------- ++++ */ ++++ ++++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_u_H ++++#define INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_u_H ++++ ++++#include ++++#include ++++#include ++++#include ++++#include ++++ ++++#ifdef LV_HAVE_SSE4_1 ++++#include "smmintrin.h" ++++#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h" ++++#include "CommonMacros/CommonMacros.h" ++++/*!
++++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation ++++ \param input The input signal input ++++ \param carrier The carrier signal input ++++ \param VE_code Very Early PRN code replica input ++++ \param E_code Early PRN code replica input ++++ \param P_code Prompt PRN code replica input ++++ \param L_code Late PRN code replica input ++++ \param VL_code Very Late PRN code replica input ++++ \param VE_out Very Early correlation output ++++ \param E_out Early correlation output ++++ \param P_out Prompt correlation output ++++ \param L_out Late correlation output ++++ \param VL_out Very Late correlation output ++++ \param num_points The number of complex values in vectors ++++ */ ++++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_u_sse4_1(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points) ++++{ ++++ const unsigned int sse_iters = num_points / 8; ++++ ++++ __m128i x, x_abs, y, y_aux, bb_signal_sample_aux, bb_signal_sample_aux_abs;; ++++ __m128i real_output, imag_output; ++++ __m128 real_VE_code_acc, imag_VE_code_acc, real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc, real_VL_code_acc, imag_VL_code_acc; ++++ __m128i input_i_1, input_i_2, output_i32; ++++ __m128 real_output_ps, imag_output_ps; ++++ ++++ __m128i check_sign_sequence = _mm_set_epi8 (255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1); ++++ __m128i rearrange_sequence = _mm_set_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1); ++++ __m128i mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); ++++ ++++ const lv_8sc_t* input_ptr = input; ++++ const lv_8sc_t* carrier_ptr = carrier; ++++ ++++ const lv_8sc_t* 
VE_code_ptr = VE_code; ++++ lv_32fc_t* VE_out_ptr = VE_out; ++++ const lv_8sc_t* E_code_ptr = E_code; ++++ lv_32fc_t* E_out_ptr = E_out; ++++ const lv_8sc_t* P_code_ptr = P_code; ++++ lv_32fc_t* P_out_ptr = P_out; ++++ const lv_8sc_t* L_code_ptr = L_code; ++++ lv_32fc_t* L_out_ptr = L_out; ++++ const lv_8sc_t* VL_code_ptr = VL_code; ++++ lv_32fc_t* VL_out_ptr = VL_out; ++++ ++++ float VE_out_real = 0; ++++ float VE_out_imag = 0; ++++ float E_out_real = 0; ++++ float E_out_imag = 0; ++++ float P_out_real = 0; ++++ float P_out_imag = 0; ++++ float L_out_real = 0; ++++ float L_out_imag = 0; ++++ float VL_out_real = 0; ++++ float VL_out_imag = 0; ++++ ++++ real_VE_code_acc = _mm_setzero_ps(); ++++ imag_VE_code_acc = _mm_setzero_ps(); ++++ real_E_code_acc = _mm_setzero_ps(); ++++ imag_E_code_acc = _mm_setzero_ps(); ++++ real_P_code_acc = _mm_setzero_ps(); ++++ imag_P_code_acc = _mm_setzero_ps(); ++++ real_L_code_acc = _mm_setzero_ps(); ++++ imag_L_code_acc = _mm_setzero_ps(); ++++ real_VL_code_acc = _mm_setzero_ps(); ++++ imag_VL_code_acc = _mm_setzero_ps(); ++++ ++++ if (sse_iters>0) ++++ { ++++ for(int number = 0;number < sse_iters; number++){ ++++ ++++ //Perform the carrier wipe-off ++++ x = _mm_lddqu_si128((__m128i*)input_ptr); ++++ y = _mm_lddqu_si128((__m128i*)carrier_ptr); ++++ ++++ x_abs = _mm_abs_epi8 (x); ++++ ++++ CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3(y, x, check_sign_sequence, rearrange_sequence, y_aux, x_abs, real_output, imag_output) ++++ ++++ imag_output = _mm_slli_si128 (imag_output, 1); ++++ bb_signal_sample_aux = _mm_blendv_epi8 (imag_output, real_output, mult1); ++++ bb_signal_sample_aux_abs = _mm_abs_epi8 (bb_signal_sample_aux); ++++ ++++ //Get very early values ++++ y = _mm_lddqu_si128((__m128i*)VE_code_ptr); ++++ ++++ CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, 
imag_output_ps) ++++ ++++ real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps); ++++ imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps); ++++ ++++ //Get early values ++++ y = _mm_lddqu_si128((__m128i*)E_code_ptr); ++++ ++++ CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) ++++ ++++ real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps); ++++ imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps); ++++ ++++ //Get prompt values ++++ y = _mm_lddqu_si128((__m128i*)P_code_ptr); ++++ ++++ CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) ++++ ++++ real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps); ++++ imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps); ++++ ++++ //Get late values ++++ y = _mm_lddqu_si128((__m128i*)L_code_ptr); ++++ ++++ CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) ++++ ++++ real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps); ++++ imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps); ++++ ++++ //Get very late values ++++ y = _mm_lddqu_si128((__m128i*)VL_code_ptr); ++++ ++++ CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) ++++ ++++ real_VL_code_acc = _mm_add_ps (real_VL_code_acc, real_output_ps); ++++ imag_VL_code_acc = _mm_add_ps 
(imag_VL_code_acc, imag_output_ps); ++++ ++++ input_ptr += 8; ++++ carrier_ptr += 8; ++++ VE_code_ptr += 8; ++++ E_code_ptr += 8; ++++ P_code_ptr += 8; ++++ L_code_ptr += 8; ++++ VL_code_ptr += 8; ++++ } ++++ ++++ __VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4]; ++++ ++++ _mm_storeu_ps((float*)real_VE_dotProductVector,real_VE_code_acc); // Store the results back into the dot product vector ++++ _mm_storeu_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc); // Store the results back into the dot product vector ++++ _mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector ++++ _mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector ++++ _mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector ++++ _mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector ++++ _mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector ++++ _mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector ++++ _mm_storeu_ps((float*)real_VL_dotProductVector,real_VL_code_acc); // Store the results back into the dot product vector ++++ 
_mm_storeu_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc); // Store the results back into the dot product vector ++++ ++++ for (int i = 0; i<4; ++i) ++++ { ++++ VE_out_real += real_VE_dotProductVector[i]; ++++ VE_out_imag += imag_VE_dotProductVector[i]; ++++ E_out_real += real_E_dotProductVector[i]; ++++ E_out_imag += imag_E_dotProductVector[i]; ++++ P_out_real += real_P_dotProductVector[i]; ++++ P_out_imag += imag_P_dotProductVector[i]; ++++ L_out_real += real_L_dotProductVector[i]; ++++ L_out_imag += imag_L_dotProductVector[i]; ++++ VL_out_real += real_VL_dotProductVector[i]; ++++ VL_out_imag += imag_VL_dotProductVector[i]; ++++ } ++++ *VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag); ++++ *E_out_ptr = lv_cmake(E_out_real, E_out_imag); ++++ *P_out_ptr = lv_cmake(P_out_real, P_out_imag); ++++ *L_out_ptr = lv_cmake(L_out_real, L_out_imag); ++++ *VL_out_ptr = lv_cmake(VL_out_real, VL_out_imag); ++++ } ++++ ++++ lv_16sc_t bb_signal_sample; ++++ for(int i=0; i < num_points%8; ++i) ++++ { ++++ //Perform the carrier wipe-off ++++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); ++++ // Now get very early, early, prompt, late and very late values for each ++++ *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VE_code_ptr++)); ++++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*E_code_ptr++)); ++++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*P_code_ptr++)); ++++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*L_code_ptr++)); ++++ *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VL_code_ptr++)); ++++ } ++++} ++++#endif /* LV_HAVE_SSE4_1 */ ++++ ++++#ifdef LV_HAVE_GENERIC ++++#include ++++#include ++++ ++++/*! 
++++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation ++++ \param input The input signal input ++++ \param carrier The carrier signal input ++++ \param VE_code Very Early PRN code replica input ++++ \param E_code Early PRN code replica input ++++ \param P_code Prompt PRN code replica input ++++ \param L_code Late PRN code replica input ++++ \param VL_code Very Late PRN code replica input ++++ \param VE_out Very Early correlation output ++++ \param E_out Early correlation output ++++ \param P_out Prompt correlation output ++++ \param L_out Late correlation output ++++ \param VL_out Very Late correlation output ++++ \param num_points The number of complex values in vectors ++++ */ ++++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points) ++++{ ++++ *VE_out = 0; ++++ *E_out = 0; ++++ *P_out = 0; ++++ *L_out = 0; ++++ *VL_out = 0; ++++ ++++ lv_16sc_t bb_signal_sample; ++++ ++++ for(int i=0; i < num_points; ++i) ++++ { ++++ //Perform the carrier wipe-off ++++ bb_signal_sample = input[i] * carrier[i]; ++++ // Now get very early, early, prompt, late and very late values for each ++++ *VE_out += (lv_32fc_t) (bb_signal_sample * VE_code[i]); ++++ *E_out += (lv_32fc_t) (bb_signal_sample * E_code[i]); ++++ *P_out += (lv_32fc_t) (bb_signal_sample * P_code[i]); ++++ *L_out += (lv_32fc_t) (bb_signal_sample * L_code[i]); ++++ *VL_out += (lv_32fc_t) (bb_signal_sample * VL_code[i]); ++++ } ++++} ++++#endif /* LV_HAVE_GENERIC */ ++++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_u_H */ ++++ ++++ ++++#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_a_H ++++#define 
INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_a_H ++++ ++++#include ++++#include ++++#include ++++#include ++++#include ++++ ++++#ifdef LV_HAVE_SSE4_1 ++++#include "smmintrin.h" ++++#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h" ++++#include "CommonMacros/CommonMacros.h" ++++/*! ++++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation ++++ \param input The input signal input ++++ \param carrier The carrier signal input ++++ \param VE_code Very Early PRN code replica input ++++ \param E_code Early PRN code replica input ++++ \param P_code Prompt PRN code replica input ++++ \param L_code Late PRN code replica input ++++ \param VL_code Very Late PRN code replica input ++++ \param VE_out Very Early correlation output ++++ \param E_out Early correlation output ++++ \param P_out Prompt correlation output ++++ \param L_out Late correlation output ++++ \param VL_out Very Late correlation output ++++ \param num_points The number of complex values in vectors ++++ */ ++++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_a_sse4_1(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points) ++++{ ++++ const unsigned int sse_iters = num_points / 8; ++++ ++++ __m128i x, x_abs, y, y_aux, bb_signal_sample_aux, bb_signal_sample_aux_abs;; ++++ __m128i real_output, imag_output; ++++ __m128 real_VE_code_acc, imag_VE_code_acc, real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc, real_VL_code_acc, imag_VL_code_acc; ++++ __m128i input_i_1, input_i_2, output_i32; ++++ __m128 real_output_ps, imag_output_ps; ++++ ++++ __m128i check_sign_sequence = _mm_set_epi8 (255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1); ++++ __m128i 
rearrange_sequence = _mm_set_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1); ++++ __m128i mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); ++++ ++++ const lv_8sc_t* input_ptr = input; ++++ const lv_8sc_t* carrier_ptr = carrier; ++++ ++++ const lv_8sc_t* VE_code_ptr = VE_code; ++++ lv_32fc_t* VE_out_ptr = VE_out; ++++ const lv_8sc_t* E_code_ptr = E_code; ++++ lv_32fc_t* E_out_ptr = E_out; ++++ const lv_8sc_t* P_code_ptr = P_code; ++++ lv_32fc_t* P_out_ptr = P_out; ++++ const lv_8sc_t* L_code_ptr = L_code; ++++ lv_32fc_t* L_out_ptr = L_out; ++++ const lv_8sc_t* VL_code_ptr = VL_code; ++++ lv_32fc_t* VL_out_ptr = VL_out; ++++ ++++ float VE_out_real = 0; ++++ float VE_out_imag = 0; ++++ float E_out_real = 0; ++++ float E_out_imag = 0; ++++ float P_out_real = 0; ++++ float P_out_imag = 0; ++++ float L_out_real = 0; ++++ float L_out_imag = 0; ++++ float VL_out_real = 0; ++++ float VL_out_imag = 0; ++++ ++++ real_VE_code_acc = _mm_setzero_ps(); ++++ imag_VE_code_acc = _mm_setzero_ps(); ++++ real_E_code_acc = _mm_setzero_ps(); ++++ imag_E_code_acc = _mm_setzero_ps(); ++++ real_P_code_acc = _mm_setzero_ps(); ++++ imag_P_code_acc = _mm_setzero_ps(); ++++ real_L_code_acc = _mm_setzero_ps(); ++++ imag_L_code_acc = _mm_setzero_ps(); ++++ real_VL_code_acc = _mm_setzero_ps(); ++++ imag_VL_code_acc = _mm_setzero_ps(); ++++ ++++ if (sse_iters>0) ++++ { ++++ for(int number = 0;number < sse_iters; number++){ ++++ ++++ //Perform the carrier wipe-off ++++ x = _mm_load_si128((__m128i*)input_ptr); ++++ y = _mm_load_si128((__m128i*)carrier_ptr); ++++ ++++ x_abs = _mm_abs_epi8 (x); ++++ ++++ CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3(y, x, check_sign_sequence, rearrange_sequence, y_aux, x_abs, real_output, imag_output) ++++ ++++ imag_output = _mm_slli_si128 (imag_output, 1); ++++ bb_signal_sample_aux = _mm_blendv_epi8 (imag_output, real_output, mult1); ++++ bb_signal_sample_aux_abs = _mm_abs_epi8 (bb_signal_sample_aux); ++++ ++++ //Get very 
early values ++++ y = _mm_load_si128((__m128i*)VE_code_ptr); ++++ ++++ CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) ++++ ++++ real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps); ++++ imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps); ++++ ++++ //Get early values ++++ y = _mm_load_si128((__m128i*)E_code_ptr); ++++ ++++ CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) ++++ ++++ real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps); ++++ imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps); ++++ ++++ //Get prompt values ++++ y = _mm_load_si128((__m128i*)P_code_ptr); ++++ ++++ CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) ++++ ++++ real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps); ++++ imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps); ++++ ++++ //Get late values ++++ y = _mm_load_si128((__m128i*)L_code_ptr); ++++ ++++ CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) ++++ ++++ real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps); ++++ imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps); ++++ ++++ //Get very late values ++++ y = _mm_load_si128((__m128i*)VL_code_ptr); ++++ ++++ CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, 
check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps) ++++ ++++ real_VL_code_acc = _mm_add_ps (real_VL_code_acc, real_output_ps); ++++ imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps); ++++ ++++ input_ptr += 8; ++++ carrier_ptr += 8; ++++ VE_code_ptr += 8; ++++ E_code_ptr += 8; ++++ P_code_ptr += 8; ++++ L_code_ptr += 8; ++++ VL_code_ptr += 8; ++++ } ++++ ++++ __VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4]; ++++ __VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4]; ++++ ++++ _mm_store_ps((float*)real_VE_dotProductVector,real_VE_code_acc); // Store the results back into the dot product vector ++++ _mm_store_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc); // Store the results back into the dot product vector ++++ _mm_store_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector ++++ _mm_store_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector ++++ _mm_store_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector ++++ _mm_store_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector ++++ _mm_store_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector ++++ 
_mm_store_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector ++++ _mm_store_ps((float*)real_VL_dotProductVector,real_VL_code_acc); // Store the results back into the dot product vector ++++ _mm_store_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc); // Store the results back into the dot product vector ++++ ++++ for (int i = 0; i<4; ++i) ++++ { ++++ VE_out_real += real_VE_dotProductVector[i]; ++++ VE_out_imag += imag_VE_dotProductVector[i]; ++++ E_out_real += real_E_dotProductVector[i]; ++++ E_out_imag += imag_E_dotProductVector[i]; ++++ P_out_real += real_P_dotProductVector[i]; ++++ P_out_imag += imag_P_dotProductVector[i]; ++++ L_out_real += real_L_dotProductVector[i]; ++++ L_out_imag += imag_L_dotProductVector[i]; ++++ VL_out_real += real_VL_dotProductVector[i]; ++++ VL_out_imag += imag_VL_dotProductVector[i]; ++++ } ++++ *VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag); ++++ *E_out_ptr = lv_cmake(E_out_real, E_out_imag); ++++ *P_out_ptr = lv_cmake(P_out_real, P_out_imag); ++++ *L_out_ptr = lv_cmake(L_out_real, L_out_imag); ++++ *VL_out_ptr = lv_cmake(VL_out_real, VL_out_imag); ++++ } ++++ ++++ lv_16sc_t bb_signal_sample; ++++ for(int i=0; i < num_points%8; ++i) ++++ { ++++ //Perform the carrier wipe-off ++++ bb_signal_sample = (*input_ptr++) * (*carrier_ptr++); ++++ // Now get very early, early, prompt, late and very late values for each ++++ *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VE_code_ptr++)); ++++ *E_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*E_code_ptr++)); ++++ *P_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*P_code_ptr++)); ++++ *L_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*L_code_ptr++)); ++++ *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VL_code_ptr++)); ++++ } ++++} ++++#endif /* LV_HAVE_SSE4_1 */ ++++ ++++#ifdef LV_HAVE_GENERIC ++++#include ++++#include ++++ ++++/*! 
++++ \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation ++++ \param input The input signal input ++++ \param carrier The carrier signal input ++++ \param VE_code Very Early PRN code replica input ++++ \param E_code Early PRN code replica input ++++ \param P_code Prompt PRN code replica input ++++ \param L_code Late PRN code replica input ++++ \param VL_code Very Late PRN code replica input ++++ \param VE_out Very Early correlation output ++++ \param E_out Early correlation output ++++ \param P_out Prompt correlation output ++++ \param L_out Late correlation output ++++ \param VL_out Very Late correlation output ++++ \param num_points The number of complex values in vectors ++++ */ ++++static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_a_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points) ++++{ ++++ *VE_out = 0; ++++ *E_out = 0; ++++ *P_out = 0; ++++ *L_out = 0; ++++ *VL_out = 0; ++++ ++++ lv_16sc_t bb_signal_sample; ++++ ++++ for(int i=0; i < num_points; ++i) ++++ { ++++ //Perform the carrier wipe-off ++++ bb_signal_sample = input[i] * carrier[i]; ++++ // Now get very early, early, prompt, late and very late values for each ++++ *VE_out += (lv_32fc_t) (bb_signal_sample * VE_code[i]); ++++ *E_out += (lv_32fc_t) (bb_signal_sample * E_code[i]); ++++ *P_out += (lv_32fc_t) (bb_signal_sample * P_code[i]); ++++ *L_out += (lv_32fc_t) (bb_signal_sample * L_code[i]); ++++ *VL_out += (lv_32fc_t) (bb_signal_sample * VL_code[i]); ++++ } ++++} ++++#endif /* LV_HAVE_GENERIC */ ++++#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_a_H */ +++\ No newline at end of file +++diff -rupN 
/Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8u_x2_multiply_8u.h /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8u_x2_multiply_8u.h
+++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8u_x2_multiply_8u.h 1970-01-01 01:00:00.000000000 +0100
++++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_8u_x2_multiply_8u.h 2014-10-15 01:55:08.000000000 +0200
+++@@ -0,0 +1,210 @@
++++/*!
++++ * \file volk_gnsssdr_8u_x2_multiply_8u.h
++++ * \brief Volk protokernel: multiplies unsigned char values
++++ * \authors <ul>
++++ *          <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
++++ *          </ul>
++++ *
++++ * Volk protokernel that multiplies unsigned char values (8 bits data)
++++ *
++++ * -------------------------------------------------------------------------
++++ *
++++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
++++ *
++++ * GNSS-SDR is a software defined Global Navigation
++++ * Satellite Systems receiver
++++ *
++++ * This file is part of GNSS-SDR.
++++ *
++++ * GNSS-SDR is free software: you can redistribute it and/or modify
++++ * it under the terms of the GNU General Public License as published by
++++ * the Free Software Foundation, either version 3 of the License, or
++++ * (at your option) any later version.
++++ *
++++ * GNSS-SDR is distributed in the hope that it will be useful,
++++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++++ * GNU General Public License for more details.
++++ *
++++ * You should have received a copy of the GNU General Public License
++++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
++++ *
++++ * -------------------------------------------------------------------------
++++ */
++++
++++#ifndef INCLUDED_volk_gnsssdr_8u_x2_multiply_8u_u_H
++++#define INCLUDED_volk_gnsssdr_8u_x2_multiply_8u_u_H
++++
++++#include
++++#include
++++
++++#ifdef LV_HAVE_SSE3
++++#include
++++#include
++++/*!
++++ \brief Multiplies the two input unsigned char values and stores their results in the third unisgned char ++++ \param cChar The unsigned char where the results will be stored ++++ \param aChar One of the unsigned char to be multiplied ++++ \param bChar One of the unsigned char to be multiplied ++++ \param num_points The number of unsigned char values in aChar and bChar to be multiplied together and stored into cChar ++++ */ ++++static inline void volk_gnsssdr_8u_x2_multiply_8u_u_sse3(unsigned char* cChar, const unsigned char* aChar, const unsigned char* bChar, unsigned int num_points){ ++++ ++++ const unsigned int sse_iters = num_points / 16; ++++ ++++ __m128i x, y, x1, x2, y1, y2, mult1, x1_mult_y1, x2_mult_y2, tmp, tmp1, tmp2, totalc; ++++ unsigned char* c = cChar; ++++ const unsigned char* a = aChar; ++++ const unsigned char* b = bChar; ++++ ++++ for(int number = 0;number < sse_iters; number++){ ++++ x = _mm_lddqu_si128((__m128i*)a); ++++ y = _mm_lddqu_si128((__m128i*)b); ++++ ++++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); ++++ x1 = _mm_srli_si128 (x, 1); ++++ x1 = _mm_and_si128 (x1, mult1); ++++ x2 = _mm_and_si128 (x, mult1); ++++ ++++ y1 = _mm_srli_si128 (y, 1); ++++ y1 = _mm_and_si128 (y1, mult1); ++++ y2 = _mm_and_si128 (y, mult1); ++++ ++++ x1_mult_y1 = _mm_mullo_epi16 (x1, y1); ++++ x2_mult_y2 = _mm_mullo_epi16 (x2, y2); ++++ ++++ tmp = _mm_and_si128 (x1_mult_y1, mult1); ++++ tmp1 = _mm_slli_si128 (tmp, 1); ++++ tmp2 = _mm_and_si128 (x2_mult_y2, mult1); ++++ totalc = _mm_or_si128 (tmp1, tmp2); ++++ ++++ _mm_storeu_si128((__m128i*)c, totalc); ++++ ++++ a += 16; ++++ b += 16; ++++ c += 16; ++++ } ++++ ++++ for (int i = 0; i<(num_points % 16); ++i) ++++ { ++++ *c++ = (*a++) * (*b++); ++++ } ++++} ++++#endif /* LV_HAVE_SSE3 */ ++++ ++++#ifdef LV_HAVE_GENERIC ++++/*! 
++++ \brief Multiplies the two input unsigned char values and stores their results in the third unisgned char ++++ \param cChar The unsigned char where the results will be stored ++++ \param aChar One of the unsigned char to be multiplied ++++ \param bChar One of the unsigned char to be multiplied ++++ \param num_points The number of unsigned char values in aChar and bChar to be multiplied together and stored into cChar ++++ */ ++++static inline void volk_gnsssdr_8u_x2_multiply_8u_generic(unsigned char* cChar, const unsigned char* aChar, const unsigned char* bChar, unsigned int num_points){ ++++ unsigned char* cPtr = cChar; ++++ const unsigned char* aPtr = aChar; ++++ const unsigned char* bPtr = bChar; ++++ ++++ for(int number = 0; number < num_points; number++){ ++++ *cPtr++ = (*aPtr++) * (*bPtr++); ++++ } ++++} ++++#endif /* LV_HAVE_GENERIC */ ++++ ++++#endif /* INCLUDED_volk_gnsssdr_8u_x2_multiply_8u_u_H */ ++++ ++++ ++++#ifndef INCLUDED_volk_gnsssdr_8u_x2_multiply_8u_a_H ++++#define INCLUDED_volk_gnsssdr_8u_x2_multiply_8u_a_H ++++ ++++#include ++++#include ++++ ++++#ifdef LV_HAVE_SSE3 ++++#include ++++#include ++++/*! 
++++ \brief Multiplies the two input unsigned char values and stores their results in the third unisgned char ++++ \param cChar The unsigned char where the results will be stored ++++ \param aChar One of the unsigned char to be multiplied ++++ \param bChar One of the unsigned char to be multiplied ++++ \param num_points The number of unsigned char values in aChar and bChar to be multiplied together and stored into cChar ++++ */ ++++static inline void volk_gnsssdr_8u_x2_multiply_8u_a_sse3(unsigned char* cChar, const unsigned char* aChar, const unsigned char* bChar, unsigned int num_points){ ++++ ++++ const unsigned int sse_iters = num_points / 16; ++++ ++++ __m128i x, y, x1, x2, y1, y2, mult1, x1_mult_y1, x2_mult_y2, tmp, tmp1, tmp2, totalc; ++++ unsigned char* c = cChar; ++++ const unsigned char* a = aChar; ++++ const unsigned char* b = bChar; ++++ ++++ for(int number = 0;number < sse_iters; number++){ ++++ x = _mm_load_si128((__m128i*)a); ++++ y = _mm_load_si128((__m128i*)b); ++++ ++++ mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); ++++ x1 = _mm_srli_si128 (x, 1); ++++ x1 = _mm_and_si128 (x1, mult1); ++++ x2 = _mm_and_si128 (x, mult1); ++++ ++++ y1 = _mm_srli_si128 (y, 1); ++++ y1 = _mm_and_si128 (y1, mult1); ++++ y2 = _mm_and_si128 (y, mult1); ++++ ++++ x1_mult_y1 = _mm_mullo_epi16 (x1, y1); ++++ x2_mult_y2 = _mm_mullo_epi16 (x2, y2); ++++ ++++ tmp = _mm_and_si128 (x1_mult_y1, mult1); ++++ tmp1 = _mm_slli_si128 (tmp, 1); ++++ tmp2 = _mm_and_si128 (x2_mult_y2, mult1); ++++ totalc = _mm_or_si128 (tmp1, tmp2); ++++ ++++ _mm_store_si128((__m128i*)c, totalc); ++++ ++++ a += 16; ++++ b += 16; ++++ c += 16; ++++ } ++++ ++++ for (int i = 0; i<(num_points % 16); ++i) ++++ { ++++ *c++ = (*a++) * (*b++); ++++ } ++++} ++++#endif /* LV_HAVE_SSE */ ++++ ++++#ifdef LV_HAVE_GENERIC ++++/*! 
++++ \brief Multiplies the two input unsigned char values and stores their results in the third unisgned char ++++ \param cChar The unsigned char where the results will be stored ++++ \param aChar One of the unsigned char to be multiplied ++++ \param bChar One of the unsigned char to be multiplied ++++ \param num_points The number of unsigned char values in aChar and bChar to be multiplied together and stored into cChar ++++ */ ++++static inline void volk_gnsssdr_8u_x2_multiply_8u_a_generic(unsigned char* cChar, const unsigned char* aChar, const unsigned char* bChar, unsigned int num_points){ ++++ unsigned char* cPtr = cChar; ++++ const unsigned char* aPtr = aChar; ++++ const unsigned char* bPtr = bChar; ++++ ++++ for(int number = 0; number < num_points; number++){ ++++ *cPtr++ = (*aPtr++) * (*bPtr++); ++++ } ++++} ++++#endif /* LV_HAVE_GENERIC */ ++++ ++++#ifdef LV_HAVE_ORC ++++/*! ++++ \brief Multiplies the two input unsigned char values and stores their results in the third unisgned char ++++ \param cChar The unsigned char where the results will be stored ++++ \param aChar One of the unsigned char to be multiplied ++++ \param bChar One of the unsigned char to be multiplied ++++ \param num_points The number of unsigned char values in aChar and bChar to be multiplied together and stored into cChar ++++ */ ++++extern void volk_gnsssdr_8u_x2_multiply_8u_a_orc_impl(unsigned char* cVector, const unsigned char* aVector, const unsigned char* bVector, unsigned int num_points); ++++static inline void volk_gnsssdr_8u_x2_multiply_8u_u_orc(unsigned char* cVector, const unsigned char* aVector, const unsigned char* bVector, unsigned int num_points){ ++++ volk_gnsssdr_8u_x2_multiply_8u_a_orc_impl(cVector, aVector, bVector, num_points); ++++} ++++#endif /* LV_HAVE_ORC */ ++++ ++++#endif /* INCLUDED_volk_gnsssdr_8u_x2_multiply_8u_a_H */ +++diff -rupN /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_s32f_x2_update_local_carrier_32fc.h 
/Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_s32f_x2_update_local_carrier_32fc.h
+++--- /Users/andres/Desktop/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_s32f_x2_update_local_carrier_32fc.h 1970-01-01 01:00:00.000000000 +0100
++++++ /Users/andres/Desktop/volk_gnsssdr_original/kernels/volk_gnsssdr/volk_gnsssdr_s32f_x2_update_local_carrier_32fc.h 2014-10-17 01:53:55.000000000 +0200
+++@@ -0,0 +1,866 @@
++++/*!
++++ * \file volk_gnsssdr_s32f_x2_update_local_carrier_32fc.h
++++ * \brief Volk protokernel: replaces the tracking function for update_local_carrier. Algorithm by Julien Pommier and Giovanni Garberoglio, modified by Andrés Cecilia.
++++ * \authors <ul>
++++ *          <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
++++ *          </ul>
++++ * ++++ * Volk protokernel that replaces the tracking function for update_local_carrier. Algorithm by Julien Pommier and Giovanni Garberoglio, modified by Andrés Cecilia. ++++ * ++++ * ------------------------------------------------------------------------- ++++ * ++++ * Copyright (C) 2007 Julien Pommier ++++ * ++++ * This software is provided 'as-is', without any express or implied ++++ * warranty. In no event will the authors be held liable for any damages ++++ * arising from the use of this software. ++++ * ++++ * Permission is granted to anyone to use this software for any purpose, ++++ * including commercial applications, and to alter it and redistribute it ++++ * freely, subject to the following restrictions: ++++ * ++++ * 1. The origin of this software must not be misrepresented; you must not ++++ * claim that you wrote the original software. If you use this software ++++ * in a product, an acknowledgment in the product documentation would be ++++ * appreciated but is not required. ++++ * 2. Altered source versions must be plainly marked as such, and must not be ++++ * misrepresented as being the original software. ++++ * 3. This notice may not be removed or altered from any source distribution. ++++ * ++++ *(this is the zlib license) ++++ * ++++ * ------------------------------------------------------------------------- ++++ * ++++ * Copyright (C) 2012 Giovanni Garberoglio ++++ * Interdisciplinary Laboratory for Computational Science (LISC) ++++ * Fondazione Bruno Kessler and University of Trento ++++ * via Sommarive, 18 ++++ * I-38123 Trento (Italy) ++++ * ++++ * ------------------------------------------------------------------------- ++++ * ++++ * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) ++++ * ++++ * GNSS-SDR is a software defined Global Navigation ++++ * Satellite Systems receiver ++++ * ++++ * This file is part of GNSS-SDR. 
++++ *
++++ * GNSS-SDR is free software: you can redistribute it and/or modify
++++ * it under the terms of the GNU General Public License as published by
++++ * the Free Software Foundation, either version 3 of the License, or
++++ * (at your option) any later version.
++++ *
++++ * GNSS-SDR is distributed in the hope that it will be useful,
++++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++++ * GNU General Public License for more details.
++++ *
++++ * You should have received a copy of the GNU General Public License
++++ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
++++ *
++++ * -------------------------------------------------------------------------
++++ */
++++
++++#ifndef INCLUDED_volk_gnsssdr_32fc_s32f_x2_update_local_carrier_32fc_u_H
++++#define INCLUDED_volk_gnsssdr_32fc_s32f_x2_update_local_carrier_32fc_u_H
++++
++++#include
++++#include
++++#include
++++
++++#ifdef LV_HAVE_AVX
++++#include
++++/*!
++++ \brief Accumulates the values in the input buffer ++++ \param result The accumulated result ++++ \param inputBuffer The buffer of data to be accumulated ++++ \param num_points The number of values in inputBuffer to be accumulated ++++ */ ++++static inline void volk_gnsssdr_s32f_x2_update_local_carrier_32fc_u_avx(lv_32fc_t* d_carr_sign, const float phase_rad_init, const float phase_step_rad, unsigned int num_points){ ++++ ++++// float* pointer1 = (float*)&phase_rad_init; ++++// *pointer1 = 0; ++++// float* pointer2 = (float*)&phase_step_rad; ++++// *pointer2 = 0.5; ++++ ++++ const unsigned int sse_iters = num_points / 8; ++++ ++++ __m256 _ps256_minus_cephes_DP1 = _mm256_set1_ps(-0.78515625f); ++++ __m256 _ps256_minus_cephes_DP2 = _mm256_set1_ps(-2.4187564849853515625e-4f); ++++ __m256 _ps256_minus_cephes_DP3 = _mm256_set1_ps(-3.77489497744594108e-8f); ++++ __m256 _ps256_sign_mask = _mm256_set1_ps(-0.f); ++++ __m128i _pi32avx_1 = _mm_set1_epi32(1); ++++ __m128i _pi32avx_inv1 = _mm_set1_epi32(~1); ++++ __m128i _pi32avx_2 = _mm_set1_epi32(2); ++++ __m128i _pi32avx_4 = _mm_set1_epi32(4); ++++ __m256 _ps256_cephes_FOPI = _mm256_set1_ps(1.27323954473516f); // 4 / PI ++++ __m256 _ps256_sincof_p0 = _mm256_set1_ps(-1.9515295891E-4f); ++++ __m256 _ps256_sincof_p1 = _mm256_set1_ps( 8.3321608736E-3f); ++++ __m256 _ps256_sincof_p2 = _mm256_set1_ps(-1.6666654611E-1f); ++++ __m256 _ps256_coscof_p0 = _mm256_set1_ps( 2.443315711809948E-005f); ++++ __m256 _ps256_coscof_p1 = _mm256_set1_ps(-1.388731625493765E-003f); ++++ __m256 _ps256_coscof_p2 = _mm256_set1_ps( 4.166664568298827E-002f); ++++ __m256 _ps256_1 = _mm256_set1_ps(1.f); ++++ __m256 _ps256_0p5 = _mm256_set1_ps(0.5f); ++++ ++++ __m256 phase_step_rad_array = _mm256_set1_ps(8*phase_step_rad); ++++ ++++ __m256 phase_rad_array, x, s, c, swap_sign_bit_sin, sign_bit_cos, poly_mask, z, tmp, y, y2, ysin1, ysin2; ++++ __m256 xmm1, xmm2, xmm3, sign_bit_sin; ++++ __m256i imm0, imm2, imm4; ++++ __m128i imm0_1, imm0_2, imm2_1, 
imm2_2, imm4_1, imm4_2; ++++ __VOLK_ATTR_ALIGNED(32) float sin_value[8]; ++++ __VOLK_ATTR_ALIGNED(32) float cos_value[8]; ++++ ++++ phase_rad_array = _mm256_set_ps (phase_rad_init+7*phase_step_rad, phase_rad_init+6*phase_step_rad, phase_rad_init+5*phase_step_rad, phase_rad_init+4*phase_step_rad, phase_rad_init+3*phase_step_rad, phase_rad_init+2*phase_step_rad, phase_rad_init+phase_step_rad, phase_rad_init); ++++ ++++ for(int i = 0; i < sse_iters; i++) ++++ { ++++ ++++ x = phase_rad_array; ++++ ++++ /* extract the sign bit (upper one) */ ++++ sign_bit_sin = _mm256_and_ps(x, _ps256_sign_mask); ++++ ++++ /* take the absolute value */ ++++ x = _mm256_xor_ps(x, sign_bit_sin); ++++ ++++ /* scale by 4/Pi */ ++++ y = _mm256_mul_ps(x, _ps256_cephes_FOPI); ++++ ++++ /* we use SSE2 routines to perform the integer ops */ ++++ ++++ //COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y),imm2_1,imm2_2); ++++ y = _mm256_cvttps_epi32(y); ++++ imm2_1 = _mm256_extractf128_ps (y, 0); ++++ imm2_2 = _mm256_extractf128_ps (y, 1); ++++ ++++ imm2_1 = _mm_add_epi32(imm2_1, _pi32avx_1); ++++ imm2_2 = _mm_add_epi32(imm2_2, _pi32avx_1); ++++ ++++ imm2_1 = _mm_and_si128(imm2_1, _pi32avx_inv1); ++++ imm2_2 = _mm_and_si128(imm2_2, _pi32avx_inv1); ++++ ++++ //COPY_XMM_TO_IMM(imm2_1,imm2_2,imm2); ++++ //_mm256_set_m128i not defined in some versions of immintrin.h ++++ //imm2 = _mm256_set_m128i (imm2_2, imm2_1); ++++ imm2 = _mm256_insertf128_si256(_mm256_castsi128_si256(imm2_1),(imm2_2),1); ++++ ++++ y = _mm256_cvtepi32_ps(imm2); ++++ ++++ imm4_1 = imm2_1; ++++ imm4_2 = imm2_2; ++++ ++++ imm0_1 = _mm_and_si128(imm2_1, _pi32avx_4); ++++ imm0_2 = _mm_and_si128(imm2_2, _pi32avx_4); ++++ ++++ imm0_1 = _mm_slli_epi32(imm0_1, 29); ++++ imm0_2 = _mm_slli_epi32(imm0_2, 29); ++++ ++++ //COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0); ++++ //_mm256_set_m128i not defined in some versions of immintrin.h ++++ //imm0 = _mm256_set_m128i (imm0_2, imm0_1); ++++ imm0 = _mm256_insertf128_si256(_mm256_castsi128_si256(imm0_1),(imm0_2),1); 
++++ ++++ imm2_1 = _mm_and_si128(imm2_1, _pi32avx_2); ++++ imm2_2 = _mm_and_si128(imm2_2, _pi32avx_2); ++++ ++++ imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128()); ++++ imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128()); ++++ ++++ //COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2); ++++ //_mm256_set_m128i not defined in some versions of immintrin.h ++++ //imm2 = _mm256_set_m128i (imm2_2, imm2_1); ++++ imm2 = _mm256_insertf128_si256(_mm256_castsi128_si256(imm2_1),(imm2_2),1); ++++ ++++ swap_sign_bit_sin = _mm256_castsi256_ps(imm0); ++++ poly_mask = _mm256_castsi256_ps(imm2); ++++ ++++ /* The magic pass: "Extended precision modular arithmetic" ++++ x = ((x - y * DP1) - y * DP2) - y * DP3; */ ++++ xmm1 = _ps256_minus_cephes_DP1; ++++ xmm2 = _ps256_minus_cephes_DP2; ++++ xmm3 = _ps256_minus_cephes_DP3; ++++ xmm1 = _mm256_mul_ps(y, xmm1); ++++ xmm2 = _mm256_mul_ps(y, xmm2); ++++ xmm3 = _mm256_mul_ps(y, xmm3); ++++ x = _mm256_add_ps(x, xmm1); ++++ x = _mm256_add_ps(x, xmm2); ++++ x = _mm256_add_ps(x, xmm3); ++++ ++++ imm4_1 = _mm_sub_epi32(imm4_1, _pi32avx_2); ++++ imm4_2 = _mm_sub_epi32(imm4_2, _pi32avx_2); ++++ ++++ imm4_1 = _mm_andnot_si128(imm4_1, _pi32avx_4); ++++ imm4_2 = _mm_andnot_si128(imm4_2, _pi32avx_4); ++++ ++++ imm4_1 = _mm_slli_epi32(imm4_1, 29); ++++ imm4_2 = _mm_slli_epi32(imm4_2, 29); ++++ ++++ //COPY_XMM_TO_IMM(imm4_1, imm4_2, imm4); ++++ //_mm256_set_m128i not defined in some versions of immintrin.h ++++ //imm4 = _mm256_set_m128i (imm4_2, imm4_1); ++++ imm4 = _mm256_insertf128_si256(_mm256_castsi128_si256(imm4_1),(imm4_2),1); ++++ ++++ sign_bit_cos = _mm256_castsi256_ps(imm4); ++++ ++++ sign_bit_sin = _mm256_xor_ps(sign_bit_sin, swap_sign_bit_sin); ++++ ++++ /* Evaluate the first polynom (0 <= x <= Pi/4) */ ++++ z = _mm256_mul_ps(x,x); ++++ y = _ps256_coscof_p0; ++++ ++++ y = _mm256_mul_ps(y, z); ++++ y = _mm256_add_ps(y, _ps256_coscof_p1); ++++ y = _mm256_mul_ps(y, z); ++++ y = _mm256_add_ps(y, _ps256_coscof_p2); ++++ y = _mm256_mul_ps(y, z); ++++ y = 
_mm256_mul_ps(y, z); ++++ tmp = _mm256_mul_ps(z, _ps256_0p5); ++++ y = _mm256_sub_ps(y, tmp); ++++ y = _mm256_add_ps(y, _ps256_1); ++++ ++++ /* Evaluate the second polynom (Pi/4 <= x <= 0) */ ++++ ++++ y2 = _ps256_sincof_p0; ++++ y2 = _mm256_mul_ps(y2, z); ++++ y2 = _mm256_add_ps(y2, _ps256_sincof_p1); ++++ y2 = _mm256_mul_ps(y2, z); ++++ y2 = _mm256_add_ps(y2, _ps256_sincof_p2); ++++ y2 = _mm256_mul_ps(y2, z); ++++ y2 = _mm256_mul_ps(y2, x); ++++ y2 = _mm256_add_ps(y2, x); ++++ ++++ /* select the correct result from the two polynoms */ ++++ xmm3 = poly_mask; ++++ ysin2 = _mm256_and_ps(xmm3, y2); ++++ ysin1 = _mm256_andnot_ps(xmm3, y); ++++ y2 = _mm256_sub_ps(y2,ysin2); ++++ y = _mm256_sub_ps(y, ysin1); ++++ ++++ xmm1 = _mm256_add_ps(ysin1,ysin2); ++++ xmm2 = _mm256_add_ps(y,y2); ++++ ++++ /* update the sign */ ++++ s = _mm256_xor_ps(xmm1, sign_bit_sin); ++++ c = _mm256_xor_ps(xmm2, sign_bit_cos); ++++ ++++ //GNSS-SDR needs to return -sin ++++ s = _mm256_xor_ps(s, _ps256_sign_mask); ++++ ++++ _mm256_storeu_ps ((float*)sin_value, s); ++++ _mm256_storeu_ps ((float*)cos_value, c); ++++ ++++ for(int i = 0; i < 8; i++) ++++ { ++++ d_carr_sign[i] = lv_cmake(cos_value[i], sin_value[i]); ++++ } ++++ d_carr_sign += 8; ++++ ++++ phase_rad_array = _mm256_add_ps (phase_rad_array, phase_step_rad_array); ++++ } ++++ ++++ if (num_points%8!=0) ++++ { ++++ __VOLK_ATTR_ALIGNED(32) float phase_rad_store[8]; ++++ _mm256_storeu_si256 ((float*)phase_rad_store, phase_rad_array); ++++ ++++ float phase_rad = phase_rad_store[0]; ++++ ++++ for(int i = 0; i < num_points%8; i++) ++++ { ++++ *d_carr_sign = lv_cmake(cos(phase_rad), -sin(phase_rad)); ++++ d_carr_sign++; ++++ phase_rad += phase_step_rad; ++++ } ++++ } ++++} ++++#endif /* LV_HAVE_AVX */ ++++ ++++ ++++#ifdef LV_HAVE_SSE2 ++++#include ++++/*! 
++++ \brief Accumulates the values in the input buffer ++++ \param result The accumulated result ++++ \param inputBuffer The buffer of data to be accumulated ++++ \param num_points The number of values in inputBuffer to be accumulated ++++*/ ++++static inline void volk_gnsssdr_s32f_x2_update_local_carrier_32fc_u_sse2(lv_32fc_t* d_carr_sign, const float phase_rad_init, const float phase_step_rad, unsigned int num_points){ ++++ ++++// float* pointer1 = (float*)&phase_rad_init; ++++// *pointer1 = 0; ++++// float* pointer2 = (float*)&phase_step_rad; ++++// *pointer2 = 0.5; ++++ ++++ const unsigned int sse_iters = num_points / 4; ++++ ++++ __m128 _ps_minus_cephes_DP1 = _mm_set1_ps(-0.78515625f); ++++ __m128 _ps_minus_cephes_DP2 = _mm_set1_ps(-2.4187564849853515625e-4f); ++++ __m128 _ps_minus_cephes_DP3 = _mm_set1_ps(-3.77489497744594108e-8f); ++++ __m128 _ps_sign_mask = _mm_set1_ps(-0.f); ++++ __m128i _pi32_1 = _mm_set1_epi32(1); ++++ __m128i _pi32_inv1 = _mm_set1_epi32(~1); ++++ __m128i _pi32_2 = _mm_set1_epi32(2); ++++ __m128i _pi32_4 = _mm_set1_epi32(4); ++++ __m128 _ps_cephes_FOPI = _mm_set1_ps(1.27323954473516f); // 4 / PI ++++ __m128 _ps_sincof_p0 = _mm_set1_ps(-1.9515295891E-4f); ++++ __m128 _ps_sincof_p1 = _mm_set1_ps( 8.3321608736E-3f); ++++ __m128 _ps_sincof_p2 = _mm_set1_ps(-1.6666654611E-1f); ++++ __m128 _ps_coscof_p0 = _mm_set1_ps( 2.443315711809948E-005f); ++++ __m128 _ps_coscof_p1 = _mm_set1_ps(-1.388731625493765E-003f); ++++ __m128 _ps_coscof_p2 = _mm_set1_ps( 4.166664568298827E-002f); ++++ __m128 _ps_1 = _mm_set1_ps(1.f); ++++ __m128 _ps_0p5 = _mm_set1_ps(0.5f); ++++ ++++ __m128 phase_step_rad_array = _mm_set1_ps(4*phase_step_rad); ++++ ++++ __m128 phase_rad_array, x, s, c, swap_sign_bit_sin, sign_bit_cos, poly_mask, z, tmp, y, y2, ysin1, ysin2; ++++ __m128 xmm1, xmm2, xmm3, sign_bit_sin; ++++ __m128i emm0, emm2, emm4; ++++ __VOLK_ATTR_ALIGNED(16) float sin_value[4]; ++++ __VOLK_ATTR_ALIGNED(16) float cos_value[4]; ++++ ++++ phase_rad_array = _mm_set_ps 
(phase_rad_init+3*phase_step_rad, phase_rad_init+2*phase_step_rad, phase_rad_init+phase_step_rad, phase_rad_init); ++++ ++++ for(int i = 0; i < sse_iters; i++) ++++ { ++++ x = phase_rad_array; ++++ ++++ /* extract the sign bit (upper one) */ ++++ sign_bit_sin = _mm_and_ps(x, _ps_sign_mask); ++++ ++++ /* take the absolute value */ ++++ x = _mm_xor_ps(x, sign_bit_sin); ++++ ++++ /* scale by 4/Pi */ ++++ y = _mm_mul_ps(x, _ps_cephes_FOPI); ++++ ++++ /* store the integer part of y in emm2 */ ++++ emm2 = _mm_cvttps_epi32(y); ++++ ++++ /* j=(j+1) & (~1) (see the cephes sources) */ ++++ emm2 = _mm_add_epi32(emm2, _pi32_1); ++++ emm2 = _mm_and_si128(emm2, _pi32_inv1); ++++ y = _mm_cvtepi32_ps(emm2); ++++ ++++ emm4 = emm2; ++++ ++++ /* get the swap sign flag for the sine */ ++++ emm0 = _mm_and_si128(emm2, _pi32_4); ++++ emm0 = _mm_slli_epi32(emm0, 29); ++++ swap_sign_bit_sin = _mm_castsi128_ps(emm0); ++++ ++++ /* get the polynom selection mask for the sine*/ ++++ emm2 = _mm_and_si128(emm2, _pi32_2); ++++ emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128()); ++++ poly_mask = _mm_castsi128_ps(emm2); ++++ ++++ /* The magic pass: "Extended precision modular arithmetic" ++++ x = ((x - y * DP1) - y * DP2) - y * DP3; */ ++++ xmm1 = _mm_mul_ps(y, _ps_minus_cephes_DP1); ++++ xmm2 = _mm_mul_ps(y, _ps_minus_cephes_DP2); ++++ xmm3 = _mm_mul_ps(y, _ps_minus_cephes_DP3); ++++ x = _mm_add_ps(_mm_add_ps(x, xmm1), _mm_add_ps(xmm2, xmm3)); ++++ ++++ emm4 = _mm_sub_epi32(emm4, _pi32_2); ++++ emm4 = _mm_andnot_si128(emm4, _pi32_4); ++++ emm4 = _mm_slli_epi32(emm4, 29); ++++ sign_bit_cos = _mm_castsi128_ps(emm4); ++++ ++++ sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin); ++++ ++++ /* Evaluate the first polynom (0 <= x <= Pi/4) */ ++++ z = _mm_mul_ps(x,x); ++++ y = _ps_coscof_p0; ++++ y = _mm_mul_ps(y, z); ++++ y = _mm_add_ps(y, _ps_coscof_p1); ++++ y = _mm_mul_ps(y, z); ++++ y = _mm_add_ps(y, _ps_coscof_p2); ++++ y = _mm_mul_ps(y, _mm_mul_ps(z, z)); ++++ tmp = _mm_mul_ps(z, _ps_0p5); 
++++ y = _mm_sub_ps(y, tmp); ++++ y = _mm_add_ps(y, _ps_1); ++++ ++++ /* Evaluate the second polynom (Pi/4 <= x <= 0) */ ++++ y2 = _ps_sincof_p0; ++++ y2 = _mm_mul_ps(y2, z); ++++ y2 = _mm_add_ps(y2, _ps_sincof_p1); ++++ y2 = _mm_mul_ps(y2, z); ++++ y2 = _mm_add_ps(y2, _ps_sincof_p2); ++++ y2 = _mm_mul_ps(y2, _mm_mul_ps(z, x)); ++++ y2 = _mm_add_ps(y2, x); ++++ ++++ /* select the correct result from the two polynoms */ ++++ xmm3 = poly_mask; ++++ ysin2 = _mm_and_ps(xmm3, y2); ++++ ysin1 = _mm_andnot_ps(xmm3, y); ++++ y2 = _mm_sub_ps(y2,ysin2); ++++ y = _mm_sub_ps(y, ysin1); ++++ ++++ xmm1 = _mm_add_ps(ysin1,ysin2); ++++ xmm2 = _mm_add_ps(y,y2); ++++ ++++ /* update the sign */ ++++ s = _mm_xor_ps(xmm1, sign_bit_sin); ++++ c = _mm_xor_ps(xmm2, sign_bit_cos); ++++ ++++ //GNSS-SDR needs to return -sin ++++ s = _mm_xor_ps(s, _ps_sign_mask); ++++ ++++ _mm_storeu_ps ((float*)sin_value, s); ++++ _mm_storeu_ps ((float*)cos_value, c); ++++ ++++ for(int i = 0; i < 4; i++) ++++ { ++++ d_carr_sign[i] = lv_cmake(cos_value[i], sin_value[i]); ++++ } ++++ d_carr_sign += 4; ++++ ++++ phase_rad_array = _mm_add_ps (phase_rad_array, phase_step_rad_array); ++++ } ++++ ++++ if (num_points%4!=0) ++++ { ++++ __VOLK_ATTR_ALIGNED(16) float phase_rad_store[4]; ++++ _mm_storeu_si128 ((__m128i*)phase_rad_store, phase_rad_array); ++++ ++++ float phase_rad = phase_rad_store[0]; ++++ ++++ for(int i = 0; i < num_points%4; i++) ++++ { ++++ *d_carr_sign = lv_cmake(cos(phase_rad), -sin(phase_rad)); ++++ d_carr_sign++; ++++ phase_rad += phase_step_rad; ++++ } ++++ } ++++} ++++#endif /* LV_HAVE_SSE2 */ ++++ ++++#ifdef LV_HAVE_GENERIC ++++/*! 
++++ \brief Accumulates the values in the input buffer ++++ \param result The accumulated result ++++ \param inputBuffer The buffer of data to be accumulated ++++ \param num_points The number of values in inputBuffer to be accumulated ++++*/ ++++static inline void volk_gnsssdr_s32f_x2_update_local_carrier_32fc_generic(lv_32fc_t* d_carr_sign, const float phase_rad_init, const float phase_step_rad, unsigned int num_points){ ++++ ++++// float* pointer1 = (float*)&phase_rad_init; ++++// *pointer1 = 0; ++++// float* pointer2 = (float*)&phase_step_rad; ++++// *pointer2 = 0.5; ++++ ++++ float phase_rad = phase_rad_init; ++++ for(int i = 0; i < num_points; i++) ++++ { ++++ *d_carr_sign = lv_cmake(cos(phase_rad), -sin(phase_rad)); ++++ d_carr_sign++; ++++ phase_rad += phase_step_rad; ++++ } ++++} ++++#endif /* LV_HAVE_GENERIC */ ++++#endif /* INCLUDED_volk_gnsssdr_32fc_s32f_x2_update_local_carrier_32fc_u_H */ ++++ ++++ ++++#ifndef INCLUDED_volk_gnsssdr_32fc_s32f_x2_update_local_carrier_32fc_a_H ++++#define INCLUDED_volk_gnsssdr_32fc_s32f_x2_update_local_carrier_32fc_a_H ++++ ++++#include ++++#include ++++#include ++++ ++++#ifdef LV_HAVE_AVX ++++#include ++++/*! 
++++ \brief Accumulates the values in the input buffer ++++ \param result The accumulated result ++++ \param inputBuffer The buffer of data to be accumulated ++++ \param num_points The number of values in inputBuffer to be accumulated ++++ */ ++++static inline void volk_gnsssdr_s32f_x2_update_local_carrier_32fc_a_avx(lv_32fc_t* d_carr_sign, const float phase_rad_init, const float phase_step_rad, unsigned int num_points){ ++++ ++++ // float* pointer1 = (float*)&phase_rad_init; ++++ // *pointer1 = 0; ++++ // float* pointer2 = (float*)&phase_step_rad; ++++ // *pointer2 = 0.5; ++++ ++++ const unsigned int sse_iters = num_points / 8; ++++ ++++ __m256 _ps256_minus_cephes_DP1 = _mm256_set1_ps(-0.78515625f); ++++ __m256 _ps256_minus_cephes_DP2 = _mm256_set1_ps(-2.4187564849853515625e-4f); ++++ __m256 _ps256_minus_cephes_DP3 = _mm256_set1_ps(-3.77489497744594108e-8f); ++++ __m256 _ps256_sign_mask = _mm256_set1_ps(-0.f); ++++ __m128i _pi32avx_1 = _mm_set1_epi32(1); ++++ __m128i _pi32avx_inv1 = _mm_set1_epi32(~1); ++++ __m128i _pi32avx_2 = _mm_set1_epi32(2); ++++ __m128i _pi32avx_4 = _mm_set1_epi32(4); ++++ __m256 _ps256_cephes_FOPI = _mm256_set1_ps(1.27323954473516f); // 4 / PI ++++ __m256 _ps256_sincof_p0 = _mm256_set1_ps(-1.9515295891E-4f); ++++ __m256 _ps256_sincof_p1 = _mm256_set1_ps( 8.3321608736E-3f); ++++ __m256 _ps256_sincof_p2 = _mm256_set1_ps(-1.6666654611E-1f); ++++ __m256 _ps256_coscof_p0 = _mm256_set1_ps( 2.443315711809948E-005f); ++++ __m256 _ps256_coscof_p1 = _mm256_set1_ps(-1.388731625493765E-003f); ++++ __m256 _ps256_coscof_p2 = _mm256_set1_ps( 4.166664568298827E-002f); ++++ __m256 _ps256_1 = _mm256_set1_ps(1.f); ++++ __m256 _ps256_0p5 = _mm256_set1_ps(0.5f); ++++ ++++ __m256 phase_step_rad_array = _mm256_set1_ps(8*phase_step_rad); ++++ ++++ __m256 phase_rad_array, x, s, c, swap_sign_bit_sin, sign_bit_cos, poly_mask, z, tmp, y, y2, ysin1, ysin2; ++++ __m256 xmm1, xmm2, xmm3, sign_bit_sin; ++++ __m256i imm0, imm2, imm4; ++++ __m128i imm0_1, imm0_2, imm2_1, 
imm2_2, imm4_1, imm4_2; ++++ __VOLK_ATTR_ALIGNED(32) float sin_value[8]; ++++ __VOLK_ATTR_ALIGNED(32) float cos_value[8]; ++++ ++++ phase_rad_array = _mm256_set_ps (phase_rad_init+7*phase_step_rad, phase_rad_init+6*phase_step_rad, phase_rad_init+5*phase_step_rad, phase_rad_init+4*phase_step_rad, phase_rad_init+3*phase_step_rad, phase_rad_init+2*phase_step_rad, phase_rad_init+phase_step_rad, phase_rad_init); ++++ ++++ for(int i = 0; i < sse_iters; i++) ++++ { ++++ ++++ x = phase_rad_array; ++++ ++++ /* extract the sign bit (upper one) */ ++++ sign_bit_sin = _mm256_and_ps(x, _ps256_sign_mask); ++++ ++++ /* take the absolute value */ ++++ x = _mm256_xor_ps(x, sign_bit_sin); ++++ ++++ /* scale by 4/Pi */ ++++ y = _mm256_mul_ps(x, _ps256_cephes_FOPI); ++++ ++++ /* we use SSE2 routines to perform the integer ops */ ++++ ++++ //COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y),imm2_1,imm2_2); ++++ y = _mm256_cvttps_epi32(y); ++++ imm2_1 = _mm256_extractf128_ps (y, 0); ++++ imm2_2 = _mm256_extractf128_ps (y, 1); ++++ ++++ imm2_1 = _mm_add_epi32(imm2_1, _pi32avx_1); ++++ imm2_2 = _mm_add_epi32(imm2_2, _pi32avx_1); ++++ ++++ imm2_1 = _mm_and_si128(imm2_1, _pi32avx_inv1); ++++ imm2_2 = _mm_and_si128(imm2_2, _pi32avx_inv1); ++++ ++++ //COPY_XMM_TO_IMM(imm2_1,imm2_2,imm2); ++++ //_mm256_set_m128i not defined in some versions of immintrin.h ++++ //imm2 = _mm256_set_m128i (imm2_2, imm2_1); ++++ imm2 = _mm256_insertf128_si256(_mm256_castsi128_si256(imm2_1),(imm2_2),1); ++++ ++++ y = _mm256_cvtepi32_ps(imm2); ++++ ++++ imm4_1 = imm2_1; ++++ imm4_2 = imm2_2; ++++ ++++ imm0_1 = _mm_and_si128(imm2_1, _pi32avx_4); ++++ imm0_2 = _mm_and_si128(imm2_2, _pi32avx_4); ++++ ++++ imm0_1 = _mm_slli_epi32(imm0_1, 29); ++++ imm0_2 = _mm_slli_epi32(imm0_2, 29); ++++ ++++ //COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0); ++++ //_mm256_set_m128i not defined in some versions of immintrin.h ++++ //imm0 = _mm256_set_m128i (imm0_2, imm0_1); ++++ imm0 = _mm256_insertf128_si256(_mm256_castsi128_si256(imm0_1),(imm0_2),1); 
++++ ++++ imm2_1 = _mm_and_si128(imm2_1, _pi32avx_2); ++++ imm2_2 = _mm_and_si128(imm2_2, _pi32avx_2); ++++ ++++ imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128()); ++++ imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128()); ++++ ++++ //COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2); ++++ //_mm256_set_m128i not defined in some versions of immintrin.h ++++ //imm2 = _mm256_set_m128i (imm2_2, imm2_1); ++++ imm2 = _mm256_insertf128_si256(_mm256_castsi128_si256(imm2_1),(imm2_2),1); ++++ ++++ swap_sign_bit_sin = _mm256_castsi256_ps(imm0); ++++ poly_mask = _mm256_castsi256_ps(imm2); ++++ ++++ /* The magic pass: "Extended precision modular arithmetic" ++++ x = ((x - y * DP1) - y * DP2) - y * DP3; */ ++++ xmm1 = _ps256_minus_cephes_DP1; ++++ xmm2 = _ps256_minus_cephes_DP2; ++++ xmm3 = _ps256_minus_cephes_DP3; ++++ xmm1 = _mm256_mul_ps(y, xmm1); ++++ xmm2 = _mm256_mul_ps(y, xmm2); ++++ xmm3 = _mm256_mul_ps(y, xmm3); ++++ x = _mm256_add_ps(x, xmm1); ++++ x = _mm256_add_ps(x, xmm2); ++++ x = _mm256_add_ps(x, xmm3); ++++ ++++ imm4_1 = _mm_sub_epi32(imm4_1, _pi32avx_2); ++++ imm4_2 = _mm_sub_epi32(imm4_2, _pi32avx_2); ++++ ++++ imm4_1 = _mm_andnot_si128(imm4_1, _pi32avx_4); ++++ imm4_2 = _mm_andnot_si128(imm4_2, _pi32avx_4); ++++ ++++ imm4_1 = _mm_slli_epi32(imm4_1, 29); ++++ imm4_2 = _mm_slli_epi32(imm4_2, 29); ++++ ++++ //COPY_XMM_TO_IMM(imm4_1, imm4_2, imm4); ++++ //_mm256_set_m128i not defined in some versions of immintrin.h ++++ //imm4 = _mm256_set_m128i (imm4_2, imm4_1); ++++ imm4 = _mm256_insertf128_si256(_mm256_castsi128_si256(imm4_1),(imm4_2),1); ++++ ++++ sign_bit_cos = _mm256_castsi256_ps(imm4); ++++ ++++ sign_bit_sin = _mm256_xor_ps(sign_bit_sin, swap_sign_bit_sin); ++++ ++++ /* Evaluate the first polynom (0 <= x <= Pi/4) */ ++++ z = _mm256_mul_ps(x,x); ++++ y = _ps256_coscof_p0; ++++ ++++ y = _mm256_mul_ps(y, z); ++++ y = _mm256_add_ps(y, _ps256_coscof_p1); ++++ y = _mm256_mul_ps(y, z); ++++ y = _mm256_add_ps(y, _ps256_coscof_p2); ++++ y = _mm256_mul_ps(y, z); ++++ y = 
_mm256_mul_ps(y, z); ++++ tmp = _mm256_mul_ps(z, _ps256_0p5); ++++ y = _mm256_sub_ps(y, tmp); ++++ y = _mm256_add_ps(y, _ps256_1); ++++ ++++ /* Evaluate the second polynom (Pi/4 <= x <= 0) */ ++++ ++++ y2 = _ps256_sincof_p0; ++++ y2 = _mm256_mul_ps(y2, z); ++++ y2 = _mm256_add_ps(y2, _ps256_sincof_p1); ++++ y2 = _mm256_mul_ps(y2, z); ++++ y2 = _mm256_add_ps(y2, _ps256_sincof_p2); ++++ y2 = _mm256_mul_ps(y2, z); ++++ y2 = _mm256_mul_ps(y2, x); ++++ y2 = _mm256_add_ps(y2, x); ++++ ++++ /* select the correct result from the two polynoms */ ++++ xmm3 = poly_mask; ++++ ysin2 = _mm256_and_ps(xmm3, y2); ++++ ysin1 = _mm256_andnot_ps(xmm3, y); ++++ y2 = _mm256_sub_ps(y2,ysin2); ++++ y = _mm256_sub_ps(y, ysin1); ++++ ++++ xmm1 = _mm256_add_ps(ysin1,ysin2); ++++ xmm2 = _mm256_add_ps(y,y2); ++++ ++++ /* update the sign */ ++++ s = _mm256_xor_ps(xmm1, sign_bit_sin); ++++ c = _mm256_xor_ps(xmm2, sign_bit_cos); ++++ ++++ //GNSS-SDR needs to return -sin ++++ s = _mm256_xor_ps(s, _ps256_sign_mask); ++++ ++++ _mm256_store_ps ((float*)sin_value, s); ++++ _mm256_store_ps ((float*)cos_value, c); ++++ ++++ for(int i = 0; i < 8; i++) ++++ { ++++ d_carr_sign[i] = lv_cmake(cos_value[i], sin_value[i]); ++++ } ++++ d_carr_sign += 8; ++++ ++++ phase_rad_array = _mm256_add_ps (phase_rad_array, phase_step_rad_array); ++++ } ++++ ++++ if (num_points%8!=0) ++++ { ++++ __VOLK_ATTR_ALIGNED(32) float phase_rad_store[8]; ++++ _mm256_store_ps ((float*)phase_rad_store, phase_rad_array); ++++ ++++ float phase_rad = phase_rad_store[0]; ++++ ++++ for(int i = 0; i < num_points%8; i++) ++++ { ++++ *d_carr_sign = lv_cmake(cos(phase_rad), -sin(phase_rad)); ++++ d_carr_sign++; ++++ phase_rad += phase_step_rad; ++++ } ++++ } ++++} ++++#endif /* LV_HAVE_AVX */ ++++ ++++#ifdef LV_HAVE_SSE2 ++++#include ++++/*! 
++++ \brief Accumulates the values in the input buffer ++++ \param result The accumulated result ++++ \param inputBuffer The buffer of data to be accumulated ++++ \param num_points The number of values in inputBuffer to be accumulated ++++ */ ++++static inline void volk_gnsssdr_s32f_x2_update_local_carrier_32fc_a_sse2(lv_32fc_t* d_carr_sign, const float phase_rad_init, const float phase_step_rad, unsigned int num_points){ ++++ ++++// float* pointer1 = (float*)&phase_rad_init; ++++// *pointer1 = 0; ++++// float* pointer2 = (float*)&phase_step_rad; ++++// *pointer2 = 0.5; ++++ ++++ const unsigned int sse_iters = num_points / 4; ++++ ++++ __m128 _ps_minus_cephes_DP1 = _mm_set1_ps(-0.78515625f); ++++ __m128 _ps_minus_cephes_DP2 = _mm_set1_ps(-2.4187564849853515625e-4f); ++++ __m128 _ps_minus_cephes_DP3 = _mm_set1_ps(-3.77489497744594108e-8f); ++++ __m128 _ps_sign_mask = _mm_set1_ps(-0.f); ++++ __m128i _pi32_1 = _mm_set1_epi32(1); ++++ __m128i _pi32_inv1 = _mm_set1_epi32(~1); ++++ __m128i _pi32_2 = _mm_set1_epi32(2); ++++ __m128i _pi32_4 = _mm_set1_epi32(4); ++++ __m128 _ps_cephes_FOPI = _mm_set1_ps(1.27323954473516f); // 4 / PI ++++ __m128 _ps_sincof_p0 = _mm_set1_ps(-1.9515295891E-4f); ++++ __m128 _ps_sincof_p1 = _mm_set1_ps( 8.3321608736E-3f); ++++ __m128 _ps_sincof_p2 = _mm_set1_ps(-1.6666654611E-1f); ++++ __m128 _ps_coscof_p0 = _mm_set1_ps( 2.443315711809948E-005f); ++++ __m128 _ps_coscof_p1 = _mm_set1_ps(-1.388731625493765E-003f); ++++ __m128 _ps_coscof_p2 = _mm_set1_ps( 4.166664568298827E-002f); ++++ __m128 _ps_1 = _mm_set1_ps(1.f); ++++ __m128 _ps_0p5 = _mm_set1_ps(0.5f); ++++ ++++ __m128 phase_step_rad_array = _mm_set1_ps(4*phase_step_rad); ++++ ++++ __m128 phase_rad_array, x, s, c, swap_sign_bit_sin, sign_bit_cos, poly_mask, z, tmp, y, y2, ysin1, ysin2; ++++ __m128 xmm1, xmm2, xmm3, sign_bit_sin; ++++ __m128i emm0, emm2, emm4; ++++ __VOLK_ATTR_ALIGNED(16) float sin_value[4]; ++++ __VOLK_ATTR_ALIGNED(16) float cos_value[4]; ++++ ++++ phase_rad_array = 
_mm_set_ps (phase_rad_init+3*phase_step_rad, phase_rad_init+2*phase_step_rad, phase_rad_init+phase_step_rad, phase_rad_init); ++++ ++++ for(int i = 0; i < sse_iters; i++) ++++ { ++++ x = phase_rad_array; ++++ ++++ /* extract the sign bit (upper one) */ ++++ sign_bit_sin = _mm_and_ps(x, _ps_sign_mask); ++++ ++++ /* take the absolute value */ ++++ x = _mm_xor_ps(x, sign_bit_sin); ++++ ++++ /* scale by 4/Pi */ ++++ y = _mm_mul_ps(x, _ps_cephes_FOPI); ++++ ++++ /* store the integer part of y in emm2 */ ++++ emm2 = _mm_cvttps_epi32(y); ++++ ++++ /* j=(j+1) & (~1) (see the cephes sources) */ ++++ emm2 = _mm_add_epi32(emm2, _pi32_1); ++++ emm2 = _mm_and_si128(emm2, _pi32_inv1); ++++ y = _mm_cvtepi32_ps(emm2); ++++ ++++ emm4 = emm2; ++++ ++++ /* get the swap sign flag for the sine */ ++++ emm0 = _mm_and_si128(emm2, _pi32_4); ++++ emm0 = _mm_slli_epi32(emm0, 29); ++++ swap_sign_bit_sin = _mm_castsi128_ps(emm0); ++++ ++++ /* get the polynom selection mask for the sine*/ ++++ emm2 = _mm_and_si128(emm2, _pi32_2); ++++ emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128()); ++++ poly_mask = _mm_castsi128_ps(emm2); ++++ ++++ /* The magic pass: "Extended precision modular arithmetic" ++++ x = ((x - y * DP1) - y * DP2) - y * DP3; */ ++++ xmm1 = _mm_mul_ps(y, _ps_minus_cephes_DP1); ++++ xmm2 = _mm_mul_ps(y, _ps_minus_cephes_DP2); ++++ xmm3 = _mm_mul_ps(y, _ps_minus_cephes_DP3); ++++ x = _mm_add_ps(_mm_add_ps(x, xmm1), _mm_add_ps(xmm2, xmm3)); ++++ ++++ emm4 = _mm_sub_epi32(emm4, _pi32_2); ++++ emm4 = _mm_andnot_si128(emm4, _pi32_4); ++++ emm4 = _mm_slli_epi32(emm4, 29); ++++ sign_bit_cos = _mm_castsi128_ps(emm4); ++++ ++++ sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin); ++++ ++++ /* Evaluate the first polynom (0 <= x <= Pi/4) */ ++++ z = _mm_mul_ps(x,x); ++++ y = _ps_coscof_p0; ++++ y = _mm_mul_ps(y, z); ++++ y = _mm_add_ps(y, _ps_coscof_p1); ++++ y = _mm_mul_ps(y, z); ++++ y = _mm_add_ps(y, _ps_coscof_p2); ++++ y = _mm_mul_ps(y, _mm_mul_ps(z, z)); ++++ tmp = _mm_mul_ps(z, 
_ps_0p5); ++++ y = _mm_sub_ps(y, tmp); ++++ y = _mm_add_ps(y, _ps_1); ++++ ++++ /* Evaluate the second polynom (Pi/4 <= x <= 0) */ ++++ y2 = _ps_sincof_p0; ++++ y2 = _mm_mul_ps(y2, z); ++++ y2 = _mm_add_ps(y2, _ps_sincof_p1); ++++ y2 = _mm_mul_ps(y2, z); ++++ y2 = _mm_add_ps(y2, _ps_sincof_p2); ++++ y2 = _mm_mul_ps(y2, _mm_mul_ps(z, x)); ++++ y2 = _mm_add_ps(y2, x); ++++ ++++ /* select the correct result from the two polynoms */ ++++ xmm3 = poly_mask; ++++ ysin2 = _mm_and_ps(xmm3, y2); ++++ ysin1 = _mm_andnot_ps(xmm3, y); ++++ y2 = _mm_sub_ps(y2,ysin2); ++++ y = _mm_sub_ps(y, ysin1); ++++ ++++ xmm1 = _mm_add_ps(ysin1,ysin2); ++++ xmm2 = _mm_add_ps(y,y2); ++++ ++++ /* update the sign */ ++++ s = _mm_xor_ps(xmm1, sign_bit_sin); ++++ c = _mm_xor_ps(xmm2, sign_bit_cos); ++++ ++++ //GNSS-SDR needs to return -sin ++++ s = _mm_xor_ps(s, _ps_sign_mask); ++++ ++++ _mm_store_ps ((float*)sin_value, s); ++++ _mm_store_ps ((float*)cos_value, c); ++++ ++++ for(int i = 0; i < 4; i++) ++++ { ++++ d_carr_sign[i] = lv_cmake(cos_value[i], sin_value[i]); ++++ } ++++ d_carr_sign += 4; ++++ ++++ phase_rad_array = _mm_add_ps (phase_rad_array, phase_step_rad_array); ++++ } ++++ ++++ if (num_points%4!=0) ++++ { ++++ __VOLK_ATTR_ALIGNED(16) float phase_rad_store[4]; ++++ _mm_store_si128 ((__m128i*)phase_rad_store, phase_rad_array); ++++ ++++ float phase_rad = phase_rad_store[0]; ++++ ++++ for(int i = 0; i < num_points%4; i++) ++++ { ++++ *d_carr_sign = lv_cmake(cos(phase_rad), -sin(phase_rad)); ++++ d_carr_sign++; ++++ phase_rad += phase_step_rad; ++++ } ++++ } ++++} ++++#endif /* LV_HAVE_SSE2 */ ++++ ++++#ifdef LV_HAVE_GENERIC ++++/*! 
++++ \brief Accumulates the values in the input buffer ++++ \param result The accumulated result ++++ \param inputBuffer The buffer of data to be accumulated ++++ \param num_points The number of values in inputBuffer to be accumulated ++++ */ ++++static inline void volk_gnsssdr_s32f_x2_update_local_carrier_32fc_a_generic(lv_32fc_t* d_carr_sign, const float phase_rad_init, const float phase_step_rad, unsigned int num_points){ ++++ ++++// float* pointer1 = (float*)&phase_rad_init; ++++// *pointer1 = 0; ++++// float* pointer2 = (float*)&phase_step_rad; ++++// *pointer2 = 0.5; ++++ ++++ float phase_rad = phase_rad_init; ++++ for(int i = 0; i < num_points; i++) ++++ { ++++ *d_carr_sign = lv_cmake(cos(phase_rad), -sin(phase_rad)); ++++ d_carr_sign++; ++++ phase_rad += phase_step_rad; ++++ } ++++} ++++#endif /* LV_HAVE_GENERIC */ ++++#endif /* INCLUDED_volk_gnsssdr_32fc_s32f_x2_update_local_carrier_32fc_a_H */ ++++ +++diff -rupN /Users/andres/Desktop/volk_gnsssdr/lib/CMakeLists.txt /Users/andres/Desktop/volk_gnsssdr_original/lib/CMakeLists.txt +++--- /Users/andres/Desktop/volk_gnsssdr/lib/CMakeLists.txt 2014-10-17 03:00:41.000000000 +0200 ++++++ /Users/andres/Desktop/volk_gnsssdr_original/lib/CMakeLists.txt 2014-10-15 01:55:08.000000000 +0200 +++@@ -406,8 +406,10 @@ if(${CMAKE_VERSION} VERSION_GREATER "2.8 +++ # if we find one that matches our current system architecture +++ # set up the assembler flags and include the source files +++ foreach(ARCH ${ASM_ARCHS_AVAILABLE}) ++++ message(STATUS "--==>> -CFLAGS1: ${FULL_C_FLAGS}") +++ string(REGEX MATCH "${ARCH}" ASM_ARCH "${FULL_C_FLAGS}") +++ if( ASM_ARCH STREQUAL "armv7" ) ++++ set(ASM-ATT $ENV{ASM}) +++ message(STATUS "---- Adding ASM files") # we always use ATT syntax +++ message(STATUS "-- Detected armv7 architecture; enabling ASM") +++ # setup architecture specific assembler flags +++@@ -420,20 +422,13 @@ if(${CMAKE_VERSION} VERSION_GREATER "2.8 +++ message(STATUS "Adding source file: ${asm_file}") +++ 
endforeach(asm_file) +++ endif() +++- enable_language(ASM) +++- set(CMAKE_ASM_FLAGS ${ARCH_ASM_FLAGS}) +++- message(STATUS "c flags: ${FULL_C_FLAGS}") +++- message(STATUS "asm flags: ${CMAKE_ASM_FLAGS}") ++++ set(CMAKE_ASM-ATT_FLAGS_INIT ${ARCH_ASM_FLAGS}) ++++ enable_language(ASM-ATT) # this must be after flags_init ++++ message(STATUS "asm flags: ${CMAKE_ASM-ATT_FLAGS}") +++ endforeach(ARCH) +++ +++ else(${CMAKE_VERSION} VERSION_GREATER "2.8.9") +++ message(STATUS "Not enabling ASM support. CMake >= 2.8.10 required.") +++- foreach(machine_name ${available_machines}) +++- string(REGEX MATCH "neon" NEON_MACHINE ${machine_name}) +++- if( NEON_MACHINE STREQUAL "neon") +++- message(FATAL_ERROR "CMake >= 2.8.10 is required for ARM NEON support") +++- endif() +++- endforeach() +++ endif(${CMAKE_VERSION} VERSION_GREATER "2.8.9") +++ +++ ######################################################################## +++@@ -517,11 +512,24 @@ if(MSVC) +++ endif() +++ +++ #create the volk_gnsssdr runtime library +++-add_library(volk_gnsssdr SHARED ${volk_gnsssdr_sources}) ++++ ++++#MODIFICATIONS BY GNSS-SDR ++++file(GLOB orc ${CMAKE_SOURCE_DIR}/orc/*.orc) ++++file(GLOB CommonMacros ${CMAKE_SOURCE_DIR}/kernels/CommonMacros/*.h ${CMAKE_SOURCE_DIR}/kernels/CommonMacros/README.txt) ++++ ++++#add_library(volk_gnsssdr SHARED ${volk_gnsssdr_sources}) ++++add_library(volk_gnsssdr SHARED ${volk_gnsssdr_sources} ${h_files} ${CommonMacros} ${orc}) ++++ ++++source_group("Kernels" FILES ${h_files}) ++++source_group("Common Macros" FILES ${CommonMacros}) ++++source_group("ORC Files" FILES ${orc}) ++++#END OF MODIFICATIONS ++++ +++ target_link_libraries(volk_gnsssdr ${volk_gnsssdr_libraries}) +++ set_target_properties(volk_gnsssdr PROPERTIES SOVERSION ${LIBVER}) +++ set_target_properties(volk_gnsssdr PROPERTIES DEFINE_SYMBOL "volk_gnsssdr_EXPORTS") +++ ++++ +++ install(TARGETS volk_gnsssdr +++ LIBRARY DESTINATION lib${LIB_SUFFIX} COMPONENT "volk_gnsssdr_runtime" # .so file +++ ARCHIVE DESTINATION 
lib${LIB_SUFFIX} COMPONENT "volk_gnsssdr_devel" # .lib file +++diff -rupN /Users/andres/Desktop/volk_gnsssdr/lib/qa_utils.cc /Users/andres/Desktop/volk_gnsssdr_original/lib/qa_utils.cc +++--- /Users/andres/Desktop/volk_gnsssdr/lib/qa_utils.cc 2014-10-17 03:00:41.000000000 +0200 ++++++ /Users/andres/Desktop/volk_gnsssdr_original/lib/qa_utils.cc 2014-10-17 01:54:35.000000000 +0200 +++@@ -5,9 +5,7 @@ +++ #include +++ #include +++ #include +++-#include +++ #include +++-#include +++ #include +++ #include +++ #include +++@@ -217,6 +215,72 @@ inline void run_cast_test3_s32fc(volk_gn +++ while(iter--) func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str()); +++ } +++ ++++//ADDED BY GNSS-SDR. START ++++inline void run_cast_test1_s8i(volk_gnsssdr_fn_1arg_s8i func, std::vector &buffs, char scalar, unsigned int vlen, unsigned int iter, std::string arch) { ++++ while(iter--) func(buffs[0], scalar, vlen, arch.c_str()); ++++} ++++ ++++inline void run_cast_test2_s8i(volk_gnsssdr_fn_2arg_s8i func, std::vector &buffs, char scalar, unsigned int vlen, unsigned int iter, std::string arch) { ++++ while(iter--) func(buffs[0], buffs[1], scalar, vlen, arch.c_str()); ++++} ++++ ++++inline void run_cast_test3_s8i(volk_gnsssdr_fn_3arg_s8i func, std::vector &buffs, char scalar, unsigned int vlen, unsigned int iter, std::string arch) { ++++ while(iter--) func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str()); ++++} ++++ ++++inline void run_cast_test1_s8ic(volk_gnsssdr_fn_1arg_s8ic func, std::vector &buffs, lv_8sc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { ++++ while(iter--) func(buffs[0], scalar, vlen, arch.c_str()); ++++} ++++ ++++inline void run_cast_test2_s8ic(volk_gnsssdr_fn_2arg_s8ic func, std::vector &buffs, lv_8sc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { ++++ while(iter--) func(buffs[0], buffs[1], scalar, vlen, arch.c_str()); ++++} ++++ ++++inline void run_cast_test3_s8ic(volk_gnsssdr_fn_3arg_s8ic func, std::vector 
&buffs, lv_8sc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { ++++ while(iter--) func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str()); ++++} ++++ ++++inline void run_cast_test8(volk_gnsssdr_fn_8arg func, std::vector &buffs, unsigned int vlen, unsigned int iter, std::string arch) { ++++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], vlen, arch.c_str()); ++++} ++++ ++++inline void run_cast_test8_s8i(volk_gnsssdr_fn_8arg_s8i func, std::vector &buffs, char scalar, unsigned int vlen, unsigned int iter, std::string arch) { ++++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], scalar, vlen, arch.c_str()); ++++} ++++ ++++inline void run_cast_test8_s8ic(volk_gnsssdr_fn_8arg_s8ic func, std::vector &buffs, lv_8sc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { ++++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], scalar, vlen, arch.c_str()); ++++} ++++ ++++inline void run_cast_test8_s32f(volk_gnsssdr_fn_8arg_s32f func, std::vector &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) { ++++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], scalar, vlen, arch.c_str()); ++++} ++++ ++++inline void run_cast_test8_s32fc(volk_gnsssdr_fn_8arg_s32fc func, std::vector &buffs, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { ++++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], scalar, vlen, arch.c_str()); ++++} ++++ ++++inline void run_cast_test12(volk_gnsssdr_fn_12arg func, std::vector &buffs, unsigned int vlen, unsigned int iter, std::string arch) { ++++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], buffs[8], buffs[9], buffs[10], buffs[11], vlen, arch.c_str()); ++++} ++++ ++++inline 
void run_cast_test12_s8i(volk_gnsssdr_fn_12arg_s8i func, std::vector &buffs, char scalar, unsigned int vlen, unsigned int iter, std::string arch) { ++++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], buffs[8], buffs[9], buffs[10], buffs[11], scalar, vlen, arch.c_str()); ++++} ++++ ++++inline void run_cast_test12_s8ic(volk_gnsssdr_fn_12arg_s8ic func, std::vector &buffs, lv_8sc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { ++++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], buffs[8], buffs[9], buffs[10], buffs[11], scalar, vlen, arch.c_str()); ++++} ++++ ++++inline void run_cast_test12_s32f(volk_gnsssdr_fn_12arg_s32f func, std::vector &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) { ++++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], buffs[8], buffs[9], buffs[10], buffs[11], scalar, vlen, arch.c_str()); ++++} ++++ ++++inline void run_cast_test12_s32fc(volk_gnsssdr_fn_12arg_s32fc func, std::vector &buffs, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { ++++ while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], buffs[8], buffs[9], buffs[10], buffs[11], scalar, vlen, arch.c_str()); ++++} ++++//ADDED BY GNSS-SDR. END ++++ +++ // This function is a nop that helps resolve GNU Radio bugs 582 and 583. +++ // Without this the cast in run_volk_gnsssdr_tests for tol_i = static_cast(float tol) +++ // won't happen on armhf (reported on cortex A9 and A15). 
+++@@ -330,9 +394,9 @@ bool run_volk_gnsssdr_tests(volk_gnsssdr +++ lv_32fc_t scalar, +++ int vlen, +++ int iter, +++- std::vector *results, +++- std::string puppet_master_name, +++- bool benchmark_mode, ++++ std::vector *best_arch_vector = 0, ++++ std::string puppet_master_name = "NULL", ++++ bool benchmark_mode, +++ std::string kernel_regex +++ ) { +++ boost::xpressive::sregex kernel_expression = boost::xpressive::sregex::compile(kernel_regex); +++@@ -340,12 +404,6 @@ bool run_volk_gnsssdr_tests(volk_gnsssdr +++ // in this case we have a regex and are only looking to test one kernel +++ return false; +++ } +++- if(results) { +++- results->push_back(volk_gnsssdr_test_results_t()); +++- results->back().name = name; +++- results->back().vlen = vlen; +++- results->back().iter = iter; +++- } +++ std::cout << "RUN_VOLK_TESTS: " << name << "(" << vlen << "," << iter << ")" << std::endl; +++ +++ // The multiply and lv_force_cast_hf are work arounds for GNU Radio bugs 582 and 583 +++@@ -426,7 +484,17 @@ bool run_volk_gnsssdr_tests(volk_gnsssdr +++ } else { +++ run_cast_test1_s32f((volk_gnsssdr_fn_1arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); +++ } +++- } else throw "unsupported 1 arg function >1 scalars"; ++++ } ++++ //ADDED BY GNSS-SDR. START ++++ else if(inputsc.size() == 1 && !inputsc[0].is_float) { ++++ if(inputsc[0].is_complex) { ++++ run_cast_test1_s8ic((volk_gnsssdr_fn_1arg_s8ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); ++++ } else { ++++ run_cast_test1_s8i((volk_gnsssdr_fn_1arg_s8i)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); ++++ } ++++ } ++++ //ADDED BY GNSS-SDR. 
END ++++ else throw "unsupported 1 arg function >1 scalars"; +++ break; +++ case 2: +++ if(inputsc.size() == 0) { +++@@ -437,7 +505,17 @@ bool run_volk_gnsssdr_tests(volk_gnsssdr +++ } else { +++ run_cast_test2_s32f((volk_gnsssdr_fn_2arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); +++ } +++- } else throw "unsupported 2 arg function >1 scalars"; ++++ } ++++ //ADDED BY GNSS-SDR. START ++++ else if(inputsc.size() == 1 && !inputsc[0].is_float) { ++++ if(inputsc[0].is_complex) { ++++ run_cast_test2_s8ic((volk_gnsssdr_fn_2arg_s8ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); ++++ } else { ++++ run_cast_test2_s8i((volk_gnsssdr_fn_2arg_s8i)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); ++++ } ++++ } ++++ //ADDED BY GNSS-SDR. END ++++ else throw "unsupported 2 arg function >1 scalars"; +++ break; +++ case 3: +++ if(inputsc.size() == 0) { +++@@ -448,11 +526,61 @@ bool run_volk_gnsssdr_tests(volk_gnsssdr +++ } else { +++ run_cast_test3_s32f((volk_gnsssdr_fn_3arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); +++ } +++- } else throw "unsupported 3 arg function >1 scalars"; ++++ } ++++ //ADDED BY GNSS-SDR. START ++++ else if(inputsc.size() == 1 && !inputsc[0].is_float) { ++++ if(inputsc[0].is_complex) { ++++ run_cast_test3_s8ic((volk_gnsssdr_fn_3arg_s8ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); ++++ } else { ++++ run_cast_test3_s8i((volk_gnsssdr_fn_3arg_s8i)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); ++++ } ++++ } ++++ //ADDED BY GNSS-SDR. END ++++ else throw "unsupported 3 arg function >1 scalars"; +++ break; +++ case 4: +++ run_cast_test4((volk_gnsssdr_fn_4arg)(manual_func), test_data[i], vlen, iter, arch_list[i]); +++ break; ++++ //ADDED BY GNSS-SDR. 
START ++++ case 8: ++++ if(inputsc.size() == 0) { ++++ run_cast_test8((volk_gnsssdr_fn_8arg)(manual_func), test_data[i], vlen, iter, arch_list[i]); ++++ } else if(inputsc.size() == 1 && inputsc[0].is_float) { ++++ if(inputsc[0].is_complex) { ++++ run_cast_test8_s32fc((volk_gnsssdr_fn_8arg_s32fc)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); ++++ } else { ++++ run_cast_test8_s32f((volk_gnsssdr_fn_8arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); ++++ } ++++ } ++++ else if(inputsc.size() == 1 && !inputsc[0].is_float) { ++++ if(inputsc[0].is_complex) { ++++ run_cast_test8_s8ic((volk_gnsssdr_fn_8arg_s8ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); ++++ } else { ++++ run_cast_test8_s8i((volk_gnsssdr_fn_8arg_s8i)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); ++++ } ++++ } ++++ else throw "unsupported 8 arg function >1 scalars"; ++++ break; ++++ case 12: ++++ if(inputsc.size() == 0) { ++++ run_cast_test12((volk_gnsssdr_fn_12arg)(manual_func), test_data[i], vlen, iter, arch_list[i]); ++++ } else if(inputsc.size() == 1 && inputsc[0].is_float) { ++++ if(inputsc[0].is_complex) { ++++ run_cast_test12_s32fc((volk_gnsssdr_fn_12arg_s32fc)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); ++++ } else { ++++ run_cast_test12_s32f((volk_gnsssdr_fn_12arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); ++++ } ++++ } ++++ else if(inputsc.size() == 1 && !inputsc[0].is_float) { ++++ if(inputsc[0].is_complex) { ++++ run_cast_test12_s8ic((volk_gnsssdr_fn_12arg_s8ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); ++++ } else { ++++ run_cast_test12_s8i((volk_gnsssdr_fn_12arg_s8i)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); ++++ } ++++ } ++++ else throw "unsupported 12 arg function >1 scalars"; ++++ break; ++++ //ADDED BY GNSS-SDR. 
END +++ default: +++ throw "no function handler for this signature"; +++ break; +++@@ -461,13 +589,6 @@ bool run_volk_gnsssdr_tests(volk_gnsssdr +++ end = clock(); +++ double arch_time = 1000.0 * (double)(end-start)/(double)CLOCKS_PER_SEC; +++ std::cout << arch_list[i] << " completed in " << arch_time << "ms" << std::endl; +++- if(results) { +++- volk_gnsssdr_test_time_t result; +++- result.name = arch_list[i]; +++- result.time = arch_time; +++- result.units = "ms"; +++- results->back().results[result.name] = result; +++- } +++ +++ profile_times.push_back(arch_time); +++ } +++@@ -568,14 +689,13 @@ bool run_volk_gnsssdr_tests(volk_gnsssdr +++ +++ std::cout << "Best aligned arch: " << best_arch_a << std::endl; +++ std::cout << "Best unaligned arch: " << best_arch_u << std::endl; +++- if(results) { ++++ if(best_arch_vector) { +++ if(puppet_master_name == "NULL") { +++- results->back().config_name = name; +++- } else { +++- results->back().config_name = puppet_master_name; ++++ best_arch_vector->push_back(name + " " + best_arch_a + " " + best_arch_u); ++++ } ++++ else { ++++ best_arch_vector->push_back(puppet_master_name + " " + best_arch_a + " " + best_arch_u); +++ } +++- results->back().best_arch_a = best_arch_a; +++- results->back().best_arch_u = best_arch_u; +++ } +++ +++ return fail_global; +++diff -rupN /Users/andres/Desktop/volk_gnsssdr/lib/qa_utils.h /Users/andres/Desktop/volk_gnsssdr_original/lib/qa_utils.h +++--- /Users/andres/Desktop/volk_gnsssdr/lib/qa_utils.h 2014-10-17 03:00:41.000000000 +0200 ++++++ /Users/andres/Desktop/volk_gnsssdr_original/lib/qa_utils.h 2014-10-15 01:55:08.000000000 +0200 +++@@ -3,10 +3,7 @@ +++ +++ #include +++ #include +++-#include +++-#include +++ #include +++-#include +++ #include +++ #include +++ +++@@ -24,46 +21,10 @@ volk_gnsssdr_type_t volk_gnsssdr_type_fr +++ float uniform(void); +++ void random_floats(float *buf, unsigned n); +++ +++-class volk_gnsssdr_test_time_t { +++- public: +++- std::string name; +++- double time; +++- 
std::string units; +++-}; ++++bool run_volk_gnsssdr_tests(volk_gnsssdr_func_desc_t, void(*)(), std::string, float, lv_32fc_t, int, int, std::vector *, std::string, bool benchmark_mode=false, std::string kernel_regex=""); +++ +++-class volk_gnsssdr_test_results_t { +++- public: +++- std::string name; +++- std::string config_name; +++- int vlen; +++- int iter; +++- std::map results; +++- std::string best_arch_a; +++- std::string best_arch_u; +++-}; +++ +++-bool run_volk_gnsssdr_tests( +++- volk_gnsssdr_func_desc_t, +++- void(*)(), +++- std::string, +++- float, +++- lv_32fc_t, +++- int, +++- int, +++- std::vector *results = NULL, +++- std::string puppet_master_name = "NULL", +++- bool benchmark_mode=false, +++- std::string kernel_regex="" +++- ); +++- +++- +++-#define VOLK_RUN_TESTS(func, tol, scalar, len, iter) \ +++- BOOST_AUTO_TEST_CASE(func##_test) { \ +++- BOOST_CHECK_EQUAL(run_volk_gnsssdr_tests( \ +++- func##_get_func_desc(), (void (*)())func##_manual, \ +++- std::string(#func), tol, scalar, len, iter, 0, "NULL"), \ +++- 0); \ +++- } ++++#define VOLK_RUN_TESTS(func, tol, scalar, len, iter) BOOST_AUTO_TEST_CASE(func##_test) { BOOST_CHECK_EQUAL(run_volk_gnsssdr_tests(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), tol, scalar, len, iter, 0, "NULL"), 0); } +++ #define VOLK_PROFILE(func, tol, scalar, len, iter, results, bnmode, kernel_regex) run_volk_gnsssdr_tests(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), tol, scalar, len, iter, results, "NULL", bnmode, kernel_regex) +++ #define VOLK_PUPPET_PROFILE(func, puppet_master_func, tol, scalar, len, iter, results, bnmode, kernel_regex) run_volk_gnsssdr_tests(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), tol, scalar, len, iter, results, std::string(#puppet_master_func), bnmode, kernel_regex) +++ typedef void (*volk_gnsssdr_fn_1arg)(void *, unsigned int, const char*); //one input, operate in place +++@@ -77,4 +38,25 @@ typedef void 
(*volk_gnsssdr_fn_1arg_s32f +++ typedef void (*volk_gnsssdr_fn_2arg_s32fc)(void *, void *, lv_32fc_t, unsigned int, const char*); +++ typedef void (*volk_gnsssdr_fn_3arg_s32fc)(void *, void *, void *, lv_32fc_t, unsigned int, const char*); +++ ++++//ADDED BY GNSS-SDR. START ++++typedef void (*volk_gnsssdr_fn_1arg_s8i)(void *, char, unsigned int, const char*); //one input vector, one scalar char input ++++typedef void (*volk_gnsssdr_fn_2arg_s8i)(void *, void *, char, unsigned int, const char*); ++++typedef void (*volk_gnsssdr_fn_3arg_s8i)(void *, void *, void *, char, unsigned int, const char*); ++++typedef void (*volk_gnsssdr_fn_1arg_s8ic)(void *, lv_8sc_t, unsigned int, const char*); //one input vector, one scalar lv_8sc_t vector input ++++typedef void (*volk_gnsssdr_fn_2arg_s8ic)(void *, void *, lv_8sc_t, unsigned int, const char*); ++++typedef void (*volk_gnsssdr_fn_3arg_s8ic)(void *, void *, void *, lv_8sc_t, unsigned int, const char*); ++++ ++++typedef void (*volk_gnsssdr_fn_8arg)(void *, void *, void *, void *, void *, void *, void *, void *, unsigned int, const char*); ++++typedef void (*volk_gnsssdr_fn_8arg_s32f)(void *, void *, void *, void *, void *, void *, void *, void *, float, unsigned int, const char*); ++++typedef void (*volk_gnsssdr_fn_8arg_s32fc)(void *, void *, void *, void *, void *, void *, void *, void *, lv_32fc_t, unsigned int, const char*); ++++typedef void (*volk_gnsssdr_fn_8arg_s8i)(void *, void *, void *, void *, void *, void *, void *, void *, char, unsigned int, const char*); ++++typedef void (*volk_gnsssdr_fn_8arg_s8ic)(void *, void *, void *, void *, void *, void *, void *, void *, lv_8sc_t, unsigned int, const char*); ++++ ++++typedef void (*volk_gnsssdr_fn_12arg)(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, unsigned int, const char*); ++++typedef void (*volk_gnsssdr_fn_12arg_s32f)(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, float, 
unsigned int, const char*); ++++typedef void (*volk_gnsssdr_fn_12arg_s32fc)(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, lv_32fc_t, unsigned int, const char*); ++++typedef void (*volk_gnsssdr_fn_12arg_s8i)(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, char, unsigned int, const char*); ++++typedef void (*volk_gnsssdr_fn_12arg_s8ic)(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, lv_8sc_t, unsigned int, const char*); ++++//ADDED BY GNSS-SDR. END ++++ +++ #endif //VOLK_QA_UTILS_H +++diff -rupN /Users/andres/Desktop/volk_gnsssdr/lib/testqa.cc /Users/andres/Desktop/volk_gnsssdr_original/lib/testqa.cc +++--- /Users/andres/Desktop/volk_gnsssdr/lib/testqa.cc 2014-10-17 03:00:41.000000000 +0200 ++++++ /Users/andres/Desktop/volk_gnsssdr_original/lib/testqa.cc 2014-10-15 01:55:08.000000000 +0200 +++@@ -24,6 +24,58 @@ +++ #include +++ #include +++ ++++//VOLK PROTOKERNELS OBTAINED FROM THE GNURADIO BASE ++++VOLK_RUN_TESTS(volk_gnsssdr_32fc_x2_multiply_32fc, 1e-4, 0, 20462, 1); ++++VOLK_RUN_TESTS(volk_gnsssdr_32fc_x2_dot_prod_32fc, 1e-4, 0, 204603, 1); ++++VOLK_RUN_TESTS(volk_gnsssdr_32fc_s32fc_multiply_32fc, 1e-4, 0, 20462, 1); ++++VOLK_RUN_TESTS(volk_gnsssdr_32fc_conjugate_32fc, 1e-4, 0, 20462, 1); ++++VOLK_RUN_TESTS(volk_gnsssdr_32f_x2_add_32f, 1e-4, 0, 20462, 1); ++++VOLK_RUN_TESTS(volk_gnsssdr_32f_index_max_16u, 3, 0, 20462, 1); ++++VOLK_RUN_TESTS(volk_gnsssdr_32f_accumulator_s32f, 1e-4, 0, 20462, 1); ++++VOLK_RUN_TESTS(volk_gnsssdr_32fc_magnitude_squared_32f, 1e-4, 0, 20462, 1); ++++VOLK_RUN_TESTS(volk_gnsssdr_32f_s32f_convert_16i, 3, 0, 20462, 1); ++++ ++++//GNSS-SDR PROTO-KERNELS ++++VOLK_RUN_TESTS(volk_gnsssdr_8ic_x2_multiply_8ic, 1e-4, 0, 20462, 1); ++++VOLK_RUN_TESTS(volk_gnsssdr_8u_x2_multiply_8u, 1e-4, 0, 20462, 1); ++++VOLK_RUN_TESTS(volk_gnsssdr_8ic_x2_dot_prod_8ic, 1e-4, 0, 204603, 1); 
++++VOLK_RUN_TESTS(volk_gnsssdr_8ic_s8ic_multiply_8ic, 1e-4, 0, 20462, 1); ++++VOLK_RUN_TESTS(volk_gnsssdr_8ic_conjugate_8ic, 1e-4, 0, 20462, 1); ++++VOLK_RUN_TESTS(volk_gnsssdr_8i_x2_add_8i, 1e-4, 0, 20462, 1); ++++VOLK_RUN_TESTS(volk_gnsssdr_8i_index_max_16u, 3, 0, 20462, 1); ++++VOLK_RUN_TESTS(volk_gnsssdr_8i_accumulator_s8i, 1e-4, 0, 20462, 1); ++++VOLK_RUN_TESTS(volk_gnsssdr_8ic_magnitude_squared_8i, 1e-4, 0, 20462, 1); ++++ ++++VOLK_RUN_TESTS(volk_gnsssdr_8i_max_s8i, 3, 0, 20462, 1); ++++VOLK_RUN_TESTS(volk_gnsssdr_64f_accumulator_64f, 3, 0, 20462, 1); ++++ ++++VOLK_RUN_TESTS(volk_gnsssdr_32fc_convert_16ic, 3, 0, 20462, 1); ++++VOLK_RUN_TESTS(volk_gnsssdr_32fc_s32f_convert_8ic, 3, 0, 20462, 1); ++++VOLK_RUN_TESTS(volk_gnsssdr_32fc_convert_8ic, 3, 0, 20462, 1); ++++VOLK_RUN_TESTS(volk_gnsssdr_16i_s32f_convert_32f, 3, 0, 20462, 1); ++++ ++++VOLK_RUN_TESTS(volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3, 1e-4, 0, 20462, 1); ++++VOLK_RUN_TESTS(volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3, 1e-4, 0, 20462, 1); ++++VOLK_RUN_TESTS(volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3, 1e-4, 0, 20462, 1); ++++VOLK_RUN_TESTS(volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3, 1e-4, 0, 20462, 1); ++++VOLK_RUN_TESTS(volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3, 1e-4, 0, 20462, 1); ++++ ++++VOLK_RUN_TESTS(volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5, 1e-4, 0, 20462, 1); ++++VOLK_RUN_TESTS(volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5, 1e-4, 0, 20462, 1); ++++VOLK_RUN_TESTS(volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5, 1e-4, 0, 20462, 1); ++++VOLK_RUN_TESTS(volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5, 1e-4, 0, 20462, 1); ++++VOLK_RUN_TESTS(volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5, 1e-4, 0, 20462, 1); ++++VOLK_RUN_TESTS(volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5, 1e-4, 0, 20462, 1); ++++ ++++VOLK_RUN_TESTS(volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc, 1e-4, 0, 20462, 1); ++++VOLK_RUN_TESTS(volk_gnsssdr_s32f_x2_update_local_carrier_32fc, 1e-4, 0, 20462, 1); ++++ ++++ ++++ ++++ ++++ ++++ ++++ +++ 
//VOLK_RUN_TESTS(volk_gnsssdr_16i_x5_add_quad_16i_x4, 1e-4, 2046, 10000); +++ //VOLK_RUN_TESTS(volk_gnsssdr_16i_branch_4_state_8, 1e-4, 2046, 10000); +++ //VOLK_RUN_TESTS(volk_gnsssdr_16i_max_star_16i, 0, 0, 20462, 10000); +++diff -rupN /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_32f_x2_add_32f.orc /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_32f_x2_add_32f.orc +++--- /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_32f_x2_add_32f.orc 1970-01-01 01:00:00.000000000 +0100 ++++++ /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_32f_x2_add_32f.orc 2014-10-15 01:55:08.000000000 +0200 +++@@ -0,0 +1,5 @@ ++++.function volk_gnsssdr_32f_x2_add_32f_a_orc_impl ++++.dest 4 dst ++++.source 4 src1 ++++.source 4 src2 ++++addf dst, src1, src2 +++diff -rupN /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_32fc_s32fc_multiply_32fc.orc /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_32fc_s32fc_multiply_32fc.orc +++--- /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_32fc_s32fc_multiply_32fc.orc 1970-01-01 01:00:00.000000000 +0100 ++++++ /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_32fc_s32fc_multiply_32fc.orc 2014-10-15 01:55:08.000000000 +0200 +++@@ -0,0 +1,18 @@ ++++.function volk_gnsssdr_32fc_s32fc_multiply_32fc_a_orc_impl ++++.source 8 src1 ++++.floatparam 8 scalar ++++.dest 8 dst ++++.temp 8 iqprod ++++.temp 4 real ++++.temp 4 imag ++++.temp 4 ac ++++.temp 4 bd ++++.temp 8 swapped ++++x2 mulf iqprod, src1, scalar ++++splitql bd, ac, iqprod ++++subf real, ac, bd ++++swaplq swapped, src1 ++++x2 mulf iqprod, swapped, scalar ++++splitql bd, ac, iqprod ++++addf imag, ac, bd ++++mergelq dst, real, imag +++diff -rupN /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_32fc_x2_multiply_32fc.orc /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_32fc_x2_multiply_32fc.orc +++--- /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_32fc_x2_multiply_32fc.orc 1970-01-01 01:00:00.000000000 +0100 ++++++ 
/Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_32fc_x2_multiply_32fc.orc 2014-10-15 01:55:08.000000000 +0200 +++@@ -0,0 +1,18 @@ ++++.function volk_gnsssdr_32fc_x2_multiply_32fc_a_orc_impl ++++.source 8 src1 ++++.source 8 src2 ++++.dest 8 dst ++++.temp 8 iqprod ++++.temp 4 real ++++.temp 4 imag ++++.temp 4 ac ++++.temp 4 bd ++++.temp 8 swapped ++++x2 mulf iqprod, src1, src2 ++++splitql bd, ac, iqprod ++++subf real, ac, bd ++++swaplq swapped, src1 ++++x2 mulf iqprod, swapped, src2 ++++splitql bd, ac, iqprod ++++addf imag, ac, bd ++++mergelq dst, real, imag +++diff -rupN /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8i_accumulator_s8i.orc /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8i_accumulator_s8i.orc +++--- /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8i_accumulator_s8i.orc 1970-01-01 01:00:00.000000000 +0100 ++++++ /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8i_accumulator_s8i.orc 2014-10-15 01:55:08.000000000 +0200 +++@@ -0,0 +1,40 @@ ++++#/*! ++++# * \file volk_gnsssdr_8i_accumulator_s8i.orc ++++# * \brief ORC implementation: 8 bits (char) scalar accumulator ++++# * \authors
    ++++# *
  • Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com ++++# *
++++# * ++++# * ORC code that implements an accumulator of char values ++++# * ++++# * ------------------------------------------------------------------------- ++++# * ++++# * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) ++++# * ++++# * GNSS-SDR is a software defined Global Navigation ++++# * Satellite Systems receiver ++++# * ++++# * This file is part of GNSS-SDR. ++++# * ++++# * GNSS-SDR is free software: you can redistribute it and/or modify ++++# * it under the terms of the GNU General Public License as published by ++++# * the Free Software Foundation, either version 3 of the License, or ++++# * at your option) any later version. ++++# * ++++# * GNSS-SDR is distributed in the hope that it will be useful, ++++# * but WITHOUT ANY WARRANTY; without even the implied warranty of ++++# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++++# * GNU General Public License for more details. ++++# * ++++# * You should have received a copy of the GNU General Public License ++++# * along with GNSS-SDR. If not, see . ++++# * ++++# * ------------------------------------------------------------------------- ++++# */ ++++ ++++.function volk_gnsssdr_8i_accumulator_s8i_a_orc_impl ++++.source 1 src1 ++++.accumulator 2 acc ++++.temp 2 sum ++++mergebw sum, 0, src1 ++++accw acc, sum +++diff -rupN /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8i_x2_add_8i.orc /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8i_x2_add_8i.orc +++--- /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8i_x2_add_8i.orc 1970-01-01 01:00:00.000000000 +0100 ++++++ /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8i_x2_add_8i.orc 2014-10-15 01:55:08.000000000 +0200 +++@@ -0,0 +1,39 @@ ++++#/*! ++++# * \file volk_gnsssdr_8i_x2_add_8i.orc ++++# * \brief ORC implementation: adds pairs of 8 bits (char) scalars ++++# * \authors
    ++++# *
  • Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com ++++# *
++++# * ++++# * ORC code that adds pairs of 8 bits (char) scalars ++++# * ++++# * ------------------------------------------------------------------------- ++++# * ++++# * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) ++++# * ++++# * GNSS-SDR is a software defined Global Navigation ++++# * Satellite Systems receiver ++++# * ++++# * This file is part of GNSS-SDR. ++++# * ++++# * GNSS-SDR is free software: you can redistribute it and/or modify ++++# * it under the terms of the GNU General Public License as published by ++++# * the Free Software Foundation, either version 3 of the License, or ++++# * at your option) any later version. ++++# * ++++# * GNSS-SDR is distributed in the hope that it will be useful, ++++# * but WITHOUT ANY WARRANTY; without even the implied warranty of ++++# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++++# * GNU General Public License for more details. ++++# * ++++# * You should have received a copy of the GNU General Public License ++++# * along with GNSS-SDR. If not, see . ++++# * ++++# * ------------------------------------------------------------------------- ++++# */ ++++ ++++.function volk_gnsssdr_8i_x2_add_8i_a_orc_impl ++++.dest 1 dst ++++.source 1 src1 ++++.source 1 src2 ++++addb dst, src1, src2 +++diff -rupN /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8ic_conjugate_8ic.orc /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8ic_conjugate_8ic.orc +++--- /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8ic_conjugate_8ic.orc 1970-01-01 01:00:00.000000000 +0100 ++++++ /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8ic_conjugate_8ic.orc 2014-10-15 01:55:08.000000000 +0200 +++@@ -0,0 +1,42 @@ ++++#/*! ++++# * \file volk_gnsssdr_8ic_conjugate_8ic.orc ++++# * \brief ORC implementation: calculates the conjugate of a 16 bits vector ++++# * \authors
    ++++# *
  • Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com ++++# *
++++# * ++++# * ORC code that calculates the conjugate of a ++++# * 16 bits vector (8 bits the real part and 8 bits the imaginary part) ++++# * result = (real*real) + (imag*imag) ++++# * ++++# * ------------------------------------------------------------------------- ++++# * ++++# * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) ++++# * ++++# * GNSS-SDR is a software defined Global Navigation ++++# * Satellite Systems receiver ++++# * ++++# * This file is part of GNSS-SDR. ++++# * ++++# * GNSS-SDR is free software: you can redistribute it and/or modify ++++# * it under the terms of the GNU General Public License as published by ++++# * the Free Software Foundation, either version 3 of the License, or ++++# * at your option) any later version. ++++# * ++++# * GNSS-SDR is distributed in the hope that it will be useful, ++++# * but WITHOUT ANY WARRANTY; without even the implied warranty of ++++# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++++# * GNU General Public License for more details. ++++# * ++++# * You should have received a copy of the GNU General Public License ++++# * along with GNSS-SDR. If not, see . ++++# * ++++# * ------------------------------------------------------------------------- ++++# */ ++++ ++++.function volk_gnsssdr_8ic_conjugate_8ic_a_orc_impl ++++.source 2 src1 ++++.dest 2 dst ++++.temp 2 merged ++++mergebw merged, 1, -1 ++++x2 mullb dst, merged, src1 +++diff -rupN /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8ic_magnitude_squared_8i.orc /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8ic_magnitude_squared_8i.orc +++--- /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8ic_magnitude_squared_8i.orc 1970-01-01 01:00:00.000000000 +0100 ++++++ /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8ic_magnitude_squared_8i.orc 2014-10-15 01:55:08.000000000 +0200 +++@@ -0,0 +1,45 @@ ++++#/*! 
++++# * \file volk_gnsssdr_8ic_magnitude_squared_8i.orc ++++# * \brief ORC implementation: calculates the magnitude squared of a 16 bits vector ++++# * \authors
    ++++# *
  • Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com ++++# *
++++# * ++++# * ORC code that calculates the magnitude squared of a ++++# * 16 bits vector (8 bits the real part and 8 bits the imaginary part) ++++# * result = (real*real) + (imag*imag) ++++# * ++++# * ------------------------------------------------------------------------- ++++# * ++++# * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) ++++# * ++++# * GNSS-SDR is a software defined Global Navigation ++++# * Satellite Systems receiver ++++# * ++++# * This file is part of GNSS-SDR. ++++# * ++++# * GNSS-SDR is free software: you can redistribute it and/or modify ++++# * it under the terms of the GNU General Public License as published by ++++# * the Free Software Foundation, either version 3 of the License, or ++++# * at your option) any later version. ++++# * ++++# * GNSS-SDR is distributed in the hope that it will be useful, ++++# * but WITHOUT ANY WARRANTY; without even the implied warranty of ++++# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++++# * GNU General Public License for more details. ++++# * ++++# * You should have received a copy of the GNU General Public License ++++# * along with GNSS-SDR. If not, see . ++++# * ++++# * ------------------------------------------------------------------------- ++++# */ ++++ ++++.function volk_gnsssdr_8ic_magnitude_squared_8i_a_orc_impl ++++.source 2 src1 ++++.dest 1 dst ++++.temp 2 iqprod ++++.temp 1 ac ++++.temp 1 bd ++++x2 mullb iqprod, src1, src1 ++++splitwb bd, ac, iqprod ++++addb dst, ac, bd +++diff -rupN /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8ic_s8ic_multiply_8ic.orc /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8ic_s8ic_multiply_8ic.orc +++--- /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8ic_s8ic_multiply_8ic.orc 1970-01-01 01:00:00.000000000 +0100 ++++++ /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8ic_s8ic_multiply_8ic.orc 2014-10-15 01:55:08.000000000 +0200 +++@@ -0,0 +1,58 @@ ++++#/*! 
++++# * \file volk_gnsssdr_8ic_s8ic_multiply_8ic.orc ++++# * \brief ORC implementation: multiplies a group of 16 bits vectors by one constant vector ++++# * \authors
    ++++# *
  • Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com ++++# *
++++# * ++++# * ORC code that multiplies a group of 16 bits vectors ++++# * (8 bits the real part and 8 bits the imaginary part) by one constant vector ++++# * ++++# * ------------------------------------------------------------------------- ++++# * ++++# * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) ++++# * ++++# * GNSS-SDR is a software defined Global Navigation ++++# * Satellite Systems receiver ++++# * ++++# * This file is part of GNSS-SDR. ++++# * ++++# * GNSS-SDR is free software: you can redistribute it and/or modify ++++# * it under the terms of the GNU General Public License as published by ++++# * the Free Software Foundation, either version 3 of the License, or ++++# * at your option) any later version. ++++# * ++++# * GNSS-SDR is distributed in the hope that it will be useful, ++++# * but WITHOUT ANY WARRANTY; without even the implied warranty of ++++# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++++# * GNU General Public License for more details. ++++# * ++++# * You should have received a copy of the GNU General Public License ++++# * along with GNSS-SDR. If not, see . 
++++# * ++++# * ------------------------------------------------------------------------- ++++# */ ++++ ++++.function volk_gnsssdr_8ic_s8ic_multiply_8ic_a_orc_impl ++++.source 2 src1 ++++.param 2 src2real ++++.param 2 src2imag ++++.dest 2 dst ++++.temp 2 iqprod ++++.temp 1 real ++++.temp 1 imag ++++.temp 1 rr ++++.temp 1 ii ++++.temp 1 ri ++++.temp 1 ir ++++x2 mullb iqprod, src1, src2real ++++splitwb ir, rr, iqprod ++++x2 mullb iqprod, src1, src2imag ++++splitwb ii, ri, iqprod ++++subb real, rr, ii ++++addb imag, ri, ir ++++mergebw dst, real, imag ++++ ++++ ++++ ++++ +++diff -rupN /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8ic_x2_dot_prod_8ic.orc /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8ic_x2_dot_prod_8ic.orc +++--- /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8ic_x2_dot_prod_8ic.orc 1970-01-01 01:00:00.000000000 +0100 ++++++ /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8ic_x2_dot_prod_8ic.orc 2014-10-15 01:55:08.000000000 +0200 +++@@ -0,0 +1,59 @@ ++++#/*! ++++# * \file volk_gnsssdr_8ic_x2_dot_prod_8ic.orc ++++# * \brief ORC implementation: multiplies two 16 bits vectors and accumulates them ++++# * \authors
    ++++# *
  • Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com ++++# *
++++# * ++++# * ORC code that multiplies two 16 bits vectors (8 bits the real part ++++# * and 8 bits the imaginary part) and accumulates them ++++# * ++++# * ------------------------------------------------------------------------- ++++# * ++++# * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) ++++# * ++++# * GNSS-SDR is a software defined Global Navigation ++++# * Satellite Systems receiver ++++# * ++++# * This file is part of GNSS-SDR. ++++# * ++++# * GNSS-SDR is free software: you can redistribute it and/or modify ++++# * it under the terms of the GNU General Public License as published by ++++# * the Free Software Foundation, either version 3 of the License, or ++++# * at your option) any later version. ++++# * ++++# * GNSS-SDR is distributed in the hope that it will be useful, ++++# * but WITHOUT ANY WARRANTY; without even the implied warranty of ++++# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++++# * GNU General Public License for more details. ++++# * ++++# * You should have received a copy of the GNU General Public License ++++# * along with GNSS-SDR. If not, see . 
++++# * ++++# * ------------------------------------------------------------------------- ++++# */ ++++ ++++.function volk_gnsssdr_8ic_x2_dot_prod_8ic_a_orc_impl ++++.source 2 src1 ++++.source 2 src2 ++++.accumulator 2 accreal ++++.accumulator 2 accimag ++++.temp 2 iqprod ++++.temp 1 real ++++.temp 1 imag ++++.temp 2 real2 ++++.temp 2 imag2 ++++.temp 1 ac ++++.temp 1 bd ++++.temp 2 swapped ++++x2 mullb iqprod, src1, src2 ++++splitwb bd, ac, iqprod ++++subb real, ac, bd ++++swapw swapped, src1 ++++x2 mullb iqprod, swapped, src2 ++++splitwb bd, ac, iqprod ++++addb imag, ac, bd ++++mergebw real2, 0, real ++++accw accreal, real2 ++++mergebw imag2, 0, imag ++++accw accimag, imag2 +++diff -rupN /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8ic_x2_multiply_8ic.orc /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8ic_x2_multiply_8ic.orc +++--- /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8ic_x2_multiply_8ic.orc 1970-01-01 01:00:00.000000000 +0100 ++++++ /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8ic_x2_multiply_8ic.orc 2014-10-15 01:55:08.000000000 +0200 +++@@ -0,0 +1,57 @@ ++++#/*! ++++# * \file volk_gnsssdr_8ic_x2_multiply_8ic.orc ++++# * \brief ORC implementation: multiplies two 16 bits vectors ++++# * \authors
    ++++# *
  • Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com ++++# *
++++# * ++++# * ORC code that multiplies two 16 bits vectors (8 bits the real part ++++# * and 8 bits the imaginary part) ++++# * ++++# * ------------------------------------------------------------------------- ++++# * ++++# * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) ++++# * ++++# * GNSS-SDR is a software defined Global Navigation ++++# * Satellite Systems receiver ++++# * ++++# * This file is part of GNSS-SDR. ++++# * ++++# * GNSS-SDR is free software: you can redistribute it and/or modify ++++# * it under the terms of the GNU General Public License as published by ++++# * the Free Software Foundation, either version 3 of the License, or ++++# * at your option) any later version. ++++# * ++++# * GNSS-SDR is distributed in the hope that it will be useful, ++++# * but WITHOUT ANY WARRANTY; without even the implied warranty of ++++# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++++# * GNU General Public License for more details. ++++# * ++++# * You should have received a copy of the GNU General Public License ++++# * along with GNSS-SDR. If not, see . 
++++# * ++++# * ------------------------------------------------------------------------- ++++# */ ++++ ++++.function volk_gnsssdr_8ic_x2_multiply_8ic_a_orc_impl ++++.source 2 src1 ++++.source 2 src2 ++++.dest 2 dst ++++.temp 2 iqprod ++++.temp 1 real ++++.temp 1 imag ++++.temp 1 ac ++++.temp 1 bd ++++.temp 2 swapped ++++x2 mullb iqprod, src1, src2 ++++splitwb bd, ac, iqprod ++++subb real, ac, bd ++++swapw swapped, src1 ++++x2 mullb iqprod, swapped, src2 ++++splitwb bd, ac, iqprod ++++addb imag, ac, bd ++++mergebw dst, real, imag ++++ ++++ ++++ ++++ +++diff -rupN /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3.orc /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3.orc +++--- /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3.orc 1970-01-01 01:00:00.000000000 +0100 ++++++ /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3.orc 2014-10-15 01:55:08.000000000 +0200 +++@@ -0,0 +1,139 @@ ++++#/*! ++++# * \file volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3.orc ++++# * \brief ORC implementation: performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation with 16 bits vectors ++++# * \authors
    ++++# *
  • Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com ++++# *
++++# * ++++# * ORC code that performs the carrier wipe-off mixing and the ++++# * Early, Prompt, and Late correlation with 16 bits vectors (8 bits the ++++# * real part and 8 bits the imaginary part): ++++# * - The carrier wipe-off is done by multiplying the input signal by the ++++# * carrier (multiplication of 16 bits vectors) It returns the input ++++# * signal in base band (BB) ++++# * - Early values are calculated by multiplying the input signal in BB by the ++++# * early code (multiplication of 16 bits vectors), accumulating the results ++++# * - Prompt values are calculated by multiplying the input signal in BB by the ++++# * prompt code (multiplication of 16 bits vectors), accumulating the results ++++# * - Late values are calculated by multiplying the input signal in BB by the ++++# * late code (multiplication of 16 bits vectors), accumulating the results ++++# * ++++# * ------------------------------------------------------------------------- ++++# * ++++# * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) ++++# * ++++# * GNSS-SDR is a software defined Global Navigation ++++# * Satellite Systems receiver ++++# * ++++# * This file is part of GNSS-SDR. ++++# * ++++# * GNSS-SDR is free software: you can redistribute it and/or modify ++++# * it under the terms of the GNU General Public License as published by ++++# * the Free Software Foundation, either version 3 of the License, or ++++# * at your option) any later version. ++++# * ++++# * GNSS-SDR is distributed in the hope that it will be useful, ++++# * but WITHOUT ANY WARRANTY; without even the implied warranty of ++++# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++++# * GNU General Public License for more details. ++++# * ++++# * You should have received a copy of the GNU General Public License ++++# * along with GNSS-SDR. If not, see . 
++++# * ++++# * ------------------------------------------------------------------------- ++++# */ ++++ ++++.function volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_first_a_orc_impl ++++.source 2 input ++++.source 2 carrier ++++.source 2 E_code ++++.source 2 P_code ++++.accumulator 2 E_out_real ++++.accumulator 2 E_out_imag ++++.accumulator 2 P_out_real ++++.accumulator 2 P_out_imag ++++.temp 2 bb_signal_sample ++++.temp 2 iqprod ++++.temp 1 real ++++.temp 1 imag ++++.temp 1 ac ++++.temp 1 bd ++++.temp 2 swapped ++++ ++++.temp 2 real2 ++++.temp 2 imag2 ++++ ++++x2 mullb iqprod, input, carrier ++++splitwb bd, ac, iqprod ++++subb real, ac, bd ++++swapw swapped, input ++++x2 mullb iqprod, swapped, carrier ++++splitwb bd, ac, iqprod ++++addb imag, ac, bd ++++mergebw bb_signal_sample, real, imag ++++ ++++swapw swapped, bb_signal_sample ++++ ++++x2 mullb iqprod, bb_signal_sample, E_code ++++splitwb bd, ac, iqprod ++++subb real, ac, bd ++++x2 mullb iqprod, swapped, E_code ++++splitwb bd, ac, iqprod ++++addb imag, ac, bd ++++mergebw real2, 0, real ++++mergebw imag2, 0, imag ++++accw E_out_real, real2 ++++accw E_out_imag, imag2 ++++ ++++x2 mullb iqprod, bb_signal_sample, P_code ++++splitwb bd, ac, iqprod ++++subb real, ac, bd ++++x2 mullb iqprod, swapped, P_code ++++splitwb bd, ac, iqprod ++++addb imag, ac, bd ++++mergebw real2, 0, real ++++mergebw imag2, 0, imag ++++accw P_out_real, real2 ++++accw P_out_imag, imag2 ++++ ++++.function volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_second_a_orc_impl ++++.source 2 input ++++.source 2 carrier ++++.source 2 L_code ++++.accumulator 2 L_out_real ++++.accumulator 2 L_out_imag ++++ ++++.temp 2 bb_signal_sample ++++.temp 2 iqprod ++++.temp 1 real ++++.temp 1 imag ++++.temp 1 ac ++++.temp 1 bd ++++.temp 2 swapped ++++ ++++.temp 2 real2 ++++.temp 2 imag2 ++++ ++++x2 mullb iqprod, input, carrier ++++splitwb bd, ac, iqprod ++++subb real, ac, bd ++++swapw swapped, input ++++x2 mullb iqprod, swapped, carrier ++++splitwb bd, ac, iqprod ++++addb imag, ac, 
bd ++++mergebw bb_signal_sample, real, imag ++++ ++++swapw swapped, bb_signal_sample ++++ ++++x2 mullb iqprod, bb_signal_sample, L_code ++++splitwb bd, ac, iqprod ++++subb real, ac, bd ++++x2 mullb iqprod, swapped, L_code ++++splitwb bd, ac, iqprod ++++addb imag, ac, bd ++++mergebw real2, 0, real ++++mergebw imag2, 0, imag ++++accw L_out_real, real2 ++++accw L_out_imag, imag2 ++++ ++++ +++diff -rupN /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8u_x2_multiply_8u.orc /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8u_x2_multiply_8u.orc +++--- /Users/andres/Desktop/volk_gnsssdr/orc/volk_gnsssdr_8u_x2_multiply_8u.orc 1970-01-01 01:00:00.000000000 +0100 ++++++ /Users/andres/Desktop/volk_gnsssdr_original/orc/volk_gnsssdr_8u_x2_multiply_8u.orc 2014-10-15 01:55:08.000000000 +0200 +++@@ -0,0 +1,39 @@ ++++#/*! ++++# * \file volk_gnsssdr_8u_x2_multiply_8u.orc ++++# * \brief ORC implementation: multiplies unsigned char values ++++# * \authors
    ++++# *
  • Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com ++++# *
++++# * ++++# * ORC code that multiplies unsigned char values (8 bits data) ++++# * ++++# * ------------------------------------------------------------------------- ++++# * ++++# * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) ++++# * ++++# * GNSS-SDR is a software defined Global Navigation ++++# * Satellite Systems receiver ++++# * ++++# * This file is part of GNSS-SDR. ++++# * ++++# * GNSS-SDR is free software: you can redistribute it and/or modify ++++# * it under the terms of the GNU General Public License as published by ++++# * the Free Software Foundation, either version 3 of the License, or ++++# * at your option) any later version. ++++# * ++++# * GNSS-SDR is distributed in the hope that it will be useful, ++++# * but WITHOUT ANY WARRANTY; without even the implied warranty of ++++# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++++# * GNU General Public License for more details. ++++# * ++++# * You should have received a copy of the GNU General Public License ++++# * along with GNSS-SDR. If not, see . ++++# * ++++# * ------------------------------------------------------------------------- ++++# */ ++++ ++++.function volk_gnsssdr_8u_x2_multiply_8u_a_orc_impl ++++.source 1 src1 ++++.source 1 src2 ++++.dest 1 dst ++++mullb dst, src1, src2 +++diff -rupN /Users/andres/Desktop/volk_gnsssdr/python/volk_gnsssdr_modtool/CMakeLists.txt /Users/andres/Desktop/volk_gnsssdr_original/python/volk_gnsssdr_modtool/CMakeLists.txt +++--- /Users/andres/Desktop/volk_gnsssdr/python/volk_gnsssdr_modtool/CMakeLists.txt 1970-01-01 01:00:00.000000000 +0100 ++++++ /Users/andres/Desktop/volk_gnsssdr_original/python/volk_gnsssdr_modtool/CMakeLists.txt 2014-10-15 01:55:08.000000000 +0200 +++@@ -0,0 +1,39 @@ ++++# Copyright 2013 Free Software Foundation, Inc. 
++++# ++++# This file is part of GNU Radio ++++# ++++# GNU Radio is free software; you can redistribute it and/or modify ++++# it under the terms of the GNU General Public License as published by ++++# the Free Software Foundation; either version 3, or (at your option) ++++# any later version. ++++# ++++# GNU Radio is distributed in the hope that it will be useful, ++++# but WITHOUT ANY WARRANTY; without even the implied warranty of ++++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++++# GNU General Public License for more details. ++++# ++++# You should have received a copy of the GNU General Public License ++++# along with GNU Radio; see the file COPYING. If not, write to ++++# the Free Software Foundation, Inc., 51 Franklin Street, ++++# Boston, MA 02110-1301, USA. ++++ ++++######################################################################## ++++# Install python files and apps ++++######################################################################## ++++include(GrPython) ++++ ++++VOLK_PYTHON_INSTALL( ++++ FILES ++++ __init__.py ++++ cfg.py ++++ volk_gnsssdr_modtool_generate.py ++++ DESTINATION ${VOLK_PYTHON_DIR}/volk_gnsssdr_modtool ++++ COMPONENT "volk_gnsssdr" ++++) ++++ ++++VOLK_PYTHON_INSTALL( ++++ PROGRAMS ++++ volk_gnsssdr_modtool ++++ DESTINATION ${VOLK_RUNTIME_DIR} ++++ COMPONENT "volk_gnsssdr" ++++) +++diff -rupN /Users/andres/Desktop/volk_gnsssdr/python/volk_gnsssdr_modtool/README /Users/andres/Desktop/volk_gnsssdr_original/python/volk_gnsssdr_modtool/README +++--- /Users/andres/Desktop/volk_gnsssdr/python/volk_gnsssdr_modtool/README 1970-01-01 01:00:00.000000000 +0100 ++++++ /Users/andres/Desktop/volk_gnsssdr_original/python/volk_gnsssdr_modtool/README 2014-10-15 01:55:08.000000000 +0200 +++@@ -0,0 +1,114 @@ ++++The volk_gnsssdr_modtool tool is installed along with VOLK as a way of helping ++++to construct, add to, and interogate the VOLK library or companion ++++libraries. 
++++ ++++volk_gnsssdr_modtool is installed into $prefix/bin. ++++ ++++VOLK modtool enables creating standalone (out-of-tree) VOLK modules ++++and provides a few tools for sharing VOLK kernels between VOLK ++++modules. If you need to design or work with VOLK kernels away from ++++the canonical VOLK library, this is the tool. If you need to tailor ++++your own VOLK library for whatever reason, this is the tool. ++++ ++++The canonical VOLK library installs a volk_gnsssdr.h and a libvolk_gnsssdr.so. Your ++++own library will install volk_gnsssdr_$name.h and libvolk_gnsssdr_$name.so. Ya Gronk? ++++Good. ++++ ++++There isn't a substantial difference between the canonical VOLK ++++module and any other VOLK module. They're all peers. Any module ++++created via VOLK modtool will come complete with a default ++++volk_gnsssdr_modtool.cfg file associating the module with the base from which ++++it came, its distinctive $name and its destination (or path). These ++++values (created from user input if VOLK modtool runs without a ++++user-supplied config file or a default config file) serve as default ++++values for some VOLK modtool actions. It's more or less intended for ++++the user to change directories to the top level of a created VOLK ++++module and then run volk_gnsssdr_modtool to take advantage of the values ++++stored in the default volk_gnsssdr_modtool.cfg file. ++++ ++++Apart from creating new VOLK modules, VOLK modtool allows you to list ++++the names of kernels in other modules, list the names of kernels in ++++the current module, add kernels from another module into the current ++++module, and remove kernels from the current module. When moving ++++kernels between modules, VOLK modtool does its best to keep the qa ++++and profiling code for those kernels intact. If the base has a test ++++or a profiling call for some kernel, those calls will follow the ++++kernel when VOLK modtool adds that kernel. 
If QA or profiling ++++requires a puppet kernel, the puppet kernel will follow the original ++++kernel when VOLK modtool adds that original kernel. VOLK modtool ++++respects puppets. ++++ ++++====================================================================== ++++ ++++Installing a new VOLK Library: ++++ ++++Run the command "volk_gnsssdr_modtool -i". This will ask you three questions: ++++ ++++ name: // the name to give your VOLK library: volk_gnsssdr_ ++++ destination: // directory new source tree is built under -- must exists. ++++ // It will create /volk_gnsssdr_ ++++ base: // the directory containing the original VOLK source code ++++ ++++The name provided must be alphanumeric (and cannot start with a ++++number). No special characters including dashes and underscores are ++++allowed. ++++ ++++This will build a new skeleton directory in the destination provided ++++with the name volk_gnsssdr_. It will contain the necessary structure to ++++build: ++++ ++++ mkdir build ++++ cd build ++++ cmake -DCMAKE_INSTALL_PREFIX=/opt/volk_gnsssdr ../ ++++ make ++++ sudo make install ++++ ++++Right now, the library is empty and contains no kernels. Kernels can ++++be added from another VOLK library using the '-a' option. If not ++++specified, the kernel will be extracted from the base VOLK ++++directory. Using the '-b' allows us to specify another VOLK library to ++++use for this purpose. ++++ ++++ volk_gnsssdr_modtool -a -n 32fc_x2_conjugate_dot_prod_32fc ++++ ++++This will put the code for the new kernel into ++++/volk_gnsssdr_/kernels/volk_gnsssdr_/ ++++ ++++Other kernels must be added by hand. 
See the following webpages for ++++more information about creating VOLK kernels: ++++ http://gnuradio.org/doc/doxygen/volk_gnsssdr_guide.html ++++ http://gnuradio.org/redmine/projects/gnuradio/wiki/Volk ++++ ++++ ++++====================================================================== ++++ ++++OPTIONS ++++ ++++Options for Adding and Removing Kernels: ++++ -a, --add_kernel ++++ Add kernel from existing VOLK module. Uses the base VOLK module ++++ unless -b is used. Use -n to specify the kernel name. ++++ Requires: -n. ++++ Optional: -b ++++ ++++ -A, --add_all_kernels ++++ Add all kernels from existing VOLK module. Uses the base VOLK ++++ module unless -b is used. ++++ Optional: -b ++++ ++++ -x, --remove_kernel ++++ Remove kernel from module. ++++ Required: -n. ++++ Optional: -b ++++ ++++Options for Listing Kernels: ++++ -l, --list ++++ Lists all kernels available in the base VOLK module. ++++ ++++ -k, --kernels ++++ Lists all kernels in this VOLK module. ++++ ++++ -r, --remote-list ++++ Lists all kernels in another VOLK module that is specified ++++ using the -b option. ++++ +++diff -rupN /Users/andres/Desktop/volk_gnsssdr/python/volk_gnsssdr_modtool/__init__.py /Users/andres/Desktop/volk_gnsssdr_original/python/volk_gnsssdr_modtool/__init__.py +++--- /Users/andres/Desktop/volk_gnsssdr/python/volk_gnsssdr_modtool/__init__.py 1970-01-01 01:00:00.000000000 +0100 ++++++ /Users/andres/Desktop/volk_gnsssdr_original/python/volk_gnsssdr_modtool/__init__.py 2014-10-15 01:55:08.000000000 +0200 +++@@ -0,0 +1,24 @@ ++++#!/usr/bin/env python ++++# ++++# Copyright 2013 Free Software Foundation, Inc. ++++# ++++# This file is part of GNU Radio ++++# ++++# GNU Radio is free software; you can redistribute it and/or modify ++++# it under the terms of the GNU General Public License as published by ++++# the Free Software Foundation; either version 3, or (at your option) ++++# any later version. 
++++# ++++# GNU Radio is distributed in the hope that it will be useful, ++++# but WITHOUT ANY WARRANTY; without even the implied warranty of ++++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++++# GNU General Public License for more details. ++++# ++++# You should have received a copy of the GNU General Public License ++++# along with GNU Radio; see the file COPYING. If not, write to ++++# the Free Software Foundation, Inc., 51 Franklin Street, ++++# Boston, MA 02110-1301, USA. ++++# ++++ ++++from cfg import volk_gnsssdr_modtool_config ++++from volk_gnsssdr_modtool_generate import volk_gnsssdr_modtool +++Binary files /Users/andres/Desktop/volk_gnsssdr/python/volk_gnsssdr_modtool/__init__.pyc and /Users/andres/Desktop/volk_gnsssdr_original/python/volk_gnsssdr_modtool/__init__.pyc differ +++diff -rupN /Users/andres/Desktop/volk_gnsssdr/python/volk_gnsssdr_modtool/cfg.py /Users/andres/Desktop/volk_gnsssdr_original/python/volk_gnsssdr_modtool/cfg.py +++--- /Users/andres/Desktop/volk_gnsssdr/python/volk_gnsssdr_modtool/cfg.py 1970-01-01 01:00:00.000000000 +0100 ++++++ /Users/andres/Desktop/volk_gnsssdr_original/python/volk_gnsssdr_modtool/cfg.py 2014-10-15 01:55:08.000000000 +0200 +++@@ -0,0 +1,104 @@ ++++#!/usr/bin/env python ++++# ++++# Copyright 2013 Free Software Foundation, Inc. ++++# ++++# This file is part of GNU Radio ++++# ++++# GNU Radio is free software; you can redistribute it and/or modify ++++# it under the terms of the GNU General Public License as published by ++++# the Free Software Foundation; either version 3, or (at your option) ++++# any later version. ++++# ++++# GNU Radio is distributed in the hope that it will be useful, ++++# but WITHOUT ANY WARRANTY; without even the implied warranty of ++++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++++# GNU General Public License for more details. 
++++# ++++# You should have received a copy of the GNU General Public License ++++# along with GNU Radio; see the file COPYING. If not, write to ++++# the Free Software Foundation, Inc., 51 Franklin Street, ++++# Boston, MA 02110-1301, USA. ++++# ++++ ++++import ConfigParser ++++import sys ++++import os ++++import exceptions ++++import re ++++ ++++ ++++class volk_gnsssdr_modtool_config: ++++ def key_val_sub(self, num, stuff, section): ++++ return re.sub('\$' + 'k' + str(num), stuff[num][0], (re.sub('\$' + str(num), stuff[num][1], section[1][num]))); ++++ ++++ def verify(self): ++++ for i in self.verification: ++++ self.verify_section(i) ++++ def remap(self): ++++ for i in self.remapification: ++++ self.verify_section(i) ++++ ++++ def verify_section(self, section): ++++ stuff = self.cfg.items(section[0]) ++++ for i in range(len(section[1])): ++++ eval(self.key_val_sub(i, stuff, section)) ++++ try: ++++ val = eval(self.key_val_sub(i, stuff, section)) ++++ if val == False: ++++ raise exceptions.ValueError ++++ except ValueError: ++++ raise exceptions.ValueError('Verification function returns False... key:%s, val:%s'%(stuff[i][0], stuff[i][1])) ++++ except: ++++ raise exceptions.IOError('bad configuration... 
key:%s, val:%s'%(stuff[i][0], stuff[i][1])) ++++ ++++ ++++ def __init__(self, cfg=None): ++++ self.config_name = 'config' ++++ self.config_defaults = ['name', 'destination', 'base'] ++++ self.config_defaults_remap = ['1', ++++ 'self.cfg.set(self.config_name, \'$k1\', os.path.realpath(os.path.expanduser(\'$1\')))', ++++ 'self.cfg.set(self.config_name, \'$k2\', os.path.realpath(os.path.expanduser(\'$2\')))'] ++++ ++++ self.config_defaults_verify = ['re.match(\'[a-zA-Z0-9]+$\', \'$0\')', ++++ 'os.path.exists(\'$1\')', ++++ 'os.path.exists(\'$2\')'] ++++ self.remapification = [(self.config_name, self.config_defaults_remap)] ++++ self.verification = [(self.config_name, self.config_defaults_verify)] ++++ default = os.path.join(os.getcwd(), 'volk_gnsssdr_modtool.cfg') ++++ icfg = ConfigParser.RawConfigParser() ++++ if cfg: ++++ icfg.read(cfg) ++++ elif os.path.exists(default): ++++ icfg.read(default) ++++ else: ++++ print "Initializing config file..." ++++ icfg.add_section(self.config_name) ++++ for kn in self.config_defaults: ++++ rv = raw_input("%s: "%(kn)) ++++ icfg.set(self.config_name, kn, rv) ++++ self.cfg = icfg ++++ self.remap() ++++ self.verify() ++++ ++++ ++++ ++++ def read_map(self, name, inp): ++++ if self.cfg.has_section(name): ++++ self.cfg.remove_section(name) ++++ self.cfg.add_section(name) ++++ for i in inp: ++++ self.cfg.set(name, i, inp[i]) ++++ ++++ def get_map(self, name): ++++ retval = {} ++++ stuff = self.cfg.items(name) ++++ for i in stuff: ++++ retval[i[0]] = i[1] ++++ return retval ++++ ++++ ++++ ++++ ++++ ++++ ++++ +++Binary files /Users/andres/Desktop/volk_gnsssdr/python/volk_gnsssdr_modtool/cfg.pyc and /Users/andres/Desktop/volk_gnsssdr_original/python/volk_gnsssdr_modtool/cfg.pyc differ +++diff -rupN /Users/andres/Desktop/volk_gnsssdr/python/volk_gnsssdr_modtool/volk_gnsssdr_modtool /Users/andres/Desktop/volk_gnsssdr_original/python/volk_gnsssdr_modtool/volk_gnsssdr_modtool +++--- 
/Users/andres/Desktop/volk_gnsssdr/python/volk_gnsssdr_modtool/volk_gnsssdr_modtool 1970-01-01 01:00:00.000000000 +0100 ++++++ /Users/andres/Desktop/volk_gnsssdr_original/python/volk_gnsssdr_modtool/volk_gnsssdr_modtool 2014-10-15 01:55:08.000000000 +0200 +++@@ -0,0 +1,128 @@ ++++#!/usr/bin/env python ++++# ++++# Copyright 2013 Free Software Foundation, Inc. ++++# ++++# This file is part of GNU Radio ++++# ++++# GNU Radio is free software; you can redistribute it and/or modify ++++# it under the terms of the GNU General Public License as published by ++++# the Free Software Foundation; either version 3, or (at your option) ++++# any later version. ++++# ++++# GNU Radio is distributed in the hope that it will be useful, ++++# but WITHOUT ANY WARRANTY; without even the implied warranty of ++++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++++# GNU General Public License for more details. ++++# ++++# You should have received a copy of the GNU General Public License ++++# along with GNU Radio; see the file COPYING. If not, write to ++++# the Free Software Foundation, Inc., 51 Franklin Street, ++++# Boston, MA 02110-1301, USA. ++++# ++++ ++++from volk_gnsssdr_modtool import volk_gnsssdr_modtool, volk_gnsssdr_modtool_config ++++from optparse import OptionParser, OptionGroup ++++ ++++import exceptions ++++import os ++++import sys ++++ ++++if __name__ == '__main__': ++++ parser = OptionParser(); ++++ actions = OptionGroup(parser, 'Actions'); ++++ actions.add_option('-i', '--install', action='store_true', ++++ help='Create a new volk_gnsssdr module.') ++++ parser.add_option('-b', '--base_path', action='store', default=None, ++++ help='Base path for action. By default, volk_gnsssdr_modtool.cfg loads this value.') ++++ parser.add_option('-n', '--kernel_name', action='store', default=None, ++++ help='Kernel name for action. 
No default') ++++ parser.add_option('-c', '--config', action='store', dest='config_file', default=None, ++++ help='Config file for volk_gnsssdr_modtool. By default, volk_gnsssdr_modtool.cfg in the local directory will be used/created.') ++++ actions.add_option('-a', '--add_kernel', action='store_true', ++++ help='Add kernel from existing volk_gnsssdr module. Requires: -n. Optional: -b') ++++ actions.add_option('-A', '--add_all_kernels', action='store_true', ++++ help='Add all kernels from existing volk_gnsssdr module. Optional: -b') ++++ actions.add_option('-x', '--remove_kernel', action='store_true', ++++ help='Remove kernel from module. Required: -n. Optional: -b') ++++ actions.add_option('-l', '--list', action='store_true', ++++ help='List all kernels in the base.') ++++ actions.add_option('-k', '--kernels', action='store_true', ++++ help='List all kernels in the module.') ++++ actions.add_option('-r', '--remote_list', action='store_true', ++++ help='List all available kernels in remote volk_gnsssdr module. 
Requires: -b.') ++++ actions.add_option('-m', '--moo', action='store_true', ++++ help='Have you mooed today?') ++++ parser.add_option_group(actions) ++++ ++++ (options, args) = parser.parse_args(); ++++ if len(sys.argv) < 2: ++++ parser.print_help() ++++ ++++ elif options.moo: ++++ print " (__) " ++++ print " (oo) " ++++ print " /------\/ " ++++ print " / | || " ++++ print " * /\---/\ " ++++ print " ~~ ~~ " ++++ ++++ else: ++++ my_cfg = volk_gnsssdr_modtool_config(options.config_file); ++++ ++++ my_modtool = volk_gnsssdr_modtool(my_cfg.get_map(my_cfg.config_name)); ++++ ++++ ++++ if options.install: ++++ my_modtool.make_module_skeleton(); ++++ my_modtool.write_default_cfg(my_cfg.cfg); ++++ ++++ ++++ if options.add_kernel: ++++ if not options.kernel_name: ++++ raise exceptions.IOError("This action requires the -n option."); ++++ else: ++++ name = options.kernel_name; ++++ if options.base_path: ++++ base = options.base_path; ++++ else: ++++ base = my_cfg.cfg.get(my_cfg.config_name, 'base'); ++++ my_modtool.import_kernel(name, base); ++++ ++++ if options.remove_kernel: ++++ if not options.kernel_name: ++++ raise exceptions.IOError("This action requires the -n option."); ++++ else: ++++ name = options.kernel_name; ++++ my_modtool.remove_kernel(name); ++++ ++++ if options.add_all_kernels: ++++ ++++ if options.base_path: ++++ base = options.base_path; ++++ else: ++++ base = my_cfg.cfg.get(my_cfg.config_name, 'base'); ++++ kernelset = my_modtool.get_current_kernels(base); ++++ for i in kernelset: ++++ my_modtool.import_kernel(i, base); ++++ ++++ if options.remote_list: ++++ if not options.base_path: ++++ raise exceptions.IOError("This action requires the -b option. 
Try -l or -k for listing kernels in the base or the module.") ++++ else: ++++ base = options.base_path; ++++ kernelset = my_modtool.get_current_kernels(base); ++++ for i in kernelset: ++++ print i; ++++ ++++ if options.list: ++++ kernelset = my_modtool.get_current_kernels(); ++++ for i in kernelset: ++++ print i; ++++ ++++ if options.kernels: ++++ dest = my_cfg.cfg.get(my_cfg.config_name, 'destination'); ++++ name = my_cfg.cfg.get(my_cfg.config_name, 'name'); ++++ base = os.path.join(dest, 'volk_gnsssdr_' + name); ++++ kernelset = my_modtool.get_current_kernels(base); ++++ for i in kernelset: ++++ print i; +++diff -rupN /Users/andres/Desktop/volk_gnsssdr/python/volk_gnsssdr_modtool/volk_gnsssdr_modtool_generate.py /Users/andres/Desktop/volk_gnsssdr_original/python/volk_gnsssdr_modtool/volk_gnsssdr_modtool_generate.py +++--- /Users/andres/Desktop/volk_gnsssdr/python/volk_gnsssdr_modtool/volk_gnsssdr_modtool_generate.py 1970-01-01 01:00:00.000000000 +0100 ++++++ /Users/andres/Desktop/volk_gnsssdr_original/python/volk_gnsssdr_modtool/volk_gnsssdr_modtool_generate.py 2014-10-15 01:55:08.000000000 +0200 +++@@ -0,0 +1,330 @@ ++++# ++++# Copyright 2013 Free Software Foundation, Inc. ++++# ++++# This file is part of GNU Radio ++++# ++++# GNU Radio is free software; you can redistribute it and/or modify ++++# it under the terms of the GNU General Public License as published by ++++# the Free Software Foundation; either version 3, or (at your option) ++++# any later version. ++++# ++++# GNU Radio is distributed in the hope that it will be useful, ++++# but WITHOUT ANY WARRANTY; without even the implied warranty of ++++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++++# GNU General Public License for more details. ++++# ++++# You should have received a copy of the GNU General Public License ++++# along with GNU Radio; see the file COPYING. If not, write to ++++# the Free Software Foundation, Inc., 51 Franklin Street, ++++# Boston, MA 02110-1301, USA. 
++++# ++++ ++++import os ++++import glob ++++import sys ++++import re ++++import glob ++++import shutil ++++import exceptions ++++from sets import Set ++++ ++++class volk_gnsssdr_modtool: ++++ def __init__(self, cfg): ++++ self.volk_gnsssdr = re.compile('volk_gnsssdr'); ++++ self.remove_after_underscore = re.compile("_.*"); ++++ self.volk_gnsssdr_run_tests = re.compile('^\s*VOLK_RUN_TESTS.*\n', re.MULTILINE); ++++ self.volk_gnsssdr_profile = re.compile('^\s*(VOLK_PROFILE|VOLK_PUPPET_PROFILE).*\n', re.MULTILINE); ++++ self.my_dict = cfg; ++++ self.lastline = re.compile('\s*char path\[1024\];.*'); ++++ self.badassert = re.compile('^\s*assert\(toked\[0\] == "volk_gnsssdr_.*\n', re.MULTILINE); ++++ self.goodassert = ' assert(toked[0] == "volk_gnsssdr");\n' ++++ self.baderase = re.compile('^\s*toked.erase\(toked.begin\(\)\);.*\n', re.MULTILINE); ++++ self.gooderase = ' toked.erase(toked.begin());\n toked.erase(toked.begin());\n'; ++++ ++++ def get_basename(self, base=None): ++++ if not base: ++++ base = self.my_dict['base'] ++++ candidate = base.split('/')[-1]; ++++ if len(candidate.split('_')) == 1: ++++ return ''; ++++ else: ++++ return candidate.split('_')[-1]; ++++ ++++ def get_current_kernels(self, base=None): ++++ if not base: ++++ base = self.my_dict['base'] ++++ name = self.get_basename(); ++++ else: ++++ name = self.get_basename(base); ++++ if name == '': ++++ hdr_files = glob.glob(os.path.join(base, "kernels/volk_gnsssdr/*.h")); ++++ begins = re.compile("(?<=volk_gnsssdr_).*") ++++ else: ++++ hdr_files = glob.glob(os.path.join(base, "kernels/volk_gnsssdr_" + name + "/*.h")); ++++ begins = re.compile("(?<=volk_gnsssdr_" + name + "_).*") ++++ ++++ datatypes = []; ++++ functions = []; ++++ ++++ ++++ for line in hdr_files: ++++ ++++ subline = re.search(".*\.h.*", os.path.basename(line)) ++++ if subline: ++++ subsubline = begins.search(subline.group(0)); ++++ if subsubline: ++++ dtype = self.remove_after_underscore.sub("", subsubline.group(0)); ++++ subdtype = 
re.search("[0-9]+[A-z]+", dtype); ++++ if subdtype: ++++ datatypes.append(subdtype.group(0)); ++++ ++++ ++++ datatypes = set(datatypes); ++++ ++++ for line in hdr_files: ++++ for dt in datatypes: ++++ if dt in line: ++++ #subline = re.search("(?<=volk_gnsssdr_)" + dt + ".*(?=\.h)", line); ++++ subline = re.search(begins.pattern[:-2] + dt + ".*(?=\.h)", line); ++++ if subline: ++++ functions.append(subline.group(0)); ++++ ++++ return set(functions); ++++ ++++ def make_module_skeleton(self): ++++ ++++ dest = os.path.join(self.my_dict['destination'], 'volk_gnsssdr_' + self.my_dict['name']) ++++ if os.path.exists(dest): ++++ raise exceptions.IOError("Destination %s already exits!"%(dest)); ++++ ++++ if not os.path.exists(os.path.join(self.my_dict['destination'], 'volk_gnsssdr_' + self.my_dict['name'], 'kernels/volk_gnsssdr_' + self.my_dict['name'])): ++++ os.makedirs(os.path.join(self.my_dict['destination'], 'volk_gnsssdr_' + self.my_dict['name'], 'kernels/volk_gnsssdr_' + self.my_dict['name'])) ++++ ++++ current_kernel_names = self.get_current_kernels(); ++++ ++++ for root, dirnames, filenames in os.walk(self.my_dict['base']): ++++ for name in filenames: ++++ t_table = map(lambda a: re.search(a, name), current_kernel_names); ++++ t_table = set(t_table); ++++ if t_table == set([None]): ++++ infile = os.path.join(root, name); ++++ instring = open(infile, 'r').read(); ++++ outstring = re.sub(self.volk_gnsssdr, 'volk_gnsssdr_' + self.my_dict['name'], instring); ++++ newname = re.sub(self.volk_gnsssdr, 'volk_gnsssdr_' + self.my_dict['name'], name); ++++ relpath = os.path.relpath(infile, self.my_dict['base']); ++++ newrelpath = re.sub(self.volk_gnsssdr, 'volk_gnsssdr_' + self.my_dict['name'], relpath); ++++ dest = os.path.join(self.my_dict['destination'], 'volk_gnsssdr_' + self.my_dict['name'], os.path.dirname(newrelpath), newname); ++++ ++++ if not os.path.exists(os.path.dirname(dest)): ++++ os.makedirs(os.path.dirname(dest)) ++++ open(dest, 'w+').write(outstring); ++++ 
++++ ++++ infile = os.path.join(self.my_dict['destination'], 'volk_gnsssdr_' + self.my_dict['name'], 'lib/testqa.cc'); ++++ instring = open(infile, 'r').read(); ++++ outstring = re.sub(self.volk_gnsssdr_run_tests, '', instring); ++++ open(infile, 'w+').write(outstring); ++++ ++++ infile = os.path.join(self.my_dict['destination'], 'volk_gnsssdr_' + self.my_dict['name'], 'apps/volk_gnsssdr_' + self.my_dict['name'] + '_profile.cc'); ++++ instring = open(infile, 'r').read(); ++++ outstring = re.sub(self.volk_gnsssdr_profile, '', instring); ++++ open(infile, 'w+').write(outstring); ++++ ++++ infile = os.path.join(self.my_dict['destination'], 'volk_gnsssdr_' + self.my_dict['name'], 'lib/qa_utils.cc'); ++++ instring = open(infile, 'r').read(); ++++ outstring = re.sub(self.badassert, self.goodassert, instring); ++++ outstring = re.sub(self.baderase, self.gooderase, outstring); ++++ open(infile, 'w+').write(outstring); ++++ ++++ def write_default_cfg(self, cfg): ++++ outfile = open(os.path.join(self.my_dict['destination'], 'volk_gnsssdr_' + self.my_dict['name'], 'volk_gnsssdr_modtool.cfg'), 'wb'); ++++ cfg.write(outfile); ++++ outfile.close(); ++++ ++++ ++++ def convert_kernel(self, oldvolk_gnsssdr, name, base, inpath, top): ++++ infile = os.path.join(inpath, 'kernels/' + top[:-1] + '/' + top + name + '.h'); ++++ instring = open(infile, 'r').read(); ++++ outstring = re.sub(oldvolk_gnsssdr, 'volk_gnsssdr_' + self.my_dict['name'], instring); ++++ newname = 'volk_gnsssdr_' + self.my_dict['name'] + '_' + name + '.h'; ++++ relpath = os.path.relpath(infile, base); ++++ newrelpath = re.sub(oldvolk_gnsssdr, 'volk_gnsssdr_' + self.my_dict['name'], relpath); ++++ dest = os.path.join(self.my_dict['destination'], 'volk_gnsssdr_' + self.my_dict['name'], os.path.dirname(newrelpath), newname); ++++ ++++ if not os.path.exists(os.path.dirname(dest)): ++++ os.makedirs(os.path.dirname(dest)) ++++ open(dest, 'w+').write(outstring); ++++ ++++ # copy orc proto-kernels if they exist ++++ for 
orcfile in glob.glob(inpath + '/orc/' + top + name + '*.orc'): ++++ if os.path.isfile(orcfile): ++++ instring = open(orcfile, 'r').read(); ++++ outstring = re.sub(oldvolk_gnsssdr, 'volk_gnsssdr_' + self.my_dict['name'], instring); ++++ newname = 'volk_gnsssdr_' + self.my_dict['name'] + '_' + name + '.orc'; ++++ relpath = os.path.relpath(orcfile, base); ++++ newrelpath = re.sub(oldvolk_gnsssdr, 'volk_gnsssdr_' + self.my_dict['name'], relpath); ++++ dest = os.path.join(self.my_dict['destination'], 'volk_gnsssdr_' + self.my_dict['name'], os.path.dirname(newrelpath), newname); ++++ if not os.path.exists(os.path.dirname(dest)): ++++ os.makedirs(os.path.dirname(dest)); ++++ open(dest, 'w+').write(outstring) ++++ ++++ ++++ def remove_kernel(self, name): ++++ basename = self.my_dict['name']; ++++ if len(basename) > 0: ++++ top = 'volk_gnsssdr_' + basename + '_'; ++++ else: ++++ top = 'volk_gnsssdr_' ++++ base = os.path.join(self.my_dict['destination'], top[:-1]) ; ++++ ++++ if not name in self.get_current_kernels(): ++++ ++++ raise exceptions.IOError("Requested kernel %s is not in module %s"%(name,base)); ++++ ++++ ++++ ++++ inpath = os.path.abspath(base); ++++ ++++ ++++ kernel = re.compile(name) ++++ search_kernels = Set([kernel]) ++++ profile = re.compile('^\s*VOLK_PROFILE') ++++ puppet = re.compile('^\s*VOLK_PUPPET') ++++ src_dest = os.path.join(inpath, 'apps/', top[:-1] + '_profile.cc'); ++++ infile = open(src_dest); ++++ otherlines = infile.readlines(); ++++ open(src_dest, 'w+').write(''); ++++ ++++ for otherline in otherlines: ++++ write_okay = True; ++++ if kernel.search(otherline): ++++ write_okay = False; ++++ if puppet.match(otherline): ++++ args = re.search("(?<=VOLK_PUPPET_PROFILE).*", otherline) ++++ m_func = args.group(0).split(',')[0]; ++++ func = re.search('(?<=' + top + ').*', m_func); ++++ search_kernels.add(re.compile(func.group(0))); ++++ if write_okay: ++++ open(src_dest, 'a').write(otherline); ++++ ++++ ++++ src_dest = os.path.join(inpath, 
'lib/testqa.cc') ++++ infile = open(src_dest); ++++ otherlines = infile.readlines(); ++++ open(src_dest, 'w+').write(''); ++++ ++++ for otherline in otherlines: ++++ write_okay = True; ++++ ++++ for kernel in search_kernels: ++++ if kernel.search(otherline): ++++ write_okay = False; ++++ ++++ if write_okay: ++++ open(src_dest, 'a').write(otherline); ++++ ++++ for kernel in search_kernels: ++++ infile = os.path.join(inpath, 'kernels/' + top[:-1] + '/' + top + kernel.pattern + '.h'); ++++ print "Removing kernel %s"%(kernel.pattern) ++++ if os.path.exists(infile): ++++ os.remove(infile); ++++ # remove the orc proto-kernels if they exist. There are no puppets here ++++ # so just need to glob for files matching kernel name ++++ print glob.glob(inpath + '/orc/' + top + name + '*.orc'); ++++ for orcfile in glob.glob(inpath + '/orc/' + top + name + '*.orc'): ++++ print orcfile ++++ if(os.path.exists(orcfile)): ++++ os.remove(orcfile); ++++ ++++ def import_kernel(self, name, base): ++++ if not (base): ++++ base = self.my_dict['base']; ++++ basename = self.getbasename(); ++++ else: ++++ basename = self.get_basename(base); ++++ if not name in self.get_current_kernels(base): ++++ raise exceptions.IOError("Requested kernel %s is not in module %s"%(name,base)); ++++ ++++ inpath = os.path.abspath(base); ++++ if len(basename) > 0: ++++ top = 'volk_gnsssdr_' + basename + '_'; ++++ else: ++++ top = 'volk_gnsssdr_' ++++ oldvolk_gnsssdr = re.compile(top[:-1]); ++++ ++++ self.convert_kernel(oldvolk_gnsssdr, name, base, inpath, top); ++++ ++++ kernel = re.compile(name) ++++ search_kernels = Set([kernel]) ++++ ++++ profile = re.compile('^\s*VOLK_PROFILE') ++++ puppet = re.compile('^\s*VOLK_PUPPET') ++++ infile = open(os.path.join(inpath, 'apps/', oldvolk_gnsssdr.pattern + '_profile.cc')); ++++ otherinfile = open(os.path.join(self.my_dict['destination'], 'volk_gnsssdr_' + self.my_dict['name'], 'apps/volk_gnsssdr_' + self.my_dict['name'] + '_profile.cc')); ++++ dest = 
os.path.join(self.my_dict['destination'], 'volk_gnsssdr_' + self.my_dict['name'], 'apps/volk_gnsssdr_' + self.my_dict['name'] + '_profile.cc'); ++++ lines = infile.readlines(); ++++ otherlines = otherinfile.readlines(); ++++ open(dest, 'w+').write(''); ++++ insert = False; ++++ inserted = False ++++ for otherline in otherlines: ++++ ++++ if self.lastline.match(otherline): ++++ insert = True; ++++ if insert and not inserted: ++++ inserted = True; ++++ for line in lines: ++++ if kernel.search(line): ++++ if profile.match(line): ++++ outline = re.sub(oldvolk_gnsssdr, 'volk_gnsssdr_' + self.my_dict['name'], line); ++++ open(dest, 'a').write(outline); ++++ elif puppet.match(line): ++++ outline = re.sub(oldvolk_gnsssdr, 'volk_gnsssdr_' + self.my_dict['name'], line); ++++ open(dest, 'a').write(outline); ++++ args = re.search("(?<=VOLK_PUPPET_PROFILE).*", line) ++++ m_func = args.group(0).split(',')[0]; ++++ func = re.search('(?<=' + top + ').*', m_func); ++++ search_kernels.add(re.compile(func.group(0))); ++++ self.convert_kernel(oldvolk_gnsssdr, func.group(0), base, inpath, top); ++++ write_okay = True; ++++ for kernel in search_kernels: ++++ if kernel.search(otherline): ++++ write_okay = False ++++ if write_okay: ++++ open(dest, 'a').write(otherline); ++++ ++++ for kernel in search_kernels: ++++ print "Adding kernel %s from module %s"%(kernel.pattern,base) ++++ ++++ infile = open(os.path.join(inpath, 'lib/testqa.cc')); ++++ otherinfile = open(os.path.join(self.my_dict['destination'], 'volk_gnsssdr_' + self.my_dict['name'], 'lib/testqa.cc')); ++++ dest = os.path.join(self.my_dict['destination'], 'volk_gnsssdr_' + self.my_dict['name'], 'lib/testqa.cc'); ++++ lines = infile.readlines(); ++++ otherlines = otherinfile.readlines(); ++++ open(dest, 'w+').write(''); ++++ inserted = False; ++++ insert = False ++++ for otherline in otherlines: ++++ ++++ if (re.match('\s*', otherline) == None or re.match('\s*#.*', otherline) == None): ++++ ++++ insert = True; ++++ if insert and 
not inserted: ++++ inserted = True; ++++ for line in lines: ++++ for kernel in search_kernels: ++++ if kernel.search(line): ++++ if self.volk_gnsssdr_run_tests.match(line): ++++ outline = re.sub(oldvolk_gnsssdr, 'volk_gnsssdr_' + self.my_dict['name'], line); ++++ open(dest, 'a').write(outline); ++++ write_okay = True; ++++ for kernel in search_kernels: ++++ if kernel.search(otherline): ++++ write_okay = False ++++ if write_okay: ++++ open(dest, 'a').write(otherline); ++++ ++++ ++++ ++++ ++++ +++Binary files /Users/andres/Desktop/volk_gnsssdr/python/volk_gnsssdr_modtool/volk_gnsssdr_modtool_generate.pyc and /Users/andres/Desktop/volk_gnsssdr_original/python/volk_gnsssdr_modtool/volk_gnsssdr_modtool_generate.pyc differ +++diff -rupN /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr.tmpl.h /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr.tmpl.h +++--- /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr.tmpl.h 2014-10-17 03:00:41.000000000 +0200 ++++++ /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr.tmpl.h 2014-10-15 01:55:08.000000000 +0200 +++@@ -19,8 +19,8 @@ +++ * Boston, MA 02110-1301, USA. +++ */ +++ +++-#ifndef INCLUDED_VOLK_RUNTIME +++-#define INCLUDED_VOLK_RUNTIME ++++#ifndef INCLUDED_VOLK_GNSSSDR_RUNTIME ++++#define INCLUDED_VOLK_GNSSSDR_RUNTIME +++ +++ #include +++ #include +++diff -rupN /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_config_fixed.tmpl.h /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_config_fixed.tmpl.h +++--- /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_config_fixed.tmpl.h 2014-10-17 03:00:41.000000000 +0200 ++++++ /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_config_fixed.tmpl.h 2014-10-15 01:55:08.000000000 +0200 +++@@ -19,8 +19,8 @@ +++ * Boston, MA 02110-1301, USA. 
+++ */ +++ +++-#ifndef INCLUDED_VOLK_CONFIG_FIXED_H +++-#define INCLUDED_VOLK_CONFIG_FIXED_H ++++#ifndef INCLUDED_VOLK_GNSSSDR_CONFIG_FIXED_H ++++#define INCLUDED_VOLK_GNSSSDR_CONFIG_FIXED_H +++ +++ #for $i, $arch in enumerate($archs) +++ #define LV_$(arch.name.upper()) $i +++diff -rupN /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_cpu.tmpl.h /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_cpu.tmpl.h +++--- /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_cpu.tmpl.h 2014-10-17 03:00:41.000000000 +0200 ++++++ /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_cpu.tmpl.h 2014-10-15 01:55:08.000000000 +0200 +++@@ -19,8 +19,8 @@ +++ * Boston, MA 02110-1301, USA. +++ */ +++ +++-#ifndef INCLUDED_VOLK_CPU_H +++-#define INCLUDED_VOLK_CPU_H ++++#ifndef INCLUDED_VOLK_GNSSSDR_CPU_H ++++#define INCLUDED_VOLK_GNSSSDR_CPU_H +++ +++ #include +++ +++diff -rupN /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_machines.tmpl.h /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_machines.tmpl.h +++--- /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_machines.tmpl.h 2014-10-17 03:00:41.000000000 +0200 ++++++ /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_machines.tmpl.h 2014-10-15 01:55:08.000000000 +0200 +++@@ -19,8 +19,8 @@ +++ * Boston, MA 02110-1301, USA. +++ */ +++ +++-#ifndef INCLUDED_LIBVOLK_MACHINES_H +++-#define INCLUDED_LIBVOLK_MACHINES_H ++++#ifndef INCLUDED_LIBVOLK_GNSSSDR_MACHINES_H ++++#define INCLUDED_LIBVOLK_GNSSSDR_MACHINES_H +++ +++ #include +++ #include +++diff -rupN /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_typedefs.tmpl.h /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_typedefs.tmpl.h +++--- /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_typedefs.tmpl.h 2014-10-17 03:00:41.000000000 +0200 ++++++ /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_typedefs.tmpl.h 2014-10-15 01:55:08.000000000 +0200 +++@@ -19,8 +19,8 @@ +++ * Boston, MA 02110-1301, USA. 
+++ */ +++ +++-#ifndef INCLUDED_VOLK_TYPEDEFS +++-#define INCLUDED_VOLK_TYPEDEFS ++++#ifndef INCLUDED_VOLK_GNSSSDR_TYPEDEFS ++++#define INCLUDED_VOLK_GNSSSDR_TYPEDEFS +++ +++ #include +++ #include +++diff -rupN /Users/andres/Desktop/volk_gnsssdr/volk_modtool.cfg /Users/andres/Desktop/volk_gnsssdr_original/volk_modtool.cfg +++--- /Users/andres/Desktop/volk_gnsssdr/volk_modtool.cfg 1970-01-01 01:00:00.000000000 +0100 ++++++ /Users/andres/Desktop/volk_gnsssdr_original/volk_modtool.cfg 2014-10-15 01:55:08.000000000 +0200 +++@@ -0,0 +1,5 @@ ++++[config] ++++name = gnsssdr ++++destination = /Users/andres/Github/gnss-sdr/src/algorithms/libs ++++base = /Users/andres/github/gnuradio/volk ++++ ++diff -rupN /Users/andres/Desktop/volk_gnsssdr/patches for generating volk_gnsssdr/README.txt /Users/andres/Desktop/volk_gnsssdr_original/patches for generating volk_gnsssdr/README.txt ++--- /Users/andres/Desktop/volk_gnsssdr/patches for generating volk_gnsssdr/README.txt 1970-01-01 01:00:00.000000000 +0100 +++++ /Users/andres/Desktop/volk_gnsssdr_original/patches for generating volk_gnsssdr/README.txt 2014-10-17 04:26:17.000000000 +0200 ++@@ -0,0 +1,71 @@ +++######################################################################## +++# Patching original volk module +++######################################################################## +++In order to fit the GNSS-SDR needs, the original volk module must be patched. +++ +++The folder containing this file has some patches to automatize the process and +++modify the files quickly. To apply them you will need to run the following command: +++$ patch -p5 < /Path/Of/The/Patch/nameOfThePatch.patch +++ +++The number after “-p” may change, read the patch documentation for more help. +++ +++You may need this information if you want to recreate the volk_gnsssdr module again +++or you want to update the volk_gnsssdr module with the improvements introduced by GNURadio. 
+++ +++######################################################################## +++######################################################################## +++# Operations apply by the patches and other information (not needed if you know how to apply the patches!!!) +++######################################################################## +++######################################################################## +++ +++To create the volk module you will need to follow the following steps: +++In order to understand and follow the creation and setup of the volk_gnsssdr module I will use some absolute paths: /Users/andres/Github/gnuradio => a cloned repository of the GNURadio project. /Users/andres/Github/gnss-sdr => a cloned repository of the GNSS- SDR project. +++ +++######################################################################## +++#FIRST STEP: using volk_modtool to create a new volk module +++######################################################################## +++GNURadio offers a tool called volk_modtool to create and manage new volk modules and their proto-kernels. 
The steps to create the volk_gnsssdr module are: +++ +++1) Export the PYTHONPATH, that indicates where volk_modtool is: +++$ export PYTHONPATH=/Users/andres/Github/gnuradio/volk/python +++ +++2) Go to the folder where volk_modtool executable is: $ cd /Users/andres/Github/gnuradio/volk/python/volk_modtool +++ +++3) Execute volk_modtool indicating that we want to create a new volk module (-i): $ ./volk_modtool -i +++ +++4) volk_modtool will ask us about the name of the newly created module, the destination folder where you want to store it and the base module (the base module is the volk module inside the GNURadio project): name: gnsssdr destination: /Users/andres/Github/gnss-sdr/src/algorithms/libs base: /Users/andres/github/gnuradio/volk +++ +++######################################################################## +++#SECOND STEP: add proto-kernels to the module +++######################################################################## +++After creating the module you will need to add some proto-kernels to it. To accomplish it you will need to: 1) Copy your proto-kernels inside the /kernels folder. Copy the ORC implementations inside the /orc folder. Copy the macros implementations inside the /kernels/CommonMacros folder. (those folders are found in the root of the volk_gnsssdr module) +++ 2) Add one profiling line for each of the proto-kernels inside the /apps/volk_gnsssdr_profile.cc file. +++ +++3) Add one test line for each of the proto-kernels inside the /lib/testqa.cc file. ######################################################################## +++#THIRD STEP: modifications to allow profiling of some proto-kernels with special parameters +++######################################################################## Some of the proto-kernels that GNSS-SDR needs are not supported by the profiling environment of the volk_gnsssdr module. 
In order to profile them some modifications need to be done to two files: 1) Modify /src/algorithms/libs/volk_gnsssdr/lib/qa_utils.cc At the first part of this file there are defined the parameters supported by the environment. The number after run_cast_test indicates the total number of parameters passed to the proto-kernel (input +output parameters). The other part indicates the type of the data passed. Inside func(....) you will need to add the same number of buffs[ ] that the one specified after run_cast_test. +++ +++2) Modify /src/algorithms/libs/volk_gnsssdr/lib/qa_utils.h In the header you will need to add typedefs for the new definitions made in the .cc file. Take care: you will need to add the same number of void * that the one specified after run_cast_test. +++ +++######################################################################## +++#FOURTH STEP: optional modifications +++######################################################################## +++1) Modify /src/algorithms/libs/volk_gnsssdr/lib/CMakeLists.txt in order to see kernel files, ORC files and macros when generating the IDE project. +++ +++2) To be able to use volk_gnsssdr and default volk functions at the same time i n the same file you will need to modify the template files that volk_gnsssdr module uses at build time to generate some headers. +++The files modified are found inside /tmpl: volk_gnsssdr.tmpl.h +++volk_gnsssdr_typedefs.tmpl.h +++volk_gnsssdr_machines.tmpl.h +++volk_gnsssdr_cpu.tmpl.h +++volk_gnsssdr_config_fixed.tmpl.h The modifications consist of changing the defines of those files to different ones to allow the definition of the volk_gnsssdr functions although the default volk functions are already defined. 
+++ +++######################################################################## +++#FIFTH STEP: add volk_gnsssdr module to the GNSS-SDR project +++######################################################################## +++In order to add the volk_gnsssdr module to the GNSS-SDR project the CMakeLists.txt global file needs to be edited. +++ +++######################################################################## +++#SIXTH STEP: using volk_gnsssdr functions +++######################################################################## +++To use the proto-kernels inside volk_gnsssdr project two steps are needed: 1) in the CMakeFiles.txt you will need to add $ {VOLK_GNSSSDR_INCLUDE_DIRS} inside the include_directories function, and also add $ {VOLK_GNSSSDR_LIBRARIES} inside the target_link_libraries function. +++ 2) Add the line #include “volk_gnsssdr.h” at the top of the file. ++\ No newline at end of file ++Binary files /Users/andres/Desktop/volk_gnsssdr/python/volk_gnsssdr_modtool/__init__.pyc and /Users/andres/Desktop/volk_gnsssdr_original/python/volk_gnsssdr_modtool/__init__.pyc differ ++Binary files /Users/andres/Desktop/volk_gnsssdr/python/volk_gnsssdr_modtool/cfg.pyc and /Users/andres/Desktop/volk_gnsssdr_original/python/volk_gnsssdr_modtool/cfg.pyc differ ++Binary files /Users/andres/Desktop/volk_gnsssdr/python/volk_gnsssdr_modtool/volk_gnsssdr_modtool_generate.pyc and /Users/andres/Desktop/volk_gnsssdr_original/python/volk_gnsssdr_modtool/volk_gnsssdr_modtool_generate.pyc differ ++diff -rupN /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr.tmpl.h /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr.tmpl.h ++--- /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr.tmpl.h 2014-10-17 04:26:39.000000000 +0200 +++++ /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr.tmpl.h 2014-10-17 04:23:30.000000000 +0200 ++@@ -19,8 +19,8 @@ ++ * Boston, MA 02110-1301, USA. 
++ */ ++ ++-#ifndef INCLUDED_VOLK_RUNTIME ++-#define INCLUDED_VOLK_RUNTIME +++#ifndef INCLUDED_VOLK_GNSSSDR_RUNTIME +++#define INCLUDED_VOLK_GNSSSDR_RUNTIME ++ ++ #include ++ #include ++@@ -91,4 +91,4 @@ extern VOLK_API volk_gnsssdr_func_desc_t ++ ++ __VOLK_DECL_END ++ ++-#endif /*INCLUDED_VOLK_RUNTIME*/ +++#endif /*INCLUDED_VOLK_GNSSSDR_RUNTIME*/ ++diff -rupN /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_config_fixed.tmpl.h /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_config_fixed.tmpl.h ++--- /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_config_fixed.tmpl.h 2014-10-17 04:26:39.000000000 +0200 +++++ /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_config_fixed.tmpl.h 2014-10-17 04:22:58.000000000 +0200 ++@@ -19,11 +19,11 @@ ++ * Boston, MA 02110-1301, USA. ++ */ ++ ++-#ifndef INCLUDED_VOLK_CONFIG_FIXED_H ++-#define INCLUDED_VOLK_CONFIG_FIXED_H +++#ifndef INCLUDED_VOLK_GNSSSDR_CONFIG_FIXED_H +++#define INCLUDED_VOLK_GNSSSDR_CONFIG_FIXED_H ++ ++ #for $i, $arch in enumerate($archs) ++ #define LV_$(arch.name.upper()) $i ++ #end for ++ ++-#endif /*INCLUDED_VOLK_CONFIG_FIXED*/ +++#endif /*INCLUDED_VOLK_GNSSSDR_CONFIG_FIXED*/ ++diff -rupN /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_cpu.tmpl.h /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_cpu.tmpl.h ++--- /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_cpu.tmpl.h 2014-10-17 04:26:39.000000000 +0200 +++++ /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_cpu.tmpl.h 2014-10-17 04:23:07.000000000 +0200 ++@@ -19,8 +19,8 @@ ++ * Boston, MA 02110-1301, USA. 
++ */ ++ ++-#ifndef INCLUDED_VOLK_CPU_H ++-#define INCLUDED_VOLK_CPU_H +++#ifndef INCLUDED_VOLK_GNSSSDR_CPU_H +++#define INCLUDED_VOLK_GNSSSDR_CPU_H ++ ++ #include ++ ++@@ -39,4 +39,4 @@ unsigned int volk_gnsssdr_get_lvarch (); ++ ++ __VOLK_DECL_END ++ ++-#endif /*INCLUDED_VOLK_CPU_H*/ +++#endif /*INCLUDED_VOLK_GNSSSDR_CPU_H*/ ++diff -rupN /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_machines.tmpl.h /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_machines.tmpl.h ++--- /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_machines.tmpl.h 2014-10-17 04:26:39.000000000 +0200 +++++ /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_machines.tmpl.h 2014-10-17 04:23:16.000000000 +0200 ++@@ -19,8 +19,8 @@ ++ * Boston, MA 02110-1301, USA. ++ */ ++ ++-#ifndef INCLUDED_LIBVOLK_MACHINES_H ++-#define INCLUDED_LIBVOLK_MACHINES_H +++#ifndef INCLUDED_LIBVOLK_GNSSSDR_MACHINES_H +++#define INCLUDED_LIBVOLK_GNSSSDR_MACHINES_H ++ ++ #include ++ #include ++@@ -52,4 +52,4 @@ extern struct volk_gnsssdr_machine volk_ ++ ++ __VOLK_DECL_END ++ ++-#endif //INCLUDED_LIBVOLK_MACHINES_H +++#endif //INCLUDED_LIBVOLK_GNSSSDR_MACHINES_H ++diff -rupN /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_typedefs.tmpl.h /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_typedefs.tmpl.h ++--- /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_typedefs.tmpl.h 2014-10-17 04:26:39.000000000 +0200 +++++ /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_typedefs.tmpl.h 2014-10-17 04:23:23.000000000 +0200 ++@@ -19,8 +19,8 @@ ++ * Boston, MA 02110-1301, USA. 
++ */ ++ ++-#ifndef INCLUDED_VOLK_TYPEDEFS ++-#define INCLUDED_VOLK_TYPEDEFS +++#ifndef INCLUDED_VOLK_GNSSSDR_TYPEDEFS +++#define INCLUDED_VOLK_GNSSSDR_TYPEDEFS ++ ++ #include ++ #include ++@@ -29,4 +29,4 @@ ++ typedef void (*$(kern.pname))($kern.arglist_types); ++ #end for ++ ++-#endif /*INCLUDED_VOLK_TYPEDEFS*/ +++#endif /*INCLUDED_VOLK_GNSSSDR_TYPEDEFS*/ ++diff -rupN /Users/andres/Desktop/volk_gnsssdr/volk_modtool.cfg /Users/andres/Desktop/volk_gnsssdr_original/volk_modtool.cfg ++--- /Users/andres/Desktop/volk_gnsssdr/volk_modtool.cfg 2014-10-17 04:26:39.000000000 +0200 +++++ /Users/andres/Desktop/volk_gnsssdr_original/volk_modtool.cfg 2014-10-15 01:55:08.000000000 +0200 ++@@ -1,5 +1,5 @@ ++ [config] ++ name = gnsssdr ++-destination = /Users/andres/Github +++destination = /Users/andres/Github/gnss-sdr/src/algorithms/libs ++ base = /Users/andres/github/gnuradio/volk ++ +diff -rupN /Users/andres/Desktop/volk_gnsssdr/patches for generating volk_gnsssdr/README.txt /Users/andres/Desktop/volk_gnsssdr_original/patches for generating volk_gnsssdr/README.txt +--- /Users/andres/Desktop/volk_gnsssdr/patches for generating volk_gnsssdr/README.txt 1970-01-01 01:00:00.000000000 +0100 ++++ /Users/andres/Desktop/volk_gnsssdr_original/patches for generating volk_gnsssdr/README.txt 2014-10-17 04:26:17.000000000 +0200 +@@ -0,0 +1,71 @@ ++######################################################################## ++# Patching original volk module ++######################################################################## ++In order to fit the GNSS-SDR needs, the original volk module must be patched. ++ ++The folder containing this file has some patches to automatize the process and ++modify the files quickly. To apply them you will need to run the following command: ++$ patch -p5 < /Path/Of/The/Patch/nameOfThePatch.patch ++ ++The number after “-p” may change, read the patch documentation for more help. 
++ ++You may need this information if you want to recreate the volk_gnsssdr module again ++or you want to update the volk_gnsssdr module with the improvements introduced by GNURadio. ++ ++######################################################################## ++######################################################################## ++# Operations apply by the patches and other information (not needed if you know how to apply the patches!!!) ++######################################################################## ++######################################################################## ++ ++To create the volk module you will need to follow the following steps: ++In order to understand and follow the creation and setup of the volk_gnsssdr module I will use some absolute paths: /Users/andres/Github/gnuradio => a cloned repository of the GNURadio project. /Users/andres/Github/gnss-sdr => a cloned repository of the GNSS- SDR project. ++ ++######################################################################## ++#FIRST STEP: using volk_modtool to create a new volk module ++######################################################################## ++GNURadio offers a tool called volk_modtool to create and manage new volk modules and their proto-kernels. 
The steps to create the volk_gnsssdr module are: ++ ++1) Export the PYTHONPATH, that indicates where volk_modtool is: ++$ export PYTHONPATH=/Users/andres/Github/gnuradio/volk/python ++ ++2) Go to the folder where volk_modtool executable is: $ cd /Users/andres/Github/gnuradio/volk/python/volk_modtool ++ ++3) Execute volk_modtool indicating that we want to create a new volk module (-i): $ ./volk_modtool -i ++ ++4) volk_modtool will ask us about the name of the newly created module, the destination folder where you want to store it and the base module (the base module is the volk module inside the GNURadio project): name: gnsssdr destination: /Users/andres/Github/gnss-sdr/src/algorithms/libs base: /Users/andres/github/gnuradio/volk ++ ++######################################################################## ++#SECOND STEP: add proto-kernels to the module ++######################################################################## ++After creating the module you will need to add some proto-kernels to it. To accomplish it you will need to: 1) Copy your proto-kernels inside the /kernels folder. Copy the ORC implementations inside the /orc folder. Copy the macros implementations inside the /kernels/CommonMacros folder. (those folders are found in the root of the volk_gnsssdr module) ++ 2) Add one profiling line for each of the proto-kernels inside the /apps/volk_gnsssdr_profile.cc file. ++ ++3) Add one test line for each of the proto-kernels inside the /lib/testqa.cc file. ######################################################################## ++#THIRD STEP: modifications to allow profiling of some proto-kernels with special parameters ++######################################################################## Some of the proto-kernels that GNSS-SDR needs are not supported by the profiling environment of the volk_gnsssdr module. 
In order to profile them some modifications need to be done to two files: 1) Modify /src/algorithms/libs/volk_gnsssdr/lib/qa_utils.cc At the first part of this file there are defined the parameters supported by the environment. The number after run_cast_test indicates the total number of parameters passed to the proto-kernel (input +output parameters). The other part indicates the type of the data passed. Inside func(....) you will need to add the same number of buffs[ ] that the one specified after run_cast_test. ++ ++2) Modify /src/algorithms/libs/volk_gnsssdr/lib/qa_utils.h In the header you will need to add typedefs for the new definitions made in the .cc file. Take care: you will need to add the same number of void * that the one specified after run_cast_test. ++ ++######################################################################## ++#FOURTH STEP: optional modifications ++######################################################################## ++1) Modify /src/algorithms/libs/volk_gnsssdr/lib/CMakeLists.txt in order to see kernel files, ORC files and macros when generating the IDE project. ++ ++2) To be able to use volk_gnsssdr and default volk functions at the same time i n the same file you will need to modify the template files that volk_gnsssdr module uses at build time to generate some headers. ++The files modified are found inside /tmpl: volk_gnsssdr.tmpl.h ++volk_gnsssdr_typedefs.tmpl.h ++volk_gnsssdr_machines.tmpl.h ++volk_gnsssdr_cpu.tmpl.h ++volk_gnsssdr_config_fixed.tmpl.h The modifications consist of changing the defines of those files to different ones to allow the definition of the volk_gnsssdr functions although the default volk functions are already defined. 
++ ++######################################################################## ++#FIFTH STEP: add volk_gnsssdr module to the GNSS-SDR project ++######################################################################## ++In order to add the volk_gnsssdr module to the GNSS-SDR project the CMakeLists.txt global file needs to be edited. ++ ++######################################################################## ++#SIXTH STEP: using volk_gnsssdr functions ++######################################################################## ++To use the proto-kernels inside volk_gnsssdr project two steps are needed: 1) in the CMakeFiles.txt you will need to add $ {VOLK_GNSSSDR_INCLUDE_DIRS} inside the include_directories function, and also add $ {VOLK_GNSSSDR_LIBRARIES} inside the target_link_libraries function. ++ 2) Add the line #include “volk_gnsssdr.h” at the top of the file. +\ No newline at end of file +Binary files /Users/andres/Desktop/volk_gnsssdr/python/volk_gnsssdr_modtool/__init__.pyc and /Users/andres/Desktop/volk_gnsssdr_original/python/volk_gnsssdr_modtool/__init__.pyc differ +Binary files /Users/andres/Desktop/volk_gnsssdr/python/volk_gnsssdr_modtool/cfg.pyc and /Users/andres/Desktop/volk_gnsssdr_original/python/volk_gnsssdr_modtool/cfg.pyc differ +Binary files /Users/andres/Desktop/volk_gnsssdr/python/volk_gnsssdr_modtool/volk_gnsssdr_modtool_generate.pyc and /Users/andres/Desktop/volk_gnsssdr_original/python/volk_gnsssdr_modtool/volk_gnsssdr_modtool_generate.pyc differ +diff -rupN /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr.tmpl.h /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr.tmpl.h +--- /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr.tmpl.h 2014-10-17 05:07:25.000000000 +0200 ++++ /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr.tmpl.h 2014-10-17 04:23:30.000000000 +0200 +@@ -19,8 +19,8 @@ + * Boston, MA 02110-1301, USA. 
+ */ + +-#ifndef INCLUDED_VOLK_RUNTIME +-#define INCLUDED_VOLK_RUNTIME ++#ifndef INCLUDED_VOLK_GNSSSDR_RUNTIME ++#define INCLUDED_VOLK_GNSSSDR_RUNTIME + + #include + #include +@@ -91,4 +91,4 @@ extern VOLK_API volk_gnsssdr_func_desc_t + + __VOLK_DECL_END + +-#endif /*INCLUDED_VOLK_RUNTIME*/ ++#endif /*INCLUDED_VOLK_GNSSSDR_RUNTIME*/ +diff -rupN /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_config_fixed.tmpl.h /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_config_fixed.tmpl.h +--- /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_config_fixed.tmpl.h 2014-10-17 05:07:25.000000000 +0200 ++++ /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_config_fixed.tmpl.h 2014-10-17 04:22:58.000000000 +0200 +@@ -19,11 +19,11 @@ + * Boston, MA 02110-1301, USA. + */ + +-#ifndef INCLUDED_VOLK_CONFIG_FIXED_H +-#define INCLUDED_VOLK_CONFIG_FIXED_H ++#ifndef INCLUDED_VOLK_GNSSSDR_CONFIG_FIXED_H ++#define INCLUDED_VOLK_GNSSSDR_CONFIG_FIXED_H + + #for $i, $arch in enumerate($archs) + #define LV_$(arch.name.upper()) $i + #end for + +-#endif /*INCLUDED_VOLK_CONFIG_FIXED*/ ++#endif /*INCLUDED_VOLK_GNSSSDR_CONFIG_FIXED*/ +diff -rupN /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_cpu.tmpl.h /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_cpu.tmpl.h +--- /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_cpu.tmpl.h 2014-10-17 05:07:25.000000000 +0200 ++++ /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_cpu.tmpl.h 2014-10-17 04:23:07.000000000 +0200 +@@ -19,8 +19,8 @@ + * Boston, MA 02110-1301, USA. 
+ */ + +-#ifndef INCLUDED_VOLK_CPU_H +-#define INCLUDED_VOLK_CPU_H ++#ifndef INCLUDED_VOLK_GNSSSDR_CPU_H ++#define INCLUDED_VOLK_GNSSSDR_CPU_H + + #include + +@@ -39,4 +39,4 @@ unsigned int volk_gnsssdr_get_lvarch (); + + __VOLK_DECL_END + +-#endif /*INCLUDED_VOLK_CPU_H*/ ++#endif /*INCLUDED_VOLK_GNSSSDR_CPU_H*/ +diff -rupN /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_machines.tmpl.h /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_machines.tmpl.h +--- /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_machines.tmpl.h 2014-10-17 05:07:25.000000000 +0200 ++++ /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_machines.tmpl.h 2014-10-17 04:23:16.000000000 +0200 +@@ -19,8 +19,8 @@ + * Boston, MA 02110-1301, USA. + */ + +-#ifndef INCLUDED_LIBVOLK_MACHINES_H +-#define INCLUDED_LIBVOLK_MACHINES_H ++#ifndef INCLUDED_LIBVOLK_GNSSSDR_MACHINES_H ++#define INCLUDED_LIBVOLK_GNSSSDR_MACHINES_H + + #include + #include +@@ -52,4 +52,4 @@ extern struct volk_gnsssdr_machine volk_ + + __VOLK_DECL_END + +-#endif //INCLUDED_LIBVOLK_MACHINES_H ++#endif //INCLUDED_LIBVOLK_GNSSSDR_MACHINES_H +diff -rupN /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_typedefs.tmpl.h /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_typedefs.tmpl.h +--- /Users/andres/Desktop/volk_gnsssdr/tmpl/volk_gnsssdr_typedefs.tmpl.h 2014-10-17 05:07:25.000000000 +0200 ++++ /Users/andres/Desktop/volk_gnsssdr_original/tmpl/volk_gnsssdr_typedefs.tmpl.h 2014-10-17 04:23:23.000000000 +0200 +@@ -19,8 +19,8 @@ + * Boston, MA 02110-1301, USA. 
+ */ + +-#ifndef INCLUDED_VOLK_TYPEDEFS +-#define INCLUDED_VOLK_TYPEDEFS ++#ifndef INCLUDED_VOLK_GNSSSDR_TYPEDEFS ++#define INCLUDED_VOLK_GNSSSDR_TYPEDEFS + + #include + #include +@@ -29,4 +29,4 @@ + typedef void (*$(kern.pname))($kern.arglist_types); + #end for + +-#endif /*INCLUDED_VOLK_TYPEDEFS*/ ++#endif /*INCLUDED_VOLK_GNSSSDR_TYPEDEFS*/ +diff -rupN /Users/andres/Desktop/volk_gnsssdr/volk_modtool.cfg /Users/andres/Desktop/volk_gnsssdr_original/volk_modtool.cfg +--- /Users/andres/Desktop/volk_gnsssdr/volk_modtool.cfg 2014-10-17 05:07:25.000000000 +0200 ++++ /Users/andres/Desktop/volk_gnsssdr_original/volk_modtool.cfg 2014-10-15 01:55:08.000000000 +0200 +@@ -1,5 +1,5 @@ + [config] + name = gnsssdr +-destination = /Users/andres/Github ++destination = /Users/andres/Github/gnss-sdr/src/algorithms/libs + base = /Users/andres/github/gnuradio/volk + diff --git a/src/algorithms/libs/volk_gnsssdr/patches for generating volk_gnsssdr/README.txt b/src/algorithms/libs/volk_gnsssdr/patches for generating volk_gnsssdr/README.txt new file mode 100644 index 000000000..1c2bc178f --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/patches for generating volk_gnsssdr/README.txt @@ -0,0 +1,71 @@ +######################################################################## +# Patching original volk module +######################################################################## +In order to fit the GNSS-SDR needs, the original volk module must be patched. + +The folder containing this file has some patches to automatize the process and +modify the files quickly. To apply them you will need to run the following command: +$ patch -p5 < /Path/Of/The/Patch/nameOfThePatch.patch + +The number after “-p” may change, read the patch documentation for more help. + +You may need this information if you want to recreate the volk_gnsssdr module again +or you want to update the volk_gnsssdr module with the improvements introduced by GNURadio. 
+ +######################################################################## +######################################################################## +# Operations applied by the patches and other information (not needed if you know how to apply the patches!!!) +######################################################################## +######################################################################## + +To create the volk module you will need to follow these steps: +In order to understand and follow the creation and setup of the volk_gnsssdr module I will use some absolute paths: /Users/andres/Github/gnuradio => a cloned repository of the GNURadio project. /Users/andres/Github/gnss-sdr => a cloned repository of the GNSS-SDR project. + +######################################################################## +#FIRST STEP: using volk_modtool to create a new volk module +######################################################################## +GNURadio offers a tool called volk_modtool to create and manage new volk modules and their proto-kernels.
The steps to create the volk_gnsssdr module are: + +1) Export the PYTHONPATH, that indicates where volk_modtool is: +$ export PYTHONPATH=/Users/andres/Github/gnuradio/volk/python + +2) Go to the folder where volk_modtool executable is: $ cd /Users/andres/Github/gnuradio/volk/python/volk_modtool + +3) Execute volk_modtool indicating that we want to create a new volk module (-i): $ ./volk_modtool -i + +4) volk_modtool will ask us about the name of the newly created module, the destination folder where you want to store it and the base module (the base module is the volk module inside the GNURadio project): name: gnsssdr destination: /Users/andres/Github/gnss-sdr/src/algorithms/libs base: /Users/andres/github/gnuradio/volk + +######################################################################## +#SECOND STEP: add proto-kernels to the module +######################################################################## +After creating the module you will need to add some proto-kernels to it. To accomplish it you will need to: 1) Copy your proto-kernels inside the /kernels folder. Copy the ORC implementations inside the /orc folder. Copy the macros implementations inside the /kernels/CommonMacros folder. (those folders are found in the root of the volk_gnsssdr module) + 2) Add one profiling line for each of the proto-kernels inside the /apps/volk_gnsssdr_profile.cc file. + +3) Add one test line for each of the proto-kernels inside the /lib/testqa.cc file. ######################################################################## +#THIRD STEP: modifications to allow profiling of some proto-kernels with special parameters +######################################################################## Some of the proto-kernels that GNSS-SDR needs are not supported by the profiling environment of the volk_gnsssdr module. 
In order to profile them some modifications need to be done to two files: 1) Modify /src/algorithms/libs/volk_gnsssdr/lib/qa_utils.cc In the first part of this file the parameters supported by the environment are defined. The number after run_cast_test indicates the total number of parameters passed to the proto-kernel (input +output parameters). The other part indicates the type of the data passed. Inside func(....) you will need to add the same number of buffs[ ] as the one specified after run_cast_test. + +2) Modify /src/algorithms/libs/volk_gnsssdr/lib/qa_utils.h In the header you will need to add typedefs for the new definitions made in the .cc file. Take care: you will need to add the same number of void * as the one specified after run_cast_test. + +######################################################################## +#FOURTH STEP: optional modifications +######################################################################## +1) Modify /src/algorithms/libs/volk_gnsssdr/lib/CMakeLists.txt in order to see kernel files, ORC files and macros when generating the IDE project. + +2) To be able to use volk_gnsssdr and default volk functions at the same time in the same file you will need to modify the template files that volk_gnsssdr module uses at build time to generate some headers. +The files modified are found inside /tmpl: volk_gnsssdr.tmpl.h +volk_gnsssdr_typedefs.tmpl.h +volk_gnsssdr_machines.tmpl.h +volk_gnsssdr_cpu.tmpl.h +volk_gnsssdr_config_fixed.tmpl.h The modifications consist of changing the defines of those files to different ones to allow the definition of the volk_gnsssdr functions although the default volk functions are already defined.
+ +######################################################################## +#FIFTH STEP: add volk_gnsssdr module to the GNSS-SDR project +######################################################################## +In order to add the volk_gnsssdr module to the GNSS-SDR project the CMakeLists.txt global file needs to be edited. + +######################################################################## +#SIXTH STEP: using volk_gnsssdr functions +######################################################################## +To use the proto-kernels inside the volk_gnsssdr project two steps are needed: 1) in the CMakeLists.txt you will need to add ${VOLK_GNSSSDR_INCLUDE_DIRS} inside the include_directories function, and also add ${VOLK_GNSSSDR_LIBRARIES} inside the target_link_libraries function. + 2) Add the line #include "volk_gnsssdr.h" at the top of the file. \ No newline at end of file diff --git a/src/algorithms/libs/volk_gnsssdr/python/volk_gnsssdr_modtool/CMakeLists.txt b/src/algorithms/libs/volk_gnsssdr/python/volk_gnsssdr_modtool/CMakeLists.txt new file mode 100644 index 000000000..bba4d3664 --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/python/volk_gnsssdr_modtool/CMakeLists.txt @@ -0,0 +1,39 @@ +# Copyright 2013 Free Software Foundation, Inc. +# +# This file is part of GNU Radio +# +# GNU Radio is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GNU Radio is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GNU Radio; see the file COPYING. 
If not, write to +# the Free Software Foundation, Inc., 51 Franklin Street, +# Boston, MA 02110-1301, USA. + +######################################################################## +# Install python files and apps +######################################################################## +include(GrPython) + +VOLK_PYTHON_INSTALL( + FILES + __init__.py + cfg.py + volk_gnsssdr_modtool_generate.py + DESTINATION ${VOLK_PYTHON_DIR}/volk_gnsssdr_modtool + COMPONENT "volk_gnsssdr" +) + +VOLK_PYTHON_INSTALL( + PROGRAMS + volk_gnsssdr_modtool + DESTINATION ${VOLK_RUNTIME_DIR} + COMPONENT "volk_gnsssdr" +) diff --git a/src/algorithms/libs/volk_gnsssdr/python/volk_gnsssdr_modtool/README b/src/algorithms/libs/volk_gnsssdr/python/volk_gnsssdr_modtool/README new file mode 100644 index 000000000..3820201c2 --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/python/volk_gnsssdr_modtool/README @@ -0,0 +1,114 @@ +The volk_gnsssdr_modtool tool is installed along with VOLK as a way of helping +to construct, add to, and interrogate the VOLK library or companion +libraries. + +volk_gnsssdr_modtool is installed into $prefix/bin. + +VOLK modtool enables creating standalone (out-of-tree) VOLK modules +and provides a few tools for sharing VOLK kernels between VOLK +modules. If you need to design or work with VOLK kernels away from +the canonical VOLK library, this is the tool. If you need to tailor +your own VOLK library for whatever reason, this is the tool. + +The canonical VOLK library installs a volk_gnsssdr.h and a libvolk_gnsssdr.so. Your +own library will install volk_gnsssdr_$name.h and libvolk_gnsssdr_$name.so. Ya Gronk? +Good. + +There isn't a substantial difference between the canonical VOLK +module and any other VOLK module. They're all peers. Any module +created via VOLK modtool will come complete with a default +volk_gnsssdr_modtool.cfg file associating the module with the base from which +it came, its distinctive $name and its destination (or path). 
These +values (created from user input if VOLK modtool runs without a +user-supplied config file or a default config file) serve as default +values for some VOLK modtool actions. It's more or less intended for +the user to change directories to the top level of a created VOLK +module and then run volk_gnsssdr_modtool to take advantage of the values +stored in the default volk_gnsssdr_modtool.cfg file. + +Apart from creating new VOLK modules, VOLK modtool allows you to list +the names of kernels in other modules, list the names of kernels in +the current module, add kernels from another module into the current +module, and remove kernels from the current module. When moving +kernels between modules, VOLK modtool does its best to keep the qa +and profiling code for those kernels intact. If the base has a test +or a profiling call for some kernel, those calls will follow the +kernel when VOLK modtool adds that kernel. If QA or profiling +requires a puppet kernel, the puppet kernel will follow the original +kernel when VOLK modtool adds that original kernel. VOLK modtool +respects puppets. + +====================================================================== + +Installing a new VOLK Library: + +Run the command "volk_gnsssdr_modtool -i". This will ask you three questions: + + name: // the name to give your VOLK library: volk_gnsssdr_<name> + destination: // directory new source tree is built under -- must exist. + // It will create <destination>/volk_gnsssdr_<name> + base: // the directory containing the original VOLK source code + +The name provided must be alphanumeric (and cannot start with a +number). No special characters including dashes and underscores are +allowed. + +This will build a new skeleton directory in the destination provided +with the name volk_gnsssdr_<name>. It will contain the necessary structure to +build: + + mkdir build + cd build + cmake -DCMAKE_INSTALL_PREFIX=/opt/volk_gnsssdr ../ + make + sudo make install + +Right now, the library is empty and contains no kernels. 
Kernels can +be added from another VOLK library using the '-a' option. If not +specified, the kernel will be extracted from the base VOLK +directory. Using the '-b' allows us to specify another VOLK library to +use for this purpose. + + volk_gnsssdr_modtool -a -n 32fc_x2_conjugate_dot_prod_32fc + +This will put the code for the new kernel into +<destination>/volk_gnsssdr_<name>/kernels/volk_gnsssdr_<name>/ + +Other kernels must be added by hand. See the following webpages for +more information about creating VOLK kernels: + http://gnuradio.org/doc/doxygen/volk_guide.html + http://gnuradio.org/redmine/projects/gnuradio/wiki/Volk + + +====================================================================== + +OPTIONS + +Options for Adding and Removing Kernels: + -a, --add_kernel + Add kernel from existing VOLK module. Uses the base VOLK module + unless -b is used. Use -n to specify the kernel name. + Requires: -n. + Optional: -b + + -A, --add_all_kernels + Add all kernels from existing VOLK module. Uses the base VOLK + module unless -b is used. + Optional: -b + + -x, --remove_kernel + Remove kernel from module. + Required: -n. + Optional: -b + +Options for Listing Kernels: + -l, --list + Lists all kernels available in the base VOLK module. + + -k, --kernels + Lists all kernels in this VOLK module. + + -r, --remote_list + Lists all kernels in another VOLK module that is specified + using the -b option. + diff --git a/src/algorithms/libs/volk_gnsssdr/python/volk_gnsssdr_modtool/__init__.py b/src/algorithms/libs/volk_gnsssdr/python/volk_gnsssdr_modtool/__init__.py new file mode 100644 index 000000000..1d8fc6a3d --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/python/volk_gnsssdr_modtool/__init__.py @@ -0,0 +1,24 @@ +#!/usr/bin/env python +# +# Copyright 2013 Free Software Foundation, Inc. 
+# +# This file is part of GNU Radio +# +# GNU Radio is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# GNU Radio is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GNU Radio; see the file COPYING. If not, write to +# the Free Software Foundation, Inc., 51 Franklin Street, +# Boston, MA 02110-1301, USA. +# + +from cfg import volk_gnsssdr_modtool_config +from volk_gnsssdr_modtool_generate import volk_gnsssdr_modtool diff --git a/src/algorithms/libs/volk_gnsssdr/python/volk_gnsssdr_modtool/__init__.pyc b/src/algorithms/libs/volk_gnsssdr/python/volk_gnsssdr_modtool/__init__.pyc new file mode 100644 index 000000000..bb525bb1a Binary files /dev/null and b/src/algorithms/libs/volk_gnsssdr/python/volk_gnsssdr_modtool/__init__.pyc differ diff --git a/src/algorithms/libs/volk_gnsssdr/python/volk_gnsssdr_modtool/cfg.py b/src/algorithms/libs/volk_gnsssdr/python/volk_gnsssdr_modtool/cfg.py new file mode 100644 index 000000000..aa2ffbfdd --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/python/volk_gnsssdr_modtool/cfg.py @@ -0,0 +1,104 @@ +#!/usr/bin/env python +# +# Copyright 2013 Free Software Foundation, Inc. +# +# This file is part of GNU Radio +# +# GNU Radio is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. 
#
# GNU Radio is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with GNU Radio; see the file COPYING.  If not, write to
# the Free Software Foundation, Inc., 51 Franklin Street,
# Boston, MA 02110-1301, USA.
#

import os
import re
import sys

try:  # Python 2
    import ConfigParser
except ImportError:  # Python 3
    import configparser as ConfigParser

try:  # Python 2
    _read_line = raw_input
except NameError:  # Python 3
    _read_line = input

# The Python-2-only ``exceptions`` module is no longer imported: the builtin
# ``ValueError`` / ``IOError`` are the very same classes.


class volk_gnsssdr_modtool_config:
    """Load, normalise and validate the ``volk_gnsssdr_modtool`` configuration.

    The configuration lives in the ``[config]`` section of an INI file and is
    expected to hold, in this order, the options ``name``, ``destination`` and
    ``base``.  After loading, the two paths are rewritten to their real,
    user-expanded form (``remap``) and all three values are checked
    (``verify``).

    Attributes:
        config_name: name of the INI section that is used (``'config'``).
        config_defaults: option names prompted for when no file exists.
        cfg: the ``RawConfigParser`` holding the loaded values.
    """

    def key_val_sub(self, num, stuff, section):
        """Expand the ``$<num>`` / ``$k<num>`` placeholders of one expression.

        Args:
            num: index of the option/expression pair to expand.
            stuff: list of ``(key, value)`` pairs as returned by ``cfg.items()``.
            section: ``(section_name, [expression, ...])`` tuple.

        Returns:
            ``section[1][num]`` with ``$<num>`` replaced by the option value and
            ``$k<num>`` replaced by the option key.
        """
        key, val = stuff[num][0], stuff[num][1]
        # Function replacements insert the text verbatim; a plain replacement
        # string would have its backslashes interpreted as regex escapes.
        with_value = re.sub(r'\$' + str(num), lambda _m: val, section[1][num])
        return re.sub(r'\$k' + str(num), lambda _m: key, with_value)

    def verify(self):
        """Run every verification expression; see ``verify_section``."""
        for i in self.verification:
            self.verify_section(i)

    def remap(self):
        """Run every remapping expression (path normalisation)."""
        for i in self.remapification:
            self.verify_section(i)

    def verify_section(self, section):
        """Evaluate the expressions of ``section`` against the loaded options.

        Args:
            section: ``(section_name, [expression, ...])``; expression ``i`` is
                applied to the ``i``-th option of the section.

        Raises:
            ValueError: an expression evaluated to ``False``.
            IOError: an option is missing or an expression could not be
                evaluated.
        """
        stuff = self.cfg.items(section[0])
        for i in range(len(section[1])):
            if i >= len(stuff):
                raise IOError('bad configuration... missing entry %d in section %s'
                              % (i, section[0]))
            try:
                # NOTE(security): the option value is spliced into Python source
                # and passed to eval(); a value containing a quote can inject
                # code.  Only use configuration files you trust.
                # Each expression is evaluated exactly once, inside the try, so
                # every evaluation error is reported as a configuration error.
                val = eval(self.key_val_sub(i, stuff, section))
            except Exception:
                raise IOError('bad configuration... key:%s, val:%s'
                              % (stuff[i][0], stuff[i][1]))
            # Remap expressions return None (cfg.set) or 1; only a literal
            # False signals a failed verification.
            if val is False:
                raise ValueError('Verification function returns False... key:%s, val:%s'
                                 % (stuff[i][0], stuff[i][1]))

    def __init__(self, cfg=None):
        """Load the configuration, then normalise and validate it.

        Args:
            cfg: path of the config file to read.  When omitted,
                ``volk_gnsssdr_modtool.cfg`` in the current directory is used;
                if that does not exist either, the values are prompted for.

        Raises:
            ValueError, IOError: see ``verify_section``.
        """
        self.config_name = 'config'
        self.config_defaults = ['name', 'destination', 'base']
        self.config_defaults_remap = ['1',
                                      'self.cfg.set(self.config_name, \'$k1\', os.path.realpath(os.path.expanduser(\'$1\')))',
                                      'self.cfg.set(self.config_name, \'$k2\', os.path.realpath(os.path.expanduser(\'$2\')))']

        # re.match() returns None (never False) on a mismatch, so the name
        # check has to be turned into an explicit boolean to be effective.
        self.config_defaults_verify = ['re.match(\'[a-zA-Z0-9]+$\', \'$0\') is not None',
                                       'os.path.exists(\'$1\')',
                                       'os.path.exists(\'$2\')']
        self.remapification = [(self.config_name, self.config_defaults_remap)]
        self.verification = [(self.config_name, self.config_defaults_verify)]
        default = os.path.join(os.getcwd(), 'volk_gnsssdr_modtool.cfg')
        icfg = ConfigParser.RawConfigParser()
        if cfg:
            icfg.read(cfg)
        elif os.path.exists(default):
            icfg.read(default)
        else:
            print("Initializing config file...")
            icfg.add_section(self.config_name)
            for kn in self.config_defaults:
                rv = _read_line("%s: " % (kn))
                icfg.set(self.config_name, kn, rv)
        self.cfg = icfg
        self.remap()
        self.verify()

    def read_map(self, name, inp):
        """Replace section ``name`` with the key/value pairs of mapping ``inp``."""
        if self.cfg.has_section(name):
            self.cfg.remove_section(name)
        self.cfg.add_section(name)
        for i in inp:
            self.cfg.set(name, i, inp[i])

    def get_map(self, name):
        """Return the options of section ``name`` as a plain ``dict``."""
        return dict(self.cfg.items(name))
#!/usr/bin/env python
#
# Copyright 2013 Free Software Foundation, Inc.
#
# This file is part of GNU Radio
#
# GNU Radio is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3, or (at your option)
# any later version.
#
# GNU Radio is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with GNU Radio; see the file COPYING.  If not, write to
# the Free Software Foundation, Inc., 51 Franklin Street,
# Boston, MA 02110-1301, USA.
#

from volk_gnsssdr_modtool import volk_gnsssdr_modtool, volk_gnsssdr_modtool_config
from optparse import OptionParser, OptionGroup

import os
import sys

# The Python-2-only ``exceptions`` module is no longer imported; the builtin
# ``IOError`` is the same class.


def _build_parser():
    """Return the command-line parser (general options plus the 'Actions' group)."""
    parser = OptionParser()
    parser.add_option('-b', '--base_path', action='store', default=None,
                      help='Base path for action. By default, volk_gnsssdr_modtool.cfg loads this value.')
    parser.add_option('-n', '--kernel_name', action='store', default=None,
                      help='Kernel name for action. No default')
    parser.add_option('-c', '--config', action='store', dest='config_file', default=None,
                      help='Config file for volk_gnsssdr_modtool. By default, volk_gnsssdr_modtool.cfg in the local directory will be used/created.')

    actions = OptionGroup(parser, 'Actions')
    actions.add_option('-i', '--install', action='store_true',
                       help='Create a new volk_gnsssdr module.')
    actions.add_option('-a', '--add_kernel', action='store_true',
                       help='Add kernel from existing volk_gnsssdr module. Requires: -n. Optional: -b')
    actions.add_option('-A', '--add_all_kernels', action='store_true',
                       help='Add all kernels from existing volk_gnsssdr module. Optional: -b')
    actions.add_option('-x', '--remove_kernel', action='store_true',
                       help='Remove kernel from module. Required: -n. Optional: -b')
    actions.add_option('-l', '--list', action='store_true',
                       help='List all kernels in the base.')
    actions.add_option('-k', '--kernels', action='store_true',
                       help='List all kernels in the module.')
    actions.add_option('-r', '--remote_list', action='store_true',
                       help='List all available kernels in remote volk_gnsssdr module. Requires: -b.')
    actions.add_option('-m', '--moo', action='store_true',
                       help='Have you mooed today?')
    parser.add_option_group(actions)
    return parser


def _print_kernels(kernelset):
    """Print one kernel name per line."""
    for kernel_name in kernelset:
        print(kernel_name)


def main():
    """Parse the command line and run every requested action, in a fixed order.

    Raises:
        IOError: an action was requested without the option it requires
            (``-n`` for add/remove, ``-b`` for the remote listing).
    """
    parser = _build_parser()
    (options, args) = parser.parse_args()

    if len(sys.argv) < 2:
        parser.print_help()
        return

    if options.moo:
        # Raw strings: same output as before, without invalid escape sequences.
        print(r" (__) ")
        print(r" (oo) ")
        print(r" /------\/ ")
        print(r" / | || ")
        print(r" * /\---/\ ")
        print(r" ~~ ~~ ")
        return

    my_cfg = volk_gnsssdr_modtool_config(options.config_file)
    my_modtool = volk_gnsssdr_modtool(my_cfg.get_map(my_cfg.config_name))

    def base_or_default():
        # -b wins; otherwise fall back to the 'base' stored in the config file.
        if options.base_path:
            return options.base_path
        return my_cfg.cfg.get(my_cfg.config_name, 'base')

    if options.install:
        my_modtool.make_module_skeleton()
        my_modtool.write_default_cfg(my_cfg.cfg)

    if options.add_kernel:
        if not options.kernel_name:
            raise IOError("This action requires the -n option.")
        my_modtool.import_kernel(options.kernel_name, base_or_default())

    if options.remove_kernel:
        if not options.kernel_name:
            raise IOError("This action requires the -n option.")
        my_modtool.remove_kernel(options.kernel_name)

    if options.add_all_kernels:
        base = base_or_default()
        for kernel_name in my_modtool.get_current_kernels(base):
            my_modtool.import_kernel(kernel_name, base)

    if options.remote_list:
        if not options.base_path:
            raise IOError("This action requires the -b option. Try -l or -k for listing kernels in the base or the module.")
        _print_kernels(my_modtool.get_current_kernels(options.base_path))

    if options.list:
        _print_kernels(my_modtool.get_current_kernels())

    if options.kernels:
        dest = my_cfg.cfg.get(my_cfg.config_name, 'destination')
        name = my_cfg.cfg.get(my_cfg.config_name, 'name')
        _print_kernels(my_modtool.get_current_kernels(os.path.join(dest, 'volk_gnsssdr_' + name)))


if __name__ == '__main__':
    main()
#

import glob
import os
import re
import shutil
import sys

# The Python-2-only ``exceptions`` and ``sets`` modules are no longer imported;
# the builtin ``IOError`` and ``set`` are used instead.


class volk_gnsssdr_modtool:
    """Create and maintain out-of-tree ``volk_gnsssdr_<name>`` modules.

    ``cfg`` is a mapping with the keys ``name`` (suffix of the module that is
    managed), ``destination`` (directory that contains the module) and ``base``
    (path of the module that kernels are taken from by default).
    """

    def __init__(self, cfg):
        self.volk_gnsssdr = re.compile('volk_gnsssdr')
        self.remove_after_underscore = re.compile("_.*")
        self.volk_gnsssdr_run_tests = re.compile(r'^\s*VOLK_RUN_TESTS.*\n', re.MULTILINE)
        self.volk_gnsssdr_profile = re.compile(r'^\s*(VOLK_PROFILE|VOLK_PUPPET_PROFILE).*\n', re.MULTILINE)
        self.my_dict = cfg
        self.lastline = re.compile(r'\s*char path\[1024\];.*')
        self.badassert = re.compile(r'^\s*assert\(toked\[0\] == "volk_gnsssdr_.*\n', re.MULTILINE)
        self.goodassert = ' assert(toked[0] == "volk_gnsssdr");\n'
        self.baderase = re.compile(r'^\s*toked.erase\(toked.begin\(\)\);.*\n', re.MULTILINE)
        self.gooderase = ' toked.erase(toked.begin());\n toked.erase(toked.begin());\n'

    # ------------------------------------------------------------------
    # private helpers
    # ------------------------------------------------------------------
    def _module_name(self):
        """Name of the managed module, ``volk_gnsssdr_<name>``."""
        return 'volk_gnsssdr_' + self.my_dict['name']

    def _module_path(self, *parts):
        """Path inside the managed module, below ``destination``."""
        return os.path.join(self.my_dict['destination'], self._module_name(), *parts)

    @staticmethod
    def _read_text(path):
        """Return the whole content of ``path``; the handle is always closed."""
        with open(path, 'r') as handle:
            return handle.read()

    @staticmethod
    def _read_lines(path):
        """Return the lines of ``path`` (line endings kept)."""
        with open(path, 'r') as handle:
            return handle.readlines()

    @staticmethod
    def _write_text(path, text):
        """Write ``text`` to ``path``, creating missing parent directories."""
        parent = os.path.dirname(path)
        if parent and not os.path.exists(parent):
            os.makedirs(parent)
        with open(path, 'w') as handle:
            handle.write(text)

    def _drop_kernel_lines(self, path, kernels):
        """Rewrite ``path`` without the lines that mention any of ``kernels``."""
        kept = [text for text in self._read_lines(path)
                if not any(k.search(text) for k in kernels)]
        self._write_text(path, ''.join(kept))

    # ------------------------------------------------------------------
    # public interface
    # ------------------------------------------------------------------
    def get_basename(self, base=None):
        """Return the ``<name>`` suffix of a module directory.

        Args:
            base: module path; defaults to the configured ``base``.

        Returns:
            ``'foo'`` for ``.../volk_gnsssdr_foo`` and ``''`` for the canonical
            ``.../volk_gnsssdr`` directory or any directory name without an
            underscore.
        """
        if not base:
            base = self.my_dict['base']
        # normpath drops a trailing separator that would hide the last component
        candidate = os.path.basename(os.path.normpath(base))
        # The canonical module is itself called 'volk_gnsssdr'; splitting that
        # on '_' must not yield 'gnsssdr' as a module suffix.
        if candidate == 'volk_gnsssdr' or '_' not in candidate:
            return ''
        return candidate.split('_')[-1]

    def get_current_kernels(self, base=None):
        """Return the set of kernel names whose headers live in a module.

        Args:
            base: module path; defaults to the configured ``base``.

        Returns:
            ``set`` of names such as ``'32fc_x2_dot_prod_32fc'``, i.e. the header
            file name without the module prefix and the ``.h`` extension.  Only
            headers whose name starts with a data type (digits followed by
            letters) are reported.
        """
        if not base:
            base = self.my_dict['base']
            name = self.get_basename()
        else:
            name = self.get_basename(base)

        if name == '':
            hdr_files = glob.glob(os.path.join(base, "kernels/volk_gnsssdr/*.h"))
            prefix = "volk_gnsssdr_"
        else:
            hdr_files = glob.glob(os.path.join(base, "kernels/volk_gnsssdr_" + name + "/*.h"))
            prefix = "volk_gnsssdr_" + name + "_"
        after_prefix = "(?<=" + prefix + ")"
        begins = re.compile(after_prefix + ".*")

        # Only the file names are inspected, so directory names that happen to
        # contain a data type cannot produce false matches.
        hdr_names = [os.path.basename(path) for path in hdr_files]

        # first pass: collect the leading data types (e.g. '32fc', '8i')
        datatypes = set()
        for hdr in hdr_names:
            rest = begins.search(hdr)
            if rest:
                dtype = self.remove_after_underscore.sub("", rest.group(0))
                # [A-Za-z], not [A-z]: the latter also matches '[', ']', '^', ...
                subdtype = re.search("[0-9]+[A-Za-z]+", dtype)
                if subdtype:
                    datatypes.add(subdtype.group(0))

        # second pass: everything between the prefix and '.h', provided it
        # starts with one of the collected data types
        functions = set()
        for hdr in hdr_names:
            for dt in datatypes:
                if dt in hdr:
                    found = re.search(after_prefix + dt + r".*(?=\.h)", hdr)
                    if found:
                        functions.add(found.group(0))
        return functions

    def make_module_skeleton(self):
        """Create the managed module as a kernel-less copy of the base module.

        Every file of ``base`` that does not belong to a kernel is copied with
        ``volk_gnsssdr`` renamed to ``volk_gnsssdr_<name>`` in content and path;
        the test/profile calls are then stripped from the copy.

        Raises:
            IOError: the destination module directory already exists.
        """
        module = self._module_name()
        dest = self._module_path()
        if os.path.exists(dest):
            raise IOError("Destination %s already exists!" % (dest))

        kernel_dir = self._module_path('kernels/' + module)
        if not os.path.exists(kernel_dir):
            os.makedirs(kernel_dir)

        current_kernel_names = self.get_current_kernels()

        for root, dirnames, filenames in os.walk(self.my_dict['base']):
            for name in filenames:
                # Skip files that belong to a kernel.  any() is also correct
                # for a base without kernels, where everything must be copied.
                if any(re.search(k, name) for k in current_kernel_names):
                    continue
                infile = os.path.join(root, name)
                outstring = re.sub(self.volk_gnsssdr, module, self._read_text(infile))
                newname = re.sub(self.volk_gnsssdr, module, name)
                relpath = os.path.relpath(infile, self.my_dict['base'])
                newrelpath = re.sub(self.volk_gnsssdr, module, relpath)
                self._write_text(self._module_path(os.path.dirname(newrelpath), newname), outstring)

        testqa = self._module_path('lib/testqa.cc')
        self._write_text(testqa, re.sub(self.volk_gnsssdr_run_tests, '', self._read_text(testqa)))

        profile = self._module_path('apps/' + module + '_profile.cc')
        self._write_text(profile, re.sub(self.volk_gnsssdr_profile, '', self._read_text(profile)))

        qa_utils = self._module_path('lib/qa_utils.cc')
        text = re.sub(self.badassert, self.goodassert, self._read_text(qa_utils))
        text = re.sub(self.baderase, self.gooderase, text)
        self._write_text(qa_utils, text)

    def write_default_cfg(self, cfg):
        """Store the config parser ``cfg`` as the module's default config file."""
        # text mode: ConfigParser.write() emits str on Python 2 and 3
        with open(self._module_path('volk_gnsssdr_modtool.cfg'), 'w') as outfile:
            cfg.write(outfile)

    def convert_kernel(self, oldvolk_gnsssdr, name, base, inpath, top):
        """Copy one kernel header (and its ORC files) into the managed module.

        Args:
            oldvolk_gnsssdr: compiled regex matching the source module name.
            name: kernel name without module prefix.
            base: source module path as given by the caller.
            inpath: absolute path of ``base``.
            top: source prefix, e.g. ``'volk_gnsssdr_'``.
        """
        module = self._module_name()

        def convert(source, newname):
            # rename the module both inside the file and in its relative path
            outstring = re.sub(oldvolk_gnsssdr, module, self._read_text(source))
            newrelpath = re.sub(oldvolk_gnsssdr, module, os.path.relpath(source, base))
            self._write_text(self._module_path(os.path.dirname(newrelpath), newname), outstring)

        convert(os.path.join(inpath, 'kernels/' + top[:-1] + '/' + top + name + '.h'),
                module + '_' + name + '.h')

        # copy orc proto-kernels if they exist
        for orcfile in glob.glob(inpath + '/orc/' + top + name + '*.orc'):
            if os.path.isfile(orcfile):
                convert(orcfile, module + '_' + name + '.orc')

    def remove_kernel(self, name):
        """Remove kernel ``name`` (and the puppets it profiles) from the module.

        The kernel's lines are dropped from the profile app and ``testqa.cc``
        and its header / ORC files are deleted.

        Raises:
            IOError: the managed module has no such kernel.
        """
        basename = self.my_dict['name']
        if len(basename) > 0:
            top = 'volk_gnsssdr_' + basename + '_'
        else:
            top = 'volk_gnsssdr_'
        base = os.path.join(self.my_dict['destination'], top[:-1])

        # Look in the module that is being modified (the one named in the
        # error message), not in the configured base module.
        if name not in self.get_current_kernels(base):
            raise IOError("Requested kernel %s is not in module %s" % (name, base))

        inpath = os.path.abspath(base)

        kernel = re.compile(name)
        search_kernels = set([kernel])
        puppet = re.compile(r'^\s*VOLK_PUPPET')

        profile_cc = os.path.join(inpath, 'apps/', top[:-1] + '_profile.cc')
        kept = []
        for otherline in self._read_lines(profile_cc):
            if kernel.search(otherline):
                if puppet.match(otherline):
                    # the first argument of VOLK_PUPPET_PROFILE names the puppet
                    args = re.search("(?<=VOLK_PUPPET_PROFILE).*", otherline)
                    m_func = args.group(0).split(',')[0]
                    func = re.search('(?<=' + top + ').*', m_func)
                    search_kernels.add(re.compile(func.group(0)))
            else:
                kept.append(otherline)
        self._write_text(profile_cc, ''.join(kept))

        self._drop_kernel_lines(os.path.join(inpath, 'lib/testqa.cc'), search_kernels)

        for kernel in search_kernels:
            infile = os.path.join(inpath, 'kernels/' + top[:-1] + '/' + top + kernel.pattern + '.h')
            print("Removing kernel %s" % (kernel.pattern))
            if os.path.exists(infile):
                os.remove(infile)
        # remove the orc proto-kernels if they exist. There are no puppets here
        # so just need to glob for files matching kernel name
        orc_files = glob.glob(inpath + '/orc/' + top + name + '*.orc')
        print(orc_files)
        for orcfile in orc_files:
            print(orcfile)
            if os.path.exists(orcfile):
                os.remove(orcfile)

    def import_kernel(self, name, base):
        """Copy kernel ``name`` from module ``base`` into the managed module.

        The header (and ORC files), the matching ``VOLK_PROFILE`` /
        ``VOLK_PUPPET_PROFILE`` lines and the ``VOLK_RUN_TESTS`` lines are
        transferred; puppet kernels referenced by the profile are imported too.

        Args:
            name: kernel name without module prefix.
            base: source module path; a false value selects the configured base.

        Raises:
            IOError: ``base`` has no such kernel.
        """
        if not base:
            base = self.my_dict['base']
            basename = self.get_basename()  # was misspelled 'getbasename'
        else:
            basename = self.get_basename(base)
        if name not in self.get_current_kernels(base):
            raise IOError("Requested kernel %s is not in module %s" % (name, base))

        inpath = os.path.abspath(base)
        if len(basename) > 0:
            top = 'volk_gnsssdr_' + basename + '_'
        else:
            top = 'volk_gnsssdr_'
        oldvolk_gnsssdr = re.compile(top[:-1])
        module = self._module_name()

        self.convert_kernel(oldvolk_gnsssdr, name, base, inpath, top)

        kernel = re.compile(name)
        search_kernels = set([kernel])

        profile = re.compile(r'^\s*VOLK_PROFILE')
        puppet = re.compile(r'^\s*VOLK_PUPPET')

        # ---- profiling app -------------------------------------------
        lines = self._read_lines(os.path.join(inpath, 'apps/', oldvolk_gnsssdr.pattern + '_profile.cc'))
        dest = self._module_path('apps/' + module + '_profile.cc')
        output = []
        inserted = False
        for otherline in self._read_lines(dest):
            # the imported calls go right before the first 'char path[1024];'
            if not inserted and self.lastline.match(otherline):
                inserted = True
                for line in lines:
                    if not kernel.search(line):
                        continue
                    if profile.match(line):
                        output.append(re.sub(oldvolk_gnsssdr, module, line))
                    elif puppet.match(line):
                        output.append(re.sub(oldvolk_gnsssdr, module, line))
                        args = re.search("(?<=VOLK_PUPPET_PROFILE).*", line)
                        m_func = args.group(0).split(',')[0]
                        func = re.search('(?<=' + top + ').*', m_func)
                        search_kernels.add(re.compile(func.group(0)))
                        self.convert_kernel(oldvolk_gnsssdr, func.group(0), base, inpath, top)
            # stale calls for the same kernels are not carried over
            if not any(k.search(otherline) for k in search_kernels):
                output.append(otherline)
        self._write_text(dest, ''.join(output))

        for known in search_kernels:
            print("Adding kernel %s from module %s" % (known.pattern, base))

        # ---- QA tests ------------------------------------------------
        lines = self._read_lines(os.path.join(inpath, 'lib/testqa.cc'))
        dest = self._module_path('lib/testqa.cc')
        output = []
        inserted = False
        for otherline in self._read_lines(dest):
            # the imported tests go before the first line that is not a
            # '#...' line (blank lines count as such a line)
            if not inserted and re.match(r'\s*#.*', otherline) is None:
                inserted = True
                for line in lines:
                    for known in search_kernels:
                        if known.search(line) and self.volk_gnsssdr_run_tests.match(line):
                            output.append(re.sub(oldvolk_gnsssdr, module, line))
            if not any(k.search(otherline) for k in search_kernels):
                output.append(otherline)
        self._write_text(dest, ''.join(output))
b/src/algorithms/libs/volk_gnsssdr/tmpl/volk_gnsssdr.tmpl.c new file mode 100644 index 000000000..53dfaa97b --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/tmpl/volk_gnsssdr.tmpl.c @@ -0,0 +1,212 @@ +/* + * Copyright 2011-2012 Free Software Foundation, Inc. + * + * This file is part of GNU Radio + * + * GNU Radio is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3, or (at your option) + * any later version. + * + * GNU Radio is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU Radio; see the file COPYING. If not, write to + * the Free Software Foundation, Inc., 51 Franklin Street, + * Boston, MA 02110-1301, USA. 
+ */ + +#include +#include "volk_gnsssdr_machines.h" +#include +#include +#include "volk_gnsssdr_rank_archs.h" +#include +#include +#include +#include + +static size_t __alignment = 0; +static intptr_t __alignment_mask = 0; + +struct volk_gnsssdr_machine *get_machine(void) +{ + extern struct volk_gnsssdr_machine *volk_gnsssdr_machines[]; + extern unsigned int n_volk_gnsssdr_machines; + static struct volk_gnsssdr_machine *machine = NULL; + + if(machine != NULL) + return machine; + else { + unsigned int max_score = 0; + unsigned int i; + struct volk_gnsssdr_machine *max_machine = NULL; + for(i=0; icaps & (~volk_gnsssdr_get_lvarch()))) { + if(volk_gnsssdr_machines[i]->caps > max_score) { + max_score = volk_gnsssdr_machines[i]->caps; + max_machine = volk_gnsssdr_machines[i]; + } + } + } + machine = max_machine; + printf("Using Volk machine: %s\n", machine->name); + __alignment = machine->alignment; + __alignment_mask = (intptr_t)(__alignment-1); + return machine; + } +} + +void volk_gnsssdr_list_machines(void) +{ + extern struct volk_gnsssdr_machine *volk_gnsssdr_machines[]; + extern unsigned int n_volk_gnsssdr_machines; + + unsigned int i; + for(i=0; icaps & (~volk_gnsssdr_get_lvarch()))) { + printf("%s;", volk_gnsssdr_machines[i]->name); + } + } + printf("\n"); +} + +const char* volk_gnsssdr_get_machine(void) +{ + extern struct volk_gnsssdr_machine *volk_gnsssdr_machines[]; + extern unsigned int n_volk_gnsssdr_machines; + static struct volk_gnsssdr_machine *machine = NULL; + + if(machine != NULL) + return machine->name; + else { + unsigned int max_score = 0; + unsigned int i; + struct volk_gnsssdr_machine *max_machine = NULL; + for(i=0; icaps & (~volk_gnsssdr_get_lvarch()))) { + if(volk_gnsssdr_machines[i]->caps > max_score) { + max_score = volk_gnsssdr_machines[i]->caps; + max_machine = volk_gnsssdr_machines[i]; + } + } + } + machine = max_machine; + return machine->name; + } +} + +size_t volk_gnsssdr_get_alignment(void) +{ + get_machine(); //ensures alignment is 
set + return __alignment; +} + +bool volk_gnsssdr_is_aligned(const void *ptr) +{ + return ((intptr_t)(ptr) & __alignment_mask) == 0; +} + +#define LV_HAVE_GENERIC +#define LV_HAVE_DISPATCHER + +#for $kern in $kernels + +#if $kern.has_dispatcher +#include //pulls in the dispatcher +#end if + +static inline void __$(kern.name)_d($kern.arglist_full) +{ + #if $kern.has_dispatcher + $(kern.name)_dispatcher($kern.arglist_names); + return; + #end if + + if (volk_gnsssdr_is_aligned( + #set $num_open_parens = 0 + #for $arg_type, $arg_name in $kern.args + #if '*' in $arg_type + VOLK_OR_PTR($arg_name, + #set $num_open_parens += 1 + #end if + #end for + 0$(')'*$num_open_parens) + )){ + $(kern.name)_a($kern.arglist_names); + } + else{ + $(kern.name)_u($kern.arglist_names); + } +} + +static inline void __init_$(kern.name)(void) +{ + const char *name = get_machine()->$(kern.name)_name; + const char **impl_names = get_machine()->$(kern.name)_impl_names; + const int *impl_deps = get_machine()->$(kern.name)_impl_deps; + const bool *alignment = get_machine()->$(kern.name)_impl_alignment; + const size_t n_impls = get_machine()->$(kern.name)_n_impls; + const size_t index_a = volk_gnsssdr_rank_archs(name, impl_names, impl_deps, alignment, n_impls, true/*aligned*/); + const size_t index_u = volk_gnsssdr_rank_archs(name, impl_names, impl_deps, alignment, n_impls, false/*unaligned*/); + $(kern.name)_a = get_machine()->$(kern.name)_impls[index_a]; + $(kern.name)_u = get_machine()->$(kern.name)_impls[index_u]; + + assert($(kern.name)_a); + assert($(kern.name)_u); + + $(kern.name) = &__$(kern.name)_d; +} + +static inline void __$(kern.name)_a($kern.arglist_full) +{ + __init_$(kern.name)(); + $(kern.name)_a($kern.arglist_names); +} + +static inline void __$(kern.name)_u($kern.arglist_full) +{ + __init_$(kern.name)(); + $(kern.name)_u($kern.arglist_names); +} + +static inline void __$(kern.name)($kern.arglist_full) +{ + __init_$(kern.name)(); + $(kern.name)($kern.arglist_names); +} + 
+$kern.pname $(kern.name)_a = &__$(kern.name)_a; +$kern.pname $(kern.name)_u = &__$(kern.name)_u; +$kern.pname $(kern.name) = &__$(kern.name); + +void $(kern.name)_manual($kern.arglist_full, const char* impl_name) +{ + const int index = volk_gnsssdr_get_index( + get_machine()->$(kern.name)_impl_names, + get_machine()->$(kern.name)_n_impls, + impl_name + ); + get_machine()->$(kern.name)_impls[index]( + $kern.arglist_names + ); +} + +volk_gnsssdr_func_desc_t $(kern.name)_get_func_desc(void) { + const char **impl_names = get_machine()->$(kern.name)_impl_names; + const int *impl_deps = get_machine()->$(kern.name)_impl_deps; + const bool *alignment = get_machine()->$(kern.name)_impl_alignment; + const size_t n_impls = get_machine()->$(kern.name)_n_impls; + volk_gnsssdr_func_desc_t desc = { + impl_names, + impl_deps, + alignment, + n_impls + }; + return desc; +} + +#end for diff --git a/src/algorithms/libs/volk_gnsssdr/tmpl/volk_gnsssdr.tmpl.h b/src/algorithms/libs/volk_gnsssdr/tmpl/volk_gnsssdr.tmpl.h new file mode 100644 index 000000000..29f16a8c0 --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/tmpl/volk_gnsssdr.tmpl.h @@ -0,0 +1,94 @@ +/* + * Copyright 2011-2012 Free Software Foundation, Inc. + * + * This file is part of GNU Radio + * + * GNU Radio is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3, or (at your option) + * any later version. + * + * GNU Radio is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU Radio; see the file COPYING. If not, write to + * the Free Software Foundation, Inc., 51 Franklin Street, + * Boston, MA 02110-1301, USA. 
+ */ + +#ifndef INCLUDED_VOLK_GNSSSDR_RUNTIME +#define INCLUDED_VOLK_GNSSSDR_RUNTIME + +#include +#include +#include +#include +#include + +#include +#include + +__VOLK_DECL_BEGIN + +typedef struct volk_gnsssdr_func_desc +{ + const char **impl_names; + const int *impl_deps; + const bool *impl_alignment; + const size_t n_impls; +} volk_gnsssdr_func_desc_t; + +//! Prints a list of machines available +VOLK_API void volk_gnsssdr_list_machines(void); + +//! Returns the name of the machine this instance will use +VOLK_API const char* volk_gnsssdr_get_machine(void); + +//! Get the machine alignment in bytes +VOLK_API size_t volk_gnsssdr_get_alignment(void); + +/*! + * The VOLK_OR_PTR macro is a convenience macro + * for checking the alignment of a set of pointers. + * Example usage: + * volk_gnsssdr_is_aligned(VOLK_OR_PTR(VOLK_OR_PTR(p0, p1), p2)) + */ +#define VOLK_OR_PTR(ptr0, ptr1) \ + (const void *)(((intptr_t)(ptr0)) | ((intptr_t)(ptr1))) + +/*! + * Is the pointer on a machine alignment boundary? + * + * Note: for performance reasons, this function + * is not usable until another volk_gnsssdr API call is made + * which will perform certain initialization tasks. + * + * \param ptr the pointer to some memory buffer + * \return 1 for alignment boundary, else 0 + */ +VOLK_API bool volk_gnsssdr_is_aligned(const void *ptr); + +#for $kern in $kernels + +//! A function pointer to the dispatcher implementation +extern VOLK_API $kern.pname $kern.name; + +//! A function pointer to the fastest aligned implementation +extern VOLK_API $kern.pname $(kern.name)_a; + +//! A function pointer to the fastest unaligned implementation +extern VOLK_API $kern.pname $(kern.name)_u; + +//! Call into a specific implementation given by name +extern VOLK_API void $(kern.name)_manual($kern.arglist_full, const char* impl_name); + +//!
Get description parameters for this kernel +extern VOLK_API volk_gnsssdr_func_desc_t $(kern.name)_get_func_desc(void); +#end for + +__VOLK_DECL_END + +#endif /*INCLUDED_VOLK_GNSSSDR_RUNTIME*/ diff --git a/src/algorithms/libs/volk_gnsssdr/tmpl/volk_gnsssdr_config_fixed.tmpl.h b/src/algorithms/libs/volk_gnsssdr/tmpl/volk_gnsssdr_config_fixed.tmpl.h new file mode 100644 index 000000000..15e2191b7 --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/tmpl/volk_gnsssdr_config_fixed.tmpl.h @@ -0,0 +1,29 @@ +/* + * Copyright 2011-2012 Free Software Foundation, Inc. + * + * This file is part of GNU Radio + * + * GNU Radio is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3, or (at your option) + * any later version. + * + * GNU Radio is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU Radio; see the file COPYING. If not, write to + * the Free Software Foundation, Inc., 51 Franklin Street, + * Boston, MA 02110-1301, USA. + */ + +#ifndef INCLUDED_VOLK_GNSSSDR_CONFIG_FIXED_H +#define INCLUDED_VOLK_GNSSSDR_CONFIG_FIXED_H + +#for $i, $arch in enumerate($archs) +#define LV_$(arch.name.upper()) $i +#end for + +#endif /*INCLUDED_VOLK_GNSSSDR_CONFIG_FIXED_H*/ diff --git a/src/algorithms/libs/volk_gnsssdr/tmpl/volk_gnsssdr_cpu.tmpl.c b/src/algorithms/libs/volk_gnsssdr/tmpl/volk_gnsssdr_cpu.tmpl.c new file mode 100644 index 000000000..cc58d9ebf --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/tmpl/volk_gnsssdr_cpu.tmpl.c @@ -0,0 +1,191 @@ +/* + * Copyright 2011-2012 Free Software Foundation, Inc.
+ * + * This file is part of GNU Radio + * + * GNU Radio is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3, or (at your option) + * any later version. + * + * GNU Radio is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU Radio; see the file COPYING. If not, write to + * the Free Software Foundation, Inc., 51 Franklin Street, + * Boston, MA 02110-1301, USA. + */ + +#include +#include +#include + +struct VOLK_CPU volk_gnsssdr_cpu; + +#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64) + #define VOLK_CPU_x86 +#endif + +#if defined(VOLK_CPU_x86) + +//implement get cpuid for gcc compilers using a system or local copy of cpuid.h +#if defined(__GNUC__) + #if defined(HAVE_CPUID_H) + #include + #else + #include "gcc_x86_cpuid.h" + #endif + #define cpuid_x86(op, r) __get_cpuid(op, (unsigned int *)r+0, (unsigned int *)r+1, (unsigned int *)r+2, (unsigned int *)r+3) + + /* Return Intel AVX extended CPU capabilities register. + * This function will bomb on non-AVX-capable machines, so + * check for AVX capability before executing. 
+ */ + #if ((__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 2) || (__clang_major__ >= 3)) && defined(HAVE_XGETBV) + static inline unsigned long long _xgetbv(unsigned int index){ + unsigned int eax, edx; + __asm__ __volatile__("xgetbv" : "=a"(eax), "=d"(edx) : "c"(index)); + return ((unsigned long long)edx << 32) | eax; + } + #define __xgetbv() _xgetbv(0) + #else + #define __xgetbv() 0 + #endif + +//implement get cpuid for MSVC compilers using __cpuid intrinsic +#elif defined(_MSC_VER) && defined(HAVE_INTRIN_H) + #include + #define cpuid_x86(op, r) __cpuid(((int*)r), op) + + #if defined(_XCR_XFEATURE_ENABLED_MASK) + #define __xgetbv() _xgetbv(_XCR_XFEATURE_ENABLED_MASK) + #else + #define __xgetbv() 0 + #endif + +#else + #error "A get cpuid for volk_gnsssdr is not available on this compiler..." +#endif //defined(__GNUC__) + +#endif //defined(VOLK_CPU_x86) + +static inline unsigned int cpuid_x86_bit(unsigned int reg, unsigned int op, unsigned int bit) { +#if defined(VOLK_CPU_x86) + unsigned int regs[4]; + cpuid_x86(op, regs); + return regs[reg] >> bit & 0x01; +#else + return 0; +#endif +} + +static inline unsigned int check_extended_cpuid(unsigned int val) { +#if defined(VOLK_CPU_x86) + unsigned int regs[4]; + cpuid_x86(0x80000000, regs); + return regs[0] >= val; +#else + return 0; +#endif +} + +static inline unsigned int get_avx_enabled(void) { +#if defined(VOLK_CPU_x86) + return __xgetbv() & 0x6; +#else + return 0; +#endif +} + +//neon detection is linux specific +#if defined(__arm__) && defined(__linux__) + #include + #include + #include + #define VOLK_CPU_ARM +#endif + +static int has_neon(void){ +#if defined(VOLK_CPU_ARM) + FILE *auxvec_f; + unsigned long auxvec[2]; + unsigned int found_neon = 0; + auxvec_f = fopen("/proc/self/auxv", "rb"); + if(!auxvec_f) return 0; + + size_t r = 1; + //so auxv is basically 32b of ID and 32b of value + //so it goes like this + while(!found_neon && r) { + r = fread(auxvec, sizeof(unsigned long), 2, auxvec_f); + if((auxvec[0] 
== AT_HWCAP) && (auxvec[1] & HWCAP_NEON)) + found_neon = 1; + } + + fclose(auxvec_f); + return found_neon; +#else + return 0; +#endif +} + +static int has_ppc(void){ +#ifdef __PPC__ + return 1; +#else + return 0; +#endif +} + +#for $arch in $archs +static int i_can_has_$arch.name (void) { + #for $check, $params in $arch.checks + if ($(check)($(', '.join($params))) == 0) return 0; + #end for + return 1; +} + +#end for + +#if defined(HAVE_FENV_H) + #if defined(FE_TONEAREST) + #include + static inline void set_float_rounding(void){ + fesetround(FE_TONEAREST); + } + #else + static inline void set_float_rounding(void){ + //do nothing + } + #endif +#elif defined(_MSC_VER) + #include + static inline void set_float_rounding(void){ + unsigned int cwrd; + _controlfp_s(&cwrd, 0, 0); + _controlfp_s(&cwrd, _RC_NEAR, _MCW_RC); + } +#else + static inline void set_float_rounding(void){ + //do nothing + } +#endif + +void volk_gnsssdr_cpu_init() { + #for $arch in $archs + volk_gnsssdr_cpu.has_$arch.name = &i_can_has_$arch.name; + #end for + set_float_rounding(); +} + +unsigned int volk_gnsssdr_get_lvarch() { + unsigned int retval = 0; + volk_gnsssdr_cpu_init(); + #for $arch in $archs + retval += volk_gnsssdr_cpu.has_$(arch.name)() << LV_$(arch.name.upper()); + #end for + return retval; +} diff --git a/src/algorithms/libs/volk_gnsssdr/tmpl/volk_gnsssdr_cpu.tmpl.h b/src/algorithms/libs/volk_gnsssdr/tmpl/volk_gnsssdr_cpu.tmpl.h new file mode 100644 index 000000000..f58bd21d6 --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/tmpl/volk_gnsssdr_cpu.tmpl.h @@ -0,0 +1,42 @@ +/* + * Copyright 2011-2012 Free Software Foundation, Inc. + * + * This file is part of GNU Radio + * + * GNU Radio is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3, or (at your option) + * any later version. 
+ * + * GNU Radio is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU Radio; see the file COPYING. If not, write to + * the Free Software Foundation, Inc., 51 Franklin Street, + * Boston, MA 02110-1301, USA. + */ + +#ifndef INCLUDED_VOLK_GNSSSDR_CPU_H +#define INCLUDED_VOLK_GNSSSDR_CPU_H + +#include + +__VOLK_DECL_BEGIN + +struct VOLK_CPU { + #for $arch in $archs + int (*has_$arch.name) (); + #end for +}; + +extern struct VOLK_CPU volk_gnsssdr_cpu; + +void volk_gnsssdr_cpu_init (); +unsigned int volk_gnsssdr_get_lvarch (); + +__VOLK_DECL_END + +#endif /*INCLUDED_VOLK_GNSSSDR_CPU_H*/ diff --git a/src/algorithms/libs/volk_gnsssdr/tmpl/volk_gnsssdr_machine_xxx.tmpl.c b/src/algorithms/libs/volk_gnsssdr/tmpl/volk_gnsssdr_machine_xxx.tmpl.c new file mode 100644 index 000000000..36b61da4f --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/tmpl/volk_gnsssdr_machine_xxx.tmpl.c @@ -0,0 +1,79 @@ +/* + * Copyright 2011-2012 Free Software Foundation, Inc. + * + * This file is part of GNU Radio + * + * GNU Radio is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3, or (at your option) + * any later version. + * + * GNU Radio is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU Radio; see the file COPYING. If not, write to + * the Free Software Foundation, Inc., 51 Franklin Street, + * Boston, MA 02110-1301, USA. 
+ */ + +#set $this_machine = $machine_dict[$args[0]] +#set $arch_names = $this_machine.arch_names + +#for $arch in $this_machine.archs +#define LV_HAVE_$(arch.name.upper()) 1 +#end for + +#include +#include "volk_gnsssdr_machines.h" +#include + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#for $kern in $kernels +#include +#end for + +######################################################################## +#def make_arch_have_list($archs) +$(' | '.join(['(1 << LV_%s)'%a.name.upper() for a in $archs]))#slurp +#end def + +######################################################################## +#def make_impl_name_list($impls) +{$(', '.join(['"%s"'%i.name for i in $impls]))}#slurp +#end def + +######################################################################## +#def make_impl_align_list($impls) +{$(', '.join(['true' if i.is_aligned else 'false' for i in $impls]))}#slurp +#end def + +######################################################################## +#def make_impl_deps_list($impls) +{$(', '.join([' | '.join(['(1 << LV_%s)'%d.upper() for d in i.deps]) for i in $impls]))}#slurp +#end def + +######################################################################## +#def make_impl_fcn_list($name, $impls) +{$(', '.join(['%s_%s'%($name, i.name) for i in $impls]))}#slurp +#end def + +struct volk_gnsssdr_machine volk_gnsssdr_machine_$(this_machine.name) = { + $make_arch_have_list($this_machine.archs), + "$this_machine.name", + $this_machine.alignment, + #for $kern in $kernels + #set $impls = $kern.get_impls($arch_names) + "$kern.name", ##//kernel name + $make_impl_name_list($impls), ##//list of kernel implementations by name + $make_impl_deps_list($impls), ##//list of arch dependencies per implementation + $make_impl_align_list($impls), ##//alignment required? 
for each implementation + $make_impl_fcn_list($kern.name, $impls), ##//pointer to each implementation + $(len($impls)), ##//number of implementations listed here + #end for +}; diff --git a/src/algorithms/libs/volk_gnsssdr/tmpl/volk_gnsssdr_machines.tmpl.c b/src/algorithms/libs/volk_gnsssdr/tmpl/volk_gnsssdr_machines.tmpl.c new file mode 100644 index 000000000..64e436010 --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/tmpl/volk_gnsssdr_machines.tmpl.c @@ -0,0 +1,34 @@ +/* + * Copyright 2011-2012 Free Software Foundation, Inc. + * + * This file is part of GNU Radio + * + * GNU Radio is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3, or (at your option) + * any later version. + * + * GNU Radio is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU Radio; see the file COPYING. If not, write to + * the Free Software Foundation, Inc., 51 Franklin Street, + * Boston, MA 02110-1301, USA. 
+ */ + +#include +#include +#include "volk_gnsssdr_machines.h" + +struct volk_gnsssdr_machine *volk_gnsssdr_machines[] = { +#for $machine in $machines +#ifdef LV_MACHINE_$(machine.name.upper()) +&volk_gnsssdr_machine_$(machine.name), +#endif +#end for +}; + +unsigned int n_volk_gnsssdr_machines = sizeof(volk_gnsssdr_machines)/sizeof(*volk_gnsssdr_machines); diff --git a/src/algorithms/libs/volk_gnsssdr/tmpl/volk_gnsssdr_machines.tmpl.h b/src/algorithms/libs/volk_gnsssdr/tmpl/volk_gnsssdr_machines.tmpl.h new file mode 100644 index 000000000..32db767b9 --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/tmpl/volk_gnsssdr_machines.tmpl.h @@ -0,0 +1,55 @@ +/* + * Copyright 2011-2012 Free Software Foundation, Inc. + * + * This file is part of GNU Radio + * + * GNU Radio is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3, or (at your option) + * any later version. + * + * GNU Radio is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU Radio; see the file COPYING. If not, write to + * the Free Software Foundation, Inc., 51 Franklin Street, + * Boston, MA 02110-1301, USA. 
+ */ + +#ifndef INCLUDED_LIBVOLK_GNSSSDR_MACHINES_H +#define INCLUDED_LIBVOLK_GNSSSDR_MACHINES_H + +#include +#include + +#include +#include + +__VOLK_DECL_BEGIN + +struct volk_gnsssdr_machine { + const unsigned int caps; //capabilities (i.e., archs compiled into this machine, in the volk_gnsssdr_get_lvarch format) + const char *name; + const size_t alignment; //the maximum byte alignment required for functions in this library + #for $kern in $kernels + const char *$(kern.name)_name; + const char *$(kern.name)_impl_names[$(len($archs))]; + const int $(kern.name)_impl_deps[$(len($archs))]; + const bool $(kern.name)_impl_alignment[$(len($archs))]; + const $(kern.pname) $(kern.name)_impls[$(len($archs))]; + const size_t $(kern.name)_n_impls; + #end for +}; + +#for $machine in $machines +#ifdef LV_MACHINE_$(machine.name.upper()) +extern struct volk_gnsssdr_machine volk_gnsssdr_machine_$(machine.name); +#endif +#end for + +__VOLK_DECL_END + +#endif //INCLUDED_LIBVOLK_GNSSSDR_MACHINES_H diff --git a/src/algorithms/libs/volk_gnsssdr/tmpl/volk_gnsssdr_typedefs.tmpl.h b/src/algorithms/libs/volk_gnsssdr/tmpl/volk_gnsssdr_typedefs.tmpl.h new file mode 100644 index 000000000..a40d71764 --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/tmpl/volk_gnsssdr_typedefs.tmpl.h @@ -0,0 +1,32 @@ +/* + * Copyright 2011-2012 Free Software Foundation, Inc. + * + * This file is part of GNU Radio + * + * GNU Radio is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3, or (at your option) + * any later version. + * + * GNU Radio is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with GNU Radio; see the file COPYING. If not, write to + * the Free Software Foundation, Inc., 51 Franklin Street, + * Boston, MA 02110-1301, USA. + */ + +#ifndef INCLUDED_VOLK_GNSSSDR_TYPEDEFS +#define INCLUDED_VOLK_GNSSSDR_TYPEDEFS + +#include +#include + +#for $kern in $kernels +typedef void (*$(kern.pname))($kern.arglist_types); +#end for + +#endif /*INCLUDED_VOLK_GNSSSDR_TYPEDEFS*/ diff --git a/src/algorithms/libs/volk_gnsssdr/volk_gnsssdr.pc.in b/src/algorithms/libs/volk_gnsssdr/volk_gnsssdr.pc.in new file mode 100644 index 000000000..bc2c2e425 --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/volk_gnsssdr.pc.in @@ -0,0 +1,14 @@ +prefix=@prefix@ +exec_prefix=@exec_prefix@ +libdir=@libdir@ +includedir=@includedir@ +LV_CXXFLAGS=@LV_CXXFLAGS@ + + +Name: volk_gnsssdr +Description: VOLK: Vector Optimized Library of Kernels +Requires: +Version: @VERSION@ +Libs: -L${libdir} -lvolk_gnsssdr +Cflags: -I${includedir} ${LV_CXXFLAGS} + diff --git a/src/algorithms/libs/volk_gnsssdr/volk_modtool.cfg b/src/algorithms/libs/volk_gnsssdr/volk_modtool.cfg new file mode 100644 index 000000000..c47ac2444 --- /dev/null +++ b/src/algorithms/libs/volk_gnsssdr/volk_modtool.cfg @@ -0,0 +1,5 @@ +[config] +name = gnsssdr +destination = /Users/andres/Github/gnss-sdr/src/algorithms/libs +base = /Users/andres/github/gnuradio/volk + diff --git a/src/algorithms/tracking/adapters/CMakeLists.txt b/src/algorithms/tracking/adapters/CMakeLists.txt index 1937a9c5a..be90957d6 100644 --- a/src/algorithms/tracking/adapters/CMakeLists.txt +++ b/src/algorithms/tracking/adapters/CMakeLists.txt @@ -19,6 +19,7 @@ set(TRACKING_ADAPTER_SOURCES galileo_e1_dll_pll_veml_tracking.cc + galileo_volk_e1_dll_pll_veml_tracking.cc galileo_e1_tcp_connector_tracking.cc gps_l1_ca_dll_fll_pll_tracking.cc gps_l1_ca_dll_pll_optim_tracking.cc diff --git a/src/algorithms/tracking/adapters/galileo_e1_dll_pll_veml_tracking.cc 
b/src/algorithms/tracking/adapters/galileo_e1_dll_pll_veml_tracking.cc old mode 100644 new mode 100755 diff --git a/src/algorithms/tracking/adapters/galileo_e1_dll_pll_veml_tracking.h b/src/algorithms/tracking/adapters/galileo_e1_dll_pll_veml_tracking.h old mode 100644 new mode 100755 diff --git a/src/algorithms/tracking/adapters/galileo_volk_e1_dll_pll_veml_tracking.cc b/src/algorithms/tracking/adapters/galileo_volk_e1_dll_pll_veml_tracking.cc new file mode 100644 index 000000000..20efd2b60 --- /dev/null +++ b/src/algorithms/tracking/adapters/galileo_volk_e1_dll_pll_veml_tracking.cc @@ -0,0 +1,158 @@ +/*! + * \file galileo_volk_e1_dll_pll_veml_tracking.cc + * \brief Adapts a DLL+PLL VEML (Very Early Minus Late) tracking loop block + * to a TrackingInterface for Galileo E1 signals + * \author Luis Esteve, 2012. luis(at)epsilon-formacion.com + * + * Code DLL + carrier PLL according to the algorithms described in: + * K.Borre, D.M.Akos, N.Bertelsen, P.Rinder, and S.H.Jensen, + * A Software-Defined GPS and Galileo Receiver. A Single-Frequency + * Approach, Birkhauser, 2007 + * + * ------------------------------------------------------------------------- + * + * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) + * + * GNSS-SDR is a software defined Global Navigation + * Satellite Systems receiver + * + * This file is part of GNSS-SDR. + * + * GNSS-SDR is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * GNSS-SDR is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
+ * + * ------------------------------------------------------------------------- + */ + +#include "galileo_volk_e1_dll_pll_veml_tracking.h" +#include +#include "GPS_L1_CA.h" +#include "Galileo_E1.h" +#include "configuration_interface.h" + + +using google::LogMessage; + +GalileoVolkE1DllPllVemlTracking::GalileoVolkE1DllPllVemlTracking( + ConfigurationInterface* configuration, std::string role, + unsigned int in_streams, unsigned int out_streams, + boost::shared_ptr queue) : + role_(role), in_streams_(in_streams), out_streams_(out_streams), + queue_(queue) +{ + DLOG(INFO) << "role " << role; + //################# CONFIGURATION PARAMETERS ######################## + int fs_in; + int vector_length; + int f_if; + bool dump; + std::string dump_filename; + std::string item_type; + std::string default_item_type = "gr_complex"; + float pll_bw_hz; + float dll_bw_hz; + float early_late_space_chips; + float very_early_late_space_chips; + + item_type = configuration->property(role + ".item_type",default_item_type); + fs_in = configuration->property("GNSS-SDR.internal_fs_hz", 2048000); + f_if = configuration->property(role + ".if", 0); + dump = configuration->property(role + ".dump", false); + pll_bw_hz = configuration->property(role + ".pll_bw_hz", 50.0); + dll_bw_hz = configuration->property(role + ".dll_bw_hz", 2.0); + early_late_space_chips = configuration->property(role + ".early_late_space_chips", 0.15); + very_early_late_space_chips = configuration->property(role + ".very_early_late_space_chips", 0.6); + + std::string default_dump_filename = "./track_ch"; + dump_filename = configuration->property(role + ".dump_filename", + default_dump_filename); //unused! 
+ vector_length = std::round(fs_in / (Galileo_E1_CODE_CHIP_RATE_HZ / Galileo_E1_B_CODE_LENGTH_CHIPS)); + + //################# MAKE TRACKING GNURadio object ################### + if (item_type.compare("gr_complex") == 0) + { + item_size_ = sizeof(gr_complex); + tracking_ = galileo_volk_e1_dll_pll_veml_make_tracking_cc( + f_if, + fs_in, + vector_length, + queue_, + dump, + dump_filename, + pll_bw_hz, + dll_bw_hz, + early_late_space_chips, + very_early_late_space_chips); + } + else + { + LOG(WARNING) << item_type << " unknown tracking item type."; + } + + DLOG(INFO) << "tracking(" << tracking_->unique_id() << ")"; +} + +GalileoVolkE1DllPllVemlTracking::~GalileoVolkE1DllPllVemlTracking() +{} + +void GalileoVolkE1DllPllVemlTracking::start_tracking() +{ + tracking_->start_tracking(); +} + +/* + * Set tracking channel unique ID + */ +void GalileoVolkE1DllPllVemlTracking::set_channel(unsigned int channel) +{ + channel_ = channel; + tracking_->set_channel(channel); +} + +/* + * Set tracking channel internal queue + */ +void GalileoVolkE1DllPllVemlTracking::set_channel_queue( + concurrent_queue *channel_internal_queue) +{ + channel_internal_queue_ = channel_internal_queue; + + tracking_->set_channel_queue(channel_internal_queue_); + +} + +void GalileoVolkE1DllPllVemlTracking::set_gnss_synchro(Gnss_Synchro* p_gnss_synchro) +{ + tracking_->set_gnss_synchro(p_gnss_synchro); +} + +void GalileoVolkE1DllPllVemlTracking::connect(gr::top_block_sptr top_block) +{ + //nothing to connect, now the tracking uses gr_sync_decimator +} + +void GalileoVolkE1DllPllVemlTracking::disconnect(gr::top_block_sptr top_block) +{ + //nothing to disconnect, now the tracking uses gr_sync_decimator +} + +gr::basic_block_sptr GalileoVolkE1DllPllVemlTracking::get_left_block() +{ + return tracking_; +} + +gr::basic_block_sptr GalileoVolkE1DllPllVemlTracking::get_right_block() +{ + return tracking_; +} + diff --git a/src/algorithms/tracking/adapters/galileo_volk_e1_dll_pll_veml_tracking.h 
b/src/algorithms/tracking/adapters/galileo_volk_e1_dll_pll_veml_tracking.h new file mode 100644 index 000000000..ef397ef45 --- /dev/null +++ b/src/algorithms/tracking/adapters/galileo_volk_e1_dll_pll_veml_tracking.h @@ -0,0 +1,119 @@ +/*! + * \file galileo_volk_e1_dll_pll_veml_tracking.h + * \brief Adapts a DLL+PLL VEML (Very Early Minus Late) tracking loop block + * to a TrackingInterface for Galileo E1 signals + * \author Luis Esteve, 2012. luis(at)epsilon-formacion.com + * + * Code DLL + carrier PLL according to the algorithms described in: + * K.Borre, D.M.Akos, N.Bertelsen, P.Rinder, and S.H.Jensen, + * A Software-Defined GPS and Galileo Receiver. A Single-Frequency + * Approach, Birkhauser, 2007 + * + * ------------------------------------------------------------------------- + * + * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) + * + * GNSS-SDR is a software defined Global Navigation + * Satellite Systems receiver + * + * This file is part of GNSS-SDR. + * + * GNSS-SDR is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * GNSS-SDR is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>. + * + * ------------------------------------------------------------------------- + */ + +#ifndef GNSS_SDR_GALILEO_VOLK_E1_DLL_PLL_VEML_TRACKING_H_ +#define GNSS_SDR_GALILEO_VOLK_E1_DLL_PLL_VEML_TRACKING_H_ + +#include +#include +#include "tracking_interface.h" +#include "galileo_volk_e1_dll_pll_veml_tracking_cc.h" + + +class ConfigurationInterface; + +/*!
+ * \brief This class Adapts a DLL+PLL VEML (Very Early Minus Late) tracking + * loop block to a TrackingInterface for Galileo E1 signals + */ +class GalileoVolkE1DllPllVemlTracking : public TrackingInterface +{ + +public: + + GalileoVolkE1DllPllVemlTracking(ConfigurationInterface* configuration, + std::string role, + unsigned int in_streams, + unsigned int out_streams, + boost::shared_ptr queue); + + virtual ~GalileoVolkE1DllPllVemlTracking(); + + std::string role() + { + return role_; + } + + //! Returns "galileo_volk_e1_dll_pll_veml_tracking" + std::string implementation() + { + return "galileo_volk_e1_dll_pll_veml_tracking"; + } + size_t item_size() + { + return item_size_; + } + + void connect(gr::top_block_sptr top_block); + void disconnect(gr::top_block_sptr top_block); + gr::basic_block_sptr get_left_block(); + gr::basic_block_sptr get_right_block(); + + + /*! + * \brief Set tracking channel unique ID + */ + void set_channel(unsigned int channel); + + /*! + * \brief Set acquisition/tracking common Gnss_Synchro object pointer + * to efficiently exchange synchronization data between acquisition and + * tracking blocks + */ + void set_gnss_synchro(Gnss_Synchro* p_gnss_synchro); + + /*! 
+ * \brief Set tracking channel internal queue + */ + void set_channel_queue(concurrent_queue *channel_internal_queue); + + void start_tracking(); + +private: + + galileo_volk_e1_dll_pll_veml_tracking_cc_sptr tracking_; + size_t item_size_; + + unsigned int channel_; + + std::string role_; + unsigned int in_streams_; + unsigned int out_streams_; + boost::shared_ptr queue_; + concurrent_queue *channel_internal_queue_; +}; + +#endif // GNSS_SDR_GALILEO_VOLK_E1_DLL_PLL_VEML_TRACKING_H_ diff --git a/src/algorithms/tracking/gnuradio_blocks/CMakeLists.txt b/src/algorithms/tracking/gnuradio_blocks/CMakeLists.txt index fc6ba999e..67dbe19eb 100644 --- a/src/algorithms/tracking/gnuradio_blocks/CMakeLists.txt +++ b/src/algorithms/tracking/gnuradio_blocks/CMakeLists.txt @@ -18,6 +18,7 @@ set(TRACKING_GR_BLOCKS_SOURCES galileo_e1_dll_pll_veml_tracking_cc.cc + galileo_volk_e1_dll_pll_veml_tracking_cc.cc galileo_e1_tcp_connector_tracking_cc.cc gps_l1_ca_dll_fll_pll_tracking_cc.cc gps_l1_ca_dll_pll_optim_tracking_cc.cc @@ -37,6 +38,7 @@ include_directories( ${GFlags_INCLUDE_DIRS} ${Boost_INCLUDE_DIRS} ${GNURADIO_RUNTIME_INCLUDE_DIRS} + ${VOLK_GNSSSDR_INCLUDE_DIRS} ) if(ENABLE_GENERIC_ARCH) @@ -46,4 +48,4 @@ endif(ENABLE_GENERIC_ARCH) file(GLOB TRACKING_GR_BLOCKS_HEADERS "*.h") add_library(tracking_gr_blocks ${TRACKING_GR_BLOCKS_SOURCES} ${TRACKING_GR_BLOCKS_HEADERS}) source_group(Headers FILES ${TRACKING_GR_BLOCKS_HEADERS}) -target_link_libraries(tracking_gr_blocks tracking_lib ${GNURADIO_RUNTIME_LIBRARIES} gnss_sp_libs ${Boost_LIBRARIES} ) +target_link_libraries(tracking_gr_blocks tracking_lib ${GNURADIO_RUNTIME_LIBRARIES} gnss_sp_libs ${Boost_LIBRARIES} ${VOLK_GNSSSDR_LIBRARIES} ) diff --git a/src/algorithms/tracking/gnuradio_blocks/galileo_e1_dll_pll_veml_tracking_cc.cc b/src/algorithms/tracking/gnuradio_blocks/galileo_e1_dll_pll_veml_tracking_cc.cc old mode 100644 new mode 100755 diff --git a/src/algorithms/tracking/gnuradio_blocks/galileo_e1_dll_pll_veml_tracking_cc.h 
b/src/algorithms/tracking/gnuradio_blocks/galileo_e1_dll_pll_veml_tracking_cc.h old mode 100644 new mode 100755 diff --git a/src/algorithms/tracking/gnuradio_blocks/galileo_volk_e1_dll_pll_veml_tracking_cc.cc b/src/algorithms/tracking/gnuradio_blocks/galileo_volk_e1_dll_pll_veml_tracking_cc.cc new file mode 100644 index 000000000..258d5b023 --- /dev/null +++ b/src/algorithms/tracking/gnuradio_blocks/galileo_volk_e1_dll_pll_veml_tracking_cc.cc @@ -0,0 +1,670 @@ +/*! + * \file galileo_volk_e1_dll_pll_veml_tracking_cc.cc + * \brief Implementation of a code DLL + carrier PLL VEML (Very Early + * Minus Late) tracking block for Galileo E1 signals + * \author Luis Esteve, 2012. luis(at)epsilon-formacion.com + * + * Code DLL + carrier PLL according to the algorithms described in: + * [1] K.Borre, D.M.Akos, N.Bertelsen, P.Rinder, and S.H.Jensen, + * A Software-Defined GPS and Galileo Receiver. A Single-Frequency + * Approach, Birkhauser, 2007 + * + * ------------------------------------------------------------------------- + * + * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) + * + * GNSS-SDR is a software defined Global Navigation + * Satellite Systems receiver + * + * This file is part of GNSS-SDR. + * + * GNSS-SDR is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * GNSS-SDR is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>. 
+ * + * ------------------------------------------------------------------------- + */ + +#include "galileo_volk_e1_dll_pll_veml_tracking_cc.h" +#include +#include +#include +#include +#include +#include +#include +#include "gnss_synchro.h" +#include "galileo_e1_signal_processing.h" +#include "tracking_discriminators.h" +#include "lock_detectors.h" +#include "Galileo_E1.h" +#include "control_message_factory.h" +#include "volk_gnsssdr/volk_gnsssdr.h" + /* NOTE(review): the bare #include directives above have no header name in this view (the angle-bracket text appears to have been lost) - restore them before applying this hunk */ + + +/*! + * \todo Include in definition header file + * + * Lock-detector constants used by general_work(): + * CN0_ESTIMATION_SAMPLES - number of prompt correlator outputs buffered per CN0 / carrier-lock evaluation + * MINIMUM_VALID_CN0 - CN0 estimate below which an evaluation counts as a lock failure + * MAXIMUM_LOCK_FAIL_COUNTER - failure count above which loss of lock is reported and tracking is disabled + * CARRIER_LOCK_THRESHOLD - value copied into d_carrier_lock_threshold by the constructor + */ +#define CN0_ESTIMATION_SAMPLES 20 +#define MINIMUM_VALID_CN0 25 +#define MAXIMUM_LOCK_FAIL_COUNTER 50 +#define CARRIER_LOCK_THRESHOLD 0.85 + + +using google::LogMessage; + /*! Factory: forwards its arguments unchanged to the private constructor and wraps the new block in galileo_volk_e1_dll_pll_veml_tracking_cc_sptr. 
NOTE(review): boost::shared_ptr is shown without its template argument in this view. */ +galileo_volk_e1_dll_pll_veml_tracking_cc_sptr +galileo_volk_e1_dll_pll_veml_make_tracking_cc( + long if_freq, + long fs_in, + unsigned int vector_length, + boost::shared_ptr queue, + bool dump, + std::string dump_filename, + float pll_bw_hz, + float dll_bw_hz, + float early_late_space_chips, + float very_early_late_space_chips) +{ + return galileo_volk_e1_dll_pll_veml_tracking_cc_sptr(new galileo_volk_e1_dll_pll_veml_tracking_cc(if_freq, + fs_in, vector_length, queue, dump, dump_filename, pll_bw_hz, dll_bw_hz, early_late_space_chips, very_early_late_space_chips)); +} + /*! Requests 2*d_vector_length items on input 0; noutput_items is not used. */ + +void galileo_volk_e1_dll_pll_veml_tracking_cc::forecast (int noutput_items, + gr_vector_int &ninput_items_required) +{ + ninput_items_required[0] = (int)d_vector_length*2; //set the required available samples in each call +} + /*! Builds a gr::block with one gr_complex input stream and one Gnss_Synchro output stream, stores the configuration, sets the DLL/PLL bandwidths, allocates the replica and correlator buffers and resets the tracking state. */ + +galileo_volk_e1_dll_pll_veml_tracking_cc::galileo_volk_e1_dll_pll_veml_tracking_cc( + long if_freq, + long fs_in, + unsigned int vector_length, + boost::shared_ptr queue, + bool dump, + std::string dump_filename, + float pll_bw_hz, + float dll_bw_hz, + float early_late_space_chips, + float very_early_late_space_chips): + gr::block("galileo_volk_e1_dll_pll_veml_tracking_cc", gr::io_signature::make(1, 1, sizeof(gr_complex)), + gr::io_signature::make(1, 1, sizeof(Gnss_Synchro))) +{ + 
this->set_relative_rate(1.0/vector_length); + // initialize internal vars + d_queue = queue; + d_dump = dump; + d_if_freq = if_freq; + d_fs_in = fs_in; + d_vector_length = vector_length; + d_dump_filename = dump_filename; + d_code_loop_filter = Tracking_2nd_DLL_filter(Galileo_E1_CODE_PERIOD); + d_carrier_loop_filter = Tracking_2nd_PLL_filter(Galileo_E1_CODE_PERIOD); + + // Initialize tracking ========================================== + + // Set bandwidth of code and carrier loop filters + d_code_loop_filter.set_DLL_BW(dll_bw_hz); + d_carrier_loop_filter.set_PLL_BW(pll_bw_hz); + + // Correlator spacing + d_early_late_spc_chips = early_late_space_chips; // Define early-late offset (in chips) + d_very_early_late_spc_chips = very_early_late_space_chips; // Define very-early-late offset (in chips) + + // Initialization of local code replica + // Get space for a vector with the sinboc(1,1) replica sampled 2x/chip (the 4 extra samples hold the head/tail copies written in start_tracking()) + d_ca_code = new gr_complex[(int)(2*Galileo_E1_B_CODE_LENGTH_CHIPS + 4)]; + + /* If an array is partitioned for more than one thread to operate on, + * having the sub-array boundaries unaligned to cache lines could lead + * to performance degradation. 
Here we allocate memory + * (gr_complex arrays of size 2*d_vector_length) aligned to volk_get_alignment(). + * NOTE(review): the volk_malloc() results are used without a NULL check - confirm this is acceptable. + */ + + d_very_early_code=(gr_complex*)volk_malloc(2*d_vector_length * sizeof(gr_complex),volk_get_alignment()); + d_early_code=(gr_complex*)volk_malloc(2*d_vector_length * sizeof(gr_complex),volk_get_alignment()); + d_prompt_code=(gr_complex*)volk_malloc(2*d_vector_length * sizeof(gr_complex),volk_get_alignment()); + d_late_code=(gr_complex*)volk_malloc(2*d_vector_length * sizeof(gr_complex),volk_get_alignment()); + d_very_late_code=(gr_complex*)volk_malloc(2*d_vector_length * sizeof(gr_complex),volk_get_alignment()); + + d_carr_sign=(gr_complex*)volk_malloc(2*d_vector_length * sizeof(gr_complex),volk_get_alignment()); + /* 16-bit buffers: in the code visible here they are referenced only by the commented-out 16-bit correlator path in general_work() and by the destructor */ + d_very_early_code16=(lv_16sc_t*)volk_malloc(2*d_vector_length * sizeof(lv_16sc_t),volk_get_alignment()); + d_early_code16=(lv_16sc_t*)volk_malloc(2*d_vector_length * sizeof(lv_16sc_t),volk_get_alignment()); + d_prompt_code16=(lv_16sc_t*)volk_malloc(2*d_vector_length * sizeof(lv_16sc_t),volk_get_alignment()); + d_late_code16=(lv_16sc_t*)volk_malloc(2*d_vector_length * sizeof(lv_16sc_t),volk_get_alignment()); + d_very_late_code16=(lv_16sc_t*)volk_malloc(2*d_vector_length * sizeof(lv_16sc_t),volk_get_alignment()); + d_carr_sign16=(lv_16sc_t*)volk_malloc(2*d_vector_length * sizeof(lv_16sc_t),volk_get_alignment()); + in16=(lv_16sc_t*)volk_malloc(2*d_vector_length * sizeof(lv_16sc_t),volk_get_alignment()); + /* 8-bit buffers: used by the active 8-bit correlator path in general_work() */ + d_very_early_code8=(lv_8sc_t*)volk_malloc(2*d_vector_length * sizeof(lv_8sc_t),volk_get_alignment()); + d_early_code8=(lv_8sc_t*)volk_malloc(2*d_vector_length * sizeof(lv_8sc_t),volk_get_alignment()); + 
d_prompt_code8=(lv_8sc_t*)volk_malloc(2*d_vector_length * sizeof(lv_8sc_t),volk_get_alignment()); + d_late_code8=(lv_8sc_t*)volk_malloc(2*d_vector_length * sizeof(lv_8sc_t),volk_get_alignment()); + d_very_late_code8=(lv_8sc_t*)volk_malloc(2*d_vector_length * sizeof(lv_8sc_t),volk_get_alignment()); + 
d_carr_sign8=(lv_8sc_t*)volk_malloc(2*d_vector_length * sizeof(lv_8sc_t),volk_get_alignment()); + in8=(lv_8sc_t*)volk_malloc(2*d_vector_length * sizeof(lv_8sc_t),volk_get_alignment()); + + // correlator outputs (scalar) + + d_Very_Early=(gr_complex*)volk_malloc(sizeof(gr_complex),volk_get_alignment()); + d_Early=(gr_complex*)volk_malloc(sizeof(gr_complex),volk_get_alignment()); + d_Prompt=(gr_complex*)volk_malloc(sizeof(gr_complex),volk_get_alignment()); + d_Late=(gr_complex*)volk_malloc(sizeof(gr_complex),volk_get_alignment()); + d_Very_Late=(gr_complex*)volk_malloc(sizeof(gr_complex),volk_get_alignment()); + + //--- Initializations ------------------------------ + // Initial code frequency basis of NCO + d_code_freq_chips = (double)Galileo_E1_CODE_CHIP_RATE_HZ; + // Residual code phase (in samples) + d_rem_code_phase_samples = 0.0; + // Residual carrier phase + d_rem_carr_phase_rad = 0.0; + + // sample synchronization + d_sample_counter = 0; + //d_sample_counter_seconds = 0; + d_acq_sample_stamp = 0; + + d_enable_tracking = false; + d_pull_in = false; + d_last_seg = 0; + + d_current_prn_length_samples = (int)d_vector_length; + + // CN0 estimation and lock detector buffers + d_cn0_estimation_counter = 0; + d_Prompt_buffer = new gr_complex[CN0_ESTIMATION_SAMPLES]; + d_carrier_lock_test = 1; + d_CN0_SNV_dB_Hz = 0; + d_carrier_lock_fail_counter = 0; + d_carrier_lock_threshold = CARRIER_LOCK_THRESHOLD; + + systemName["E"] = std::string("Galileo"); + *d_Very_Early=gr_complex(0,0); + *d_Early=gr_complex(0,0); + *d_Prompt=gr_complex(0,0); + *d_Late=gr_complex(0,0); + *d_Very_Late=gr_complex(0,0); +} + /*! Latches the acquisition results from d_acquisition_gnss_synchro, re-initializes both loop filters, regenerates the local code replica, resets the phase accumulators and enables tracking with the pull-in step pending. 
*/ +void galileo_volk_e1_dll_pll_veml_tracking_cc::start_tracking() +{ + d_acq_code_phase_samples = d_acquisition_gnss_synchro->Acq_delay_samples; + d_acq_carrier_doppler_hz = d_acquisition_gnss_synchro->Acq_doppler_hz; + d_acq_sample_stamp = d_acquisition_gnss_synchro->Acq_samplestamp_samples; + + // DLL/PLL filter initialization + d_carrier_loop_filter.initialize(); // 
initialize the carrier filter + d_code_loop_filter.initialize(); // initialize the code filter + + // generate local reference ALWAYS starting at sample index 2 of d_ca_code (2 samples per chip) + galileo_e1_code_gen_complex_sampled(&d_ca_code[2], + d_acquisition_gnss_synchro->Signal, + false, + d_acquisition_gnss_synchro->PRN, + 2*Galileo_E1_CODE_CHIP_RATE_HZ, + 0); + // Fill head and tail + d_ca_code[0] = d_ca_code[(int)(2*Galileo_E1_B_CODE_LENGTH_CHIPS)]; + d_ca_code[1] = d_ca_code[(int)(2*Galileo_E1_B_CODE_LENGTH_CHIPS + 1)]; + d_ca_code[(int)(2*Galileo_E1_B_CODE_LENGTH_CHIPS + 2)] = d_ca_code[2]; + d_ca_code[(int)(2*Galileo_E1_B_CODE_LENGTH_CHIPS + 3)] = d_ca_code[3]; + + d_carrier_lock_fail_counter = 0; + d_rem_code_phase_samples = 0.0; + d_rem_carr_phase_rad = 0; + d_acc_carrier_phase_rad = 0; + + d_acc_code_phase_secs = 0; + d_carrier_doppler_hz = d_acq_carrier_doppler_hz; + d_current_prn_length_samples = d_vector_length; + /* NOTE(review): builds a std::string from the address of the System member; presumably this relies on NUL-terminated character data starting at that member - confirm against Gnss_Synchro */ + std::string sys_ = &d_acquisition_gnss_synchro->System; + sys = sys_.substr(0, 1); + + // DEBUG OUTPUT + std::cout << "Tracking start on channel " << d_channel << " for satellite " << Gnss_Satellite(systemName[sys], d_acquisition_gnss_synchro->PRN) << std::endl; + LOG(INFO) << "Starting tracking of satellite " << Gnss_Satellite(systemName[sys], d_acquisition_gnss_synchro->PRN) << " on channel " << d_channel; + + // enable tracking + d_pull_in = true; + d_enable_tracking = true; + + LOG(INFO) << "PULL-IN Doppler [Hz]=" << d_carrier_doppler_hz + << " PULL-IN Code Phase [samples]=" << d_acq_code_phase_samples; +} + /*! Regenerates the very-early replica with the volk_gnsssdr update_local_code kernel, then copies the early, prompt, late and very-late replicas out of it with memcpy at sample offsets. 
*/ + +void galileo_volk_e1_dll_pll_veml_tracking_cc::update_local_code() +{ + double tcode_half_chips; + float rem_code_phase_half_chips; + int code_length_half_chips = (int)(2*Galileo_E1_B_CODE_LENGTH_CHIPS); + double code_phase_step_chips; + double code_phase_step_half_chips; + int early_late_spc_samples; + int very_early_late_spc_samples; + int epl_loop_length_samples; + + // unified loop for VE, E, P, L, VL code vectors + code_phase_step_chips = 
((double)d_code_freq_chips) / ((double)d_fs_in); + code_phase_step_half_chips = (2.0*(double)d_code_freq_chips) / ((double)d_fs_in); + + rem_code_phase_half_chips = d_rem_code_phase_samples * (2*d_code_freq_chips / d_fs_in); + tcode_half_chips = -(double)rem_code_phase_half_chips; + + early_late_spc_samples = round(d_early_late_spc_chips / code_phase_step_chips); + very_early_late_spc_samples = round(d_very_early_late_spc_chips / code_phase_step_chips); + + epl_loop_length_samples = d_current_prn_length_samples + very_early_late_spc_samples*2; + + //HERE YOU CAN CHOOSE THE DESIRED VOLK IMPLEMENTATION + //volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_manual(d_very_early_code, (float) d_very_early_late_spc_chips, (float) code_length_half_chips, (float) code_phase_step_half_chips, (float) tcode_half_chips, d_ca_code, epl_loop_length_samples, "generic"); + /* NOTE(review): the "u_sse4_1" implementation name is hard-coded; presumably this fails on CPUs or builds without that protokernel - confirm. Also assumes epl_loop_length_samples never exceeds the 2*d_vector_length items allocated for d_very_early_code - TODO confirm */ + volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_manual(d_very_early_code, (float) d_very_early_late_spc_chips, (float) code_length_half_chips, (float) code_phase_step_half_chips, (float) tcode_half_chips, d_ca_code, epl_loop_length_samples, "u_sse4_1"); + + memcpy(d_early_code, &d_very_early_code[very_early_late_spc_samples - early_late_spc_samples], d_current_prn_length_samples* sizeof(gr_complex)); + memcpy(d_prompt_code, &d_very_early_code[very_early_late_spc_samples], d_current_prn_length_samples* sizeof(gr_complex)); + memcpy(d_late_code, &d_very_early_code[very_early_late_spc_samples + early_late_spc_samples], d_current_prn_length_samples* sizeof(gr_complex)); + memcpy(d_very_late_code, &d_very_early_code[2*very_early_late_spc_samples], d_current_prn_length_samples* sizeof(gr_complex)); +} + /*! 
Calls the volk_gnsssdr update_local_carrier kernel on d_carr_sign for d_current_prn_length_samples items, with start phase d_rem_carr_phase_rad and phase step GPS_TWO_PI*d_carrier_doppler_hz/d_fs_in. */ +void galileo_volk_e1_dll_pll_veml_tracking_cc::update_local_carrier() +{ + float phase_rad, phase_step_rad; + // Compute the carrier phase step for the K-1 carrier doppler estimation + phase_step_rad = (float)GPS_TWO_PI*d_carrier_doppler_hz / (float)d_fs_in; + // Initialize the carrier phase with the remnant carrier 
phase of the K-2 loop + phase_rad = d_rem_carr_phase_rad; + + //HERE YOU CAN CHOOSE THE DESIRED VOLK IMPLEMENTATION + //volk_gnsssdr_s32f_x2_update_local_carrier_32fc_manual(d_carr_sign, phase_rad, phase_step_rad, d_current_prn_length_samples, "generic"); + + //volk_gnsssdr_s32f_x2_update_local_carrier_32fc_manual(d_carr_sign, phase_rad, phase_step_rad, d_current_prn_length_samples, "u_sse2"); + /* NOTE(review): the "u_avx" implementation name is hard-coded; presumably this fails on CPUs or builds without AVX - confirm */ + volk_gnsssdr_s32f_x2_update_local_carrier_32fc_manual(d_carr_sign, phase_rad, phase_step_rad, d_current_prn_length_samples, "u_avx"); +} + /*! Closes the dump file and releases every buffer allocated by the constructor (volk_free for the volk_malloc buffers, delete[] for d_ca_code and d_Prompt_buffer). */ +galileo_volk_e1_dll_pll_veml_tracking_cc::~galileo_volk_e1_dll_pll_veml_tracking_cc() +{ + /* NOTE(review): close() is unconditional; set_channel() enables failbit/badbit exceptions on d_dump_file, so presumably closing a stream that was never opened successfully can throw from this destructor - confirm and consider an is_open() guard */ d_dump_file.close(); + + volk_free(d_very_early_code); + volk_free(d_early_code); + volk_free(d_prompt_code); + volk_free(d_late_code); + volk_free(d_very_late_code); + volk_free(d_carr_sign); + volk_free(d_Very_Early); + volk_free(d_Early); + volk_free(d_Prompt); + volk_free(d_Late); + volk_free(d_Very_Late); + + volk_free(d_very_early_code16); + volk_free(d_early_code16); + volk_free(d_prompt_code16); + volk_free(d_late_code16); + volk_free(d_very_late_code16); + volk_free(d_carr_sign16); + volk_free(in16); + + volk_free(d_very_early_code8); + volk_free(d_early_code8); + volk_free(d_prompt_code8); + volk_free(d_late_code8); + volk_free(d_very_late_code8); + volk_free(d_carr_sign8); + volk_free(in8); + + delete[] d_ca_code; + delete[] d_Prompt_buffer; +} + /*! 
Per call: while pull-in is pending, skips input samples to align with the local replica and returns; otherwise, when tracking is enabled, correlates one PRN period, updates the PLL/DLL and the lock detector and writes one Gnss_Synchro to output 0; when tracking is disabled, copies *d_acquisition_gnss_synchro to output 0. Afterwards it optionally appends a dump record and consumes d_current_prn_length_samples input items. */ + + +int galileo_volk_e1_dll_pll_veml_tracking_cc::general_work (int noutput_items,gr_vector_int &ninput_items, + gr_vector_const_void_star &input_items, gr_vector_void_star &output_items) +{ + /* NOTE(review): these four locals are assigned only in the tracking branch, yet the dump block below writes them whenever d_dump is set - with tracking disabled they are written uninitialized */ float carr_error_hz; + float carr_error_filt_hz; + float code_error_chips; + float code_error_filt_chips; + + if (d_enable_tracking == true) + { + if (d_pull_in == true) + { + /* + * Signal alignment (skip samples until the incoming signal is aligned with local replica) + */ + int samples_offset; + float acq_trk_shif_correction_samples; + int acq_to_trk_delay_samples; + acq_to_trk_delay_samples = 
d_sample_counter - d_acq_sample_stamp; + acq_trk_shif_correction_samples = d_current_prn_length_samples - fmod((float)acq_to_trk_delay_samples, (float)d_current_prn_length_samples); + samples_offset = round(d_acq_code_phase_samples + acq_trk_shif_correction_samples); + d_sample_counter = d_sample_counter + samples_offset; //count for the processed samples + d_pull_in = false; + consume_each(samples_offset); //shift input to perform alignment with local replica + /* NOTE(review): returns 1 although output_items is not written in this branch - confirm this is intended */ return 1; + } + + // GNSS_SYNCHRO OBJECT to interchange data between tracking->telemetry_decoder + Gnss_Synchro current_synchro_data; + // Fill the acquisition data + current_synchro_data = *d_acquisition_gnss_synchro; + + // Block input data and block output stream pointers + const gr_complex* in = (gr_complex*) input_items[0]; + Gnss_Synchro **out = (Gnss_Synchro **) &output_items[0]; + + // Generate local code and carrier replicas (using \hat{f}_d(k-1)) + update_local_code(); + update_local_carrier(); + + //HERE YOU CAN CHOOSE THE DESIRED VOLK IMPLEMENTATION + + //Float implementation: + + //volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_manual(d_Very_Early, d_Early, d_Prompt, d_Late, d_Very_Late, in, d_carr_sign, d_very_early_code, d_early_code, d_prompt_code, d_late_code, d_very_late_code, d_current_prn_length_samples, "generic"); + + //volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_manual(d_Very_Early, d_Early, d_Prompt, d_Late, d_Very_Late, in, d_carr_sign, d_very_early_code, d_early_code, d_prompt_code, d_late_code, d_very_late_code, d_current_prn_length_samples, "u_avx"); + + //Integer 16 bits implementation + /*volk_gnsssdr_32fc_convert_16ic(d_very_early_code16, d_very_early_code, d_current_prn_length_samples); + volk_gnsssdr_32fc_convert_16ic(d_early_code16, d_early_code, d_current_prn_length_samples); + volk_gnsssdr_32fc_convert_16ic(d_prompt_code16, d_prompt_code, d_current_prn_length_samples); + 
volk_gnsssdr_32fc_convert_16ic(d_late_code16, d_late_code, d_current_prn_length_samples); + 
volk_gnsssdr_32fc_convert_16ic(d_very_late_code16, d_very_late_code, d_current_prn_length_samples); + volk_gnsssdr_32fc_convert_16ic(in16, in, d_current_prn_length_samples); + volk_gnsssdr_32fc_convert_16ic(d_carr_sign16, d_carr_sign, d_current_prn_length_samples); + + volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5(d_Very_Early, d_Early, d_Prompt, d_Late, d_Very_Late, in16, d_carr_sign16, d_very_early_code16, d_early_code16, d_prompt_code16, d_late_code16, d_very_late_code16, d_current_prn_length_samples);*/ + + //Integer 8 bits implementation + /* NOTE(review): the "u_sse2" and "u_sse4_1" implementation names below are hard-coded, and the literal 4 passed to the input conversion is presumably a scale factor - confirm both */ volk_gnsssdr_32fc_convert_8ic_manual(d_very_early_code8, d_very_early_code, d_current_prn_length_samples,"u_sse2"); + volk_gnsssdr_32fc_convert_8ic_manual(d_early_code8, d_early_code, d_current_prn_length_samples,"u_sse2"); + volk_gnsssdr_32fc_convert_8ic_manual(d_prompt_code8, d_prompt_code, d_current_prn_length_samples,"u_sse2"); + volk_gnsssdr_32fc_convert_8ic_manual(d_late_code8, d_late_code, d_current_prn_length_samples,"u_sse2"); + volk_gnsssdr_32fc_convert_8ic_manual(d_very_late_code8, d_very_late_code, d_current_prn_length_samples,"u_sse2"); + volk_gnsssdr_32fc_convert_8ic_manual(d_carr_sign8, d_carr_sign, d_current_prn_length_samples,"u_sse2"); + volk_gnsssdr_32fc_s32f_convert_8ic_manual(in8, in, 4, d_current_prn_length_samples,"u_sse2"); + + volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_manual(d_Very_Early, d_Early, d_Prompt, d_Late, d_Very_Late, in8, d_carr_sign8, d_very_early_code8, d_early_code8, d_prompt_code8, d_late_code8, d_very_late_code8, d_current_prn_length_samples, "u_sse4_1"); + + //volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5(d_Very_Early, d_Early, d_Prompt, d_Late, d_Very_Late, in8, d_carr_sign8, d_very_early_code8, d_early_code8, d_prompt_code8, d_late_code8, d_very_late_code8, d_current_prn_length_samples); + + 
//volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5(d_Very_Early, d_Early, d_Prompt, d_Late, d_Very_Late, in8, d_carr_sign8, d_very_early_code8, d_early_code8, d_prompt_code8, d_late_code8, 
d_very_late_code8, d_current_prn_length_samples); + + + // ################## PLL ########################################################## + // PLL discriminator + carr_error_hz = pll_cloop_two_quadrant_atan(*d_Prompt) / (float)GPS_TWO_PI; + // Carrier discriminator filter + carr_error_filt_hz = d_carrier_loop_filter.get_carrier_nco(carr_error_hz); + // New carrier Doppler frequency estimation + d_carrier_doppler_hz = d_acq_carrier_doppler_hz + carr_error_filt_hz; + // New code Doppler frequency estimation + d_code_freq_chips = Galileo_E1_CODE_CHIP_RATE_HZ + ((d_carrier_doppler_hz * Galileo_E1_CODE_CHIP_RATE_HZ) / Galileo_E1_FREQ_HZ); + //carrier phase accumulator for (K) Doppler estimation + d_acc_carrier_phase_rad = d_acc_carrier_phase_rad + GPS_TWO_PI * d_carrier_doppler_hz * Galileo_E1_CODE_PERIOD; + //remnant carrier phase to prevent overflow in the code NCO + d_rem_carr_phase_rad = d_rem_carr_phase_rad + GPS_TWO_PI * d_carrier_doppler_hz * Galileo_E1_CODE_PERIOD; + d_rem_carr_phase_rad = fmod(d_rem_carr_phase_rad, GPS_TWO_PI); + + // ################## DLL ########################################################## + // DLL discriminator + code_error_chips = dll_nc_vemlp_normalized(*d_Very_Early, *d_Early, *d_Late, *d_Very_Late); //[chips/Ti] + // Code discriminator filter + code_error_filt_chips = d_code_loop_filter.get_code_nco(code_error_chips); //[chips/second] + //Code phase accumulator + float code_error_filt_secs; + code_error_filt_secs = (Galileo_E1_CODE_PERIOD * code_error_filt_chips) / Galileo_E1_CODE_CHIP_RATE_HZ; //[seconds] + //code_error_filt_secs=T_prn_seconds*code_error_filt_chips*T_chip_seconds*(float)d_fs_in; //[seconds] + d_acc_code_phase_secs = d_acc_code_phase_secs + code_error_filt_secs; + + // ################## CARRIER AND CODE NCO BUFFER ALIGNMENT ####################### + // keep alignment parameters for the next input buffer + double T_chip_seconds; + double T_prn_seconds; + double T_prn_samples; + double K_blk_samples; + // 
Compute the next buffer length based on the new period of the PRN sequence and the code phase error estimation + T_chip_seconds = 1 / (double)d_code_freq_chips; + T_prn_seconds = T_chip_seconds * Galileo_E1_B_CODE_LENGTH_CHIPS; + T_prn_samples = T_prn_seconds * (double)d_fs_in; + K_blk_samples = T_prn_samples + d_rem_code_phase_samples + code_error_filt_secs * (double)d_fs_in; + d_current_prn_length_samples = round(K_blk_samples); //round to a discrete samples + //d_rem_code_phase_samples = K_blk_samples - d_current_prn_length_samples; //rounding error < 1 sample + + // ####### CN0 ESTIMATION AND LOCK DETECTORS ###### + if (d_cn0_estimation_counter < CN0_ESTIMATION_SAMPLES) + { + // fill buffer with prompt correlator output values + d_Prompt_buffer[d_cn0_estimation_counter] = *d_Prompt; + d_cn0_estimation_counter++; + } + else + { + d_cn0_estimation_counter = 0; + + // Code lock indicator + d_CN0_SNV_dB_Hz = cn0_svn_estimator(d_Prompt_buffer, CN0_ESTIMATION_SAMPLES, d_fs_in, Galileo_E1_B_CODE_LENGTH_CHIPS); + + // Carrier lock indicator + d_carrier_lock_test = carrier_lock_detector(d_Prompt_buffer, CN0_ESTIMATION_SAMPLES); + + // Loss of lock detection + if (d_carrier_lock_test < d_carrier_lock_threshold or d_CN0_SNV_dB_Hz < MINIMUM_VALID_CN0) + { + d_carrier_lock_fail_counter++; + } + else + { + if (d_carrier_lock_fail_counter > 0) d_carrier_lock_fail_counter--; + } + if (d_carrier_lock_fail_counter > MAXIMUM_LOCK_FAIL_COUNTER) + { + std::cout << "Loss of lock in channel " << d_channel << "!" 
<< std::endl; + LOG(INFO) << "Loss of lock in channel " << d_channel << "!"; + /* NOTE(review): std::unique_ptr is shown without its template argument in this view; the literal 2 passed to GetQueueMessage() is presumably the loss-of-lock event code - confirm */ std::unique_ptr cmf(new ControlMessageFactory()); + if (d_queue != gr::msg_queue::sptr()) + { + d_queue->handle(cmf->GetQueueMessage(d_channel, 2)); + } + d_carrier_lock_fail_counter = 0; + d_enable_tracking = false; // TODO: check if disabling tracking is consistent with the channel state machine + } + } + + // ########### Output the tracking results to Telemetry block ########## + + current_synchro_data.Prompt_I = (double)(*d_Prompt).real(); + current_synchro_data.Prompt_Q = (double)(*d_Prompt).imag(); + + // Tracking_timestamp_secs is aligned with the NEXT PRN start sample (Hybridization problem!) + //compute remnant code phase samples BEFORE the Tracking timestamp + //d_rem_code_phase_samples = K_blk_samples - d_current_prn_length_samples; //rounding error < 1 sample + //current_synchro_data.Tracking_timestamp_secs = ((double)d_sample_counter + + // (double)d_current_prn_length_samples + (double)d_rem_code_phase_samples) / (double)d_fs_in; + + // Tracking_timestamp_secs is aligned with the CURRENT PRN start sample (Hybridization OK!, but some glitches??) 
+ current_synchro_data.Tracking_timestamp_secs = ((double)d_sample_counter + (double)d_rem_code_phase_samples) / (double)d_fs_in; + //compute remnant code phase samples AFTER the Tracking timestamp + d_rem_code_phase_samples = K_blk_samples - d_current_prn_length_samples; //rounding error < 1 sample + + // This tracking block aligns the Tracking_timestamp_secs with the start sample of the PRN, thus, Code_phase_secs=0 + current_synchro_data.Code_phase_secs = 0; + current_synchro_data.Carrier_phase_rads = (double)d_acc_carrier_phase_rad; + current_synchro_data.Carrier_Doppler_hz = (double)d_carrier_doppler_hz; + current_synchro_data.CN0_dB_hz = (double)d_CN0_SNV_dB_Hz; + *out[0] = current_synchro_data; + + // ########## DEBUG OUTPUT + /*! + * \todo The stop timer has to be moved to the signal source! 
+ */ + // stream to collect cout calls to improve thread safety + std::stringstream tmp_str_stream; + if (floor(d_sample_counter / d_fs_in) != d_last_seg) + { + d_last_seg = floor(d_sample_counter / d_fs_in); + + if (d_channel == 0) + { + // debug: Second counter in channel 0 + tmp_str_stream << "Current input signal time = " << d_last_seg << " [s]" << std::endl << std::flush; + std::cout << tmp_str_stream.rdbuf() << std::flush; + } + + tmp_str_stream << "Tracking CH " << d_channel << ": Satellite " << Gnss_Satellite(systemName[sys], d_acquisition_gnss_synchro->PRN) + << ", Doppler=" << d_carrier_doppler_hz << " [Hz] CN0 = " << d_CN0_SNV_dB_Hz << " [dB-Hz]" << std::endl; + LOG(INFO) << tmp_str_stream.rdbuf() << std::flush; + //if (d_channel == 0 || d_last_seg==5) d_carrier_lock_fail_counter=500; //DEBUG: force unlock! + } + } + else + { + // ########## DEBUG OUTPUT (TIME ONLY for channel 0 when tracking is disabled) + /*! + * \todo The stop timer has to be moved to the signal source! 
+ */ + // stream to collect cout calls to improve thread safety + std::stringstream tmp_str_stream; + if (floor(d_sample_counter / d_fs_in) != d_last_seg) + { + d_last_seg = floor(d_sample_counter / d_fs_in); + + if (d_channel == 0) + { + // debug: Second counter in channel 0 + tmp_str_stream << "Current input signal time = " << d_last_seg << " [s]" << std::endl << std::flush; + std::cout << tmp_str_stream.rdbuf() << std::flush; + } + } + /* NOTE(review): only Early/Prompt/Late are cleared here; d_Very_Early and d_Very_Late keep their previous values, which the dump block below still writes */ *d_Early = gr_complex(0,0); + *d_Prompt = gr_complex(0,0); + *d_Late = gr_complex(0,0); + Gnss_Synchro **out = (Gnss_Synchro **) &output_items[0]; //block output stream pointer + // GNSS_SYNCHRO OBJECT to interchange data between tracking->telemetry_decoder + *out[0] = *d_acquisition_gnss_synchro; + } + + if(d_dump) + { + // Dump results to file + float prompt_I; + float prompt_Q; + float tmp_VE, tmp_E, tmp_P, tmp_L, tmp_VL; + float tmp_float; + double tmp_double; + prompt_I = (*d_Prompt).real(); + prompt_Q = (*d_Prompt).imag(); + tmp_VE = 
std::abs(*d_Very_Early); + tmp_E = std::abs(*d_Early); + tmp_P = std::abs(*d_Prompt); + tmp_L = std::abs(*d_Late); + tmp_VL = std::abs(*d_Very_Late); + + try + { + /* NOTE(review): each write copies sizeof(...) bytes straight from the named object; the member declarations are not visible here, so confirm their types match the sizes used (e.g. d_code_freq_chips is assigned from a double expression in the constructor but written with sizeof(float)) */ // Dump correlators output + d_dump_file.write((char*)&tmp_VE, sizeof(float)); + d_dump_file.write((char*)&tmp_E, sizeof(float)); + d_dump_file.write((char*)&tmp_P, sizeof(float)); + d_dump_file.write((char*)&tmp_L, sizeof(float)); + d_dump_file.write((char*)&tmp_VL, sizeof(float)); + // PROMPT I and Q (to analyze navigation symbols) + d_dump_file.write((char*)&prompt_I, sizeof(float)); + d_dump_file.write((char*)&prompt_Q, sizeof(float)); + // PRN start sample stamp + d_dump_file.write((char*)&d_sample_counter, sizeof(unsigned long int)); + // accumulated carrier phase + d_dump_file.write((char*)&d_acc_carrier_phase_rad, sizeof(float)); + // carrier and code frequency + d_dump_file.write((char*)&d_carrier_doppler_hz, sizeof(float)); + d_dump_file.write((char*)&d_code_freq_chips, sizeof(float)); + //PLL commands + d_dump_file.write((char*)&carr_error_hz, sizeof(float)); + d_dump_file.write((char*)&carr_error_filt_hz, sizeof(float)); + //DLL commands + d_dump_file.write((char*)&code_error_chips, sizeof(float)); + d_dump_file.write((char*)&code_error_filt_chips, sizeof(float)); + // CN0 and carrier lock test + d_dump_file.write((char*)&d_CN0_SNV_dB_Hz, sizeof(float)); + d_dump_file.write((char*)&d_carrier_lock_test, sizeof(float)); + // AUX vars (for debug purposes) + tmp_float = d_rem_code_phase_samples; + d_dump_file.write((char*)&tmp_float, sizeof(float)); + tmp_double=(double)(d_sample_counter+d_current_prn_length_samples); + d_dump_file.write((char*)&tmp_double, sizeof(double)); + } + /* NOTE(review): the exception is caught by value; 
catching by const reference is the usual form */ catch (std::ifstream::failure e) + { + LOG(WARNING) << "Exception writing trk dump file " << e.what() << std::endl; + } + } + consume_each(d_current_prn_length_samples); // this is required for gr_block derivatives + d_sample_counter += d_current_prn_length_samples; //count for the processed samples + //std::cout<<"Galileo 
tracking output at sample "<(d_channel)); /* NOTE(review): text is missing before this point in this view (the end of general_work() and the beginning of set_channel() appear to have been lost) - restore it from the original patch before applying this hunk */ + d_dump_filename.append(".dat"); + d_dump_file.exceptions (std::ifstream::failbit | std::ifstream::badbit); + d_dump_file.open(d_dump_filename.c_str(), std::ios::out | std::ios::binary); + LOG(INFO) << "Tracking dump enabled on channel " << d_channel << " Log file: " << d_dump_filename.c_str(); + } + catch (std::ifstream::failure e) + { + LOG(WARNING) << "channel " << d_channel << " Exception opening trk dump file " << e.what() << std::endl; + } + } + } +} + /*! Stores the given pointer in d_channel_internal_queue. */ + + +void galileo_volk_e1_dll_pll_veml_tracking_cc::set_channel_queue(concurrent_queue *channel_internal_queue) +{ + d_channel_internal_queue = channel_internal_queue; +} + /*! Stores the given pointer in d_acquisition_gnss_synchro; start_tracking() and general_work() read the acquisition results through it. */ + + +void galileo_volk_e1_dll_pll_veml_tracking_cc::set_gnss_synchro(Gnss_Synchro* p_gnss_synchro) +{ + d_acquisition_gnss_synchro = p_gnss_synchro; + // Gnss_Satellite(satellite.get_system(), satellite.get_PRN()); + //DLOG(INFO) << "Tracking code phase set to " << d_acq_code_phase_samples; + //DLOG(INFO) << "Tracking carrier doppler set to " << d_acq_carrier_doppler_hz; + //DLOG(INFO) << "Tracking Satellite set to " << d_satellite; +} diff --git a/src/algorithms/tracking/gnuradio_blocks/galileo_volk_e1_dll_pll_veml_tracking_cc.h b/src/algorithms/tracking/gnuradio_blocks/galileo_volk_e1_dll_pll_veml_tracking_cc.h new file mode 100644 index 000000000..0c1e1b454 --- /dev/null +++ b/src/algorithms/tracking/gnuradio_blocks/galileo_volk_e1_dll_pll_veml_tracking_cc.h @@ -0,0 +1,211 @@ +/*! 
+ * \file galileo_volk_e1_dll_pll_veml_tracking_cc.h + * \brief Implementation of a code DLL + carrier PLL VEML (Very Early + * Minus Late) tracking block for Galileo E1 signals + * \author Luis Esteve, 2012. luis(at)epsilon-formacion.com + * + * ------------------------------------------------------------------------- + * + * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors) + * + * GNSS-SDR is a software defined Global Navigation + * Satellite Systems receiver + * + * This file is part of GNSS-SDR. 
+ * + * GNSS-SDR is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * at your option) any later version. + * + * GNSS-SDR is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNSS-SDR. If not, see . + * + * ------------------------------------------------------------------------- + */ + +#ifndef GNSS_SDR_GALILEO_VOLK_E1_DLL_PLL_VEML_TRACKING_CC_H +#define GNSS_SDR_GALILEO_VOLK_E1_DLL_PLL_VEML_TRACKING_CC_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include "concurrent_queue.h" +#include "gnss_synchro.h" +#include "tracking_2nd_DLL_filter.h" +#include "tracking_2nd_PLL_filter.h" +#include "correlator.h" + +class galileo_volk_e1_dll_pll_veml_tracking_cc; + +typedef boost::shared_ptr galileo_volk_e1_dll_pll_veml_tracking_cc_sptr; + +galileo_volk_e1_dll_pll_veml_tracking_cc_sptr +galileo_volk_e1_dll_pll_veml_make_tracking_cc(long if_freq, + long fs_in, unsigned + int vector_length, + boost::shared_ptr queue, + bool dump, + std::string dump_filename, + float pll_bw_hz, + float dll_bw_hz, + float early_late_space_chips, + float very_early_late_space_chips); + +/*! + * \brief This class implements a code DLL + carrier PLL VEML (Very Early + * Minus Late) tracking block for Galileo E1 signals + */ +class galileo_volk_e1_dll_pll_veml_tracking_cc: public gr::block +{ +public: + ~galileo_volk_e1_dll_pll_veml_tracking_cc(); + + void set_channel(unsigned int channel); + void set_gnss_synchro(Gnss_Synchro* p_gnss_synchro); + void start_tracking(); + void set_channel_queue(concurrent_queue *channel_internal_queue); + + /*! 
+ * \brief Code DLL + carrier PLL according to the algorithms described in: + * K.Borre, D.M.Akos, N.Bertelsen, P.Rinder, and S.H.Jensen, + * A Software-Defined GPS and Galileo Receiver. A Single-Frequency Approach, + * Birkhauser, 2007 + */ + int general_work (int noutput_items, gr_vector_int &ninput_items, + gr_vector_const_void_star &input_items, gr_vector_void_star &output_items); + + void forecast (int noutput_items, gr_vector_int &ninput_items_required); +private: + friend galileo_volk_e1_dll_pll_veml_tracking_cc_sptr + galileo_volk_e1_dll_pll_veml_make_tracking_cc(long if_freq, + long fs_in, unsigned + int vector_length, + boost::shared_ptr queue, + bool dump, + std::string dump_filename, + float pll_bw_hz, + float dll_bw_hz, + float early_late_space_chips, + float very_early_late_space_chips); + + galileo_volk_e1_dll_pll_veml_tracking_cc(long if_freq, + long fs_in, unsigned + int vector_length, + boost::shared_ptr queue, + bool dump, + std::string dump_filename, + float pll_bw_hz, + float dll_bw_hz, + float early_late_space_chips, + float very_early_late_space_chips); + + void update_local_code(); + + void update_local_carrier(); + + // tracking configuration vars + boost::shared_ptr d_queue; + concurrent_queue *d_channel_internal_queue; + unsigned int d_vector_length; + bool d_dump; + + Gnss_Synchro* d_acquisition_gnss_synchro; + unsigned int d_channel; + int d_last_seg; + long d_if_freq; + long d_fs_in; + + float d_early_late_spc_chips; + float d_very_early_late_spc_chips; + + gr_complex* d_ca_code; + + gr_complex* d_very_early_code; + gr_complex* d_early_code; + gr_complex* d_prompt_code; + gr_complex* d_late_code; + gr_complex* d_very_late_code; + gr_complex* d_carr_sign; + + lv_16sc_t* d_very_early_code16; + lv_16sc_t* d_early_code16; + lv_16sc_t* d_prompt_code16; + lv_16sc_t* d_late_code16; + lv_16sc_t* d_very_late_code16; + lv_16sc_t* d_carr_sign16; + lv_16sc_t* in16; + + lv_8sc_t* d_very_early_code8; + lv_8sc_t* d_early_code8; + lv_8sc_t* 
d_prompt_code8; + lv_8sc_t* d_late_code8; + lv_8sc_t* d_very_late_code8; + lv_8sc_t* d_carr_sign8; + lv_8sc_t* in8; + + gr_complex *d_Very_Early; + gr_complex *d_Early; + gr_complex *d_Prompt; + gr_complex *d_Late; + gr_complex *d_Very_Late; + + // remaining code phase and carrier phase between tracking loops + float d_rem_code_phase_samples; + float d_rem_carr_phase_rad; + + // PLL and DLL filter library + Tracking_2nd_DLL_filter d_code_loop_filter; + Tracking_2nd_PLL_filter d_carrier_loop_filter; + + // acquisition + float d_acq_code_phase_samples; + float d_acq_carrier_doppler_hz; + + // correlator + Correlator d_correlator; + + // tracking vars + float d_code_freq_chips; + float d_carrier_doppler_hz; + double d_acc_carrier_phase_rad; + double d_acc_code_phase_secs; + + //PRN period in samples + int d_current_prn_length_samples; + + //processing samples counters + unsigned long int d_sample_counter; + unsigned long int d_acq_sample_stamp; + + // CN0 estimation and lock detector + int d_cn0_estimation_counter; + gr_complex* d_Prompt_buffer; + float d_carrier_lock_test; + float d_CN0_SNV_dB_Hz; + float d_carrier_lock_threshold; + int d_carrier_lock_fail_counter; + + // control vars + bool d_enable_tracking; + bool d_pull_in; + + // file dump + std::string d_dump_filename; + std::ofstream d_dump_file; + + std::map systemName; + std::string sys; +}; + +#endif //GNSS_SDR_GALILEO_VOLK_E1_DLL_PLL_VEML_TRACKING_CC_H diff --git a/src/core/receiver/gnss_block_factory.cc b/src/core/receiver/gnss_block_factory.cc index 2acbb3090..c3b77589b 100644 --- a/src/core/receiver/gnss_block_factory.cc +++ b/src/core/receiver/gnss_block_factory.cc @@ -73,6 +73,7 @@ #include "gps_l1_ca_dll_fll_pll_tracking.h" #include "gps_l1_ca_tcp_connector_tracking.h" #include "galileo_e1_dll_pll_veml_tracking.h" +#include "galileo_volk_e1_dll_pll_veml_tracking.h" #include "galileo_e1_tcp_connector_tracking.h" #include "galileo_e5a_dll_pll_tracking.h" #include "gps_l1_ca_telemetry_decoder.h" @@ 
-575,6 +576,12 @@ std::unique_ptr GNSSBlockFactory::GetBlock( out_streams, queue)); block = std::move(block_); } + else if (implementation.compare("Galileo_volk_E1_DLL_PLL_VEML_Tracking") == 0) + { + std::unique_ptr block_(new GalileoVolkE1DllPllVemlTracking(configuration.get(), role, in_streams, + out_streams, queue)); + block = std::move(block_); + } else if (implementation.compare("Galileo_E1_TCP_CONNECTOR_Tracking") == 0) { std::unique_ptr block_(new GalileoE1TcpConnectorTracking(configuration.get(), role, in_streams, @@ -822,6 +829,12 @@ std::unique_ptr GNSSBlockFactory::GetTrkBlock( out_streams, queue)); block = std::move(block_); } + else if (implementation.compare("Galileo_Volk_E1_DLL_PLL_VEML_Tracking") == 0) + { + std::unique_ptr block_(new GalileoVolkE1DllPllVemlTracking(configuration.get(), role, in_streams, + out_streams, queue)); + block = std::move(block_); + } else if (implementation.compare("Galileo_E1_TCP_CONNECTOR_Tracking") == 0) { std::unique_ptr block_(new GalileoE1TcpConnectorTracking(configuration.get(), role, in_streams,