1
0
mirror of https://github.com/gnss-sdr/gnss-sdr synced 2024-12-15 04:30:33 +00:00

Merge branch 'volk_tracking_performance' into volk_merge_with_next

This commit is contained in:
andres 2014-10-18 02:42:01 +02:00
commit 5a498207c4
141 changed files with 83671 additions and 1 deletions

View File

@ -333,6 +333,40 @@ if(NOT GNURADIO_TRELLIS_FOUND)
endif()
###############################################################################
# Volk_gnsssdr module
# In order to use the volk_gnsssdr module it is necessary to add:
# 1) include_directories(..${VOLK_GNSSSDR_INCLUDE_DIRS}..)
# 2) target_link_libraries(..${VOLK_GNSSSDR_LIBRARIES}..)
###############################################################################
# Report whether the bundled volk_gnsssdr module is enabled. When it is, add it
# as a subproject and publish VOLK_GNSSSDR_INCLUDE_DIRS / VOLK_GNSSSDR_LIBRARIES
# for the targets that consume it.
if(NOT ENABLE_VOLK_GNSSSDR)
    message(STATUS "The volk_gnsssdr module with custom protokernels coded by gnss-sdr is not enabled. Some configurations that use custom protokernels will not work." )
    message(STATUS "Enable it with 'cmake -D ENABLE_VOLK_GNSSSDR=ON ../'." )
else()
    message(STATUS "The volk_gnsssdr module with custom protokernels coded by gnss-sdr will be compiled.")
    message(STATUS "You can disable it with 'cmake -DENABLE_VOLK_GNSSSDR=OFF ../'" )

    # Build the module in-tree as a subproject.
    set(VOLK_GNSSSDR_BASE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/src/algorithms/libs/volk_gnsssdr)
    add_subdirectory(${VOLK_GNSSSDR_BASE_PATH})

    # Headers live both in the source tree and in the subproject's build tree.
    set(VOLK_GNSSSDR_INCLUDE_DIRS
        ${VOLK_GNSSSDR_BASE_PATH}/include
        ${CMAKE_CURRENT_BINARY_DIR}/src/algorithms/libs/volk_gnsssdr/include
    )

    # Link against the target by name rather than by a library path such as
    # ${VOLK_GNSSSDR_BASE_PATH}/lib/Debug/libvolk_gnsssdr.dylib
    set(VOLK_GNSSSDR_LIBRARIES volk_gnsssdr)

    message(" * INCLUDES: ${VOLK_GNSSSDR_INCLUDE_DIRS} ")
    message(" * LIBS: ${VOLK_GNSSSDR_LIBRARIES} ")
    message("-- END OF: Setup volk_gnsssdr as a subproject.")
endif()
################################################################################
# gflags - http://code.google.com/p/gflags/

View File

@ -0,0 +1,183 @@
#
# Copyright 2011 Free Software Foundation, Inc.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
########################################################################
# Project setup
########################################################################
cmake_minimum_required(VERSION 2.6)
# Default the build type to Release only when the caller has not defined one,
# then publish the chosen value as a cache entry with a documentation string.
if(NOT DEFINED CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE Release)
endif()
set(CMAKE_BUILD_TYPE ${CMAKE_BUILD_TYPE} CACHE STRING "Choose build type: None Debug Release RelWithDebInfo MinSizeRel")
project(volk_gnsssdr)
enable_language(CXX)
enable_language(C)
enable_testing()
# VERSION and LIBVER are plain variables set here for later use.
# NOTE(review): presumably consumed by lib/ and the .pc.in template - confirm.
set(VERSION 0.1)
set(LIBVER 0.0.0)
# Re-point the "top of the build" variables at this directory. The rest of this
# file uses ${CMAKE_SOURCE_DIR}/${CMAKE_BINARY_DIR} for its own paths, so they
# must refer to this directory even when it is added from a parent project.
set(CMAKE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) #allows this to be a sub-project
set(CMAKE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}) #allows this to be a sub-project
# This replaces (rather than appends to) any module path set by a parent scope.
set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake) #location for custom "Modules"
########################################################################
# Environment setup
########################################################################
# CROSSCOMPILE_MULTILIB defaults to an empty string and is exposed in the cache
# so it can be changed from the command line or the CMake GUI.
if(NOT DEFINED CROSSCOMPILE_MULTILIB)
    set(CROSSCOMPILE_MULTILIB "")
endif()
set(CROSSCOMPILE_MULTILIB ${CROSSCOMPILE_MULTILIB} CACHE STRING "Define \"true\" if you have and want to use multiple C development libs installed for cross compile")

# Use the install prefix as BOOST_ROOT unless the caller already defined one.
if(NOT DEFINED BOOST_ROOT)
    set(BOOST_ROOT ${CMAKE_INSTALL_PREFIX})
endif()
########################################################################
# Dependencies setup
########################################################################
include(GrPython) # sets PYTHON_EXECUTABLE and PYTHON_DASH_B

# Python and Cheetah are both mandatory: run the two checks, then abort the
# configure step if either requirement is missing.
volk_python_check_module("python >= 2.5" sys "sys.version.split()[0] >= '2.5'" PYTHON_MIN_VER_FOUND)
volk_python_check_module("Cheetah >= 2.0.0" Cheetah "Cheetah.Version >= '2.0.0'" CHEETAH_FOUND)

if(NOT PYTHON_MIN_VER_FOUND)
    message(FATAL_ERROR "Python 2.5 or greater required to build VOLK")
endif()

if(NOT CHEETAH_FOUND)
    message(FATAL_ERROR "Cheetah templates required to build VOLK")
endif()

# Under MSVC, Boost is linked dynamically by default; the choice is cached so
# it can be overridden.
if(MSVC)
    if(NOT DEFINED BOOST_ALL_DYN_LINK)
        set(BOOST_ALL_DYN_LINK TRUE)
    endif()
    set(BOOST_ALL_DYN_LINK "${BOOST_ALL_DYN_LINK}" CACHE BOOL "boost enable dynamic linking")

    if(NOT BOOST_ALL_DYN_LINK)
        unset(BOOST_REQUIRED_COMPONENTS) # empty components list for static link
    else()
        add_definitions(-DBOOST_ALL_DYN_LINK) # setup boost auto-linking in msvc
    endif()
endif()

# Boost is a hard requirement.
include(VolkBoost)
if(NOT Boost_FOUND)
    message(FATAL_ERROR "VOLK Requires boost to build")
endif()

# ORC is optional and searched for only when enabled.
option(ENABLE_ORC "Enable Orc" True)
if(NOT ENABLE_ORC)
    message(STATUS "Disabling use of ORC")
else()
    find_package(ORC)
endif()
########################################################################
# Setup the package config file
########################################################################
# Values substituted into volk_gnsssdr.pc.in. The escaped \${...} references
# are written literally so that pkg-config expands them, not CMake.
set(prefix ${CMAKE_INSTALL_PREFIX})
set(exec_prefix "\${prefix}")
set(includedir "\${prefix}/include")
set(libdir "\${exec_prefix}/lib${LIB_SUFFIX}")

# Generate the .pc file in the build tree (only @VAR@ references are replaced).
configure_file(
    ${CMAKE_CURRENT_SOURCE_DIR}/volk_gnsssdr.pc.in
    ${CMAKE_CURRENT_BINARY_DIR}/volk_gnsssdr.pc
    @ONLY
)

# Install the generated file with the development component.
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/volk_gnsssdr.pc
    DESTINATION lib${LIB_SUFFIX}/pkgconfig
    COMPONENT "volk_gnsssdr_devel"
)
########################################################################
# Install all headers in the include directories
########################################################################
# Install locations, relative to the install prefix.
set(VOLK_RUNTIME_DIR bin)
set(VOLK_LIBRARY_DIR lib${LIB_SUFFIX})
set(VOLK_INCLUDE_DIR include)

# Kernel headers: copy the whole kernels/volk_gnsssdr tree, keeping only *.h.
install(DIRECTORY ${CMAKE_SOURCE_DIR}/kernels/volk_gnsssdr
    DESTINATION include
    COMPONENT "volk_gnsssdr_devel"
    FILES_MATCHING PATTERN "*.h"
)

# Public API headers: some come from the source tree, the others from the
# build tree's include directory.
install(
    FILES
        ${CMAKE_SOURCE_DIR}/include/volk_gnsssdr/volk_gnsssdr_prefs.h
        ${CMAKE_SOURCE_DIR}/include/volk_gnsssdr/volk_gnsssdr_complex.h
        ${CMAKE_SOURCE_DIR}/include/volk_gnsssdr/volk_gnsssdr_common.h
        ${CMAKE_BINARY_DIR}/include/volk_gnsssdr/volk_gnsssdr.h
        ${CMAKE_BINARY_DIR}/include/volk_gnsssdr/volk_gnsssdr_cpu.h
        ${CMAKE_BINARY_DIR}/include/volk_gnsssdr/volk_gnsssdr_config_fixed.h
        ${CMAKE_BINARY_DIR}/include/volk_gnsssdr/volk_gnsssdr_typedefs.h
        ${CMAKE_SOURCE_DIR}/include/volk_gnsssdr/volk_gnsssdr_malloc.h
    DESTINATION include/volk_gnsssdr
    COMPONENT "volk_gnsssdr_devel"
)
########################################################################
# Install cmake search routine for external use
########################################################################
# Destination for CMake package files; callers may preset CMAKE_MODULES_DIR.
if(NOT CMAKE_MODULES_DIR)
    set(CMAKE_MODULES_DIR lib${LIB_SUFFIX}/cmake)
endif()

# Ship the config script so external projects can locate the library.
install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/cmake/VolkConfig.cmake
    DESTINATION ${CMAKE_MODULES_DIR}/volk_gnsssdr
    COMPONENT "volk_gnsssdr_devel"
)
########################################################################
# On Apple only, set install name and use rpath correctly, if not already set
########################################################################
if(APPLE)
    # Library directory (relative to the install prefix) used for the install
    # name and rpath. GR_LIBRARY_DIR is not set anywhere in this file, so
    # relying on it alone made both paths collapse to "<prefix>/". Honour it
    # when a parent project provides it; otherwise fall back to
    # VOLK_LIBRARY_DIR, which this file defines.
    if(GR_LIBRARY_DIR)
        set(VOLK_GNSSSDR_APPLE_LIBRARY_DIR ${GR_LIBRARY_DIR})
    else()
        set(VOLK_GNSSSDR_APPLE_LIBRARY_DIR ${VOLK_LIBRARY_DIR})
    endif()

    # Each value is only filled in when it has not been set already.
    if(NOT CMAKE_INSTALL_NAME_DIR)
        set(CMAKE_INSTALL_NAME_DIR
            ${CMAKE_INSTALL_PREFIX}/${VOLK_GNSSSDR_APPLE_LIBRARY_DIR} CACHE
            PATH "Library Install Name Destination Directory" FORCE)
    endif()
    if(NOT CMAKE_INSTALL_RPATH)
        set(CMAKE_INSTALL_RPATH
            ${CMAKE_INSTALL_PREFIX}/${VOLK_GNSSSDR_APPLE_LIBRARY_DIR} CACHE
            PATH "Library Install RPath" FORCE)
    endif()
    if(NOT CMAKE_BUILD_WITH_INSTALL_RPATH)
        set(CMAKE_BUILD_WITH_INSTALL_RPATH ON CACHE
            BOOL "Do Build Using Library Install RPath" FORCE)
    endif()
endif()
########################################################################
# Setup the library
########################################################################
# Descend into the library first, then the utility applications and the
# modtool python package.
foreach(volk_gnsssdr_subdir lib apps python/volk_gnsssdr_modtool)
    add_subdirectory(${volk_gnsssdr_subdir})
endforeach()

########################################################################
# Print summary
########################################################################
message(STATUS "Using install prefix: ${CMAKE_INSTALL_PREFIX}")

View File

@ -0,0 +1,61 @@
#
# Copyright 2011-2013 Free Software Foundation, Inc.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
########################################################################
# Setup profiler
########################################################################
# Both utilities are built only when Boost was found.
if(Boost_FOUND)

    if(MSVC)
        include_directories(${CMAKE_SOURCE_DIR}/cmake/msvc)
    endif()

    # Header search path shared by both executables.
    include_directories(
        ${CMAKE_CURRENT_SOURCE_DIR}
        ${CMAKE_CURRENT_BINARY_DIR}
        ${CMAKE_SOURCE_DIR}/include
        ${CMAKE_BINARY_DIR}/include
        ${CMAKE_SOURCE_DIR}/lib
        ${CMAKE_BINARY_DIR}/lib
        ${Boost_INCLUDE_DIRS}
    )

    # volk_gnsssdr_profile also compiles the QA helpers from lib/.
    add_executable(volk_gnsssdr_profile
        ${CMAKE_CURRENT_SOURCE_DIR}/volk_gnsssdr_profile.cc
        ${CMAKE_SOURCE_DIR}/lib/qa_utils.cc
    )

    # volk_gnsssdr-config-info is a single-source executable.
    add_executable(volk_gnsssdr-config-info volk_gnsssdr-config-info.cc)

    # Identical link line and install rule for each tool.
    foreach(volk_gnsssdr_app volk_gnsssdr_profile volk_gnsssdr-config-info)
        target_link_libraries(${volk_gnsssdr_app} volk_gnsssdr ${Boost_LIBRARIES})
        install(
            TARGETS ${volk_gnsssdr_app}
            DESTINATION bin
            COMPONENT "volk_gnsssdr"
        )
    endforeach()

endif()

View File

@ -0,0 +1,96 @@
/* -*- c++ -*- */
/*
* Copyright 2013 Free Software Foundation, Inc.
*
* This file is part of GNU Radio
*
* GNU Radio is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3, or (at your option)
* any later version.
*
* GNU Radio is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with GNU Radio; see the file COPYING. If not, write to
* the Free Software Foundation, Inc., 51 Franklin Street,
* Boston, MA 02110-1301, USA.
*/
#if HAVE_CONFIG_H
#include <config.h>
#endif
#include <volk_gnsssdr/constants.h>
#include "volk_gnsssdr/volk_gnsssdr.h"
#include <boost/program_options.hpp>
#include <iostream>
namespace po = boost::program_options;
int
main(int argc, char **argv)
{
po::options_description desc("Program options: volk_gnsssdr-config-info [options]");
po::variables_map vm;
desc.add_options()
("help,h", "print help message")
("prefix", "print VOLK installation prefix")
("builddate", "print VOLK build date (RFC2822 format)")
("cc", "print VOLK C compiler version")
("cflags", "print VOLK CFLAGS")
("all-machines", "print VOLK machines built into library")
("avail-machines", "print VOLK machines the current platform can use")
("machine", "print the VOLK machine that will be used")
("version,v", "print VOLK version")
;
try {
po::store(po::parse_command_line(argc, argv, desc), vm);
po::notify(vm);
}
catch (po::error& error){
std::cerr << "Error: " << error.what() << std::endl << std::endl;
std::cerr << desc << std::endl;
return 1;
}
if(vm.size() == 0 || vm.count("help")) {
std::cout << desc << std::endl;
return 1;
}
if(vm.count("prefix"))
std::cout << volk_gnsssdr_prefix() << std::endl;
if(vm.count("builddate"))
std::cout << volk_gnsssdr_build_date() << std::endl;
if(vm.count("version"))
std::cout << volk_gnsssdr_version() << std::endl;
if(vm.count("cc"))
std::cout << volk_gnsssdr_c_compiler() << std::endl;
if(vm.count("cflags"))
std::cout << volk_gnsssdr_compiler_flags() << std::endl;
// stick an extra ';' to make output of this and avail-machines the
// same structure for easier parsing
if(vm.count("all-machines"))
std::cout << volk_gnsssdr_available_machines() << ";" << std::endl;
if(vm.count("avail-machines")) {
volk_gnsssdr_list_machines();
}
if(vm.count("machine")) {
std::cout << volk_gnsssdr_get_machine() << std::endl;
}
return 0;
}

View File

@ -0,0 +1,239 @@
/* -*- c++ -*- */
/*
* Copyright 2012-2014 Free Software Foundation, Inc.
*
* This file is part of GNU Radio
*
* GNU Radio is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3, or (at your option)
* any later version.
*
* GNU Radio is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with GNU Radio; see the file COPYING. If not, write to
* the Free Software Foundation, Inc., 51 Franklin Street,
* Boston, MA 02110-1301, USA.
*/
#include "qa_utils.h"
#include <volk_gnsssdr/volk_gnsssdr.h>
#include <volk_gnsssdr/volk_gnsssdr_prefs.h>
#include <ciso646>
#include <vector>
#include <boost/foreach.hpp>
#include <boost/filesystem.hpp>
#include <boost/program_options.hpp>
#include <iostream>
#include <fstream>
#include <sys/stat.h>
#include <sys/types.h>
namespace fs = boost::filesystem;
void write_json(std::ofstream &json_file, std::vector<volk_gnsssdr_test_results_t> results) {
json_file << "{" << std::endl;
json_file << " \"volk_tests\": [" << std::endl;
size_t len = results.size();
size_t i = 0;
BOOST_FOREACH(volk_gnsssdr_test_results_t &result, results) {
json_file << " {" << std::endl;
json_file << " \"name\": \"" << result.name << "\"," << std::endl;
json_file << " \"vlen\": " << result.vlen << "," << std::endl;
json_file << " \"iter\": " << result.iter << "," << std::endl;
json_file << " \"best_arch_a\": \"" << result.best_arch_a
<< "\"," << std::endl;
json_file << " \"best_arch_u\": \"" << result.best_arch_u
<< "\"," << std::endl;
json_file << " \"results\": {" << std::endl;
size_t results_len = result.results.size();
size_t ri = 0;
typedef std::pair<std::string, volk_gnsssdr_test_time_t> tpair;
BOOST_FOREACH(tpair pair, result.results) {
volk_gnsssdr_test_time_t time = pair.second;
json_file << " \"" << time.name << "\": {" << std::endl;
json_file << " \"name\": \"" << time.name << "\"," << std::endl;
json_file << " \"time\": " << time.time << "," << std::endl;
json_file << " \"units\": \"" << time.units << "\"" << std::endl;
json_file << " }" ;
if(ri+1 != results_len) {
json_file << ",";
}
json_file << std::endl;
ri++;
}
json_file << " }" << std::endl;
json_file << " }";
if(i+1 != len) {
json_file << ",";
}
json_file << std::endl;
i++;
}
json_file << " ]" << std::endl;
json_file << "}" << std::endl;
}
int main(int argc, char *argv[]) {
// Adding program options
boost::program_options::options_description desc("Options");
desc.add_options()
("help,h", "Print help messages")
("benchmark,b",
boost::program_options::value<bool>()->default_value( false )
->implicit_value( true ),
"Run all kernels (benchmark mode)")
("tests-regex,R",
boost::program_options::value<std::string>(),
"Run tests matching regular expression.")
("json,j",
boost::program_options::value<std::string>(),
"JSON output file")
;
// Handle the options that were given
boost::program_options::variables_map vm;
bool benchmark_mode;
std::string kernel_regex;
bool store_results = true;
std::ofstream json_file;
try {
boost::program_options::store(boost::program_options::parse_command_line(argc, argv, desc), vm);
boost::program_options::notify(vm);
benchmark_mode = vm.count("benchmark")?vm["benchmark"].as<bool>():false;
if ( vm.count("tests-regex" ) ) {
kernel_regex = vm["tests-regex"].as<std::string>();
store_results = false;
std::cout << "Warning: using a regexp will not save results to a config" << std::endl;
}
else {
kernel_regex = ".*";
store_results = true;
}
} catch (boost::program_options::error& error) {
std::cerr << "Error: " << error.what() << std::endl << std::endl;
std::cerr << desc << std::endl;
return 1;
}
/** --help option
*/
if ( vm.count("help") )
{
std::cout << "The VOLK profiler." << std::endl
<< desc << std::endl;
return 0;
}
if ( vm.count("json") )
{
json_file.open( vm["json"].as<std::string>().c_str() );
}
// Run tests
std::vector<volk_gnsssdr_test_results_t> results;
//VOLK_PROFILE(volk_gnsssdr_16i_x5_add_quad_16i_x4, 1e-4, 2046, 10000, &results, benchmark_mode, kernel_regex);
//VOLK_PROFILE(volk_gnsssdr_16i_branch_4_state_8, 1e-4, 2046, 10000, &results, benchmark_mode, kernel_regex);
//VOLK_PROFILE(volk_gnsssdr_16i_max_star_16i, 0, 0, 204602, 10000, &results, benchmark_mode, kernel_regex);
//VOLK_PROFILE(volk_gnsssdr_16i_max_star_horizontal_16i, 0, 0, 204602, 10000, &results, benchmark_mode, kernel_regex);
//VOLK_PROFILE(volk_gnsssdr_16i_permute_and_scalar_add, 1e-4, 0, 2046, 10000, &results, benchmark_mode, kernel_regex);
//VOLK_PROFILE(volk_gnsssdr_16i_x4_quad_max_star_16i, 1e-4, 0, 2046, 10000, &results, benchmark_mode, kernel_regex);
//VOLK_PROFILE(volk_gnsssdr_32fc_x2_conjugate_dot_prod_32fc, 1e-4, 0, 2046, 10000, &results, benchmark_mode, kernel_regex);
//VOLK_PROFILE(volk_gnsssdr_32fc_s32f_x2_power_spectral_density_32f, 1e-4, 2046, 10000, &results, benchmark_mode, kernel_regex);
//VOLK_PROFILE(volk_gnsssdr_32f_s32f_32f_fm_detect_32f, 1e-4, 2046, 10000, &results, benchmark_mode, kernel_regex);
//VOLK_PROFILE(volk_gnsssdr_32u_popcnt, 0, 0, 2046, 10000, &results, benchmark_mode, kernel_regex);
//VOLK_PROFILE(volk_gnsssdr_64u_popcnt, 0, 0, 2046, 10000, &results, benchmark_mode, kernel_regex);
//VOLK_PROFILE(volk_gnsssdr_32fc_s32fc_multiply_32fc, 1e-4, lv_32fc_t(1.0, 0.5), 204602, 1000, &results, benchmark_mode, kernel_regex);
//GNSS-SDR PROTO-KERNELS
//lv_32fc_t sfv = lv_cmake((float)1, (float)2);
//example: VOLK_PROFILE(volk_gnsssdr_8ic_s8ic_multiply_8ic, 1e-4, sfv, 204602, 1000, &results, benchmark_mode, kernel_regex);
//CAN NOT BE TESTED YET BECAUSE VOLK MODULE DOES NOT SUPPORT IT:
//VOLK_PROFILE(volk_gnsssdr_s32f_x2_update_local_carrier_32fc, 1e-4, 0, 16007, 1, &results, benchmark_mode, kernel_regex);
//VOLK_PROFILE(volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc, 1e-4, 0, 7, 1, &results, benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_gnsssdr_32fc_convert_16ic, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_gnsssdr_32fc_convert_8ic, 1e-4, 0, 16000, 250, &results, benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_gnsssdr_32fc_s32f_convert_8ic, 1e-4, 5, 16000, 250, &results, benchmark_mode, kernel_regex);
/*VOLK_PROFILE(volk_gnsssdr_32f_accumulator_s32f, 1e-4, 0, 204602, 10000, &results, benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_gnsssdr_8i_accumulator_s8i, 1e-4, 0, 204602, 10000, &results, benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_gnsssdr_32f_index_max_16u, 3, 0, 204602, 5000, &results, benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_gnsssdr_8i_index_max_16u, 3, 0, 204602, 5000, &results, benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_gnsssdr_8i_max_s8i, 3, 0, 204602, 5000, &results, benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_gnsssdr_32f_x2_add_32f, 1e-4, 0, 204602, 10000, &results, benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_gnsssdr_8i_x2_add_8i, 1e-4, 0, 204602, 10000, &results, benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_gnsssdr_32fc_conjugate_32fc, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_gnsssdr_8ic_conjugate_8ic, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_gnsssdr_32fc_magnitude_squared_32f, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_gnsssdr_8ic_magnitude_squared_8i, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_gnsssdr_32fc_s32fc_multiply_32fc, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_gnsssdr_8ic_s8ic_multiply_8ic, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_gnsssdr_32fc_x2_dot_prod_32fc, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_gnsssdr_8ic_x2_dot_prod_8ic, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_gnsssdr_32fc_x2_multiply_32fc, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_gnsssdr_8ic_x2_multiply_8ic, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_gnsssdr_8u_x2_multiply_8u, 1e-4, 0, 204602, 1000, &results, benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_gnsssdr_64f_accumulator_64f, 1e-4, 0, 16000, 1000, &results, benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_gnsssdr_32f_s32f_convert_16i, 1e-4, 1, 204602, 250, &results, benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_gnsssdr_16i_s32f_convert_32f, 1e-4, 1, 204602, 250, &results, benchmark_mode, kernel_regex);*/
// Until we can update the config on a kernel by kernel basis
// do not overwrite volk_config when using a regex.
if(store_results) {
char path[1024];
volk_gnsssdr_get_config_path(path);
const fs::path config_path(path);
if (not fs::exists(config_path.branch_path()))
{
std::cout << "Creating " << config_path.branch_path() << "..." << std::endl;
fs::create_directories(config_path.branch_path());
}
std::cout << "Writing " << config_path << "..." << std::endl;
std::ofstream config(config_path.string().c_str());
if(!config.is_open()) { //either we don't have write access or we don't have the dir yet
std::cout << "Error opening file " << config_path << std::endl;
}
config << "\
#this file is generated by volk_profile.\n\
#the function name is followed by the preferred architecture.\n\
";
BOOST_FOREACH(volk_gnsssdr_test_results_t result, results) {
config << result.config_name << " "
<< result.best_arch_a << " "
<< result.best_arch_u << std::endl;
}
config.close();
}
else {
std::cout << "Warning: config not generated" << std::endl;
}
}

View File

@ -0,0 +1,138 @@
# CMAKE_PARSE_ARGUMENTS(<prefix> <options> <one_value_keywords> <multi_value_keywords> args...)
#
# CMAKE_PARSE_ARGUMENTS() is intended to be used in macros or functions for
# parsing the arguments given to that macro or function.
# It processes the arguments and defines a set of variables which hold the
# values of the respective options.
#
# The <options> argument contains all options for the respective macro,
# i.e. keywords which can be used when calling the macro without any value
# following, like e.g. the OPTIONAL keyword of the install() command.
#
# The <one_value_keywords> argument contains all keywords for this macro
# which are followed by one value, like e.g. DESTINATION keyword of the
# install() command.
#
# The <multi_value_keywords> argument contains all keywords for this macro
# which can be followed by more than one value, like e.g. the TARGETS or
# FILES keywords of the install() command.
#
# When done, CMAKE_PARSE_ARGUMENTS() will have defined for each of the
# keywords listed in <options>, <one_value_keywords> and
# <multi_value_keywords> a variable composed of the given <prefix>
# followed by "_" and the name of the respective keyword.
# These variables will then hold the respective value from the argument list.
# For the <options> keywords this will be TRUE or FALSE.
#
# All remaining arguments are collected in a variable
# <prefix>_UNPARSED_ARGUMENTS, this can be checked afterwards to see whether
# your macro was called with unrecognized parameters.
#
# As an example here a my_install() macro, which takes similar arguments as the
# real install() command:
#
# function(MY_INSTALL)
# set(options OPTIONAL FAST)
# set(oneValueArgs DESTINATION RENAME)
# set(multiValueArgs TARGETS CONFIGURATIONS)
# cmake_parse_arguments(MY_INSTALL "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN} )
# ...
#
# Assume my_install() has been called like this:
# my_install(TARGETS foo bar DESTINATION bin OPTIONAL blub)
#
# After the cmake_parse_arguments() call the macro will have set the following
# variables:
# MY_INSTALL_OPTIONAL = TRUE
# MY_INSTALL_FAST = FALSE (this option was not used when calling my_install()
# MY_INSTALL_DESTINATION = "bin"
# MY_INSTALL_RENAME = "" (was not used)
# MY_INSTALL_TARGETS = "foo;bar"
# MY_INSTALL_CONFIGURATIONS = "" (was not used)
# MY_INSTALL_UNPARSED_ARGUMENTS = "blub" (no value expected after "OPTIONAL"
#
# You can then continue and process these variables.
#
# Keywords terminate lists of values, e.g. if directly after a one_value_keyword
# another recognized keyword follows, this is interpreted as the beginning of
# the new option.
# E.g. my_install(TARGETS foo DESTINATION OPTIONAL) would not result in
# MY_INSTALL_DESTINATION set to "OPTIONAL": because OPTIONAL is itself a
# keyword, MY_INSTALL_DESTINATION would be empty and MY_INSTALL_OPTIONAL would
# therefore be set to TRUE.
#=============================================================================
# Copyright 2010 Alexander Neundorf <neundorf@kde.org>
#
# Distributed under the OSI-approved BSD License (the "License");
# see accompanying file Copyright.txt for details.
#
# This software is distributed WITHOUT ANY WARRANTY; without even the
# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
# See the License for more information.
#=============================================================================
# (To distribute this file outside of CMake, substitute the full
# License text for the above reference.)
# Include guard: define the function only once per configure run.
if(__CMAKE_PARSE_ARGUMENTS_INCLUDED)
  return()
endif()
set(__CMAKE_PARSE_ARGUMENTS_INCLUDED TRUE)

# CMAKE_PARSE_ARGUMENTS(<prefix> <options> <one_value_keywords>
#                       <multi_value_keywords> args...)
#
# Splits args... according to the three keyword lists and defines, in the
# caller's scope, <prefix>_<keyword> for every listed keyword plus
# <prefix>_UNPARSED_ARGUMENTS for everything that matched no keyword.
function(CMAKE_PARSE_ARGUMENTS prefix _optionNames _singleArgNames _multiArgNames)
  # Start from a clean slate: value keywords are unset, options are FALSE.
  foreach(keyword ${_singleArgNames} ${_multiArgNames})
    set(${prefix}_${keyword})
  endforeach()
  foreach(flag ${_optionNames})
    set(${prefix}_${flag} FALSE)
  endforeach()
  set(${prefix}_UNPARSED_ARGUMENTS)

  # valueMode is FALSE, "SINGLE" or "MULTI"; activeKeyword names the keyword
  # whose value(s) are currently being collected.
  set(valueMode FALSE)
  set(activeKeyword)

  foreach(token ${ARGN})
    # A token found in any keyword list ends the values of the previous keyword.
    list(FIND _optionNames "${token}" optionPos)
    list(FIND _singleArgNames "${token}" singlePos)
    list(FIND _multiArgNames "${token}" multiPos)

    if(NOT ${optionPos} EQUAL -1)
      # Option keyword: just record that it was present.
      set(${prefix}_${token} TRUE)
      set(valueMode FALSE)
    elseif(NOT ${singlePos} EQUAL -1)
      # One-value keyword: the next plain token becomes its value.
      set(activeKeyword ${token})
      set(${prefix}_${activeKeyword})
      set(valueMode "SINGLE")
    elseif(NOT ${multiPos} EQUAL -1)
      # Multi-value keyword: following plain tokens are appended to it.
      set(activeKeyword ${token})
      set(${prefix}_${activeKeyword})
      set(valueMode "MULTI")
    elseif(valueMode)
      # Plain token while a keyword is collecting values.
      if("${valueMode}" STREQUAL "SINGLE")
        set(${prefix}_${activeKeyword} ${token})
        set(valueMode FALSE)
      elseif("${valueMode}" STREQUAL "MULTI")
        list(APPEND ${prefix}_${activeKeyword} ${token})
      endif()
    else()
      # Plain token that belongs to no keyword.
      list(APPEND ${prefix}_UNPARSED_ARGUMENTS ${token})
    endif()
  endforeach()

  # Export every result variable to the caller.
  foreach(keyword ${_singleArgNames} ${_multiArgNames} ${_optionNames})
    set(${prefix}_${keyword} ${${prefix}_${keyword}} PARENT_SCOPE)
  endforeach()
  set(${prefix}_UNPARSED_ARGUMENTS ${${prefix}_UNPARSED_ARGUMENTS} PARENT_SCOPE)
endfunction()

View File

@ -0,0 +1,36 @@
# Locate ORC 0.4: the orcc compiler, the headers and the library.
# pkg-config results (PC_ORC_*) are used as hints; ORC_ROOT and the install
# prefix are searched as well.
find_package(PkgConfig)
pkg_check_modules(PC_ORC "orc-0.4 > 0.4.11")

find_program(ORCC_EXECUTABLE orcc
    HINTS ${PC_ORC_TOOLSDIR}
    PATHS ${ORC_ROOT}/bin ${CMAKE_INSTALL_PREFIX}/bin
)

find_path(ORC_INCLUDE_DIR
    NAMES orc/orc.h
    HINTS ${PC_ORC_INCLUDEDIR}
    PATHS ${ORC_ROOT}/include/orc-0.4 ${CMAKE_INSTALL_PREFIX}/include/orc-0.4
)

find_path(ORC_LIBRARY_DIR
    NAMES ${CMAKE_SHARED_LIBRARY_PREFIX}orc-0.4${CMAKE_SHARED_LIBRARY_SUFFIX}
    HINTS ${PC_ORC_LIBDIR}
    PATHS ${ORC_ROOT}/lib${LIB_SUFFIX} ${CMAKE_INSTALL_PREFIX}/lib${LIB_SUFFIX}
)

find_library(ORC_LIB orc-0.4
    HINTS ${PC_ORC_LIBRARY_DIRS}
    PATHS ${ORC_ROOT}/lib${LIB_SUFFIX} ${CMAKE_INSTALL_PREFIX}/lib${LIB_SUFFIX}
)

list(APPEND ORC_LIBRARY ${ORC_LIB})

# Plural result variables for consumers.
set(ORC_INCLUDE_DIRS ${ORC_INCLUDE_DIR})
set(ORC_LIBRARIES ${ORC_LIBRARY})
set(ORC_LIBRARY_DIRS ${ORC_LIBRARY_DIR})

# Standard found/not-found handling based on the three required pieces.
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(ORC "orc files" ORC_LIBRARY ORC_INCLUDE_DIR ORCC_EXECUTABLE)

mark_as_advanced(ORC_INCLUDE_DIR ORC_LIBRARY ORCC_EXECUTABLE)

View File

@ -0,0 +1,234 @@
# Copyright 2010-2011,2013 Free Software Foundation, Inc.
#
# This file is part of GNU Radio
#
# GNU Radio is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3, or (at your option)
# any later version.
#
# GNU Radio is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with GNU Radio; see the file COPYING. If not, write to
# the Free Software Foundation, Inc., 51 Franklin Street,
# Boston, MA 02110-1301, USA.
if(DEFINED __INCLUDED_VOLK_PYTHON_CMAKE)
return()
endif()
set(__INCLUDED_VOLK_PYTHON_CMAKE TRUE)
########################################################################
# Setup the python interpreter:
# A user-supplied PYTHON_EXECUTABLE always wins. Otherwise the built-in
# FindPythonInterp module is tried first, then a plain program search.
########################################################################
if(NOT PYTHON_EXECUTABLE)
    # nothing given by the user: ask the built-in find script for a Python 2
    find_package(PythonInterp 2)

    # last resort: look for a suitably named program on the search path
    if(NOT PYTHONINTERP_FOUND)
        find_program(PYTHON_EXECUTABLE NAMES python python2 python2.7 python2.6 python2.5)
        if(PYTHON_EXECUTABLE)
            set(PYTHONINTERP_FOUND TRUE)
        endif()
    endif()
else()
    # the user override is trusted as-is
    set(PYTHONINTERP_FOUND TRUE)
endif()

# expose the interpreter path as a cache entry so it shows up in the cmake gui
set(PYTHON_EXECUTABLE ${PYTHON_EXECUTABLE} CACHE FILEPATH "python interpreter")
# Probe whether the interpreter accepts -B (do not write .pyc files; added in
# Python 2.6). PYTHON_DASH_B is set to "-B" only when the probe exits with 0.
if(PYTHON_EXECUTABLE)
    execute_process(
        COMMAND ${PYTHON_EXECUTABLE} -B -c ""
        RESULT_VARIABLE PYTHON_HAS_DASH_B_RESULT
        OUTPUT_QUIET
        ERROR_QUIET
    )
    if(PYTHON_HAS_DASH_B_RESULT EQUAL 0)
        set(PYTHON_DASH_B "-B")
    endif()
endif()
########################################################################
# Check for the existence of a python module:
# - desc a string description of the check
# - mod the name of the module to import
# - cmd an additional command to run
# - have the result variable to set
#
# The check passes when either `import <mod>` succeeds or, failing that,
# `<mod>` evaluates as an expression, AND `assert <cmd>` holds.
# ${have} is set to TRUE or FALSE in the caller's scope (this is a macro).
########################################################################
macro(VOLK_PYTHON_CHECK_MODULE desc mod cmd have)
    message(STATUS "")
    message(STATUS "Python checking for ${desc}")
    # The embedded script must start at column 0 and the nested try/except
    # under `except:` must be indented, otherwise Python rejects the whole
    # script and every check reports "not found".
    execute_process(
        COMMAND ${PYTHON_EXECUTABLE} -c "
#########################################
try: import ${mod}
except:
    try: ${mod}
    except: exit(-1)
try: assert ${cmd}
except: exit(-1)
#########################################"
        RESULT_VARIABLE ${have}
    )
    # Convert the process exit status into a boolean.
    if(${have} EQUAL 0)
        message(STATUS "Python checking for ${desc} - found")
        set(${have} TRUE)
    else()
        message(STATUS "Python checking for ${desc} - not found")
        set(${have} FALSE)
    endif()
endmacro(VOLK_PYTHON_CHECK_MODULE)
########################################################################
# Sets the python installation directory VOLK_PYTHON_DIR
# (the platform-specific site-packages path, relative to the prefix).
########################################################################
# print() with a single argument behaves the same on Python 2 and Python 3,
# so a user-supplied Python 3 PYTHON_EXECUTABLE no longer breaks this query.
execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "
from distutils import sysconfig
print(sysconfig.get_python_lib(plat_specific=True, prefix=''))
" OUTPUT_VARIABLE VOLK_PYTHON_DIR OUTPUT_STRIP_TRAILING_WHITESPACE
)
# Quoted so that an empty result (interpreter missing or failed) does not
# turn into a "wrong number of arguments" configure error.
file(TO_CMAKE_PATH "${VOLK_PYTHON_DIR}" VOLK_PYTHON_DIR)
########################################################################
# Create an always-built target with a unique name
# Usage: VOLK_UNIQUE_TARGET(<description> <dependencies list>)
#
# The target name is "<description> <relative binary dir> <hash>" with every
# non-word character replaced by '_'. The hash is the first five hex digits
# of the MD5 of the relative directory plus the dependency list.
########################################################################
function(VOLK_UNIQUE_TARGET desc)
    file(RELATIVE_PATH reldir ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR})
    # hashlib.md5() needs bytes on Python 3, so the key is encoded there; on
    # Python 2 the str is already a byte string and is passed through as-is.
    # The regex is a raw string (CMake turns \\W into \W) to avoid the
    # invalid-escape warning of newer interpreters.
    execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import re, hashlib, sys
key = '${reldir}${ARGN}'
if sys.version_info[0] >= 3: key = key.encode('utf-8')
unique = hashlib.md5(key).hexdigest()[:5]
print(re.sub(r'\\W', '_', '${desc} ${reldir} ' + unique))"
        OUTPUT_VARIABLE _target OUTPUT_STRIP_TRAILING_WHITESPACE)
    # ALL makes the target part of the default build.
    add_custom_target(${_target} ALL DEPENDS ${ARGN})
endfunction(VOLK_UNIQUE_TARGET)
########################################################################
# Install python sources (also builds and installs byte-compiled python)
#
# Usage:
#   VOLK_PYTHON_INSTALL(FILES <files...> DESTINATION <dir> [COMPONENT <name>])
#   VOLK_PYTHON_INSTALL(PROGRAMS <files...> DESTINATION <dir> [COMPONENT <name>])
#
# FILES:    installs the sources as given and additionally generates and
#           installs a .pyc and a .pyo next to each of them.
# PROGRAMS: installs a copy of each script with a "#!" interpreter line
#           prepended. Only evaluated when FILES is empty (elseif).
# In both modes the generated files are attached to an always-built target.
########################################################################
function(VOLK_PYTHON_INSTALL)
# NOTE(review): CMakeParseArgumentsCopy is a project-local module not shown here.
include(CMakeParseArgumentsCopy)
# Results are stored as VOLK_PYTHON_INSTALL_<KEYWORD>.
CMAKE_PARSE_ARGUMENTS(VOLK_PYTHON_INSTALL "" "DESTINATION;COMPONENT" "FILES;PROGRAMS" ${ARGN})
####################################################################
if(VOLK_PYTHON_INSTALL_FILES)
####################################################################
# the original argument list is forwarded unchanged to install()
install(${ARGN}) #installs regular python files
#create a list of all generated files
unset(pysrcfiles)
unset(pycfiles)
unset(pyofiles)
foreach(pyfile ${VOLK_PYTHON_INSTALL_FILES})
get_filename_component(pyfile ${pyfile} ABSOLUTE)
list(APPEND pysrcfiles ${pyfile})
#determine if this file is in the source or binary directory
#(the shorter of the two relative paths is taken to be the one that
# does not have to climb out of its base directory)
file(RELATIVE_PATH source_rel_path ${CMAKE_CURRENT_SOURCE_DIR} ${pyfile})
string(LENGTH "${source_rel_path}" source_rel_path_len)
file(RELATIVE_PATH binary_rel_path ${CMAKE_CURRENT_BINARY_DIR} ${pyfile})
string(LENGTH "${binary_rel_path}" binary_rel_path_len)
#and set the generated path appropriately
#(generated byte code always lands below the current binary directory)
if(${source_rel_path_len} GREATER ${binary_rel_path_len})
set(pygenfile ${CMAKE_CURRENT_BINARY_DIR}/${binary_rel_path})
else()
set(pygenfile ${CMAKE_CURRENT_BINARY_DIR}/${source_rel_path})
endif()
#foo.py -> foo.pyc / foo.pyo
list(APPEND pycfiles ${pygenfile}c)
list(APPEND pyofiles ${pygenfile}o)
#ensure generation path exists
get_filename_component(pygen_path ${pygenfile} PATH)
file(MAKE_DIRECTORY ${pygen_path})
endforeach(pyfile)
#the command to generate the pyc files
#(the helper script receives all sources followed by all outputs)
add_custom_command(
DEPENDS ${pysrcfiles} OUTPUT ${pycfiles}
COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_BINARY_DIR}/python_compile_helper.py ${pysrcfiles} ${pycfiles}
)
#the command to generate the pyo files
#(same helper, run with -O)
add_custom_command(
DEPENDS ${pysrcfiles} OUTPUT ${pyofiles}
COMMAND ${PYTHON_EXECUTABLE} -O ${CMAKE_BINARY_DIR}/python_compile_helper.py ${pysrcfiles} ${pyofiles}
)
#create install rule and add generated files to target list
set(python_install_gen_targets ${pycfiles} ${pyofiles})
install(FILES ${python_install_gen_targets}
DESTINATION ${VOLK_PYTHON_INSTALL_DESTINATION}
COMPONENT ${VOLK_PYTHON_INSTALL_COMPONENT}
)
####################################################################
elseif(VOLK_PYTHON_INSTALL_PROGRAMS)
####################################################################
#interpreter path to write after "#!"; when cross compiling the build
#host path is replaced by a lookup through env
file(TO_NATIVE_PATH ${PYTHON_EXECUTABLE} pyexe_native)
if (CMAKE_CROSSCOMPILING)
set(pyexe_native "/usr/bin/env python")
endif()
foreach(pyfile ${VOLK_PYTHON_INSTALL_PROGRAMS})
get_filename_component(pyfile_name ${pyfile} NAME)
get_filename_component(pyfile ${pyfile} ABSOLUTE)
#the generated copy lives at the mirrored location in the build tree,
#with ".exe" appended to keep it distinct from the source file
string(REPLACE "${CMAKE_SOURCE_DIR}" "${CMAKE_BINARY_DIR}" pyexefile "${pyfile}.exe")
list(APPEND python_install_gen_targets ${pyexefile})
get_filename_component(pyexefile_path ${pyexefile} PATH)
file(MAKE_DIRECTORY ${pyexefile_path})
#write "#!<interpreter>\n" followed by the original script contents
add_custom_command(
OUTPUT ${pyexefile} DEPENDS ${pyfile}
COMMAND ${PYTHON_EXECUTABLE} -c
"open('${pyexefile}','w').write('\#!${pyexe_native}\\n'+open('${pyfile}').read())"
COMMENT "Shebangin ${pyfile_name}"
VERBATIM
)
#on windows, python files need an extension to execute
get_filename_component(pyfile_ext ${pyfile} EXT)
if(WIN32 AND NOT pyfile_ext)
set(pyfile_name "${pyfile_name}.py")
endif()
#installed under the original file name, not the ".exe" build name
install(PROGRAMS ${pyexefile} RENAME ${pyfile_name}
DESTINATION ${VOLK_PYTHON_INSTALL_DESTINATION}
COMPONENT ${VOLK_PYTHON_INSTALL_COMPONENT}
)
endforeach(pyfile)
endif()
#attach every generated file to an always-built target so the custom
#commands above actually run during a normal build
VOLK_UNIQUE_TARGET("pygen" ${python_install_gen_targets})
endfunction(VOLK_PYTHON_INSTALL)
########################################################################
# Write the python helper script that generates byte code files
#
# The script is called as: helper.py <src1> ... <srcN> <out1> ... <outN>
# i.e. the first half of the arguments are sources and the second half are
# the matching byte-code output paths.
########################################################################
# Floor division (//) keeps the slice indices integers on Python 3 as well as
# Python 2, and the loop body is indented so the script is valid Python.
file(WRITE ${CMAKE_BINARY_DIR}/python_compile_helper.py "
import sys, py_compile
files = sys.argv[1:]
srcs, gens = files[:len(files)//2], files[len(files)//2:]
for src, gen in zip(srcs, gens):
    py_compile.compile(file=src, cfile=gen, doraise=True)
")

View File

@ -0,0 +1,98 @@
# Copyright 2010-2011 Free Software Foundation, Inc.
#
# This file is part of GNU Radio
#
# GNU Radio is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3, or (at your option)
# any later version.
#
# GNU Radio is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with GNU Radio; see the file COPYING. If not, write to
# the Free Software Foundation, Inc., 51 Franklin Street,
# Boston, MA 02110-1301, USA.
# Include guard: process this module once; later includes return immediately.
if(NOT DEFINED __INCLUDED_VOLK_BOOST_CMAKE)
    set(__INCLUDED_VOLK_BOOST_CMAKE TRUE)
else()
    return()
endif()
########################################################################
# Setup Boost and handle some system specific things
########################################################################
set(BOOST_REQUIRED_COMPONENTS
    filesystem
    system
    unit_test_framework
    program_options
)

if(UNIX AND NOT BOOST_ROOT AND EXISTS "/usr/lib64")
    list(APPEND BOOST_LIBRARYDIR "/usr/lib64") #fedora 64-bit fix
endif()

if(MSVC)
    set(BOOST_REQUIRED_COMPONENTS ${BOOST_REQUIRED_COMPONENTS} chrono)
    if(NOT DEFINED BOOST_ALL_DYN_LINK)
        set(BOOST_ALL_DYN_LINK TRUE)
    endif()
    set(BOOST_ALL_DYN_LINK "${BOOST_ALL_DYN_LINK}" CACHE BOOL "boost enable dynamic linking")
    if(BOOST_ALL_DYN_LINK)
        add_definitions(-DBOOST_ALL_DYN_LINK) #setup boost auto-linking in msvc
    else()
        unset(BOOST_REQUIRED_COMPONENTS) #empty components list for static link
    endif()
endif()

# This does not allow us to disable specific versions. It is used
# internally by cmake to know the formation of newer versions. As newer
# Boost versions beyond what is shown here are produced, we must extend
# this list. To disable Boost versions, see below.
#
# FindBoost only reads this list while find_package(Boost) runs, so it has
# to be defined BEFORE the find_package() call to have any effect on it.
set(Boost_ADDITIONAL_VERSIONS
    "1.35.0" "1.35" "1.36.0" "1.36" "1.37.0" "1.37" "1.38.0" "1.38" "1.39.0" "1.39"
    "1.40.0" "1.40" "1.41.0" "1.41" "1.42.0" "1.42" "1.43.0" "1.43" "1.44.0" "1.44"
    "1.45.0" "1.45" "1.46.0" "1.46" "1.47.0" "1.47" "1.48.0" "1.48" "1.49.0" "1.49"
    "1.50.0" "1.50" "1.51.0" "1.51" "1.52.0" "1.52" "1.53.0" "1.53" "1.54.0" "1.54"
    "1.55.0" "1.55" "1.56.0" "1.56" "1.57.0" "1.57" "1.58.0" "1.58" "1.59.0" "1.59"
    "1.60.0" "1.60" "1.61.0" "1.61" "1.62.0" "1.62" "1.63.0" "1.63" "1.64.0" "1.64"
    "1.65.0" "1.65" "1.66.0" "1.66" "1.67.0" "1.67" "1.68.0" "1.68" "1.69.0" "1.69"
)

find_package(Boost "1.35" COMPONENTS ${BOOST_REQUIRED_COMPONENTS})
# Boost 1.52 disabled, see https://svn.boost.org/trac/boost/ticket/7669
# Similar problems with Boost 1.46 and 1.47.
option(ENABLE_BAD_BOOST "Enable known bad versions of Boost" OFF)
if(ENABLE_BAD_BOOST)
    message(STATUS "Enabling use of known bad versions of Boost.")
endif()

# For any unsuitable Boost version, add the version number below in
# the following format: XXYYZZ
# Where:
# XX is the major version ('10' for version 1)
# YY is the minor version number ('46' for 1.46)
# ZZ is the patch version number (typically just '00')
set(Boost_NOGO_VERSIONS
    104600 104601 104700 105200
)

# Only a Boost that was actually found can be "bad". Without this guard an
# empty Boost_VERSION made the comparison below a configure-time error, and
# ENABLE_BAD_BOOST could flip Boost_FOUND to TRUE for a Boost that is missing.
if(Boost_FOUND)
    # Newer FindBoost versions may report Boost_VERSION as x.y.z and provide
    # the numeric XXYYZZ form in Boost_VERSION_MACRO; prefer that when present.
    if(DEFINED Boost_VERSION_MACRO)
        set(_volk_boost_version_num "${Boost_VERSION_MACRO}")
    else()
        set(_volk_boost_version_num "${Boost_VERSION}")
    endif()

    foreach(ver ${Boost_NOGO_VERSIONS})
        if("${_volk_boost_version_num}" EQUAL "${ver}")
            if(NOT ENABLE_BAD_BOOST)
                message(STATUS "WARNING: Found a known bad version of Boost (v${Boost_VERSION}). Disabling.")
                set(Boost_FOUND FALSE)
            else()
                # Boost_FOUND is already TRUE here; nothing to change.
                message(STATUS "WARNING: Found a known bad version of Boost (v${Boost_VERSION}). Continuing anyway.")
            endif()
        endif()
    endforeach()

    unset(_volk_boost_version_num)
endif()

View File

@ -0,0 +1,26 @@
# Find module for an installed volk_gnsssdr library.
# Results: VOLK_FOUND, VOLK_INCLUDE_DIRS, VOLK_LIBRARIES.
# The VOLK_DIR environment variable and pkg-config are used as hints before
# the standard system locations are searched.
include(FindPkgConfig)
pkg_check_modules(PC_VOLK volk_gnsssdr)

# header directory (the one that contains volk_gnsssdr/volk_gnsssdr.h)
find_path(VOLK_INCLUDE_DIRS
    NAMES volk_gnsssdr/volk_gnsssdr.h
    HINTS
        $ENV{VOLK_DIR}/include
        ${PC_VOLK_INCLUDEDIR}
    PATHS
        /usr/local/include
        /usr/include
)

# library file
find_library(VOLK_LIBRARIES
    NAMES volk_gnsssdr
    HINTS
        $ENV{VOLK_DIR}/lib
        ${PC_VOLK_LIBDIR}
    PATHS
        /usr/local/lib
        /usr/local/lib64
        /usr/lib
        /usr/lib64
)

# VOLK_FOUND is TRUE only when both the library and the headers were located
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(VOLK DEFAULT_MSG VOLK_LIBRARIES VOLK_INCLUDE_DIRS)
mark_as_advanced(VOLK_LIBRARIES VOLK_INCLUDE_DIRS)

View File

@ -0,0 +1,58 @@
#ifndef _MSC_VER // [
#error "Use this header only with Microsoft Visual C++ compilers!"
#endif // _MSC_VER ]

// Compatibility shims that supply C99/POSIX names missing from older
// Microsoft C runtimes. Every shim is guarded so that it is skipped when the
// toolchain (or an earlier header) already provides the name.
#ifndef _MSC_CONFIG_H_ // [
#define _MSC_CONFIG_H_

////////////////////////////////////////////////////////////////////////
// enable inline functions for C code
////////////////////////////////////////////////////////////////////////
#ifndef __cplusplus
#  define inline __inline
#endif

////////////////////////////////////////////////////////////////////////
// signed size_t
////////////////////////////////////////////////////////////////////////
#include <stddef.h>
typedef ptrdiff_t ssize_t;

////////////////////////////////////////////////////////////////////////
// rint functions
////////////////////////////////////////////////////////////////////////
#include <math.h>
// Visual Studio 2013 (_MSC_VER 1800) and later ship the C99 rounding
// functions in <math.h>; defining them again would be a redefinition error.
// Note: these fallbacks round halfway cases away from zero.
#if _MSC_VER < 1800 // [
static inline long lrint(double x){return (long)(x > 0.0 ? x + 0.5 : x - 0.5);}
static inline long lrintf(float x){return (long)(x > 0.0f ? x + 0.5f : x - 0.5f);}
static inline long long llrint(double x){return (long long)(x > 0.0 ? x + 0.5 : x - 0.5);}
static inline long long llrintf(float x){return (long long)(x > 0.0f ? x + 0.5f : x - 0.5f);}
static inline double rint(double x){return (x > 0.0)? floor(x + 0.5) : ceil(x - 0.5);}
static inline float rintf(float x){return (x > 0.0f)? floorf(x + 0.5f) : ceilf(x - 0.5f);}
#endif // _MSC_VER < 1800 ]

////////////////////////////////////////////////////////////////////////
// math constants
////////////////////////////////////////////////////////////////////////
// Each constant is defined only when <math.h> has not already provided it
// (newer runtimes define INFINITY; _USE_MATH_DEFINES enables the M_* set).
#ifndef INFINITY
#  define INFINITY HUGE_VAL
#endif
#ifndef M_E
#  define M_E 2.7182818284590452354 /* e */
#endif
#ifndef M_LOG2E
#  define M_LOG2E 1.4426950408889634074 /* log_2 e */
#endif
#ifndef M_LOG10E
#  define M_LOG10E 0.43429448190325182765 /* log_10 e */
#endif
#ifndef M_LN2
#  define M_LN2 0.69314718055994530942 /* log_e 2 */
#endif
#ifndef M_LN10
#  define M_LN10 2.30258509299404568402 /* log_e 10 */
#endif
#ifndef M_PI
#  define M_PI 3.14159265358979323846 /* pi */
#endif
#ifndef M_PI_2
#  define M_PI_2 1.57079632679489661923 /* pi/2 */
#endif
#ifndef M_PI_4
#  define M_PI_4 0.78539816339744830962 /* pi/4 */
#endif
#ifndef M_1_PI
#  define M_1_PI 0.31830988618379067154 /* 1/pi */
#endif
#ifndef M_2_PI
#  define M_2_PI 0.63661977236758134308 /* 2/pi */
#endif
#ifndef M_2_SQRTPI
#  define M_2_SQRTPI 1.12837916709551257390 /* 2/sqrt(pi) */
#endif
#ifndef M_SQRT2
#  define M_SQRT2 1.41421356237309504880 /* sqrt(2) */
#endif
#ifndef M_SQRT1_2
#  define M_SQRT1_2 0.70710678118654752440 /* 1/sqrt(2) */
#endif

////////////////////////////////////////////////////////////////////////
// random and srandom
////////////////////////////////////////////////////////////////////////
#include <stdlib.h>
// Thin wrappers that map the POSIX names onto the C library's rand()/srand().
static inline long int random (void) { return rand(); }
static inline void srandom (unsigned int seed) { srand(seed); }

#endif // _MSC_CONFIG_H_ ]

View File

@ -0,0 +1,301 @@
// ISO C9x compliant inttypes.h for Microsoft Visual Studio
// Based on ISO/IEC 9899:TC2 Committee draft (May 6, 2005) WG14/N1124
//
// Copyright (c) 2006 Alexander Chemeris
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// 1. Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. The name of the author may be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
// EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
///////////////////////////////////////////////////////////////////////////////
#ifndef _MSC_VER // [
#error "Use this header only with Microsoft Visual C++ compilers!"
#endif // _MSC_VER ]
#ifndef _MSC_INTTYPES_H_ // [
#define _MSC_INTTYPES_H_
#if _MSC_VER > 1000
#pragma once
#endif
#include <stdint.h>
// 7.8 Format conversion of integer types
// Result type of imaxdiv(): quotient and remainder of one intmax_t division.
typedef struct {
intmax_t quot; // numer / denom
intmax_t rem; // numer % denom
} imaxdiv_t;
// 7.8.1 Macros for format specifiers
// Each PRI* macro expands to a string literal holding the printf length
// modifier plus conversion letter for the named type; it is meant to be
// concatenated after a '%', e.g. printf("%" PRId64 "\n", value).
// 8-bit types use the plain int conversions, 16-bit types use "h", and
// 32/64-bit types use the Microsoft-specific "I32"/"I64" prefixes.
// The fprintf macros for signed integers are:
#define PRId8 "d"
#define PRIi8 "i"
#define PRIdLEAST8 "d"
#define PRIiLEAST8 "i"
#define PRIdFAST8 "d"
#define PRIiFAST8 "i"
#define PRId16 "hd"
#define PRIi16 "hi"
#define PRIdLEAST16 "hd"
#define PRIiLEAST16 "hi"
#define PRIdFAST16 "hd"
#define PRIiFAST16 "hi"
#define PRId32 "I32d"
#define PRIi32 "I32i"
#define PRIdLEAST32 "I32d"
#define PRIiLEAST32 "I32i"
#define PRIdFAST32 "I32d"
#define PRIiFAST32 "I32i"
#define PRId64 "I64d"
#define PRIi64 "I64i"
#define PRIdLEAST64 "I64d"
#define PRIiLEAST64 "I64i"
#define PRIdFAST64 "I64d"
#define PRIiFAST64 "I64i"
#define PRIdMAX "I64d"
#define PRIiMAX "I64i"
// Pointer-sized values use the bare "I" prefix.
#define PRIdPTR "Id"
#define PRIiPTR "Ii"
// The fprintf macros for unsigned integers are:
#define PRIo8 "o"
#define PRIu8 "u"
#define PRIx8 "x"
#define PRIX8 "X"
#define PRIoLEAST8 "o"
#define PRIuLEAST8 "u"
#define PRIxLEAST8 "x"
#define PRIXLEAST8 "X"
#define PRIoFAST8 "o"
#define PRIuFAST8 "u"
#define PRIxFAST8 "x"
#define PRIXFAST8 "X"
#define PRIo16 "ho"
#define PRIu16 "hu"
#define PRIx16 "hx"
#define PRIX16 "hX"
#define PRIoLEAST16 "ho"
#define PRIuLEAST16 "hu"
#define PRIxLEAST16 "hx"
#define PRIXLEAST16 "hX"
#define PRIoFAST16 "ho"
#define PRIuFAST16 "hu"
#define PRIxFAST16 "hx"
#define PRIXFAST16 "hX"
#define PRIo32 "I32o"
#define PRIu32 "I32u"
#define PRIx32 "I32x"
#define PRIX32 "I32X"
#define PRIoLEAST32 "I32o"
#define PRIuLEAST32 "I32u"
#define PRIxLEAST32 "I32x"
#define PRIXLEAST32 "I32X"
#define PRIoFAST32 "I32o"
#define PRIuFAST32 "I32u"
#define PRIxFAST32 "I32x"
#define PRIXFAST32 "I32X"
#define PRIo64 "I64o"
#define PRIu64 "I64u"
#define PRIx64 "I64x"
#define PRIX64 "I64X"
#define PRIoLEAST64 "I64o"
#define PRIuLEAST64 "I64u"
#define PRIxLEAST64 "I64x"
#define PRIXLEAST64 "I64X"
#define PRIoFAST64 "I64o"
#define PRIuFAST64 "I64u"
#define PRIxFAST64 "I64x"
#define PRIXFAST64 "I64X"
#define PRIoMAX "I64o"
#define PRIuMAX "I64u"
#define PRIxMAX "I64x"
#define PRIXMAX "I64X"
#define PRIoPTR "Io"
#define PRIuPTR "Iu"
#define PRIxPTR "Ix"
#define PRIXPTR "IX"
// The fscanf macros for signed integers are:
// Each SCN* macro expands to the scanf length modifier plus conversion
// letter for the named type, to be concatenated after a '%'.
// NOTE(review): the 8-bit macros carry no length modifier, so scanf will
// store through an int-sized pointer - confirm that callers never pass the
// address of an 8-bit object with these.
#define SCNd8 "d"
#define SCNi8 "i"
#define SCNdLEAST8 "d"
#define SCNiLEAST8 "i"
#define SCNdFAST8 "d"
#define SCNiFAST8 "i"
#define SCNd16 "hd"
#define SCNi16 "hi"
#define SCNdLEAST16 "hd"
#define SCNiLEAST16 "hi"
#define SCNdFAST16 "hd"
#define SCNiFAST16 "hi"
// Unlike the PRI*32 macros, the 32-bit scan macros use the "l" modifier.
#define SCNd32 "ld"
#define SCNi32 "li"
#define SCNdLEAST32 "ld"
#define SCNiLEAST32 "li"
#define SCNdFAST32 "ld"
#define SCNiFAST32 "li"
#define SCNd64 "I64d"
#define SCNi64 "I64i"
#define SCNdLEAST64 "I64d"
#define SCNiLEAST64 "I64i"
#define SCNdFAST64 "I64d"
#define SCNiFAST64 "I64i"
#define SCNdMAX "I64d"
#define SCNiMAX "I64i"
// Pointer-sized values: 64-bit modifier on Win64, "l" otherwise.
#ifdef _WIN64 // [
# define SCNdPTR "I64d"
# define SCNiPTR "I64i"
#else // _WIN64 ][
# define SCNdPTR "ld"
# define SCNiPTR "li"
#endif // _WIN64 ]
// The fscanf macros for unsigned integers are:
#define SCNo8 "o"
#define SCNu8 "u"
#define SCNx8 "x"
#define SCNX8 "X"
#define SCNoLEAST8 "o"
#define SCNuLEAST8 "u"
#define SCNxLEAST8 "x"
#define SCNXLEAST8 "X"
#define SCNoFAST8 "o"
#define SCNuFAST8 "u"
#define SCNxFAST8 "x"
#define SCNXFAST8 "X"
#define SCNo16 "ho"
#define SCNu16 "hu"
#define SCNx16 "hx"
#define SCNX16 "hX"
#define SCNoLEAST16 "ho"
#define SCNuLEAST16 "hu"
#define SCNxLEAST16 "hx"
#define SCNXLEAST16 "hX"
#define SCNoFAST16 "ho"
#define SCNuFAST16 "hu"
#define SCNxFAST16 "hx"
#define SCNXFAST16 "hX"
#define SCNo32 "lo"
#define SCNu32 "lu"
#define SCNx32 "lx"
#define SCNX32 "lX"
#define SCNoLEAST32 "lo"
#define SCNuLEAST32 "lu"
#define SCNxLEAST32 "lx"
#define SCNXLEAST32 "lX"
#define SCNoFAST32 "lo"
#define SCNuFAST32 "lu"
#define SCNxFAST32 "lx"
#define SCNXFAST32 "lX"
#define SCNo64 "I64o"
#define SCNu64 "I64u"
#define SCNx64 "I64x"
#define SCNX64 "I64X"
#define SCNoLEAST64 "I64o"
#define SCNuLEAST64 "I64u"
#define SCNxLEAST64 "I64x"
#define SCNXLEAST64 "I64X"
#define SCNoFAST64 "I64o"
#define SCNuFAST64 "I64u"
#define SCNxFAST64 "I64x"
#define SCNXFAST64 "I64X"
#define SCNoMAX "I64o"
#define SCNuMAX "I64u"
#define SCNxMAX "I64x"
#define SCNXMAX "I64X"
// Pointer-sized values: 64-bit modifier on Win64, "l" otherwise.
#ifdef _WIN64 // [
# define SCNoPTR "I64o"
# define SCNuPTR "I64u"
# define SCNxPTR "I64x"
# define SCNXPTR "I64X"
#else // _WIN64 ][
# define SCNoPTR "lo"
# define SCNuPTR "lu"
# define SCNxPTR "lx"
# define SCNXPTR "lX"
#endif // _WIN64 ]
// 7.8.2 Functions for greatest-width integer types
// 7.8.2.1 The imaxabs function
// Mapped directly onto the Microsoft 64-bit absolute-value routine.
#define imaxabs _abs64
// 7.8.2.2 The imaxdiv function
// This is modified version of div() function from Microsoft's div.c found
// in %MSVC.NET%\crt\src\div.c
//
// Returns the quotient and remainder of numer / denom in one imaxdiv_t.
// Define STATIC_IMAXDIV before including this header to get a 'static'
// definition instead of the default '_inline' one.
// NOTE(review): denom == 0 is not checked here; the caller must avoid it.
#ifdef STATIC_IMAXDIV // [
static
#else // STATIC_IMAXDIV ][
_inline
#endif // STATIC_IMAXDIV ]
imaxdiv_t __cdecl imaxdiv(intmax_t numer, intmax_t denom)
{
imaxdiv_t result;
result.quot = numer / denom;
result.rem = numer % denom;
// A negative numerator with a positive remainder means the division did
// not truncate toward zero; adjust both fields so that it does.
if (numer < 0 && result.rem > 0) {
// did division wrong; must fix up
++result.quot;
result.rem -= denom;
}
return result;
}
// 7.8.2.3 The strtoimax and strtoumax functions
// Mapped onto the Microsoft 64-bit string conversion routines.
#define strtoimax _strtoi64
#define strtoumax _strtoui64
// 7.8.2.4 The wcstoimax and wcstoumax functions
// Wide-character counterparts, mapped the same way.
#define wcstoimax _wcstoi64
#define wcstoumax _wcstoui64
#endif // _MSC_INTTYPES_H_ ]

View File

@ -0,0 +1,45 @@
/*
* Copyright (C) 2005, 2006 Apple Computer, Inc.
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Library General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Library General Public License for more details.
*
* You should have received a copy of the GNU Library General Public License
* along with this library; see the file COPYING.LIB. If not, write to
* the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
* Boston, MA 02110-1301, USA.
*
*/
// Minimal <stdbool.h> replacement for Microsoft compilers in C mode.
#ifndef STDBOOL_WIN32_H
#define STDBOOL_WIN32_H
#ifndef _MSC_VER // [
#error "Use this header only with Microsoft Visual C++ compilers!"
#endif // _MSC_VER ]
// In C++ bool/true/false are keywords, so nothing is defined there.
#ifndef __cplusplus
typedef unsigned char bool;
#define true 1
#define false 0
// Compile-time assertion: declares an array typedef whose size is -1 (a
// compile error) when 'exp' is false. 'name' only makes the typedef unique.
#ifndef CASSERT
#define CASSERT(exp, name) typedef int dummy##name [(exp) ? 1 : -1];
#endif
// Sanity checks on the definitions above.
CASSERT(sizeof(bool) == 1, bool_is_one_byte)
CASSERT(true, true_is_true)
CASSERT(!false, false_is_false)
#endif
#endif

View File

@ -0,0 +1,251 @@
// ISO C9x compliant stdint.h for Microsoft Visual Studio
// Based on ISO/IEC 9899:TC2 Committee draft (May 6, 2005) WG14/N1124
//
// Copyright (c) 2006-2008 Alexander Chemeris
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// 1. Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. The name of the author may be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
// EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
///////////////////////////////////////////////////////////////////////////////
#ifndef _MSC_VER // [
#error "Use this header only with Microsoft Visual C++ compilers!"
#endif // _MSC_VER ]
#ifndef _MSC_STDINT_H_ // [
#define _MSC_STDINT_H_
#if _MSC_VER > 1000
#pragma once
#endif
#include <limits.h>
// For Visual Studio 6 in C++ mode and for many Visual Studio versions when
// compiling for ARM we should wrap <wchar.h> include with 'extern "C++" {}'
// or compiler give many errors like this:
// error C2733: second C linkage of overloaded function 'wmemchr' not allowed
#ifdef __cplusplus
extern "C" {
#endif
# include <wchar.h>
#ifdef __cplusplus
}
#endif
// Define _W64 macros to mark types changing their size, like intptr_t.
// It expands to the __w64 keyword only for 32-bit x86 builds with
// _MSC_VER >= 1300 (and not under MIDL); everywhere else it expands to nothing.
#ifndef _W64
# if !defined(__midl) && (defined(_X86_) || defined(_M_IX86)) && _MSC_VER >= 1300
# define _W64 __w64
# else
# define _W64
# endif
#endif
// 7.18.1 Integer types
// 7.18.1.1 Exact-width integer types
// Visual Studio 6 and Embedded Visual C++ 4 doesn't
// realize that, e.g. char has the same size as __int8
// so we give up on __intX for them.
// (Older compilers get the plain C types, newer ones the sized __intN types.)
#if (_MSC_VER < 1300)
typedef signed char int8_t;
typedef signed short int16_t;
typedef signed int int32_t;
typedef unsigned char uint8_t;
typedef unsigned short uint16_t;
typedef unsigned int uint32_t;
#else
typedef signed __int8 int8_t;
typedef signed __int16 int16_t;
typedef signed __int32 int32_t;
typedef unsigned __int8 uint8_t;
typedef unsigned __int16 uint16_t;
typedef unsigned __int32 uint32_t;
#endif
// The 64-bit types use __int64 for every compiler version.
typedef signed __int64 int64_t;
typedef unsigned __int64 uint64_t;
// 7.18.1.2 Minimum-width integer types
// Each "least" type is simply the exact-width type of the same size.
typedef int8_t int_least8_t;
typedef int16_t int_least16_t;
typedef int32_t int_least32_t;
typedef int64_t int_least64_t;
typedef uint8_t uint_least8_t;
typedef uint16_t uint_least16_t;
typedef uint32_t uint_least32_t;
typedef uint64_t uint_least64_t;
// 7.18.1.3 Fastest minimum-width integer types
// Each "fast" type is likewise the exact-width type of the same size.
typedef int8_t int_fast8_t;
typedef int16_t int_fast16_t;
typedef int32_t int_fast32_t;
typedef int64_t int_fast64_t;
typedef uint8_t uint_fast8_t;
typedef uint16_t uint_fast16_t;
typedef uint32_t uint_fast32_t;
typedef uint64_t uint_fast64_t;
// 7.18.1.4 Integer types capable of holding object pointers
// 64 bits wide on Win64, otherwise int-sized and tagged with _W64.
#ifdef _WIN64 // [
typedef signed __int64 intptr_t;
typedef unsigned __int64 uintptr_t;
#else // _WIN64 ][
typedef _W64 signed int intptr_t;
typedef _W64 unsigned int uintptr_t;
#endif // _WIN64 ]
// 7.18.1.5 Greatest-width integer types
typedef int64_t intmax_t;
typedef uint64_t uintmax_t;
// 7.18.2 Limits of specified-width integer types
// In C++ these macros are only defined when the includer has defined
// __STDC_LIMIT_MACROS; in C they are always defined.
#if !defined(__cplusplus) || defined(__STDC_LIMIT_MACROS) // [ See footnote 220 at page 257 and footnote 221 at page 259
// 7.18.2.1 Limits of exact-width integer types
// The _I*_MIN/_I*_MAX/_UI*_MAX names are not defined in this header;
// presumably they come from the <limits.h> included above - confirm.
#define INT8_MIN ((int8_t)_I8_MIN)
#define INT8_MAX _I8_MAX
#define INT16_MIN ((int16_t)_I16_MIN)
#define INT16_MAX _I16_MAX
#define INT32_MIN ((int32_t)_I32_MIN)
#define INT32_MAX _I32_MAX
#define INT64_MIN ((int64_t)_I64_MIN)
#define INT64_MAX _I64_MAX
#define UINT8_MAX _UI8_MAX
#define UINT16_MAX _UI16_MAX
#define UINT32_MAX _UI32_MAX
#define UINT64_MAX _UI64_MAX
// 7.18.2.2 Limits of minimum-width integer types
// Same values as the exact-width limits, matching the typedefs.
#define INT_LEAST8_MIN INT8_MIN
#define INT_LEAST8_MAX INT8_MAX
#define INT_LEAST16_MIN INT16_MIN
#define INT_LEAST16_MAX INT16_MAX
#define INT_LEAST32_MIN INT32_MIN
#define INT_LEAST32_MAX INT32_MAX
#define INT_LEAST64_MIN INT64_MIN
#define INT_LEAST64_MAX INT64_MAX
#define UINT_LEAST8_MAX UINT8_MAX
#define UINT_LEAST16_MAX UINT16_MAX
#define UINT_LEAST32_MAX UINT32_MAX
#define UINT_LEAST64_MAX UINT64_MAX
// 7.18.2.3 Limits of fastest minimum-width integer types
#define INT_FAST8_MIN INT8_MIN
#define INT_FAST8_MAX INT8_MAX
#define INT_FAST16_MIN INT16_MIN
#define INT_FAST16_MAX INT16_MAX
#define INT_FAST32_MIN INT32_MIN
#define INT_FAST32_MAX INT32_MAX
#define INT_FAST64_MIN INT64_MIN
#define INT_FAST64_MAX INT64_MAX
#define UINT_FAST8_MAX UINT8_MAX
#define UINT_FAST16_MAX UINT16_MAX
#define UINT_FAST32_MAX UINT32_MAX
#define UINT_FAST64_MAX UINT64_MAX
// 7.18.2.4 Limits of integer types capable of holding object pointers
#ifdef _WIN64 // [
# define INTPTR_MIN INT64_MIN
# define INTPTR_MAX INT64_MAX
# define UINTPTR_MAX UINT64_MAX
#else // _WIN64 ][
# define INTPTR_MIN INT32_MIN
# define INTPTR_MAX INT32_MAX
# define UINTPTR_MAX UINT32_MAX
#endif // _WIN64 ]
// 7.18.2.5 Limits of greatest-width integer types
#define INTMAX_MIN INT64_MIN
#define INTMAX_MAX INT64_MAX
#define UINTMAX_MAX UINT64_MAX
// 7.18.3 Limits of other integer types
#ifdef _WIN64 // [
# define PTRDIFF_MIN _I64_MIN
# define PTRDIFF_MAX _I64_MAX
#else // _WIN64 ][
# define PTRDIFF_MIN _I32_MIN
# define PTRDIFF_MAX _I32_MAX
#endif // _WIN64 ]
#define SIG_ATOMIC_MIN INT_MIN
#define SIG_ATOMIC_MAX INT_MAX
// SIZE_MAX is only defined here when no earlier header has done so.
#ifndef SIZE_MAX // [
# ifdef _WIN64 // [
# define SIZE_MAX _UI64_MAX
# else // _WIN64 ][
# define SIZE_MAX _UI32_MAX
# endif // _WIN64 ]
#endif // SIZE_MAX ]
// WCHAR_MIN and WCHAR_MAX are also defined in <wchar.h>
#ifndef WCHAR_MIN // [
# define WCHAR_MIN 0
#endif // WCHAR_MIN ]
#ifndef WCHAR_MAX // [
# define WCHAR_MAX _UI16_MAX
#endif // WCHAR_MAX ]
#define WINT_MIN 0
#define WINT_MAX _UI16_MAX
#endif // __STDC_LIMIT_MACROS ]
// 7.18.4 Macros for integer constants
// In C++ these macros are only defined when the includer has defined
// __STDC_CONSTANT_MACROS; in C they are always defined.
#if !defined(__cplusplus) || defined(__STDC_CONSTANT_MACROS) // [ See footnote 224 at page 260
// 7.18.4.1 Macros for minimum-width integer constants
// Each macro pastes a Microsoft sized-integer suffix (i8, ui64, ...) onto
// the literal, so 'val' must be a plain integer literal token.
#define INT8_C(val) val##i8
#define INT16_C(val) val##i16
#define INT32_C(val) val##i32
#define INT64_C(val) val##i64
#define UINT8_C(val) val##ui8
#define UINT16_C(val) val##ui16
#define UINT32_C(val) val##ui32
#define UINT64_C(val) val##ui64
// 7.18.4.2 Macros for greatest-width integer constants
// Guarded so that an earlier definition of either macro is kept.
#ifndef INTMAX_C
#define INTMAX_C INT64_C
#endif
#ifndef UINTMAX_C
#define UINTMAX_C UINT64_C
#endif
#endif // __STDC_CONSTANT_MACROS ]
#endif // _MSC_STDINT_H_ ]

View File

@ -0,0 +1,204 @@
<!-- archs appear in order of significance for blind, de-facto version ordering -->
<grammar>
<arch name="generic"> <!-- name is required-->
</arch>
<arch name="altivec">
<flag compiler="gnu">-maltivec</flag>
<alignment>16</alignment>
<check name="has_ppc"></check>
</arch>
<arch name="softfp">
<flag compiler="gnu">-mfloat-abi=softfp</flag>
</arch>
<arch name="hardfp">
<flag compiler="gnu">-mfloat-abi=hard</flag>
</arch>
<arch name="neon">
<flag compiler="gnu">-mfpu=neon</flag>
<flag compiler="gnu">-funsafe-math-optimizations</flag>
<alignment>16</alignment>
<check name="has_neon"></check>
</arch>
<arch name="32">
<flag compiler="gnu">-m32</flag>
</arch>
<arch name="64">
<check name="check_extended_cpuid">
<param>0x80000001</param>
</check>
<check name="cpuid_x86_bit"> <!-- checks to see if a bit is set -->
<param>3</param> <!-- eax, ebx, ecx, [edx] -->
<param>0x80000001</param> <!-- cpuid operation -->
<param>29</param> <!-- bit shift -->
</check>
<flag compiler="gnu">-m64</flag>
<flag compiler="clang">-m64</flag>
</arch>
<arch name="3dnow">
<check name="cpuid_x86_bit">
<param>3</param>
<param>0x80000001</param>
<param>31</param>
</check>
<flag compiler="gnu">-m3dnow</flag>
<flag compiler="clang">-m3dnow</flag>
<alignment>8</alignment>
</arch>
<arch name="abm">
<check name="cpuid_x86_bit">
<param>3</param>
<param>0x80000001</param>
<param>5</param>
</check>
<flag compiler="gnu">-msse4.2</flag>
<flag compiler="clang">-msse4.2</flag>
<alignment>16</alignment>
</arch>
<arch name="popcount">
<check name="cpuid_x86_bit">
<param>2</param>
<param>0x00000001</param>
<param>23</param>
</check>
<flag compiler="gnu">-mpopcnt</flag>
<flag compiler="clang">-mpopcnt</flag>
<flag compiler="msvc">/arch:AVX</flag>
</arch>
<arch name="mmx">
<check name="cpuid_x86_bit">
<param>3</param>
<param>0x00000001</param>
<param>23</param>
</check>
<flag compiler="gnu">-mmmx</flag>
<flag compiler="clang">-mmmx</flag>
<flag compiler="msvc">/arch:SSE</flag>
<alignment>8</alignment>
</arch>
<arch name="sse">
<check name="cpuid_x86_bit">
<param>3</param>
<param>0x00000001</param>
<param>25</param>
</check>
<flag compiler="gnu">-msse</flag>
<flag compiler="clang">-msse</flag>
<flag compiler="msvc">/arch:SSE</flag>
<environment>_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);</environment>
<include>xmmintrin.h</include>
<alignment>16</alignment>
</arch>
<arch name="sse2">
<check name="cpuid_x86_bit">
<param>3</param>
<param>0x00000001</param>
<param>26</param>
</check>
<flag compiler="gnu">-msse2</flag>
<flag compiler="clang">-msse2</flag>
<flag compiler="msvc">/arch:SSE2</flag>
<alignment>16</alignment>
</arch>
<arch name="orc">
</arch>
<!-- it's here for overrule stuff. -->
<arch name="norc">
</arch>
<arch name="sse3">
<check name="cpuid_x86_bit">
<param>2</param>
<param>0x00000001</param>
<param>0</param>
</check>
<flag compiler="gnu">-msse3</flag>
<flag compiler="clang">-msse3</flag>
<flag compiler="msvc">/arch:AVX</flag>
<environment>_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);</environment>
<include>pmmintrin.h</include>
<alignment>16</alignment>
</arch>
<arch name="ssse3">
<check name="cpuid_x86_bit">
<param>2</param>
<param>0x00000001</param>
<param>9</param>
</check>
<flag compiler="gnu">-mssse3</flag>
<flag compiler="clang">-mssse3</flag>
<flag compiler="msvc">/arch:AVX</flag>
<alignment>16</alignment>
</arch>
<arch name="sse4_a">
<check name="cpuid_x86_bit">
<param>2</param>
<param>0x80000001</param>
<param>6</param>
</check>
<flag compiler="gnu">-msse4a</flag>
<flag compiler="clang">-msse4a</flag>
<alignment>16</alignment>
</arch>
<arch name="sse4_1">
<check name="cpuid_x86_bit">
<param>2</param>
<param>0x00000001</param>
<param>19</param>
</check>
<flag compiler="gnu">-msse4.1</flag>
<flag compiler="clang">-msse4.1</flag>
<flag compiler="msvc">/arch:AVX</flag>
<alignment>16</alignment>
</arch>
<arch name="sse4_2">
<check name="cpuid_x86_bit">
<param>2</param>
<param>0x00000001</param>
<param>20</param>
</check>
<flag compiler="gnu">-msse4.2</flag>
<flag compiler="clang">-msse4.2</flag>
<flag compiler="msvc">/arch:AVX</flag>
<alignment>16</alignment>
</arch>
<arch name="avx">
<check name="cpuid_x86_bit">
<param>2</param>
<param>0x00000001</param>
<param>28</param>
</check>
<!-- check to make sure that xgetbv is enabled in OS -->
<check name="cpuid_x86_bit">
<param>2</param>
<param>0x00000001</param>
<param>27</param>
</check>
<!-- check to see that the OS has enabled AVX -->
<check name="get_avx_enabled"></check>
<flag compiler="gnu">-mavx</flag>
<flag compiler="clang">-mavx</flag>
<flag compiler="msvc">/arch:AVX</flag>
<alignment>32</alignment>
</arch>
</grammar>

View File

@ -0,0 +1,55 @@
<grammar>
<machine name="generic">
<archs>generic orc|</archs>
</machine>
<!--
<machine name="mmx">
<archs>generic 32|64 mmx orc|</archs>
</machine>
<machine name="sse">
<archs>generic 32|64| mmx| sse orc|</archs>
</machine>
-->
<machine name="neon">
<archs>generic neon softfp|hardfp orc|</archs>
</machine>
<!-- trailing | bar means generate without either for MSVC -->
<machine name="sse2">
<archs>generic 32|64| mmx| sse sse2 orc|</archs>
</machine>
<machine name="sse3">
<archs>generic 32|64 mmx sse sse2 sse3 orc|</archs>
</machine>
<machine name="ssse3">
<archs>generic 32|64 mmx sse sse2 sse3 ssse3 orc|</archs>
</machine>
<machine name="sse4_a">
<archs>generic 32|64 mmx sse sse2 sse3 sse4_a popcount orc|</archs>
</machine>
<machine name="sse4_1">
<archs>generic 32|64 mmx sse sse2 sse3 ssse3 sse4_1 orc|</archs>
</machine>
<machine name="sse4_2">
<archs>generic 32|64 mmx sse sse2 sse3 ssse3 sse4_1 sse4_2 popcount orc|</archs>
</machine>
<!-- trailing | bar means generate without either for MSVC -->
<machine name="avx">
<archs>generic 32|64| mmx| sse sse2 sse3 ssse3 sse4_1 sse4_2 popcount avx orc|</archs>
</machine>
<machine name="altivec">
<archs>generic altivec</archs>
</machine>
</grammar>

View File

@ -0,0 +1,85 @@
#
# Copyright 2012 Free Software Foundation, Inc.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
archs = list()
arch_dict = dict()
class arch_class:
    """One architecture entry: a name plus its compiler flags and checks.

    flags  -- dict mapping a compiler name to the list of flags for it.
    checks -- list of checks, stored unchanged on the instance.
    kwargs -- optional 'name', 'environment', 'include' (converted with str)
              and 'alignment' (converted with int, default 1); any other
              keyword is ignored.
    """
    def __init__(self, flags, checks, **kwargs):
        for key, cast, failval in (
            ('name', str, None),
            ('environment', str, None),
            ('include', str, None),
            ('alignment', int, 1),
        ):
            # A missing key or an unconvertible value falls back to the
            # default; any other error is a real bug and must propagate.
            try:
                value = cast(kwargs[key])
            except (KeyError, TypeError, ValueError):
                value = failval
            setattr(self, key, value)
        self.checks = checks
        assert self.name, 'an arch entry needs a non-empty name'
        self._flags = flags

    def is_supported(self, compiler):
        """Return True when no flags are declared at all, or when flags are
        declared for the given compiler."""
        return (not self._flags) or (compiler in self._flags)

    def get_flags(self, compiler):
        """Return the flag list for compiler, or an empty list if none."""
        try:
            return self._flags[compiler]
        except KeyError:
            return list()

    def __repr__(self):
        return self.name
def register_arch(**kwargs):
    """Create an arch_class from kwargs and record it in the module-level
    `archs` list and `arch_dict` (keyed by the arch name)."""
    new_arch = arch_class(**kwargs)
    arch_dict[new_arch.name] = new_arch
    archs.append(new_arch)
########################################################################
# register the arches
########################################################################
#TODO skip the XML and put it here
from xml.dom import minidom
import os

# archs.xml is looked up next to this script.
gendir = os.path.dirname(__file__)
archs_xml = minidom.parse(os.path.join(gendir, 'archs.xml')).getElementsByTagName('arch')

for arch_xml in archs_xml:
    # XML attributes of <arch> (e.g. name="sse") become keyword arguments.
    kwargs = dict()
    for attr in arch_xml.attributes.keys():
        kwargs[attr] = arch_xml.attributes[attr].value
    # For every child element, the text of the FIRST element with that tag
    # name is stored under the tag name.
    for node in arch_xml.childNodes:
        try:
            name = node.tagName
            val = arch_xml.getElementsByTagName(name)[0].firstChild.data
            kwargs[name] = val
        except AttributeError:
            # Text/comment nodes have no tagName; empty elements have no
            # firstChild; element-only children have no .data. Skip them.
            continue
    # Each <check name="..."> becomes [name, [param, ...]].
    checks = list()
    for check_xml in arch_xml.getElementsByTagName("check"):
        name = check_xml.attributes["name"].value
        params = list()
        for param_xml in check_xml.getElementsByTagName("param"):
            params.append(param_xml.firstChild.data)
        checks.append([name, params])
    # Group <flag compiler="..."> values by compiler, keeping document order.
    flags = dict()
    for flag_xml in arch_xml.getElementsByTagName("flag"):
        name = flag_xml.attributes["compiler"].value
        flags.setdefault(name, list()).append(flag_xml.firstChild.data)
    #force kwargs keys to be of type str, not unicode for py25
    kwargs = dict((str(k), v) for k, v in kwargs.items())
    register_arch(flags=flags, checks=checks, **kwargs)

if __name__ == '__main__':
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print(archs)

View File

@ -0,0 +1,58 @@
#!/usr/bin/env python
#
# Copyright 2012 Free Software Foundation, Inc.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
import optparse
import volk_gnsssdr_arch_defs
import volk_gnsssdr_machine_defs
def do_arch_flags_list(compiler):
    """Print one 'name,flag,flag...' entry per arch that supports
    `compiler`, with the entries joined by ';'."""
    output = list()
    for arch in volk_gnsssdr_arch_defs.archs:
        if not arch.is_supported(compiler):
            continue
        output.append(','.join([arch.name] + arch.get_flags(compiler)))
    # print(...) with one argument works on both Python 2 and Python 3.
    print(';'.join(output))
def do_machines_list(arch_names):
    """Print the ';'-joined names of all machines whose archs are all
    contained in `arch_names`."""
    available = set(arch_names)
    output = list()
    for machine in volk_gnsssdr_machine_defs.machines:
        # A machine qualifies only when every one of its archs is available.
        if set(machine.arch_names).issubset(available):
            output.append(machine.name)
    # print(...) with one argument works on both Python 2 and Python 3.
    print(';'.join(output))
def do_machine_flags_list(compiler, machine_name):
    """Print, space separated, the `compiler` flags of every arch that makes
    up the machine called `machine_name`.

    Raises KeyError when `machine_name` is not a registered machine.
    """
    machine = volk_gnsssdr_machine_defs.machine_dict[machine_name]
    output = list()
    for arch in machine.archs:
        output.extend(arch.get_flags(compiler))
    # print(...) with one argument works on both Python 2 and Python 3.
    print(' '.join(output))
def main():
    """Command-line entry point: dispatch on --mode to one of the list
    printers. An unrecognised (or absent) mode is a silent no-op."""
    parser = optparse.OptionParser()
    parser.add_option('--mode', type='string')
    parser.add_option('--compiler', type='string')
    parser.add_option('--archs', type='string')
    parser.add_option('--machine', type='string')
    (opts, args) = parser.parse_args()

    # Report a missing companion option as a usage error instead of failing
    # later with an AttributeError on None.
    if opts.mode in ('arch_flags', 'machine_flags') and opts.compiler is None:
        parser.error('--mode %s requires --compiler' % opts.mode)
    if opts.mode == 'machines' and opts.archs is None:
        parser.error('--mode machines requires --archs')
    if opts.mode == 'machine_flags' and opts.machine is None:
        parser.error('--mode machine_flags requires --machine')

    if opts.mode == 'arch_flags': return do_arch_flags_list(opts.compiler.lower())
    if opts.mode == 'machines': return do_machines_list(opts.archs.split(';'))
    if opts.mode == 'machine_flags': return do_machine_flags_list(opts.compiler.lower(), opts.machine)

if __name__ == '__main__': main()

View File

@ -0,0 +1,209 @@
#
# Copyright 2011-2012 Free Software Foundation, Inc.
#
# This file is part of GNU Radio
#
# GNU Radio is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3, or (at your option)
# any later version.
#
# GNU Radio is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with GNU Radio; see the file COPYING. If not, write to
# the Free Software Foundation, Inc., 51 Franklin Street,
# Boston, MA 02110-1301, USA.
#
import os
import re
import sys
import glob
########################################################################
# Strip comments from a c/cpp file.
# Input is code string, output is code string without comments.
# http://stackoverflow.com/questions/241327/python-snippet-to-remove-c-and-c-comments
########################################################################
def comment_remover(text):
    """Return `text` with C and C++ comments removed.

    String and character literals are matched by the same pattern so that
    comment markers inside them are left alone. Each comment is replaced by a
    single space rather than nothing, so the tokens on either side of it do
    not fuse together (e.g. 'int/**/x' becomes 'int x', not 'intx').
    """
    def replacer(match):
        s = match.group(0)
        if s.startswith('/'):
            # A comment: keep one space as a token separator.
            return " "
        else:
            # A string or character literal: keep it untouched.
            return s
    pattern = re.compile(
        r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"',
        re.DOTALL | re.MULTILINE
    )
    return re.sub(pattern, replacer, text)
########################################################################
# Split code into nested sections according to ifdef preprocessor macros
########################################################################
def split_into_nested_ifdef_sections(code):
    """Split `code` into a list of (header, body) pairs along preprocessor
    conditionals.

    Text outside any conditional gets the header 'text' and a string body.
    A top-level #if/#ifdef/#ifndef/#else/#elif line becomes the header of the
    section it opens; that section's body is split again recursively, so it
    is itself a list of (header, body) pairs. Sections whose body is only
    whitespace are dropped.
    """
    directive_kind = {
        'if': 'if', 'ifndef': 'if', 'ifdef': 'if',
        'else': 'else', 'elif': 'else',
        'endif': 'end',
    }
    directive_re = re.compile(r'^(\s*)#(\s*)(\w+)(.*)$')

    collected = []
    body_lines = []
    current_header = 'text'
    depth = 0
    for text_line in code.splitlines():
        found = directive_re.match(text_line)
        kind = 'normal'
        if found:
            kind = directive_kind.get(found.group(3), 'normal')

        # Track nesting before deciding whether this line is a boundary.
        if kind == 'if':
            depth += 1
        elif kind == 'end':
            depth -= 1

        opens_section = depth == 1 and kind in ('if', 'else')
        closes_section = depth == 0 and kind == 'end'
        if opens_section or closes_section:
            collected.append((current_header, ''.join(body_lines)))
            body_lines = []
            if opens_section:
                current_header = text_line
            else:
                current_header = 'text'
            continue

        # Anything else (including nested directives) belongs to the body.
        body_lines.append(text_line + '\n')

    # Whatever is left after the last boundary forms the final section.
    collected.append((current_header, ''.join(body_lines)))

    result = []
    for hdr, body in collected:
        if not body.strip():
            continue
        if hdr != 'text':
            body = split_into_nested_ifdef_sections(body)
        result.append((hdr, body))
    return result
########################################################################
# Recursive print of sections to test code above
########################################################################
def print_sections(sections, indent = ' '):
    """Recursively print nested (header, body) sections for debugging.

    Text bodies are printed with `indent` in front of every line; a section
    header is printed after an arrow made from the indent with its spaces
    turned into dashes, and its body is printed one level deeper.
    """
    for header, body in sections:
        if header == 'text':
            # Formatted so the output matches the old two-item print
            # statement (items separated by one space) on Python 2 and 3.
            print('%s %s' % (indent, ('\n'+indent).join(body.splitlines())))
            continue
        print('%s> %s' % (indent.replace(' ', '-'), header))
        print_sections(body, indent + ' ')
########################################################################
# Flatten a section to just body text
########################################################################
def flatten_section_text(sections):
    """Concatenate, in order, the text of every 'text' section found in the
    nested (header, body) structure, dropping all headers."""
    pieces = []
    for label, content in sections:
        if label == 'text':
            pieces.append(content)
        else:
            pieces.append(flatten_section_text(content))
    return ''.join(pieces)
########################################################################
# Extract kernel info from section, represent as an implementation
########################################################################
class impl_class:
    """One implementation of a kernel, parsed from a preprocessor section.

    kern_name -- the kernel name; the function name must start with it.
    header    -- the conditional line guarding the implementation; every
                 LV_HAVE_<X> in it becomes a lower-cased entry of `deps`.
    body      -- nested sections holding the function definition.

    Sets `name` (function name with the '<kern_name>_' prefix removed),
    `args` (list of (type, name) pairs), `deps` and `is_aligned` (True when
    `name` starts with 'a_').
    Raises Exception when the prototype cannot be parsed.
    """
    def __init__(self, kern_name, header, body):
        #extract LV_HAVE_*
        self.deps = set(map(str.lower, re.findall(r'LV_HAVE_(\w+)', header)))
        #extract function suffix and args
        body = flatten_section_text(body)
        try:
            fcn_matcher = re.compile(r'^.*(%s\w*)\s*\((.*)$'%kern_name, re.DOTALL | re.MULTILINE)
            # Compiled once instead of once per argument.
            arg_matcher = re.compile(r'^\s*(.*\W)\s*(\w+)\s*$', re.DOTALL | re.MULTILINE)
            body = body.split('{')[0].rsplit(')', 1)[0] #get the part before the open ){ bracket
            m = fcn_matcher.match(body)
            impl_name, the_rest = m.groups()
            self.name = impl_name.replace(kern_name+'_', '')
            self.args = list()
            for fcn_arg in the_rest.split(','):
                # Split each argument into "type" and trailing identifier.
                arg_type, arg_name = arg_matcher.match(fcn_arg).groups()
                self.args.append((arg_type, arg_name))
        except Exception as ex:
            # Call syntax for raise is valid on both Python 2 and Python 3.
            raise Exception('I cant parse the function prototype from: %s in %s\n%s'%(kern_name, body, ex))
        assert self.name
        self.is_aligned = self.name.startswith('a_')

    def __repr__(self):
        return self.name
########################################################################
# Get sets of LV_HAVE_* from the code
########################################################################
def extract_lv_haves(code):
    """Return one set per preprocessor line of `code` that mentions at least
    one LV_HAVE_<X> symbol; each set holds the lower-cased <X> names found on
    that line. Lines not starting with '#' are ignored."""
    found = []
    for raw in code.splitlines():
        if raw.strip().startswith('#'):
            names = set(map(str.lower, re.findall(r'LV_HAVE_(\w+)', raw)))
            if names:
                found.append(names)
    return found
########################################################################
# Represent a processing kernel, parse from file
########################################################################
class kernel_class:
    """A processing kernel parsed from one kernel header file.

    The kernel name is the file's base name without extension; `pname` is
    that name with 'volk_gnsssdr_' replaced by 'p_'. Implementations are the
    LV_HAVE_* guarded sub-sections found inside top-level #ifndef sections.
    An implementation named 'dispatcher' is removed from the list and only
    recorded through `has_dispatcher`.
    """
    def __init__(self, kernel_file):
        self.name = os.path.splitext(os.path.basename(kernel_file))[0]
        self.pname = self.name.replace('volk_gnsssdr_', 'p_')
        # Close the file deterministically instead of leaking the handle.
        with open(kernel_file, 'r') as kernel_fd:
            code = kernel_fd.read()
        code = comment_remover(code)
        sections = split_into_nested_ifdef_sections(code)
        self._impls = list()
        for header, section in sections:
            if 'ifndef' not in header.lower(): continue
            for sub_hdr, body in section:
                if 'if' not in sub_hdr.lower(): continue
                if 'LV_HAVE_' not in sub_hdr: continue
                self._impls.append(impl_class(
                    kern_name=self.name, header=sub_hdr, body=body,
                ))
        assert(self._impls)
        self.has_dispatcher = False
        for impl in self._impls:
            if impl.name == 'dispatcher':
                # Safe to mutate while iterating: we leave the loop at once.
                self._impls.remove(impl)
                self.has_dispatcher = True
                break
        # The first remaining implementation supplies the argument list.
        self.args = self._impls[0].args
        self.arglist_types = ', '.join([a[0] for a in self.args])
        self.arglist_full = ', '.join(['%s %s'%a for a in self.args])
        self.arglist_names = ', '.join([a[1] for a in self.args])

    def get_impls(self, archs):
        """Return the implementations whose deps are all present in `archs`."""
        archs = set(archs)
        impls = list()
        for impl in self._impls:
            if impl.deps.intersection(archs) == impl.deps:
                impls.append(impl)
        return impls

    def __repr__(self):
        return self.name
########################################################################
# Extract information from the VOLK kernels
########################################################################
__file__ = os.path.abspath(__file__)
# Kernel headers live in <parent of this script's directory>/kernels/volk_gnsssdr.
srcdir = os.path.dirname(os.path.dirname(__file__))
# glob returns files in arbitrary order; sort so the kernel list (and whatever
# is generated from it) does not depend on the filesystem.
kernel_files = sorted(glob.glob(os.path.join(srcdir, "kernels", "volk_gnsssdr", "*.h")))
# list(...) keeps `kernels` a real, re-iterable list on Python 3 as well.
kernels = list(map(kernel_class, kernel_files))

if __name__ == '__main__':
    print(kernels)

View File

@ -0,0 +1,74 @@
#
# Copyright 2012 Free Software Foundation, Inc.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
from volk_gnsssdr_arch_defs import arch_dict
machines = list()
machine_dict = dict()
class machine_class:
    """A named machine: an ordered collection of archs looked up in
    `arch_dict`. Empty arch names are skipped. `alignment` is the largest
    alignment among the machine's archs."""
    def __init__(self, name, archs):
        self.name = name
        selected = [arch_name for arch_name in archs if arch_name]
        self.archs = [arch_dict[arch_name] for arch_name in selected]
        self.arch_names = selected
        self.alignment = max(arch.alignment for arch in self.archs)

    def __repr__(self):
        return self.name
def register_machine(name, archs):
    """Register the machine `name`, expanding 'a|b' alternatives.

    The first arch entry containing '|' is expanded: each non-empty
    alternative registers '<name>_<alt>' with that alternative in place, and
    an empty alternative (trailing or doubled '|') registers `name` with the
    entry removed. Expansion recurses until no '|' entries remain, at which
    point the machine is stored in `machines` and `machine_dict`.
    """
    for pos, candidate in enumerate(archs):
        if '|' not in candidate:
            continue
        before = archs[:pos]
        after = archs[pos+1:]
        for choice in candidate.split('|'):
            if choice:
                register_machine(name+'_'+choice, before + [choice] + after)
            else:
                register_machine(name, before + after)
        # The recursive calls handled everything for this machine.
        return
    new_machine = machine_class(name=name, archs=archs)
    machines.append(new_machine)
    machine_dict[new_machine.name] = new_machine
########################################################################
# register the machines
########################################################################
#TODO skip the XML and put it here
from xml.dom import minidom
import os

# machines.xml is looked up next to this script.
gendir = os.path.dirname(__file__)
machines_xml = minidom.parse(os.path.join(gendir, 'machines.xml')).getElementsByTagName('machine')

for machine_xml in machines_xml:
    # XML attributes of <machine> (e.g. name="sse2") become keyword arguments.
    kwargs = dict()
    for attr in machine_xml.attributes.keys():
        kwargs[attr] = machine_xml.attributes[attr].value
    # The text of each child element (here <archs>) is stored under its tag.
    for node in machine_xml.childNodes:
        try:
            name = node.tagName
            val = machine_xml.getElementsByTagName(name)[0].firstChild.data
            kwargs[name] = val
        except AttributeError:
            # Text/comment nodes have no tagName; empty elements have no
            # firstChild. Skip them.
            continue
    # 'archs' is a whitespace separated list; entries may contain '|'.
    kwargs['archs'] = kwargs['archs'].split()
    #force kwargs keys to be of type str, not unicode for py25
    kwargs = dict((str(k), v) for k, v in kwargs.items())
    register_machine(**kwargs)

if __name__ == '__main__':
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print(machines)

View File

@ -0,0 +1,74 @@
#!/usr/bin/env python
#
# Copyright 2012 Free Software Foundation, Inc.
#
# This file is part of GNU Radio
#
# GNU Radio is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3, or (at your option)
# any later version.
#
# GNU Radio is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with GNU Radio; see the file COPYING. If not, write to
# the Free Software Foundation, Inc., 51 Franklin Street,
# Boston, MA 02110-1301, USA.
#
import os
import re
import sys
import optparse
import volk_gnsssdr_arch_defs
import volk_gnsssdr_machine_defs
import volk_gnsssdr_kernel_defs
from Cheetah import Template
def __escape_pre_processor(code):
    """Prefix C preprocessor directives in `code` with a backslash before
    the '#'. 'if' and 'else' lines are left unescaped when their remainder
    contains '$' and does not contain 'defined'; lines that are not
    directives from the two lists below are returned unchanged."""
    c_only = ('include', 'define', 'ifdef', 'ifndef', 'endif', 'elif', 'pragma')
    shared = ('if', 'else')
    directive = re.compile(r'^(\s*)#(\s*)(\w+)(.*)$')

    escaped = []
    for src_line in code.splitlines():
        hit = directive.match(src_line)
        if hit is None:
            escaped.append(src_line)
            continue
        lead, gap, keyword, rest = hit.groups()
        is_template = ('$' in rest) and ('defined' not in rest)
        if keyword in c_only or (keyword in shared and not is_template):
            src_line = '%s\\#%s%s%s'%(lead, gap, keyword, rest)
        escaped.append(src_line)
    return '\n'.join(escaped)
def __parse_tmpl(_tmpl, **kwargs):
    """Render the template text `_tmpl` and return the result as a string.

    The template namespace holds the arch, machine and kernel definitions;
    entries in `kwargs` are added on top and win on a name clash. The text
    is passed through __escape_pre_processor and prefixed with a
    "generated file" banner before rendering.
    """
    namespace = dict(
        archs=volk_gnsssdr_arch_defs.archs,
        arch_dict=volk_gnsssdr_arch_defs.arch_dict,
        machines=volk_gnsssdr_machine_defs.machines,
        machine_dict=volk_gnsssdr_machine_defs.machine_dict,
        kernels=volk_gnsssdr_kernel_defs.kernels,
    )
    namespace.update(kwargs)
    banner = """
/* this file was generated by volk_gnsssdr template utils, do not edit! */
"""
    full_template = banner + __escape_pre_processor(_tmpl)
    return str(Template.Template(full_template, namespace))
def main():
    """Command-line entry point: render the template given by --input and
    write the result to --output, or print it when --output is not given.
    Positional arguments are exposed to the template as `args`."""
    parser = optparse.OptionParser()
    parser.add_option('--input', type='string')
    parser.add_option('--output', type='string')
    (opts, args) = parser.parse_args()
    if not opts.input:
        # Usage error instead of an opaque failure inside open().
        parser.error('--input is required')

    # try/finally (rather than leaving it to the garbage collector) makes
    # sure both files are closed, and the output flushed, before returning.
    in_file = open(opts.input)
    try:
        template_text = in_file.read()
    finally:
        in_file.close()

    output = __parse_tmpl(template_text, args=args)

    if opts.output:
        out_file = open(opts.output, 'w')
        try:
            out_file.write(output)
        finally:
            out_file.close()
    else:
        print(output)

if __name__ == '__main__': main()

View File

@ -0,0 +1,39 @@
/* -*- c++ -*- */
/*
* Copyright 2006,2009,2013 Free Software Foundation, Inc.
*
* This file is part of GNU Radio
*
* GNU Radio is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3, or (at your option)
* any later version.
*
* GNU Radio is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with GNU Radio; see the file COPYING. If not, write to
* the Free Software Foundation, Inc., 51 Franklin Street,
* Boston, MA 02110-1301, USA.
*/
/*
 * Build-time information about the volk_gnsssdr library.
 *
 * The include guard carries the volk_gnsssdr name so that this header is not
 * silently skipped when another header using the generic
 * INCLUDED_VOLK_CONSTANTS_H guard was included first in the same
 * translation unit.
 */
#ifndef INCLUDED_VOLK_GNSSSDR_CONSTANTS_H
#define INCLUDED_VOLK_GNSSSDR_CONSTANTS_H
#include <volk_gnsssdr/volk_gnsssdr_common.h>
__VOLK_DECL_BEGIN
/*
 * Each accessor takes no arguments and returns a C string.
 * NOTE(review): the exact contents and the ownership of the returned strings
 * are defined by the implementation file, which is not visible here; confirm
 * before freeing or modifying them.
 */
VOLK_API char* volk_gnsssdr_prefix(void);
VOLK_API char* volk_gnsssdr_build_date(void);
VOLK_API char* volk_gnsssdr_version(void);
VOLK_API char* volk_gnsssdr_c_compiler(void);
VOLK_API char* volk_gnsssdr_compiler_flags(void);
VOLK_API char* volk_gnsssdr_available_machines(void);
__VOLK_DECL_END
#endif /* INCLUDED_VOLK_GNSSSDR_CONSTANTS_H */

View File

@ -0,0 +1,96 @@
// Common definitions shared by the volk_gnsssdr headers: compiler attribute
// macros, C-linkage macros, the VOLK_API visibility macro and union bit128.
// NOTE(review): the guard and macro names below are generic (no "gnsssdr" in
// them); presumably they are shared with upstream VOLK's common header, in
// which case whichever header is included first wins - confirm before renaming.
#ifndef INCLUDED_LIBVOLK_COMMON_H
#define INCLUDED_LIBVOLK_COMMON_H
////////////////////////////////////////////////////////////////////////
// Cross-platform attribute macros
////////////////////////////////////////////////////////////////////////
// Three variants follow: GCC-compatible compilers, MSVC, and a fallback in
// which every attribute macro expands to nothing.
#if defined __GNUC__
# define __VOLK_ATTR_ALIGNED(x) __attribute__((aligned(x)))
# define __VOLK_ATTR_UNUSED __attribute__((unused))
# define __VOLK_ATTR_INLINE __attribute__((always_inline))
# define __VOLK_ATTR_DEPRECATED __attribute__((deprecated))
// Symbol visibility attributes are only used from GCC 4 onwards.
# if __GNUC__ >= 4
# define __VOLK_ATTR_EXPORT __attribute__((visibility("default")))
# define __VOLK_ATTR_IMPORT __attribute__((visibility("default")))
# else
# define __VOLK_ATTR_EXPORT
# define __VOLK_ATTR_IMPORT
# endif
#elif _MSC_VER
# define __VOLK_ATTR_ALIGNED(x) __declspec(align(x))
# define __VOLK_ATTR_UNUSED
# define __VOLK_ATTR_INLINE __forceinline
# define __VOLK_ATTR_DEPRECATED __declspec(deprecated)
# define __VOLK_ATTR_EXPORT __declspec(dllexport)
# define __VOLK_ATTR_IMPORT __declspec(dllimport)
#else
# define __VOLK_ATTR_ALIGNED(x)
# define __VOLK_ATTR_UNUSED
# define __VOLK_ATTR_INLINE
# define __VOLK_ATTR_DEPRECATED
# define __VOLK_ATTR_EXPORT
# define __VOLK_ATTR_IMPORT
#endif
////////////////////////////////////////////////////////////////////////
// Ignore annoying warnings in MSVC
////////////////////////////////////////////////////////////////////////
#if defined(_MSC_VER)
# pragma warning(disable: 4244) //'conversion' conversion from 'type1' to 'type2', possible loss of data
# pragma warning(disable: 4305) //'identifier' : truncation from 'type1' to 'type2'
#endif
////////////////////////////////////////////////////////////////////////
// C-linkage declaration macros
// FIXME: due to the usage of complex.h, require gcc for c-linkage
////////////////////////////////////////////////////////////////////////
// Declarations are wrapped in extern "C" only for C++ translation units
// compiled by a GCC-compatible compiler; otherwise the macros are empty.
#if defined(__cplusplus) && (__GNUC__)
# define __VOLK_DECL_BEGIN extern "C" {
# define __VOLK_DECL_END }
#else
# define __VOLK_DECL_BEGIN
# define __VOLK_DECL_END
#endif
////////////////////////////////////////////////////////////////////////
// Define VOLK_API for library symbols
// http://gcc.gnu.org/wiki/Visibility
////////////////////////////////////////////////////////////////////////
// VOLK_API marks symbols as exported when volk_gnsssdr_EXPORTS is defined and
// as imported otherwise.
// NOTE(review): volk_gnsssdr_EXPORTS is presumably supplied by the build
// system while compiling the library itself - confirm in the build files.
#ifdef volk_gnsssdr_EXPORTS
# define VOLK_API __VOLK_ATTR_EXPORT
#else
# define VOLK_API __VOLK_ATTR_IMPORT
#endif
////////////////////////////////////////////////////////////////////////
// The bit128 union used by some
////////////////////////////////////////////////////////////////////////
#include <inttypes.h>
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>
#endif
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>
#endif
// 128 bits viewed as integer, float or double lanes; the SSE vector members
// exist only when the matching LV_HAVE_* macro is defined.
union bit128{
uint16_t i16[8];
uint32_t i[4];
float f[4];
double d[2];
#ifdef LV_HAVE_SSE
__m128 float_vec;
#endif
#ifdef LV_HAVE_SSE2
__m128i int_vec;
__m128d double_vec;
#endif
};
// Reinterpret an arbitrary pointer as a pointer to union bit128.
#define bit128_p(x) ((union bit128 *)(x))
#endif /*INCLUDED_LIBVOLK_COMMON_H*/

View File

@ -0,0 +1,86 @@
#ifndef INCLUDE_VOLK_COMPLEX_H
#define INCLUDE_VOLK_COMPLEX_H
/*!
* \brief Provide typedefs and operators for all complex types in C and C++.
*
* The typedefs encompass all signed integer and floating point types.
* Each operator function is intended to work across all data types.
* Under C++, these operators are defined as inline templates.
* Under C, these operators are defined as preprocessor macros.
* The use of macros makes the operators agnostic to the type.
*
* The following operator functions are defined:
* - lv_cmake - make a complex type from components
* - lv_creal - get the real part of the complex number
* - lv_cimag - get the imaginary part of the complex number
* - lv_conj - take the conjugate of the complex number
*/
#ifdef __cplusplus
/* C++ branch: every lv_*_t type is a std::complex specialisation. */
#include <complex>
#include <stdint.h>
typedef std::complex<int8_t> lv_8sc_t;
typedef std::complex<int16_t> lv_16sc_t;
typedef std::complex<int32_t> lv_32sc_t;
typedef std::complex<int64_t> lv_64sc_t;
typedef std::complex<float> lv_32fc_t;
typedef std::complex<double> lv_64fc_t;
/* Both components must have the same type T; the result is std::complex<T>. */
template <typename T> inline std::complex<T> lv_cmake(const T &r, const T &i){
return std::complex<T>(r, i);
}
/* T is the complex type itself; the component type is T::value_type. */
template <typename T> inline typename T::value_type lv_creal(const T &x){
return x.real();
}
template <typename T> inline typename T::value_type lv_cimag(const T &x){
return x.imag();
}
template <typename T> inline T lv_conj(const T &x){
return std::conj(x);
}
#else /* __cplusplus */
/* C branch: types are built with the `complex` macro from <complex.h>. */
#include <complex.h>
/* NOTE(review): these use the native char/short/long/long long types, not the
   fixed-width types of the C++ branch. `long` is presumably 64 bits wide on
   LP64 platforms, so lv_32sc_t may differ in size between C and C++ there -
   confirm before sharing lv_32sc_t buffers across the two languages. */
typedef char complex lv_8sc_t;
typedef short complex lv_16sc_t;
typedef long complex lv_32sc_t;
typedef long long complex lv_64sc_t;
typedef float complex lv_32fc_t;
typedef double complex lv_64fc_t;
/* Arguments are parenthesised, but the result type follows the usual
   arithmetic conversions of the expression. */
#define lv_cmake(r, i) ((r) + _Complex_I*(i))
// When GNUC is available, use the complex extensions.
// The extensions always return the correct value type.
// http://gcc.gnu.org/onlinedocs/gcc/Complex.html
#ifdef __GNUC__
#define lv_creal(x) (__real__(x))
#define lv_cimag(x) (__imag__(x))
#define lv_conj(x) (~(x))
// When not available, use the c99 complex function family,
// which always returns double regardless of the input type.
#else /* __GNUC__ */
#define lv_creal(x) (creal(x))
#define lv_cimag(x) (cimag(x))
#define lv_conj(x) (conj(x))
#endif /* __GNUC__ */
#endif /* __cplusplus */
#endif /* INCLUDE_VOLK_COMPLEX_H */

View File

@ -0,0 +1,66 @@
/* -*- c -*- */
/*
* Copyright 2014 Free Software Foundation, Inc.
*
* This file is part of GNU Radio
*
* GNU Radio is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3, or (at your option)
* any later version.
*
* GNU Radio is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with GNU Radio; see the file COPYING. If not, write to
* the Free Software Foundation, Inc., 51 Franklin Street,
* Boston, MA 02110-1301, USA.
*/
/*
 * The include guard carries the volk_gnsssdr name so that this header is not
 * silently skipped when another header using the generic
 * INCLUDED_VOLK_MALLOC_H guard was included first.
 */
#ifndef INCLUDED_VOLK_GNSSSDR_MALLOC_H
#define INCLUDED_VOLK_GNSSSDR_MALLOC_H
#include <volk_gnsssdr/volk_gnsssdr_common.h>
#include <stdlib.h>
__VOLK_DECL_BEGIN
/*!
* \brief Allocate \p size bytes of data aligned to \p alignment.
*
* \details
* Because we don't have a standard method to allocate buffers in
* memory that are guaranteed to be on an alignment, VOLK handles this
* itself. The volk_gnsssdr_malloc function behaves like malloc in that it
* returns a pointer to the allocated memory. However, it also takes
* in an alignment specification, which is usually something like 16 or
* 32 to ensure that the aligned memory is located on a particular
* byte boundary for use with SIMD.
*
* Internally, the volk_gnsssdr_malloc first checks if the compiler is C11
* compliant and uses the new aligned_alloc method. If not, it checks
* if the system is POSIX compliant and uses posix_memalign. If that
* fails, volk_gnsssdr_malloc handles the memory allocation and alignment
* internally.
*
* Because of the ways in which volk_gnsssdr_malloc may allocate memory, it is
* important to always free volk_gnsssdr_malloc pointers using volk_gnsssdr_free.
*
* \param size The number of bytes to allocate.
* \param alignment The byte alignment of the allocated memory.
* \return pointer to aligned memory.
*/
VOLK_API void *volk_gnsssdr_malloc(size_t size, size_t alignment);
/*!
* \brief Frees memory allocated by volk_gnsssdr_malloc.
* \param aptr The aligned pointer allocated by volk_gnsssdr_malloc.
*/
VOLK_API void volk_gnsssdr_free(void *aptr);
__VOLK_DECL_END
#endif /* INCLUDED_VOLK_GNSSSDR_MALLOC_H */

View File

@ -0,0 +1,28 @@
/*
 * The include guard carries the volk_gnsssdr name so that this header is not
 * silently skipped when another header using the generic
 * INCLUDED_VOLK_PREFS_H guard was included first.
 */
#ifndef INCLUDED_VOLK_GNSSSDR_PREFS_H
#define INCLUDED_VOLK_GNSSSDR_PREFS_H
#include <volk_gnsssdr/volk_gnsssdr_common.h>
#include <stdlib.h>
__VOLK_DECL_BEGIN
/* One preference entry: a kernel name and its preferred implementations.
   Every field is a fixed 128-byte character buffer. */
typedef struct volk_gnsssdr_arch_pref
{
char name[128]; //name of the kernel
char impl_a[128]; //best aligned impl
char impl_u[128]; //best unaligned impl
} volk_gnsssdr_arch_pref_t;
////////////////////////////////////////////////////////////////////////
// get path to volk_gnsssdr_config profiling info
// NOTE(review): the required size of the caller's buffer is not visible
// from this header - confirm against the implementation.
////////////////////////////////////////////////////////////////////////
VOLK_API void volk_gnsssdr_get_config_path(char *);
////////////////////////////////////////////////////////////////////////
// load prefs into global prefs struct
// NOTE(review): presumably returns the number of entries stored through the
// pointer argument; ownership of that array is not visible here - confirm.
////////////////////////////////////////////////////////////////////////
VOLK_API size_t volk_gnsssdr_load_preferences(volk_gnsssdr_arch_pref_t **);
__VOLK_DECL_END
#endif //INCLUDED_VOLK_GNSSSDR_PREFS_H

View File

@ -0,0 +1,174 @@
/*!
* \file CommonMacros.h
* \brief Common macros used inside the volk protokernels.
* \authors <ul>
* <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
* </ul>
*
* -------------------------------------------------------------------------
*
* Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
*
* GNSS-SDR is a software defined Global Navigation
* Satellite Systems receiver
*
* This file is part of GNSS-SDR.
*
* GNSS-SDR is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* GNSS-SDR is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
*
* -------------------------------------------------------------------------
*/
#ifndef INCLUDED_gnsssdr_CommonMacros_u_H
#define INCLUDED_gnsssdr_CommonMacros_u_H
#ifdef LV_HAVE_SSE4_1
/*!
\brief Macros for U_SSE4_1
*/
#ifndef CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1
/*
 * Splits the 16-bit lanes of two __m128i vectors by lane parity.
 *   real: even-indexed lanes of input1 (kept in the even lanes) and the
 *         even-indexed lanes of input2 (moved into the odd lanes).
 *   imag: odd-indexed lanes of input1 (moved into the even lanes) and the
 *         odd-indexed lanes of input2 (kept in the odd lanes).
 * The blend mask 85 (binary 01010101) selects the even lanes from the second
 * blend operand. input1 and input2 are not modified.
 * Presumably the inputs hold interleaved (real, imag) 16-bit samples, so that
 * "real"/"imag" end up with the respective parts -- confirm at the call sites.
 * The expansion is a bare statement sequence ending in ';': invoke it as a
 * statement of its own, not as the unbraced body of an if/for.
 */
#define CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(input1, input2, real, imag)\
imag = _mm_srli_si128 (input1, 2);\
imag = _mm_blend_epi16 (input2, imag, 85);\
real = _mm_slli_si128 (input2, 2);\
real = _mm_blend_epi16 (real, input1, 85);
#endif /* CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1 */
#ifndef CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1
/*
 * Folds the eight signed 16-bit lanes of input into four floats:
 *   output_ps[k] = (float)(input[k] + input[k + 4]),  k = 0..3
 * The lanes are sign-extended to 32 bits before the addition.
 * WARNING: input is overwritten (it is shifted right by 8 bytes).
 * input_i_1, input_i_2 and output_i32 are __m128i scratch variables supplied
 * by the caller; output_ps is the __m128 result.
 */
#define CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(input, input_i_1, input_i_2, output_i32, output_ps)\
input_i_1 = _mm_cvtepi16_epi32(input);\
input = _mm_srli_si128 (input, 8);\
input_i_2 = _mm_cvtepi16_epi32(input);\
output_i32 = _mm_add_epi32 (input_i_1, input_i_2);\
output_ps = _mm_cvtepi32_ps(output_i32);
#endif /* CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1 */
#ifndef CM_8IC_CONVERT_AND_ACC_32FC_U_SSE4_1
/*
 * Folds the sixteen signed 8-bit lanes of input into four floats:
 *   output_ps[k] = (float)(input[k] + input[k + 4] + input[k + 8] + input[k + 12]),  k = 0..3
 * Each group of four bytes is sign-extended to 32 bits before being added.
 * WARNING: input is overwritten; it is shifted right by 4 bytes four times,
 * so all of its original bytes have been shifted out when the macro ends.
 * input_i_1, input_i_2, output_i32, output_i32_1 and output_i32_2 are __m128i
 * scratch variables supplied by the caller; output_ps is the __m128 result.
 */
#define CM_8IC_CONVERT_AND_ACC_32FC_U_SSE4_1(input, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)\
input_i_1 = _mm_cvtepi8_epi32(input);\
input = _mm_srli_si128 (input, 4);\
input_i_2 = _mm_cvtepi8_epi32(input);\
input = _mm_srli_si128 (input, 4);\
output_i32_1 = _mm_add_epi32 (input_i_1, input_i_2);\
input_i_1 = _mm_cvtepi8_epi32(input);\
input = _mm_srli_si128 (input, 4);\
input_i_2 = _mm_cvtepi8_epi32(input);\
input = _mm_srli_si128 (input, 4);\
output_i32_2 = _mm_add_epi32 (input_i_1, input_i_2);\
output_i32 = _mm_add_epi32 (output_i32_1, output_i32_2);\
output_ps = _mm_cvtepi32_ps(output_i32);
#endif /* CM_8IC_CONVERT_AND_ACC_32FC_U_SSE4_1 */
#endif /* LV_HAVE_SSE4_1 */
#ifdef LV_HAVE_SSE2
/*!
\brief Macros for U_SSE2
*/
#ifdef LV_HAVE_SSSE3
/*!
\brief Macros for U_SSSE3
*/
#ifndef CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3
/*
 * Produces two vectors of 16-bit results from the 8-bit vectors y and x.
 *   real_output: y is sign-adjusted by x and then by check_sign_sequence
 *                (_mm_sign_epi8); the result is multiplied byte-wise with
 *                x_abs and adjacent products are added, with saturation, into
 *                16-bit lanes (_mm_maddubs_epi16).
 *   imag_output: same multiply-add, applied to y shuffled by
 *                rearrange_sequence and sign-adjusted by x only.
 * x_abs is read but never written here, so the caller must set it beforehand
 * (presumably to the absolute value of x, since _mm_maddubs_epi16 treats its
 * first operand as unsigned -- confirm at the call sites).
 * y_aux is a scratch variable; y and x are not modified.
 * NOTE(review): with suitable check_sign_sequence / rearrange_sequence values
 * this looks like a complex multiplication of interleaved 8-bit samples rather
 * than a scalar product -- confirm against the callers.
 */
#define CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3(y, x, check_sign_sequence, rearrange_sequence, y_aux, x_abs, real_output, imag_output)\
y_aux = _mm_sign_epi8 (y, x);\
y_aux = _mm_sign_epi8 (y_aux, check_sign_sequence);\
real_output = _mm_maddubs_epi16 (x_abs, y_aux);\
\
y_aux = _mm_shuffle_epi8 (y, rearrange_sequence);\
y_aux = _mm_sign_epi8 (y_aux, x);\
imag_output = _mm_maddubs_epi16 (x_abs, y_aux);
#endif /* CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3 */
#endif /* LV_HAVE_SSSE3 */
#ifndef CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2
/*
 * Lane-wise complex multiplication (x * y) of eight 16-bit samples given as
 * separate real/imaginary vectors:
 *   real_output = realx * realy - imagx * imagy
 *   imag_output = realx * imagy + imagx * realy
 * Only the low 16 bits of each product are kept (_mm_mullo_epi16) and the
 * add/sub are the non-saturating variants, so results wrap on overflow.
 * The four *_mult_* arguments are __m128i scratch variables supplied by the
 * caller; the four inputs are not modified.
 */
#define CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output)\
realx_mult_realy = _mm_mullo_epi16 (realx, realy);\
imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);\
realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);\
imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);\
real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);\
imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
#endif /* CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2 */
#ifndef CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2
/*
 * Splits a vector of 8-bit values into two vectors by masking:
 *   real = input & mult1
 *   imag = (input shifted right by one byte) & mult1
 * input and mult1 are not modified.
 * Presumably mult1 keeps the low byte of every 16-bit lane (0x00FF), in which
 * case each output lane holds one input byte zero-extended (not sign-extended)
 * to 16 bits -- confirm the mask value at the call sites.
 */
#define CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(input, mult1, real, imag)\
imag = _mm_srli_si128 (input, 1);\
imag = _mm_and_si128 (imag, mult1);\
real = _mm_and_si128 (input, mult1);
#endif /* CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2 */
#ifndef CM_8IC_CONVERT_AND_ACC_32FC_U_SSE2
/*
 * SSE2-only conversion of sixteen signed 8-bit lanes into two float vectors:
 *   output_ps_1[k] = (float)(input[k]     + input[k + 4]),   k = 0..3
 *   output_ps_2[k] = (float)(input[8 + k] + input[12 + k]),  k = 0..3
 * Sign extension is done without SSE4.1: every byte is unpacked into the top
 * byte of a 32-bit lane (zeros below it) and then arithmetically shifted
 * right by 24 bits.
 * input is not modified. input_i_1, input_i_2 and output_i32 are __m128i
 * scratch variables supplied by the caller.
 */
#define CM_8IC_CONVERT_AND_ACC_32FC_U_SSE2(input, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)\
input_i_1 = _mm_unpacklo_epi8(_mm_setzero_si128(), input);\
input_i_2 = _mm_unpacklo_epi16(_mm_setzero_si128(), input_i_1);\
input_i_1 = _mm_unpackhi_epi16(_mm_setzero_si128(), input_i_1);\
input_i_1 = _mm_srai_epi32(input_i_1, 24);\
input_i_2 = _mm_srai_epi32(input_i_2, 24);\
output_i32 = _mm_add_epi32(input_i_1, input_i_2);\
output_ps_1 = _mm_cvtepi32_ps(output_i32);\
\
input_i_1 = _mm_unpackhi_epi8(_mm_setzero_si128(), input);\
input_i_2 = _mm_unpacklo_epi16(_mm_setzero_si128(), input_i_1);\
input_i_1 = _mm_unpackhi_epi16(_mm_setzero_si128(), input_i_1);\
input_i_1 = _mm_srai_epi32(input_i_1, 24);\
input_i_2 = _mm_srai_epi32(input_i_2, 24);\
output_i32 = _mm_add_epi32(input_i_1, input_i_2);\
output_ps_2 = _mm_cvtepi32_ps(output_i32);
#endif /* CM_8IC_CONVERT_AND_ACC_32FC_U_SSE2 */
#ifndef CM_8IC_CONTROLMINUS128_8IC_U_SSE2
/*
 * Adds one to every 8-bit lane of y that equals the corresponding lane of
 * minus128: the comparison yields 0xFF (-1) in matching lanes, and that value
 * is then subtracted from y. y is modified in place; minus128control receives
 * the comparison mask.
 * Presumably minus128 holds -128 in every lane, so that -128 becomes -127 and
 * the value can later be negated without overflowing -- confirm at the call
 * sites.
 */
#define CM_8IC_CONTROLMINUS128_8IC_U_SSE2(y, minus128, minus128control)\
minus128control = _mm_cmpeq_epi8 (y, minus128);\
y = _mm_sub_epi8 (y, minus128control);
#endif /* CM_8IC_CONTROLMINUS128_8IC_U_SSE2 */
#endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_GENERIC
/*!
\brief Macros for U_GENERIC
*/
#endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_gnsssdr_CommonMacros_u_H */
#ifndef INCLUDED_gnsssdr_CommonMacros_a_H
#define INCLUDED_gnsssdr_CommonMacros_a_H
#ifdef LV_HAVE_SSE4_1
/*!
\brief Macros for A_SSE4_1
*/
#endif /* LV_HAVE_SSE4_1 */
#ifdef LV_HAVE_SSE2
/*!
\brief Macros for A_SSE2
*/
#endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_GENERIC
/*!
\brief Macros for A_GENERIC
*/
#endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_gnsssdr_CommonMacros_a_H */

View File

@ -0,0 +1,76 @@
/*!
* \file CommonMacros_16ic_cw_corr_32fc.h
* \brief Common macros used inside the 16ic_cw_corr_32fc volk protokernels.
* \authors <ul>
* <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
* </ul>
*
* -------------------------------------------------------------------------
*
* Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
*
* GNSS-SDR is a software defined Global Navigation
* Satellite Systems receiver
*
* This file is part of GNSS-SDR.
*
* GNSS-SDR is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* GNSS-SDR is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
*
* -------------------------------------------------------------------------
*/
#ifndef INCLUDED_gnsssdr_CommonMacros_16ic_cw_corr_32fc_u_H
#define INCLUDED_gnsssdr_CommonMacros_16ic_cw_corr_32fc_u_H
#include "CommonMacros/CommonMacros.h"
#ifdef LV_HAVE_SSE4_1
/*!
\brief Macros for U_SSE4_1
*/
#ifndef CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1
/*
 * Chains three macros from CommonMacros.h:
 *   1. splits y1/y2 into realy/imagy by 16-bit lane parity,
 *   2. complex-multiplies (real_bb_signal_sample, imag_bb_signal_sample) by
 *      (realy, imagy) in wrapping 16-bit arithmetic,
 *   3. folds each 16-bit product vector into four floats
 *      (lane k + lane k + 4), giving real_output_ps and imag_output_ps.
 * WARNING: real_output and imag_output are shifted by step 3 and do not hold
 * the full products afterwards. All remaining arguments between the inputs
 * and the two *_ps results are scratch variables supplied by the caller.
 */
#define CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)\
CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy)\
CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(real_bb_signal_sample, imag_bb_signal_sample, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output)\
CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(real_output, input_i_1, input_i_2, output_i32, real_output_ps)\
CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(imag_output, input_i_1, input_i_2, output_i32, imag_output_ps)
#endif /* CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1 */
#endif /* LV_HAVE_SSE4_1 */
#ifdef LV_HAVE_GENERIC
/*!
\brief Macros for U_GENERIC
*/
#endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_gnsssdr_CommonMacros_16ic_cw_corr_32fc_u_H */
#ifndef INCLUDED_gnsssdr_CommonMacros_16ic_cw_corr_32fc_a_H
#define INCLUDED_gnsssdr_CommonMacros_16ic_cw_corr_32fc_a_H
#ifdef LV_HAVE_SSE4_1
/*!
\brief Macros for A_SSE4_1
*/
#endif /* LV_HAVE_SSE4_1 */
#ifdef LV_HAVE_GENERIC
/*!
\brief Macros for A_GENERIC
*/
#endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_gnsssdr_CommonMacros_16ic_cw_corr_32fc_a_H */

View File

@ -0,0 +1,114 @@
/*!
* \file CommonMacros_8ic_cw_corr_32fc.h
* \brief Common macros used inside the 8ic_cw_corr_32fc volk protokernels.
* \authors <ul>
* <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
* </ul>
*
* -------------------------------------------------------------------------
*
* Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
*
* GNSS-SDR is a software defined Global Navigation
* Satellite Systems receiver
*
* This file is part of GNSS-SDR.
*
* GNSS-SDR is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* GNSS-SDR is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
*
* -------------------------------------------------------------------------
*/
#ifndef INCLUDED_gnsssdr_CommonMacros_8ic_cw_corr_32fc_u_H
#define INCLUDED_gnsssdr_CommonMacros_8ic_cw_corr_32fc_u_H
#include "CommonMacros/CommonMacros.h"
#ifdef LV_HAVE_SSE4_1
/*!
\brief Macros for U_SSE4_1
*/
#ifndef CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1
/*
 * Correlates one vector of 8-bit samples y with the base-band sample:
 *   1. y is split into realy/imagy by masking with mult1,
 *   2. (real_bb_signal_sample, imag_bb_signal_sample) is complex-multiplied
 *      by (realy, imagy) in wrapping 16-bit arithmetic,
 *   3. the two product vectors are merged byte-wise into "output":
 *      imag_output is shifted left by one byte and _mm_blendv_epi8 takes the
 *      real_output byte wherever the mult1 byte has its top bit set,
 *   4. "output" is folded into four floats in output_ps.
 * NOTE(review): "output" is NOT a macro parameter. The expansion assigns to a
 * variable literally named output, which must be a __m128i in scope at the
 * call site. It cannot be added to the parameter list without changing every
 * caller.
 * WARNING: imag_output is modified by step 3 and "output" is consumed
 * (shifted out) by step 4.
 */
#define CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)\
CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)\
CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(real_bb_signal_sample, imag_bb_signal_sample, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output)\
\
imag_output = _mm_slli_si128 (imag_output, 1);\
output = _mm_blendv_epi8 (imag_output, real_output, mult1);\
\
CM_8IC_CONVERT_AND_ACC_32FC_U_SSE4_1(output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
#endif /* CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1 */
#ifndef CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1
/*
 * "Safe" 8-bit correlation step:
 *   1. every lane of y equal to the matching lane of minus128 is incremented
 *      by one (y is modified in place),
 *   2. y and bb_signal_sample_aux go through the SSSE3 sign/multiply-add
 *      macro, yielding 16-bit real_output and imag_output,
 *   3. each of them is folded into four floats (lane k + lane k + 4), giving
 *      real_output_ps and imag_output_ps.
 * bb_signal_sample_aux_abs is only read and must be prepared by the caller.
 * WARNING: real_output and imag_output are shifted by step 3.
 * Although guarded by LV_HAVE_SSE4_1, the expansion also needs the SSSE3
 * macro, which CommonMacros.h defines only when LV_HAVE_SSSE3 is set.
 */
#define CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)\
CM_8IC_CONTROLMINUS128_8IC_U_SSE2(y, minus128, minus128control)\
CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output)\
CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(real_output, input_i_1, input_i_2, output_i32, real_output_ps)\
CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(imag_output, input_i_1, input_i_2, output_i32, imag_output_ps)
#endif /* CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1 */
#ifndef CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1
/*
 * Same as CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1 but without the initial
 * adjustment of y lanes that equal minus128; y is left untouched.
 * Runs the SSSE3 sign/multiply-add macro on y and bb_signal_sample_aux and
 * folds the 16-bit real_output / imag_output vectors into four floats each
 * (lane k + lane k + 4), giving real_output_ps and imag_output_ps.
 * bb_signal_sample_aux_abs is only read and must be prepared by the caller.
 * WARNING: real_output and imag_output are shifted by the conversion step.
 */
#define CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)\
CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output)\
CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(real_output, input_i_1, input_i_2, output_i32, real_output_ps)\
CM_16IC_CONVERT_AND_ACC_32FC_U_SSE4_1(imag_output, input_i_1, input_i_2, output_i32, imag_output_ps)
#endif /* CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1 */
#endif /* LV_HAVE_SSE4_1 */
#ifdef LV_HAVE_SSE2
/*!
\brief Macros for U_SSE2
*/
#ifndef CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2
/*
 * SSE2-only variant of the 8-bit correlation step:
 *   1. y is split into realy/imagy by masking with mult1,
 *   2. (real_bb_signal_sample, imag_bb_signal_sample) is complex-multiplied
 *      by (realy, imagy) in wrapping 16-bit arithmetic,
 *   3. both products are masked with mult1, imag_output is shifted left by
 *      one byte and the two are OR-ed together into "output",
 *   4. "output" is converted into two float vectors, output_ps_1 (from its
 *      low eight bytes) and output_ps_2 (from its high eight bytes).
 * NOTE(review): "output" is NOT a macro parameter. The expansion assigns to a
 * variable literally named output, which must be a __m128i in scope at the
 * call site. It cannot be added to the parameter list without changing every
 * caller.
 * WARNING: real_output and imag_output are modified by step 3.
 */
#define CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)\
CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)\
CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(real_bb_signal_sample, imag_bb_signal_sample, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output)\
\
real_output = _mm_and_si128 (real_output, mult1);\
imag_output = _mm_and_si128 (imag_output, mult1);\
imag_output = _mm_slli_si128 (imag_output, 1);\
output = _mm_or_si128 (real_output, imag_output);\
\
CM_8IC_CONVERT_AND_ACC_32FC_U_SSE2(output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
#endif /* CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2 */
#endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_GENERIC
/*!
\brief Macros for U_GENERIC
*/
#endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_gnsssdr_CommonMacros_8ic_cw_corr_32fc_u_H */
#ifndef INCLUDED_gnsssdr_CommonMacros_8ic_cw_corr_32fc_a_H
#define INCLUDED_gnsssdr_CommonMacros_8ic_cw_corr_32fc_a_H
#ifdef LV_HAVE_SSE4_1
/*!
\brief Macros for A_SSE4_1
*/
#endif /* LV_HAVE_SSE4_1 */
#ifdef LV_HAVE_GENERIC
/*!
\brief Macros for A_GENERIC
*/
#endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_gnsssdr_CommonMacros_8ic_cw_corr_32fc_a_H */

View File

@ -0,0 +1,34 @@
####################################################################
Common Macros inside volk_gnsssdr module
####################################################################
First of all, sorry that you need to read this: macros are evil. They cannot be debugged, you do not know where the errors come from, and the syntax is annoying. BUT this is the only way I found to share one piece of code between various proto-kernels without a performance penalty.
Inline functions have been tested, and they introduce a really small time penalty, but it becomes huge because of long loops, with thousands of samples.
####################################################################
Syntax
####################################################################
In order to make the code easier to understand, I created the macros with a specific syntax.
1) Inside CommonMacros.h you will find macros for common operations. I will explain the syntax with an example:
example: CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output)
First of all, you find the characters “CM”, which stand for CommonMacros. After that come the type and the number of inputs: “_16IC_X4” (16-bit complex integers, four inputs). The syntax for the type is the same as the one used with volk protokernels; refer to the GNU Radio documentation for more help. Then comes the name of the macro (“_SCALAR_PRODUCT”), and after that the type and the number of outputs (“_16IC_X2”). Finally comes the minimum SSE version needed to run it (“_U_SSE2”). In the arguments you will find (from left to right) the inputs (four inputs: realx, imagx, realy, imagy), some variables that the macro needs to work (realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy) and finally the outputs (two outputs: real_output, imag_output).
The variables that the macro needs are specified when calling it in order to avoid after-compile problems: if you want to use a macro you will need to declare all the variables it needs before, or you will not be able to compile.
2) Inside all the other headers, CommonMacros_XXXXXX.h you will find macros for a specific group of proto-kernels. The syntax is the same as the CommonMacros.h
####################################################################
Workflow
####################################################################
In order to use the macros easily, I usually test the code without macros inside a testing proto-kernel, where you are able to test it, debug it and use breakpoints.
When it works, I place the code inside a macro and I test it again.
####################################################################
Why macros
####################################################################
1) They are the only way I could find for sharing code between proto-kernels without performance penalty.
2) It is true that they are really difficult to debug, but if you work with them responsibly it is not so hard. Volk_gnsssdr checks the results of all the SSE proto-kernel implementations against the results of the generic implementation, so if your macro is not working you will notice it after profiling.

View File

@ -0,0 +1,67 @@
########################################################################
# How to create custom kernel dispatchers
########################################################################
A kernel dispatcher is a kernel implementation that calls other kernel implementations.
By default, a dispatcher is generated by the build system for every kernel such that:
* the best aligned implementation is called when all pointer arguments are aligned,
* and otherwise the best unaligned implementation is called.
The author of a VOLK kernel may create a custom dispatcher,
to be called in place of the automatically generated one.
A custom dispatcher may be useful to handle head and tail cases,
or to implement different alignment and bounds checking logic.
########################################################################
# Code for an example dispatcher w/ tail case
########################################################################
#include <volk_gnsssdr/volk_gnsssdr_common.h>
#ifdef LV_HAVE_DISPATCHER
/* Example dispatcher: runs the bulk of the points through the aligned or
   unaligned kernel and the remaining points through the generic kernel. */
static inline void volk_gnsssdr_32f_x2_add_32f_dispatcher(float* cVector, const float* aVector, const float* bVector, unsigned int num_points)
{
/* num_points_x is the largest multiple of 4 not above num_points; num_points_r is the remainder (tail). */
const unsigned int num_points_r = num_points%4;
const unsigned int num_points_x = num_points - num_points_r;
/* The three pointers are combined with VOLK_OR_PTR so that a single alignment test covers all of them. */
if (volk_gnsssdr_is_aligned(VOLK_OR_PTR(cVector, VOLK_OR_PTR(aVector, bVector))))
{
volk_gnsssdr_32f_x2_add_32f_a(cVector, aVector, bVector, num_points_x);
}
else
{
volk_gnsssdr_32f_x2_add_32f_u(cVector, aVector, bVector, num_points_x);
}
/* Tail case: the last num_points_r points go through the generic kernel. */
volk_gnsssdr_32f_x2_add_32f_g(cVector+num_points_x, aVector+num_points_x, bVector+num_points_x, num_points_r);
}
#endif //LV_HAVE_DISPATCHER
########################################################################
# Code for an example dispatcher w/ tail case and accumulator
########################################################################
#include <volk_gnsssdr/volk_gnsssdr_common.h>
#ifdef LV_HAVE_DISPATCHER
/* Example dispatcher with an accumulator: the bulk of the points goes through
   the aligned or unaligned kernel, the remaining points through the generic
   kernel, and the two partial results are added together. */
static inline void volk_gnsssdr_32f_x2_dot_prod_32f_dispatcher(float * result, const float * input, const float * taps, unsigned int num_points)
{
/* num_points_x is the largest multiple of 16 not above num_points; num_points_r is the remainder (tail). */
const unsigned int num_points_r = num_points%16;
const unsigned int num_points_x = num_points - num_points_r;
/* Only the two input pointers are checked for alignment; result is not part of the test. */
if (volk_gnsssdr_is_aligned(VOLK_OR_PTR(input, taps)))
{
volk_gnsssdr_32f_x2_dot_prod_32f_a(result, input, taps, num_points_x);
}
else
{
volk_gnsssdr_32f_x2_dot_prod_32f_u(result, input, taps, num_points_x);
}
/* Tail case: computed into a separate accumulator, then added to the bulk result. */
float result_tail = 0;
volk_gnsssdr_32f_x2_dot_prod_32f_g(&result_tail, input+num_points_x, taps+num_points_x, num_points_r);
*result += result_tail;
}
#endif //LV_HAVE_DISPATCHER

View File

@ -0,0 +1,241 @@
#ifndef INCLUDED_volk_gnsssdr_16i_s32f_convert_32f_u_H
#define INCLUDED_volk_gnsssdr_16i_s32f_convert_32f_u_H
#include <inttypes.h>
#include <stdio.h>
#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>
/*!
\brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
\param inputVector The 16 bit input data buffer
\param outputVector The floating point output data buffer
\param scalar The value divided against each point in the output buffer
\param num_points The number of data values to be converted
\note Output buffer does NOT need to be properly aligned
*/
static inline void volk_gnsssdr_16i_s32f_convert_32f_u_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
unsigned int number = 0;
const unsigned int eighthPoints = num_points / 8;
float* outputVectorPtr = outputVector;
__m128 invScalar = _mm_set_ps1(1.0/scalar);
int16_t* inputPtr = (int16_t*)inputVector;
__m128i inputVal;
__m128i inputVal2;
__m128 ret;
for(;number < eighthPoints; number++){
// Load the 8 values
inputVal = _mm_loadu_si128((__m128i*)inputPtr);
// Shift the input data to the right by 64 bits ( 8 bytes )
inputVal2 = _mm_srli_si128(inputVal, 8);
// Convert the lower 4 values into 32 bit words
inputVal = _mm_cvtepi16_epi32(inputVal);
inputVal2 = _mm_cvtepi16_epi32(inputVal2);
ret = _mm_cvtepi32_ps(inputVal);
ret = _mm_mul_ps(ret, invScalar);
_mm_storeu_ps(outputVectorPtr, ret);
outputVectorPtr += 4;
ret = _mm_cvtepi32_ps(inputVal2);
ret = _mm_mul_ps(ret, invScalar);
_mm_storeu_ps(outputVectorPtr, ret);
outputVectorPtr += 4;
inputPtr += 8;
}
number = eighthPoints * 8;
for(; number < num_points; number++){
outputVector[number] =((float)(inputVector[number])) / scalar;
}
}
#endif /* LV_HAVE_SSE4_1 */
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>
/*!
\brief Converts 16 bit integer samples to floating point and divides every converted sample by the scalar value (SSE, unaligned buffers)
\param outputVector The floating point output data buffer
\param inputVector The 16 bit input data buffer
\param scalar The value every converted sample is divided by
\param num_points The number of data values to be converted
\note Output buffer does NOT need to be properly aligned
*/
static inline void volk_gnsssdr_16i_s32f_convert_32f_u_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
    const unsigned int blocks = num_points / 4;
    /* The vectorised part multiplies by the reciprocal instead of dividing */
    const __m128 scale = _mm_set_ps1(1.0/scalar);
    unsigned int i;

    /* Four samples per iteration; the integers are converted one by one and packed */
    for(i = 0; i < blocks; i++){
        const int16_t* src = inputVector + 4 * i;
        const __m128 samples = _mm_set_ps((float)(src[3]), (float)(src[2]), (float)(src[1]), (float)(src[0]));
        _mm_storeu_ps(outputVector + 4 * i, _mm_mul_ps(samples, scale));
    }

    /* Remaining (num_points % 4) samples, one at a time */
    for(i = blocks * 4; i < num_points; i++){
        outputVector[i] = (float)(inputVector[i]) / scalar;
    }
}
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_GENERIC
/*!
\brief Converts 16 bit integer samples to floating point and divides every converted sample by the scalar value (portable implementation)
\param outputVector The floating point output data buffer
\param inputVector The 16 bit input data buffer
\param scalar The value every converted sample is divided by
\param num_points The number of data values to be converted
\note Output buffer does NOT need to be properly aligned
*/
static inline void volk_gnsssdr_16i_s32f_convert_32f_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
    unsigned int i;
    for(i = 0; i < num_points; i++){
        outputVector[i] = ((float)(inputVector[i])) / scalar;
    }
}
#endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_volk_gnsssdr_16i_s32f_convert_32f_u_H */
#ifndef INCLUDED_volk_gnsssdr_16i_s32f_convert_32f_a_H
#define INCLUDED_volk_gnsssdr_16i_s32f_convert_32f_a_H
#include <inttypes.h>
#include <stdio.h>
#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>
/*!
\brief Converts the input 16 bit integer data into floating point data, and divides the each floating point output data point by the scalar value
\param inputVector The 16 bit input data buffer
\param outputVector The floating point output data buffer
\param scalar The value divided against each point in the output buffer
\param num_points The number of data values to be converted
*/
static inline void volk_gnsssdr_16i_s32f_convert_32f_a_sse4_1(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
unsigned int number = 0;
const unsigned int eighthPoints = num_points / 8;
float* outputVectorPtr = outputVector;
__m128 invScalar = _mm_set_ps1(1.0/scalar);
int16_t* inputPtr = (int16_t*)inputVector;
__m128i inputVal;
__m128i inputVal2;
__m128 ret;
for(;number < eighthPoints; number++){
// Load the 8 values
inputVal = _mm_loadu_si128((__m128i*)inputPtr);
// Shift the input data to the right by 64 bits ( 8 bytes )
inputVal2 = _mm_srli_si128(inputVal, 8);
// Convert the lower 4 values into 32 bit words
inputVal = _mm_cvtepi16_epi32(inputVal);
inputVal2 = _mm_cvtepi16_epi32(inputVal2);
ret = _mm_cvtepi32_ps(inputVal);
ret = _mm_mul_ps(ret, invScalar);
_mm_storeu_ps(outputVectorPtr, ret);
outputVectorPtr += 4;
ret = _mm_cvtepi32_ps(inputVal2);
ret = _mm_mul_ps(ret, invScalar);
_mm_storeu_ps(outputVectorPtr, ret);
outputVectorPtr += 4;
inputPtr += 8;
}
number = eighthPoints * 8;
for(; number < num_points; number++){
outputVector[number] =((float)(inputVector[number])) / scalar;
}
}
#endif /* LV_HAVE_SSE4_1 */
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>
/*!
\brief Converts 16 bit integer samples to floating point and divides every converted sample by the scalar value (SSE, aligned kernel)
\param outputVector The floating point output data buffer
\param inputVector The 16 bit input data buffer
\param scalar The value every converted sample is divided by
\param num_points The number of data values to be converted
\note This implementation still uses the unaligned store instruction
*/
static inline void volk_gnsssdr_16i_s32f_convert_32f_a_sse(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
    /* The vectorised part multiplies by the reciprocal instead of dividing */
    const __m128 reciprocal = _mm_set_ps1(1.0/scalar);
    const unsigned int vectorisedPoints = (num_points / 4) * 4;
    unsigned int idx = 0;

    /* Four samples per iteration; the integers are converted one by one and packed */
    while(idx < vectorisedPoints){
        const __m128 quad = _mm_set_ps((float)(inputVector[idx + 3]), (float)(inputVector[idx + 2]), (float)(inputVector[idx + 1]), (float)(inputVector[idx]));
        _mm_storeu_ps(outputVector + idx, _mm_mul_ps(quad, reciprocal));
        idx += 4;
    }

    /* Remaining (num_points % 4) samples, one at a time */
    for(; idx < num_points; idx++){
        outputVector[idx] = (float)(inputVector[idx]) / scalar;
    }
}
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_GENERIC
/*!
\brief Converts 16 bit integer samples to floating point and divides every converted sample by the scalar value (portable implementation, aligned kernel name)
\param outputVector The floating point output data buffer
\param inputVector The 16 bit input data buffer
\param scalar The value every converted sample is divided by
\param num_points The number of data values to be converted
*/
static inline void volk_gnsssdr_16i_s32f_convert_32f_a_generic(float* outputVector, const int16_t* inputVector, const float scalar, unsigned int num_points){
    const int16_t* src = inputVector;
    const int16_t* const srcEnd = inputVector + num_points;
    float* dst = outputVector;

    while(src != srcEnd){
        *dst = ((float)(*src)) / scalar;
        ++dst;
        ++src;
    }
}
#endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_volk_gnsssdr_16i_s32f_convert_32f_a_H */

View File

@ -0,0 +1,461 @@
/*!
* \file volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3.h
* \brief Volk protokernel: performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation with 32 bits vectors
* \authors <ul>
* <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
* </ul>
*
* Volk protokernel that performs the carrier wipe-off mixing and the
* Early, Prompt, and Late correlation with 32 bits vectors (16 bits the
* real part and 16 bits the imaginary part):
* - The carrier wipe-off is done by multiplying the input signal by the
* carrier (multiplication of 32 bits vectors) It returns the input
* signal in base band (BB)
* - Early values are calculated by multiplying the input signal in BB by the
* early code (multiplication of 32 bits vectors), accumulating the results
* - Prompt values are calculated by multiplying the input signal in BB by the
* prompt code (multiplication of 32 bits vectors), accumulating the results
* - Late values are calculated by multiplying the input signal in BB by the
* late code (multiplication of 32 bits vectors), accumulating the results
*
* -------------------------------------------------------------------------
*
* Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
*
* GNSS-SDR is a software defined Global Navigation
* Satellite Systems receiver
*
* This file is part of GNSS-SDR.
*
* GNSS-SDR is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* GNSS-SDR is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
*
* -------------------------------------------------------------------------
*/
#ifndef INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_u_H
#define INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_u_H
#include <inttypes.h>
#include <stdio.h>
#include <volk_gnsssdr/volk_gnsssdr_complex.h>
#include <float.h>
#include <string.h>
#ifdef LV_HAVE_SSE4_1
#include "smmintrin.h"
#include "CommonMacros/CommonMacros_16ic_cw_epl_corr_32fc.h"
#include "CommonMacros/CommonMacros.h"
/*!
\brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation
\param input The input signal input
\param carrier The carrier signal input
\param E_code Early PRN code replica input
\param P_code Prompt PRN code replica input
\param L_code Late PRN code replica input
\param E_out Early correlation output
\param P_out Prompt correlation output
\param L_out Late correlation output
\param num_points The number of complex values in vectors
*/
static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_u_sse4_1(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points)
{
const unsigned int sse_iters = num_points / 8;
__m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample;
__m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output;
__m128 real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc;
__m128i input_i_1, input_i_2, output_i32;
__m128 real_output_ps, imag_output_ps;
float E_out_real = 0;
float E_out_imag = 0;
float P_out_real = 0;
float P_out_imag = 0;
float L_out_real = 0;
float L_out_imag = 0;
const lv_16sc_t* input_ptr = input;
const lv_16sc_t* carrier_ptr = carrier;
const lv_16sc_t* E_code_ptr = E_code;
lv_32fc_t* E_out_ptr = E_out;
const lv_16sc_t* L_code_ptr = L_code;
lv_32fc_t* L_out_ptr = L_out;
const lv_16sc_t* P_code_ptr = P_code;
lv_32fc_t* P_out_ptr = P_out;
*E_out_ptr = 0;
*P_out_ptr = 0;
*L_out_ptr = 0;
mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
real_E_code_acc = _mm_setzero_ps();
imag_E_code_acc = _mm_setzero_ps();
real_P_code_acc = _mm_setzero_ps();
imag_P_code_acc = _mm_setzero_ps();
real_L_code_acc = _mm_setzero_ps();
imag_L_code_acc = _mm_setzero_ps();
if (sse_iters>0)
{
for(int number = 0;number < sse_iters; number++){
//Perform the carrier wipe-off
x1 = _mm_lddqu_si128((__m128i*)input_ptr);
input_ptr += 4;
x2 = _mm_lddqu_si128((__m128i*)input_ptr);
y1 = _mm_lddqu_si128((__m128i*)carrier_ptr);
carrier_ptr += 4;
y2 = _mm_lddqu_si128((__m128i*)carrier_ptr);
CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(x1, x2, realx, imagx)
CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy)
CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
//Get early values
y1 = _mm_lddqu_si128((__m128i*)E_code_ptr);
E_code_ptr += 4;
y2 = _mm_lddqu_si128((__m128i*)E_code_ptr);
CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
//Adds the float 32 results
real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps);
imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps);
//Get prompt values
y1 = _mm_lddqu_si128((__m128i*)P_code_ptr);
P_code_ptr += 4;
y2 = _mm_lddqu_si128((__m128i*)P_code_ptr);
CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps);
imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps);
//Get late values
y1 = _mm_lddqu_si128((__m128i*)L_code_ptr);
L_code_ptr += 4;
y2 = _mm_lddqu_si128((__m128i*)L_code_ptr);
CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps);
imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps);
input_ptr += 4;
carrier_ptr += 4;
E_code_ptr += 4;
P_code_ptr += 4;
L_code_ptr += 4;
}
__VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
_mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector
for (int i = 0; i<4; ++i)
{
E_out_real += real_E_dotProductVector[i];
E_out_imag += imag_E_dotProductVector[i];
P_out_real += real_P_dotProductVector[i];
P_out_imag += imag_P_dotProductVector[i];
L_out_real += real_L_dotProductVector[i];
L_out_imag += imag_L_dotProductVector[i];
}
*E_out_ptr = lv_cmake(E_out_real, E_out_imag);
*P_out_ptr = lv_cmake(P_out_real, P_out_imag);
*L_out_ptr = lv_cmake(L_out_real, L_out_imag);
}
lv_16sc_t bb_signal_sample;
for(int i=0; i < num_points%8; ++i)
{
//Perform the carrier wipe-off
bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
// Now get early, late, and prompt values for each
*E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
*P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
*L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
}
}
#endif /* LV_HAVE_SSE4_1 */
#ifdef LV_HAVE_GENERIC
/*!
    \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation
           (portable scalar implementation)
    \param input The input signal input
    \param carrier The carrier signal input
    \param E_code Early PRN code replica input
    \param P_code Prompt PRN code replica input
    \param L_code Late PRN code replica input
    \param E_out Early correlation output
    \param P_out Prompt correlation output
    \param L_out Late correlation output
    \param num_points The number of complex values in vectors
*/
static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_generic(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points)
{
    lv_16sc_t bb_signal_sample;
    lv_16sc_t tmp1;
    lv_16sc_t tmp2;
    lv_16sc_t tmp3;

    // The outputs are always written, even when num_points is 0
    *E_out = 0;
    *P_out = 0;
    *L_out = 0;

    // perform Early, Prompt and Late correlation
    // (the index has the same unsigned type as num_points, so the comparison is exact for every count)
    for (unsigned int i = 0; i < num_points; ++i)
    {
        //Perform the carrier wipe-off
        bb_signal_sample = input[i] * carrier[i];

        // Each product is stored in an lv_16sc_t temporary before being converted to lv_32fc_t
        tmp1 = bb_signal_sample * E_code[i];
        tmp2 = bb_signal_sample * P_code[i];
        tmp3 = bb_signal_sample * L_code[i];

        // Now accumulate the early, prompt, and late values
        *E_out += (lv_32fc_t)tmp1;
        *P_out += (lv_32fc_t)tmp2;
        *L_out += (lv_32fc_t)tmp3;
    }
}
#endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_u_H */
#ifndef INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_a_H
#define INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_a_H
#include <inttypes.h>
#include <stdio.h>
#include <volk_gnsssdr/volk_gnsssdr_complex.h>
#include <float.h>
#include <string.h>
#ifdef LV_HAVE_SSE4_1
#include "smmintrin.h"
#include "CommonMacros/CommonMacros_16ic_cw_epl_corr_32fc.h"
#include "CommonMacros/CommonMacros.h"
/*!
    \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation
           (SSE4.1 implementation using aligned loads: the input vectors must be 16-byte aligned)
    \param input The input signal input
    \param carrier The carrier signal input
    \param E_code Early PRN code replica input
    \param P_code Prompt PRN code replica input
    \param L_code Late PRN code replica input
    \param E_out Early correlation output
    \param P_out Prompt correlation output
    \param L_out Late correlation output
    \param num_points The number of complex values in vectors
*/
static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_a_sse4_1(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points)
{
    // One SIMD iteration consumes 8 complex samples of every stream:
    // two 128-bit loads, each followed by a pointer advance of 4 elements.
    const unsigned int sse_iters = num_points / 8;

    __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample;
    __m128i realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output;
    __m128 real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc;
    __m128i input_i_1, input_i_2, output_i32;
    __m128 real_output_ps, imag_output_ps;

    float E_out_real = 0;
    float E_out_imag = 0;
    float P_out_real = 0;
    float P_out_imag = 0;
    float L_out_real = 0;
    float L_out_imag = 0;

    const lv_16sc_t* input_ptr = input;
    const lv_16sc_t* carrier_ptr = carrier;

    const lv_16sc_t* E_code_ptr = E_code;
    lv_32fc_t* E_out_ptr = E_out;
    const lv_16sc_t* L_code_ptr = L_code;
    lv_32fc_t* L_out_ptr = L_out;
    const lv_16sc_t* P_code_ptr = P_code;
    lv_32fc_t* P_out_ptr = P_out;

    // The outputs are always written, even when num_points is 0
    *E_out_ptr = 0;
    *P_out_ptr = 0;
    *L_out_ptr = 0;

    real_E_code_acc = _mm_setzero_ps();
    imag_E_code_acc = _mm_setzero_ps();
    real_P_code_acc = _mm_setzero_ps();
    imag_P_code_acc = _mm_setzero_ps();
    real_L_code_acc = _mm_setzero_ps();
    imag_L_code_acc = _mm_setzero_ps();

    if (sse_iters > 0)
    {
        for (unsigned int number = 0; number < sse_iters; number++)
        {
            //Perform the carrier wipe-off
            x1 = _mm_load_si128((const __m128i*)input_ptr);
            input_ptr += 4;
            x2 = _mm_load_si128((const __m128i*)input_ptr);

            y1 = _mm_load_si128((const __m128i*)carrier_ptr);
            carrier_ptr += 4;
            y2 = _mm_load_si128((const __m128i*)carrier_ptr);

            // NOTE(review): the CM_* macros are defined in CommonMacros/*.h (not visible here);
            // judging by their names they split the vectors into real/imag parts and multiply
            // input by carrier, leaving the result in real/imag_bb_signal_sample.
            CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(x1, x2, realx, imagx)
            CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy)
            CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)

            //Get early values
            y1 = _mm_load_si128((const __m128i*)E_code_ptr);
            E_code_ptr += 4;
            y2 = _mm_load_si128((const __m128i*)E_code_ptr);

            CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)

            //Adds the float 32 results
            real_E_code_acc = _mm_add_ps(real_E_code_acc, real_output_ps);
            imag_E_code_acc = _mm_add_ps(imag_E_code_acc, imag_output_ps);

            //Get prompt values
            y1 = _mm_load_si128((const __m128i*)P_code_ptr);
            P_code_ptr += 4;
            y2 = _mm_load_si128((const __m128i*)P_code_ptr);

            CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)

            real_P_code_acc = _mm_add_ps(real_P_code_acc, real_output_ps);
            imag_P_code_acc = _mm_add_ps(imag_P_code_acc, imag_output_ps);

            //Get late values
            y1 = _mm_load_si128((const __m128i*)L_code_ptr);
            L_code_ptr += 4;
            y2 = _mm_load_si128((const __m128i*)L_code_ptr);

            CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)

            real_L_code_acc = _mm_add_ps(real_L_code_acc, real_output_ps);
            imag_L_code_acc = _mm_add_ps(imag_L_code_acc, imag_output_ps);

            // Step over the second half (4 samples) that was loaded into x2 / y2
            input_ptr += 4;
            carrier_ptr += 4;
            E_code_ptr += 4;
            P_code_ptr += 4;
            L_code_ptr += 4;
        }

        __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
        __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
        __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
        __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
        __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
        __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];

        // Spill the six accumulators so their four lanes can be summed in scalar code
        _mm_store_ps((float*)real_E_dotProductVector, real_E_code_acc);
        _mm_store_ps((float*)imag_E_dotProductVector, imag_E_code_acc);
        _mm_store_ps((float*)real_P_dotProductVector, real_P_code_acc);
        _mm_store_ps((float*)imag_P_dotProductVector, imag_P_code_acc);
        _mm_store_ps((float*)real_L_dotProductVector, real_L_code_acc);
        _mm_store_ps((float*)imag_L_dotProductVector, imag_L_code_acc);

        for (unsigned int i = 0; i < 4; ++i)
        {
            E_out_real += real_E_dotProductVector[i];
            E_out_imag += imag_E_dotProductVector[i];
            P_out_real += real_P_dotProductVector[i];
            P_out_imag += imag_P_dotProductVector[i];
            L_out_real += real_L_dotProductVector[i];
            L_out_imag += imag_L_dotProductVector[i];
        }

        *E_out_ptr = lv_cmake(E_out_real, E_out_imag);
        *P_out_ptr = lv_cmake(P_out_real, P_out_imag);
        *L_out_ptr = lv_cmake(L_out_real, L_out_imag);
    }

    // Scalar tail: the num_points % 8 samples that did not fill a whole SIMD iteration
    lv_16sc_t bb_signal_sample;
    for (unsigned int i = 0; i < num_points % 8; ++i)
    {
        //Perform the carrier wipe-off
        bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
        // Now get early, prompt, and late values for each
        *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
        *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
        *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
    }
}
#endif /* LV_HAVE_SSE4_1 */
#ifdef LV_HAVE_GENERIC
/*!
    \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation
           (portable scalar implementation, aligned-kernel entry point)
    \param input The input signal input
    \param carrier The carrier signal input
    \param E_code Early PRN code replica input
    \param P_code Prompt PRN code replica input
    \param L_code Late PRN code replica input
    \param E_out Early correlation output
    \param P_out Prompt correlation output
    \param L_out Late correlation output
    \param num_points The number of complex values in vectors
*/
static inline void volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_a_generic(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, unsigned int num_points)
{
    lv_16sc_t bb_signal_sample;
    lv_16sc_t tmp1;
    lv_16sc_t tmp2;
    lv_16sc_t tmp3;

    // The outputs are always written, even when num_points is 0
    *E_out = 0;
    *P_out = 0;
    *L_out = 0;

    // perform Early, Prompt and Late correlation
    // (the index has the same unsigned type as num_points, so the comparison is exact for every count)
    for (unsigned int i = 0; i < num_points; ++i)
    {
        //Perform the carrier wipe-off
        bb_signal_sample = input[i] * carrier[i];

        // Each product is stored in an lv_16sc_t temporary before being converted to lv_32fc_t
        tmp1 = bb_signal_sample * E_code[i];
        tmp2 = bb_signal_sample * P_code[i];
        tmp3 = bb_signal_sample * L_code[i];

        // Now accumulate the early, prompt, and late values
        *E_out += (lv_32fc_t)tmp1;
        *P_out += (lv_32fc_t)tmp2;
        *L_out += (lv_32fc_t)tmp3;
    }
}
#endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_gnsssdr_volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3_a_H */

View File

@ -0,0 +1,595 @@
/*!
* \file volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5.h
* \brief Volk protokernel: performs the carrier wipe-off mixing and the Very early, Early, Prompt, Late and very late correlation with 32 bits vectors and returns float32 values.
* \authors <ul>
* <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
* </ul>
*
* Volk protokernel that performs the carrier wipe-off mixing and the
* Very Early, Early, Prompt, Late and Very Late correlation with 32 bits vectors (16 bits the
* real part and 16 bits the imaginary part) and accumulates into float32 values, returning them:
* - The carrier wipe-off is done by multiplying the input signal by the
* carrier (multiplication of 32 bits vectors). It returns the input
* signal in base band (BB)
* - Very Early values are calculated by multiplying the input signal in BB by the
* very early code (multiplication of 32 bits vectors), converting that to float32 and accumulating the results
* - Early values are calculated by multiplying the input signal in BB by the
* early code (multiplication of 32 bits vectors), converting that to float32 and accumulating the results
* - Prompt values are calculated by multiplying the input signal in BB by the
* prompt code (multiplication of 32 bits vectors), converting that to float32 and accumulating the results
* - Late values are calculated by multiplying the input signal in BB by the
* late code (multiplication of 32 bits vectors), converting that to float32 and accumulating the results
* - Very Late values are calculated by multiplying the input signal in BB by the
* very late code (multiplication of 32 bits vectors), converting that to float32 and accumulating the results
*
* -------------------------------------------------------------------------
*
* Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
*
* GNSS-SDR is a software defined Global Navigation
* Satellite Systems receiver
*
* This file is part of GNSS-SDR.
*
* GNSS-SDR is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* GNSS-SDR is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
*
* -------------------------------------------------------------------------
*/
#ifndef INCLUDED_gnsssdr_volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_u_H
#define INCLUDED_gnsssdr_volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_u_H
#include <inttypes.h>
#include <stdio.h>
#include <volk_gnsssdr/volk_gnsssdr_complex.h>
#include <float.h>
#include <string.h>
#ifdef LV_HAVE_SSE4_1
#include "smmintrin.h"
#include "CommonMacros/CommonMacros_16ic_cw_epl_corr_32fc.h"
#include "CommonMacros/CommonMacros.h"
/*!
    \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation
           (SSE4.1 implementation using unaligned loads)
    \param input The input signal input
    \param carrier The carrier signal input
    \param VE_code Very Early PRN code replica input
    \param E_code Early PRN code replica input
    \param P_code Prompt PRN code replica input
    \param L_code Late PRN code replica input
    \param VL_code Very Late PRN code replica input
    \param VE_out Very Early correlation output
    \param E_out Early correlation output
    \param P_out Prompt correlation output
    \param L_out Late correlation output
    \param VL_out Very Late correlation output
    \param num_points The number of complex values in vectors
*/
static inline void volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_u_sse4_1(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* VE_code, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, const lv_16sc_t* VL_code, unsigned int num_points)
{
    // One SIMD iteration consumes 8 complex samples of every stream:
    // two 128-bit loads, each followed by a pointer advance of 4 elements.
    const unsigned int sse_iters = num_points / 8;

    __m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample;
    __m128i realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output;
    __m128 real_VE_code_acc, imag_VE_code_acc, real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc, real_VL_code_acc, imag_VL_code_acc;
    __m128i input_i_1, input_i_2, output_i32;
    __m128 real_output_ps, imag_output_ps;

    float VE_out_real = 0;
    float VE_out_imag = 0;
    float E_out_real = 0;
    float E_out_imag = 0;
    float P_out_real = 0;
    float P_out_imag = 0;
    float L_out_real = 0;
    float L_out_imag = 0;
    float VL_out_real = 0;
    float VL_out_imag = 0;

    const lv_16sc_t* input_ptr = input;
    const lv_16sc_t* carrier_ptr = carrier;

    const lv_16sc_t* VE_code_ptr = VE_code;
    lv_32fc_t* VE_out_ptr = VE_out;
    const lv_16sc_t* E_code_ptr = E_code;
    lv_32fc_t* E_out_ptr = E_out;
    const lv_16sc_t* L_code_ptr = L_code;
    lv_32fc_t* L_out_ptr = L_out;
    const lv_16sc_t* P_code_ptr = P_code;
    lv_32fc_t* P_out_ptr = P_out;
    const lv_16sc_t* VL_code_ptr = VL_code;
    lv_32fc_t* VL_out_ptr = VL_out;

    // The outputs are always written, even when num_points is 0
    *VE_out_ptr = 0;
    *E_out_ptr = 0;
    *P_out_ptr = 0;
    *L_out_ptr = 0;
    *VL_out_ptr = 0;

    real_VE_code_acc = _mm_setzero_ps();
    imag_VE_code_acc = _mm_setzero_ps();
    real_E_code_acc = _mm_setzero_ps();
    imag_E_code_acc = _mm_setzero_ps();
    real_P_code_acc = _mm_setzero_ps();
    imag_P_code_acc = _mm_setzero_ps();
    real_L_code_acc = _mm_setzero_ps();
    imag_L_code_acc = _mm_setzero_ps();
    real_VL_code_acc = _mm_setzero_ps();
    imag_VL_code_acc = _mm_setzero_ps();

    if (sse_iters > 0)
    {
        for (unsigned int number = 0; number < sse_iters; number++)
        {
            //Perform the carrier wipe-off
            x1 = _mm_lddqu_si128((const __m128i*)input_ptr);
            input_ptr += 4;
            x2 = _mm_lddqu_si128((const __m128i*)input_ptr);

            y1 = _mm_lddqu_si128((const __m128i*)carrier_ptr);
            carrier_ptr += 4;
            y2 = _mm_lddqu_si128((const __m128i*)carrier_ptr);

            // NOTE(review): the CM_* macros are defined in CommonMacros/*.h (not visible here);
            // judging by their names they split the vectors into real/imag parts and multiply
            // input by carrier, leaving the result in real/imag_bb_signal_sample.
            CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(x1, x2, realx, imagx)
            CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy)
            CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)

            //Get very early values
            y1 = _mm_lddqu_si128((const __m128i*)VE_code_ptr);
            VE_code_ptr += 4;
            y2 = _mm_lddqu_si128((const __m128i*)VE_code_ptr);

            CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)

            real_VE_code_acc = _mm_add_ps(real_VE_code_acc, real_output_ps);
            imag_VE_code_acc = _mm_add_ps(imag_VE_code_acc, imag_output_ps);

            //Get early values
            y1 = _mm_lddqu_si128((const __m128i*)E_code_ptr);
            E_code_ptr += 4;
            y2 = _mm_lddqu_si128((const __m128i*)E_code_ptr);

            CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)

            real_E_code_acc = _mm_add_ps(real_E_code_acc, real_output_ps);
            imag_E_code_acc = _mm_add_ps(imag_E_code_acc, imag_output_ps);

            //Get prompt values
            y1 = _mm_lddqu_si128((const __m128i*)P_code_ptr);
            P_code_ptr += 4;
            y2 = _mm_lddqu_si128((const __m128i*)P_code_ptr);

            CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)

            real_P_code_acc = _mm_add_ps(real_P_code_acc, real_output_ps);
            imag_P_code_acc = _mm_add_ps(imag_P_code_acc, imag_output_ps);

            //Get late values
            y1 = _mm_lddqu_si128((const __m128i*)L_code_ptr);
            L_code_ptr += 4;
            y2 = _mm_lddqu_si128((const __m128i*)L_code_ptr);

            CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)

            real_L_code_acc = _mm_add_ps(real_L_code_acc, real_output_ps);
            imag_L_code_acc = _mm_add_ps(imag_L_code_acc, imag_output_ps);

            //Get very late values
            y1 = _mm_lddqu_si128((const __m128i*)VL_code_ptr);
            VL_code_ptr += 4;
            y2 = _mm_lddqu_si128((const __m128i*)VL_code_ptr);

            CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)

            real_VL_code_acc = _mm_add_ps(real_VL_code_acc, real_output_ps);
            imag_VL_code_acc = _mm_add_ps(imag_VL_code_acc, imag_output_ps);

            // Step over the second half (4 samples) that was loaded into x2 / y2
            input_ptr += 4;
            carrier_ptr += 4;
            VE_code_ptr += 4;
            E_code_ptr += 4;
            P_code_ptr += 4;
            L_code_ptr += 4;
            VL_code_ptr += 4;
        }

        __VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4];
        __VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4];
        __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
        __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
        __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
        __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
        __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
        __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
        __VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4];
        __VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4];

        // Spill the ten accumulators so their four lanes can be summed in scalar code
        _mm_storeu_ps((float*)real_VE_dotProductVector, real_VE_code_acc);
        _mm_storeu_ps((float*)imag_VE_dotProductVector, imag_VE_code_acc);
        _mm_storeu_ps((float*)real_E_dotProductVector, real_E_code_acc);
        _mm_storeu_ps((float*)imag_E_dotProductVector, imag_E_code_acc);
        _mm_storeu_ps((float*)real_P_dotProductVector, real_P_code_acc);
        _mm_storeu_ps((float*)imag_P_dotProductVector, imag_P_code_acc);
        _mm_storeu_ps((float*)real_L_dotProductVector, real_L_code_acc);
        _mm_storeu_ps((float*)imag_L_dotProductVector, imag_L_code_acc);
        _mm_storeu_ps((float*)real_VL_dotProductVector, real_VL_code_acc);
        _mm_storeu_ps((float*)imag_VL_dotProductVector, imag_VL_code_acc);

        for (unsigned int i = 0; i < 4; ++i)
        {
            VE_out_real += real_VE_dotProductVector[i];
            VE_out_imag += imag_VE_dotProductVector[i];
            E_out_real += real_E_dotProductVector[i];
            E_out_imag += imag_E_dotProductVector[i];
            P_out_real += real_P_dotProductVector[i];
            P_out_imag += imag_P_dotProductVector[i];
            L_out_real += real_L_dotProductVector[i];
            L_out_imag += imag_L_dotProductVector[i];
            VL_out_real += real_VL_dotProductVector[i];
            VL_out_imag += imag_VL_dotProductVector[i];
        }

        *VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag);
        *E_out_ptr = lv_cmake(E_out_real, E_out_imag);
        *P_out_ptr = lv_cmake(P_out_real, P_out_imag);
        *L_out_ptr = lv_cmake(L_out_real, L_out_imag);
        *VL_out_ptr = lv_cmake(VL_out_real, VL_out_imag);
    }

    // Scalar tail: the num_points % 8 samples that did not fill a whole SIMD iteration
    lv_16sc_t bb_signal_sample;
    for (unsigned int i = 0; i < num_points % 8; ++i)
    {
        //Perform the carrier wipe-off
        bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
        // Now get very early, early, prompt, late, and very late values for each
        *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VE_code_ptr++));
        *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
        *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
        *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
        *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VL_code_ptr++));
    }
}
#endif /* LV_HAVE_SSE4_1 */
#ifdef LV_HAVE_GENERIC
/*!
    \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation
           (portable scalar implementation)
    \param input The input signal input
    \param carrier The carrier signal input
    \param VE_code Very Early PRN code replica input
    \param E_code Early PRN code replica input
    \param P_code Prompt PRN code replica input
    \param L_code Late PRN code replica input
    \param VL_code Very Late PRN code replica input
    \param VE_out Very Early correlation output
    \param E_out Early correlation output
    \param P_out Prompt correlation output
    \param L_out Late correlation output
    \param VL_out Very Late correlation output
    \param num_points The number of complex values in vectors
*/
static inline void volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* VE_code, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, const lv_16sc_t* VL_code, unsigned int num_points)
{
    lv_16sc_t bb_signal_sample;
    lv_16sc_t tmp1;
    lv_16sc_t tmp2;
    lv_16sc_t tmp3;
    lv_16sc_t tmp4;
    lv_16sc_t tmp5;

    // The outputs are always written, even when num_points is 0
    *VE_out = 0;
    *E_out = 0;
    *P_out = 0;
    *L_out = 0;
    *VL_out = 0;

    // perform Very Early, Early, Prompt, Late and Very Late correlation
    // (the index has the same unsigned type as num_points, so the comparison is exact for every count)
    for (unsigned int i = 0; i < num_points; ++i)
    {
        //Perform the carrier wipe-off
        bb_signal_sample = input[i] * carrier[i];

        // Each product is stored in an lv_16sc_t temporary before being converted to lv_32fc_t
        tmp1 = bb_signal_sample * VE_code[i];
        tmp2 = bb_signal_sample * E_code[i];
        tmp3 = bb_signal_sample * P_code[i];
        tmp4 = bb_signal_sample * L_code[i];
        tmp5 = bb_signal_sample * VL_code[i];

        // Now accumulate the very early, early, prompt, late, and very late values
        *VE_out += (lv_32fc_t)tmp1;
        *E_out += (lv_32fc_t)tmp2;
        *P_out += (lv_32fc_t)tmp3;
        *L_out += (lv_32fc_t)tmp4;
        *VL_out += (lv_32fc_t)tmp5;
    }
}
#endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_gnsssdr_volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_u_H */
#ifndef INCLUDED_gnsssdr_volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_a_H
#define INCLUDED_gnsssdr_volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_a_H
#include <inttypes.h>
#include <stdio.h>
#include <volk_gnsssdr/volk_gnsssdr_complex.h>
#include <float.h>
#include <string.h>
#ifdef LV_HAVE_SSE4_1
#include "smmintrin.h"
#include "CommonMacros/CommonMacros_16ic_cw_epl_corr_32fc.h"
#include "CommonMacros/CommonMacros.h"
/*!
\brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation
\param input The input signal input
\param carrier The carrier signal input
\param VE_code Very Early PRN code replica input
\param E_code Early PRN code replica input
\param P_code Prompt PRN code replica input
\param L_code Late PRN code replica input
\param VL_code Very Late PRN code replica input
\param VE_out Very Early correlation output
\param E_out Early correlation output
\param P_out Prompt correlation output
\param L_out Late correlation output
\param VL_out Very Late correlation output
\param num_points The number of complex values in vectors
*/
static inline void volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_a_sse4_1(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* VE_code, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, const lv_16sc_t* VL_code, unsigned int num_points)
{
// Overview: for every sample the input is multiplied by the carrier (carrier
// wipe-off) and the product is then multiplied by each of the five code
// replicas (VE, E, P, L, VL). The five sums of products are written to
// *VE_out, *E_out, *P_out, *L_out and *VL_out.
// The vector loop consumes 8 elements of every input array per iteration;
// the remaining num_points % 8 elements are handled by the scalar loop at the end.
// All vector loads use _mm_load_si128, which requires 16-byte aligned addresses.
const unsigned int sse_iters = num_points / 8;
__m128i x1, x2, y1, y2, real_bb_signal_sample, imag_bb_signal_sample;
__m128i realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output;
__m128 real_VE_code_acc, imag_VE_code_acc, real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc, real_VL_code_acc, imag_VL_code_acc;
__m128i input_i_1, input_i_2, output_i32;
__m128 real_output_ps, imag_output_ps;
// Scalar totals obtained by summing the four lanes of each vector accumulator
float VE_out_real = 0;
float VE_out_imag = 0;
float E_out_real = 0;
float E_out_imag = 0;
float P_out_real = 0;
float P_out_imag = 0;
float L_out_real = 0;
float L_out_imag = 0;
float VL_out_real = 0;
float VL_out_imag = 0;
// Working cursors over the inputs and aliases for the five output locations
const lv_16sc_t* input_ptr = input;
const lv_16sc_t* carrier_ptr = carrier;
const lv_16sc_t* VE_code_ptr = VE_code;
lv_32fc_t* VE_out_ptr = VE_out;
const lv_16sc_t* E_code_ptr = E_code;
lv_32fc_t* E_out_ptr = E_out;
const lv_16sc_t* L_code_ptr = L_code;
lv_32fc_t* L_out_ptr = L_out;
const lv_16sc_t* P_code_ptr = P_code;
lv_32fc_t* P_out_ptr = P_out;
const lv_16sc_t* VL_code_ptr = VL_code;
lv_32fc_t* VL_out_ptr = VL_out;
// Outputs start at zero so that the scalar tail loop can accumulate into them
// even when the vector loop does not run (num_points < 8)
*VE_out_ptr = 0;
*E_out_ptr = 0;
*P_out_ptr = 0;
*L_out_ptr = 0;
*VL_out_ptr = 0;
// One real and one imaginary 4-lane float accumulator per correlator
real_VE_code_acc = _mm_setzero_ps();
imag_VE_code_acc = _mm_setzero_ps();
real_E_code_acc = _mm_setzero_ps();
imag_E_code_acc = _mm_setzero_ps();
real_P_code_acc = _mm_setzero_ps();
imag_P_code_acc = _mm_setzero_ps();
real_L_code_acc = _mm_setzero_ps();
imag_L_code_acc = _mm_setzero_ps();
real_VL_code_acc = _mm_setzero_ps();
imag_VL_code_acc = _mm_setzero_ps();
if (sse_iters>0)
{
for(int number = 0;number < sse_iters; number++){
//Perform the carrier wipe-off
// Each array is read with two consecutive 128-bit loads; the pointer is moved
// by 4 elements between the loads and by another 4 at the bottom of the loop.
x1 = _mm_load_si128((__m128i*)input_ptr);
input_ptr += 4;
x2 = _mm_load_si128((__m128i*)input_ptr);
y1 = _mm_load_si128((__m128i*)carrier_ptr);
carrier_ptr += 4;
y2 = _mm_load_si128((__m128i*)carrier_ptr);
// NOTE(review): the CM_* macros are defined outside this block. Judging by their
// names and arguments they split the interleaved samples into real/imag vectors
// and form the complex product input*carrier -- confirm against their definitions.
CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(x1, x2, realx, imagx)
CM_16IC_X2_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE4_1(y1, y2, realy, imagy)
CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
//Get very early values
// Same pattern for each of the five replicas: load 8 code elements, let the
// correlation macro fill real_output_ps / imag_output_ps (presumably the float
// products of the baseband sample and the code -- confirm), then accumulate them.
y1 = _mm_load_si128((__m128i*)VE_code_ptr);
VE_code_ptr += 4;
y2 = _mm_load_si128((__m128i*)VE_code_ptr);
CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps);
imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps);
//Get early values
y1 = _mm_load_si128((__m128i*)E_code_ptr);
E_code_ptr += 4;
y2 = _mm_load_si128((__m128i*)E_code_ptr);
CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps);
imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps);
//Get prompt values
y1 = _mm_load_si128((__m128i*)P_code_ptr);
P_code_ptr += 4;
y2 = _mm_load_si128((__m128i*)P_code_ptr);
CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps);
imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps);
//Get late values
y1 = _mm_load_si128((__m128i*)L_code_ptr);
L_code_ptr += 4;
y2 = _mm_load_si128((__m128i*)L_code_ptr);
CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps);
imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps);
//Get very late values
y1 = _mm_load_si128((__m128i*)VL_code_ptr);
VL_code_ptr += 4;
y2 = _mm_load_si128((__m128i*)VL_code_ptr);
CM_16IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y1, y2, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
real_VL_code_acc = _mm_add_ps (real_VL_code_acc, real_output_ps);
imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps);
// Second half of the pointer advance: every cursor has now moved 8 elements
input_ptr += 4;
carrier_ptr += 4;
VE_code_ptr += 4;
E_code_ptr += 4;
P_code_ptr += 4;
L_code_ptr += 4;
VL_code_ptr += 4;
}
// Aligned scratch arrays used to spill the vector accumulators to memory
__VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4];
_mm_store_ps((float*)real_VE_dotProductVector,real_VE_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)real_VL_dotProductVector,real_VL_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc); // Store the results back into the dot product vector
// Horizontal reduction: add the four lanes of every accumulator
for (int i = 0; i<4; ++i)
{
VE_out_real += real_VE_dotProductVector[i];
VE_out_imag += imag_VE_dotProductVector[i];
E_out_real += real_E_dotProductVector[i];
E_out_imag += imag_E_dotProductVector[i];
P_out_real += real_P_dotProductVector[i];
P_out_imag += imag_P_dotProductVector[i];
L_out_real += real_L_dotProductVector[i];
L_out_imag += imag_L_dotProductVector[i];
VL_out_real += real_VL_dotProductVector[i];
VL_out_imag += imag_VL_dotProductVector[i];
}
// Publish the vector-loop totals; the tail loop below adds to them
*VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag);
*E_out_ptr = lv_cmake(E_out_real, E_out_imag);
*P_out_ptr = lv_cmake(P_out_real, P_out_imag);
*L_out_ptr = lv_cmake(L_out_real, L_out_imag);
*VL_out_ptr = lv_cmake(VL_out_real, VL_out_imag);
}
// Scalar tail: the num_points % 8 elements not covered by the vector loop
lv_16sc_t bb_signal_sample;
for(int i=0; i < num_points%8; ++i)
{
//Perform the carrier wipe-off
bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
// Now get early, late, and prompt values for each
// NOTE(review): here the product is cast to lv_32fc_t directly, whereas the
// generic kernel first stores it in an lv_16sc_t temporary -- confirm both
// paths are meant to give the same result when the product exceeds 16 bits.
*VE_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VE_code_ptr++));
*E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
*P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
*L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
*VL_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VL_code_ptr++));
}
}
#endif /* LV_HAVE_SSE4_1 */
#ifdef LV_HAVE_GENERIC
/*!
    \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation
    \param VE_out Very Early correlation output (a single accumulated value)
    \param E_out Early correlation output (a single accumulated value)
    \param P_out Prompt correlation output (a single accumulated value)
    \param L_out Late correlation output (a single accumulated value)
    \param VL_out Very Late correlation output (a single accumulated value)
    \param input The input signal input
    \param carrier The carrier signal input
    \param VE_code Very Early PRN code replica input
    \param E_code Early PRN code replica input
    \param P_code Prompt PRN code replica input
    \param L_code Late PRN code replica input
    \param VL_code Very Late PRN code replica input
    \param num_points The number of complex values in vectors
    \note All five outputs are reset to zero first, so they are zero when num_points is 0.
*/
static inline void volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_a_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_16sc_t* input, const lv_16sc_t* carrier, const lv_16sc_t* VE_code, const lv_16sc_t* E_code, const lv_16sc_t* P_code, const lv_16sc_t* L_code, const lv_16sc_t* VL_code, unsigned int num_points)
{
    lv_16sc_t bb_signal_sample;
    lv_16sc_t tmp1;
    lv_16sc_t tmp2;
    lv_16sc_t tmp3;
    lv_16sc_t tmp4;
    lv_16sc_t tmp5;
    // The counter has the same unsigned type as num_points: a signed counter
    // would overflow (undefined behaviour) for num_points above INT_MAX.
    unsigned int i;

    *VE_out = 0;
    *E_out = 0;
    *P_out = 0;
    *L_out = 0;
    *VL_out = 0;

    // Perform Very Early, Early, Prompt, Late and Very Late correlation
    for(i = 0; i < num_points; ++i)
        {
            // Perform the carrier wipe-off
            bb_signal_sample = input[i] * carrier[i];

            // Each product is first stored in an lv_16sc_t temporary, exactly as
            // before, so any narrowing implied by that type is preserved.
            tmp1 = bb_signal_sample * VE_code[i];
            tmp2 = bb_signal_sample * E_code[i];
            tmp3 = bb_signal_sample * P_code[i];
            tmp4 = bb_signal_sample * L_code[i];
            tmp5 = bb_signal_sample * VL_code[i];

            // Accumulate the five correlator outputs as complex floats
            *VE_out += (lv_32fc_t)tmp1;
            *E_out += (lv_32fc_t)tmp2;
            *P_out += (lv_32fc_t)tmp3;
            *L_out += (lv_32fc_t)tmp4;
            *VL_out += (lv_32fc_t)tmp5;
        }
}
#endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_gnsssdr_volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5_a_H */

View File

@ -0,0 +1,68 @@
#ifndef INCLUDED_volk_gnsssdr_32f_accumulator_s32f_a_H
#define INCLUDED_volk_gnsssdr_32f_accumulator_s32f_a_H
#include <volk_gnsssdr/volk_gnsssdr_common.h>
#include <inttypes.h>
#include <stdio.h>
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>
/*!
\brief Accumulates the values in the input buffer
\param result The accumulated result
\param inputBuffer The buffer of data to be accumulated
\param num_points The number of values in inputBuffer to be accumulated
*/
static inline void volk_gnsssdr_32f_accumulator_s32f_a_sse(float* result, const float* inputBuffer, unsigned int num_points){
float returnValue = 0;
unsigned int number = 0;
const unsigned int quarterPoints = num_points / 4;
const float* aPtr = inputBuffer;
__VOLK_ATTR_ALIGNED(16) float tempBuffer[4];
__m128 accumulator = _mm_setzero_ps();
__m128 aVal = _mm_setzero_ps();
for(;number < quarterPoints; number++){
aVal = _mm_load_ps(aPtr);
accumulator = _mm_add_ps(accumulator, aVal);
aPtr += 4;
}
_mm_store_ps(tempBuffer,accumulator); // Store the results back into the C container
returnValue = tempBuffer[0];
returnValue += tempBuffer[1];
returnValue += tempBuffer[2];
returnValue += tempBuffer[3];
number = quarterPoints * 4;
for(;number < num_points; number++){
returnValue += (*aPtr++);
}
*result = returnValue;
}
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_GENERIC
/*!
    \brief Sums every value of the input buffer into a single float
    \param result Where the sum is written (0 when num_points is 0)
    \param inputBuffer The floats to add up
    \param num_points How many floats of inputBuffer take part in the sum
*/
static inline void volk_gnsssdr_32f_accumulator_s32f_generic(float* result, const float* inputBuffer, unsigned int num_points){
    float sum = 0;
    unsigned int idx;

    // Plain left-to-right accumulation
    for(idx = 0; idx < num_points; idx++){
        sum += inputBuffer[idx];
    }

    *result = sum;
}
#endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_volk_gnsssdr_32f_accumulator_s32f_a_H */

View File

@ -0,0 +1,149 @@
#ifndef INCLUDED_volk_gnsssdr_32f_index_max_16u_a_H
#define INCLUDED_volk_gnsssdr_32f_index_max_16u_a_H
#include <volk_gnsssdr/volk_gnsssdr_common.h>
#include <volk_gnsssdr/volk_gnsssdr_common.h>
#include <inttypes.h>
#include <stdio.h>
#ifdef LV_HAVE_SSE4_1
#include<smmintrin.h>
static inline void volk_gnsssdr_32f_index_max_16u_a_sse4_1(unsigned int* target, const float* src0, unsigned int num_points) {
if(num_points > 0){
unsigned int number = 0;
const unsigned int quarterPoints = num_points / 4;
float* inputPtr = (float*)src0;
__m128 indexIncrementValues = _mm_set1_ps(4);
__m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
float max = src0[0];
float index = 0;
__m128 maxValues = _mm_set1_ps(max);
__m128 maxValuesIndex = _mm_setzero_ps();
__m128 compareResults;
__m128 currentValues;
__VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
__VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
for(;number < quarterPoints; number++){
currentValues = _mm_load_ps(inputPtr); inputPtr += 4;
currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
compareResults = _mm_cmpgt_ps(maxValues, currentValues);
maxValuesIndex = _mm_blendv_ps(currentIndexes, maxValuesIndex, compareResults);
maxValues = _mm_blendv_ps(currentValues, maxValues, compareResults);
}
// Calculate the largest value from the remaining 4 points
_mm_store_ps(maxValuesBuffer, maxValues);
_mm_store_ps(maxIndexesBuffer, maxValuesIndex);
for(number = 0; number < 4; number++){
if(maxValuesBuffer[number] > max){
index = maxIndexesBuffer[number];
max = maxValuesBuffer[number];
}
}
number = quarterPoints * 4;
for(;number < num_points; number++){
if(src0[number] > max){
index = number;
max = src0[number];
}
}
target[0] = (unsigned int)index;
}
}
#endif /*LV_HAVE_SSE4_1*/
#ifdef LV_HAVE_SSE
#include<xmmintrin.h>
static inline void volk_gnsssdr_32f_index_max_16u_a_sse(unsigned int* target, const float* src0, unsigned int num_points) {
if(num_points > 0){
unsigned int number = 0;
const unsigned int quarterPoints = num_points / 4;
float* inputPtr = (float*)src0;
__m128 indexIncrementValues = _mm_set1_ps(4);
__m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
float max = src0[0];
float index = 0;
__m128 maxValues = _mm_set1_ps(max);
__m128 maxValuesIndex = _mm_setzero_ps();
__m128 compareResults;
__m128 currentValues;
__VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
__VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
for(;number < quarterPoints; number++){
currentValues = _mm_load_ps(inputPtr); inputPtr += 4;
currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
compareResults = _mm_cmpgt_ps(maxValues, currentValues);
maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, maxValuesIndex) , _mm_andnot_ps(compareResults, currentIndexes));
maxValues = _mm_or_ps(_mm_and_ps(compareResults, maxValues) , _mm_andnot_ps(compareResults, currentValues));
}
// Calculate the largest value from the remaining 4 points
_mm_store_ps(maxValuesBuffer, maxValues);
_mm_store_ps(maxIndexesBuffer, maxValuesIndex);
for(number = 0; number < 4; number++){
if(maxValuesBuffer[number] > max){
index = maxIndexesBuffer[number];
max = maxValuesBuffer[number];
}
}
number = quarterPoints * 4;
for(;number < num_points; number++){
if(src0[number] > max){
index = number;
max = src0[number];
}
}
target[0] = (unsigned int)index;
}
}
#endif /*LV_HAVE_SSE*/
#ifdef LV_HAVE_GENERIC
static inline void volk_gnsssdr_32f_index_max_16u_generic(unsigned int* target, const float* src0, unsigned int num_points) {
    /*
     * Writes to target[0] the index of the largest value in src0; on ties the
     * earliest index wins. Nothing is written when num_points is 0.
     */
    unsigned int best = 0;
    unsigned int pos;

    if(num_points == 0){
        return;
    }

    // src0[best] is always the largest value seen so far
    for(pos = 1; pos < num_points; ++pos) {
        if(src0[pos] > src0[best]){
            best = pos;
        }
    }

    target[0] = best;
}
#endif /*LV_HAVE_GENERIC*/
#endif /*INCLUDED_volk_gnsssdr_32f_index_max_16u_a_H*/

View File

@ -0,0 +1,302 @@
#ifndef INCLUDED_volk_gnsssdr_32f_s32f_convert_16i_u_H
#define INCLUDED_volk_gnsssdr_32f_s32f_convert_16i_u_H
#include <inttypes.h>
#include <stdio.h>
#include <math.h>
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>
/*!
\brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
\param inputVector The floating point input data buffer
\param outputVector The 16 bit output data buffer
\param scalar The value multiplied against each point in the input buffer
\param num_points The number of data values to be converted
\note Input buffer does NOT need to be properly aligned
*/
static inline void volk_gnsssdr_32f_s32f_convert_16i_u_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
unsigned int number = 0;
const unsigned int eighthPoints = num_points / 8;
const float* inputVectorPtr = (const float*)inputVector;
int16_t* outputVectorPtr = outputVector;
float min_val = -32768;
float max_val = 32767;
float r;
__m128 vScalar = _mm_set_ps1(scalar);
__m128 inputVal1, inputVal2;
__m128i intInputVal1, intInputVal2;
__m128 ret1, ret2;
__m128 vmin_val = _mm_set_ps1(min_val);
__m128 vmax_val = _mm_set_ps1(max_val);
for(;number < eighthPoints; number++){
inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
// Scale and clip
ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
intInputVal1 = _mm_cvtps_epi32(ret1);
intInputVal2 = _mm_cvtps_epi32(ret2);
intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
_mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
outputVectorPtr += 8;
}
number = eighthPoints * 8;
for(; number < num_points; number++){
r = inputVector[number] * scalar;
if(r > max_val)
r = max_val;
else if(r < min_val)
r = min_val;
outputVector[number] = (int16_t)rintf(r);
}
}
#endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>
/*!
\brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
\param inputVector The floating point input data buffer
\param outputVector The 16 bit output data buffer
\param scalar The value multiplied against each point in the input buffer
\param num_points The number of data values to be converted
\note Input buffer does NOT need to be properly aligned
*/
static inline void volk_gnsssdr_32f_s32f_convert_16i_u_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
unsigned int number = 0;
const unsigned int quarterPoints = num_points / 4;
const float* inputVectorPtr = (const float*)inputVector;
int16_t* outputVectorPtr = outputVector;
float min_val = -32768;
float max_val = 32767;
float r;
__m128 vScalar = _mm_set_ps1(scalar);
__m128 ret;
__m128 vmin_val = _mm_set_ps1(min_val);
__m128 vmax_val = _mm_set_ps1(max_val);
__VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
for(;number < quarterPoints; number++){
ret = _mm_loadu_ps(inputVectorPtr);
inputVectorPtr += 4;
// Scale and clip
ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
_mm_store_ps(outputFloatBuffer, ret);
*outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]);
*outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]);
*outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]);
*outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]);
}
number = quarterPoints * 4;
for(; number < num_points; number++){
r = inputVector[number] * scalar;
if(r > max_val)
r = max_val;
else if(r < min_val)
r = min_val;
outputVector[number] = (int16_t)rintf(r);
}
}
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_GENERIC
/*!
\brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
\param inputVector The floating point input data buffer
\param outputVector The 16 bit output data buffer
\param scalar The value multiplied against each point in the input buffer
\param num_points The number of data values to be converted
\note Input buffer does NOT need to be properly aligned
*/
static inline void volk_gnsssdr_32f_s32f_convert_16i_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
int16_t* outputVectorPtr = outputVector;
const float* inputVectorPtr = inputVector;
unsigned int number = 0;
float min_val = -32768;
float max_val = 32767;
float r;
for(number = 0; number < num_points; number++){
r = *inputVectorPtr++ * scalar;
if(r > max_val)
r = max_val;
else if(r < min_val)
r = min_val;
*outputVectorPtr++ = (int16_t)rintf(r);
}
}
#endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_volk_gnsssdr_32f_s32f_convert_16i_u_H */
#ifndef INCLUDED_volk_gnsssdr_32f_s32f_convert_16i_a_H
#define INCLUDED_volk_gnsssdr_32f_s32f_convert_16i_a_H
#include <volk/volk_common.h>
#include <inttypes.h>
#include <stdio.h>
#include <math.h>
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>
/*!
\brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
\param inputVector The floating point input data buffer
\param outputVector The 16 bit output data buffer
\param scalar The value multiplied against each point in the input buffer
\param num_points The number of data values to be converted
*/
static inline void volk_gnsssdr_32f_s32f_convert_16i_a_sse2(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
unsigned int number = 0;
const unsigned int eighthPoints = num_points / 8;
const float* inputVectorPtr = (const float*)inputVector;
int16_t* outputVectorPtr = outputVector;
float min_val = -32768;
float max_val = 32767;
float r;
__m128 vScalar = _mm_set_ps1(scalar);
__m128 inputVal1, inputVal2;
__m128i intInputVal1, intInputVal2;
__m128 ret1, ret2;
__m128 vmin_val = _mm_set_ps1(min_val);
__m128 vmax_val = _mm_set_ps1(max_val);
for(;number < eighthPoints; number++){
inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
// Scale and clip
ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
intInputVal1 = _mm_cvtps_epi32(ret1);
intInputVal2 = _mm_cvtps_epi32(ret2);
intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
_mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
outputVectorPtr += 8;
}
number = eighthPoints * 8;
for(; number < num_points; number++){
r = inputVector[number] * scalar;
if(r > max_val)
r = max_val;
else if(r < min_val)
r = min_val;
outputVector[number] = (int16_t)rintf(r);
}
}
#endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>
/*!
\brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
\param inputVector The floating point input data buffer
\param outputVector The 16 bit output data buffer
\param scalar The value multiplied against each point in the input buffer
\param num_points The number of data values to be converted
*/
static inline void volk_gnsssdr_32f_s32f_convert_16i_a_sse(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
unsigned int number = 0;
const unsigned int quarterPoints = num_points / 4;
const float* inputVectorPtr = (const float*)inputVector;
int16_t* outputVectorPtr = outputVector;
float min_val = -32768;
float max_val = 32767;
float r;
__m128 vScalar = _mm_set_ps1(scalar);
__m128 ret;
__m128 vmin_val = _mm_set_ps1(min_val);
__m128 vmax_val = _mm_set_ps1(max_val);
__VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
for(;number < quarterPoints; number++){
ret = _mm_load_ps(inputVectorPtr);
inputVectorPtr += 4;
// Scale and clip
ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
_mm_store_ps(outputFloatBuffer, ret);
*outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]);
*outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]);
*outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]);
*outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]);
}
number = quarterPoints * 4;
for(; number < num_points; number++){
r = inputVector[number] * scalar;
if(r > max_val)
r = max_val;
else if(r < min_val)
r = min_val;
outputVector[number] = (int16_t)rintf(r);
}
}
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_GENERIC
/*!
\brief Multiplies each point in the input buffer by the scalar value, then converts the result into a 16 bit integer value
\param inputVector The floating point input data buffer
\param outputVector The 16 bit output data buffer
\param scalar The value multiplied against each point in the input buffer
\param num_points The number of data values to be converted
*/
static inline void volk_gnsssdr_32f_s32f_convert_16i_a_generic(int16_t* outputVector, const float* inputVector, const float scalar, unsigned int num_points){
int16_t* outputVectorPtr = outputVector;
const float* inputVectorPtr = inputVector;
unsigned int number = 0;
float min_val = -32768;
float max_val = 32767;
float r;
for(number = 0; number < num_points; number++){
r = *inputVectorPtr++ * scalar;
if(r < min_val)
r = min_val;
else if(r > max_val)
r = max_val;
*outputVectorPtr++ = (int16_t)rintf(r);
}
}
#endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_volk_gnsssdr_32f_s32f_convert_16i_a_H */

View File

@ -0,0 +1,147 @@
#ifndef INCLUDED_volk_gnsssdr_32f_x2_add_32f_u_H
#define INCLUDED_volk_gnsssdr_32f_x2_add_32f_u_H
#include <inttypes.h>
#include <stdio.h>
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>
/*!
    \brief Element-wise sum of two float vectors (no alignment required)
    \param cVector Receives aVector[i] + bVector[i] for every i
    \param aVector First operand
    \param bVector Second operand
    \param num_points Number of elements read from aVector and bVector and written to cVector
*/
static inline void volk_gnsssdr_32f_x2_add_32f_u_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
    const unsigned int vectorizedPoints = (num_points / 4) * 4;
    unsigned int idx;
    __m128 lhs, rhs;

    // Four sums per pass using unaligned loads and stores
    for(idx = 0; idx < vectorizedPoints; idx += 4){
        lhs = _mm_loadu_ps(aVector + idx);
        rhs = _mm_loadu_ps(bVector + idx);
        _mm_storeu_ps(cVector + idx, _mm_add_ps(lhs, rhs));
    }

    // Leftover elements (fewer than four)
    for(; idx < num_points; idx++){
        cVector[idx] = aVector[idx] + bVector[idx];
    }
}
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_GENERIC
/*!
    \brief Element-wise sum of two float vectors
    \param cVector Receives aVector[i] + bVector[i] for every i
    \param aVector First operand
    \param bVector Second operand
    \param num_points Number of elements read from aVector and bVector and written to cVector
*/
static inline void volk_gnsssdr_32f_x2_add_32f_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
    unsigned int idx;

    for(idx = 0; idx < num_points; idx++){
        cVector[idx] = aVector[idx] + bVector[idx];
    }
}
#endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_volk_gnsssdr_32f_x2_add_32f_u_H */
#ifndef INCLUDED_volk_gnsssdr_32f_x2_add_32f_a_H
#define INCLUDED_volk_gnsssdr_32f_x2_add_32f_a_H
#include <inttypes.h>
#include <stdio.h>
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>
/*!
    \brief Element-wise sum of two float vectors (aligned buffers)
    \param cVector Receives aVector[i] + bVector[i] for every i; written with aligned SSE stores
    \param aVector First operand; read with aligned SSE loads
    \param bVector Second operand; read with aligned SSE loads
    \param num_points Number of elements read from aVector and bVector and written to cVector
*/
static inline void volk_gnsssdr_32f_x2_add_32f_a_sse(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
    const unsigned int vectorizedPoints = (num_points / 4) * 4;
    unsigned int idx;
    __m128 lhs, rhs;

    // Four sums per pass using aligned loads and stores
    for(idx = 0; idx < vectorizedPoints; idx += 4){
        lhs = _mm_load_ps(aVector + idx);
        rhs = _mm_load_ps(bVector + idx);
        _mm_store_ps(cVector + idx, _mm_add_ps(lhs, rhs));
    }

    // Leftover elements (fewer than four)
    for(; idx < num_points; idx++){
        cVector[idx] = aVector[idx] + bVector[idx];
    }
}
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_GENERIC
/*!
    \brief Element-wise sum of two float vectors
    \param cVector Receives aVector[i] + bVector[i] for every i
    \param aVector First operand
    \param bVector Second operand
    \param num_points Number of elements read from aVector and bVector and written to cVector
*/
static inline void volk_gnsssdr_32f_x2_add_32f_a_generic(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
    const float* lhs = aVector;
    const float* rhs = bVector;
    float* dst = cVector;
    unsigned int remaining = num_points;

    // Walk the three buffers in lock step until every element is written
    while(remaining != 0){
        *dst = *lhs + *rhs;
        ++dst;
        ++lhs;
        ++rhs;
        --remaining;
    }
}
#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_ORC
/*!
\brief Adds the two input vectors and store their results in the third vector
\param cVector The vector where the results will be stored
\param aVector One of the vectors to be added
\param bVector One of the vectors to be added
\param num_points The number of values in aVector and bVector to be added together and stored into cVector
*/
extern void volk_gnsssdr_32f_x2_add_32f_a_orc_impl(float* cVector, const float* aVector, const float* bVector, unsigned int num_points);
static inline void volk_gnsssdr_32f_x2_add_32f_u_orc(float* cVector, const float* aVector, const float* bVector, unsigned int num_points){
volk_gnsssdr_32f_x2_add_32f_a_orc_impl(cVector, aVector, bVector, num_points);
}
#endif /* LV_HAVE_ORC */
#endif /* INCLUDED_volk_gnsssdr_32f_x2_add_32f_a_H */

View File

@ -0,0 +1,127 @@
#ifndef INCLUDED_volk_gnsssdr_32fc_conjugate_32fc_u_H
#define INCLUDED_volk_gnsssdr_32fc_conjugate_32fc_u_H
#include <inttypes.h>
#include <stdio.h>
#include <volk_gnsssdr/volk_gnsssdr_complex.h>
#include <float.h>
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
/*!
    \brief Writes the complex conjugate of every element of a vector (no alignment required)
    \param cVector Receives the conjugated values
    \param aVector The complex values to conjugate
    \param num_points Number of complex values read from aVector and written to cVector
*/
static inline void volk_gnsssdr_32fc_conjugate_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){
    const unsigned int pairCount = num_points / 2;
    // Sign-bit mask on the second and fourth float of each register
    const __m128 signFlip = _mm_setr_ps(0, -0.f, 0, -0.f);
    const lv_32fc_t* src = aVector;
    lv_32fc_t* dst = cVector;
    unsigned int pair;

    // Two complex values (four floats) per pass: XOR flips the masked sign bits
    for(pair = 0; pair < pairCount; pair++){
        _mm_storeu_ps((float*)dst, _mm_xor_ps(_mm_loadu_ps((float*)src), signFlip));
        src += 2;
        dst += 2;
    }

    // Odd element left over after the pairs
    if(num_points & 1u) {
        *dst = lv_conj(*src);
    }
}
#endif /* LV_HAVE_SSE3 */
#ifdef LV_HAVE_GENERIC
/*!
    \brief Writes the complex conjugate of every element of a vector
    \param cVector Receives the conjugated values
    \param aVector The complex values to conjugate
    \param num_points Number of complex values read from aVector and written to cVector
*/
static inline void volk_gnsssdr_32fc_conjugate_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){
    unsigned int idx;

    for(idx = 0; idx < num_points; idx++){
        cVector[idx] = lv_conj(aVector[idx]);
    }
}
#endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_volk_gnsssdr_32fc_conjugate_32fc_u_H */
#ifndef INCLUDED_volk_gnsssdr_32fc_conjugate_32fc_a_H
#define INCLUDED_volk_gnsssdr_32fc_conjugate_32fc_a_H
#include <inttypes.h>
#include <stdio.h>
#include <volk_gnsssdr/volk_gnsssdr_complex.h>
#include <float.h>
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
/*!
    \brief Writes the complex conjugate of every element of a vector (aligned buffers)
    \param cVector Receives the conjugated values; written with aligned SSE stores
    \param aVector The complex values to conjugate; read with aligned SSE loads
    \param num_points Number of complex values read from aVector and written to cVector
*/
static inline void volk_gnsssdr_32fc_conjugate_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){
    const unsigned int pairCount = num_points / 2;
    // Sign-bit mask on the second and fourth float of each register
    const __m128 signFlip = _mm_setr_ps(0, -0.f, 0, -0.f);
    const lv_32fc_t* src = aVector;
    lv_32fc_t* dst = cVector;
    unsigned int pair;

    // Two complex values (four floats) per pass: XOR flips the masked sign bits
    for(pair = 0; pair < pairCount; pair++){
        _mm_store_ps((float*)dst, _mm_xor_ps(_mm_load_ps((float*)src), signFlip));
        src += 2;
        dst += 2;
    }

    // Odd element left over after the pairs
    if(num_points & 1u) {
        *dst = lv_conj(*src);
    }
}
#endif /* LV_HAVE_SSE3 */
#ifdef LV_HAVE_GENERIC
/*!
    \brief Writes the complex conjugate of every element of a vector
    \param cVector Receives the conjugated values
    \param aVector The complex values to conjugate
    \param num_points Number of complex values read from aVector and written to cVector
*/
static inline void volk_gnsssdr_32fc_conjugate_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, unsigned int num_points){
    const lv_32fc_t* src = aVector;
    lv_32fc_t* dst = cVector;
    unsigned int remaining = num_points;

    // Walk both buffers in lock step until every element is written
    while(remaining != 0){
        *dst = lv_conj(*src);
        ++dst;
        ++src;
        --remaining;
    }
}
#endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_volk_gnsssdr_32fc_conjugate_32fc_a_H */

View File

@ -0,0 +1,295 @@
/*!
* \file volk_gnsssdr_32fc_convert_16ic.h
* \brief Volk protokernel: converts float32 complex values to 16 integer complex values taking care of overflow
* \authors <ul>
* <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
* </ul>
*
* -------------------------------------------------------------------------
*
* Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
*
* GNSS-SDR is a software defined Global Navigation
* Satellite Systems receiver
*
* This file is part of GNSS-SDR.
*
* GNSS-SDR is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* GNSS-SDR is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
*
* -------------------------------------------------------------------------
*/
#ifndef INCLUDED_volk_gnsssdr_32fc_convert_16ic_u_H
#define INCLUDED_volk_gnsssdr_32fc_convert_16ic_u_H
#include <inttypes.h>
#include <stdio.h>
#include <math.h>
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>
/*!
  \brief Converts a vector of complex floats (32 bits per component) into a vector of complex 16-bit integers, clipping out-of-range values (SSE2, unaligned loads/stores)
  \param outputVector The 16 bit output data buffer
  \param inputVector The floating point input data buffer (read only, never modified)
  \param num_points The number of complex values to be converted
*/
static inline void volk_gnsssdr_32fc_convert_16ic_u_sse2(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){
    // Each SIMD iteration consumes 8 floats, i.e. 4 complex values
    const unsigned int sse_iters = num_points/4;

    const float* inputVectorPtr = (const float*)inputVector;
    int16_t* outputVectorPtr = (int16_t*)outputVector;

    const float min_val = -32768;
    const float max_val = 32767;

    __m128 inputVal1, inputVal2;
    __m128i intInputVal1, intInputVal2;
    __m128 ret1, ret2;
    const __m128 vmin_val = _mm_set_ps1(min_val);
    const __m128 vmax_val = _mm_set_ps1(max_val);

    for(unsigned int i = 0; i < sse_iters; i++){
        inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
        inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;

        // Clip
        ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
        ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);

        intInputVal1 = _mm_cvtps_epi32(ret1);
        intInputVal2 = _mm_cvtps_epi32(ret2);

        // Narrow the eight 32-bit integers into eight 16-bit integers
        intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);

        _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
        outputVectorPtr += 8;
    }

    // Scalar tail: the remaining (num_points % 4) complex values, two components each.
    // The clipped value lives in a local so the caller's const input is left untouched.
    for(unsigned int i = 0; i < (num_points%4)*2; i++){
        float clipped = inputVectorPtr[i];
        if(clipped > max_val)
            clipped = max_val;
        else if(clipped < min_val)
            clipped = min_val;
        outputVectorPtr[i] = (int16_t)rintf(clipped);
    }
}
#endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>
/*!
  \brief Converts a vector of complex floats (32 bits per component) into a vector of complex 16-bit integers, clipping out-of-range values (unaligned loads/stores)
  \param outputVector The 16 bit output data buffer
  \param inputVector The floating point input data buffer (read only, never modified)
  \param num_points The number of complex values to be converted
*/
static inline void volk_gnsssdr_32fc_convert_16ic_u_sse(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){
    // NOTE(review): this body uses __m128i, _mm_cvtps_epi32, _mm_packs_epi32 and
    // _mm_storeu_si128, which are SSE2 intrinsics, although the kernel is guarded by
    // LV_HAVE_SSE and only <xmmintrin.h> is included for it - confirm it is never
    // built on a target without SSE2.

    // Each SIMD iteration consumes 8 floats, i.e. 4 complex values
    const unsigned int sse_iters = num_points/4;

    const float* inputVectorPtr = (const float*)inputVector;
    int16_t* outputVectorPtr = (int16_t*)outputVector;

    const float min_val = -32768;
    const float max_val = 32767;

    __m128 inputVal1, inputVal2;
    __m128i intInputVal1, intInputVal2;
    __m128 ret1, ret2;
    const __m128 vmin_val = _mm_set_ps1(min_val);
    const __m128 vmax_val = _mm_set_ps1(max_val);

    for(unsigned int i = 0; i < sse_iters; i++){
        inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
        inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;

        // Clip
        ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
        ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);

        intInputVal1 = _mm_cvtps_epi32(ret1);
        intInputVal2 = _mm_cvtps_epi32(ret2);

        // Narrow the eight 32-bit integers into eight 16-bit integers
        intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);

        _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
        outputVectorPtr += 8;
    }

    // Scalar tail: the remaining (num_points % 4) complex values, two components each.
    // The clipped value lives in a local so the caller's const input is left untouched.
    for(unsigned int i = 0; i < (num_points%4)*2; i++){
        float clipped = inputVectorPtr[i];
        if(clipped > max_val)
            clipped = max_val;
        else if(clipped < min_val)
            clipped = min_val;
        outputVectorPtr[i] = (int16_t)rintf(clipped);
    }
}
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_GENERIC
/*!
  \brief Converts a vector of complex floats (32 bits per component) into a vector of complex 16-bit integers, clipping out-of-range values (portable version)
  \param outputVector The 16 bit output data buffer
  \param inputVector The floating point input data buffer (read only, never modified)
  \param num_points The number of complex values to be converted
*/
static inline void volk_gnsssdr_32fc_convert_16ic_generic(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){
    const float* inputVectorPtr = (const float*)inputVector;
    int16_t* outputVectorPtr = (int16_t*)outputVector;
    const float min_val = -32768;
    const float max_val = 32767;

    // Real and imaginary parts are handled as a flat run of 2*num_points floats.
    // The clipped value lives in a local so the caller's const input is left untouched.
    for(unsigned int i = 0; i < num_points*2; i++){
        float clipped = inputVectorPtr[i];
        if(clipped > max_val)
            clipped = max_val;
        else if(clipped < min_val)
            clipped = min_val;
        outputVectorPtr[i] = (int16_t)rintf(clipped);
    }
}
#endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_volk_gnsssdr_32fc_convert_16ic_u_H */
#ifndef INCLUDED_volk_gnsssdr_32fc_convert_16ic_a_H
#define INCLUDED_volk_gnsssdr_32fc_convert_16ic_a_H
#include <volk/volk_common.h>
#include <inttypes.h>
#include <stdio.h>
#include <math.h>
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>
/*!
  \brief Converts a vector of complex floats (32 bits per component) into a vector of complex 16-bit integers, clipping out-of-range values (SSE2, aligned loads/stores)
  \param outputVector The 16 bit output data buffer
  \param inputVector The floating point input data buffer (read only, never modified)
  \param num_points The number of complex values to be converted
*/
static inline void volk_gnsssdr_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){
    // Each SIMD iteration consumes 8 floats, i.e. 4 complex values
    const unsigned int sse_iters = num_points/4;

    const float* inputVectorPtr = (const float*)inputVector;
    int16_t* outputVectorPtr = (int16_t*)outputVector;

    const float min_val = -32768;
    const float max_val = 32767;

    __m128 inputVal1, inputVal2;
    __m128i intInputVal1, intInputVal2;
    __m128 ret1, ret2;
    const __m128 vmin_val = _mm_set_ps1(min_val);
    const __m128 vmax_val = _mm_set_ps1(max_val);

    for(unsigned int i = 0; i < sse_iters; i++){
        inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
        inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;

        // Clip
        ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
        ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);

        intInputVal1 = _mm_cvtps_epi32(ret1);
        intInputVal2 = _mm_cvtps_epi32(ret2);

        // Narrow the eight 32-bit integers into eight 16-bit integers
        intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);

        _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
        outputVectorPtr += 8;
    }

    // Scalar tail: the remaining (num_points % 4) complex values, two components each.
    // The clipped value lives in a local so the caller's const input is left untouched.
    for(unsigned int i = 0; i < (num_points%4)*2; i++){
        float clipped = inputVectorPtr[i];
        if(clipped > max_val)
            clipped = max_val;
        else if(clipped < min_val)
            clipped = min_val;
        outputVectorPtr[i] = (int16_t)rintf(clipped);
    }
}
#endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>
/*!
  \brief Converts a vector of complex floats (32 bits per component) into a vector of complex 16-bit integers, clipping out-of-range values (aligned loads/stores)
  \param outputVector The 16 bit output data buffer
  \param inputVector The floating point input data buffer (read only, never modified)
  \param num_points The number of complex values to be converted
*/
static inline void volk_gnsssdr_32fc_convert_16ic_a_sse(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){
    // NOTE(review): this body uses __m128i, _mm_cvtps_epi32, _mm_packs_epi32 and
    // _mm_store_si128, which are SSE2 intrinsics, although the kernel is guarded by
    // LV_HAVE_SSE and only <xmmintrin.h> is included for it - confirm it is never
    // built on a target without SSE2.

    // Each SIMD iteration consumes 8 floats, i.e. 4 complex values
    const unsigned int sse_iters = num_points/4;

    const float* inputVectorPtr = (const float*)inputVector;
    int16_t* outputVectorPtr = (int16_t*)outputVector;

    const float min_val = -32768;
    const float max_val = 32767;

    __m128 inputVal1, inputVal2;
    __m128i intInputVal1, intInputVal2;
    __m128 ret1, ret2;
    const __m128 vmin_val = _mm_set_ps1(min_val);
    const __m128 vmax_val = _mm_set_ps1(max_val);

    for(unsigned int i = 0; i < sse_iters; i++){
        inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
        inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;

        // Clip
        ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
        ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);

        intInputVal1 = _mm_cvtps_epi32(ret1);
        intInputVal2 = _mm_cvtps_epi32(ret2);

        // Narrow the eight 32-bit integers into eight 16-bit integers
        intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);

        _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
        outputVectorPtr += 8;
    }

    // Scalar tail: the remaining (num_points % 4) complex values, two components each.
    // The clipped value lives in a local so the caller's const input is left untouched.
    for(unsigned int i = 0; i < (num_points%4)*2; i++){
        float clipped = inputVectorPtr[i];
        if(clipped > max_val)
            clipped = max_val;
        else if(clipped < min_val)
            clipped = min_val;
        outputVectorPtr[i] = (int16_t)rintf(clipped);
    }
}
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_GENERIC
/*!
  \brief Converts a vector of complex floats (32 bits per component) into a vector of complex 16-bit integers, clipping out-of-range values (portable version)
  \param outputVector The 16 bit output data buffer
  \param inputVector The floating point input data buffer (read only, never modified)
  \param num_points The number of complex values to be converted
*/
static inline void volk_gnsssdr_32fc_convert_16ic_a_generic(lv_16sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){
    const float* inputVectorPtr = (const float*)inputVector;
    int16_t* outputVectorPtr = (int16_t*)outputVector;
    const float min_val = -32768;
    const float max_val = 32767;

    // Real and imaginary parts are handled as a flat run of 2*num_points floats.
    // The clipped value lives in a local so the caller's const input is left untouched.
    for(unsigned int i = 0; i < num_points*2; i++){
        float clipped = inputVectorPtr[i];
        if(clipped > max_val)
            clipped = max_val;
        else if(clipped < min_val)
            clipped = min_val;
        outputVectorPtr[i] = (int16_t)rintf(clipped);
    }
}
#endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_volk_gnsssdr_32fc_convert_16ic_a_H */

View File

@ -0,0 +1,213 @@
/*!
* \file volk_gnsssdr_32fc_convert_8ic.h
* \brief Volk protokernel: converts float32 complex values to 8 integer complex values taking care of overflow
* \authors <ul>
* <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
* </ul>
*
* -------------------------------------------------------------------------
*
* Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
*
* GNSS-SDR is a software defined Global Navigation
* Satellite Systems receiver
*
* This file is part of GNSS-SDR.
*
* GNSS-SDR is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* GNSS-SDR is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
*
* -------------------------------------------------------------------------
*/
#ifndef INCLUDED_volk_gnsssdr_32fc_convert_8ic_u_H
#define INCLUDED_volk_gnsssdr_32fc_convert_8ic_u_H
#include <inttypes.h>
#include <stdio.h>
#include <math.h>
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>
/*!
  \brief Converts a vector of complex floats (32 bits per component) into a vector of complex 8-bit integers, clipping out-of-range values (SSE2, unaligned loads/stores)
  \param outputVector The 8 bit output data buffer
  \param inputVector The floating point input data buffer (read only, never modified)
  \param num_points The number of complex values to be converted
*/
static inline void volk_gnsssdr_32fc_convert_8ic_u_sse2(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){
    // Each SIMD iteration consumes 16 floats, i.e. 8 complex values
    const unsigned int sse_iters = num_points/8;

    const float* inputVectorPtr = (const float*)inputVector;
    int8_t* outputVectorPtr = (int8_t*)outputVector;

    const float min_val = -128;
    const float max_val = 127;

    __m128 inputVal1, inputVal2, inputVal3, inputVal4;
    __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
    __m128i int8InputVal;
    __m128 ret1, ret2, ret3, ret4;
    const __m128 vmin_val = _mm_set_ps1(min_val);
    const __m128 vmax_val = _mm_set_ps1(max_val);

    for(unsigned int i = 0; i < sse_iters; i++){
        inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
        inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
        inputVal3 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
        inputVal4 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;

        // Clip
        ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
        ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
        ret3 = _mm_max_ps(_mm_min_ps(inputVal3, vmax_val), vmin_val);
        ret4 = _mm_max_ps(_mm_min_ps(inputVal4, vmax_val), vmin_val);

        intInputVal1 = _mm_cvtps_epi32(ret1);
        intInputVal2 = _mm_cvtps_epi32(ret2);
        intInputVal3 = _mm_cvtps_epi32(ret3);
        intInputVal4 = _mm_cvtps_epi32(ret4);

        // Narrow 32 -> 16 -> 8 bits, ending with sixteen 8-bit integers in one register
        intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
        intInputVal2 = _mm_packs_epi32(intInputVal3, intInputVal4);
        int8InputVal = _mm_packs_epi16(intInputVal1, intInputVal2);

        _mm_storeu_si128((__m128i*)outputVectorPtr, int8InputVal);
        outputVectorPtr += 16;
    }

    // Scalar tail: the SIMD loop handled 8 complex values per iteration, so
    // (num_points % 8) complex values remain, two components each.
    // The clipped value lives in a local so the caller's const input is left untouched.
    for(unsigned int i = 0; i < (num_points%8)*2; i++){
        float clipped = inputVectorPtr[i];
        if(clipped > max_val)
            clipped = max_val;
        else if(clipped < min_val)
            clipped = min_val;
        outputVectorPtr[i] = (int8_t)rintf(clipped);
    }
}
#endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_GENERIC
/*!
  \brief Converts a vector of complex floats (32 bits per component) into a vector of complex 8-bit integers, clipping out-of-range values (portable version)
  \param outputVector The 8 bit output data buffer
  \param inputVector The floating point input data buffer (read only, never modified)
  \param num_points The number of complex values to be converted
*/
static inline void volk_gnsssdr_32fc_convert_8ic_generic(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){
    const float* inputVectorPtr = (const float*)inputVector;
    int8_t* outputVectorPtr = (int8_t*)outputVector;
    const float min_val = -128;
    const float max_val = 127;

    // Real and imaginary parts are handled as a flat run of 2*num_points floats.
    // The clipped value lives in a local so the caller's const input is left untouched.
    for(unsigned int i = 0; i < num_points*2; i++){
        float clipped = inputVectorPtr[i];
        if(clipped > max_val)
            clipped = max_val;
        else if(clipped < min_val)
            clipped = min_val;
        outputVectorPtr[i] = (int8_t)rintf(clipped);
    }
}
#endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_volk_gnsssdr_32fc_convert_8ic_u_H */
#ifndef INCLUDED_volk_gnsssdr_32fc_convert_8ic_a_H
#define INCLUDED_volk_gnsssdr_32fc_convert_8ic_a_H
#include <volk/volk_common.h>
#include <inttypes.h>
#include <stdio.h>
#include <math.h>
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>
/*!
  \brief Converts a vector of complex floats (32 bits per component) into a vector of complex 8-bit integers, clipping out-of-range values (SSE2, aligned loads/stores)
  \param outputVector The 8 bit output data buffer
  \param inputVector The floating point input data buffer (read only, never modified)
  \param num_points The number of complex values to be converted
*/
static inline void volk_gnsssdr_32fc_convert_8ic_a_sse2(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){
    // Each SIMD iteration consumes 16 floats, i.e. 8 complex values
    const unsigned int sse_iters = num_points/8;

    const float* inputVectorPtr = (const float*)inputVector;
    int8_t* outputVectorPtr = (int8_t*)outputVector;

    const float min_val = -128;
    const float max_val = 127;

    __m128 inputVal1, inputVal2, inputVal3, inputVal4;
    __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
    __m128i int8InputVal;
    __m128 ret1, ret2, ret3, ret4;
    const __m128 vmin_val = _mm_set_ps1(min_val);
    const __m128 vmax_val = _mm_set_ps1(max_val);

    for(unsigned int i = 0; i < sse_iters; i++){
        inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
        inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
        inputVal3 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
        inputVal4 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;

        // Clip
        ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
        ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
        ret3 = _mm_max_ps(_mm_min_ps(inputVal3, vmax_val), vmin_val);
        ret4 = _mm_max_ps(_mm_min_ps(inputVal4, vmax_val), vmin_val);

        intInputVal1 = _mm_cvtps_epi32(ret1);
        intInputVal2 = _mm_cvtps_epi32(ret2);
        intInputVal3 = _mm_cvtps_epi32(ret3);
        intInputVal4 = _mm_cvtps_epi32(ret4);

        // Narrow 32 -> 16 -> 8 bits, ending with sixteen 8-bit integers in one register
        intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
        intInputVal2 = _mm_packs_epi32(intInputVal3, intInputVal4);
        int8InputVal = _mm_packs_epi16(intInputVal1, intInputVal2);

        _mm_store_si128((__m128i*)outputVectorPtr, int8InputVal);
        outputVectorPtr += 16;
    }

    // Scalar tail: the SIMD loop handled 8 complex values per iteration, so
    // (num_points % 8) complex values remain, two components each.
    // The clipped value lives in a local so the caller's const input is left untouched.
    for(unsigned int i = 0; i < (num_points%8)*2; i++){
        float clipped = inputVectorPtr[i];
        if(clipped > max_val)
            clipped = max_val;
        else if(clipped < min_val)
            clipped = min_val;
        outputVectorPtr[i] = (int8_t)rintf(clipped);
    }
}
#endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_GENERIC
/*!
  \brief Converts a vector of complex floats (32 bits per component) into a vector of complex 8-bit integers, clipping out-of-range values (portable version)
  \param outputVector The 8 bit output data buffer
  \param inputVector The floating point input data buffer (read only, never modified)
  \param num_points The number of complex values to be converted
*/
static inline void volk_gnsssdr_32fc_convert_8ic_a_generic(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, unsigned int num_points){
    const float* inputVectorPtr = (const float*)inputVector;
    int8_t* outputVectorPtr = (int8_t*)outputVector;
    const float min_val = -128;
    const float max_val = 127;

    // Real and imaginary parts are handled as a flat run of 2*num_points floats.
    // The clipped value lives in a local so the caller's const input is left untouched.
    for(unsigned int i = 0; i < num_points*2; i++){
        float clipped = inputVectorPtr[i];
        if(clipped > max_val)
            clipped = max_val;
        else if(clipped < min_val)
            clipped = min_val;
        outputVectorPtr[i] = (int8_t)rintf(clipped);
    }
}
#endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_volk_gnsssdr_32fc_convert_8ic_a_H */

View File

@ -0,0 +1,228 @@
#ifndef INCLUDED_volk_gnsssdr_32fc_magnitude_squared_32f_u_H
#define INCLUDED_volk_gnsssdr_32fc_magnitude_squared_32f_u_H
#include <inttypes.h>
#include <stdio.h>
#include <math.h>
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
/*!
  \brief Computes the squared magnitude (re*re + im*im) of each complex input value (SSE3, unaligned loads/stores)
  \param magnitudeVector Destination buffer receiving one real value per complex input
  \param complexVector Source buffer holding the complex input values
  \param num_points Number of complex values read from complexVector and results written to magnitudeVector
*/
static inline void volk_gnsssdr_32fc_magnitude_squared_32f_u_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
    const unsigned int simdBlocks = num_points / 4;
    const float* in = (float*)complexVector;
    float* out = magnitudeVector;
    unsigned int n;

    // Four complex values (eight floats) per iteration
    for(n = 0; n < simdBlocks; n++){
        __m128 firstPair = _mm_loadu_ps(in);
        __m128 secondPair = _mm_loadu_ps(in + 4);
        in += 8;

        firstPair = _mm_mul_ps(firstPair, firstPair);    // square every component
        secondPair = _mm_mul_ps(secondPair, secondPair);

        // Horizontal add sums each adjacent re^2, im^2 pair
        _mm_storeu_ps(out, _mm_hadd_ps(firstPair, secondPair));
        out += 4;
    }

    // Leftover values that did not fill a whole SIMD block
    for(n = simdBlocks * 4; n < num_points; n++){
        const float re = in[0];
        const float im = in[1];
        in += 2;
        *out++ = (re * re) + (im * im);
    }
}
#endif /* LV_HAVE_SSE3 */
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>
/*!
  \brief Computes the squared magnitude (re*re + im*im) of each complex input value (SSE, unaligned loads/stores)
  \param magnitudeVector Destination buffer receiving one real value per complex input
  \param complexVector Source buffer holding the complex input values
  \param num_points Number of complex values read from complexVector and results written to magnitudeVector
*/
static inline void volk_gnsssdr_32fc_magnitude_squared_32f_u_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
    const unsigned int simdBlocks = num_points / 4;
    const float* in = (float*)complexVector;
    float* out = magnitudeVector;
    unsigned int n;

    // Four complex values (eight floats) per iteration
    for(n = 0; n < simdBlocks; n++){
        const __m128 firstPair = _mm_loadu_ps(in);
        const __m128 secondPair = _mm_loadu_ps(in + 4);
        in += 8;

        // De-interleave: even lanes are the real parts, odd lanes the imaginary parts
        __m128 reals = _mm_shuffle_ps(firstPair, secondPair, _MM_SHUFFLE(2,0,2,0));
        __m128 imags = _mm_shuffle_ps(firstPair, secondPair, _MM_SHUFFLE(3,1,3,1));

        reals = _mm_mul_ps(reals, reals);
        imags = _mm_mul_ps(imags, imags);

        _mm_storeu_ps(out, _mm_add_ps(reals, imags));
        out += 4;
    }

    // Leftover values that did not fill a whole SIMD block
    for(n = simdBlocks * 4; n < num_points; n++){
        const float re = in[0];
        const float im = in[1];
        in += 2;
        *out++ = (re * re) + (im * im);
    }
}
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_GENERIC
/*!
  \brief Computes the squared magnitude (re*re + im*im) of each complex input value (portable version)
  \param magnitudeVector Destination buffer receiving one real value per complex input
  \param complexVector Source buffer holding the complex input values
  \param num_points Number of complex values read from complexVector and results written to magnitudeVector
*/
static inline void volk_gnsssdr_32fc_magnitude_squared_32f_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
    // The complex buffer is walked as interleaved re,im floats
    const float* in = (float*)complexVector;
    float* out = magnitudeVector;
    unsigned int remaining;

    for(remaining = num_points; remaining > 0; remaining--){
        const float re = in[0];
        const float im = in[1];
        in += 2;
        *out++ = (re*re) + (im*im);
    }
}
#endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_volk_gnsssdr_32fc_magnitude_squared_32f_u_H */
#ifndef INCLUDED_volk_gnsssdr_32fc_magnitude_squared_32f_a_H
#define INCLUDED_volk_gnsssdr_32fc_magnitude_squared_32f_a_H
#include <inttypes.h>
#include <stdio.h>
#include <math.h>
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
/*!
  \brief Computes the squared magnitude (re*re + im*im) of each complex input value (SSE3, aligned loads/stores)
  \param magnitudeVector Destination buffer receiving one real value per complex input
  \param complexVector Source buffer holding the complex input values
  \param num_points Number of complex values read from complexVector and results written to magnitudeVector
*/
static inline void volk_gnsssdr_32fc_magnitude_squared_32f_a_sse3(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
    const unsigned int simdBlocks = num_points / 4;
    const float* in = (float*)complexVector;
    float* out = magnitudeVector;
    unsigned int n;

    // Four complex values (eight floats) per iteration
    for(n = 0; n < simdBlocks; n++){
        __m128 firstPair = _mm_load_ps(in);
        __m128 secondPair = _mm_load_ps(in + 4);
        in += 8;

        firstPair = _mm_mul_ps(firstPair, firstPair);    // square every component
        secondPair = _mm_mul_ps(secondPair, secondPair);

        // Horizontal add sums each adjacent re^2, im^2 pair
        _mm_store_ps(out, _mm_hadd_ps(firstPair, secondPair));
        out += 4;
    }

    // Leftover values that did not fill a whole SIMD block
    for(n = simdBlocks * 4; n < num_points; n++){
        const float re = in[0];
        const float im = in[1];
        in += 2;
        *out++ = (re * re) + (im * im);
    }
}
#endif /* LV_HAVE_SSE3 */
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>
/*!
  \brief Computes the squared magnitude (re*re + im*im) of each complex input value (SSE, aligned loads/stores)
  \param magnitudeVector Destination buffer receiving one real value per complex input
  \param complexVector Source buffer holding the complex input values
  \param num_points Number of complex values read from complexVector and results written to magnitudeVector
*/
static inline void volk_gnsssdr_32fc_magnitude_squared_32f_a_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
    const unsigned int simdBlocks = num_points / 4;
    const float* in = (float*)complexVector;
    float* out = magnitudeVector;
    unsigned int n;

    // Four complex values (eight floats) per iteration
    for(n = 0; n < simdBlocks; n++){
        const __m128 firstPair = _mm_load_ps(in);
        const __m128 secondPair = _mm_load_ps(in + 4);
        in += 8;

        // De-interleave: even lanes are the real parts, odd lanes the imaginary parts
        __m128 reals = _mm_shuffle_ps(firstPair, secondPair, _MM_SHUFFLE(2,0,2,0));
        __m128 imags = _mm_shuffle_ps(firstPair, secondPair, _MM_SHUFFLE(3,1,3,1));

        reals = _mm_mul_ps(reals, reals);
        imags = _mm_mul_ps(imags, imags);

        _mm_store_ps(out, _mm_add_ps(reals, imags));
        out += 4;
    }

    // Leftover values that did not fill a whole SIMD block
    for(n = simdBlocks * 4; n < num_points; n++){
        const float re = in[0];
        const float im = in[1];
        in += 2;
        *out++ = (re * re) + (im * im);
    }
}
#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_GENERIC
/*!
  \brief Computes the squared magnitude (re*re + im*im) of each complex input value (portable version)
  \param magnitudeVector Destination buffer receiving one real value per complex input
  \param complexVector Source buffer holding the complex input values
  \param num_points Number of complex values read from complexVector and results written to magnitudeVector
*/
static inline void volk_gnsssdr_32fc_magnitude_squared_32f_a_generic(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
    // The complex buffer is walked as interleaved re,im floats
    const float* in = (float*)complexVector;
    float* out = magnitudeVector;
    unsigned int remaining;

    for(remaining = num_points; remaining > 0; remaining--){
        const float re = in[0];
        const float im = in[1];
        in += 2;
        *out++ = (re*re) + (im*im);
    }
}
#endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_volk_gnsssdr_32fc_magnitude_squared_32f_a_H */

View File

@ -0,0 +1,231 @@
/*!
* \file volk_gnsssdr_32fc_s32f_convert_8ic.h
* \brief Volk protokernel: converts float32 complex values to 8 integer complex values taking care of overflow
* \authors <ul>
* <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
* </ul>
*
* -------------------------------------------------------------------------
*
* Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
*
* GNSS-SDR is a software defined Global Navigation
* Satellite Systems receiver
*
* This file is part of GNSS-SDR.
*
* GNSS-SDR is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* GNSS-SDR is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
*
* -------------------------------------------------------------------------
*/
#ifndef INCLUDED_volk_gnsssdr_32fc_s32f_convert_8ic_u_H
#define INCLUDED_volk_gnsssdr_32fc_s32f_convert_8ic_u_H
#include <inttypes.h>
#include <stdio.h>
#include <math.h>
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>
/*!
  \brief Scales a vector of complex floats (32 bits per component) by 1/scalar and converts it into a vector of complex 8-bit integers, clipping out-of-range values (SSE2, unaligned loads/stores)
  \param outputVector The 8 bit output data buffer
  \param inputVector The floating point input data buffer
  \param scalar The value every component is divided by before clipping and conversion
  \param num_points The number of complex values to be converted
*/
static inline void volk_gnsssdr_32fc_s32f_convert_8ic_u_sse2(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, const float scalar, unsigned int num_points){
    // Each SIMD iteration consumes 16 floats, i.e. 8 complex values
    const unsigned int sse_iters = num_points/8;

    const float* inputVectorPtr = (const float*)inputVector;
    int8_t* outputVectorPtr = (int8_t*)outputVector;
    // The SIMD path multiplies by the reciprocal instead of dividing
    const __m128 invScalar = _mm_set_ps1(1.0/scalar);

    const float min_val = -128;
    const float max_val = 127;

    __m128 inputVal1, inputVal2, inputVal3, inputVal4;
    __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
    __m128i int8InputVal;
    __m128 ret1, ret2, ret3, ret4;
    const __m128 vmin_val = _mm_set_ps1(min_val);
    const __m128 vmax_val = _mm_set_ps1(max_val);

    for(unsigned int i = 0; i < sse_iters; i++){
        inputVal1 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
        inputVal2 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
        inputVal3 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
        inputVal4 = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;

        inputVal1 = _mm_mul_ps(inputVal1, invScalar);
        inputVal2 = _mm_mul_ps(inputVal2, invScalar);
        inputVal3 = _mm_mul_ps(inputVal3, invScalar);
        inputVal4 = _mm_mul_ps(inputVal4, invScalar);

        // Clip
        ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
        ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
        ret3 = _mm_max_ps(_mm_min_ps(inputVal3, vmax_val), vmin_val);
        ret4 = _mm_max_ps(_mm_min_ps(inputVal4, vmax_val), vmin_val);

        intInputVal1 = _mm_cvtps_epi32(ret1);
        intInputVal2 = _mm_cvtps_epi32(ret2);
        intInputVal3 = _mm_cvtps_epi32(ret3);
        intInputVal4 = _mm_cvtps_epi32(ret4);

        // Narrow 32 -> 16 -> 8 bits, ending with sixteen 8-bit integers in one register
        intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
        intInputVal2 = _mm_packs_epi32(intInputVal3, intInputVal4);
        int8InputVal = _mm_packs_epi16(intInputVal1, intInputVal2);

        _mm_storeu_si128((__m128i*)outputVectorPtr, int8InputVal);
        outputVectorPtr += 16;
    }

    // Scalar tail: the SIMD loop handled 8 complex values per iteration, so
    // (num_points % 8) complex values remain, two components each.
    float scaled = 0;
    for(unsigned int i = 0; i < (num_points%8)*2; i++){
        scaled = inputVectorPtr[i]/scalar;
        if(scaled > max_val)
            scaled = max_val;
        else if(scaled < min_val)
            scaled = min_val;
        outputVectorPtr[i] = (int8_t)rintf(scaled);
    }
}
#endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_GENERIC
/*!
  \brief Divides every component of a complex float vector by scalar and converts the result into complex 8-bit integers, clipping out-of-range values (portable version)
  \param outputVector The 8 bit output data buffer
  \param inputVector The floating point input data buffer
  \param scalar The value every component is divided by before clipping and conversion
  \param num_points The number of complex values to be converted
*/
static inline void volk_gnsssdr_32fc_s32f_convert_8ic_generic(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, const float scalar, unsigned int num_points){
    const float* in = (float*)inputVector;
    int8_t* out = (int8_t*)outputVector;
    const float lowest = -128;
    const float highest = 127;
    // Real and imaginary parts are handled as one flat run of floats
    const unsigned int componentCount = num_points*2;
    unsigned int k;

    for(k = 0; k < componentCount; k++){
        float value = (in[k])/scalar;
        if(value > highest){
            value = highest;
        }
        else if(value < lowest){
            value = lowest;
        }
        out[k] = (int8_t)rintf(value);
    }
}
#endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_volk_gnsssdr_32fc_s32f_convert_8ic_u_H */
#ifndef INCLUDED_volk_gnsssdr_32fc_s32f_convert_8ic_a_H
#define INCLUDED_volk_gnsssdr_32fc_s32f_convert_8ic_a_H
#include <volk/volk_common.h>
#include <inttypes.h>
#include <stdio.h>
#include <math.h>
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>
/*!
  \brief Scales a vector of complex floats (32 bits per component) by 1/scalar and converts it into a vector of complex 8-bit integers, clipping out-of-range values (SSE2, aligned loads/stores)
  \param outputVector The 8 bit output data buffer
  \param inputVector The floating point input data buffer
  \param scalar The value every component is divided by before clipping and conversion
  \param num_points The number of complex values to be converted
*/
static inline void volk_gnsssdr_32fc_s32f_convert_8ic_a_sse2(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, const float scalar, unsigned int num_points){
    // Each SIMD iteration consumes 16 floats, i.e. 8 complex values
    const unsigned int sse_iters = num_points/8;

    const float* inputVectorPtr = (const float*)inputVector;
    int8_t* outputVectorPtr = (int8_t*)outputVector;
    // The SIMD path multiplies by the reciprocal instead of dividing
    const __m128 invScalar = _mm_set_ps1(1.0/scalar);

    const float min_val = -128;
    const float max_val = 127;

    __m128 inputVal1, inputVal2, inputVal3, inputVal4;
    __m128i intInputVal1, intInputVal2, intInputVal3, intInputVal4;
    __m128i int8InputVal;
    __m128 ret1, ret2, ret3, ret4;
    const __m128 vmin_val = _mm_set_ps1(min_val);
    const __m128 vmax_val = _mm_set_ps1(max_val);

    for(unsigned int i = 0; i < sse_iters; i++){
        inputVal1 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
        inputVal2 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
        inputVal3 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
        inputVal4 = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;

        inputVal1 = _mm_mul_ps(inputVal1, invScalar);
        inputVal2 = _mm_mul_ps(inputVal2, invScalar);
        inputVal3 = _mm_mul_ps(inputVal3, invScalar);
        inputVal4 = _mm_mul_ps(inputVal4, invScalar);

        // Clip
        ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
        ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
        ret3 = _mm_max_ps(_mm_min_ps(inputVal3, vmax_val), vmin_val);
        ret4 = _mm_max_ps(_mm_min_ps(inputVal4, vmax_val), vmin_val);

        intInputVal1 = _mm_cvtps_epi32(ret1);
        intInputVal2 = _mm_cvtps_epi32(ret2);
        intInputVal3 = _mm_cvtps_epi32(ret3);
        intInputVal4 = _mm_cvtps_epi32(ret4);

        // Narrow 32 -> 16 -> 8 bits, ending with sixteen 8-bit integers in one register
        intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
        intInputVal2 = _mm_packs_epi32(intInputVal3, intInputVal4);
        int8InputVal = _mm_packs_epi16(intInputVal1, intInputVal2);

        _mm_store_si128((__m128i*)outputVectorPtr, int8InputVal);
        outputVectorPtr += 16;
    }

    // Scalar tail: the SIMD loop handled 8 complex values per iteration, so
    // (num_points % 8) complex values remain, two components each.
    float scaled = 0;
    for(unsigned int i = 0; i < (num_points%8)*2; i++){
        scaled = inputVectorPtr[i]/scalar;
        if(scaled > max_val)
            scaled = max_val;
        else if(scaled < min_val)
            scaled = min_val;
        outputVectorPtr[i] = (int8_t)rintf(scaled);
    }
}
#endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_GENERIC
/*!
  \brief Converts a vector of complex floats (32 bits per component) into a vector of complex 8-bit integers (generic version)
  \param outputVector The complex 8-bit output buffer
  \param inputVector The complex float input buffer
  \param scalar Every input component is divided by this value before being clipped to [-128, 127] and rounded
  \param num_points The number of complex values to be converted
*/
static inline void volk_gnsssdr_32fc_s32f_convert_8ic_a_generic(lv_8sc_t* outputVector, const lv_32fc_t* inputVector, const float scalar, unsigned int num_points){
    const float lower_bound = -128;
    const float upper_bound = 127;

    // Treat both buffers as flat arrays of components (real, imag, real, imag, ...)
    float* src = (float*)inputVector;
    int8_t* dst = (int8_t*)outputVector;

    unsigned int k = 0;
    while(k < num_points*2){
        float value = src[k]/scalar;
        if(value > upper_bound){
            value = upper_bound;
        }
        else if(value < lower_bound){
            value = lower_bound;
        }
        dst[k] = (int8_t)rintf(value);
        k++;
    }
}
#endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_volk_gnsssdr_32fc_s32f_convert_8ic_a_H */

View File

@ -0,0 +1,266 @@
/*!
* \file volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc
* \brief Volk protokernel: replaces the tracking function for update_local_code
* \authors <ul>
* <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
* </ul>
*
* Volk protokernel that replaces the tracking function for update_local_code
*
* -------------------------------------------------------------------------
*
* Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
*
* GNSS-SDR is a software defined Global Navigation
* Satellite Systems receiver
*
* This file is part of GNSS-SDR.
*
* GNSS-SDR is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* GNSS-SDR is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
*
* -------------------------------------------------------------------------
*/
#ifndef INCLUDED_volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_u_H
#define INCLUDED_volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_u_H
#include <inttypes.h>
#include <stdio.h>
#include <volk_gnsssdr/volk_gnsssdr_complex.h>
#include <float.h>
#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>
/*!
  \brief Builds a resampled local code: every output sample is the element of d_ca_code selected by the running code phase (SSE4.1 version, unaligned stores)
  \param d_very_early_code Output buffer that receives num_points code samples
  \param d_very_early_late_spc_chips Spacing value; twice this value is subtracted from the code phase before it is wrapped
  \param code_length_half_chips Modulus used to wrap the code phase
  \param code_phase_step_half_chips Code phase increment between two consecutive output samples
  \param tcode_half_chips_input Code phase of the first output sample
  \param d_ca_code Code table, indexed with 2 + round(fmod(phase - 2*spacing, code_length))
  \param num_points The number of output samples to generate
*/
static inline void volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_u_sse4_1(lv_32fc_t* d_very_early_code, const float d_very_early_late_spc_chips, const float code_length_half_chips, const float code_phase_step_half_chips, const float tcode_half_chips_input, const lv_32fc_t* d_ca_code, unsigned int num_points){
    const unsigned int sse_iters = num_points / 4;

    __m128 tquot, fmod_num, fmod_result, associated_chip_index_array;

    // Lane k holds the code phase of output sample 4*i + k
    __m128 tcode_half_chips_array = _mm_set_ps (tcode_half_chips_input+3*code_phase_step_half_chips, tcode_half_chips_input+2*code_phase_step_half_chips, tcode_half_chips_input+code_phase_step_half_chips, tcode_half_chips_input);
    // Four samples are produced per iteration, so every lane advances by four steps
    __m128 code_phase_step_half_chips_array = _mm_set1_ps (code_phase_step_half_chips*4);
    __m128 d_very_early_late_spc_chips_Multiplied_by_2 = _mm_set1_ps (2*d_very_early_late_spc_chips);
    __m128 code_length_half_chips_array = _mm_set1_ps (code_length_half_chips);
    __m128 twos = _mm_set1_ps (2);
    __m128i associated_chip_index_array_int;

    __VOLK_ATTR_ALIGNED(16) int32_t output[4];

    for (unsigned int i = 0; i < sse_iters; i++)
    {
        //fmod = numer - tquot * denom; tquot = numer/denom truncated
        //associated_chip_index = 2 + round(fmod(tcode_half_chips - 2*d_very_early_late_spc_chips, code_length_half_chips));
        fmod_num = _mm_sub_ps (tcode_half_chips_array, d_very_early_late_spc_chips_Multiplied_by_2);
        tquot = _mm_div_ps (fmod_num, code_length_half_chips_array);
        tquot = _mm_round_ps (tquot, (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) );
        fmod_result = _mm_sub_ps (fmod_num, _mm_mul_ps (tquot, code_length_half_chips_array));

        associated_chip_index_array = _mm_round_ps (fmod_result, (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC));
        associated_chip_index_array = _mm_add_ps(twos, associated_chip_index_array);
        associated_chip_index_array_int = _mm_cvtps_epi32 (associated_chip_index_array);
        _mm_storeu_si128 ((__m128i*)output, associated_chip_index_array_int);

        //d_very_early_code[i] = d_ca_code[associated_chip_index];
        *d_very_early_code++ = d_ca_code[output[0]];
        *d_very_early_code++ = d_ca_code[output[1]];
        *d_very_early_code++ = d_ca_code[output[2]];
        *d_very_early_code++ = d_ca_code[output[3]];

        //tcode_half_chips = tcode_half_chips + code_phase_step_half_chips;
        tcode_half_chips_array = _mm_add_ps (tcode_half_chips_array, code_phase_step_half_chips_array);
    }

    if (num_points%4!=0)
    {
        // Spill the float phases with the float store intrinsic; the integer
        // store takes a __m128i and must not be handed a __m128.
        __VOLK_ATTR_ALIGNED(16) float tcode_half_chips_stored[4];
        _mm_storeu_ps (tcode_half_chips_stored, tcode_half_chips_array);

        int associated_chip_index;
        // Lane 0 is the phase of the first sample not yet produced
        float tcode_half_chips = tcode_half_chips_stored[0];
        float d_very_early_late_spc_chips_multiplied_by_2 = 2*d_very_early_late_spc_chips;

        // d_very_early_code was advanced in the SIMD loop, so index from 0 here
        for (unsigned int i = 0; i < num_points%4; i++)
        {
            associated_chip_index = 2 + round(fmod(tcode_half_chips - d_very_early_late_spc_chips_multiplied_by_2, code_length_half_chips));
            d_very_early_code[i] = d_ca_code[associated_chip_index];
            tcode_half_chips = tcode_half_chips + code_phase_step_half_chips;
        }
    }
}
#endif /* LV_HAVE_SSE4_1 */
#ifdef LV_HAVE_GENERIC
/*!
  \brief Builds a resampled local code: every output sample is the element of d_ca_code selected by the running code phase (generic version)
  \param d_very_early_code Output buffer that receives num_points code samples
  \param d_very_early_late_spc_chips Spacing value; twice this value is subtracted from the code phase before it is wrapped
  \param code_length_half_chips Modulus used to wrap the code phase
  \param code_phase_step_half_chips Code phase increment between two consecutive output samples
  \param tcode_half_chips_input Code phase of the first output sample
  \param d_ca_code Code table, indexed with 2 + round(fmod(phase - 2*spacing, code_length))
  \param num_points The number of output samples to generate
*/
static inline void volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_generic(lv_32fc_t* d_very_early_code, const float d_very_early_late_spc_chips, const float code_length_half_chips, const float code_phase_step_half_chips, const float tcode_half_chips_input, const lv_32fc_t* d_ca_code, unsigned int num_points){
    // The scalar parameters are used exactly as supplied by the caller; they
    // must not be overwritten through const-discarding pointers.
    int associated_chip_index;
    float tcode_half_chips = tcode_half_chips_input;
    float d_very_early_late_spc_chips_multiplied_by_2 = 2*d_very_early_late_spc_chips;

    for (unsigned int i = 0; i < num_points; i++)
    {
        associated_chip_index = 2 + round(fmod(tcode_half_chips - d_very_early_late_spc_chips_multiplied_by_2, code_length_half_chips));
        d_very_early_code[i] = d_ca_code[associated_chip_index];
        tcode_half_chips = tcode_half_chips + code_phase_step_half_chips;
    }
}
#endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_u_H */
#ifndef INCLUDED_volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_a_H
#define INCLUDED_volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_a_H
#include <inttypes.h>
#include <stdio.h>
#include <volk_gnsssdr/volk_gnsssdr_complex.h>
#include <float.h>
#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>
/*!
  \brief Builds a resampled local code: every output sample is the element of d_ca_code selected by the running code phase (SSE4.1 version, aligned stores)
  \param d_very_early_code Output buffer that receives num_points code samples
  \param d_very_early_late_spc_chips Spacing value; twice this value is subtracted from the code phase before it is wrapped
  \param code_length_half_chips Modulus used to wrap the code phase
  \param code_phase_step_half_chips Code phase increment between two consecutive output samples
  \param tcode_half_chips_input Code phase of the first output sample
  \param d_ca_code Code table, indexed with 2 + round(fmod(phase - 2*spacing, code_length))
  \param num_points The number of output samples to generate
*/
static inline void volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_a_sse4_1(lv_32fc_t* d_very_early_code, const float d_very_early_late_spc_chips, const float code_length_half_chips, const float code_phase_step_half_chips, const float tcode_half_chips_input, const lv_32fc_t* d_ca_code, unsigned int num_points){
    const unsigned int sse_iters = num_points / 4;

    __m128 tquot, fmod_num, fmod_result, associated_chip_index_array;

    // Lane k holds the code phase of output sample 4*i + k
    __m128 tcode_half_chips_array = _mm_set_ps (tcode_half_chips_input+3*code_phase_step_half_chips, tcode_half_chips_input+2*code_phase_step_half_chips, tcode_half_chips_input+code_phase_step_half_chips, tcode_half_chips_input);
    // Four samples are produced per iteration, so every lane advances by four steps
    __m128 code_phase_step_half_chips_array = _mm_set1_ps (code_phase_step_half_chips*4);
    __m128 d_very_early_late_spc_chips_Multiplied_by_2 = _mm_set1_ps (2*d_very_early_late_spc_chips);
    __m128 code_length_half_chips_array = _mm_set1_ps (code_length_half_chips);
    __m128 twos = _mm_set1_ps (2);
    __m128i associated_chip_index_array_int;

    __VOLK_ATTR_ALIGNED(16) int32_t output[4];

    for (unsigned int i = 0; i < sse_iters; i++)
    {
        //fmod = numer - tquot * denom; tquot = numer/denom truncated
        //associated_chip_index = 2 + round(fmod(tcode_half_chips - 2*d_very_early_late_spc_chips, code_length_half_chips));
        fmod_num = _mm_sub_ps (tcode_half_chips_array, d_very_early_late_spc_chips_Multiplied_by_2);
        tquot = _mm_div_ps (fmod_num, code_length_half_chips_array);
        tquot = _mm_round_ps (tquot, (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) );
        fmod_result = _mm_sub_ps (fmod_num, _mm_mul_ps (tquot, code_length_half_chips_array));

        associated_chip_index_array = _mm_round_ps (fmod_result, (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC));
        associated_chip_index_array = _mm_add_ps(twos, associated_chip_index_array);
        associated_chip_index_array_int = _mm_cvtps_epi32 (associated_chip_index_array);
        _mm_store_si128 ((__m128i*)output, associated_chip_index_array_int);

        //d_very_early_code[i] = d_ca_code[associated_chip_index];
        *d_very_early_code++ = d_ca_code[output[0]];
        *d_very_early_code++ = d_ca_code[output[1]];
        *d_very_early_code++ = d_ca_code[output[2]];
        *d_very_early_code++ = d_ca_code[output[3]];

        //tcode_half_chips = tcode_half_chips + code_phase_step_half_chips;
        tcode_half_chips_array = _mm_add_ps (tcode_half_chips_array, code_phase_step_half_chips_array);
    }

    if (num_points%4!=0)
    {
        // Spill the float phases with the float store intrinsic; the integer
        // store takes a __m128i and must not be handed a __m128.
        __VOLK_ATTR_ALIGNED(16) float tcode_half_chips_stored[4];
        _mm_store_ps (tcode_half_chips_stored, tcode_half_chips_array);

        int associated_chip_index;
        // Lane 0 is the phase of the first sample not yet produced
        float tcode_half_chips = tcode_half_chips_stored[0];
        float d_very_early_late_spc_chips_multiplied_by_2 = 2*d_very_early_late_spc_chips;

        // d_very_early_code was advanced in the SIMD loop, so index from 0 here
        for (unsigned int i = 0; i < num_points%4; i++)
        {
            associated_chip_index = 2 + round(fmod(tcode_half_chips - d_very_early_late_spc_chips_multiplied_by_2, code_length_half_chips));
            d_very_early_code[i] = d_ca_code[associated_chip_index];
            tcode_half_chips = tcode_half_chips + code_phase_step_half_chips;
        }
    }
}
#endif /* LV_HAVE_SSE4_1 */
#ifdef LV_HAVE_GENERIC
/*!
  \brief Builds a resampled local code: every output sample is the element of d_ca_code selected by the running code phase (generic version)
  \param d_very_early_code Output buffer that receives num_points code samples
  \param d_very_early_late_spc_chips Spacing value; twice this value is subtracted from the code phase before it is wrapped
  \param code_length_half_chips Modulus used to wrap the code phase
  \param code_phase_step_half_chips Code phase increment between two consecutive output samples
  \param tcode_half_chips_input Code phase of the first output sample
  \param d_ca_code Code table, indexed with 2 + round(fmod(phase - 2*spacing, code_length))
  \param num_points The number of output samples to generate
*/
static inline void volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_a_generic(lv_32fc_t* d_very_early_code, const float d_very_early_late_spc_chips, const float code_length_half_chips, const float code_phase_step_half_chips, const float tcode_half_chips_input, const lv_32fc_t* d_ca_code, unsigned int num_points){
    // Constant offset removed from the phase of every sample
    const float spacing_offset = 2*d_very_early_late_spc_chips;
    float phase = tcode_half_chips_input;
    unsigned int n = 0;

    while (n < num_points)
    {
        // Wrap the shifted phase, round it and skip the first two table entries
        const int chip = 2 + round(fmod(phase - spacing_offset, code_length_half_chips));
        d_very_early_code[n] = d_ca_code[chip];
        phase = phase + code_phase_step_half_chips;
        n++;
    }
}
#endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc_a_H */

View File

@ -0,0 +1,178 @@
#ifndef INCLUDED_volk_gnsssdr_32fc_s32fc_multiply_32fc_u_H
#define INCLUDED_volk_gnsssdr_32fc_s32fc_multiply_32fc_u_H
#include <inttypes.h>
#include <stdio.h>
#include <volk_gnsssdr/volk_gnsssdr_complex.h>
#include <float.h>
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
/*!
  \brief Multiplies every element of a complex vector by a complex scalar (SSE3 version, unaligned loads and stores)
  \param cVector The vector where the results will be stored
  \param aVector The vector to be multiplied
  \param scalar The complex scalar that multiplies each element of aVector
  \param num_points The number of complex values in aVector to be multiplied and stored into cVector
*/
static inline void volk_gnsssdr_32fc_s32fc_multiply_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
    const unsigned int pair_count = num_points / 2;
    const lv_32fc_t* src = aVector;
    lv_32fc_t* dst = cVector;

    // Broadcast the real and the imaginary part of the scalar
    const __m128 scalar_re = _mm_set_ps1(lv_creal(scalar));
    const __m128 scalar_im = _mm_set_ps1(lv_cimag(scalar));

    unsigned int pair;
    for(pair = 0; pair < pair_count; pair++){
        const __m128 in = _mm_loadu_ps((float*)src);             // ar,ai,br,bi
        const __m128 prod_re = _mm_mul_ps(in, scalar_re);        // ar*cr,ai*cr,br*cr,bi*cr
        const __m128 in_swapped = _mm_shuffle_ps(in, in, 0xB1);  // ai,ar,bi,br
        const __m128 prod_im = _mm_mul_ps(in_swapped, scalar_im);// ai*ci,ar*ci,bi*ci,br*ci
        // ar*cr-ai*ci, ai*cr+ar*ci, br*cr-bi*ci, bi*cr+br*ci
        _mm_storeu_ps((float*)dst, _mm_addsub_ps(prod_re, prod_im));
        src += 2;
        dst += 2;
    }

    // An odd count leaves exactly one element for scalar code
    if((num_points % 2) != 0) {
        *dst = (*src) * scalar;
    }
}
#endif /* LV_HAVE_SSE3 */
#ifdef LV_HAVE_GENERIC
/*!
  \brief Multiplies every element of a complex vector by a complex scalar (generic version)
  \param cVector The vector where the results will be stored
  \param aVector The vector to be multiplied
  \param scalar The complex scalar that multiplies each element of aVector
  \param num_points The number of complex values in aVector to be multiplied and stored into cVector
*/
static inline void volk_gnsssdr_32fc_s32fc_multiply_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
    unsigned int idx;

    // Element-wise product, processed front to back
    for (idx = 0; idx < num_points; idx++){
        cVector[idx] = aVector[idx] * scalar;
    }
}
#endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_volk_gnsssdr_32fc_s32fc_multiply_32fc_u_H */
#ifndef INCLUDED_volk_gnsssdr_32fc_s32fc_multiply_32fc_a_H
#define INCLUDED_volk_gnsssdr_32fc_s32fc_multiply_32fc_a_H
#include <inttypes.h>
#include <stdio.h>
#include <volk_gnsssdr/volk_gnsssdr_complex.h>
#include <float.h>
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
/*!
  \brief Multiplies every element of a complex vector by a complex scalar (SSE3 version, aligned loads and stores)
  \param cVector The vector where the results will be stored
  \param aVector The vector to be multiplied
  \param scalar The complex scalar that multiplies each element of aVector
  \param num_points The number of complex values in aVector to be multiplied and stored into cVector
*/
static inline void volk_gnsssdr_32fc_s32fc_multiply_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
    const unsigned int pair_count = num_points / 2;
    const lv_32fc_t* src = aVector;
    lv_32fc_t* dst = cVector;

    // Broadcast the real and the imaginary part of the scalar
    const __m128 scalar_re = _mm_set_ps1(lv_creal(scalar));
    const __m128 scalar_im = _mm_set_ps1(lv_cimag(scalar));

    unsigned int pair;
    for(pair = 0; pair < pair_count; pair++){
        const __m128 in = _mm_load_ps((float*)src);              // ar,ai,br,bi
        const __m128 prod_re = _mm_mul_ps(in, scalar_re);        // ar*cr,ai*cr,br*cr,bi*cr
        const __m128 in_swapped = _mm_shuffle_ps(in, in, 0xB1);  // ai,ar,bi,br
        const __m128 prod_im = _mm_mul_ps(in_swapped, scalar_im);// ai*ci,ar*ci,bi*ci,br*ci
        // ar*cr-ai*ci, ai*cr+ar*ci, br*cr-bi*ci, bi*cr+br*ci
        _mm_store_ps((float*)dst, _mm_addsub_ps(prod_re, prod_im));
        src += 2;
        dst += 2;
    }

    // An odd count leaves exactly one element for scalar code
    if((num_points % 2) != 0) {
        *dst = (*src) * scalar;
    }
}
#endif /* LV_HAVE_SSE3 */
#ifdef LV_HAVE_GENERIC
/*!
  \brief Multiplies every element of a complex vector by a complex scalar (generic version)
  \param cVector The vector where the results will be stored
  \param aVector The vector to be multiplied
  \param scalar The complex scalar that multiplies each element of aVector
  \param num_points The number of complex values in aVector to be multiplied and stored into cVector
*/
static inline void volk_gnsssdr_32fc_s32fc_multiply_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
    unsigned int idx;

    // Element-wise product, processed front to back
    for (idx = 0; idx < num_points; idx++){
        cVector[idx] = aVector[idx] * scalar;
    }
}
#endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_volk_gnsssdr_32fc_s32fc_multiply_32fc_a_H */

View File

@ -0,0 +1,763 @@
#ifndef INCLUDED_volk_gnsssdr_32fc_x2_dot_prod_32fc_u_H
#define INCLUDED_volk_gnsssdr_32fc_x2_dot_prod_32fc_u_H
#include <volk_gnsssdr/volk_gnsssdr_common.h>
#include <volk_gnsssdr/volk_gnsssdr_complex.h>
#include <stdio.h>
#include <string.h>
#ifdef LV_HAVE_GENERIC
/*
 * Complex dot product (generic version): writes sum(input[k] * taps[k]) for
 * k in [0, num_points) to *result. No conjugation is applied.
 * Points are consumed two at a time with one accumulator per position; an odd
 * trailing point is added at the end.
 */
static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
    // Flat views: real, imag, real, imag, ...
    float * out = (float*) result;
    float * x = (float*) input;
    float * t = (float*) taps;

    // Accumulators for the first and the second point of every pair
    float first_re = 0;
    float first_im = 0;
    float second_re = 0;
    float second_im = 0;

    unsigned int pairs_left = num_points/2;
    while(pairs_left > 0) {
        first_re += x[0] * t[0] - x[1] * t[1];
        first_im += x[0] * t[1] + x[1] * t[0];
        second_re += x[2] * t[2] - x[3] * t[3];
        second_im += x[2] * t[3] + x[3] * t[2];
        x += 4;
        t += 4;
        --pairs_left;
    }

    out[0] = first_re + second_re;
    out[1] = first_im + second_im;

    // Cleanup if we had an odd number of points
    if(num_points & 1) {
        *result += input[num_points - 1] * taps[num_points - 1];
    }
}
#endif /*LV_HAVE_GENERIC*/
#if LV_HAVE_SSE && LV_HAVE_64
/*
 * Complex dot product (x86-64 SSE inline assembly, unaligned loads): writes
 * sum(input[k] * taps[k]) for k in [0, num_points) to *result.
 *
 * The assembly handles points in groups of four (main loop) plus one optional
 * group of two; an odd trailing point is added in C afterwards.
 *
 * NOTE(review): the first 16 bytes of input and taps are loaded before any
 * length test, and the loop preloads the next 16 bytes at the end of every
 * iteration, so the buffers may be read past num_points elements.
 */
static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_u_sse_64(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
    // The assembly reads the full 64-bit %rcx, so hand it a 64-bit value;
    // this also keeps num_points*8 from wrapping in 32-bit arithmetic.
    const unsigned long long num_bytes = (unsigned long long)num_points*8;
    unsigned int isodd = num_points & 1;

    asm
    (
        "# ccomplex_dotprod_generic (float* result, const float *input,\n\t"
        "# const float *taps, unsigned num_bytes)\n\t"
        "# float sum0 = 0;\n\t"
        "# float sum1 = 0;\n\t"
        "# float sum2 = 0;\n\t"
        "# float sum3 = 0;\n\t"
        "# do {\n\t"
        "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t"
        "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t"
        "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t"
        "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t"
        "# input += 4;\n\t"
        "# taps += 4; \n\t"
        "# } while (--n_2_ccomplex_blocks != 0);\n\t"
        "# result[0] = sum0 + sum2;\n\t"
        "# result[1] = sum1 + sum3;\n\t"
        "# TODO: prefetch and better scheduling\n\t"
        " xor %%r9, %%r9\n\t"
        " xor %%r10, %%r10\n\t"
        " movq %%rcx, %%rax\n\t"
        " movq %%rcx, %%r8\n\t"
        " movq %[rsi], %%r9\n\t"
        " movq %[rdx], %%r10\n\t"
        " xorps %%xmm6, %%xmm6 # zero accumulators\n\t"
        " movups 0(%%r9), %%xmm0\n\t"
        " xorps %%xmm7, %%xmm7 # zero accumulators\n\t"
        " movups 0(%%r10), %%xmm2\n\t"
        " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t"
        " shr $4, %%r8\n\t"
        " jmp .%=L1_test\n\t"
        " # 4 taps / loop\n\t"
        " # something like ?? cycles / loop\n\t"
        ".%=Loop1: \n\t"
        "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
        "# movups (%%r9), %%xmmA\n\t"
        "# movups (%%r10), %%xmmB\n\t"
        "# movups %%xmmA, %%xmmZ\n\t"
        "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t"
        "# mulps %%xmmB, %%xmmA\n\t"
        "# mulps %%xmmZ, %%xmmB\n\t"
        "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
        "# xorps %%xmmPN, %%xmmA\n\t"
        "# movups %%xmmA, %%xmmZ\n\t"
        "# unpcklps %%xmmB, %%xmmA\n\t"
        "# unpckhps %%xmmB, %%xmmZ\n\t"
        "# movups %%xmmZ, %%xmmY\n\t"
        "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t"
        "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t"
        "# addps %%xmmZ, %%xmmA\n\t"
        "# addps %%xmmA, %%xmmC\n\t"
        "# A=xmm0, B=xmm2, Z=xmm4\n\t"
        "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
        " movups 16(%%r9), %%xmm1\n\t"
        " movups %%xmm0, %%xmm4\n\t"
        " mulps %%xmm2, %%xmm0\n\t"
        " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
        " movups 16(%%r10), %%xmm3\n\t"
        " movups %%xmm1, %%xmm5\n\t"
        " addps %%xmm0, %%xmm6\n\t"
        " mulps %%xmm3, %%xmm1\n\t"
        " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t"
        " addps %%xmm1, %%xmm6\n\t"
        " mulps %%xmm4, %%xmm2\n\t"
        " movups 32(%%r9), %%xmm0\n\t"
        " addps %%xmm2, %%xmm7\n\t"
        " mulps %%xmm5, %%xmm3\n\t"
        " add $32, %%r9\n\t"
        " movups 32(%%r10), %%xmm2\n\t"
        " addps %%xmm3, %%xmm7\n\t"
        " add $32, %%r10\n\t"
        ".%=L1_test:\n\t"
        " dec %%rax\n\t"
        " jge .%=Loop1\n\t"
        " # We've handled the bulk of multiplies up to here.\n\t"
        " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
        " # If so, we've got 2 more taps to do.\n\t"
        " and $1, %%r8\n\t"
        " je .%=Leven\n\t"
        " # The count was odd, do 2 more taps.\n\t"
        " # Note that we've already got mm0/mm2 preloaded\n\t"
        " # from the main loop.\n\t"
        " movups %%xmm0, %%xmm4\n\t"
        " mulps %%xmm2, %%xmm0\n\t"
        " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
        " addps %%xmm0, %%xmm6\n\t"
        " mulps %%xmm4, %%xmm2\n\t"
        " addps %%xmm2, %%xmm7\n\t"
        ".%=Leven:\n\t"
        " # neg inversor\n\t"
        " xorps %%xmm1, %%xmm1\n\t"
        " mov $0x80000000, %%r9\n\t"
        " movd %%r9, %%xmm1\n\t"
        " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t"
        " # pfpnacc\n\t"
        " xorps %%xmm1, %%xmm6\n\t"
        " movups %%xmm6, %%xmm2\n\t"
        " unpcklps %%xmm7, %%xmm6\n\t"
        " unpckhps %%xmm7, %%xmm2\n\t"
        " movups %%xmm2, %%xmm3\n\t"
        " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t"
        " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t"
        " addps %%xmm2, %%xmm6\n\t"
        " # xmm6 = r1 i2 r3 i4\n\t"
        " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t"
        " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
        " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) to memory\n\t"
        :
        :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result)
        // Every register the code overwrites is declared, plus "memory"
        // because *result is written and the input buffers are read, and "cc"
        // because the flags are modified.
        :"rax", "r8", "r9", "r10",
         "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
         "memory", "cc"
    );

    // The assembly only handles an even number of points
    if(isodd) {
        *result += input[num_points - 1] * taps[num_points - 1];
    }

    return;
}
#endif /* LV_HAVE_SSE && LV_HAVE_64 */
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
/*
 * Complex dot product (SSE3 version, unaligned loads): writes
 * sum(input[k] * taps[k]) for k in [0, num_points) to *result.
 * Two complex points are multiplied per iteration; an odd trailing point is
 * added in scalar code.
 */
static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_u_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
    const unsigned int pair_count = num_points/2;
    const lv_32fc_t* in_ptr = input;
    const lv_32fc_t* tap_ptr = taps;

    // Scalar total, cleared byte-wise
    lv_32fc_t total;
    memset(&total, 0x0, 2*sizeof(float));

    // Two partial complex sums, one per point position within a pair
    __m128 partial = _mm_setzero_ps();

    unsigned int pair;
    for(pair = 0; pair < pair_count; pair++){
        __m128 x = _mm_loadu_ps((float*)in_ptr);        // ar,ai,br,bi
        const __m128 y = _mm_loadu_ps((float*)tap_ptr); // cr,ci,dr,di
        const __m128 y_re = _mm_moveldup_ps(y);         // cr,cr,dr,dr
        const __m128 y_im = _mm_movehdup_ps(y);         // ci,ci,di,di
        const __m128 prod_re = _mm_mul_ps(x, y_re);     // ar*cr,ai*cr,br*dr,bi*dr
        x = _mm_shuffle_ps(x, x, 0xB1);                 // ai,ar,bi,br
        const __m128 prod_im = _mm_mul_ps(x, y_im);     // ai*ci,ar*ci,bi*di,br*di
        // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
        partial = _mm_add_ps(partial, _mm_addsub_ps(prod_re, prod_im));
        in_ptr += 2;
        tap_ptr += 2;
    }

    // Spill the two partial sums and fold them into the scalar total
    __VOLK_ATTR_ALIGNED(16) lv_32fc_t spilled[2];
    _mm_storeu_ps((float*)spilled, partial);
    total += ( spilled[0] + spilled[1] );

    if(num_points & 1) {
        total += input[num_points - 1] * taps[num_points - 1];
    }

    *result = total;
}
#endif /*LV_HAVE_SSE3*/
#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>
/*
 * Complex dot product (SSE4.1 version, unaligned loads): writes
 * sum(input[k] * taps[k]) for k in [0, num_points) to *result.
 * Four complex points are handled per iteration by de-interleaving real and
 * imaginary parts and using the dot-product instruction; the remaining
 * num_points % 4 points are added in scalar code.
 */
static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_u_sse4_1(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
    const unsigned int quad_count = num_points/4;
    const unsigned int leftover = num_points & 3;

    // XOR mask used below to negate the imag*imag sum.
    // NOTE(review): presumably only the sign bit of the lowest float lane is set — confirm for the target compiler.
    static const __m128i neg = {0x000000000000000080000000};

    float* in_ptr = (float*)input;
    float* tap_ptr = (float*)taps;

    // Running sums of re*re, im*im (lane 0) and re*im, im*re (lane 1)
    __m128 acc_rr = _mm_setzero_ps();
    __m128 acc_ii = _mm_setzero_ps();
    __m128 acc_ri = _mm_setzero_ps();
    __m128 acc_ir = _mm_setzero_ps();

    unsigned int quad;
    unsigned int k;

    for(quad = 0; quad < quad_count; ++quad) {
        const __m128 in_lo = _mm_loadu_ps(in_ptr);
        const __m128 tap_lo = _mm_loadu_ps(tap_ptr);
        const __m128 in_hi = _mm_loadu_ps(in_ptr + 4);
        const __m128 tap_hi = _mm_loadu_ps(tap_ptr + 4);
        in_ptr += 8;
        tap_ptr += 8;

        // First interleave step
        const __m128 in_mix_hi = _mm_unpackhi_ps(in_lo, in_hi);
        const __m128 tap_mix_hi = _mm_unpackhi_ps(tap_lo, tap_hi);
        const __m128 in_mix_lo = _mm_unpacklo_ps(in_lo, in_hi);
        const __m128 tap_mix_lo = _mm_unpacklo_ps(tap_lo, tap_hi);

        // Second interleave step separates the components
        const __m128 in_im = _mm_unpackhi_ps(in_mix_lo, in_mix_hi);    //imaginary vector from input
        const __m128 in_re = _mm_unpacklo_ps(in_mix_lo, in_mix_hi);    //real vector from input
        const __m128 tap_im = _mm_unpackhi_ps(tap_mix_lo, tap_mix_hi); //imaginary vector from taps
        const __m128 tap_re = _mm_unpacklo_ps(tap_mix_lo, tap_mix_hi); //real vector from taps

        // Mask 0xf1 puts the 4-element dot product in lane 0, 0xf2 in lane 1
        acc_rr = _mm_add_ps(_mm_dp_ps(in_re, tap_re, 0xf1), acc_rr);
        acc_ii = _mm_add_ps(_mm_dp_ps(in_im, tap_im, 0xf1), acc_ii);
        acc_ri = _mm_add_ps(_mm_dp_ps(in_re, tap_im, 0xf2), acc_ri);
        acc_ir = _mm_add_ps(_mm_dp_ps(in_im, tap_re, 0xf2), acc_ir);
    }

    // real = rr - ii (lane 0), imag = ri + ir (lane 1)
    acc_ii = _mm_xor_ps(acc_ii, bit128_p(&neg)->float_vec);
    acc_ri = _mm_add_ps(acc_ri, acc_ir);
    acc_rr = _mm_add_ps(acc_rr, acc_ii);
    acc_ri = _mm_add_ps(acc_ri, acc_rr);

    // The two low lanes form the complex result
    _mm_storel_pi((__m64*)result, acc_ri);

    for(k = num_points - leftover; k < num_points; k++) {
        *result += input[k] * taps[k];
    }
}
#endif /*LV_HAVE_SSE4_1*/
#endif /*INCLUDED_volk_gnsssdr_32fc_x2_dot_prod_32fc_u_H*/
#ifndef INCLUDED_volk_gnsssdr_32fc_x2_dot_prod_32fc_a_H
#define INCLUDED_volk_gnsssdr_32fc_x2_dot_prod_32fc_a_H
#include <volk_gnsssdr/volk_gnsssdr_common.h>
#include <volk_gnsssdr/volk_gnsssdr_complex.h>
#include <stdio.h>
#include <string.h>
#ifdef LV_HAVE_GENERIC
/*
 * Complex dot product (generic version): writes sum(input[k] * taps[k]) for
 * k in [0, num_points) to *result. No conjugation is applied.
 * Points are consumed two at a time with one accumulator per position; an odd
 * trailing point is added at the end.
 */
static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_a_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
    float * res = (float*) result;
    float * in = (float*) input;
    float * tp = (float*) taps;
    // Number of two-point blocks. Derived straight from num_points: going
    // through a byte count (num_points*8) wraps around for large vectors.
    unsigned int n_2_ccomplex_blocks = num_points/2;
    unsigned int isodd = num_points & 1;

    float sum0[2] = {0,0};
    float sum1[2] = {0,0};
    unsigned int i = 0;

    for(i = 0; i < n_2_ccomplex_blocks; ++i) {
        sum0[0] += in[0] * tp[0] - in[1] * tp[1];
        sum0[1] += in[0] * tp[1] + in[1] * tp[0];
        sum1[0] += in[2] * tp[2] - in[3] * tp[3];
        sum1[1] += in[2] * tp[3] + in[3] * tp[2];

        in += 4;
        tp += 4;
    }

    res[0] = sum0[0] + sum1[0];
    res[1] = sum0[1] + sum1[1];

    // Cleanup if we had an odd number of points
    for(i = 0; i < isodd; ++i) {
        *result += input[num_points - 1] * taps[num_points - 1];
    }
}
#endif /*LV_HAVE_GENERIC*/
#if LV_HAVE_SSE && LV_HAVE_64
/*
 * Complex dot product (x86-64 SSE inline assembly, aligned loads): writes
 * sum(input[k] * taps[k]) for k in [0, num_points) to *result.
 * input and taps are read with movaps, which requires 16-byte alignment.
 *
 * The assembly handles points in groups of four (main loop) plus one optional
 * group of two; an odd trailing point is added in C afterwards.
 *
 * NOTE(review): the first 16 bytes of input and taps are loaded before any
 * length test, and the loop preloads the next 16 bytes at the end of every
 * iteration, so the buffers may be read past num_points elements.
 */
static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_a_sse_64(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
    // The assembly reads the full 64-bit %rcx, so hand it a 64-bit value;
    // this also keeps num_points*8 from wrapping in 32-bit arithmetic.
    const unsigned long long num_bytes = (unsigned long long)num_points*8;
    unsigned int isodd = num_points & 1;

    asm
    (
        "# ccomplex_dotprod_generic (float* result, const float *input,\n\t"
        "# const float *taps, unsigned num_bytes)\n\t"
        "# float sum0 = 0;\n\t"
        "# float sum1 = 0;\n\t"
        "# float sum2 = 0;\n\t"
        "# float sum3 = 0;\n\t"
        "# do {\n\t"
        "# sum0 += input[0] * taps[0] - input[1] * taps[1];\n\t"
        "# sum1 += input[0] * taps[1] + input[1] * taps[0];\n\t"
        "# sum2 += input[2] * taps[2] - input[3] * taps[3];\n\t"
        "# sum3 += input[2] * taps[3] + input[3] * taps[2];\n\t"
        "# input += 4;\n\t"
        "# taps += 4; \n\t"
        "# } while (--n_2_ccomplex_blocks != 0);\n\t"
        "# result[0] = sum0 + sum2;\n\t"
        "# result[1] = sum1 + sum3;\n\t"
        "# TODO: prefetch and better scheduling\n\t"
        " xor %%r9, %%r9\n\t"
        " xor %%r10, %%r10\n\t"
        " movq %%rcx, %%rax\n\t"
        " movq %%rcx, %%r8\n\t"
        " movq %[rsi], %%r9\n\t"
        " movq %[rdx], %%r10\n\t"
        " xorps %%xmm6, %%xmm6 # zero accumulators\n\t"
        " movaps 0(%%r9), %%xmm0\n\t"
        " xorps %%xmm7, %%xmm7 # zero accumulators\n\t"
        " movaps 0(%%r10), %%xmm2\n\t"
        " shr $5, %%rax # rax = n_2_ccomplex_blocks / 2\n\t"
        " shr $4, %%r8\n\t"
        " jmp .%=L1_test\n\t"
        " # 4 taps / loop\n\t"
        " # something like ?? cycles / loop\n\t"
        ".%=Loop1: \n\t"
        "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
        "# movaps (%%r9), %%xmmA\n\t"
        "# movaps (%%r10), %%xmmB\n\t"
        "# movaps %%xmmA, %%xmmZ\n\t"
        "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t"
        "# mulps %%xmmB, %%xmmA\n\t"
        "# mulps %%xmmZ, %%xmmB\n\t"
        "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
        "# xorps %%xmmPN, %%xmmA\n\t"
        "# movaps %%xmmA, %%xmmZ\n\t"
        "# unpcklps %%xmmB, %%xmmA\n\t"
        "# unpckhps %%xmmB, %%xmmZ\n\t"
        "# movaps %%xmmZ, %%xmmY\n\t"
        "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t"
        "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t"
        "# addps %%xmmZ, %%xmmA\n\t"
        "# addps %%xmmA, %%xmmC\n\t"
        "# A=xmm0, B=xmm2, Z=xmm4\n\t"
        "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
        " movaps 16(%%r9), %%xmm1\n\t"
        " movaps %%xmm0, %%xmm4\n\t"
        " mulps %%xmm2, %%xmm0\n\t"
        " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
        " movaps 16(%%r10), %%xmm3\n\t"
        " movaps %%xmm1, %%xmm5\n\t"
        " addps %%xmm0, %%xmm6\n\t"
        " mulps %%xmm3, %%xmm1\n\t"
        " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t"
        " addps %%xmm1, %%xmm6\n\t"
        " mulps %%xmm4, %%xmm2\n\t"
        " movaps 32(%%r9), %%xmm0\n\t"
        " addps %%xmm2, %%xmm7\n\t"
        " mulps %%xmm5, %%xmm3\n\t"
        " add $32, %%r9\n\t"
        " movaps 32(%%r10), %%xmm2\n\t"
        " addps %%xmm3, %%xmm7\n\t"
        " add $32, %%r10\n\t"
        ".%=L1_test:\n\t"
        " dec %%rax\n\t"
        " jge .%=Loop1\n\t"
        " # We've handled the bulk of multiplies up to here.\n\t"
        " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
        " # If so, we've got 2 more taps to do.\n\t"
        " and $1, %%r8\n\t"
        " je .%=Leven\n\t"
        " # The count was odd, do 2 more taps.\n\t"
        " # Note that we've already got mm0/mm2 preloaded\n\t"
        " # from the main loop.\n\t"
        " movaps %%xmm0, %%xmm4\n\t"
        " mulps %%xmm2, %%xmm0\n\t"
        " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
        " addps %%xmm0, %%xmm6\n\t"
        " mulps %%xmm4, %%xmm2\n\t"
        " addps %%xmm2, %%xmm7\n\t"
        ".%=Leven:\n\t"
        " # neg inversor\n\t"
        " xorps %%xmm1, %%xmm1\n\t"
        " mov $0x80000000, %%r9\n\t"
        " movd %%r9, %%xmm1\n\t"
        " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t"
        " # pfpnacc\n\t"
        " xorps %%xmm1, %%xmm6\n\t"
        " movaps %%xmm6, %%xmm2\n\t"
        " unpcklps %%xmm7, %%xmm6\n\t"
        " unpckhps %%xmm7, %%xmm2\n\t"
        " movaps %%xmm2, %%xmm3\n\t"
        " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t"
        " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t"
        " addps %%xmm2, %%xmm6\n\t"
        " # xmm6 = r1 i2 r3 i4\n\t"
        " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t"
        " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
        " movlps %%xmm6, (%[rdi]) # store low 2x32 bits (complex) to memory\n\t"
        :
        :[rsi] "r" (input), [rdx] "r" (taps), "c" (num_bytes), [rdi] "r" (result)
        // Every register the code overwrites is declared, plus "memory"
        // because *result is written and the input buffers are read, and "cc"
        // because the flags are modified.
        :"rax", "r8", "r9", "r10",
         "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
         "memory", "cc"
    );

    // The assembly only handles an even number of points
    if(isodd) {
        *result += input[num_points - 1] * taps[num_points - 1];
    }

    return;
}
#endif
#if LV_HAVE_SSE && LV_HAVE_32
/*!
  \brief Complex dot product entry point for 32-bit SSE builds
  \param result Where the dot product is written (by the generic kernel)
  \param input First complex input vector
  \param taps Second complex input vector
  \param num_points The number of complex values in input and taps

  As written, this kernel simply forwards to the generic implementation.
  The hand-written 32-bit assembly that follows is wrapped in "#if 0" and is
  therefore never compiled; note that it addresses its arguments through fixed
  %ebp offsets and declares no asm input/output operands.
*/
static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_a_sse_32(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
// Active code path: delegate the whole computation to the generic kernel.
volk_gnsssdr_32fc_x2_dot_prod_32fc_a_generic(result, input, taps, num_points);
// Everything between "#if 0" and the matching "#endif" is disabled.
#if 0
const unsigned int num_bytes = num_points*8;
unsigned int isodd = num_points & 1;
asm volatile
(
" #pushl %%ebp\n\t"
" #movl %%esp, %%ebp\n\t"
" movl 12(%%ebp), %%eax # input\n\t"
" movl 16(%%ebp), %%edx # taps\n\t"
" movl 20(%%ebp), %%ecx # n_bytes\n\t"
" xorps %%xmm6, %%xmm6 # zero accumulators\n\t"
" movaps 0(%%eax), %%xmm0\n\t"
" xorps %%xmm7, %%xmm7 # zero accumulators\n\t"
" movaps 0(%%edx), %%xmm2\n\t"
" shrl $5, %%ecx # ecx = n_2_ccomplex_blocks / 2\n\t"
" jmp .%=L1_test\n\t"
" # 4 taps / loop\n\t"
" # something like ?? cycles / loop\n\t"
".%=Loop1: \n\t"
"# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
"# movaps (%%eax), %%xmmA\n\t"
"# movaps (%%edx), %%xmmB\n\t"
"# movaps %%xmmA, %%xmmZ\n\t"
"# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t"
"# mulps %%xmmB, %%xmmA\n\t"
"# mulps %%xmmZ, %%xmmB\n\t"
"# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
"# xorps %%xmmPN, %%xmmA\n\t"
"# movaps %%xmmA, %%xmmZ\n\t"
"# unpcklps %%xmmB, %%xmmA\n\t"
"# unpckhps %%xmmB, %%xmmZ\n\t"
"# movaps %%xmmZ, %%xmmY\n\t"
"# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t"
"# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t"
"# addps %%xmmZ, %%xmmA\n\t"
"# addps %%xmmA, %%xmmC\n\t"
"# A=xmm0, B=xmm2, Z=xmm4\n\t"
"# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
" movaps 16(%%eax), %%xmm1\n\t"
" movaps %%xmm0, %%xmm4\n\t"
" mulps %%xmm2, %%xmm0\n\t"
" shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
" movaps 16(%%edx), %%xmm3\n\t"
" movaps %%xmm1, %%xmm5\n\t"
" addps %%xmm0, %%xmm6\n\t"
" mulps %%xmm3, %%xmm1\n\t"
" shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t"
" addps %%xmm1, %%xmm6\n\t"
" mulps %%xmm4, %%xmm2\n\t"
" movaps 32(%%eax), %%xmm0\n\t"
" addps %%xmm2, %%xmm7\n\t"
" mulps %%xmm5, %%xmm3\n\t"
" addl $32, %%eax\n\t"
" movaps 32(%%edx), %%xmm2\n\t"
" addps %%xmm3, %%xmm7\n\t"
" addl $32, %%edx\n\t"
".%=L1_test:\n\t"
" decl %%ecx\n\t"
" jge .%=Loop1\n\t"
" # We've handled the bulk of multiplies up to here.\n\t"
" # Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
" # If so, we've got 2 more taps to do.\n\t"
" movl 20(%%ebp), %%ecx # n_2_ccomplex_blocks\n\t"
" shrl $4, %%ecx\n\t"
" andl $1, %%ecx\n\t"
" je .%=Leven\n\t"
" # The count was odd, do 2 more taps.\n\t"
" # Note that we've already got mm0/mm2 preloaded\n\t"
" # from the main loop.\n\t"
" movaps %%xmm0, %%xmm4\n\t"
" mulps %%xmm2, %%xmm0\n\t"
" shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
" addps %%xmm0, %%xmm6\n\t"
" mulps %%xmm4, %%xmm2\n\t"
" addps %%xmm2, %%xmm7\n\t"
".%=Leven:\n\t"
" # neg inversor\n\t"
" movl 8(%%ebp), %%eax \n\t"
" xorps %%xmm1, %%xmm1\n\t"
" movl $0x80000000, (%%eax)\n\t"
" movss (%%eax), %%xmm1\n\t"
" shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t"
" # pfpnacc\n\t"
" xorps %%xmm1, %%xmm6\n\t"
" movaps %%xmm6, %%xmm2\n\t"
" unpcklps %%xmm7, %%xmm6\n\t"
" unpckhps %%xmm7, %%xmm2\n\t"
" movaps %%xmm2, %%xmm3\n\t"
" shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t"
" shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t"
" addps %%xmm2, %%xmm6\n\t"
" # xmm6 = r1 i2 r3 i4\n\t"
" #movl 8(%%ebp), %%eax # @result\n\t"
" movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t"
" addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
" movlps %%xmm6, (%%eax) # store low 2x32 bits (complex) to memory\n\t"
" #popl %%ebp\n\t"
:
:
: "eax", "ecx", "edx"
);
int getem = num_bytes % 16;
if(isodd) {
*result += (input[num_points - 1] * taps[num_points - 1]);
}
return;
#endif
}
#endif /* LV_HAVE_SSE && LV_HAVE_32 */
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
/*!
  \brief Computes the complex dot product of two vectors using SSE3 and aligned loads
  \param result Where the sum of input[k] * taps[k] over all points is written
  \param input First complex input vector (read with _mm_load_ps, so it must be 16-byte aligned)
  \param taps Second complex input vector (read with _mm_load_ps, so it must be 16-byte aligned)
  \param num_points The number of complex values in input and taps
*/
static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_a_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
    // Each SIMD iteration consumes two complex samples (one 128-bit register).
    // The count is derived directly from num_points; going through a byte
    // count (num_points*8) would wrap around for very large vectors.
    const unsigned int halfPoints = num_points / 2;
    const unsigned int isodd = num_points & 1;
    unsigned int number = 0;

    lv_32fc_t dotProduct;
    memset(&dotProduct, 0x0, 2*sizeof(float));

    __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
    const lv_32fc_t* a = input;
    const lv_32fc_t* b = taps;

    dotProdVal = _mm_setzero_ps();

    for(; number < halfPoints; number++) {
        x = _mm_load_ps((const float*)a);   // ar,ai,br,bi
        y = _mm_load_ps((const float*)b);   // cr,ci,dr,di
        yl = _mm_moveldup_ps(y);            // cr,cr,dr,dr
        yh = _mm_movehdup_ps(y);            // ci,ci,di,di
        tmp1 = _mm_mul_ps(x, yl);           // ar*cr,ai*cr,br*dr,bi*dr
        x = _mm_shuffle_ps(x, x, 0xB1);     // ai,ar,bi,br
        tmp2 = _mm_mul_ps(x, yh);           // ai*ci,ar*ci,bi*di,br*di
        z = _mm_addsub_ps(tmp1, tmp2);      // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
        dotProdVal = _mm_add_ps(dotProdVal, z); // accumulate both complex products
        a += 2;
        b += 2;
    }

    // Spill the two partial complex sums and fold them into one scalar.
    __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2];
    _mm_store_ps((float*)dotProductVector, dotProdVal);
    dotProduct += ( dotProductVector[0] + dotProductVector[1] );

    // An odd point count leaves one trailing sample the SIMD loop did not touch.
    if(isodd) {
        dotProduct += input[num_points - 1] * taps[num_points - 1];
    }

    *result = dotProduct;
}
#endif /*LV_HAVE_SSE3*/
#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>
static inline void volk_gnsssdr_32fc_x2_dot_prod_32fc_a_sse4_1(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
    // Complex dot product, four complex samples per iteration, built on the
    // SSE4.1 dot-product instruction. Inputs are read with aligned loads.
    //
    // Sign-bit mask for the lowest 32-bit lane only.
    // NOTE(review): bit128_p is declared elsewhere; presumably it views the
    // 128 bits as a float vector -- confirm against its definition.
    static const __m128i neg = {0x000000000000000080000000};

    const unsigned int quads = num_points / 4;
    const unsigned int leftover = num_points & 3;
    const float* in_ptr = (const float*)input;
    const float* tap_ptr = (const float*)taps;
    unsigned int k;

    // Four running partial sums: re*re, im*im (both kept in lane 0) and
    // re*im, im*re (both kept in lane 1).
    __m128 sum_rr = _mm_setzero_ps();
    __m128 sum_ii = _mm_setzero_ps();
    __m128 sum_ri = _mm_setzero_ps();
    __m128 sum_ir = _mm_setzero_ps();

    for(k = 0; k < quads; ++k) {
        // Two registers (four complex values) from each operand.
        const __m128 in_first = _mm_load_ps(in_ptr);
        const __m128 tap_first = _mm_load_ps(tap_ptr);
        const __m128 in_second = _mm_load_ps(in_ptr + 4);
        const __m128 tap_second = _mm_load_ps(tap_ptr + 4);

        // Two rounds of unpack de-interleave the data into all-real and
        // all-imaginary vectors.
        const __m128 in_mix_hi = _mm_unpackhi_ps(in_first, in_second);
        const __m128 tap_mix_hi = _mm_unpackhi_ps(tap_first, tap_second);
        const __m128 in_mix_lo = _mm_unpacklo_ps(in_first, in_second);
        const __m128 tap_mix_lo = _mm_unpacklo_ps(tap_first, tap_second);

        const __m128 in_im = _mm_unpackhi_ps(in_mix_lo, in_mix_hi);
        const __m128 in_re = _mm_unpacklo_ps(in_mix_lo, in_mix_hi);
        const __m128 tap_im = _mm_unpackhi_ps(tap_mix_lo, tap_mix_hi);
        const __m128 tap_re = _mm_unpacklo_ps(tap_mix_lo, tap_mix_hi);

        // Mask 0xf1 writes the 4-term sum to lane 0, 0xf2 writes it to lane 1.
        sum_rr = _mm_add_ps(_mm_dp_ps(in_re, tap_re, 0xf1), sum_rr);
        sum_ii = _mm_add_ps(_mm_dp_ps(in_im, tap_im, 0xf1), sum_ii);
        sum_ri = _mm_add_ps(_mm_dp_ps(in_re, tap_im, 0xf2), sum_ri);
        sum_ir = _mm_add_ps(_mm_dp_ps(in_im, tap_re, 0xf2), sum_ir);

        in_ptr += 8;
        tap_ptr += 8;
    }

    // real = sum(re*re) - sum(im*im): flip the sign of the im*im sum first.
    sum_ii = _mm_xor_ps(sum_ii, bit128_p(&neg)->float_vec);
    // imag = sum(re*im) + sum(im*re)
    sum_ri = _mm_add_ps(sum_ri, sum_ir);
    sum_rr = _mm_add_ps(sum_rr, sum_ii);
    // Merge lane 0 (real) and lane 1 (imag) into one register.
    sum_ri = _mm_add_ps(sum_ri, sum_rr);

    // The low 64 bits hold the complex result.
    _mm_storel_pi((__m64*)result, sum_ri);

    // Up to three trailing samples are accumulated in scalar code.
    for(k = num_points - leftover; k < num_points; k++) {
        *result += input[k] * taps[k];
    }
}
#endif /*LV_HAVE_SSE4_1*/
#endif /*INCLUDED_volk_gnsssdr_32fc_x2_dot_prod_32fc_a_H*/

View File

@ -0,0 +1,170 @@
#ifndef INCLUDED_volk_gnsssdr_32fc_x2_multiply_32fc_u_H
#define INCLUDED_volk_gnsssdr_32fc_x2_multiply_32fc_u_H
#include <inttypes.h>
#include <stdio.h>
#include <volk_gnsssdr/volk_gnsssdr_complex.h>
#include <float.h>
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
/*!
\brief Multiplies the two input complex vectors and stores their results in the third vector
\param cVector The vector where the results will be stored
\param aVector One of the vectors to be multiplied
\param bVector One of the vectors to be multiplied
\param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
*/
static inline void volk_gnsssdr_32fc_x2_multiply_32fc_u_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
    // Element-wise complex product, two complex values per SSE3 iteration,
    // using unaligned loads and stores.
    const unsigned int pair_count = num_points / 2;
    unsigned int pair;

    for(pair = 0; pair < pair_count; ++pair){
        const unsigned int idx = 2 * pair;

        const __m128 lhs = _mm_loadu_ps((const float*)(aVector + idx)); // ar,ai,br,bi
        const __m128 rhs = _mm_loadu_ps((const float*)(bVector + idx)); // cr,ci,dr,di

        const __m128 rhs_re = _mm_moveldup_ps(rhs);                 // cr,cr,dr,dr
        const __m128 rhs_im = _mm_movehdup_ps(rhs);                 // ci,ci,di,di

        const __m128 direct = _mm_mul_ps(lhs, rhs_re);              // ar*cr,ai*cr,br*dr,bi*dr
        const __m128 lhs_swapped = _mm_shuffle_ps(lhs, lhs, 0xB1);  // ai,ar,bi,br
        const __m128 cross = _mm_mul_ps(lhs_swapped, rhs_im);       // ai*ci,ar*ci,bi*di,br*di

        // addsub subtracts in even lanes and adds in odd lanes, which yields
        // (real, imag) for both products.
        _mm_storeu_ps((float*)(cVector + idx), _mm_addsub_ps(direct, cross));
    }

    // With an odd count, the final element is multiplied in scalar code.
    if((num_points % 2) != 0) {
        const unsigned int last = num_points - 1;
        cVector[last] = aVector[last] * bVector[last];
    }
}
#endif /* LV_HAVE_SSE3 */
#ifdef LV_HAVE_GENERIC
/*!
\brief Multiplies the two input complex vectors and stores their results in the third vector
\param cVector The vector where the results will be stored
\param aVector One of the vectors to be multiplied
\param bVector One of the vectors to be multiplied
\param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
*/
static inline void volk_gnsssdr_32fc_x2_multiply_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
    // Portable reference version: one complex product per element, in order.
    unsigned int idx;
    for(idx = 0; idx < num_points; ++idx){
        cVector[idx] = aVector[idx] * bVector[idx];
    }
}
#endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_volk_gnsssdr_32fc_x2_multiply_32fc_u_H */
#ifndef INCLUDED_volk_gnsssdr_32fc_x2_multiply_32fc_a_H
#define INCLUDED_volk_gnsssdr_32fc_x2_multiply_32fc_a_H
#include <inttypes.h>
#include <stdio.h>
#include <volk_gnsssdr/volk_gnsssdr_complex.h>
#include <float.h>
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
/*!
\brief Multiplies the two input complex vectors and stores their results in the third vector
\param cVector The vector where the results will be stored
\param aVector One of the vectors to be multiplied
\param bVector One of the vectors to be multiplied
\param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
*/
static inline void volk_gnsssdr_32fc_x2_multiply_32fc_a_sse3(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
    // Element-wise complex product, two complex values per SSE3 iteration.
    // Aligned loads/stores are used, so all three buffers must be 16-byte aligned.
    const unsigned int pair_count = num_points / 2;
    unsigned int pair;

    for(pair = 0; pair < pair_count; ++pair){
        const unsigned int idx = 2 * pair;

        const __m128 lhs = _mm_load_ps((const float*)(aVector + idx)); // ar,ai,br,bi
        const __m128 rhs = _mm_load_ps((const float*)(bVector + idx)); // cr,ci,dr,di

        const __m128 rhs_re = _mm_moveldup_ps(rhs);                 // cr,cr,dr,dr
        const __m128 rhs_im = _mm_movehdup_ps(rhs);                 // ci,ci,di,di

        const __m128 direct = _mm_mul_ps(lhs, rhs_re);              // ar*cr,ai*cr,br*dr,bi*dr
        const __m128 lhs_swapped = _mm_shuffle_ps(lhs, lhs, 0xB1);  // ai,ar,bi,br
        const __m128 cross = _mm_mul_ps(lhs_swapped, rhs_im);       // ai*ci,ar*ci,bi*di,br*di

        // addsub subtracts in even lanes and adds in odd lanes, which yields
        // (real, imag) for both products.
        _mm_store_ps((float*)(cVector + idx), _mm_addsub_ps(direct, cross));
    }

    // With an odd count, the final element is multiplied in scalar code.
    if((num_points % 2) != 0) {
        const unsigned int last = num_points - 1;
        cVector[last] = aVector[last] * bVector[last];
    }
}
#endif /* LV_HAVE_SSE3 */
#ifdef LV_HAVE_GENERIC
/*!
\brief Multiplies the two input complex vectors and stores their results in the third vector
\param cVector The vector where the results will be stored
\param aVector One of the vectors to be multiplied
\param bVector One of the vectors to be multiplied
\param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
*/
static inline void volk_gnsssdr_32fc_x2_multiply_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
    // Portable reference version: one complex product per element, in order.
    unsigned int idx;
    for(idx = 0; idx < num_points; ++idx){
        cVector[idx] = aVector[idx] * bVector[idx];
    }
}
#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_ORC
/*!
\brief Multiplies the two input complex vectors and stores their results in the third vector
\param cVector The vector where the results will be stored
\param aVector One of the vectors to be multiplied
\param bVector One of the vectors to be multiplied
\param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
*/
// Declaration only: the ORC implementation is defined outside this header.
extern void volk_gnsssdr_32fc_x2_multiply_32fc_a_orc_impl(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points);
// Thin wrapper that exposes the ORC implementation under the kernel naming scheme.
// NOTE(review): the wrapper is named "_u_orc" although it sits under the "_a_H"
// include guard and calls the "_a_orc_impl" symbol -- confirm this is intended.
static inline void volk_gnsssdr_32fc_x2_multiply_32fc_u_orc(lv_32fc_t* cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
// All arguments are passed through unchanged.
volk_gnsssdr_32fc_x2_multiply_32fc_a_orc_impl(cVector, aVector, bVector, num_points);
}
#endif /* LV_HAVE_ORC */
#endif /* INCLUDED_volk_gnsssdr_32fc_x2_multiply_32fc_a_H */

View File

@ -0,0 +1,409 @@
#ifndef INCLUDED_gnsssdr_volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_u_H
#define INCLUDED_gnsssdr_volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_u_H
#include <inttypes.h>
#include <stdio.h>
#include <volk_gnsssdr/volk_gnsssdr_complex.h>
#include <float.h>
#include <string.h>
/*!
* TODO: Code the SSE4 version and benchmark it
*/
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
/*!
  \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (SSE3, unaligned buffers)
  \param E_out Early correlation output
  \param P_out Prompt correlation output
  \param L_out Late correlation output
  \param input The input signal input
  \param carrier The carrier signal input
  \param E_code Early PRN code replica input
  \param P_code Prompt PRN code replica input
  \param L_code Late PRN code replica input
  \param num_points The number of complex values in vectors
*/
static inline void volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_u_sse3(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int halfPoints = num_points / 2; // two complex samples per iteration

    lv_32fc_t dotProduct_E;
    memset(&dotProduct_E, 0x0, 2*sizeof(float));
    lv_32fc_t dotProduct_P;
    memset(&dotProduct_P, 0x0, 2*sizeof(float));
    lv_32fc_t dotProduct_L;
    memset(&dotProduct_L, 0x0, 2*sizeof(float));

    // Aux vars
    __m128 x, y, yl, yh, z, tmp1, tmp2, z_E, z_P, z_L;
    __m128 bb_signal_sample, bb_signal_sample_shuffled;

    z_E = _mm_setzero_ps();
    z_P = _mm_setzero_ps();
    z_L = _mm_setzero_ps();

    // input vectors
    const lv_32fc_t* _input = input;
    const lv_32fc_t* _carrier = carrier;
    const lv_32fc_t* _E_code = E_code;
    const lv_32fc_t* _P_code = P_code;
    const lv_32fc_t* _L_code = L_code;

    for(;number < halfPoints; number++)
    {
        // carrier wipe-off (vector point-to-point product)
        x = _mm_loadu_ps((const float*)_input);   // Load the ar + ai, br + bi as ar,ai,br,bi
        y = _mm_loadu_ps((const float*)_carrier); // Load the cr + ci, dr + di as cr,ci,dr,di
        yl = _mm_moveldup_ps(y);                  // Load yl with cr,cr,dr,dr
        yh = _mm_movehdup_ps(y);                  // Load yh with ci,ci,di,di
        tmp1 = _mm_mul_ps(x,yl);                  // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
        x = _mm_shuffle_ps(x,x,0xB1);             // Re-arrange x to be ai,ar,bi,br
        tmp2 = _mm_mul_ps(x,yh);                  // tmp2 = ai*ci,ar*ci,bi*di,br*di
        bb_signal_sample = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di

        // The swapped (imag,real) copy of the baseband sample is needed by all
        // three correlators, so it is computed once here.
        bb_signal_sample_shuffled = _mm_shuffle_ps(bb_signal_sample,bb_signal_sample,0xB1);

        // correlation E,P,L (3x vector scalar product)
        // The code replicas are read with unaligned loads: this is the "_u_"
        // kernel, so no alignment may be assumed for any of its buffers.

        // Early
        y = _mm_loadu_ps((const float*)_E_code);  // Load the cr + ci, dr + di as cr,ci,dr,di
        yl = _mm_moveldup_ps(y);                  // Load yl with cr,cr,dr,dr
        yh = _mm_movehdup_ps(y);                  // Load yh with ci,ci,di,di
        tmp1 = _mm_mul_ps(bb_signal_sample,yl);
        tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh);
        z = _mm_addsub_ps(tmp1,tmp2);
        z_E = _mm_add_ps(z_E, z);                 // Add the complex multiplication results together

        // Prompt
        y = _mm_loadu_ps((const float*)_P_code);
        yl = _mm_moveldup_ps(y);
        yh = _mm_movehdup_ps(y);
        tmp1 = _mm_mul_ps(bb_signal_sample,yl);
        tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh);
        z = _mm_addsub_ps(tmp1,tmp2);
        z_P = _mm_add_ps(z_P, z);

        // Late
        y = _mm_loadu_ps((const float*)_L_code);
        yl = _mm_moveldup_ps(y);
        yh = _mm_movehdup_ps(y);
        tmp1 = _mm_mul_ps(bb_signal_sample,yl);
        tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh);
        z = _mm_addsub_ps(tmp1,tmp2);
        z_L = _mm_add_ps(z_L, z);

        /*pointer increment*/
        _carrier += 2;
        _input += 2;
        _E_code += 2;
        _P_code += 2;
        _L_code += 2;
    }

    // Local scratch buffers are explicitly aligned, so aligned stores are safe here.
    __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_E[2];
    __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_P[2];
    __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_L[2];

    _mm_store_ps((float*)dotProductVector_E,z_E); // Store the results back into the dot product vector
    _mm_store_ps((float*)dotProductVector_P,z_P); // Store the results back into the dot product vector
    _mm_store_ps((float*)dotProductVector_L,z_L); // Store the results back into the dot product vector

    dotProduct_E += ( dotProductVector_E[0] + dotProductVector_E[1] );
    dotProduct_P += ( dotProductVector_P[0] + dotProductVector_P[1] );
    dotProduct_L += ( dotProductVector_L[0] + dotProductVector_L[1] );

    // Odd point count: the pointers now address the one remaining sample.
    if((num_points % 2) != 0)
    {
        dotProduct_E += (*_input) * (*_E_code)*(*_carrier);
        dotProduct_P += (*_input) * (*_P_code)*(*_carrier);
        dotProduct_L += (*_input) * (*_L_code)*(*_carrier);
    }

    *E_out = dotProduct_E;
    *P_out = dotProduct_P;
    *L_out = dotProduct_L;
}
#endif /* LV_HAVE_SSE3 */
#ifdef LV_HAVE_GENERIC
/*!
  \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation
  \param E_out Early correlation output
  \param P_out Prompt correlation output
  \param L_out Late correlation output
  \param input The input signal input
  \param carrier The carrier signal input
  \param E_code Early PRN code replica input
  \param P_code Prompt PRN code replica input
  \param L_code Late PRN code replica input
  \param num_points The number of complex values in vectors
*/
static inline void volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_generic(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, unsigned int num_points)
{
    lv_32fc_t bb_signal_sample;

    *E_out = 0;
    *P_out = 0;
    *L_out = 0;

    // perform Early, Prompt and Late correlation
    // The index is unsigned to match num_points: a signed index would be
    // compared across signedness and could overflow for very large counts.
    for(unsigned int i = 0; i < num_points; ++i)
    {
        //Perform the carrier wipe-off
        bb_signal_sample = input[i] * carrier[i];

        // Now get early, late, and prompt values for each
        *E_out += bb_signal_sample * E_code[i];
        *P_out += bb_signal_sample * P_code[i];
        *L_out += bb_signal_sample * L_code[i];
    }
}
#endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_gnsssdr_volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_u_H */
#ifndef INCLUDED_gnsssdr_volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_a_H
#define INCLUDED_gnsssdr_volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_a_H
#include <inttypes.h>
#include <stdio.h>
#include <volk_gnsssdr/volk_gnsssdr_complex.h>
#include <float.h>
#include <string.h>
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
/*!
  \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (SSE3, 16-byte-aligned buffers)
  \param E_out Early correlation output
  \param P_out Prompt correlation output
  \param L_out Late correlation output
  \param input The input signal input
  \param carrier The carrier signal input
  \param E_code Early PRN code replica input
  \param P_code Prompt PRN code replica input
  \param L_code Late PRN code replica input
  \param num_points The number of complex values in vectors
*/
static inline void volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_a_sse3(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int halfPoints = num_points / 2; // two complex samples per iteration

    lv_32fc_t dotProduct_E;
    memset(&dotProduct_E, 0x0, 2*sizeof(float));
    lv_32fc_t dotProduct_P;
    memset(&dotProduct_P, 0x0, 2*sizeof(float));
    lv_32fc_t dotProduct_L;
    memset(&dotProduct_L, 0x0, 2*sizeof(float));

    // Aux vars
    __m128 x, y, yl, yh, z, tmp1, tmp2, z_E, z_P, z_L;
    __m128 bb_signal_sample, bb_signal_sample_shuffled;

    z_E = _mm_setzero_ps();
    z_P = _mm_setzero_ps();
    z_L = _mm_setzero_ps();

    // input vectors
    const lv_32fc_t* _input = input;
    const lv_32fc_t* _carrier = carrier;
    const lv_32fc_t* _E_code = E_code;
    const lv_32fc_t* _P_code = P_code;
    const lv_32fc_t* _L_code = L_code;

    for(;number < halfPoints; number++)
    {
        // carrier wipe-off (vector point-to-point product)
        x = _mm_load_ps((const float*)_input);    // Load the ar + ai, br + bi as ar,ai,br,bi
        y = _mm_load_ps((const float*)_carrier);  // Load the cr + ci, dr + di as cr,ci,dr,di
        yl = _mm_moveldup_ps(y);                  // Load yl with cr,cr,dr,dr
        yh = _mm_movehdup_ps(y);                  // Load yh with ci,ci,di,di
        tmp1 = _mm_mul_ps(x,yl);                  // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
        x = _mm_shuffle_ps(x,x,0xB1);             // Re-arrange x to be ai,ar,bi,br
        tmp2 = _mm_mul_ps(x,yh);                  // tmp2 = ai*ci,ar*ci,bi*di,br*di
        bb_signal_sample = _mm_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di

        // The swapped (imag,real) copy of the baseband sample is needed by all
        // three correlators; computing it once avoids re-shuffling it back
        // and forth for every code replica.
        bb_signal_sample_shuffled = _mm_shuffle_ps(bb_signal_sample,bb_signal_sample,0xB1);

        // correlation E,P,L (3x vector scalar product)
        // Early
        y = _mm_load_ps((const float*)_E_code);   // Load the cr + ci, dr + di as cr,ci,dr,di
        yl = _mm_moveldup_ps(y);                  // Load yl with cr,cr,dr,dr
        yh = _mm_movehdup_ps(y);                  // Load yh with ci,ci,di,di
        tmp1 = _mm_mul_ps(bb_signal_sample,yl);
        tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh);
        z = _mm_addsub_ps(tmp1,tmp2);
        z_E = _mm_add_ps(z_E, z);                 // Add the complex multiplication results together

        // Prompt
        y = _mm_load_ps((const float*)_P_code);
        yl = _mm_moveldup_ps(y);
        yh = _mm_movehdup_ps(y);
        tmp1 = _mm_mul_ps(bb_signal_sample,yl);
        tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh);
        z = _mm_addsub_ps(tmp1,tmp2);
        z_P = _mm_add_ps(z_P, z);

        // Late
        y = _mm_load_ps((const float*)_L_code);
        yl = _mm_moveldup_ps(y);
        yh = _mm_movehdup_ps(y);
        tmp1 = _mm_mul_ps(bb_signal_sample,yl);
        tmp2 = _mm_mul_ps(bb_signal_sample_shuffled,yh);
        z = _mm_addsub_ps(tmp1,tmp2);
        z_L = _mm_add_ps(z_L, z);

        /*pointer increment*/
        _carrier += 2;
        _input += 2;
        _E_code += 2;
        _P_code += 2;
        _L_code += 2;
    }

    __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_E[2];
    __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_P[2];
    __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector_L[2];

    _mm_store_ps((float*)dotProductVector_E,z_E); // Store the results back into the dot product vector
    _mm_store_ps((float*)dotProductVector_P,z_P); // Store the results back into the dot product vector
    _mm_store_ps((float*)dotProductVector_L,z_L); // Store the results back into the dot product vector

    dotProduct_E += ( dotProductVector_E[0] + dotProductVector_E[1] );
    dotProduct_P += ( dotProductVector_P[0] + dotProductVector_P[1] );
    dotProduct_L += ( dotProductVector_L[0] + dotProductVector_L[1] );

    // Odd point count: the pointers now address the one remaining sample.
    if((num_points % 2) != 0)
    {
        dotProduct_E += (*_input) * (*_E_code)*(*_carrier);
        dotProduct_P += (*_input) * (*_P_code)*(*_carrier);
        dotProduct_L += (*_input) * (*_L_code)*(*_carrier);
    }

    *E_out = dotProduct_E;
    *P_out = dotProduct_P;
    *L_out = dotProduct_L;
}
#endif /* LV_HAVE_SSE3 */
#ifdef LV_HAVE_GENERIC
/*!
  \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation
  \param E_out Early correlation output
  \param P_out Prompt correlation output
  \param L_out Late correlation output
  \param input The input signal input
  \param carrier The carrier signal input
  \param E_code Early PRN code replica input
  \param P_code Prompt PRN code replica input
  \param L_code Late PRN code replica input
  \param num_points The number of complex values in vectors
*/
static inline void volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_a_generic(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, unsigned int num_points)
{
    lv_32fc_t bb_signal_sample;

    *E_out = 0;
    *P_out = 0;
    *L_out = 0;

    // perform Early, Prompt and Late correlation
    // The index is unsigned to match num_points: a signed index would be
    // compared across signedness and could overflow for very large counts.
    for(unsigned int i = 0; i < num_points; ++i)
    {
        //Perform the carrier wipe-off
        bb_signal_sample = input[i] * carrier[i];

        // Now get early, late, and prompt values for each
        *E_out += bb_signal_sample * E_code[i];
        *P_out += bb_signal_sample * P_code[i];
        *L_out += bb_signal_sample * L_code[i];
    }
}
#endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_gnsssdr_volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3_a_H */

View File

@ -0,0 +1,848 @@
/*!
* \file volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5
* \brief Volk protokernel: performs the carrier wipe-off mixing and the VE, Early, Prompt, Late and VL correlation with 64 bits vectors
* \authors <ul>
* <li>Javier Arribas, 2011. jarribas(at)cttc.es
* <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
* </ul>
*
* Volk protokernel that performs the carrier wipe-off mixing and the
* VE, Early, Prompt, Late and VL correlation with 64 bits vectors (32 bits the
* real part and 32 bits the imaginary part):
* - The carrier wipe-off is done by multiplying the input signal by the
* carrier (multiplication of 64 bits vectors) It returns the input
* signal in base band (BB)
* - VE values are calculated by multiplying the input signal in BB by the
* VE code (multiplication of 64 bits vectors), accumulating the results
* - Early values are calculated by multiplying the input signal in BB by the
* early code (multiplication of 64 bits vectors), accumulating the results
* - Prompt values are calculated by multiplying the input signal in BB by the
* prompt code (multiplication of 64 bits vectors), accumulating the results
* - Late values are calculated by multiplying the input signal in BB by the
* late code (multiplication of 64 bits vectors), accumulating the results
* - VL values are calculated by multiplying the input signal in BB by the
* VL code (multiplication of 64 bits vectors), accumulating the results
*
* -------------------------------------------------------------------------
*
* Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
*
* GNSS-SDR is a software defined Global Navigation
* Satellite Systems receiver
*
* This file is part of GNSS-SDR.
*
* GNSS-SDR is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* GNSS-SDR is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
*
* -------------------------------------------------------------------------
*/
#ifndef INCLUDED_gnsssdr_volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_u_H
#define INCLUDED_gnsssdr_volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_u_H
#include <inttypes.h>
#include <stdio.h>
#include <volk_gnsssdr/volk_gnsssdr_complex.h>
#include <float.h>
#include <string.h>
#ifdef LV_HAVE_AVX
#include <immintrin.h>
/*!
\brief Performs the carrier wipe-off mixing and the VE, Early, Prompt, Late and VL correlation
\param input The input signal input
\param carrier The carrier signal input
\param VE_code VE PRN code replica input
\param E_code Early PRN code replica input
\param P_code Prompt PRN code replica input
\param L_code Late PRN code replica input
\param VL_code VL PRN code replica input
\param VE_out VE correlation output
\param E_out Early correlation output
\param P_out Prompt correlation output
\param L_out Late correlation output
\param VL_out VL correlation output
\param num_points The number of complex values in vectors
*/
static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_u_avx(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* VE_code, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, const lv_32fc_t* VL_code, unsigned int num_points)
{
    // Unaligned AVX implementation: every input sample is multiplied by the
    // matching carrier sample (carrier wipe-off) and the product is accumulated
    // against each of the five code replicas. All buffers are read with
    // unaligned loads.
    const unsigned int avx_iters = num_points / 4;   // the loop consumes 4 complex samples (one __m256) per pass
    const unsigned int tail_points = num_points % 4; // samples left for the scalar epilogue
    unsigned int n;

    lv_32fc_t dotProduct_VE;
    lv_32fc_t dotProduct_E;
    lv_32fc_t dotProduct_P;
    lv_32fc_t dotProduct_L;
    lv_32fc_t dotProduct_VL;

    // Per-lane partial sums, one register per correlator
    __m256 acc_VE = _mm256_setzero_ps();
    __m256 acc_E = _mm256_setzero_ps();
    __m256 acc_P = _mm256_setzero_ps();
    __m256 acc_L = _mm256_setzero_ps();
    __m256 acc_VL = _mm256_setzero_ps();
    __m256 sig, rep, bb, bb_swapped;

    // Moving read pointers
    const lv_32fc_t* _input = input;
    const lv_32fc_t* _carrier = carrier;
    const lv_32fc_t* _VE_code = VE_code;
    const lv_32fc_t* _E_code = E_code;
    const lv_32fc_t* _P_code = P_code;
    const lv_32fc_t* _L_code = L_code;
    const lv_32fc_t* _VL_code = VL_code;

    for (n = 0; n < avx_iters; n++)
        {
            // Carrier wipe-off (element-wise complex product input * carrier).
            // moveldup/movehdup replicate the real/imaginary part of each
            // carrier sample, shuffle 0xB1 swaps re<->im of each input sample
            // and addsub yields (ar*cr - ai*ci, ai*cr + ar*ci) per sample.
            sig = _mm256_loadu_ps((const float*)_input);
            rep = _mm256_loadu_ps((const float*)_carrier);
            bb = _mm256_addsub_ps(_mm256_mul_ps(sig, _mm256_moveldup_ps(rep)),
                                  _mm256_mul_ps(_mm256_shuffle_ps(sig, sig, 0xB1), _mm256_movehdup_ps(rep)));
            bb_swapped = _mm256_shuffle_ps(bb, bb, 0xB1); // im,re ordering reused by all five correlators

            // VE correlator: acc += bb * VE_code
            rep = _mm256_loadu_ps((const float*)_VE_code);
            acc_VE = _mm256_add_ps(acc_VE, _mm256_addsub_ps(_mm256_mul_ps(bb, _mm256_moveldup_ps(rep)),
                                                            _mm256_mul_ps(bb_swapped, _mm256_movehdup_ps(rep))));

            // Early correlator
            rep = _mm256_loadu_ps((const float*)_E_code);
            acc_E = _mm256_add_ps(acc_E, _mm256_addsub_ps(_mm256_mul_ps(bb, _mm256_moveldup_ps(rep)),
                                                          _mm256_mul_ps(bb_swapped, _mm256_movehdup_ps(rep))));

            // Prompt correlator
            rep = _mm256_loadu_ps((const float*)_P_code);
            acc_P = _mm256_add_ps(acc_P, _mm256_addsub_ps(_mm256_mul_ps(bb, _mm256_moveldup_ps(rep)),
                                                          _mm256_mul_ps(bb_swapped, _mm256_movehdup_ps(rep))));

            // Late correlator
            rep = _mm256_loadu_ps((const float*)_L_code);
            acc_L = _mm256_add_ps(acc_L, _mm256_addsub_ps(_mm256_mul_ps(bb, _mm256_moveldup_ps(rep)),
                                                          _mm256_mul_ps(bb_swapped, _mm256_movehdup_ps(rep))));

            // VL correlator
            rep = _mm256_loadu_ps((const float*)_VL_code);
            acc_VL = _mm256_add_ps(acc_VL, _mm256_addsub_ps(_mm256_mul_ps(bb, _mm256_moveldup_ps(rep)),
                                                            _mm256_mul_ps(bb_swapped, _mm256_movehdup_ps(rep))));

            _input += 4;
            _carrier += 4;
            _VE_code += 4;
            _E_code += 4;
            _P_code += 4;
            _L_code += 4;
            _VL_code += 4;
        }

    // Spill the lane-wise partial sums and fold them into one complex value each
    __VOLK_ATTR_ALIGNED(32) lv_32fc_t partial_VE[4];
    __VOLK_ATTR_ALIGNED(32) lv_32fc_t partial_E[4];
    __VOLK_ATTR_ALIGNED(32) lv_32fc_t partial_P[4];
    __VOLK_ATTR_ALIGNED(32) lv_32fc_t partial_L[4];
    __VOLK_ATTR_ALIGNED(32) lv_32fc_t partial_VL[4];

    _mm256_storeu_ps((float*)partial_VE, acc_VE);
    _mm256_storeu_ps((float*)partial_E, acc_E);
    _mm256_storeu_ps((float*)partial_P, acc_P);
    _mm256_storeu_ps((float*)partial_L, acc_L);
    _mm256_storeu_ps((float*)partial_VL, acc_VL);

    dotProduct_VE = partial_VE[0] + partial_VE[1] + partial_VE[2] + partial_VE[3];
    dotProduct_E = partial_E[0] + partial_E[1] + partial_E[2] + partial_E[3];
    dotProduct_P = partial_P[0] + partial_P[1] + partial_P[2] + partial_P[3];
    dotProduct_L = partial_L[0] + partial_L[1] + partial_L[2] + partial_L[3];
    dotProduct_VL = partial_VL[0] + partial_VL[1] + partial_VL[2] + partial_VL[3];

    // Scalar epilogue. The wipe-off product is formed once per sample and then
    // multiplied by each code, i.e. (input * carrier) * code, the same
    // association used by the vector loop above.
    for (n = 0; n < tail_points; ++n)
        {
            const lv_32fc_t bb_sample = (*_input++) * (*_carrier++);
            dotProduct_VE += bb_sample * (*_VE_code++);
            dotProduct_E += bb_sample * (*_E_code++);
            dotProduct_P += bb_sample * (*_P_code++);
            dotProduct_L += bb_sample * (*_L_code++);
            dotProduct_VL += bb_sample * (*_VL_code++);
        }

    *VE_out = dotProduct_VE;
    *E_out = dotProduct_E;
    *P_out = dotProduct_P;
    *L_out = dotProduct_L;
    *VL_out = dotProduct_VL;
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
/*!
    \brief Performs the carrier wipe-off mixing and the VE, Early, Prompt, Late and VL correlation (SSE3, unaligned loads)
    \param VE_out VE correlation output
    \param E_out Early correlation output
    \param P_out Prompt correlation output
    \param L_out Late correlation output
    \param VL_out VL correlation output
    \param input The input signal
    \param carrier The carrier signal
    \param VE_code VE PRN code replica
    \param E_code Early PRN code replica
    \param P_code Prompt PRN code replica
    \param L_code Late PRN code replica
    \param VL_code VL PRN code replica
    \param num_points The number of complex values in vectors
*/
static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_u_sse3(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* VE_code, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, const lv_32fc_t* VL_code, unsigned int num_points)
{
    const unsigned int sse_iters = num_points / 2; // the loop consumes 2 complex samples (one __m128) per pass
    unsigned int n;

    lv_32fc_t dotProduct_VE;
    lv_32fc_t dotProduct_E;
    lv_32fc_t dotProduct_P;
    lv_32fc_t dotProduct_L;
    lv_32fc_t dotProduct_VL;

    // Per-lane partial sums, one register per correlator
    __m128 acc_VE = _mm_setzero_ps();
    __m128 acc_E = _mm_setzero_ps();
    __m128 acc_P = _mm_setzero_ps();
    __m128 acc_L = _mm_setzero_ps();
    __m128 acc_VL = _mm_setzero_ps();
    __m128 sig, rep, bb, bb_swapped;

    // Moving read pointers
    const lv_32fc_t* _input = input;
    const lv_32fc_t* _carrier = carrier;
    const lv_32fc_t* _VE_code = VE_code;
    const lv_32fc_t* _E_code = E_code;
    const lv_32fc_t* _P_code = P_code;
    const lv_32fc_t* _L_code = L_code;
    const lv_32fc_t* _VL_code = VL_code;

    for (n = 0; n < sse_iters; n++)
        {
            // Carrier wipe-off (element-wise complex product input * carrier).
            // moveldup/movehdup replicate the real/imaginary part of each
            // carrier sample, shuffle 0xB1 swaps re<->im of each input sample
            // and addsub yields (ar*cr - ai*ci, ai*cr + ar*ci) per sample.
            sig = _mm_loadu_ps((const float*)_input);
            rep = _mm_loadu_ps((const float*)_carrier);
            bb = _mm_addsub_ps(_mm_mul_ps(sig, _mm_moveldup_ps(rep)),
                               _mm_mul_ps(_mm_shuffle_ps(sig, sig, 0xB1), _mm_movehdup_ps(rep)));
            bb_swapped = _mm_shuffle_ps(bb, bb, 0xB1); // im,re ordering reused by all five correlators

            // VE correlator: acc += bb * VE_code
            rep = _mm_loadu_ps((const float*)_VE_code);
            acc_VE = _mm_add_ps(acc_VE, _mm_addsub_ps(_mm_mul_ps(bb, _mm_moveldup_ps(rep)),
                                                      _mm_mul_ps(bb_swapped, _mm_movehdup_ps(rep))));

            // Early correlator
            rep = _mm_loadu_ps((const float*)_E_code);
            acc_E = _mm_add_ps(acc_E, _mm_addsub_ps(_mm_mul_ps(bb, _mm_moveldup_ps(rep)),
                                                    _mm_mul_ps(bb_swapped, _mm_movehdup_ps(rep))));

            // Prompt correlator
            rep = _mm_loadu_ps((const float*)_P_code);
            acc_P = _mm_add_ps(acc_P, _mm_addsub_ps(_mm_mul_ps(bb, _mm_moveldup_ps(rep)),
                                                    _mm_mul_ps(bb_swapped, _mm_movehdup_ps(rep))));

            // Late correlator
            rep = _mm_loadu_ps((const float*)_L_code);
            acc_L = _mm_add_ps(acc_L, _mm_addsub_ps(_mm_mul_ps(bb, _mm_moveldup_ps(rep)),
                                                    _mm_mul_ps(bb_swapped, _mm_movehdup_ps(rep))));

            // VL correlator
            rep = _mm_loadu_ps((const float*)_VL_code);
            acc_VL = _mm_add_ps(acc_VL, _mm_addsub_ps(_mm_mul_ps(bb, _mm_moveldup_ps(rep)),
                                                      _mm_mul_ps(bb_swapped, _mm_movehdup_ps(rep))));

            _input += 2;
            _carrier += 2;
            _VE_code += 2;
            _E_code += 2;
            _P_code += 2;
            _L_code += 2;
            _VL_code += 2;
        }

    // Spill the lane-wise partial sums and fold them into one complex value each
    __VOLK_ATTR_ALIGNED(16) lv_32fc_t partial_VE[2];
    __VOLK_ATTR_ALIGNED(16) lv_32fc_t partial_E[2];
    __VOLK_ATTR_ALIGNED(16) lv_32fc_t partial_P[2];
    __VOLK_ATTR_ALIGNED(16) lv_32fc_t partial_L[2];
    __VOLK_ATTR_ALIGNED(16) lv_32fc_t partial_VL[2];

    _mm_storeu_ps((float*)partial_VE, acc_VE);
    _mm_storeu_ps((float*)partial_E, acc_E);
    _mm_storeu_ps((float*)partial_P, acc_P);
    _mm_storeu_ps((float*)partial_L, acc_L);
    _mm_storeu_ps((float*)partial_VL, acc_VL);

    dotProduct_VE = partial_VE[0] + partial_VE[1];
    dotProduct_E = partial_E[0] + partial_E[1];
    dotProduct_P = partial_P[0] + partial_P[1];
    dotProduct_L = partial_L[0] + partial_L[1];
    dotProduct_VL = partial_VL[0] + partial_VL[1];

    // Odd sample count: handle the last sample in scalar code. The wipe-off
    // product is formed once and then multiplied by each code, i.e.
    // (input * carrier) * code, the same association used by the vector loop.
    if ((num_points % 2) != 0)
        {
            const lv_32fc_t bb_sample = (*_input) * (*_carrier);
            dotProduct_VE += bb_sample * (*_VE_code);
            dotProduct_E += bb_sample * (*_E_code);
            dotProduct_P += bb_sample * (*_P_code);
            dotProduct_L += bb_sample * (*_L_code);
            dotProduct_VL += bb_sample * (*_VL_code);
        }

    *VE_out = dotProduct_VE;
    *E_out = dotProduct_E;
    *P_out = dotProduct_P;
    *L_out = dotProduct_L;
    *VL_out = dotProduct_VL;
}
#endif /* LV_HAVE_SSE3 */
#ifdef LV_HAVE_GENERIC
/*!
    \brief Performs the carrier wipe-off mixing and the VE, Early, Prompt, Late and VL correlation (portable C)
    \param VE_out VE correlation output
    \param E_out Early correlation output
    \param P_out Prompt correlation output
    \param L_out Late correlation output
    \param VL_out VL correlation output
    \param input The input signal
    \param carrier The carrier signal
    \param VE_code VE PRN code replica
    \param E_code Early PRN code replica
    \param P_code Prompt PRN code replica
    \param L_code Late PRN code replica
    \param VL_code VL PRN code replica
    \param num_points The number of complex values in vectors
*/
static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* VE_code, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, const lv_32fc_t* VL_code, unsigned int num_points)
{
    // Local accumulators: the outputs are written exactly once at the end,
    // as the SIMD implementations of this kernel do.
    lv_32fc_t acc_VE = lv_cmake(0, 0);
    lv_32fc_t acc_E = lv_cmake(0, 0);
    lv_32fc_t acc_P = lv_cmake(0, 0);
    lv_32fc_t acc_L = lv_cmake(0, 0);
    lv_32fc_t acc_VL = lv_cmake(0, 0);
    unsigned int i; // same type as num_points, so the comparison below is not mixed-sign

    for (i = 0; i < num_points; ++i)
        {
            // Carrier wipe-off
            const lv_32fc_t bb_signal_sample = input[i] * carrier[i];

            // Correlate the baseband sample against every code replica
            acc_VE += bb_signal_sample * VE_code[i];
            acc_E += bb_signal_sample * E_code[i];
            acc_P += bb_signal_sample * P_code[i];
            acc_L += bb_signal_sample * L_code[i];
            acc_VL += bb_signal_sample * VL_code[i];
        }

    *VE_out = acc_VE;
    *E_out = acc_E;
    *P_out = acc_P;
    *L_out = acc_L;
    *VL_out = acc_VL;
}
#endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_gnsssdr_volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_u_H */
#ifndef INCLUDED_gnsssdr_volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_a_H
#define INCLUDED_gnsssdr_volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_a_H
#include <inttypes.h>
#include <stdio.h>
#include <volk_gnsssdr/volk_gnsssdr_complex.h>
#include <float.h>
#include <string.h>
#ifdef LV_HAVE_AVX
#include <immintrin.h>
/*!
    \brief Performs the carrier wipe-off mixing and the VE, Early, Prompt, Late and VL correlation (AVX, aligned loads)

    The seven input buffers are read with _mm256_load_ps, so they must be 32-byte aligned.

    \param VE_out VE correlation output
    \param E_out Early correlation output
    \param P_out Prompt correlation output
    \param L_out Late correlation output
    \param VL_out VL correlation output
    \param input The input signal
    \param carrier The carrier signal
    \param VE_code VE PRN code replica
    \param E_code Early PRN code replica
    \param P_code Prompt PRN code replica
    \param L_code Late PRN code replica
    \param VL_code VL PRN code replica
    \param num_points The number of complex values in vectors
*/
static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_a_avx(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* VE_code, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, const lv_32fc_t* VL_code, unsigned int num_points)
{
    const unsigned int avx_iters = num_points / 4;   // the loop consumes 4 complex samples (one __m256) per pass
    const unsigned int tail_points = num_points % 4; // samples left for the scalar epilogue
    unsigned int n;

    lv_32fc_t dotProduct_VE;
    lv_32fc_t dotProduct_E;
    lv_32fc_t dotProduct_P;
    lv_32fc_t dotProduct_L;
    lv_32fc_t dotProduct_VL;

    // Per-lane partial sums, one register per correlator
    __m256 acc_VE = _mm256_setzero_ps();
    __m256 acc_E = _mm256_setzero_ps();
    __m256 acc_P = _mm256_setzero_ps();
    __m256 acc_L = _mm256_setzero_ps();
    __m256 acc_VL = _mm256_setzero_ps();
    __m256 sig, rep, bb, bb_swapped;

    // Moving read pointers
    const lv_32fc_t* _input = input;
    const lv_32fc_t* _carrier = carrier;
    const lv_32fc_t* _VE_code = VE_code;
    const lv_32fc_t* _E_code = E_code;
    const lv_32fc_t* _P_code = P_code;
    const lv_32fc_t* _L_code = L_code;
    const lv_32fc_t* _VL_code = VL_code;

    for (n = 0; n < avx_iters; n++)
        {
            // Carrier wipe-off (element-wise complex product input * carrier).
            // moveldup/movehdup replicate the real/imaginary part of each
            // carrier sample, shuffle 0xB1 swaps re<->im of each input sample
            // and addsub yields (ar*cr - ai*ci, ai*cr + ar*ci) per sample.
            sig = _mm256_load_ps((const float*)_input);
            rep = _mm256_load_ps((const float*)_carrier);
            bb = _mm256_addsub_ps(_mm256_mul_ps(sig, _mm256_moveldup_ps(rep)),
                                  _mm256_mul_ps(_mm256_shuffle_ps(sig, sig, 0xB1), _mm256_movehdup_ps(rep)));
            bb_swapped = _mm256_shuffle_ps(bb, bb, 0xB1); // im,re ordering reused by all five correlators

            // VE correlator: acc += bb * VE_code
            rep = _mm256_load_ps((const float*)_VE_code);
            acc_VE = _mm256_add_ps(acc_VE, _mm256_addsub_ps(_mm256_mul_ps(bb, _mm256_moveldup_ps(rep)),
                                                            _mm256_mul_ps(bb_swapped, _mm256_movehdup_ps(rep))));

            // Early correlator
            rep = _mm256_load_ps((const float*)_E_code);
            acc_E = _mm256_add_ps(acc_E, _mm256_addsub_ps(_mm256_mul_ps(bb, _mm256_moveldup_ps(rep)),
                                                          _mm256_mul_ps(bb_swapped, _mm256_movehdup_ps(rep))));

            // Prompt correlator
            rep = _mm256_load_ps((const float*)_P_code);
            acc_P = _mm256_add_ps(acc_P, _mm256_addsub_ps(_mm256_mul_ps(bb, _mm256_moveldup_ps(rep)),
                                                          _mm256_mul_ps(bb_swapped, _mm256_movehdup_ps(rep))));

            // Late correlator
            rep = _mm256_load_ps((const float*)_L_code);
            acc_L = _mm256_add_ps(acc_L, _mm256_addsub_ps(_mm256_mul_ps(bb, _mm256_moveldup_ps(rep)),
                                                          _mm256_mul_ps(bb_swapped, _mm256_movehdup_ps(rep))));

            // VL correlator
            rep = _mm256_load_ps((const float*)_VL_code);
            acc_VL = _mm256_add_ps(acc_VL, _mm256_addsub_ps(_mm256_mul_ps(bb, _mm256_moveldup_ps(rep)),
                                                            _mm256_mul_ps(bb_swapped, _mm256_movehdup_ps(rep))));

            _input += 4;
            _carrier += 4;
            _VE_code += 4;
            _E_code += 4;
            _P_code += 4;
            _L_code += 4;
            _VL_code += 4;
        }

    // Spill the lane-wise partial sums and fold them into one complex value each
    __VOLK_ATTR_ALIGNED(32) lv_32fc_t partial_VE[4];
    __VOLK_ATTR_ALIGNED(32) lv_32fc_t partial_E[4];
    __VOLK_ATTR_ALIGNED(32) lv_32fc_t partial_P[4];
    __VOLK_ATTR_ALIGNED(32) lv_32fc_t partial_L[4];
    __VOLK_ATTR_ALIGNED(32) lv_32fc_t partial_VL[4];

    _mm256_store_ps((float*)partial_VE, acc_VE);
    _mm256_store_ps((float*)partial_E, acc_E);
    _mm256_store_ps((float*)partial_P, acc_P);
    _mm256_store_ps((float*)partial_L, acc_L);
    _mm256_store_ps((float*)partial_VL, acc_VL);

    dotProduct_VE = partial_VE[0] + partial_VE[1] + partial_VE[2] + partial_VE[3];
    dotProduct_E = partial_E[0] + partial_E[1] + partial_E[2] + partial_E[3];
    dotProduct_P = partial_P[0] + partial_P[1] + partial_P[2] + partial_P[3];
    dotProduct_L = partial_L[0] + partial_L[1] + partial_L[2] + partial_L[3];
    dotProduct_VL = partial_VL[0] + partial_VL[1] + partial_VL[2] + partial_VL[3];

    // Scalar epilogue. The wipe-off product is formed once per sample and then
    // multiplied by each code, i.e. (input * carrier) * code, the same
    // association used by the vector loop above.
    for (n = 0; n < tail_points; ++n)
        {
            const lv_32fc_t bb_sample = (*_input++) * (*_carrier++);
            dotProduct_VE += bb_sample * (*_VE_code++);
            dotProduct_E += bb_sample * (*_E_code++);
            dotProduct_P += bb_sample * (*_P_code++);
            dotProduct_L += bb_sample * (*_L_code++);
            dotProduct_VL += bb_sample * (*_VL_code++);
        }

    *VE_out = dotProduct_VE;
    *E_out = dotProduct_E;
    *P_out = dotProduct_P;
    *L_out = dotProduct_L;
    *VL_out = dotProduct_VL;
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
/*!
    \brief Performs the carrier wipe-off mixing and the VE, Early, Prompt, Late and VL correlation (SSE3, aligned loads)

    The seven input buffers are read with _mm_load_ps, so they must be 16-byte aligned.

    \param VE_out VE correlation output
    \param E_out Early correlation output
    \param P_out Prompt correlation output
    \param L_out Late correlation output
    \param VL_out VL correlation output
    \param input The input signal
    \param carrier The carrier signal
    \param VE_code VE PRN code replica
    \param E_code Early PRN code replica
    \param P_code Prompt PRN code replica
    \param L_code Late PRN code replica
    \param VL_code VL PRN code replica
    \param num_points The number of complex values in vectors
*/
static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_a_sse3(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* VE_code, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, const lv_32fc_t* VL_code, unsigned int num_points)
{
    const unsigned int sse_iters = num_points / 2; // the loop consumes 2 complex samples (one __m128) per pass
    unsigned int n;

    lv_32fc_t dotProduct_VE;
    lv_32fc_t dotProduct_E;
    lv_32fc_t dotProduct_P;
    lv_32fc_t dotProduct_L;
    lv_32fc_t dotProduct_VL;

    // Per-lane partial sums, one register per correlator
    __m128 acc_VE = _mm_setzero_ps();
    __m128 acc_E = _mm_setzero_ps();
    __m128 acc_P = _mm_setzero_ps();
    __m128 acc_L = _mm_setzero_ps();
    __m128 acc_VL = _mm_setzero_ps();
    __m128 sig, rep, bb, bb_swapped;

    // Moving read pointers
    const lv_32fc_t* _input = input;
    const lv_32fc_t* _carrier = carrier;
    const lv_32fc_t* _VE_code = VE_code;
    const lv_32fc_t* _E_code = E_code;
    const lv_32fc_t* _P_code = P_code;
    const lv_32fc_t* _L_code = L_code;
    const lv_32fc_t* _VL_code = VL_code;

    for (n = 0; n < sse_iters; n++)
        {
            // Carrier wipe-off (element-wise complex product input * carrier).
            // moveldup/movehdup replicate the real/imaginary part of each
            // carrier sample, shuffle 0xB1 swaps re<->im of each input sample
            // and addsub yields (ar*cr - ai*ci, ai*cr + ar*ci) per sample.
            sig = _mm_load_ps((const float*)_input);
            rep = _mm_load_ps((const float*)_carrier);
            bb = _mm_addsub_ps(_mm_mul_ps(sig, _mm_moveldup_ps(rep)),
                               _mm_mul_ps(_mm_shuffle_ps(sig, sig, 0xB1), _mm_movehdup_ps(rep)));
            bb_swapped = _mm_shuffle_ps(bb, bb, 0xB1); // im,re ordering reused by all five correlators

            // VE correlator: acc += bb * VE_code
            rep = _mm_load_ps((const float*)_VE_code);
            acc_VE = _mm_add_ps(acc_VE, _mm_addsub_ps(_mm_mul_ps(bb, _mm_moveldup_ps(rep)),
                                                      _mm_mul_ps(bb_swapped, _mm_movehdup_ps(rep))));

            // Early correlator
            rep = _mm_load_ps((const float*)_E_code);
            acc_E = _mm_add_ps(acc_E, _mm_addsub_ps(_mm_mul_ps(bb, _mm_moveldup_ps(rep)),
                                                    _mm_mul_ps(bb_swapped, _mm_movehdup_ps(rep))));

            // Prompt correlator
            rep = _mm_load_ps((const float*)_P_code);
            acc_P = _mm_add_ps(acc_P, _mm_addsub_ps(_mm_mul_ps(bb, _mm_moveldup_ps(rep)),
                                                    _mm_mul_ps(bb_swapped, _mm_movehdup_ps(rep))));

            // Late correlator
            rep = _mm_load_ps((const float*)_L_code);
            acc_L = _mm_add_ps(acc_L, _mm_addsub_ps(_mm_mul_ps(bb, _mm_moveldup_ps(rep)),
                                                    _mm_mul_ps(bb_swapped, _mm_movehdup_ps(rep))));

            // VL correlator
            rep = _mm_load_ps((const float*)_VL_code);
            acc_VL = _mm_add_ps(acc_VL, _mm_addsub_ps(_mm_mul_ps(bb, _mm_moveldup_ps(rep)),
                                                      _mm_mul_ps(bb_swapped, _mm_movehdup_ps(rep))));

            _input += 2;
            _carrier += 2;
            _VE_code += 2;
            _E_code += 2;
            _P_code += 2;
            _L_code += 2;
            _VL_code += 2;
        }

    // Spill the lane-wise partial sums and fold them into one complex value each
    __VOLK_ATTR_ALIGNED(16) lv_32fc_t partial_VE[2];
    __VOLK_ATTR_ALIGNED(16) lv_32fc_t partial_E[2];
    __VOLK_ATTR_ALIGNED(16) lv_32fc_t partial_P[2];
    __VOLK_ATTR_ALIGNED(16) lv_32fc_t partial_L[2];
    __VOLK_ATTR_ALIGNED(16) lv_32fc_t partial_VL[2];

    _mm_store_ps((float*)partial_VE, acc_VE);
    _mm_store_ps((float*)partial_E, acc_E);
    _mm_store_ps((float*)partial_P, acc_P);
    _mm_store_ps((float*)partial_L, acc_L);
    _mm_store_ps((float*)partial_VL, acc_VL);

    dotProduct_VE = partial_VE[0] + partial_VE[1];
    dotProduct_E = partial_E[0] + partial_E[1];
    dotProduct_P = partial_P[0] + partial_P[1];
    dotProduct_L = partial_L[0] + partial_L[1];
    dotProduct_VL = partial_VL[0] + partial_VL[1];

    // Odd sample count: handle the last sample in scalar code. The wipe-off
    // product is formed once and then multiplied by each code, i.e.
    // (input * carrier) * code, the same association used by the vector loop.
    if ((num_points % 2) != 0)
        {
            const lv_32fc_t bb_sample = (*_input) * (*_carrier);
            dotProduct_VE += bb_sample * (*_VE_code);
            dotProduct_E += bb_sample * (*_E_code);
            dotProduct_P += bb_sample * (*_P_code);
            dotProduct_L += bb_sample * (*_L_code);
            dotProduct_VL += bb_sample * (*_VL_code);
        }

    *VE_out = dotProduct_VE;
    *E_out = dotProduct_E;
    *P_out = dotProduct_P;
    *L_out = dotProduct_L;
    *VL_out = dotProduct_VL;
}
#endif /* LV_HAVE_SSE3 */
#ifdef LV_HAVE_GENERIC
/*!
    \brief Performs the carrier wipe-off mixing and the VE, Early, Prompt, Late and VL correlation (portable C)
    \param VE_out VE correlation output
    \param E_out Early correlation output
    \param P_out Prompt correlation output
    \param L_out Late correlation output
    \param VL_out VL correlation output
    \param input The input signal
    \param carrier The carrier signal
    \param VE_code VE PRN code replica
    \param E_code Early PRN code replica
    \param P_code Prompt PRN code replica
    \param L_code Late PRN code replica
    \param VL_code VL PRN code replica
    \param num_points The number of complex values in vectors
*/
static inline void volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_a_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_32fc_t* input, const lv_32fc_t* carrier, const lv_32fc_t* VE_code, const lv_32fc_t* E_code, const lv_32fc_t* P_code, const lv_32fc_t* L_code, const lv_32fc_t* VL_code, unsigned int num_points)
{
    // Local accumulators: the outputs are written exactly once at the end,
    // as the SIMD implementations of this kernel do.
    lv_32fc_t acc_VE = lv_cmake(0, 0);
    lv_32fc_t acc_E = lv_cmake(0, 0);
    lv_32fc_t acc_P = lv_cmake(0, 0);
    lv_32fc_t acc_L = lv_cmake(0, 0);
    lv_32fc_t acc_VL = lv_cmake(0, 0);
    unsigned int i; // same type as num_points, so the comparison below is not mixed-sign

    for (i = 0; i < num_points; ++i)
        {
            // Carrier wipe-off
            const lv_32fc_t bb_signal_sample = input[i] * carrier[i];

            // Correlate the baseband sample against every code replica
            acc_VE += bb_signal_sample * VE_code[i];
            acc_E += bb_signal_sample * E_code[i];
            acc_P += bb_signal_sample * P_code[i];
            acc_L += bb_signal_sample * L_code[i];
            acc_VL += bb_signal_sample * VL_code[i];
        }

    *VE_out = acc_VE;
    *E_out = acc_E;
    *P_out = acc_P;
    *L_out = acc_L;
    *VL_out = acc_VL;
}
#endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_gnsssdr_volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5_a_H */

View File

@ -0,0 +1,243 @@
/*!
* \file volk_gnsssdr_64f_accumulator_64f.h
* \brief Volk protokernel: 64 bits (double) scalar accumulator
* \authors <ul>
* <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
* </ul>
*
* Volk protokernel that implements an accumulator of double values
*
* -------------------------------------------------------------------------
*
* Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
*
* GNSS-SDR is a software defined Global Navigation
* Satellite Systems receiver
*
* This file is part of GNSS-SDR.
*
* GNSS-SDR is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* GNSS-SDR is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
*
* -------------------------------------------------------------------------
*/
#ifndef INCLUDED_volk_gnsssdr_64f_accumulator_64f_u_H
#define INCLUDED_volk_gnsssdr_64f_accumulator_64f_u_H
#include <volk_gnsssdr/volk_gnsssdr_common.h>
#include <inttypes.h>
#include <stdio.h>
#ifdef LV_HAVE_AVX
#include <immintrin.h>
/*!
\brief Accumulates the values in the input buffer
\param result The accumulated result
\param inputBuffer The buffer of data to be accumulated
\param num_points The number of values in inputBuffer to be accumulated
*/
static inline void volk_gnsssdr_64f_accumulator_64f_u_avx(double* result,const double* inputBuffer, unsigned int num_points){
double returnValue = 0;
const unsigned int sse_iters = num_points / 4;
const double* aPtr = inputBuffer;
__VOLK_ATTR_ALIGNED(32) double tempBuffer[4];
__m256d accumulator = _mm256_setzero_pd();
__m256d aVal = _mm256_setzero_pd();
for(unsigned int number = 0; number < sse_iters; number++)
{
aVal = _mm256_loadu_pd(aPtr);
accumulator = _mm256_add_pd(accumulator, aVal);
aPtr += 4;
}
_mm256_storeu_pd((double*)tempBuffer,accumulator);
for(int i = 0; i<4; ++i){
returnValue += tempBuffer[i];
}
for(int i = 0; i<(num_points % 4); ++i){
returnValue += (*aPtr++);
}
*result = returnValue;
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_SSE3
#include <xmmintrin.h>
/*!
\brief Accumulates the values in the input buffer
\param result The accumulated result
\param inputBuffer The buffer of data to be accumulated
\param num_points The number of values in inputBuffer to be accumulated
*/
static inline void volk_gnsssdr_64f_accumulator_64f_u_sse3(double* result,const double* inputBuffer, unsigned int num_points){
double returnValue = 0;
const unsigned int sse_iters = num_points / 2;
const double* aPtr = inputBuffer;
__VOLK_ATTR_ALIGNED(16) double tempBuffer[2];
__m128d accumulator = _mm_setzero_pd();
__m128d aVal = _mm_setzero_pd();
for(unsigned int number = 0; number < sse_iters; number++)
{
aVal = _mm_loadu_pd(aPtr);
accumulator = _mm_add_pd(accumulator, aVal);
aPtr += 2;
}
_mm_storeu_pd((double*)tempBuffer,accumulator);
for(int i = 0; i<2; ++i){
returnValue += tempBuffer[i];
}
for(int i = 0; i<(num_points % 2); ++i){
returnValue += (*aPtr++);
}
*result = returnValue;
}
#endif /* LV_HAVE_SSE3 */
#ifdef LV_HAVE_GENERIC
/*!
  \brief Sums the doubles of the input buffer with a plain scalar loop
  \param result Where the accumulated sum is written (0 when num_points is 0)
  \param inputBuffer The values to accumulate
  \param num_points The number of values in inputBuffer to be accumulated
*/
static inline void volk_gnsssdr_64f_accumulator_64f_generic(double* result,const double* inputBuffer, unsigned int num_points){
    double total = 0;
    unsigned int k;

    /* Values are added front to back. */
    for(k = 0; k < num_points; ++k)
        {
            total += inputBuffer[k];
        }

    *result = total;
}
#endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_volk_gnsssdr_64f_accumulator_64f_u_H */
#ifndef INCLUDED_volk_gnsssdr_64f_accumulator_64f_a_H
#define INCLUDED_volk_gnsssdr_64f_accumulator_64f_a_H
#include <volk_gnsssdr/volk_gnsssdr_common.h>
#include <inttypes.h>
#include <stdio.h>
#ifdef LV_HAVE_AVX
#include <immintrin.h>
/*!
\brief Accumulates the values in the input buffer
\param result The accumulated result
\param inputBuffer The buffer of data to be accumulated
\param num_points The number of values in inputBuffer to be accumulated
*/
static inline void volk_gnsssdr_64f_accumulator_64f_a_avx(double* result,const double* inputBuffer, unsigned int num_points){
double returnValue = 0;
const unsigned int sse_iters = num_points / 4;
const double* aPtr = inputBuffer;
__VOLK_ATTR_ALIGNED(32) double tempBuffer[4];
__m256d accumulator = _mm256_setzero_pd();
__m256d aVal = _mm256_setzero_pd();
for(unsigned int number = 0; number < sse_iters; number++)
{
aVal = _mm256_load_pd(aPtr);
accumulator = _mm256_add_pd(accumulator, aVal);
aPtr += 4;
}
_mm256_store_pd((double*)tempBuffer,accumulator);
for(int i = 0; i<4; ++i){
returnValue += tempBuffer[i];
}
for(int i = 0; i<(num_points % 4); ++i){
returnValue += (*aPtr++);
}
*result = returnValue;
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_SSE3
#include <xmmintrin.h>
/*!
\brief Accumulates the values in the input buffer
\param result The accumulated result
\param inputBuffer The buffer of data to be accumulated
\param num_points The number of values in inputBuffer to be accumulated
*/
static inline void volk_gnsssdr_64f_accumulator_64f_a_sse3(double* result,const double* inputBuffer, unsigned int num_points){
double returnValue = 0;
const unsigned int sse_iters = num_points / 2;
const double* aPtr = inputBuffer;
__VOLK_ATTR_ALIGNED(16) double tempBuffer[2];
__m128d accumulator = _mm_setzero_pd();
__m128d aVal = _mm_setzero_pd();
for(unsigned int number = 0; number < sse_iters; number++)
{
aVal = _mm_load_pd(aPtr);
accumulator = _mm_add_pd(accumulator, aVal);
aPtr += 2;
}
_mm_store_pd((double*)tempBuffer,accumulator);
for(int i = 0; i<2; ++i){
returnValue += tempBuffer[i];
}
for(int i = 0; i<(num_points % 2); ++i){
returnValue += (*aPtr++);
}
*result = returnValue;
}
#endif /* LV_HAVE_SSE3 */
#ifdef LV_HAVE_GENERIC
/*!
  \brief Sums the doubles of the input buffer with a plain scalar loop
  \param result Where the accumulated sum is written (0 when num_points is 0)
  \param inputBuffer The values to accumulate
  \param num_points The number of values in inputBuffer to be accumulated
*/
static inline void volk_gnsssdr_64f_accumulator_64f_a_generic(double* result,const double* inputBuffer, unsigned int num_points){
    const double* const stop = inputBuffer + num_points;
    const double* cursor;
    double total = 0;

    /* Values are added front to back. */
    for(cursor = inputBuffer; cursor != stop; ++cursor)
        {
            total += *cursor;
        }

    *result = total;
}
#endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_volk_gnsssdr_64f_accumulator_64f_a_H */

View File

@ -0,0 +1,183 @@
/*!
* \file volk_gnsssdr_8i_accumulator_s8i.h
* \brief Volk protokernel: 8 bits (char) scalar accumulator
* \authors <ul>
* <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
* </ul>
*
* Volk protokernel that implements an accumulator of char values
*
* -------------------------------------------------------------------------
*
* Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
*
* GNSS-SDR is a software defined Global Navigation
* Satellite Systems receiver
*
* This file is part of GNSS-SDR.
*
* GNSS-SDR is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* GNSS-SDR is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
*
* -------------------------------------------------------------------------
*/
#ifndef INCLUDED_volk_gnsssdr_8i_accumulator_s8i_u_H
#define INCLUDED_volk_gnsssdr_8i_accumulator_s8i_u_H
#include <volk_gnsssdr/volk_gnsssdr_common.h>
#include <inttypes.h>
#include <stdio.h>
#ifdef LV_HAVE_SSE3
#include <xmmintrin.h>
/*!
  \brief Accumulates the values in the input buffer (128-bit SIMD, unaligned loads)

  The sum is kept in 8 bits throughout: sixteen byte-wide partial sums are
  maintained with _mm_add_epi8 and then folded into a single char.

  \param result The accumulated result
  \param inputBuffer The buffer of data to be accumulated (no alignment required)
  \param num_points The number of values in inputBuffer to be accumulated
*/
static inline void volk_gnsssdr_8i_accumulator_s8i_u_sse3(char* result, const char* inputBuffer, unsigned int num_points){
    char returnValue = 0;
    const unsigned int sse_iters = num_points / 16;
    const unsigned int tail_points = num_points % 16;
    const char* aPtr = inputBuffer;
    __VOLK_ATTR_ALIGNED(16) char tempBuffer[16];
    __m128i accumulator = _mm_setzero_si128();
    __m128i aVal;
    unsigned int i;

    for(i = 0; i < sse_iters; ++i)
        {
            /* _mm_loadu_si128 performs the same unaligned 128-bit load as
               _mm_lddqu_si128 but is declared alongside the other integer
               intrinsics used here, so no extra SSE3 header is needed. */
            aVal = _mm_loadu_si128((const __m128i*)aPtr);
            accumulator = _mm_add_epi8(accumulator, aVal);
            aPtr += 16;
        }

    /* Fold the sixteen byte-wide partial sums. */
    _mm_storeu_si128((__m128i*)tempBuffer, accumulator);
    for(i = 0; i < 16; ++i)
        {
            returnValue += tempBuffer[i];
        }

    /* Remaining values that did not fill a whole vector. */
    for(i = 0; i < tail_points; ++i)
        {
            returnValue += (*aPtr++);
        }

    *result = returnValue;
}
#endif /* LV_HAVE_SSE3 */
#ifdef LV_HAVE_GENERIC
/*!
  \brief Accumulates the values in the input buffer with a plain scalar loop

  The running sum is held in a char, so the result has 8-bit range.

  \param result The accumulated result (0 when num_points is 0)
  \param inputBuffer The buffer of data to be accumulated
  \param num_points The number of values in inputBuffer to be accumulated
*/
static inline void volk_gnsssdr_8i_accumulator_s8i_generic(char* result, const char* inputBuffer, unsigned int num_points){
    char total = 0;
    unsigned int k;

    for(k = 0; k < num_points; ++k)
        {
            total += inputBuffer[k];
        }

    *result = total;
}
#endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_volk_gnsssdr_8i_accumulator_s8i_u_H */
#ifndef INCLUDED_volk_gnsssdr_8i_accumulator_s8i_a_H
#define INCLUDED_volk_gnsssdr_8i_accumulator_s8i_a_H
#include <volk_gnsssdr/volk_gnsssdr_common.h>
#include <inttypes.h>
#include <stdio.h>
#ifdef LV_HAVE_SSE3
#include <xmmintrin.h>
/*!
\brief Accumulates the values in the input buffer
\param result The accumulated result
\param inputBuffer The buffer of data to be accumulated
\param num_points The number of values in inputBuffer to be accumulated
*/
static inline void volk_gnsssdr_8i_accumulator_s8i_a_sse3(char* result, const char* inputBuffer, unsigned int num_points){
char returnValue = 0;
const unsigned int sse_iters = num_points / 16;
const char* aPtr = inputBuffer;
__VOLK_ATTR_ALIGNED(16) char tempBuffer[16];
__m128i accumulator = _mm_setzero_si128();
__m128i aVal = _mm_setzero_si128();
for(unsigned int number = 0; number < sse_iters; number++){
aVal = _mm_load_si128((__m128i*)aPtr);
accumulator = _mm_add_epi8(accumulator, aVal);
aPtr += 16;
}
_mm_store_si128((__m128i*)tempBuffer,accumulator);
for(int i = 0; i<16; ++i){
returnValue += tempBuffer[i];
}
for(int i = 0; i<(num_points % 16); ++i){
returnValue += (*aPtr++);
}
*result = returnValue;
}
#endif /* LV_HAVE_SSE3 */
#ifdef LV_HAVE_GENERIC
/*!
  \brief Accumulates the values in the input buffer with a plain scalar loop

  The running sum is held in a char, so the result has 8-bit range.

  \param result The accumulated result (0 when num_points is 0)
  \param inputBuffer The buffer of data to be accumulated
  \param num_points The number of values in inputBuffer to be accumulated
*/
static inline void volk_gnsssdr_8i_accumulator_s8i_a_generic(char* result, const char* inputBuffer, unsigned int num_points){
    const char* const stop = inputBuffer + num_points;
    const char* cursor;
    char total = 0;

    for(cursor = inputBuffer; cursor != stop; ++cursor)
        {
            total += *cursor;
        }

    *result = total;
}
#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_ORC
/*!
  \brief Accumulates the values in the input buffer through the ORC implementation

  The ORC routine writes a 16-bit result; this wrapper hands back the byte at
  offset 1 of that short.
  NOTE(review): which half of the short that is depends on the host byte
  order, and the layout produced by the ORC routine is not visible here -
  confirm this is the intended byte.

  \param result The accumulated result
  \param inputBuffer The buffer of data to be accumulated
  \param num_points The number of values in inputBuffer to be accumulated
*/
extern void volk_gnsssdr_8i_accumulator_s8i_a_orc_impl(short* result, const char* inputBuffer, unsigned int num_points);
static inline void volk_gnsssdr_8i_accumulator_s8i_u_orc(char* result, const char* inputBuffer, unsigned int num_points){
    short wide_sum = 0;
    const char* sum_bytes = (const char*)&wide_sum;

    volk_gnsssdr_8i_accumulator_s8i_a_orc_impl(&wide_sum, inputBuffer, num_points);

    /* Second byte in memory order of the 16-bit result. */
    *result = sum_bytes[1];
}
#endif /* LV_HAVE_ORC */
#endif /* INCLUDED_volk_gnsssdr_8i_accumulator_s8i_a_H */

View File

@ -0,0 +1,493 @@
/*!
* \file volk_gnsssdr_8i_index_max_16u.h
* \brief Volk protokernel: calculates the index of the maximum value in a group of 8 bits (char) scalars
* \authors <ul>
* <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
* </ul>
*
* Volk protokernel that returns the index of the maximum value of a group of 8 bits (char) scalars
*
* -------------------------------------------------------------------------
*
* Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
*
* GNSS-SDR is a software defined Global Navigation
* Satellite Systems receiver
*
* This file is part of GNSS-SDR.
*
* GNSS-SDR is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* GNSS-SDR is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
*
* -------------------------------------------------------------------------
*/
#ifndef INCLUDED_volk_gnsssdr_8i_index_max_16u_u_H
#define INCLUDED_volk_gnsssdr_8i_index_max_16u_u_H
#include <volk_gnsssdr/volk_gnsssdr_common.h>
#include <inttypes.h>
#include <stdio.h>
#ifdef LV_HAVE_AVX
#include "immintrin.h"
/*!
  \brief Returns the index of the max value in src0 (AVX, unaligned loads)

  The buffer is scanned 32 values at a time. A block is only inspected value
  by value when at least one of its values is not strictly smaller than the
  current maximum; the values left after the last full block are scanned with
  a scalar loop.

  \param target The index of the max value in src0; left untouched when num_points is 0
  \param src0 The buffer of data to be analysed (no alignment required)
  \param num_points The number of values in src0 to be analysed
*/
static inline void volk_gnsssdr_8i_index_max_16u_u_avx(unsigned int* target, const char* src0, unsigned int num_points) {
    if(num_points > 0){
        const unsigned int sse_iters = num_points / 32;
        const unsigned int tail_points = num_points % 32;
        const char* basePtr = src0;
        const char* inputPtr = src0;
        char max = src0[0];
        unsigned int index = 0;
        unsigned int number;
        unsigned int i;
        __VOLK_ATTR_ALIGNED(32) char currentValuesBuffer[32];
        __m256i ones, compareResults, currentValues;
        __m128i compareResultslo, compareResultshi, maxValues, lo, hi;

        ones = _mm256_set1_epi8(0xFF);
        maxValues = _mm_set1_epi8(max);

        for(number = 0; number < sse_iters; number++)
            {
                currentValues = _mm256_lddqu_si256((const __m256i*)inputPtr);

                /* Compare each 128-bit half against the current maximum. */
                lo = _mm256_castsi256_si128(currentValues);
                hi = _mm256_extractf128_si256(currentValues, 1);
                compareResultslo = _mm_cmpgt_epi8(maxValues, lo);
                compareResultshi = _mm_cmpgt_epi8(maxValues, hi);

                /* Built with insertf128 because _mm256_set_m128i is not
                   defined in some versions of immintrin.h. */
                compareResults = _mm256_insertf128_si256(_mm256_castsi128_si256(compareResultslo), compareResultshi, 1);

                /* All bits set means max > every value of the block: skip it. */
                if (!_mm256_testc_si256(compareResults, ones))
                    {
                        _mm256_storeu_si256((__m256i*)currentValuesBuffer, currentValues);
                        for(i = 0; i < 32; i++)
                            {
                                if(currentValuesBuffer[i] > max)
                                    {
                                        index = (unsigned int)(inputPtr - basePtr) + i;
                                        max = currentValuesBuffer[i];
                                    }
                            }
                        maxValues = _mm_set1_epi8(max);
                    }
                inputPtr += 32;
            }

        /* Scan the values that follow the last full block. inputPtr now
           points at the first of them, so the reported index is absolute. */
        for(i = 0; i < tail_points; ++i)
            {
                if(inputPtr[i] > max)
                    {
                        index = (unsigned int)(inputPtr - basePtr) + i;
                        max = inputPtr[i];
                    }
            }

        target[0] = index;
    }
}
#endif /*LV_HAVE_AVX*/
#ifdef LV_HAVE_SSE4_1
#include<smmintrin.h>
/*!
  \brief Returns the index of the max value in src0 (SSE4.1, unaligned loads)

  The buffer is scanned 16 values at a time. A block is only inspected value
  by value when at least one of its values is not strictly smaller than the
  current maximum; the values left after the last full block are scanned with
  a scalar loop.

  \param target The index of the max value in src0; left untouched when num_points is 0
  \param src0 The buffer of data to be analysed (no alignment required)
  \param num_points The number of values in src0 to be analysed
*/
static inline void volk_gnsssdr_8i_index_max_16u_u_sse4_1(unsigned int* target, const char* src0, unsigned int num_points) {
    if(num_points > 0){
        const unsigned int sse_iters = num_points / 16;
        const unsigned int tail_points = num_points % 16;
        const char* basePtr = src0;
        const char* inputPtr = src0;
        char max = src0[0];
        unsigned int index = 0;
        unsigned int number;
        unsigned int i;
        __VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16];
        __m128i maxValues, compareResults, currentValues;

        maxValues = _mm_set1_epi8(max);

        for(number = 0; number < sse_iters; number++)
            {
                currentValues = _mm_lddqu_si128((const __m128i*)inputPtr);
                compareResults = _mm_cmpgt_epi8(maxValues, currentValues);

                /* All ones means max > every value of the block: skip it. */
                if (!_mm_test_all_ones(compareResults))
                    {
                        _mm_storeu_si128((__m128i*)currentValuesBuffer, currentValues);
                        for(i = 0; i < 16; i++)
                            {
                                if(currentValuesBuffer[i] > max)
                                    {
                                        index = (unsigned int)(inputPtr - basePtr) + i;
                                        max = currentValuesBuffer[i];
                                    }
                            }
                        maxValues = _mm_set1_epi8(max);
                    }
                inputPtr += 16;
            }

        /* Scan the values that follow the last full block. inputPtr now
           points at the first of them, so the reported index is absolute. */
        for(i = 0; i < tail_points; ++i)
            {
                if(inputPtr[i] > max)
                    {
                        index = (unsigned int)(inputPtr - basePtr) + i;
                        max = inputPtr[i];
                    }
            }

        target[0] = index;
    }
}
#endif /*LV_HAVE_SSE4_1*/
#ifdef LV_HAVE_SSE2
#include<xmmintrin.h>
/*!
  \brief Returns the index of the max value in src0 (SSE2, unaligned loads)

  The buffer is scanned 16 values at a time. The byte mask of the vector
  compare tells which values of a block are not strictly smaller than the
  current maximum; only those are inspected. The values left after the last
  full block are scanned with a scalar loop.

  \param target The index of the max value in src0; left untouched when num_points is 0
  \param src0 The buffer of data to be analysed (no alignment required)
  \param num_points The number of values in src0 to be analysed
*/
static inline void volk_gnsssdr_8i_index_max_16u_u_sse2(unsigned int* target, const char* src0, unsigned int num_points) {
    if(num_points > 0){
        const unsigned int sse_iters = num_points / 16;
        const unsigned int tail_points = num_points % 16;
        const char* basePtr = src0;
        const char* inputPtr = src0;
        char max = src0[0];
        unsigned int index = 0;
        unsigned int number;
        unsigned int i;
        unsigned short mask;
        __VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16];
        __m128i maxValues, compareResults, currentValues;

        maxValues = _mm_set1_epi8(max);

        for(number = 0; number < sse_iters; number++)
            {
                currentValues = _mm_loadu_si128((const __m128i*)inputPtr);
                compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
                mask = _mm_movemask_epi8(compareResults);

                /* 0xFFFF means max > every value of the block: skip it. */
                if (mask != 0xFFFF)
                    {
                        _mm_storeu_si128((__m128i*)currentValuesBuffer, currentValues);

                        /* After inversion, bit i is set when value i is a
                           candidate (not strictly smaller than max). */
                        mask = ~mask;
                        i = 0;
                        while (mask > 0)
                            {
                                if ((mask & 1) == 1)
                                    {
                                        if(currentValuesBuffer[i] > max)
                                            {
                                                index = (unsigned int)(inputPtr - basePtr) + i;
                                                max = currentValuesBuffer[i];
                                            }
                                    }
                                i++;
                                mask >>= 1;
                            }
                        maxValues = _mm_set1_epi8(max);
                    }
                inputPtr += 16;
            }

        /* Scan the values that follow the last full block. inputPtr now
           points at the first of them, so the reported index is absolute. */
        for(i = 0; i < tail_points; ++i)
            {
                if(inputPtr[i] > max)
                    {
                        index = (unsigned int)(inputPtr - basePtr) + i;
                        max = inputPtr[i];
                    }
            }

        target[0] = index;
    }
}
#endif /*LV_HAVE_SSE2*/
#ifdef LV_HAVE_GENERIC
/*!
  \brief Returns the index of the max value in src0 with a plain scalar scan

  The comparison is strict, so when the maximum occurs several times the
  index of its first occurrence is reported.

  \param target The index of the max value in src0; left untouched when num_points is 0
  \param src0 The buffer of data to be analysed
  \param num_points The number of values in src0 to be analysed
*/
static inline void volk_gnsssdr_8i_index_max_16u_generic(unsigned int* target, const char* src0, unsigned int num_points) {
    unsigned int best_pos = 0;
    unsigned int pos;
    char best_val;

    if(num_points == 0)
        {
            return;
        }

    best_val = src0[0];
    for(pos = 1; pos < num_points; pos++)
        {
            if(src0[pos] > best_val)
                {
                    best_val = src0[pos];
                    best_pos = pos;
                }
        }

    *target = best_pos;
}
#endif /*LV_HAVE_GENERIC*/
#endif /*INCLUDED_volk_gnsssdr_8i_index_max_16u_u_H*/
#ifndef INCLUDED_volk_gnsssdr_8i_index_max_16u_a_H
#define INCLUDED_volk_gnsssdr_8i_index_max_16u_a_H
#include <volk_gnsssdr/volk_gnsssdr_common.h>
#include <inttypes.h>
#include <stdio.h>
#ifdef LV_HAVE_AVX
#include "immintrin.h"
/*!
  \brief Returns the index of the max value in src0 (AVX, aligned loads)

  The buffer is scanned 32 values at a time. A block is only inspected value
  by value when at least one of its values is not strictly smaller than the
  current maximum; the values left after the last full block are scanned with
  a scalar loop.

  \param target The index of the max value in src0; left untouched when num_points is 0
  \param src0 The buffer of data to be analysed; read with aligned 256-bit loads
  \param num_points The number of values in src0 to be analysed
*/
static inline void volk_gnsssdr_8i_index_max_16u_a_avx(unsigned int* target, const char* src0, unsigned int num_points) {
    if(num_points > 0){
        const unsigned int sse_iters = num_points / 32;
        const unsigned int tail_points = num_points % 32;
        const char* basePtr = src0;
        const char* inputPtr = src0;
        char max = src0[0];
        unsigned int index = 0;
        unsigned int number;
        unsigned int i;
        __VOLK_ATTR_ALIGNED(32) char currentValuesBuffer[32];
        __m256i ones, compareResults, currentValues;
        __m128i compareResultslo, compareResultshi, maxValues, lo, hi;

        ones = _mm256_set1_epi8(0xFF);
        maxValues = _mm_set1_epi8(max);

        for(number = 0; number < sse_iters; number++)
            {
                currentValues = _mm256_load_si256((const __m256i*)inputPtr);

                /* Compare each 128-bit half against the current maximum. */
                lo = _mm256_castsi256_si128(currentValues);
                hi = _mm256_extractf128_si256(currentValues, 1);
                compareResultslo = _mm_cmpgt_epi8(maxValues, lo);
                compareResultshi = _mm_cmpgt_epi8(maxValues, hi);

                /* Built with insertf128 because _mm256_set_m128i is not
                   defined in some versions of immintrin.h. */
                compareResults = _mm256_insertf128_si256(_mm256_castsi128_si256(compareResultslo), compareResultshi, 1);

                /* All bits set means max > every value of the block: skip it. */
                if (!_mm256_testc_si256(compareResults, ones))
                    {
                        _mm256_store_si256((__m256i*)currentValuesBuffer, currentValues);
                        for(i = 0; i < 32; i++)
                            {
                                if(currentValuesBuffer[i] > max)
                                    {
                                        index = (unsigned int)(inputPtr - basePtr) + i;
                                        max = currentValuesBuffer[i];
                                    }
                            }
                        maxValues = _mm_set1_epi8(max);
                    }
                inputPtr += 32;
            }

        /* Scan the values that follow the last full block. inputPtr now
           points at the first of them, so the reported index is absolute. */
        for(i = 0; i < tail_points; ++i)
            {
                if(inputPtr[i] > max)
                    {
                        index = (unsigned int)(inputPtr - basePtr) + i;
                        max = inputPtr[i];
                    }
            }

        target[0] = index;
    }
}
#endif /*LV_HAVE_AVX*/
#ifdef LV_HAVE_SSE4_1
#include "smmintrin.h"
#include "emmintrin.h"
/*!
  \brief Returns the index of the max value in src0 (SSE4.1, aligned loads)

  The buffer is scanned 16 values at a time. A block is only inspected value
  by value when at least one of its values is not strictly smaller than the
  current maximum; the values left after the last full block are scanned with
  a scalar loop.

  \param target The index of the max value in src0; left untouched when num_points is 0
  \param src0 The buffer of data to be analysed; read with aligned 128-bit loads
  \param num_points The number of values in src0 to be analysed
*/
static inline void volk_gnsssdr_8i_index_max_16u_a_sse4_1(unsigned int* target, const char* src0, unsigned int num_points) {
    if(num_points > 0){
        const unsigned int sse_iters = num_points / 16;
        const unsigned int tail_points = num_points % 16;
        const char* basePtr = src0;
        const char* inputPtr = src0;
        char max = src0[0];
        unsigned int index = 0;
        unsigned int number;
        unsigned int i;
        __VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16];
        __m128i maxValues, compareResults, currentValues;

        maxValues = _mm_set1_epi8(max);

        for(number = 0; number < sse_iters; number++)
            {
                currentValues = _mm_load_si128((const __m128i*)inputPtr);
                compareResults = _mm_cmpgt_epi8(maxValues, currentValues);

                /* All ones means max > every value of the block: skip it. */
                if (!_mm_test_all_ones(compareResults))
                    {
                        _mm_store_si128((__m128i*)currentValuesBuffer, currentValues);
                        for(i = 0; i < 16; i++)
                            {
                                if(currentValuesBuffer[i] > max)
                                    {
                                        index = (unsigned int)(inputPtr - basePtr) + i;
                                        max = currentValuesBuffer[i];
                                    }
                            }
                        maxValues = _mm_set1_epi8(max);
                    }
                inputPtr += 16;
            }

        /* Scan the values that follow the last full block. inputPtr now
           points at the first of them, so the reported index is absolute. */
        for(i = 0; i < tail_points; ++i)
            {
                if(inputPtr[i] > max)
                    {
                        index = (unsigned int)(inputPtr - basePtr) + i;
                        max = inputPtr[i];
                    }
            }

        target[0] = index;
    }
}
#endif /*LV_HAVE_SSE4_1*/
#ifdef LV_HAVE_SSE2
#include "emmintrin.h"
/*!
  \brief Returns the index of the max value in src0 (SSE2, aligned loads)

  The buffer is scanned 16 values at a time. The byte mask of the vector
  compare tells which values of a block are not strictly smaller than the
  current maximum; only those are inspected. The values left after the last
  full block are scanned with a scalar loop.

  \param target The index of the max value in src0; left untouched when num_points is 0
  \param src0 The buffer of data to be analysed; read with aligned 128-bit loads
  \param num_points The number of values in src0 to be analysed
*/
static inline void volk_gnsssdr_8i_index_max_16u_a_sse2(unsigned int* target, const char* src0, unsigned int num_points) {
    if(num_points > 0){
        const unsigned int sse_iters = num_points / 16;
        const unsigned int tail_points = num_points % 16;
        const char* basePtr = src0;
        const char* inputPtr = src0;
        char max = src0[0];
        unsigned int index = 0;
        unsigned int number;
        unsigned int i;
        unsigned short mask;
        __VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16];
        __m128i maxValues, compareResults, currentValues;

        maxValues = _mm_set1_epi8(max);

        for(number = 0; number < sse_iters; number++)
            {
                currentValues = _mm_load_si128((const __m128i*)inputPtr);
                compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
                mask = _mm_movemask_epi8(compareResults);

                /* 0xFFFF means max > every value of the block: skip it. */
                if (mask != 0xFFFF)
                    {
                        _mm_store_si128((__m128i*)currentValuesBuffer, currentValues);

                        /* After inversion, bit i is set when value i is a
                           candidate (not strictly smaller than max). */
                        mask = ~mask;
                        i = 0;
                        while (mask > 0)
                            {
                                if ((mask & 1) == 1)
                                    {
                                        if(currentValuesBuffer[i] > max)
                                            {
                                                index = (unsigned int)(inputPtr - basePtr) + i;
                                                max = currentValuesBuffer[i];
                                            }
                                    }
                                i++;
                                mask >>= 1;
                            }
                        maxValues = _mm_set1_epi8(max);
                    }
                inputPtr += 16;
            }

        /* Scan the values that follow the last full block. inputPtr now
           points at the first of them, so the reported index is absolute. */
        for(i = 0; i < tail_points; ++i)
            {
                if(inputPtr[i] > max)
                    {
                        index = (unsigned int)(inputPtr - basePtr) + i;
                        max = inputPtr[i];
                    }
            }

        target[0] = index;
    }
}
#endif /*LV_HAVE_SSE2*/
#ifdef LV_HAVE_GENERIC
/*!
  \brief Returns the index of the max value in src0 with a plain scalar scan

  The comparison is strict, so when the maximum occurs several times the
  index of its first occurrence is reported.

  \param target The index of the max value in src0; left untouched when num_points is 0
  \param src0 The buffer of data to be analysed
  \param num_points The number of values in src0 to be analysed
*/
static inline void volk_gnsssdr_8i_index_max_16u_a_generic(unsigned int* target, const char* src0, unsigned int num_points) {
    unsigned int winner = 0;
    unsigned int candidate;

    if(num_points == 0)
        {
            return;
        }

    /* Track only the winning position; its value is re-read from src0. */
    for(candidate = 1; candidate < num_points; ++candidate)
        {
            if(src0[candidate] > src0[winner])
                {
                    winner = candidate;
                }
        }

    *target = winner;
}
#endif /*LV_HAVE_GENERIC*/
#endif /*INCLUDED_volk_gnsssdr_8i_index_max_16u_a_H*/

View File

@ -0,0 +1,327 @@
/*!
* \file volk_gnsssdr_8i_max_s8i.h
* \brief Volk protokernel: calculates the maximum value in a group of 8 bits (char) scalars
* \authors <ul>
* <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
* </ul>
*
* Volk protokernel that returns the maximum value of a group of 8 bits (char) scalars
*
* -------------------------------------------------------------------------
*
* Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
*
* GNSS-SDR is a software defined Global Navigation
* Satellite Systems receiver
*
* This file is part of GNSS-SDR.
*
* GNSS-SDR is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* GNSS-SDR is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
*
* -------------------------------------------------------------------------
*/
#ifndef INCLUDED_volk_gnsssdr_8i_max_s8i_u_H
#define INCLUDED_volk_gnsssdr_8i_max_s8i_u_H
#include <volk_gnsssdr/volk_gnsssdr_common.h>
#include <inttypes.h>
#include <stdio.h>
#ifdef LV_HAVE_SSE4_1
#include<smmintrin.h>
/*!
  \brief Computes the max value in src0 (SSE4.1, unaligned loads)

  Sixteen running maxima are kept in a vector, reduced to one value, and then
  the values that follow the last full 16-value block are scanned.

  NOTE(review): target is passed by value, so the computed maximum is not
  visible to the caller. Returning it would need a char* parameter, which
  changes the kernel signature.

  \param target The max value in src0 (by value - see note above)
  \param src0 The buffer of data to be analysed (no alignment required)
  \param num_points The number of values in src0 to be analysed
*/
static inline void volk_gnsssdr_8i_max_s8i_u_sse4_1(char target, const char* src0, unsigned int num_points) {
    if(num_points > 0){
        const unsigned int sse_iters = num_points / 16;
        const unsigned int tail_points = num_points % 16;
        const char* inputPtr = src0;
        char max = src0[0];
        unsigned int number;
        unsigned int i;
        __VOLK_ATTR_ALIGNED(16) char maxValuesBuffer[16];
        __m128i maxValues, compareResults, currentValues;

        maxValues = _mm_set1_epi8(max);

        for(number = 0; number < sse_iters; number++)
            {
                currentValues = _mm_loadu_si128((const __m128i*)inputPtr);
                compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
                /* Keep the old maximum where it is larger, else take the new value. */
                maxValues = _mm_blendv_epi8(currentValues, maxValues, compareResults);
                inputPtr += 16;
            }

        /* Reduce the sixteen per-lane maxima. */
        _mm_storeu_si128((__m128i*)maxValuesBuffer, maxValues);
        for(i = 0; i < 16; ++i)
            {
                if(maxValuesBuffer[i] > max)
                    {
                        max = maxValuesBuffer[i];
                    }
            }

        /* Scan the values that follow the last full block; inputPtr now
           points at the first of them. */
        for(i = 0; i < tail_points; ++i)
            {
                if(inputPtr[i] > max)
                    {
                        max = inputPtr[i];
                    }
            }

        target = max;
    }
}
#endif /*LV_HAVE_SSE4_1*/
#ifdef LV_HAVE_SSE2
#include<xmmintrin.h>
/*!
  \brief Computes the max value in src0 (SSE2, unaligned loads)

  The buffer is scanned 16 values at a time; the byte mask of the vector
  compare selects the values of a block that may exceed the current maximum.
  The values that follow the last full block are scanned with a scalar loop.

  NOTE(review): target is passed by value, so the computed maximum is not
  visible to the caller. Returning it would need a char* parameter, which
  changes the kernel signature.

  \param target The max value in src0 (by value - see note above)
  \param src0 The buffer of data to be analysed (no alignment required)
  \param num_points The number of values in src0 to be analysed
*/
static inline void volk_gnsssdr_8i_max_s8i_u_sse2(char target, const char* src0, unsigned int num_points) {
    if(num_points > 0){
        const unsigned int sse_iters = num_points / 16;
        const unsigned int tail_points = num_points % 16;
        const char* inputPtr = src0;
        char max = src0[0];
        unsigned int number;
        unsigned int i;
        unsigned short mask;
        __VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16];
        __m128i maxValues, compareResults, currentValues;

        maxValues = _mm_set1_epi8(max);

        for(number = 0; number < sse_iters; number++)
            {
                currentValues = _mm_loadu_si128((const __m128i*)inputPtr);
                compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
                mask = _mm_movemask_epi8(compareResults);

                /* 0xFFFF means max > every value of the block: skip it. */
                if (mask != 0xFFFF)
                    {
                        _mm_storeu_si128((__m128i*)currentValuesBuffer, currentValues);

                        /* After inversion, bit i is set when value i is a
                           candidate (not strictly smaller than max). */
                        mask = ~mask;
                        i = 0;
                        while (mask > 0)
                            {
                                if ((mask & 1) == 1)
                                    {
                                        if(currentValuesBuffer[i] > max)
                                            {
                                                max = currentValuesBuffer[i];
                                            }
                                    }
                                i++;
                                mask >>= 1;
                            }
                        maxValues = _mm_set1_epi8(max);
                    }
                inputPtr += 16;
            }

        /* Scan the values that follow the last full block; inputPtr now
           points at the first of them. */
        for(i = 0; i < tail_points; ++i)
            {
                if(inputPtr[i] > max)
                    {
                        max = inputPtr[i];
                    }
            }

        target = max;
    }
}
#endif /*LV_HAVE_SSE2*/
#ifdef LV_HAVE_GENERIC
/*!
  \brief Computes the max value in src0 with a plain scalar scan

  NOTE(review): target is passed by value, so the computed maximum is not
  visible to the caller. Returning it would need a char* parameter, which
  changes the kernel signature.

  \param target The max value in src0 (by value - see note above)
  \param src0 The buffer of data to be analysed
  \param num_points The number of values in src0 to be analysed
*/
static inline void volk_gnsssdr_8i_max_s8i_generic(char target, const char* src0, unsigned int num_points) {
    const char* scan;
    const char* stop;
    char largest;

    if(num_points == 0)
        {
            return;
        }

    largest = src0[0];
    stop = src0 + num_points;
    for(scan = src0 + 1; scan != stop; ++scan)
        {
            if(*scan > largest)
                {
                    largest = *scan;
                }
        }

    target = largest;
}
#endif /*LV_HAVE_GENERIC*/
#endif /*INCLUDED_volk_gnsssdr_8i_max_s8i_u_H*/
#ifndef INCLUDED_volk_gnsssdr_8i_max_s8i_a_H
#define INCLUDED_volk_gnsssdr_8i_max_s8i_a_H
#include <volk_gnsssdr/volk_gnsssdr_common.h>
#include <inttypes.h>
#include <stdio.h>
#ifdef LV_HAVE_SSE4_1
#include "smmintrin.h"
/*!
  \brief Computes the max value in src0 (SSE4.1, aligned loads)

  Sixteen running maxima are kept in a vector, reduced to one value, and then
  the values that follow the last full 16-value block are scanned.

  NOTE(review): target is passed by value, so the computed maximum is not
  visible to the caller. Returning it would need a char* parameter, which
  changes the kernel signature.

  \param target The max value in src0 (by value - see note above)
  \param src0 The buffer of data to be analysed; read with aligned 128-bit loads
  \param num_points The number of values in src0 to be analysed
*/
static inline void volk_gnsssdr_8i_max_s8i_a_sse4_1(char target, const char* src0, unsigned int num_points) {
    if(num_points > 0){
        const unsigned int sse_iters = num_points / 16;
        const unsigned int tail_points = num_points % 16;
        const char* inputPtr = src0;
        char max = src0[0];
        unsigned int number;
        unsigned int i;
        __VOLK_ATTR_ALIGNED(16) char maxValuesBuffer[16];
        __m128i maxValues, compareResults, currentValues;

        maxValues = _mm_set1_epi8(max);

        for(number = 0; number < sse_iters; number++)
            {
                currentValues = _mm_load_si128((const __m128i*)inputPtr);
                compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
                /* Keep the old maximum where it is larger, else take the new value. */
                maxValues = _mm_blendv_epi8(currentValues, maxValues, compareResults);
                inputPtr += 16;
            }

        /* Reduce the sixteen per-lane maxima. */
        _mm_store_si128((__m128i*)maxValuesBuffer, maxValues);
        for(i = 0; i < 16; ++i)
            {
                if(maxValuesBuffer[i] > max)
                    {
                        max = maxValuesBuffer[i];
                    }
            }

        /* Scan the values that follow the last full block; inputPtr now
           points at the first of them. */
        for(i = 0; i < tail_points; ++i)
            {
                if(inputPtr[i] > max)
                    {
                        max = inputPtr[i];
                    }
            }

        target = max;
    }
}
#endif /*LV_HAVE_SSE4_1*/
#ifdef LV_HAVE_SSE2
#include "emmintrin.h"
/*!
  \brief Computes the max value in src0 (SSE2, aligned loads)

  The buffer is scanned 16 values at a time; the byte mask of the vector
  compare selects the values of a block that may exceed the current maximum.
  The values that follow the last full block are scanned with a scalar loop.

  NOTE(review): target is passed by value, so the computed maximum is not
  visible to the caller. Returning it would need a char* parameter, which
  changes the kernel signature.

  \param target The max value in src0 (by value - see note above)
  \param src0 The buffer of data to be analysed; read with aligned 128-bit loads
  \param num_points The number of values in src0 to be analysed
*/
static inline void volk_gnsssdr_8i_max_s8i_a_sse2(char target, const char* src0, unsigned int num_points) {
    if(num_points > 0){
        const unsigned int sse_iters = num_points / 16;
        const unsigned int tail_points = num_points % 16;
        const char* inputPtr = src0;
        char max = src0[0];
        unsigned int number;
        unsigned int i;
        unsigned short mask;
        __VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16];
        __m128i maxValues, compareResults, currentValues;

        maxValues = _mm_set1_epi8(max);

        for(number = 0; number < sse_iters; number++)
            {
                currentValues = _mm_load_si128((const __m128i*)inputPtr);
                compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
                mask = _mm_movemask_epi8(compareResults);

                /* 0xFFFF means max > every value of the block: skip it. */
                if (mask != 0xFFFF)
                    {
                        _mm_store_si128((__m128i*)currentValuesBuffer, currentValues);

                        /* After inversion, bit i is set when value i is a
                           candidate (not strictly smaller than max). */
                        mask = ~mask;
                        i = 0;
                        while (mask > 0)
                            {
                                if ((mask & 1) == 1)
                                    {
                                        if(currentValuesBuffer[i] > max)
                                            {
                                                max = currentValuesBuffer[i];
                                            }
                                    }
                                i++;
                                mask >>= 1;
                            }
                        maxValues = _mm_set1_epi8(max);
                    }
                inputPtr += 16;
            }

        /* Scan the values that follow the last full block; inputPtr now
           points at the first of them. */
        for(i = 0; i < tail_points; ++i)
            {
                if(inputPtr[i] > max)
                    {
                        max = inputPtr[i];
                    }
            }

        target = max;
    }
}
#endif /*LV_HAVE_SSE2*/
#ifdef LV_HAVE_GENERIC
/*!
  \brief Computes the max value in src0 with a plain scalar scan

  NOTE(review): target is passed by value, so the computed maximum is not
  visible to the caller. Returning it would need a char* parameter, which
  changes the kernel signature.

  \param target The max value in src0 (by value - see note above)
  \param src0 The buffer of data to be analysed
  \param num_points The number of values in src0 to be analysed
*/
static inline void volk_gnsssdr_8i_max_s8i_a_generic(char target, const char* src0, unsigned int num_points) {
    unsigned int pos;
    char largest;

    if(num_points == 0)
        {
            return;
        }

    largest = src0[0];
    for(pos = 1; pos < num_points; pos++)
        {
            largest = (src0[pos] > largest) ? src0[pos] : largest;
        }

    target = largest;
}
#endif /*LV_HAVE_GENERIC*/
#endif /*INCLUDED_volk_gnsssdr_8i_max_s8i_a_H*/

View File

@ -0,0 +1,184 @@
/*!
* \file volk_gnsssdr_8i_x2_add_8i.h
* \brief Volk protokernel: adds pairs of 8 bits (char) scalars
* \authors <ul>
* <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
* </ul>
*
* Volk protokernel that adds pairs of 8 bits (char) scalars
*
* -------------------------------------------------------------------------
*
* Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
*
* GNSS-SDR is a software defined Global Navigation
* Satellite Systems receiver
*
* This file is part of GNSS-SDR.
*
* GNSS-SDR is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* GNSS-SDR is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
*
* -------------------------------------------------------------------------
*/
#ifndef INCLUDED_volk_gnsssdr_8i_x2_add_8i_u_H
#define INCLUDED_volk_gnsssdr_8i_x2_add_8i_u_H
#include <inttypes.h>
#include <stdio.h>
#ifdef LV_HAVE_SSE2
#include "pmmintrin.h"
/*!
  \brief Adds the two input vectors and store their results in the third vector (SSE2, unaligned access)

  Sixteen values are added per iteration with _mm_add_epi8; the values that
  do not fill a whole vector are added one by one.

  \param cVector The vector where the results will be stored
  \param aVector One of the vectors to be added
  \param bVector One of the vectors to be added
  \param num_points The number of values in aVector and bVector to be added together and stored into cVector
*/
static inline void volk_gnsssdr_8i_x2_add_8i_u_sse2(char* cVector, const char* aVector, const char* bVector, unsigned int num_points){
    const unsigned int sse_iters = num_points / 16;
    const unsigned int tail_points = num_points % 16;
    char* cPtr = cVector;
    const char* aPtr = aVector;
    const char* bPtr = bVector;
    __m128i aVal, bVal, cVal;
    unsigned int number;

    for(number = 0; number < sse_iters; number++)
        {
            /* _mm_loadu_si128 is the SSE2 unaligned load; _mm_lddqu_si128 is
               an SSE3 instruction and must not appear in an SSE2 kernel. */
            aVal = _mm_loadu_si128((const __m128i*)aPtr);
            bVal = _mm_loadu_si128((const __m128i*)bPtr);
            cVal = _mm_add_epi8(aVal, bVal);
            _mm_storeu_si128((__m128i*)cPtr, cVal); /* Store the results back into the C container */
            aPtr += 16;
            bPtr += 16;
            cPtr += 16;
        }

    for(number = 0; number < tail_points; ++number)
        {
            *cPtr++ = (*aPtr++) + (*bPtr++);
        }
}
#endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_GENERIC
/*!
  \brief Adds the two input vectors and store their results in the third vector (scalar loop)
  \param cVector The vector where the results will be stored
  \param aVector One of the vectors to be added
  \param bVector One of the vectors to be added
  \param num_points The number of values in aVector and bVector to be added together and stored into cVector
*/
static inline void volk_gnsssdr_8i_x2_add_8i_generic(char* cVector, const char* aVector, const char* bVector, unsigned int num_points){
    unsigned int k;

    /* Element k of the output is the sum of element k of both inputs. */
    for(k = 0; k < num_points; ++k)
        {
            cVector[k] = aVector[k] + bVector[k];
        }
}
#endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_volk_gnsssdr_8i_x2_add_8i_u_H */
#ifndef INCLUDED_volk_gnsssdr_8i_x2_add_8i_a_H
#define INCLUDED_volk_gnsssdr_8i_x2_add_8i_a_H
#include <inttypes.h>
#include <stdio.h>
#ifdef LV_HAVE_SSE2
#include "pmmintrin.h"
/*!
  \brief Element-wise sum of two vectors of 8-bit values (SSE2, aligned loads and stores)

  Uses _mm_load_si128 / _mm_store_si128, so the three buffers must be 16-byte aligned.
  \param cVector Output vector; receives aVector[i] + bVector[i] for every i
  \param aVector First input vector
  \param bVector Second input vector
  \param num_points Number of values read from aVector and bVector and written to cVector
*/
static inline void volk_gnsssdr_8i_x2_add_8i_a_sse2(char* cVector, const char* aVector, const char* bVector, unsigned int num_points){
    const unsigned int full_blocks = num_points / 16;
    const unsigned int leftover = num_points % 16;

    const char* lhs = aVector;
    const char* rhs = bVector;
    char* out = cVector;
    unsigned int k;

    // whole 128-bit registers, 16 values at a time
    for(k = 0; k < full_blocks; ++k){
        const __m128i x = _mm_load_si128((__m128i*)lhs);
        const __m128i y = _mm_load_si128((__m128i*)rhs);
        _mm_store_si128((__m128i*)out, _mm_add_epi8(x, y));
        lhs += 16;
        rhs += 16;
        out += 16;
    }

    // remaining values that do not fill a register
    for(k = 0; k < leftover; ++k){
        out[k] = lhs[k] + rhs[k];
    }
}
#endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_GENERIC
/*!
  \brief Element-wise sum of two vectors of 8-bit values (portable C implementation, aligned entry point)
  \param cVector Output vector; receives aVector[i] + bVector[i] for every i
  \param aVector First input vector
  \param bVector Second input vector
  \param num_points Number of values read from aVector and bVector and written to cVector
*/
static inline void volk_gnsssdr_8i_x2_add_8i_a_generic(char* cVector, const char* aVector, const char* bVector, unsigned int num_points){
    const char* lhs = aVector;
    const char* rhs = bVector;
    char* out = cVector;
    unsigned int remaining = num_points;

    while(remaining > 0){
        *out = *lhs + *rhs;
        ++out;
        ++lhs;
        ++rhs;
        --remaining;
    }
}
#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_ORC
/*!
\brief Adds the two input vectors and store their results in the third vector (ORC version)

Thin wrapper that forwards all of its arguments, unchanged, to the externally
defined volk_gnsssdr_8i_x2_add_8i_a_orc_impl.
\param cVector The vector where the results will be stored
\param aVector One of the vectors to be added
\param bVector One of the vectors to be added
\param num_points The number of values in aVector and bVector to be added together and stored into cVector
*/
/* Implementation is provided outside this header. */
extern void volk_gnsssdr_8i_x2_add_8i_a_orc_impl(char* cVector, const char* aVector, const char* bVector, unsigned int num_points);
/* NOTE(review): the wrapper is named *_u_orc although it lives in the aligned (_a_H)
   section and calls *_a_orc_impl; presumably this follows the VOLK ORC naming
   convention, confirm before renaming. */
static inline void volk_gnsssdr_8i_x2_add_8i_u_orc(char* cVector, const char* aVector, const char* bVector, unsigned int num_points){
volk_gnsssdr_8i_x2_add_8i_a_orc_impl(cVector, aVector, bVector, num_points);
}
#endif /* LV_HAVE_ORC */
#endif /* INCLUDED_volk_gnsssdr_8i_x2_add_8i_a_H */

View File

@ -0,0 +1,326 @@
/*!
* \file volk_gnsssdr_8ic_conjugate_8ic.h
* \brief Volk protokernel: calculates the conjugate of a 16 bits vector
* \authors <ul>
* <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
* </ul>
*
* Volk protokernel that calculates the conjugate of a
* 16 bits vector (8 bits the real part and 8 bits the imaginary part)
*
* -------------------------------------------------------------------------
*
* Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
*
* GNSS-SDR is a software defined Global Navigation
* Satellite Systems receiver
*
* This file is part of GNSS-SDR.
*
* GNSS-SDR is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* GNSS-SDR is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
*
* -------------------------------------------------------------------------
*/
#ifndef INCLUDED_volk_gnsssdr_8ic_conjugate_8ic_u_H
#define INCLUDED_volk_gnsssdr_8ic_conjugate_8ic_u_H
#include <inttypes.h>
#include <stdio.h>
#include <volk_gnsssdr/volk_gnsssdr_complex.h>
#ifdef LV_HAVE_AVX
#include "immintrin.h"
/*!
  \brief Takes the conjugate of a complex vector with 8-bit real and 8-bit imaginary parts (AVX, no alignment required)
  \param cVector The vector where the results will be stored
  \param aVector Vector to be conjugated
  \param num_points The number of complex values in aVector to be conjugated and stored into cVector
*/
static inline void volk_gnsssdr_8ic_conjugate_8ic_u_avx(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){
    // one 256-bit register is processed per iteration; both pointers advance by 16 elements
    const unsigned int sse_iters = num_points / 16;

    lv_8sc_t* c = cVector;
    const lv_8sc_t* a = aVector;

    __m256 tmp;
    __m128i tmp128lo, tmp128hi;
    // XOR mask that inverts every bit of the odd bytes and leaves the even bytes untouched
    __m256 conjugator1 = _mm256_castsi256_ps(_mm256_setr_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255));
    // +1 on the odd bytes: together with the inversion this is a two's complement negation
    __m128i conjugator2 = _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1);

    for (unsigned int i = 0; i < sse_iters; ++i)
    {
        tmp = _mm256_loadu_ps((float*)a);
        tmp = _mm256_xor_ps(tmp, conjugator1);

        // the byte-wise add is done with 128-bit integer instructions on each half of the register
        tmp128lo = _mm256_castsi256_si128(_mm256_castps_si256(tmp));
        tmp128lo = _mm_add_epi8(tmp128lo, conjugator2);
        tmp128hi = _mm256_extractf128_si256(_mm256_castps_si256(tmp), 1);
        tmp128hi = _mm_add_epi8(tmp128hi, conjugator2);

        //tmp = _mm256_set_m128i(tmp128hi , tmp128lo); //not defined in some versions of immintrin.h
        // _mm256_insertf128_si256 returns __m256i: cast explicitly, an implicit
        // __m256i -> __m256 conversion is rejected unless lax vector conversions are enabled
        tmp = _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(tmp128lo), tmp128hi, 1));
        _mm256_storeu_ps((float*)c, tmp);

        a += 16;
        c += 16;
    }

    // scalar tail for the num_points % 16 values left over
    for (unsigned int i = 0; i < (num_points % 16); ++i)
    {
        *c++ = lv_conj(*a++);
    }
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_SSSE3
#include "tmmintrin.h"
/*!
  \brief Conjugates a complex vector with 8-bit real and 8-bit imaginary parts (SSSE3, unaligned loads and stores)
  \param cVector Output vector receiving the conjugated values
  \param aVector Input vector to be conjugated
  \param num_points Number of complex values read from aVector and written to cVector
*/
static inline void volk_gnsssdr_8ic_conjugate_8ic_u_ssse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){
    const unsigned int blocks = num_points / 8;
    const unsigned int rest = num_points % 8;

    const lv_8sc_t* src = aVector;
    lv_8sc_t* dst = cVector;
    unsigned int n;

    // _mm_sign_epi8 negates the bytes whose pattern entry is negative: here, every odd byte
    const __m128i sign_pattern = _mm_setr_epi8(1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1);

    for (n = 0; n < blocks; ++n)
    {
        const __m128i in = _mm_lddqu_si128((__m128i*)src);
        _mm_storeu_si128((__m128i*)dst, _mm_sign_epi8(in, sign_pattern));
        src += 8;
        dst += 8;
    }

    // values that do not fill a whole register
    for (n = 0; n < rest; ++n)
    {
        dst[n] = lv_conj(src[n]);
    }
}
#endif /* LV_HAVE_SSSE3 */
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
/*!
  \brief Conjugates a complex vector with 8-bit real and 8-bit imaginary parts (SSE3, unaligned loads and stores)
  \param cVector Output vector receiving the conjugated values
  \param aVector Input vector to be conjugated
  \param num_points Number of complex values read from aVector and written to cVector
*/
static inline void volk_gnsssdr_8ic_conjugate_8ic_u_sse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){
    const unsigned int blocks = num_points / 8;
    const unsigned int rest = num_points % 8;

    const lv_8sc_t* src = aVector;
    lv_8sc_t* dst = cVector;
    unsigned int n;

    // Negation of the odd bytes in two steps: invert all their bits, then add one
    const __m128i invert_odd = _mm_setr_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
    const __m128i one_on_odd = _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1);

    for (n = 0; n < blocks; ++n)
    {
        __m128i v = _mm_lddqu_si128((__m128i*)src);
        v = _mm_add_epi8(_mm_xor_si128(v, invert_odd), one_on_odd);
        _mm_storeu_si128((__m128i*)dst, v);
        src += 8;
        dst += 8;
    }

    // values that do not fill a whole register
    for (n = 0; n < rest; ++n)
    {
        dst[n] = lv_conj(src[n]);
    }
}
#endif /* LV_HAVE_SSE3 */
#ifdef LV_HAVE_GENERIC
/*!
  \brief Conjugates a complex vector with 8-bit real and 8-bit imaginary parts (portable C implementation)
  \param cVector Output vector receiving lv_conj(aVector[i]) for every i
  \param aVector Input vector to be conjugated
  \param num_points Number of complex values read from aVector and written to cVector
*/
static inline void volk_gnsssdr_8ic_conjugate_8ic_generic(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){
    unsigned int idx;

    for(idx = 0; idx != num_points; ++idx){
        cVector[idx] = lv_conj(aVector[idx]);
    }
}
#endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_volk_gnsssdr_8ic_conjugate_8ic_u_H */
#ifndef INCLUDED_volk_gnsssdr_8ic_conjugate_8ic_a_H
#define INCLUDED_volk_gnsssdr_8ic_conjugate_8ic_a_H
#include <inttypes.h>
#include <stdio.h>
#include <volk_gnsssdr/volk_gnsssdr_complex.h>
#ifdef LV_HAVE_AVX
#include "immintrin.h"
/*!
  \brief Takes the conjugate of a complex vector with 8-bit real and 8-bit imaginary parts (AVX, aligned loads and stores)

  Uses _mm256_load_ps / _mm256_store_ps, so both buffers must be 32-byte aligned.
  \param cVector The vector where the results will be stored
  \param aVector Vector to be conjugated
  \param num_points The number of complex values in aVector to be conjugated and stored into cVector
*/
static inline void volk_gnsssdr_8ic_conjugate_8ic_a_avx(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){
    // one 256-bit register is processed per iteration; both pointers advance by 16 elements
    const unsigned int sse_iters = num_points / 16;

    lv_8sc_t* c = cVector;
    const lv_8sc_t* a = aVector;

    __m256 tmp;
    __m128i tmp128lo, tmp128hi;
    // XOR mask that inverts every bit of the odd bytes and leaves the even bytes untouched
    __m256 conjugator1 = _mm256_castsi256_ps(_mm256_setr_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255));
    // +1 on the odd bytes: together with the inversion this is a two's complement negation
    __m128i conjugator2 = _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1);

    for (unsigned int i = 0; i < sse_iters; ++i)
    {
        tmp = _mm256_load_ps((float*)a);
        tmp = _mm256_xor_ps(tmp, conjugator1);

        // the byte-wise add is done with 128-bit integer instructions on each half of the register
        tmp128lo = _mm256_castsi256_si128(_mm256_castps_si256(tmp));
        tmp128lo = _mm_add_epi8(tmp128lo, conjugator2);
        tmp128hi = _mm256_extractf128_si256(_mm256_castps_si256(tmp), 1);
        tmp128hi = _mm_add_epi8(tmp128hi, conjugator2);

        //tmp = _mm256_set_m128i(tmp128hi , tmp128lo); //not defined in some versions of immintrin.h
        // _mm256_insertf128_si256 returns __m256i: cast explicitly, an implicit
        // __m256i -> __m256 conversion is rejected unless lax vector conversions are enabled
        tmp = _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(tmp128lo), tmp128hi, 1));
        _mm256_store_ps((float*)c, tmp);

        a += 16;
        c += 16;
    }

    // scalar tail for the num_points % 16 values left over
    for (unsigned int i = 0; i < (num_points % 16); ++i)
    {
        *c++ = lv_conj(*a++);
    }
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_SSSE3
#include "tmmintrin.h"
/*!
  \brief Conjugates a complex vector with 8-bit real and 8-bit imaginary parts (SSSE3, aligned loads and stores)

  Uses _mm_load_si128 / _mm_store_si128, so both buffers must be 16-byte aligned.
  \param cVector Output vector receiving the conjugated values
  \param aVector Input vector to be conjugated
  \param num_points Number of complex values read from aVector and written to cVector
*/
static inline void volk_gnsssdr_8ic_conjugate_8ic_a_ssse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){
    const unsigned int blocks = num_points / 8;
    const unsigned int rest = num_points % 8;

    const lv_8sc_t* src = aVector;
    lv_8sc_t* dst = cVector;
    unsigned int n;

    // _mm_sign_epi8 negates the bytes whose pattern entry is negative: here, every odd byte
    const __m128i sign_pattern = _mm_setr_epi8(1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1);

    for (n = 0; n < blocks; ++n)
    {
        const __m128i in = _mm_load_si128((__m128i*)src);
        _mm_store_si128((__m128i*)dst, _mm_sign_epi8(in, sign_pattern));
        src += 8;
        dst += 8;
    }

    // values that do not fill a whole register
    for (n = 0; n < rest; ++n)
    {
        dst[n] = lv_conj(src[n]);
    }
}
#endif /* LV_HAVE_SSSE3 */
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
/*!
  \brief Conjugates a complex vector with 8-bit real and 8-bit imaginary parts (SSE3 section, aligned loads and stores)

  Uses _mm_load_si128 / _mm_store_si128, so both buffers must be 16-byte aligned.
  \param cVector Output vector receiving the conjugated values
  \param aVector Input vector to be conjugated
  \param num_points Number of complex values read from aVector and written to cVector
*/
static inline void volk_gnsssdr_8ic_conjugate_8ic_a_sse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){
    const unsigned int blocks = num_points / 8;
    const unsigned int rest = num_points % 8;

    const lv_8sc_t* src = aVector;
    lv_8sc_t* dst = cVector;
    unsigned int n;

    // Negation of the odd bytes in two steps: invert all their bits, then add one
    const __m128i invert_odd = _mm_setr_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
    const __m128i one_on_odd = _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1);

    for (n = 0; n < blocks; ++n)
    {
        __m128i v = _mm_load_si128((__m128i*)src);
        v = _mm_add_epi8(_mm_xor_si128(v, invert_odd), one_on_odd);
        _mm_store_si128((__m128i*)dst, v);
        src += 8;
        dst += 8;
    }

    // values that do not fill a whole register
    for (n = 0; n < rest; ++n)
    {
        dst[n] = lv_conj(src[n]);
    }
}
#endif /* LV_HAVE_SSE3 */
#ifdef LV_HAVE_GENERIC
/*!
  \brief Conjugates a complex vector with 8-bit real and 8-bit imaginary parts (portable C implementation, aligned entry point)
  \param cVector Output vector receiving lv_conj(aVector[i]) for every i
  \param aVector Input vector to be conjugated
  \param num_points Number of complex values read from aVector and written to cVector
*/
static inline void volk_gnsssdr_8ic_conjugate_8ic_a_generic(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){
    const lv_8sc_t* src = aVector;
    lv_8sc_t* dst = cVector;
    unsigned int remaining = num_points;

    while(remaining > 0){
        *dst = lv_conj(*src);
        ++dst;
        ++src;
        --remaining;
    }
}
#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_ORC
/*!
\brief Takes the conjugate of a complex vector with 8-bit real and 8-bit imaginary parts (ORC version)

Thin wrapper that forwards all of its arguments, unchanged, to the externally
defined volk_gnsssdr_8ic_conjugate_8ic_a_orc_impl.
\param cVector The vector where the results will be stored
\param aVector Vector to be conjugated
\param num_points The number of complex values in aVector to be conjugated and stored into cVector
*/
/* Implementation is provided outside this header. */
extern void volk_gnsssdr_8ic_conjugate_8ic_a_orc_impl(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points);
/* NOTE(review): the wrapper is named *_u_orc although it lives in the aligned (_a_H)
   section and calls *_a_orc_impl; presumably this follows the VOLK ORC naming
   convention, confirm before renaming. */
static inline void volk_gnsssdr_8ic_conjugate_8ic_u_orc(lv_8sc_t* cVector, const lv_8sc_t* aVector, unsigned int num_points){
volk_gnsssdr_8ic_conjugate_8ic_a_orc_impl(cVector, aVector, num_points);
}
#endif /* LV_HAVE_ORC */
#endif /* INCLUDED_volk_gnsssdr_8ic_conjugate_8ic_a_H */

View File

@ -0,0 +1,320 @@
/*!
* \file volk_gnsssdr_8ic_magnitude_squared_8i.h
* \brief Volk protokernel: calculates the magnitude squared of a 16 bits vector
* \authors <ul>
* <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
* </ul>
*
* Volk protokernel that calculates the magnitude squared of a
* 16 bits vector (8 bits the real part and 8 bits the imaginary part)
* result = (real*real) + (imag*imag)
*
* -------------------------------------------------------------------------
*
* Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
*
* GNSS-SDR is a software defined Global Navigation
* Satellite Systems receiver
*
* This file is part of GNSS-SDR.
*
* GNSS-SDR is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* GNSS-SDR is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
*
* -------------------------------------------------------------------------
*/
#ifndef INCLUDED_volk_gnsssdr_8ic_magnitude_squared_8i_u_H
#define INCLUDED_volk_gnsssdr_8ic_magnitude_squared_8i_u_H
#include <inttypes.h>
#include <stdio.h>
#include <math.h>
#ifdef LV_HAVE_SSSE3
#include <pmmintrin.h>
#include <tmmintrin.h>
/*!
  \brief Calculates the magnitude squared of complexVector and stores the results in magnitudeVector (unaligned loads and stores)

  Each output is (real*real) + (imag*imag); the vectorized path keeps only the
  low 8 bits of every sum.

  The kernel keeps its "_sse3" suffix so existing callers are unaffected, but it
  relies on _mm_hadd_epi16 and _mm_shuffle_epi8, which are SSSE3 instructions.
  It is therefore guarded by LV_HAVE_SSSE3 rather than LV_HAVE_SSE3.
  \param magnitudeVector The vector where the real output values are stored
  \param complexVector The vector containing the complex input values
  \param num_points The number of complex values in complexVector to be processed and stored into magnitudeVector
*/
static inline void volk_gnsssdr_8ic_magnitude_squared_8i_u_sse3(char* magnitudeVector, const lv_8sc_t* complexVector, unsigned int num_points){
    // 16 complex values (two 128-bit loads) are consumed per iteration
    const unsigned int sse_iters = num_points / 16;

    const char* complexVectorPtr = (char*)complexVector;
    char* magnitudeVectorPtr = magnitudeVector;

    __m128i zero, result8;
    __m128i avector, avectorhi, avectorlo, avectorlomult, avectorhimult, aadded, maska;
    __m128i bvector, bvectorhi, bvectorlo, bvectorlomult, bvectorhimult, badded, maskb;

    zero = _mm_setzero_si128();
    // Shuffle masks selecting the low byte of each 16-bit sum (0x80 yields a zero byte):
    // results of the first load land in output bytes 0-7, those of the second in bytes 8-15
    maska = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
    maskb = _mm_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);

    for(unsigned int number = 0; number < sse_iters; number++)
    {
        // First 8 complex values: widen each byte to 16 bits, square, then add
        // adjacent lanes (real^2 + imag^2). Zero extension is sufficient because
        // only the low 8 bits of each sum are kept.
        avector = _mm_lddqu_si128((__m128i*)complexVectorPtr);
        avectorlo = _mm_unpacklo_epi8 (avector, zero);
        avectorhi = _mm_unpackhi_epi8 (avector, zero);
        avectorlomult = _mm_mullo_epi16 (avectorlo, avectorlo);
        avectorhimult = _mm_mullo_epi16 (avectorhi, avectorhi);
        aadded = _mm_hadd_epi16 (avectorlomult, avectorhimult);

        complexVectorPtr += 16;

        // Next 8 complex values, same processing
        bvector = _mm_lddqu_si128((__m128i*)complexVectorPtr);
        bvectorlo = _mm_unpacklo_epi8 (bvector, zero);
        bvectorhi = _mm_unpackhi_epi8 (bvector, zero);
        bvectorlomult = _mm_mullo_epi16 (bvectorlo, bvectorlo);
        bvectorhimult = _mm_mullo_epi16 (bvectorhi, bvectorhi);
        badded = _mm_hadd_epi16 (bvectorlomult, bvectorhimult);

        complexVectorPtr += 16;

        // Pack the sixteen 8-bit results into one register and store them
        result8 = _mm_or_si128(_mm_shuffle_epi8(aadded, maska), _mm_shuffle_epi8(badded, maskb));

        _mm_storeu_si128((__m128i*)magnitudeVectorPtr, result8);

        magnitudeVectorPtr += 16;
    }

    // scalar tail for the num_points % 16 values left over
    for (unsigned int i = 0; i < (num_points % 16); ++i)
    {
        const char valReal = *complexVectorPtr++;
        const char valImag = *complexVectorPtr++;
        *magnitudeVectorPtr++ = (valReal * valReal) + (valImag * valImag);
    }
}
#endif /* LV_HAVE_SSSE3 */
//#ifdef LV_HAVE_SSE
//#include <xmmintrin.h>
///*!
// \brief Calculates the magnitude squared of complexVector and stores the results in magnitudeVector
// \param complexVector The vector containing the complex input values
// \param magnitudeVector The vector containing the real output values
// \param num_points The number of complex values in complexVector to be calculated and stored into cVector
// */
//static inline void volk_gnsssdr_8ic_magnitude_squared_8i_u_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
// unsigned int number = 0;
// const unsigned int quarterPoints = num_points / 4;
//
// const float* complexVectorPtr = (float*)complexVector;
// float* magnitudeVectorPtr = magnitudeVector;
//
// __m128 cplxValue1, cplxValue2, iValue, qValue, result;
// for(;number < quarterPoints; number++){
// cplxValue1 = _mm_loadu_ps(complexVectorPtr);
// complexVectorPtr += 4;
//
// cplxValue2 = _mm_loadu_ps(complexVectorPtr);
// complexVectorPtr += 4;
//
// // Arrange in i1i2i3i4 format
// iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
// // Arrange in q1q2q3q4 format
// qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
//
// iValue = _mm_mul_ps(iValue, iValue); // Square the I values
// qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values
//
// result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values
//
// _mm_storeu_ps(magnitudeVectorPtr, result);
// magnitudeVectorPtr += 4;
// }
//
// number = quarterPoints * 4;
// for(; number < num_points; number++){
// float val1Real = *complexVectorPtr++;
// float val1Imag = *complexVectorPtr++;
// *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
// }
//}
//#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_GENERIC
/*!
  \brief Computes (real*real) + (imag*imag) for every complex input value (portable C implementation)
  \param magnitudeVector Output vector receiving one 8-bit result per complex input value
  \param complexVector Input vector of complex values, read as consecutive (real, imag) char pairs
  \param num_points Number of complex values read from complexVector and results written to magnitudeVector
*/
static inline void volk_gnsssdr_8ic_magnitude_squared_8i_generic(char* magnitudeVector, const lv_8sc_t* complexVector, unsigned int num_points){
    const char* src = (char*)complexVector;
    char* dst = magnitudeVector;
    unsigned int remaining = num_points;

    while(remaining > 0){
        const char re = src[0];
        const char im = src[1];
        src += 2;
        *dst++ = (re * re) + (im * im);
        --remaining;
    }
}
#endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_volk_gnsssdr_8ic_magnitude_squared_8i_u_H */
#ifndef INCLUDED_volk_gnsssdr_8ic_magnitude_squared_8i_a_H
#define INCLUDED_volk_gnsssdr_8ic_magnitude_squared_8i_a_H
#include <inttypes.h>
#include <stdio.h>
#include <math.h>
#ifdef LV_HAVE_SSSE3
#include <pmmintrin.h>
#include <tmmintrin.h>
/*!
  \brief Calculates the magnitude squared of complexVector and stores the results in magnitudeVector (aligned loads and stores)

  Each output is (real*real) + (imag*imag); the vectorized path keeps only the
  low 8 bits of every sum. Uses _mm_load_si128 / _mm_store_si128, so both
  buffers must be 16-byte aligned.

  The kernel keeps its "_sse3" suffix so existing callers are unaffected, but it
  relies on _mm_hadd_epi16 and _mm_shuffle_epi8, which are SSSE3 instructions
  declared in tmmintrin.h. It is therefore guarded by LV_HAVE_SSSE3 rather than
  LV_HAVE_SSE3, and this section includes tmmintrin.h itself.
  \param magnitudeVector The vector where the real output values are stored
  \param complexVector The vector containing the complex input values
  \param num_points The number of complex values in complexVector to be processed and stored into magnitudeVector
*/
static inline void volk_gnsssdr_8ic_magnitude_squared_8i_a_sse3(char* magnitudeVector, const lv_8sc_t* complexVector, unsigned int num_points){
    // 16 complex values (two 128-bit loads) are consumed per iteration
    const unsigned int sse_iters = num_points / 16;

    const char* complexVectorPtr = (char*)complexVector;
    char* magnitudeVectorPtr = magnitudeVector;

    __m128i zero, result8;
    __m128i avector, avectorhi, avectorlo, avectorlomult, avectorhimult, aadded, maska;
    __m128i bvector, bvectorhi, bvectorlo, bvectorlomult, bvectorhimult, badded, maskb;

    zero = _mm_setzero_si128();
    // Shuffle masks selecting the low byte of each 16-bit sum (0x80 yields a zero byte):
    // results of the first load land in output bytes 0-7, those of the second in bytes 8-15
    maska = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
    maskb = _mm_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);

    for(unsigned int number = 0; number < sse_iters; number++)
    {
        // First 8 complex values: widen each byte to 16 bits, square, then add
        // adjacent lanes (real^2 + imag^2). Zero extension is sufficient because
        // only the low 8 bits of each sum are kept.
        avector = _mm_load_si128((__m128i*)complexVectorPtr);
        avectorlo = _mm_unpacklo_epi8 (avector, zero);
        avectorhi = _mm_unpackhi_epi8 (avector, zero);
        avectorlomult = _mm_mullo_epi16 (avectorlo, avectorlo);
        avectorhimult = _mm_mullo_epi16 (avectorhi, avectorhi);
        aadded = _mm_hadd_epi16 (avectorlomult, avectorhimult);

        complexVectorPtr += 16;

        // Next 8 complex values, same processing
        bvector = _mm_load_si128((__m128i*)complexVectorPtr);
        bvectorlo = _mm_unpacklo_epi8 (bvector, zero);
        bvectorhi = _mm_unpackhi_epi8 (bvector, zero);
        bvectorlomult = _mm_mullo_epi16 (bvectorlo, bvectorlo);
        bvectorhimult = _mm_mullo_epi16 (bvectorhi, bvectorhi);
        badded = _mm_hadd_epi16 (bvectorlomult, bvectorhimult);

        complexVectorPtr += 16;

        // Pack the sixteen 8-bit results into one register and store them
        result8 = _mm_or_si128(_mm_shuffle_epi8(aadded, maska), _mm_shuffle_epi8(badded, maskb));

        _mm_store_si128((__m128i*)magnitudeVectorPtr, result8);

        magnitudeVectorPtr += 16;
    }

    // scalar tail for the num_points % 16 values left over
    for (unsigned int i = 0; i < (num_points % 16); ++i)
    {
        const char valReal = *complexVectorPtr++;
        const char valImag = *complexVectorPtr++;
        *magnitudeVectorPtr++ = (valReal * valReal) + (valImag * valImag);
    }
}
#endif /* LV_HAVE_SSSE3 */
//#ifdef LV_HAVE_SSE
//#include <xmmintrin.h>
///*!
// \brief Calculates the magnitude squared of complexVector and stores the results in magnitudeVector
// \param complexVector The vector containing the complex input values
// \param magnitudeVector The vector containing the real output values
// \param num_points The number of complex values in complexVector to be calculated and stored into cVector
// */
//static inline void volk_gnsssdr_8ic_magnitude_squared_8i_a_sse(float* magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
// unsigned int number = 0;
// const unsigned int quarterPoints = num_points / 4;
//
// const float* complexVectorPtr = (float*)complexVector;
// float* magnitudeVectorPtr = magnitudeVector;
//
// __m128 cplxValue1, cplxValue2, iValue, qValue, result;
// for(;number < quarterPoints; number++){
// cplxValue1 = _mm_load_ps(complexVectorPtr);
// complexVectorPtr += 4;
//
// cplxValue2 = _mm_load_ps(complexVectorPtr);
// complexVectorPtr += 4;
//
// // Arrange in i1i2i3i4 format
// iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
// // Arrange in q1q2q3q4 format
// qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
//
// iValue = _mm_mul_ps(iValue, iValue); // Square the I values
// qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values
//
// result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values
//
// _mm_store_ps(magnitudeVectorPtr, result);
// magnitudeVectorPtr += 4;
// }
//
// number = quarterPoints * 4;
// for(; number < num_points; number++){
// float val1Real = *complexVectorPtr++;
// float val1Imag = *complexVectorPtr++;
// *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
// }
//}
//#endif /* LV_HAVE_SSE */
#ifdef LV_HAVE_GENERIC
/*!
  \brief Computes (real*real) + (imag*imag) for every complex input value (portable C implementation, aligned entry point)
  \param magnitudeVector Output vector receiving one 8-bit result per complex input value
  \param complexVector Input vector of complex values, read as consecutive (real, imag) char pairs
  \param num_points Number of complex values read from complexVector and results written to magnitudeVector
*/
static inline void volk_gnsssdr_8ic_magnitude_squared_8i_a_generic(char* magnitudeVector, const lv_8sc_t* complexVector, unsigned int num_points){
    const char* src = (char*)complexVector;
    char* dst = magnitudeVector;
    char* const dst_end = magnitudeVector + num_points;

    for(; dst != dst_end; ++dst){
        const char re = *src++;
        const char im = *src++;
        *dst = (re * re) + (im * im);
    }
}
#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_ORC
/*!
\brief Calculates the magnitude squared of complexVector and stores the results in magnitudeVector (ORC version)

Thin wrapper that forwards all of its arguments, unchanged, to the externally
defined volk_gnsssdr_8ic_magnitude_squared_8i_a_orc_impl.
\param magnitudeVector The vector where the real output values are stored
\param complexVector The vector containing the complex input values
\param num_points The number of complex values in complexVector to be calculated and stored into magnitudeVector
*/
/* Implementation is provided outside this header. */
extern void volk_gnsssdr_8ic_magnitude_squared_8i_a_orc_impl(char* magnitudeVector, const lv_8sc_t* complexVector, unsigned int num_points);
/* NOTE(review): the wrapper is named *_u_orc although it lives in the aligned (_a_H)
   section and calls *_a_orc_impl; presumably this follows the VOLK ORC naming
   convention, confirm before renaming. */
static inline void volk_gnsssdr_8ic_magnitude_squared_8i_u_orc(char* magnitudeVector, const lv_8sc_t* complexVector, unsigned int num_points){
volk_gnsssdr_8ic_magnitude_squared_8i_a_orc_impl(magnitudeVector, complexVector, num_points);
}
#endif /* LV_HAVE_ORC */
#endif /* INCLUDED_volk_gnsssdr_8ic_magnitude_squared_8i_a_H */

View File

@ -0,0 +1,271 @@
/*!
* \file volk_gnsssdr_8ic_s8ic_multiply_8ic.h
* \brief Volk protokernel: multiplies a group of 16 bits vectors by one constant vector
* \authors <ul>
* <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
* </ul>
*
* Volk protokernel that multiplies a group of 16 bits vectors
* (8 bits the real part and 8 bits the imaginary part) by one constant vector
*
* -------------------------------------------------------------------------
*
* Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
*
* GNSS-SDR is a software defined Global Navigation
* Satellite Systems receiver
*
* This file is part of GNSS-SDR.
*
* GNSS-SDR is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* GNSS-SDR is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
*
* -------------------------------------------------------------------------
*/
#ifndef INCLUDED_volk_gnsssdr_8ic_s8ic_multiply_8ic_u_H
#define INCLUDED_volk_gnsssdr_8ic_s8ic_multiply_8ic_u_H
#include <inttypes.h>
#include <stdio.h>
#include <volk_gnsssdr/volk_gnsssdr_complex.h>
#include <float.h>
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
/*!
  \brief Multiplies every element of a complex vector (8-bit real, 8-bit imaginary) by a complex scalar (SSE3, unaligned loads and stores)

  The vectorized path keeps only the low 8 bits of each real and imaginary product sum.
  \param cVector Output vector receiving aVector[i] * scalar
  \param aVector Input vector to be multiplied
  \param scalar The complex scalar every element is multiplied by
  \param num_points Number of complex values read from aVector and written to cVector
*/
static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_u_sse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t scalar, unsigned int num_points){
    const unsigned int blocks = num_points / 8;
    const unsigned int rest = num_points % 8;

    const lv_8sc_t* in = aVector;
    lv_8sc_t* out = cVector;
    unsigned int n;

    // 0x00FF in every 16-bit lane: keeps the low byte of each lane
    const __m128i low_byte = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);

    // Broadcast the 16-bit scalar to all lanes, then separate its low byte (real)
    // and high byte (imaginary) into the low byte of each 16-bit lane
    const __m128i s = _mm_set1_epi16 (*(short*)&scalar);
    const __m128i s_im = _mm_and_si128 (_mm_srli_si128 (s, 1), low_byte);
    const __m128i s_re = _mm_and_si128 (s, low_byte);

    for (n = 0; n < blocks; ++n)
    {
        const __m128i v = _mm_lddqu_si128((__m128i*)in);
        const __m128i v_im = _mm_and_si128 (_mm_srli_si128 (v, 1), low_byte);
        const __m128i v_re = _mm_and_si128 (v, low_byte);

        // (a + jb)(c + jd) = (ac - bd) + j(ad + bc), computed in 16-bit lanes
        __m128i re = _mm_sub_epi16 (_mm_mullo_epi16 (v_re, s_re), _mm_mullo_epi16 (v_im, s_im));
        __m128i im = _mm_add_epi16 (_mm_mullo_epi16 (v_re, s_im), _mm_mullo_epi16 (v_im, s_re));

        // keep the low byte of each part and interleave them back as (real, imag) bytes
        re = _mm_and_si128 (re, low_byte);
        im = _mm_slli_si128 (_mm_and_si128 (im, low_byte), 1);

        _mm_storeu_si128((__m128i*)out, _mm_or_si128 (re, im));

        in += 8;
        out += 8;
    }

    // values that do not fill a whole register
    for (n = 0; n < rest; ++n)
    {
        out[n] = in[n] * scalar;
    }
}
#endif /* LV_HAVE_SSE3 */
#ifdef LV_HAVE_GENERIC
/*!
  \brief Multiplies every element of a complex vector (8-bit real, 8-bit imaginary) by a complex scalar (portable C implementation)
  \param cVector Output vector receiving aVector[i] * scalar
  \param aVector Input vector to be multiplied
  \param scalar The complex scalar every element is multiplied by
  \param num_points Number of complex values read from aVector and written to cVector
*/
static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_generic(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t scalar, unsigned int num_points){
    const lv_8sc_t* src = aVector;
    lv_8sc_t* dst = cVector;
    unsigned int remaining;

    // main loop, manually unrolled eight times
    for (remaining = num_points; remaining >= 8; remaining -= 8)
    {
        dst[0] = src[0] * scalar;
        dst[1] = src[1] * scalar;
        dst[2] = src[2] * scalar;
        dst[3] = src[3] * scalar;
        dst[4] = src[4] * scalar;
        dst[5] = src[5] * scalar;
        dst[6] = src[6] * scalar;
        dst[7] = src[7] * scalar;
        src += 8;
        dst += 8;
    }

    // fewer than eight values left
    for (; remaining > 0; --remaining)
    {
        *dst++ = *src++ * scalar;
    }
}
#endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_volk_gnsssdr_8ic_s8ic_multiply_8ic_u_H */
#ifndef INCLUDED_volk_gnsssdr_8ic_s8ic_multiply_8ic_a_H
#define INCLUDED_volk_gnsssdr_8ic_s8ic_multiply_8ic_a_H
#include <inttypes.h>
#include <stdio.h>
#include <volk_gnsssdr/volk_gnsssdr_complex.h>
#include <float.h>
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
/*!
  \brief Multiplies every element of a complex vector (8-bit real, 8-bit imaginary) by a complex scalar (SSE3 section, aligned loads and stores)

  Uses _mm_load_si128 / _mm_store_si128, so both buffers must be 16-byte aligned.
  The vectorized path keeps only the low 8 bits of each real and imaginary product sum.
  \param cVector Output vector receiving aVector[i] * scalar
  \param aVector Input vector to be multiplied
  \param scalar The complex scalar every element is multiplied by
  \param num_points Number of complex values read from aVector and written to cVector
*/
static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_a_sse3(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t scalar, unsigned int num_points){
    const unsigned int blocks = num_points / 8;
    const unsigned int rest = num_points % 8;

    const lv_8sc_t* in = aVector;
    lv_8sc_t* out = cVector;
    unsigned int n;

    // 0x00FF in every 16-bit lane: keeps the low byte of each lane
    const __m128i low_byte = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);

    // Broadcast the 16-bit scalar to all lanes, then separate its low byte (real)
    // and high byte (imaginary) into the low byte of each 16-bit lane
    const __m128i s = _mm_set1_epi16 (*(short*)&scalar);
    const __m128i s_im = _mm_and_si128 (_mm_srli_si128 (s, 1), low_byte);
    const __m128i s_re = _mm_and_si128 (s, low_byte);

    for (n = 0; n < blocks; ++n)
    {
        const __m128i v = _mm_load_si128((__m128i*)in);
        const __m128i v_im = _mm_and_si128 (_mm_srli_si128 (v, 1), low_byte);
        const __m128i v_re = _mm_and_si128 (v, low_byte);

        // (a + jb)(c + jd) = (ac - bd) + j(ad + bc), computed in 16-bit lanes
        __m128i re = _mm_sub_epi16 (_mm_mullo_epi16 (v_re, s_re), _mm_mullo_epi16 (v_im, s_im));
        __m128i im = _mm_add_epi16 (_mm_mullo_epi16 (v_re, s_im), _mm_mullo_epi16 (v_im, s_re));

        // keep the low byte of each part and interleave them back as (real, imag) bytes
        re = _mm_and_si128 (re, low_byte);
        im = _mm_slli_si128 (_mm_and_si128 (im, low_byte), 1);

        _mm_store_si128((__m128i*)out, _mm_or_si128 (re, im));

        in += 8;
        out += 8;
    }

    // values that do not fill a whole register
    for (n = 0; n < rest; ++n)
    {
        out[n] = in[n] * scalar;
    }
}
#endif /* LV_HAVE_SSE3 */
#ifdef LV_HAVE_GENERIC
/*!
\brief Multiplies the input vector by a complex scalar and stores the results in the output vector (portable C version)
\param cVector The vector where the results will be stored
\param aVector The vector to be multiplied
\param scalar The complex scalar to multiply aVector
\param num_points The number of complex values in aVector to be multiplied by scalar and stored into cVector
*/
static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_a_generic(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t scalar, unsigned int num_points){
    lv_8sc_t* out = cVector;
    const lv_8sc_t* in = aVector;
    unsigned int unrolled_left = num_points / 8;
    unsigned int tail_left = num_points % 8;

    /* Manually unrolled main loop: eight products per pass, in element order. */
    for (; unrolled_left > 0; --unrolled_left)
    {
        out[0] = in[0] * scalar;
        out[1] = in[1] * scalar;
        out[2] = in[2] * scalar;
        out[3] = in[3] * scalar;
        out[4] = in[4] * scalar;
        out[5] = in[5] * scalar;
        out[6] = in[6] * scalar;
        out[7] = in[7] * scalar;
        out += 8;
        in += 8;
    }

    /* Remaining (at most seven) elements. */
    for (; tail_left > 0; --tail_left)
    {
        *out++ = *in++ * scalar;
    }
}
#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_ORC
/*!
\brief Multiplies the input vector by a complex scalar and stores the results in the output vector (ORC version)

Thin wrapper: splits the scalar into its real and imaginary parts and forwards
everything to the externally defined ORC routine.

\param cVector The vector where the results will be stored
\param aVector The vector to be multiplied
\param scalar The complex scalar to multiply aVector
\param num_points The number of complex values in aVector to be multiplied by scalar and stored into cVector
*/
extern void volk_gnsssdr_8ic_s8ic_multiply_8ic_a_orc_impl(lv_8sc_t* cVector, const lv_8sc_t* aVector, const char scalarreal, const char scalarimag, unsigned int num_points);
static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_u_orc(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t scalar, unsigned int num_points){
    /* The ORC routine takes the scalar as two separate char components. */
    const char scalar_real = lv_creal(scalar);
    const char scalar_imag = lv_cimag(scalar);

    volk_gnsssdr_8ic_s8ic_multiply_8ic_a_orc_impl(cVector, aVector, scalar_real, scalar_imag, num_points);
}
#endif /* LV_HAVE_ORC */
#endif /* INCLUDED_volk_gnsssdr_8ic_s8ic_multiply_8ic_a_H */

View File

@ -0,0 +1,499 @@
/*!
* \file volk_gnsssdr_8ic_x2_dot_prod_8ic.h
* \brief Volk protokernel: multiplies two 16 bits vectors and accumulates them
* \authors <ul>
* <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
* </ul>
*
* Volk protokernel that multiplies two 16 bits vectors (8 bits the real part
* and 8 bits the imaginary part) and accumulates them
*
* -------------------------------------------------------------------------
*
* Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
*
* GNSS-SDR is a software defined Global Navigation
* Satellite Systems receiver
*
* This file is part of GNSS-SDR.
*
* GNSS-SDR is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* GNSS-SDR is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
*
* -------------------------------------------------------------------------
*/
#ifndef INCLUDED_volk_gnsssdr_8ic_x2_dot_prod_8ic_u_H
#define INCLUDED_volk_gnsssdr_8ic_x2_dot_prod_8ic_u_H
#include <volk_gnsssdr/volk_gnsssdr_common.h>
#include <volk_gnsssdr/volk_gnsssdr_complex.h>
#include <stdio.h>
#include <string.h>
#ifdef LV_HAVE_GENERIC
/*!
\brief Multiplies the two input complex vectors element by element and accumulates the products into a single complex value (portable C version)
\param result Where the accumulated complex value will be stored
\param input One of the vectors to be multiplied and accumulated
\param taps The other vector to be multiplied and accumulated
\param num_points The number of complex values in input and taps to be multiplied together and accumulated into result
*/
static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_generic(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points) {
    /* Byte-wise views of the buffers: byte 0 of a sample is used as the real
       part and byte 1 as the imaginary part. */
    char* out_bytes = (char*) result;
    const char* x = (const char*) input;
    const char* t = (const char*) taps;

    /* Two samples are consumed per pass, each with its own 8-bit accumulator. */
    unsigned int pairs_left = num_points / 2;
    char first_acc[2] = {0, 0};
    char second_acc[2] = {0, 0};

    while (pairs_left > 0)
    {
        first_acc[0] += x[0] * t[0] - x[1] * t[1];
        first_acc[1] += x[0] * t[1] + x[1] * t[0];
        second_acc[0] += x[2] * t[2] - x[3] * t[3];
        second_acc[1] += x[2] * t[3] + x[3] * t[2];
        x += 4;
        t += 4;
        --pairs_left;
    }

    out_bytes[0] = first_acc[0] + second_acc[0];
    out_bytes[1] = first_acc[1] + second_acc[1];

    /* An odd num_points leaves one last sample that was not part of a pair. */
    if (num_points & 1)
    {
        *result += input[num_points - 1] * taps[num_points - 1];
    }
}
#endif /*LV_HAVE_GENERIC*/
#ifdef LV_HAVE_SSE2
#include "emmintrin.h"
/*!
\brief Multiplies the two input complex vectors element by element and accumulates the products into a single complex value (unaligned SSE2 version)

Eight complex samples are handled per iteration. Real and imaginary bytes are
split into separate 16-bit lanes, the partial products are accumulated there,
and only the low byte of every accumulator lane is kept at the end.

\param result Where the accumulated complex value will be stored
\param input One of the vectors to be multiplied and accumulated; no alignment required
\param taps The other vector to be multiplied and accumulated; no alignment required
\param num_points The number of complex values in input and taps to be multiplied together and accumulated into result
*/
static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse2(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points) {
    lv_8sc_t dotProduct;
    memset(&dotProduct, 0x0, 2*sizeof(char));

    const lv_8sc_t* a = input;
    const lv_8sc_t* b = taps;

    const unsigned int sse_iters = num_points/8;
    const unsigned int leftover = num_points % 8;

    if (sse_iters>0)
    {
        __m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc, realcacc, imagcacc;

        // 0x00FF in every 16-bit lane: keeps the low byte of each lane
        mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
        realcacc = _mm_setzero_si128();
        imagcacc = _mm_setzero_si128();

        for(unsigned int number = 0; number < sse_iters; number++){
            // _mm_loadu_si128 is the SSE2 unaligned load. The previously used
            // _mm_lddqu_si128 is an SSE3 intrinsic (pmmintrin.h) and is not
            // available to a kernel built with SSE2 only.
            x = _mm_loadu_si128((const __m128i*)a);
            y = _mm_loadu_si128((const __m128i*)b);

            // Split both operands into real and imaginary 16-bit lanes
            imagx = _mm_srli_si128 (x, 1);
            imagx = _mm_and_si128 (imagx, mult1);
            realx = _mm_and_si128 (x, mult1);

            imagy = _mm_srli_si128 (y, 1);
            imagy = _mm_and_si128 (imagy, mult1);
            realy = _mm_and_si128 (y, mult1);

            realx_mult_realy = _mm_mullo_epi16 (realx, realy);
            imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
            realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
            imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);

            // (a + jb)(c + jd) = (ac - bd) + j(ad + bc)
            realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
            imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);

            realcacc = _mm_add_epi16 (realcacc, realc);
            imagcacc = _mm_add_epi16 (imagcacc, imagc);

            a += 8;
            b += 8;
        }

        // Keep the low byte of every accumulator lane and re-interleave them
        // as (real, imag) byte pairs
        realcacc = _mm_and_si128 (realcacc, mult1);
        imagcacc = _mm_and_si128 (imagcacc, mult1);
        imagcacc = _mm_slli_si128 (imagcacc, 1);

        totalc = _mm_or_si128 (realcacc, imagcacc);

        __VOLK_ATTR_ALIGNED(16) lv_8sc_t dotProductVector[8];

        _mm_storeu_si128((__m128i*)dotProductVector,totalc); // Store the results back into the dot product vector

        // Horizontal sum of the eight partial results
        for (int i = 0; i<8; ++i)
        {
            dotProduct += dotProductVector[i];
        }
    }

    // Scalar tail for the samples that do not fill a whole register
    for (unsigned int i = 0; i < leftover; ++i)
    {
        dotProduct += (*a++) * (*b++);
    }

    *result = dotProduct;
}
#endif /*LV_HAVE_SSE2*/
#ifdef LV_HAVE_SSE4_1
#include "smmintrin.h"
/*!
\brief Multiplies the two input complex vectors element by element and accumulates the products into a single complex value (unaligned SSE4.1 version)
\param result Where the accumulated complex value will be stored
\param input One of the vectors to be multiplied and accumulated; no alignment required
\param taps The other vector to be multiplied and accumulated; no alignment required
\param num_points The number of complex values in input and taps to be multiplied together and accumulated into result
*/
static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse4_1(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points) {
    const unsigned int full_blocks = num_points / 8;
    const unsigned int leftover = num_points % 8;
    const lv_8sc_t* in_cursor = input;
    const lv_8sc_t* tap_cursor = taps;
    unsigned int k;
    lv_8sc_t total;

    memset(&total, 0, sizeof(total));

    if (full_blocks > 0)
    {
        /* 0x00FF per 16-bit lane: low-byte selector, also used as blend mask. */
        const __m128i low_byte_mask = _mm_set1_epi16(0x00FF);
        __m128i re_acc = _mm_setzero_si128();
        __m128i im_acc = _mm_setzero_si128();
        __VOLK_ATTR_ALIGNED(16) lv_8sc_t lanes[8];

        for (k = 0; k < full_blocks; ++k)
        {
            const __m128i in_vec = _mm_lddqu_si128((__m128i*)in_cursor);
            const __m128i tap_vec = _mm_lddqu_si128((__m128i*)tap_cursor);

            /* Real parts sit in the even bytes, imaginary parts in the odd. */
            const __m128i in_re = _mm_and_si128(in_vec, low_byte_mask);
            const __m128i in_im = _mm_and_si128(_mm_srli_si128(in_vec, 1), low_byte_mask);
            const __m128i tap_re = _mm_and_si128(tap_vec, low_byte_mask);
            const __m128i tap_im = _mm_and_si128(_mm_srli_si128(tap_vec, 1), low_byte_mask);

            /* (a + jb)(c + jd) = (ac - bd) + j(ad + bc), in 16-bit lanes. */
            const __m128i re_part = _mm_sub_epi16(_mm_mullo_epi16(in_re, tap_re), _mm_mullo_epi16(in_im, tap_im));
            const __m128i im_part = _mm_add_epi16(_mm_mullo_epi16(in_re, tap_im), _mm_mullo_epi16(in_im, tap_re));

            re_acc = _mm_add_epi16(re_acc, re_part);
            im_acc = _mm_add_epi16(im_acc, im_part);

            in_cursor += 8;
            tap_cursor += 8;
        }

        /* Even bytes come from the real accumulator, odd bytes from the
           imaginary accumulator shifted up by one byte. */
        _mm_storeu_si128((__m128i*)lanes, _mm_blendv_epi8(_mm_slli_si128(im_acc, 1), re_acc, low_byte_mask));

        /* Horizontal sum of the eight partial results. */
        for (k = 0; k < 8; ++k)
        {
            total += lanes[k];
        }
    }

    /* Scalar tail for the samples that do not fill a whole register. */
    for (k = 0; k < leftover; ++k)
    {
        total += (*in_cursor++) * (*tap_cursor++);
    }

    *result = total;
}
#endif /*LV_HAVE_SSE4_1*/
#endif /*INCLUDED_volk_gnsssdr_8ic_x2_dot_prod_8ic_u_H*/
#ifndef INCLUDED_volk_gnsssdr_8ic_x2_dot_prod_8ic_a_H
#define INCLUDED_volk_gnsssdr_8ic_x2_dot_prod_8ic_a_H
#include <volk_gnsssdr/volk_gnsssdr_common.h>
#include <volk_gnsssdr/volk_gnsssdr_complex.h>
#include <stdio.h>
#include <string.h>
#ifdef LV_HAVE_GENERIC
/*!
\brief Multiplies the two input complex vectors element by element and accumulates the products into a single complex value (portable C version)
\param result Where the accumulated complex value will be stored
\param input One of the vectors to be multiplied and accumulated
\param taps The other vector to be multiplied and accumulated
\param num_points The number of complex values in input and taps to be multiplied together and accumulated into result
*/
static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_generic(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points) {
    /* Byte-wise views: byte 0 of a sample is used as the real part, byte 1 as
       the imaginary part. */
    const char* sample = (const char*) input;
    const char* tap = (const char*) taps;
    char* out_bytes = (char*) result;

    const unsigned int sample_pairs = num_points / 2;
    char acc_a[2] = {0, 0};
    char acc_b[2] = {0, 0};
    unsigned int pair;

    /* Two samples per pass, each feeding its own 8-bit accumulator. */
    for (pair = 0; pair < sample_pairs; ++pair, sample += 4, tap += 4)
    {
        acc_a[0] += sample[0] * tap[0] - sample[1] * tap[1];
        acc_a[1] += sample[0] * tap[1] + sample[1] * tap[0];
        acc_b[0] += sample[2] * tap[2] - sample[3] * tap[3];
        acc_b[1] += sample[2] * tap[3] + sample[3] * tap[2];
    }

    out_bytes[0] = acc_a[0] + acc_b[0];
    out_bytes[1] = acc_a[1] + acc_b[1];

    /* With an odd num_points the final sample has no partner; add it here. */
    if ((num_points & 1) != 0)
    {
        *result += input[num_points - 1] * taps[num_points - 1];
    }
}
#endif /*LV_HAVE_GENERIC*/
#ifdef LV_HAVE_SSE2
#include "emmintrin.h"
/*!
\brief Multiplies the two input complex vectors element by element and accumulates the products into a single complex value (aligned SSE2 version)
\param result Where the accumulated complex value will be stored
\param input One of the vectors to be multiplied and accumulated; read with _mm_load_si128, which requires a 16-byte aligned address
\param taps The other vector to be multiplied and accumulated; read with _mm_load_si128, which requires a 16-byte aligned address
\param num_points The number of complex values in input and taps to be multiplied together and accumulated into result
*/
static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_sse2(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points) {
    const unsigned int full_blocks = num_points / 8;
    const unsigned int leftover = num_points % 8;
    const lv_8sc_t* in_cursor = input;
    const lv_8sc_t* tap_cursor = taps;
    unsigned int k;
    lv_8sc_t total;

    memset(&total, 0, sizeof(total));

    if (full_blocks > 0)
    {
        /* 0x00FF per 16-bit lane: keeps the low byte of each lane. */
        const __m128i low_byte_mask = _mm_set1_epi16(0x00FF);
        __m128i re_acc = _mm_setzero_si128();
        __m128i im_acc = _mm_setzero_si128();
        __m128i packed;
        __VOLK_ATTR_ALIGNED(16) lv_8sc_t lanes[8];

        for (k = 0; k < full_blocks; ++k)
        {
            const __m128i in_vec = _mm_load_si128((__m128i*)in_cursor);
            const __m128i tap_vec = _mm_load_si128((__m128i*)tap_cursor);

            /* Real parts sit in the even bytes, imaginary parts in the odd. */
            const __m128i in_re = _mm_and_si128(in_vec, low_byte_mask);
            const __m128i in_im = _mm_and_si128(_mm_srli_si128(in_vec, 1), low_byte_mask);
            const __m128i tap_re = _mm_and_si128(tap_vec, low_byte_mask);
            const __m128i tap_im = _mm_and_si128(_mm_srli_si128(tap_vec, 1), low_byte_mask);

            /* (a + jb)(c + jd) = (ac - bd) + j(ad + bc), in 16-bit lanes. */
            const __m128i re_part = _mm_sub_epi16(_mm_mullo_epi16(in_re, tap_re), _mm_mullo_epi16(in_im, tap_im));
            const __m128i im_part = _mm_add_epi16(_mm_mullo_epi16(in_re, tap_im), _mm_mullo_epi16(in_im, tap_re));

            re_acc = _mm_add_epi16(re_acc, re_part);
            im_acc = _mm_add_epi16(im_acc, im_part);

            in_cursor += 8;
            tap_cursor += 8;
        }

        /* Keep the low byte of every accumulator lane and interleave the two
           accumulators back into (real, imag) byte pairs. */
        packed = _mm_or_si128(_mm_and_si128(re_acc, low_byte_mask),
                              _mm_slli_si128(_mm_and_si128(im_acc, low_byte_mask), 1));

        _mm_store_si128((__m128i*)lanes, packed);

        /* Horizontal sum of the eight partial results. */
        for (k = 0; k < 8; ++k)
        {
            total += lanes[k];
        }
    }

    /* Scalar tail for the samples that do not fill a whole register. */
    for (k = 0; k < leftover; ++k)
    {
        total += (*in_cursor++) * (*tap_cursor++);
    }

    *result = total;
}
#endif /*LV_HAVE_SSE2*/
#ifdef LV_HAVE_SSE4_1
#include "smmintrin.h"
/*!
\brief Multiplies the two input complex vectors element by element and accumulates the products into a single complex value (aligned SSE4.1 version)
\param result Where the accumulated complex value will be stored
\param input One of the vectors to be multiplied and accumulated; read with _mm_load_si128, which requires a 16-byte aligned address
\param taps The other vector to be multiplied and accumulated; read with _mm_load_si128, which requires a 16-byte aligned address
\param num_points The number of complex values in input and taps to be multiplied together and accumulated into result
*/
static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_sse4_1(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points) {
    const unsigned int full_blocks = num_points / 8;
    const unsigned int leftover = num_points % 8;
    const lv_8sc_t* in_cursor = input;
    const lv_8sc_t* tap_cursor = taps;
    unsigned int k;
    lv_8sc_t total;

    memset(&total, 0, sizeof(total));

    if (full_blocks > 0)
    {
        /* 0x00FF per 16-bit lane: low-byte selector, also used as blend mask. */
        const __m128i low_byte_mask = _mm_set1_epi16(0x00FF);
        __m128i re_acc = _mm_setzero_si128();
        __m128i im_acc = _mm_setzero_si128();
        __VOLK_ATTR_ALIGNED(16) lv_8sc_t lanes[8];

        for (k = 0; k < full_blocks; ++k)
        {
            const __m128i in_vec = _mm_load_si128((__m128i*)in_cursor);
            const __m128i tap_vec = _mm_load_si128((__m128i*)tap_cursor);

            /* Real parts sit in the even bytes, imaginary parts in the odd. */
            const __m128i in_re = _mm_and_si128(in_vec, low_byte_mask);
            const __m128i in_im = _mm_and_si128(_mm_srli_si128(in_vec, 1), low_byte_mask);
            const __m128i tap_re = _mm_and_si128(tap_vec, low_byte_mask);
            const __m128i tap_im = _mm_and_si128(_mm_srli_si128(tap_vec, 1), low_byte_mask);

            /* (a + jb)(c + jd) = (ac - bd) + j(ad + bc), in 16-bit lanes. */
            const __m128i re_part = _mm_sub_epi16(_mm_mullo_epi16(in_re, tap_re), _mm_mullo_epi16(in_im, tap_im));
            const __m128i im_part = _mm_add_epi16(_mm_mullo_epi16(in_re, tap_im), _mm_mullo_epi16(in_im, tap_re));

            re_acc = _mm_add_epi16(re_acc, re_part);
            im_acc = _mm_add_epi16(im_acc, im_part);

            in_cursor += 8;
            tap_cursor += 8;
        }

        /* Even bytes come from the real accumulator, odd bytes from the
           imaginary accumulator shifted up by one byte. */
        _mm_store_si128((__m128i*)lanes, _mm_blendv_epi8(_mm_slli_si128(im_acc, 1), re_acc, low_byte_mask));

        /* Horizontal sum of the eight partial results. */
        for (k = 0; k < 8; ++k)
        {
            total += lanes[k];
        }
    }

    /* Scalar tail for the samples that do not fill a whole register. */
    for (k = 0; k < leftover; ++k)
    {
        total += (*in_cursor++) * (*tap_cursor++);
    }

    *result = total;
}
#endif /*LV_HAVE_SSE4_1*/
#ifdef LV_HAVE_ORC
/*!
\brief Multiplies the two input complex vectors element by element and accumulates the products into a single complex value (ORC version)

The externally defined ORC routine fills two 16-bit accumulators, one for the
real part and one for the imaginary part. This wrapper returns the byte at
offset 1 of each accumulator as the 8-bit result.

\param result Where the accumulated complex value will be stored
\param input One of the vectors to be multiplied and accumulated
\param taps The other vector to be multiplied and accumulated
\param num_points The number of complex values in input and taps to be multiplied together and accumulated into result
*/
extern void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_orc_impl(short* resRealShort, short* resImagShort, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points);
static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_orc(lv_8sc_t* result, const lv_8sc_t* input, const lv_8sc_t* taps, unsigned int num_points){
    short real_acc = 0;
    short imag_acc = 0;
    const char* real_bytes = (const char*)&real_acc;
    const char* imag_bytes = (const char*)&imag_acc;

    volk_gnsssdr_8ic_x2_dot_prod_8ic_a_orc_impl(&real_acc, &imag_acc, input, taps, num_points);

    /* NOTE(review): offset 1 is the high-order byte of the short on a
       little-endian host, whereas the SSE kernels in this file keep the
       low-order byte of their 16-bit accumulators. Confirm against the ORC
       implementation that this is the intended byte. */
    *result = lv_cmake(real_bytes[1], imag_bytes[1]);
}
#endif /* LV_HAVE_ORC */
#endif /*INCLUDED_volk_gnsssdr_8ic_x2_dot_prod_8ic_a_H*/

View File

@ -0,0 +1,346 @@
/*!
* \file volk_gnsssdr_8ic_x2_multiply_8ic.h
* \brief Volk protokernel: multiplies two 16 bits vectors
* \authors <ul>
* <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
* </ul>
*
* Volk protokernel that multiplies two 16 bits vectors (8 bits the real part
* and 8 bits the imaginary part)
*
* -------------------------------------------------------------------------
*
* Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
*
* GNSS-SDR is a software defined Global Navigation
* Satellite Systems receiver
*
* This file is part of GNSS-SDR.
*
* GNSS-SDR is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* GNSS-SDR is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
*
* -------------------------------------------------------------------------
*/
#ifndef INCLUDED_volk_gnsssdr_8ic_x2_multiply_8ic_u_H
#define INCLUDED_volk_gnsssdr_8ic_x2_multiply_8ic_u_H
#include <inttypes.h>
#include <stdio.h>
#include <volk_gnsssdr/volk_gnsssdr_complex.h>
#ifdef LV_HAVE_SSE2
#include "emmintrin.h"
/*!
\brief Multiplies the two input complex vectors element by element and stores the products in the output vector (unaligned SSE2 version)

Eight complex samples are handled per iteration. Real and imaginary bytes are
split into separate 16-bit lanes, multiplied there, and only the low byte of
every result is written back, so each output component wraps to 8 bits.

\param cVector The vector where the results will be stored; no alignment required
\param aVector One of the vectors to be multiplied; no alignment required
\param bVector The other vector to be multiplied; no alignment required
\param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
*/
static inline void volk_gnsssdr_8ic_x2_multiply_8ic_u_sse2(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){
    const unsigned int sse_iters = num_points / 8;
    const unsigned int leftover = num_points % 8;
    unsigned int number;
    unsigned int i;

    __m128i x, y, mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc;

    lv_8sc_t* c = cVector;
    const lv_8sc_t* a = aVector;
    const lv_8sc_t* b = bVector;

    /* 0x00FF in every 16-bit lane: keeps the low byte of each lane. */
    mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);

    for(number = 0; number < sse_iters; number++){
        /* _mm_loadu_si128 is the SSE2 unaligned load. The previously used
           _mm_lddqu_si128 is an SSE3 intrinsic (pmmintrin.h) and is not
           available to a kernel built with SSE2 only. */
        x = _mm_loadu_si128((const __m128i*)a);
        y = _mm_loadu_si128((const __m128i*)b);

        /* Split both operands into real and imaginary 16-bit lanes. */
        imagx = _mm_srli_si128 (x, 1);
        imagx = _mm_and_si128 (imagx, mult1);
        realx = _mm_and_si128 (x, mult1);

        imagy = _mm_srli_si128 (y, 1);
        imagy = _mm_and_si128 (imagy, mult1);
        realy = _mm_and_si128 (y, mult1);

        realx_mult_realy = _mm_mullo_epi16 (realx, realy);
        imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
        realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
        imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);

        /* (a + jb)(c + jd) = (ac - bd) + j(ad + bc); keep the low byte only. */
        realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
        realc = _mm_and_si128 (realc, mult1);
        imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
        imagc = _mm_and_si128 (imagc, mult1);
        imagc = _mm_slli_si128 (imagc, 1);

        /* Re-interleave: real part in the even bytes, imaginary in the odd. */
        totalc = _mm_or_si128 (realc, imagc);

        _mm_storeu_si128((__m128i*)c, totalc);

        a += 8;
        b += 8;
        c += 8;
    }

    /* Scalar tail for the samples that do not fill a whole register. */
    for (i = 0; i < leftover; ++i)
    {
        *c++ = (*a++) * (*b++);
    }
}
#endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_SSE4_1
#include "smmintrin.h"
/*!
\brief Multiplies the two input complex vectors element by element and stores the products in the output vector (unaligned SSE4.1 version)

Eight complex samples are handled per iteration. Real and imaginary bytes are
split into separate 16-bit lanes, multiplied there, and the low byte of every
result is blended back into (real, imag) byte pairs.

\param cVector The vector where the results will be stored; no alignment required
\param aVector One of the vectors to be multiplied; no alignment required
\param bVector The other vector to be multiplied; no alignment required
\param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
*/
static inline void volk_gnsssdr_8ic_x2_multiply_8ic_u_sse4_1(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){
    const unsigned int sse_iters = num_points / 8;
    const unsigned int leftover = num_points % 8;
    unsigned int number;
    unsigned int i;

    /* The set-but-never-read `zero` register of the earlier version is gone. */
    __m128i x, y;
    __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc;

    lv_8sc_t* c = cVector;
    const lv_8sc_t* a = aVector;
    const lv_8sc_t* b = bVector;

    /* 0x00FF in every 16-bit lane: low-byte selector, also used as the blend
       mask (high bit set on even bytes only). */
    mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);

    for(number = 0; number < sse_iters; number++){
        x = _mm_lddqu_si128((const __m128i*)a);
        y = _mm_lddqu_si128((const __m128i*)b);

        /* Split both operands into real and imaginary 16-bit lanes. */
        imagx = _mm_srli_si128 (x, 1);
        imagx = _mm_and_si128 (imagx, mult1);
        realx = _mm_and_si128 (x, mult1);

        imagy = _mm_srli_si128 (y, 1);
        imagy = _mm_and_si128 (imagy, mult1);
        realy = _mm_and_si128 (y, mult1);

        realx_mult_realy = _mm_mullo_epi16 (realx, realy);
        imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
        realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
        imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);

        /* (a + jb)(c + jd) = (ac - bd) + j(ad + bc) */
        realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
        imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
        imagc = _mm_slli_si128 (imagc, 1);

        /* Even bytes from realc, odd bytes from the shifted imagc. */
        totalc = _mm_blendv_epi8 (imagc, realc, mult1);

        _mm_storeu_si128((__m128i*)c, totalc);

        a += 8;
        b += 8;
        c += 8;
    }

    /* Scalar tail for the samples that do not fill a whole register. */
    for (i = 0; i < leftover; ++i)
    {
        *c++ = (*a++) * (*b++);
    }
}
#endif /* LV_HAVE_SSE4_1 */
#ifdef LV_HAVE_GENERIC
/*!
\brief Multiplies the two input complex vectors element by element and stores the products in the output vector (portable C version)
\param cVector The vector where the results will be stored
\param aVector One of the vectors to be multiplied
\param bVector The other vector to be multiplied
\param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
*/
static inline void volk_gnsssdr_8ic_x2_multiply_8ic_generic(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){
    unsigned int idx;

    /* One complex product per element, in index order. */
    for (idx = 0; idx < num_points; ++idx)
    {
        cVector[idx] = aVector[idx] * bVector[idx];
    }
}
#endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_volk_gnsssdr_8ic_x2_multiply_8ic_u_H */
#ifndef INCLUDED_volk_gnsssdr_8ic_x2_multiply_8ic_a_H
#define INCLUDED_volk_gnsssdr_8ic_x2_multiply_8ic_a_H
#include <inttypes.h>
#include <stdio.h>
#include <volk_gnsssdr/volk_gnsssdr_complex.h>
#ifdef LV_HAVE_SSE2
#include "emmintrin.h"
/*!
\brief Multiplies the two input complex vectors element by element and stores the products in the output vector (aligned SSE2 version)
\param cVector The vector where the results will be stored; written with _mm_store_si128, which requires a 16-byte aligned address
\param aVector One of the vectors to be multiplied; read with _mm_load_si128, which requires a 16-byte aligned address
\param bVector The other vector to be multiplied; read with _mm_load_si128, which requires a 16-byte aligned address
\param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
*/
static inline void volk_gnsssdr_8ic_x2_multiply_8ic_a_sse2(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){
    const unsigned int full_blocks = num_points / 8;
    const unsigned int leftover = num_points % 8;
    /* 0x00FF per 16-bit lane: keeps the low byte of each lane. */
    const __m128i low_byte_mask = _mm_set1_epi16(0x00FF);
    lv_8sc_t* out_cursor = cVector;
    const lv_8sc_t* lhs_cursor = aVector;
    const lv_8sc_t* rhs_cursor = bVector;
    unsigned int k;

    for (k = 0; k < full_blocks; ++k)
    {
        const __m128i lhs = _mm_load_si128((__m128i*)lhs_cursor);
        const __m128i rhs = _mm_load_si128((__m128i*)rhs_cursor);

        /* Real parts sit in the even bytes, imaginary parts in the odd. */
        const __m128i lhs_re = _mm_and_si128(lhs, low_byte_mask);
        const __m128i lhs_im = _mm_and_si128(_mm_srli_si128(lhs, 1), low_byte_mask);
        const __m128i rhs_re = _mm_and_si128(rhs, low_byte_mask);
        const __m128i rhs_im = _mm_and_si128(_mm_srli_si128(rhs, 1), low_byte_mask);

        /* (a + jb)(c + jd) = (ac - bd) + j(ad + bc); keep the low byte only. */
        const __m128i prod_re = _mm_and_si128(
            _mm_sub_epi16(_mm_mullo_epi16(lhs_re, rhs_re), _mm_mullo_epi16(lhs_im, rhs_im)),
            low_byte_mask);
        const __m128i prod_im = _mm_and_si128(
            _mm_add_epi16(_mm_mullo_epi16(lhs_re, rhs_im), _mm_mullo_epi16(lhs_im, rhs_re)),
            low_byte_mask);

        /* Re-interleave: real part in the even bytes, imaginary in the odd. */
        _mm_store_si128((__m128i*)out_cursor, _mm_or_si128(prod_re, _mm_slli_si128(prod_im, 1)));

        lhs_cursor += 8;
        rhs_cursor += 8;
        out_cursor += 8;
    }

    /* Scalar tail for the samples that do not fill a whole register. */
    for (k = 0; k < leftover; ++k)
    {
        *out_cursor++ = (*lhs_cursor++) * (*rhs_cursor++);
    }
}
#endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_SSE4_1
#include "smmintrin.h"
/*!
\brief Multiplies the two input complex vectors element by element and stores the products in the output vector (aligned SSE4.1 version)

Eight complex samples are handled per iteration. Real and imaginary bytes are
split into separate 16-bit lanes, multiplied there, and the low byte of every
result is blended back into (real, imag) byte pairs.

\param cVector The vector where the results will be stored; written with _mm_store_si128, which requires a 16-byte aligned address
\param aVector One of the vectors to be multiplied; read with _mm_load_si128, which requires a 16-byte aligned address
\param bVector The other vector to be multiplied; read with _mm_load_si128, which requires a 16-byte aligned address
\param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
*/
static inline void volk_gnsssdr_8ic_x2_multiply_8ic_a_sse4_1(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){
    const unsigned int sse_iters = num_points / 8;
    const unsigned int leftover = num_points % 8;
    unsigned int number;
    unsigned int i;

    /* The set-but-never-read `zero` register of the earlier version is gone. */
    __m128i x, y;
    __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, realc, imagc, totalc;

    lv_8sc_t* c = cVector;
    const lv_8sc_t* a = aVector;
    const lv_8sc_t* b = bVector;

    /* 0x00FF in every 16-bit lane: low-byte selector, also used as the blend
       mask (high bit set on even bytes only). */
    mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);

    for(number = 0; number < sse_iters; number++){
        x = _mm_load_si128((const __m128i*)a);
        y = _mm_load_si128((const __m128i*)b);

        /* Split both operands into real and imaginary 16-bit lanes. */
        imagx = _mm_srli_si128 (x, 1);
        imagx = _mm_and_si128 (imagx, mult1);
        realx = _mm_and_si128 (x, mult1);

        imagy = _mm_srli_si128 (y, 1);
        imagy = _mm_and_si128 (imagy, mult1);
        realy = _mm_and_si128 (y, mult1);

        realx_mult_realy = _mm_mullo_epi16 (realx, realy);
        imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
        realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
        imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);

        /* (a + jb)(c + jd) = (ac - bd) + j(ad + bc) */
        realc = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
        imagc = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);
        imagc = _mm_slli_si128 (imagc, 1);

        /* Even bytes from realc, odd bytes from the shifted imagc. */
        totalc = _mm_blendv_epi8 (imagc, realc, mult1);

        _mm_store_si128((__m128i*)c, totalc);

        a += 8;
        b += 8;
        c += 8;
    }

    /* Scalar tail for the samples that do not fill a whole register. */
    for (i = 0; i < leftover; ++i)
    {
        *c++ = (*a++) * (*b++);
    }
}
#endif /* LV_HAVE_SSE4_1 */
#ifdef LV_HAVE_GENERIC
/*!
\brief Multiplies the two input complex vectors element by element and stores the products in the output vector (portable C version)
\param cVector The vector where the results will be stored
\param aVector One of the vectors to be multiplied
\param bVector The other vector to be multiplied
\param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
*/
static inline void volk_gnsssdr_8ic_x2_multiply_8ic_a_generic(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){
    lv_8sc_t* out = cVector;
    const lv_8sc_t* lhs = aVector;
    const lv_8sc_t* rhs = bVector;
    unsigned int remaining = num_points;

    /* One complex product per element, walking the three buffers in step. */
    while (remaining > 0)
    {
        *out = (*lhs) * (*rhs);
        ++out;
        ++lhs;
        ++rhs;
        --remaining;
    }
}
#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_ORC
/*!
\brief Multiplies the two input complex vectors and stores their results in the third vector (ORC version)

Thin wrapper: forwards all arguments unchanged to the externally defined
volk_gnsssdr_8ic_x2_multiply_8ic_a_orc_impl routine.

\param cVector The vector where the results will be stored
\param aVector One of the vectors to be multiplied
\param bVector One of the vectors to be multiplied
\param num_points The number of complex values in aVector and bVector to be multiplied together and stored into cVector
*/
extern void volk_gnsssdr_8ic_x2_multiply_8ic_a_orc_impl(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points);
static inline void volk_gnsssdr_8ic_x2_multiply_8ic_u_orc(lv_8sc_t* cVector, const lv_8sc_t* aVector, const lv_8sc_t* bVector, unsigned int num_points){
/* The implementation itself is not defined in this header; only declared above. */
volk_gnsssdr_8ic_x2_multiply_8ic_a_orc_impl(cVector, aVector, bVector, num_points);
}
#endif /* LV_HAVE_ORC */
#endif /* INCLUDED_volk_gnsssdr_8ic_x2_multiply_8ic_a_H */

View File

@ -0,0 +1,613 @@
/*!
* \file volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3.h
* \brief Volk protokernel: performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation with 16 bits vectors, and accumulates the results into float32.
* \authors <ul>
* <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
* </ul>
*
* Volk protokernel that performs the carrier wipe-off mixing and the
* Early, Prompt, and Late correlation with 16 bits vectors (8 bits the
* real part and 8 bits the imaginary part), and accumulates the result
* in 32 bits single point values, returning float32 values:
* - The carrier wipe-off is done by multiplying the input signal by the
* carrier (multiplication of 16 bits vectors) It returns the input
* signal in base band (BB)
* - Early values are calculated by multiplying the input signal in BB by the
* early code (multiplication of 16 bits vectors), accumulating the results into float32 values
* - Prompt values are calculated by multiplying the input signal in BB by the
* prompt code (multiplication of 16 bits vectors), accumulating the results into float32 values
* - Late values are calculated by multiplying the input signal in BB by the
* late code (multiplication of 16 bits vectors), accumulating the results into float32 values
*
* -------------------------------------------------------------------------
*
* Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
*
* GNSS-SDR is a software defined Global Navigation
* Satellite Systems receiver
*
* This file is part of GNSS-SDR.
*
* GNSS-SDR is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* GNSS-SDR is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
*
* -------------------------------------------------------------------------
*/
#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_u_H
#define INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_u_H
#include <inttypes.h>
#include <stdio.h>
#include <volk_gnsssdr/volk_gnsssdr_complex.h>
#include <float.h>
#include <string.h>
#ifdef LV_HAVE_SSE4_1
#include "smmintrin.h"
#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
#include "CommonMacros/CommonMacros.h"
/*!
\brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (SSE4.1 version using unaligned loads)

The input is multiplied by the carrier and the product is multiplied by each of the
three code replicas. The products are summed, so each output is a single complex
value, not a vector.

\param E_out Early correlation output (a single complex value, overwritten)
\param P_out Prompt correlation output (a single complex value, overwritten)
\param L_out Late correlation output (a single complex value, overwritten)
\param input The input signal input
\param carrier The carrier signal input
\param E_code Early PRN code replica input
\param P_code Prompt PRN code replica input
\param L_code Late PRN code replica input
\param num_points The number of complex values in vectors
*/
static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_u_sse4_1(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points)
{
// Each 128-bit load below covers 8 input samples, so the vector loop runs num_points / 8 times.
const unsigned int sse_iters = num_points / 8;
// Scratch registers. Most of them are only handed to the CM_* macros as working storage.
// NOTE(review): the CM_* macros are defined in the CommonMacros headers, which are not
// visible here; what is said about them below is inferred from their names and
// arguments - confirm against those headers.
__m128i x, y, real_bb_signal_sample, imag_bb_signal_sample;
__m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output;
// Floating-point accumulators, one per correlator arm.
__m128 E_code_acc, P_code_acc, L_code_acc;
__m128i input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2;
__m128 output_ps;
// Working cursors; the caller's pointers are left untouched.
const lv_8sc_t* input_ptr = input;
const lv_8sc_t* carrier_ptr = carrier;
const lv_8sc_t* E_code_ptr = E_code;
lv_32fc_t* E_out_ptr = E_out;
const lv_8sc_t* L_code_ptr = L_code;
lv_32fc_t* L_out_ptr = L_out;
const lv_8sc_t* P_code_ptr = P_code;
lv_32fc_t* P_out_ptr = P_out;
// The outputs are accumulated into, so they start from zero.
*E_out_ptr = 0;
*P_out_ptr = 0;
*L_out_ptr = 0;
// Byte mask: 0xFF in every even byte (lowest byte first), 0x00 in every odd byte.
mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
E_code_acc = _mm_setzero_ps();
L_code_acc = _mm_setzero_ps();
P_code_acc = _mm_setzero_ps();
if (sse_iters>0)
{
for(int number = 0;number < sse_iters; number++){
// Carrier wipe-off: load 16 bytes of input and carrier (no alignment required)
x = _mm_lddqu_si128((__m128i*)input_ptr);
y = _mm_lddqu_si128((__m128i*)carrier_ptr);
// presumably splits each vector into real and imaginary 16-bit lanes and then forms
// the complex product input * carrier - verify in CommonMacros.h
CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx)
CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)
CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
// Early arm: the macro leaves its result in output_ps, which is added to the accumulator
y = _mm_lddqu_si128((__m128i*)E_code_ptr);
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
E_code_acc = _mm_add_ps (E_code_acc, output_ps);
// Prompt arm: same steps with the prompt code replica
y = _mm_lddqu_si128((__m128i*)P_code_ptr);
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
P_code_acc = _mm_add_ps (P_code_acc, output_ps);
// Late arm: same steps with the late code replica
y = _mm_lddqu_si128((__m128i*)L_code_ptr);
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
L_code_acc = _mm_add_ps (L_code_acc, output_ps);
// Advance every input cursor by the 8 samples consumed in this iteration
input_ptr += 8;
carrier_ptr += 8;
E_code_ptr += 8;
P_code_ptr += 8;
L_code_ptr += 8;
}
// Each accumulator holds four floats, stored here as two complex values
__VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2];
__VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2];
__VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2];
_mm_storeu_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector
// Fold the two partial complex sums of each arm into its output
for (int i = 0; i<2; ++i)
{
*E_out_ptr += E_dotProductVector[i];
*P_out_ptr += P_dotProductVector[i];
*L_out_ptr += L_dotProductVector[i];
}
}
// Scalar tail: the num_points % 8 samples that did not fill a whole vector
lv_8sc_t bb_signal_sample;
for(int i=0; i < num_points%8; ++i)
{
// Carrier wipe-off of one sample (the product is stored in the 8-bit complex type)
bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
// Correlate the baseband sample with the early, prompt and late replicas
*E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
*P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
*L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
}
}
#endif /* LV_HAVE_SSE4_1 */
#ifdef LV_HAVE_SSE2
#include "emmintrin.h"
#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
#include "CommonMacros/CommonMacros.h"
/*!
\brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (SSE2 version using unaligned loads)

The input is multiplied by the carrier and the product is multiplied by each of the
three code replicas. The products are summed, so each output is a single complex
value, not a vector.

\param E_out Early correlation output (a single complex value, overwritten)
\param P_out Prompt correlation output (a single complex value, overwritten)
\param L_out Late correlation output (a single complex value, overwritten)
\param input The input signal input
\param carrier The carrier signal input
\param E_code Early PRN code replica input
\param P_code Prompt PRN code replica input
\param L_code Late PRN code replica input
\param num_points The number of complex values in vectors
*/
static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_u_sse2(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points)
{
    // Each 128-bit load covers 8 input samples, so the vector loop runs num_points / 8 times.
    const unsigned int sse_iters = num_points / 8;

    // Scratch registers. Most of them are only handed to the CM_* macros as working storage.
    // NOTE(review): the CM_* macros are defined in the CommonMacros headers, which are not
    // visible here; what is said about them below is inferred from their names and
    // arguments - confirm against those headers.
    __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample;
    __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output;
    __m128 E_code_acc, P_code_acc, L_code_acc;
    __m128i input_i_1, input_i_2, output_i32;
    __m128 output_ps_1, output_ps_2;

    // Working cursors; the caller's pointers are left untouched.
    const lv_8sc_t* input_ptr = input;
    const lv_8sc_t* carrier_ptr = carrier;
    const lv_8sc_t* E_code_ptr = E_code;
    lv_32fc_t* E_out_ptr = E_out;
    const lv_8sc_t* L_code_ptr = L_code;
    lv_32fc_t* L_out_ptr = L_out;
    const lv_8sc_t* P_code_ptr = P_code;
    lv_32fc_t* P_out_ptr = P_out;

    // The outputs are accumulated into, so they start from zero.
    *E_out_ptr = 0;
    *P_out_ptr = 0;
    *L_out_ptr = 0;

    // Byte mask: 0xFF in every even byte (lowest byte first), 0x00 in every odd byte.
    mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);

    E_code_acc = _mm_setzero_ps();
    L_code_acc = _mm_setzero_ps();
    P_code_acc = _mm_setzero_ps();

    if (sse_iters > 0)
    {
        for (unsigned int number = 0; number < sse_iters; number++)
        {
            // Carrier wipe-off. _mm_loadu_si128 is the SSE2 unaligned load; the SSE3-only
            // _mm_lddqu_si128 must not be used in a kernel that is gated on LV_HAVE_SSE2.
            x = _mm_loadu_si128((const __m128i*)input_ptr);
            y = _mm_loadu_si128((const __m128i*)carrier_ptr);

            // presumably splits each vector into real and imaginary 16-bit lanes and then
            // forms the complex product input * carrier - verify in CommonMacros.h
            CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx)
            CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)
            CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)

            // Early arm: the macro leaves two float vectors, both added to the accumulator
            y = _mm_loadu_si128((const __m128i*)E_code_ptr);
            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
            E_code_acc = _mm_add_ps(E_code_acc, output_ps_1);
            E_code_acc = _mm_add_ps(E_code_acc, output_ps_2);

            // Prompt arm: same steps with the prompt code replica
            y = _mm_loadu_si128((const __m128i*)P_code_ptr);
            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
            P_code_acc = _mm_add_ps(P_code_acc, output_ps_1);
            P_code_acc = _mm_add_ps(P_code_acc, output_ps_2);

            // Late arm: same steps with the late code replica
            y = _mm_loadu_si128((const __m128i*)L_code_ptr);
            CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
            L_code_acc = _mm_add_ps(L_code_acc, output_ps_1);
            L_code_acc = _mm_add_ps(L_code_acc, output_ps_2);

            // Advance every input cursor by the 8 samples consumed in this iteration
            input_ptr += 8;
            carrier_ptr += 8;
            E_code_ptr += 8;
            P_code_ptr += 8;
            L_code_ptr += 8;
        }

        // Each accumulator holds four floats, stored here as two complex values
        __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2];
        __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2];
        __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2];

        _mm_storeu_ps((float*)E_dotProductVector, E_code_acc); // Store the results back into the dot product vector
        _mm_storeu_ps((float*)P_dotProductVector, P_code_acc); // Store the results back into the dot product vector
        _mm_storeu_ps((float*)L_dotProductVector, L_code_acc); // Store the results back into the dot product vector

        // Fold the two partial complex sums of each arm into its output
        for (int i = 0; i < 2; ++i)
        {
            *E_out_ptr += E_dotProductVector[i];
            *P_out_ptr += P_dotProductVector[i];
            *L_out_ptr += L_dotProductVector[i];
        }
    }

    // Scalar tail: the num_points % 8 samples that did not fill a whole vector
    lv_8sc_t bb_signal_sample;
    for (unsigned int i = 0; i < num_points % 8; ++i)
    {
        // Carrier wipe-off of one sample (the product is stored in the 8-bit complex type)
        bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
        // Correlate the baseband sample with the early, prompt and late replicas
        *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
        *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
        *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
    }
}
#endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_GENERIC
/*!
\brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (portable version)

The input is multiplied by the carrier and the product is multiplied by each of the
three code replicas. The products are summed, so each output is a single complex
value, not a vector.

\param E_out Early correlation output (a single complex value, overwritten)
\param P_out Prompt correlation output (a single complex value, overwritten)
\param L_out Late correlation output (a single complex value, overwritten)
\param input The input signal input
\param carrier The carrier signal input
\param E_code Early PRN code replica input
\param P_code Prompt PRN code replica input
\param L_code Late PRN code replica input
\param num_points The number of complex values in vectors
*/
static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_generic(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points)
{
    lv_8sc_t bb_signal_sample;

    // The outputs are accumulated into, so they start from zero.
    *E_out = 0;
    *P_out = 0;
    *L_out = 0;

    // perform Early, Prompt and Late correlation
    // The index has the same unsigned type as num_points, so the comparison is not
    // mixed-sign and the counter cannot overflow before reaching num_points.
    for (unsigned int i = 0; i < num_points; ++i)
    {
        // Carrier wipe-off of one sample (the product is stored in the 8-bit complex type)
        bb_signal_sample = input[i] * carrier[i];
        // Correlate the baseband sample with the early, prompt and late replicas
        *E_out += (lv_32fc_t) (bb_signal_sample * E_code[i]);
        *P_out += (lv_32fc_t) (bb_signal_sample * P_code[i]);
        *L_out += (lv_32fc_t) (bb_signal_sample * L_code[i]);
    }
}
#endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_u_H */
#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_a_H
#define INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_a_H
#include <inttypes.h>
#include <stdio.h>
#include <volk_gnsssdr/volk_gnsssdr_complex.h>
#include <float.h>
#include <string.h>
#ifdef LV_HAVE_SSE4_1
#include "smmintrin.h"
#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
#include "CommonMacros/CommonMacros.h"
/*!
\brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (SSE4.1 version using aligned loads)

The input is multiplied by the carrier and the product is multiplied by each of the
three code replicas. The products are summed, so each output is a single complex
value, not a vector.

\param E_out Early correlation output (a single complex value, overwritten)
\param P_out Prompt correlation output (a single complex value, overwritten)
\param L_out Late correlation output (a single complex value, overwritten)
\param input The input signal input (read with 16-byte aligned loads)
\param carrier The carrier signal input (read with 16-byte aligned loads)
\param E_code Early PRN code replica input (read with 16-byte aligned loads)
\param P_code Prompt PRN code replica input (read with 16-byte aligned loads)
\param L_code Late PRN code replica input (read with 16-byte aligned loads)
\param num_points The number of complex values in vectors
*/
static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_a_sse4_1(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points)
{
// Each 128-bit load below covers 8 input samples, so the vector loop runs num_points / 8 times.
const unsigned int sse_iters = num_points / 8;
// Scratch registers. Most of them are only handed to the CM_* macros as working storage.
// NOTE(review): the CM_* macros are defined in the CommonMacros headers, which are not
// visible here; what is said about them below is inferred from their names and
// arguments - confirm against those headers.
__m128i x, y, real_bb_signal_sample, imag_bb_signal_sample;
__m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output;
// Floating-point accumulators, one per correlator arm.
__m128 E_code_acc, P_code_acc, L_code_acc;
__m128i input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2;
__m128 output_ps;
// Working cursors; the caller's pointers are left untouched.
const lv_8sc_t* input_ptr = input;
const lv_8sc_t* carrier_ptr = carrier;
const lv_8sc_t* E_code_ptr = E_code;
lv_32fc_t* E_out_ptr = E_out;
const lv_8sc_t* L_code_ptr = L_code;
lv_32fc_t* L_out_ptr = L_out;
const lv_8sc_t* P_code_ptr = P_code;
lv_32fc_t* P_out_ptr = P_out;
// The outputs are accumulated into, so they start from zero.
*E_out_ptr = 0;
*P_out_ptr = 0;
*L_out_ptr = 0;
// Byte mask: 0xFF in every even byte (lowest byte first), 0x00 in every odd byte.
mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
E_code_acc = _mm_setzero_ps();
L_code_acc = _mm_setzero_ps();
P_code_acc = _mm_setzero_ps();
if (sse_iters>0)
{
for(int number = 0;number < sse_iters; number++){
// Carrier wipe-off: aligned 16-byte loads of input and carrier
x = _mm_load_si128((__m128i*)input_ptr);
y = _mm_load_si128((__m128i*)carrier_ptr);
// presumably splits each vector into real and imaginary 16-bit lanes and then forms
// the complex product input * carrier - verify in CommonMacros.h.
// The macros carry a _U_ in their names although this is the aligned kernel.
CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx)
CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)
CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
// Early arm: the macro leaves its result in output_ps, which is added to the accumulator
y = _mm_load_si128((__m128i*)E_code_ptr);
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
E_code_acc = _mm_add_ps (E_code_acc, output_ps);
// Prompt arm: same steps with the prompt code replica
y = _mm_load_si128((__m128i*)P_code_ptr);
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
P_code_acc = _mm_add_ps (P_code_acc, output_ps);
// Late arm: same steps with the late code replica
y = _mm_load_si128((__m128i*)L_code_ptr);
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
L_code_acc = _mm_add_ps (L_code_acc, output_ps);
// Advance every input cursor by the 8 samples consumed in this iteration
input_ptr += 8;
carrier_ptr += 8;
E_code_ptr += 8;
P_code_ptr += 8;
L_code_ptr += 8;
}
// Each accumulator holds four floats, stored here as two complex values.
// The aligned stores rely on __VOLK_ATTR_ALIGNED(16) - presumably a 16-byte alignment attribute.
__VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2];
__VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2];
__VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2];
_mm_store_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector
// Fold the two partial complex sums of each arm into its output
for (int i = 0; i<2; ++i)
{
*E_out_ptr += E_dotProductVector[i];
*P_out_ptr += P_dotProductVector[i];
*L_out_ptr += L_dotProductVector[i];
}
}
// Scalar tail: the num_points % 8 samples that did not fill a whole vector
lv_8sc_t bb_signal_sample;
for(int i=0; i < num_points%8; ++i)
{
// Carrier wipe-off of one sample (the product is stored in the 8-bit complex type)
bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
// Correlate the baseband sample with the early, prompt and late replicas
*E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
*P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
*L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
}
}
#endif /* LV_HAVE_SSE4_1 */
#ifdef LV_HAVE_SSE2
#include "emmintrin.h"
#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
#include "CommonMacros/CommonMacros.h"
/*!
\brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (SSE2 version using aligned loads)

The input is multiplied by the carrier and the product is multiplied by each of the
three code replicas. The products are summed, so each output is a single complex
value, not a vector.

\param E_out Early correlation output (a single complex value, overwritten)
\param P_out Prompt correlation output (a single complex value, overwritten)
\param L_out Late correlation output (a single complex value, overwritten)
\param input The input signal input (read with 16-byte aligned loads)
\param carrier The carrier signal input (read with 16-byte aligned loads)
\param E_code Early PRN code replica input (read with 16-byte aligned loads)
\param P_code Prompt PRN code replica input (read with 16-byte aligned loads)
\param L_code Late PRN code replica input (read with 16-byte aligned loads)
\param num_points The number of complex values in vectors
*/
static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_a_sse2(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points)
{
// Each 128-bit load below covers 8 input samples, so the vector loop runs num_points / 8 times.
const unsigned int sse_iters = num_points / 8;
// Scratch registers. Most of them are only handed to the CM_* macros as working storage.
// NOTE(review): the CM_* macros are defined in the CommonMacros headers, which are not
// visible here; what is said about them below is inferred from their names and
// arguments - confirm against those headers.
__m128i x, y, real_bb_signal_sample, imag_bb_signal_sample;
__m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output;
// Floating-point accumulators, one per correlator arm.
__m128 E_code_acc, P_code_acc, L_code_acc;
__m128i input_i_1, input_i_2, output_i32;
__m128 output_ps_1, output_ps_2;
// Working cursors; the caller's pointers are left untouched.
const lv_8sc_t* input_ptr = input;
const lv_8sc_t* carrier_ptr = carrier;
const lv_8sc_t* E_code_ptr = E_code;
lv_32fc_t* E_out_ptr = E_out;
const lv_8sc_t* L_code_ptr = L_code;
lv_32fc_t* L_out_ptr = L_out;
const lv_8sc_t* P_code_ptr = P_code;
lv_32fc_t* P_out_ptr = P_out;
// The outputs are accumulated into, so they start from zero.
*E_out_ptr = 0;
*P_out_ptr = 0;
*L_out_ptr = 0;
// Byte mask: 0xFF in every even byte (lowest byte first), 0x00 in every odd byte.
mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
E_code_acc = _mm_setzero_ps();
L_code_acc = _mm_setzero_ps();
P_code_acc = _mm_setzero_ps();
if (sse_iters>0)
{
for(int number = 0;number < sse_iters; number++){
// Carrier wipe-off: aligned 16-byte loads of input and carrier
x = _mm_load_si128((__m128i*)input_ptr);
y = _mm_load_si128((__m128i*)carrier_ptr);
// presumably splits each vector into real and imaginary 16-bit lanes and then forms
// the complex product input * carrier - verify in CommonMacros.h.
// The macros carry a _U_ in their names although this is the aligned kernel.
CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx)
CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)
CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
// Early arm: the macro leaves two float vectors, both added to the accumulator
y = _mm_load_si128((__m128i*)E_code_ptr);
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
E_code_acc = _mm_add_ps (E_code_acc, output_ps_1);
E_code_acc = _mm_add_ps (E_code_acc, output_ps_2);
// Prompt arm: same steps with the prompt code replica
y = _mm_load_si128((__m128i*)P_code_ptr);
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
P_code_acc = _mm_add_ps (P_code_acc, output_ps_1);
P_code_acc = _mm_add_ps (P_code_acc, output_ps_2);
// Late arm: same steps with the late code replica
y = _mm_load_si128((__m128i*)L_code_ptr);
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
L_code_acc = _mm_add_ps (L_code_acc, output_ps_1);
L_code_acc = _mm_add_ps (L_code_acc, output_ps_2);
// Advance every input cursor by the 8 samples consumed in this iteration
input_ptr += 8;
carrier_ptr += 8;
E_code_ptr += 8;
P_code_ptr += 8;
L_code_ptr += 8;
}
// Each accumulator holds four floats, stored here as two complex values.
// The aligned stores rely on __VOLK_ATTR_ALIGNED(16) - presumably a 16-byte alignment attribute.
__VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2];
__VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2];
__VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2];
_mm_store_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector
_mm_store_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector
// Fold the two partial complex sums of each arm into its output
for (int i = 0; i<2; ++i)
{
*E_out_ptr += E_dotProductVector[i];
*P_out_ptr += P_dotProductVector[i];
*L_out_ptr += L_dotProductVector[i];
}
}
// Scalar tail: the num_points % 8 samples that did not fill a whole vector
lv_8sc_t bb_signal_sample;
for(int i=0; i < num_points%8; ++i)
{
// Carrier wipe-off of one sample (the product is stored in the 8-bit complex type)
bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
// Correlate the baseband sample with the early, prompt and late replicas
*E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
*P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
*L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
}
}
#endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_GENERIC
/*!
\brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (portable version)

The input is multiplied by the carrier and the product is multiplied by each of the
three code replicas. The products are summed, so each output is a single complex
value, not a vector.

\param E_out Early correlation output (a single complex value, overwritten)
\param P_out Prompt correlation output (a single complex value, overwritten)
\param L_out Late correlation output (a single complex value, overwritten)
\param input The input signal input
\param carrier The carrier signal input
\param E_code Early PRN code replica input
\param P_code Prompt PRN code replica input
\param L_code Late PRN code replica input
\param num_points The number of complex values in vectors
*/
static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_a_generic(lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points)
{
    lv_8sc_t bb_signal_sample;

    // The outputs are accumulated into, so they start from zero.
    *E_out = 0;
    *P_out = 0;
    *L_out = 0;

    // perform Early, Prompt and Late correlation
    // The index has the same unsigned type as num_points, so the comparison is not
    // mixed-sign and the counter cannot overflow before reaching num_points.
    for (unsigned int i = 0; i < num_points; ++i)
    {
        // Carrier wipe-off of one sample (the product is stored in the 8-bit complex type)
        bb_signal_sample = input[i] * carrier[i];
        // Correlate the baseband sample with the early, prompt and late replicas
        *E_out += (lv_32fc_t) (bb_signal_sample * E_code[i]);
        *P_out += (lv_32fc_t) (bb_signal_sample * P_code[i]);
        *L_out += (lv_32fc_t) (bb_signal_sample * L_code[i]);
    }
}
#endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3_a_H */

View File

@ -0,0 +1,874 @@
/*!
* \file volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3.h
* \brief Volk protokernel: performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation with 16 bits vectors
* \authors <ul>
* <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
* </ul>
*
* Volk protokernel that performs the carrier wipe-off mixing and the
* Early, Prompt, and Late correlation with 16 bits vectors (8 bits the
* real part and 8 bits the imaginary part):
* - The carrier wipe-off is done by multiplying the input signal by the
* carrier (multiplication of 16 bits vectors) It returns the input
* signal in base band (BB)
* - Early values are calculated by multiplying the input signal in BB by the
* early code (multiplication of 16 bits vectors), accumulating the results
* - Prompt values are calculated by multiplying the input signal in BB by the
* prompt code (multiplication of 16 bits vectors), accumulating the results
* - Late values are calculated by multiplying the input signal in BB by the
* late code (multiplication of 16 bits vectors), accumulating the results
*
* -------------------------------------------------------------------------
*
* Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
*
* GNSS-SDR is a software defined Global Navigation
* Satellite Systems receiver
*
* This file is part of GNSS-SDR.
*
* GNSS-SDR is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* GNSS-SDR is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
*
* -------------------------------------------------------------------------
*/
#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_u_H
#define INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_u_H
#include <inttypes.h>
#include <stdio.h>
#include <volk_gnsssdr/volk_gnsssdr_complex.h>
#include <float.h>
#include <string.h>
#ifdef LV_HAVE_SSE4_1
#include "smmintrin.h"
/*!
\brief Carrier wipe-off mixing followed by Early, Prompt and Late correlation, with 8-bit complex outputs (SSE4.1, unaligned loads)

Every input sample is multiplied by the matching carrier sample; that product is then
multiplied by the early, prompt and late code samples, and the three series of products
are summed into one complex value each.

\param E_out Early correlation output (a single complex value, overwritten)
\param P_out Prompt correlation output (a single complex value, overwritten)
\param L_out Late correlation output (a single complex value, overwritten)
\param input The input signal
\param carrier The carrier signal
\param E_code Early PRN code replica
\param P_code Prompt PRN code replica
\param L_code Late PRN code replica
\param num_points The number of complex values in vectors
*/
static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_u_sse4_1(lv_8sc_t* E_out, lv_8sc_t* P_out, lv_8sc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points)
{
    // One 128-bit register holds 8 complex samples of 2 bytes each.
    const unsigned int full_blocks = num_points / 8;
    const unsigned int leftover = num_points % 8;

    // Read cursors, shared by the vector loop and the scalar tail.
    const lv_8sc_t* sig_cur = input;
    const lv_8sc_t* car_cur = carrier;
    const lv_8sc_t* early_cur = E_code;
    const lv_8sc_t* prompt_cur = P_code;
    const lv_8sc_t* late_cur = L_code;

    *E_out = 0;
    *P_out = 0;
    *L_out = 0;

    if (full_blocks > 0)
    {
        // 0xFF in every even byte: keeps the low byte of each 16-bit lane.
        const __m128i even_bytes = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);

        // 16-bit running sums, real and imaginary kept in separate registers.
        __m128i early_re = _mm_setzero_si128();
        __m128i early_im = _mm_setzero_si128();
        __m128i prompt_re = _mm_setzero_si128();
        __m128i prompt_im = _mm_setzero_si128();
        __m128i late_re = _mm_setzero_si128();
        __m128i late_im = _mm_setzero_si128();

        __m128i code, code_re, code_im;

        for (unsigned int blk = 0; blk < full_blocks; ++blk)
        {
            // Split signal and carrier into real (even bytes) and imaginary (odd bytes)
            // parts, each left in the low byte of a 16-bit lane.
            const __m128i sig = _mm_lddqu_si128((const __m128i*)sig_cur);
            const __m128i car = _mm_lddqu_si128((const __m128i*)car_cur);
            const __m128i sig_re = _mm_and_si128(sig, even_bytes);
            const __m128i sig_im = _mm_and_si128(_mm_srli_si128(sig, 1), even_bytes);
            const __m128i car_re = _mm_and_si128(car, even_bytes);
            const __m128i car_im = _mm_and_si128(_mm_srli_si128(car, 1), even_bytes);

            // Baseband sample = signal * carrier (complex product, low 16 bits of each term)
            const __m128i bb_re = _mm_sub_epi16(_mm_mullo_epi16(sig_re, car_re), _mm_mullo_epi16(sig_im, car_im));
            const __m128i bb_im = _mm_add_epi16(_mm_mullo_epi16(sig_re, car_im), _mm_mullo_epi16(sig_im, car_re));

            // Early arm
            code = _mm_lddqu_si128((const __m128i*)early_cur);
            code_re = _mm_and_si128(code, even_bytes);
            code_im = _mm_and_si128(_mm_srli_si128(code, 1), even_bytes);
            early_re = _mm_add_epi16(early_re, _mm_sub_epi16(_mm_mullo_epi16(bb_re, code_re), _mm_mullo_epi16(bb_im, code_im)));
            early_im = _mm_add_epi16(early_im, _mm_add_epi16(_mm_mullo_epi16(bb_re, code_im), _mm_mullo_epi16(bb_im, code_re)));

            // Late arm
            code = _mm_lddqu_si128((const __m128i*)late_cur);
            code_re = _mm_and_si128(code, even_bytes);
            code_im = _mm_and_si128(_mm_srli_si128(code, 1), even_bytes);
            late_re = _mm_add_epi16(late_re, _mm_sub_epi16(_mm_mullo_epi16(bb_re, code_re), _mm_mullo_epi16(bb_im, code_im)));
            late_im = _mm_add_epi16(late_im, _mm_add_epi16(_mm_mullo_epi16(bb_re, code_im), _mm_mullo_epi16(bb_im, code_re)));

            // Prompt arm
            code = _mm_lddqu_si128((const __m128i*)prompt_cur);
            code_re = _mm_and_si128(code, even_bytes);
            code_im = _mm_and_si128(_mm_srli_si128(code, 1), even_bytes);
            prompt_re = _mm_add_epi16(prompt_re, _mm_sub_epi16(_mm_mullo_epi16(bb_re, code_re), _mm_mullo_epi16(bb_im, code_im)));
            prompt_im = _mm_add_epi16(prompt_im, _mm_add_epi16(_mm_mullo_epi16(bb_re, code_im), _mm_mullo_epi16(bb_im, code_re)));

            sig_cur += 8;
            car_cur += 8;
            early_cur += 8;
            late_cur += 8;
            prompt_cur += 8;
        }

        // Re-interleave: the low byte of each real sum goes to the even bytes and the low
        // byte of each imaginary sum (moved up one byte) to the odd bytes, giving 8
        // partial complex sums per arm.
        __VOLK_ATTR_ALIGNED(16) lv_8sc_t early_partial[8];
        __VOLK_ATTR_ALIGNED(16) lv_8sc_t late_partial[8];
        __VOLK_ATTR_ALIGNED(16) lv_8sc_t prompt_partial[8];

        _mm_storeu_si128((__m128i*)early_partial, _mm_blendv_epi8(_mm_slli_si128(early_im, 1), early_re, even_bytes));
        _mm_storeu_si128((__m128i*)late_partial, _mm_blendv_epi8(_mm_slli_si128(late_im, 1), late_re, even_bytes));
        _mm_storeu_si128((__m128i*)prompt_partial, _mm_blendv_epi8(_mm_slli_si128(prompt_im, 1), prompt_re, even_bytes));

        // Horizontal reduction of the 8 partial sums into each output.
        for (int k = 0; k < 8; ++k)
        {
            *E_out += early_partial[k];
            *L_out += late_partial[k];
            *P_out += prompt_partial[k];
        }
    }

    // Scalar tail for the samples that did not fill a whole vector.
    for (unsigned int k = 0; k < leftover; ++k)
    {
        // Carrier wipe-off of one sample
        const lv_8sc_t bb_sample = (*sig_cur++) * (*car_cur++);
        // Early, prompt and late products for this sample
        *E_out += bb_sample * (*early_cur++);
        *P_out += bb_sample * (*prompt_cur++);
        *L_out += bb_sample * (*late_cur++);
    }
}
#endif /* LV_HAVE_SSE4_1 */
#ifdef LV_HAVE_SSE2
#include "emmintrin.h"
/*!
    \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (SSE2 version, unaligned loads)
    \param E_out Early correlation output
    \param P_out Prompt correlation output
    \param L_out Late correlation output
    \param input The input signal input
    \param carrier The carrier signal input
    \param E_code Early PRN code replica input
    \param P_code Prompt PRN code replica input
    \param L_code Late PRN code replica input
    \param num_points The number of complex values in vectors
*/
static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_u_sse2(lv_8sc_t* E_out, lv_8sc_t* P_out, lv_8sc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points)
{
    // Each SSE iteration consumes 8 complex samples (one 128-bit load per vector).
    const unsigned int sse_iters = num_points / 8;
    unsigned int number;
    unsigned int i;

    __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample, real_E_code_acc, imag_E_code_acc, real_L_code_acc, imag_L_code_acc, real_P_code_acc, imag_P_code_acc;
    __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output;

    const lv_8sc_t* input_ptr = input;
    const lv_8sc_t* carrier_ptr = carrier;

    const lv_8sc_t* E_code_ptr = E_code;
    lv_8sc_t* E_out_ptr = E_out;
    const lv_8sc_t* L_code_ptr = L_code;
    lv_8sc_t* L_out_ptr = L_out;
    const lv_8sc_t* P_code_ptr = P_code;
    lv_8sc_t* P_out_ptr = P_out;

    *E_out_ptr = 0;
    *P_out_ptr = 0;
    *L_out_ptr = 0;

    // Mask that keeps the low byte of every 16-bit lane (0x00FF pattern).
    mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);

    real_E_code_acc = _mm_setzero_si128();
    imag_E_code_acc = _mm_setzero_si128();
    real_L_code_acc = _mm_setzero_si128();
    imag_L_code_acc = _mm_setzero_si128();
    real_P_code_acc = _mm_setzero_si128();
    imag_P_code_acc = _mm_setzero_si128();

    if (sse_iters > 0)
        {
            for (number = 0; number < sse_iters; number++)
                {
                    // Perform the carrier wipe-off.
                    // _mm_loadu_si128 is the SSE2 unaligned load; _mm_lddqu_si128 is an
                    // SSE3 intrinsic and is not available when only SSE2 is enabled.
                    x = _mm_loadu_si128((const __m128i*)input_ptr);
                    y = _mm_loadu_si128((const __m128i*)carrier_ptr);

                    // Split the interleaved bytes: even bytes are treated as real parts,
                    // odd bytes (shifted down by one byte) as imaginary parts, each
                    // ending up in the low byte of its own 16-bit lane.
                    imagx = _mm_srli_si128 (x, 1);
                    imagx = _mm_and_si128 (imagx, mult1);
                    realx = _mm_and_si128 (x, mult1);

                    imagy = _mm_srli_si128 (y, 1);
                    imagy = _mm_and_si128 (imagy, mult1);
                    realy = _mm_and_si128 (y, mult1);

                    // Complex product (a+bi)(c+di) = (ac - bd) + (ad + bc)i in 16-bit lanes.
                    realx_mult_realy = _mm_mullo_epi16 (realx, realy);
                    imagx_mult_imagy = _mm_mullo_epi16 (imagx, imagy);
                    realx_mult_imagy = _mm_mullo_epi16 (realx, imagy);
                    imagx_mult_realy = _mm_mullo_epi16 (imagx, realy);

                    real_bb_signal_sample = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
                    imag_bb_signal_sample = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);

                    // Get early values
                    y = _mm_loadu_si128((const __m128i*)E_code_ptr);

                    imagy = _mm_srli_si128 (y, 1);
                    imagy = _mm_and_si128 (imagy, mult1);
                    realy = _mm_and_si128 (y, mult1);

                    realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
                    imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
                    realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
                    imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);

                    real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
                    imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);

                    real_E_code_acc = _mm_add_epi16 (real_E_code_acc, real_output);
                    imag_E_code_acc = _mm_add_epi16 (imag_E_code_acc, imag_output);

                    // Get late values
                    y = _mm_loadu_si128((const __m128i*)L_code_ptr);

                    imagy = _mm_srli_si128 (y, 1);
                    imagy = _mm_and_si128 (imagy, mult1);
                    realy = _mm_and_si128 (y, mult1);

                    realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
                    imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
                    realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
                    imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);

                    real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
                    imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);

                    real_L_code_acc = _mm_add_epi16 (real_L_code_acc, real_output);
                    imag_L_code_acc = _mm_add_epi16 (imag_L_code_acc, imag_output);

                    // Get prompt values
                    y = _mm_loadu_si128((const __m128i*)P_code_ptr);

                    imagy = _mm_srli_si128 (y, 1);
                    imagy = _mm_and_si128 (imagy, mult1);
                    realy = _mm_and_si128 (y, mult1);

                    realx_mult_realy = _mm_mullo_epi16 (real_bb_signal_sample, realy);
                    imagx_mult_imagy = _mm_mullo_epi16 (imag_bb_signal_sample, imagy);
                    realx_mult_imagy = _mm_mullo_epi16 (real_bb_signal_sample, imagy);
                    imagx_mult_realy = _mm_mullo_epi16 (imag_bb_signal_sample, realy);

                    real_output = _mm_sub_epi16 (realx_mult_realy, imagx_mult_imagy);
                    imag_output = _mm_add_epi16 (realx_mult_imagy, imagx_mult_realy);

                    real_P_code_acc = _mm_add_epi16 (real_P_code_acc, real_output);
                    imag_P_code_acc = _mm_add_epi16 (imag_P_code_acc, imag_output);

                    input_ptr += 8;
                    carrier_ptr += 8;
                    E_code_ptr += 8;
                    L_code_ptr += 8;
                    P_code_ptr += 8;
                }

            __VOLK_ATTR_ALIGNED(16) lv_8sc_t E_dotProductVector[8];
            __VOLK_ATTR_ALIGNED(16) lv_8sc_t L_dotProductVector[8];
            __VOLK_ATTR_ALIGNED(16) lv_8sc_t P_dotProductVector[8];

            // Keep only the low byte of every 16-bit accumulator lane and re-interleave
            // the bytes as (real, imag) pairs before spilling them to memory.
            real_E_code_acc = _mm_and_si128 (real_E_code_acc, mult1);
            imag_E_code_acc = _mm_and_si128 (imag_E_code_acc, mult1);
            imag_E_code_acc = _mm_slli_si128 (imag_E_code_acc, 1);
            output = _mm_or_si128 (real_E_code_acc, imag_E_code_acc);
            _mm_storeu_si128((__m128i*)E_dotProductVector, output);

            real_L_code_acc = _mm_and_si128 (real_L_code_acc, mult1);
            imag_L_code_acc = _mm_and_si128 (imag_L_code_acc, mult1);
            imag_L_code_acc = _mm_slli_si128 (imag_L_code_acc, 1);
            output = _mm_or_si128 (real_L_code_acc, imag_L_code_acc);
            _mm_storeu_si128((__m128i*)L_dotProductVector, output);

            real_P_code_acc = _mm_and_si128 (real_P_code_acc, mult1);
            imag_P_code_acc = _mm_and_si128 (imag_P_code_acc, mult1);
            imag_P_code_acc = _mm_slli_si128 (imag_P_code_acc, 1);
            output = _mm_or_si128 (real_P_code_acc, imag_P_code_acc);
            _mm_storeu_si128((__m128i*)P_dotProductVector, output);

            // Horizontal reduction of the 8 partial sums of each correlator.
            for (i = 0; i < 8; ++i)
                {
                    *E_out_ptr += E_dotProductVector[i];
                    *L_out_ptr += L_dotProductVector[i];
                    *P_out_ptr += P_dotProductVector[i];
                }
        }

    // Scalar tail: the remaining (num_points % 8) samples.
    lv_8sc_t bb_signal_sample;
    for (i = 0; i < num_points % 8; ++i)
        {
            // Perform the carrier wipe-off
            bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
            // Now get early, late, and prompt values for each
            *E_out_ptr += bb_signal_sample * (*E_code_ptr++);
            *P_out_ptr += bb_signal_sample * (*P_code_ptr++);
            *L_out_ptr += bb_signal_sample * (*L_code_ptr++);
        }
}
#endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_GENERIC
/*!
    \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (generic version)
    \param E_out Early correlation output
    \param P_out Prompt correlation output
    \param L_out Late correlation output
    \param input The input signal input
    \param carrier The carrier signal input
    \param E_code Early PRN code replica input
    \param P_code Prompt PRN code replica input
    \param L_code Late PRN code replica input
    \param num_points The number of complex values in vectors
*/
static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_generic(lv_8sc_t* E_out, lv_8sc_t* P_out, lv_8sc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points)
{
    // The index has the same type as num_points, so the comparison is not
    // mixed-sign and the counter cannot overflow for large vectors.
    unsigned int i;

    *E_out = 0;
    *P_out = 0;
    *L_out = 0;

    // perform Early, Prompt and Late correlation
    for (i = 0; i < num_points; ++i)
        {
            // Perform the carrier wipe-off
            const lv_8sc_t bb_signal_sample = input[i] * carrier[i];
            // Now get early, late, and prompt values for each
            *E_out += bb_signal_sample * E_code[i];
            *P_out += bb_signal_sample * P_code[i];
            *L_out += bb_signal_sample * L_code[i];
        }
}
#endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_u_H */
#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_a_H
#define INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_a_H
#include <inttypes.h>
#include <stdio.h>
#include <volk_gnsssdr/volk_gnsssdr_complex.h>
#include <float.h>
#include <string.h>
#ifdef LV_HAVE_SSE4_1
#include "smmintrin.h"
/*!
    \brief Carrier wipe-off followed by Early, Prompt and Late correlation (SSE4.1 version, aligned loads)
    \param E_out Early correlation output
    \param P_out Prompt correlation output
    \param L_out Late correlation output
    \param input The input signal input
    \param carrier The carrier signal input
    \param E_code Early PRN code replica input
    \param P_code Prompt PRN code replica input
    \param L_code Late PRN code replica input
    \param num_points The number of complex values in vectors
*/
static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_a_sse4_1(lv_8sc_t* E_out, lv_8sc_t* P_out, lv_8sc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points)
{
    // 8 complex samples are processed per vector iteration; the rest go to the scalar tail.
    const unsigned int full_blocks = num_points / 8;
    const unsigned int leftover = num_points % 8;
    unsigned int t;

    // 0x00FF in every 16-bit lane: selects the low byte of each lane.
    const __m128i low_byte_mask = _mm_set1_epi16(0x00FF);

    const lv_8sc_t* in_it = input;
    const lv_8sc_t* carr_it = carrier;
    const lv_8sc_t* early_it = E_code;
    const lv_8sc_t* late_it = L_code;
    const lv_8sc_t* prompt_it = P_code;

    // Separate 16-bit-lane accumulators for real and imaginary parts of each correlator.
    __m128i early_re = _mm_setzero_si128();
    __m128i early_im = _mm_setzero_si128();
    __m128i late_re = _mm_setzero_si128();
    __m128i late_im = _mm_setzero_si128();
    __m128i prompt_re = _mm_setzero_si128();
    __m128i prompt_im = _mm_setzero_si128();

    *E_out = 0;
    *P_out = 0;
    *L_out = 0;

    if (full_blocks > 0)
        {
            unsigned int blk;
            unsigned int lane;

            for (blk = 0; blk < full_blocks; ++blk)
                {
                    // Carrier wipe-off: split both vectors into real (even bytes) and
                    // imaginary (odd bytes) parts, one value per 16-bit lane, then
                    // form the complex product.
                    const __m128i sig = _mm_load_si128((__m128i*)in_it);
                    const __m128i osc = _mm_load_si128((__m128i*)carr_it);

                    const __m128i sig_im = _mm_and_si128(_mm_srli_si128(sig, 1), low_byte_mask);
                    const __m128i sig_re = _mm_and_si128(sig, low_byte_mask);
                    const __m128i osc_im = _mm_and_si128(_mm_srli_si128(osc, 1), low_byte_mask);
                    const __m128i osc_re = _mm_and_si128(osc, low_byte_mask);

                    const __m128i bb_re = _mm_sub_epi16(_mm_mullo_epi16(sig_re, osc_re), _mm_mullo_epi16(sig_im, osc_im));
                    const __m128i bb_im = _mm_add_epi16(_mm_mullo_epi16(sig_re, osc_im), _mm_mullo_epi16(sig_im, osc_re));

                    __m128i code, code_re, code_im;

                    // Early replica
                    code = _mm_load_si128((__m128i*)early_it);
                    code_im = _mm_and_si128(_mm_srli_si128(code, 1), low_byte_mask);
                    code_re = _mm_and_si128(code, low_byte_mask);
                    early_re = _mm_add_epi16(early_re, _mm_sub_epi16(_mm_mullo_epi16(bb_re, code_re), _mm_mullo_epi16(bb_im, code_im)));
                    early_im = _mm_add_epi16(early_im, _mm_add_epi16(_mm_mullo_epi16(bb_re, code_im), _mm_mullo_epi16(bb_im, code_re)));

                    // Late replica
                    code = _mm_load_si128((__m128i*)late_it);
                    code_im = _mm_and_si128(_mm_srli_si128(code, 1), low_byte_mask);
                    code_re = _mm_and_si128(code, low_byte_mask);
                    late_re = _mm_add_epi16(late_re, _mm_sub_epi16(_mm_mullo_epi16(bb_re, code_re), _mm_mullo_epi16(bb_im, code_im)));
                    late_im = _mm_add_epi16(late_im, _mm_add_epi16(_mm_mullo_epi16(bb_re, code_im), _mm_mullo_epi16(bb_im, code_re)));

                    // Prompt replica
                    code = _mm_load_si128((__m128i*)prompt_it);
                    code_im = _mm_and_si128(_mm_srli_si128(code, 1), low_byte_mask);
                    code_re = _mm_and_si128(code, low_byte_mask);
                    prompt_re = _mm_add_epi16(prompt_re, _mm_sub_epi16(_mm_mullo_epi16(bb_re, code_re), _mm_mullo_epi16(bb_im, code_im)));
                    prompt_im = _mm_add_epi16(prompt_im, _mm_add_epi16(_mm_mullo_epi16(bb_re, code_im), _mm_mullo_epi16(bb_im, code_re)));

                    in_it += 8;
                    carr_it += 8;
                    early_it += 8;
                    late_it += 8;
                    prompt_it += 8;
                }

            __VOLK_ATTR_ALIGNED(16) lv_8sc_t early_lanes[8];
            __VOLK_ATTR_ALIGNED(16) lv_8sc_t late_lanes[8];
            __VOLK_ATTR_ALIGNED(16) lv_8sc_t prompt_lanes[8];

            // Re-interleave: the byte blend takes the low byte of each real lane for
            // the even positions and the (byte-shifted) low byte of each imaginary
            // lane for the odd positions.
            _mm_store_si128((__m128i*)early_lanes, _mm_blendv_epi8(_mm_slli_si128(early_im, 1), early_re, low_byte_mask));
            _mm_store_si128((__m128i*)late_lanes, _mm_blendv_epi8(_mm_slli_si128(late_im, 1), late_re, low_byte_mask));
            _mm_store_si128((__m128i*)prompt_lanes, _mm_blendv_epi8(_mm_slli_si128(prompt_im, 1), prompt_re, low_byte_mask));

            // Fold the 8 partial sums of each correlator into its output.
            for (lane = 0; lane < 8; ++lane)
                {
                    *E_out += early_lanes[lane];
                    *L_out += late_lanes[lane];
                    *P_out += prompt_lanes[lane];
                }
        }

    // Scalar processing of the samples that did not fill a whole vector.
    for (t = 0; t < leftover; ++t)
        {
            const lv_8sc_t bb_sample = (*in_it++) * (*carr_it++);
            *E_out += bb_sample * (*early_it++);
            *P_out += bb_sample * (*prompt_it++);
            *L_out += bb_sample * (*late_it++);
        }
}
#endif /* LV_HAVE_SSE4_1 */
#ifdef LV_HAVE_SSE2
#include "emmintrin.h"
/*!
    \brief Carrier wipe-off followed by Early, Prompt and Late correlation (SSE2 version, aligned loads)
    \param E_out Early correlation output
    \param P_out Prompt correlation output
    \param L_out Late correlation output
    \param input The input signal input
    \param carrier The carrier signal input
    \param E_code Early PRN code replica input
    \param P_code Prompt PRN code replica input
    \param L_code Late PRN code replica input
    \param num_points The number of complex values in vectors
*/
static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_a_sse2(lv_8sc_t* E_out, lv_8sc_t* P_out, lv_8sc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points)
{
    // 8 complex samples are processed per vector iteration; the rest go to the scalar tail.
    const unsigned int full_blocks = num_points / 8;
    const unsigned int leftover = num_points % 8;
    unsigned int t;

    // 0x00FF in every 16-bit lane: selects the low byte of each lane.
    const __m128i low_byte_mask = _mm_set1_epi16(0x00FF);

    const lv_8sc_t* in_it = input;
    const lv_8sc_t* carr_it = carrier;
    const lv_8sc_t* early_it = E_code;
    const lv_8sc_t* late_it = L_code;
    const lv_8sc_t* prompt_it = P_code;

    // Separate 16-bit-lane accumulators for real and imaginary parts of each correlator.
    __m128i early_re = _mm_setzero_si128();
    __m128i early_im = _mm_setzero_si128();
    __m128i late_re = _mm_setzero_si128();
    __m128i late_im = _mm_setzero_si128();
    __m128i prompt_re = _mm_setzero_si128();
    __m128i prompt_im = _mm_setzero_si128();

    *E_out = 0;
    *P_out = 0;
    *L_out = 0;

    if (full_blocks > 0)
        {
            unsigned int blk;
            unsigned int lane;

            for (blk = 0; blk < full_blocks; ++blk)
                {
                    // Carrier wipe-off: split both vectors into real (even bytes) and
                    // imaginary (odd bytes) parts, one value per 16-bit lane, then
                    // form the complex product.
                    const __m128i sig = _mm_load_si128((__m128i*)in_it);
                    const __m128i osc = _mm_load_si128((__m128i*)carr_it);

                    const __m128i sig_im = _mm_and_si128(_mm_srli_si128(sig, 1), low_byte_mask);
                    const __m128i sig_re = _mm_and_si128(sig, low_byte_mask);
                    const __m128i osc_im = _mm_and_si128(_mm_srli_si128(osc, 1), low_byte_mask);
                    const __m128i osc_re = _mm_and_si128(osc, low_byte_mask);

                    const __m128i bb_re = _mm_sub_epi16(_mm_mullo_epi16(sig_re, osc_re), _mm_mullo_epi16(sig_im, osc_im));
                    const __m128i bb_im = _mm_add_epi16(_mm_mullo_epi16(sig_re, osc_im), _mm_mullo_epi16(sig_im, osc_re));

                    __m128i code, code_re, code_im;

                    // Early replica
                    code = _mm_load_si128((__m128i*)early_it);
                    code_im = _mm_and_si128(_mm_srli_si128(code, 1), low_byte_mask);
                    code_re = _mm_and_si128(code, low_byte_mask);
                    early_re = _mm_add_epi16(early_re, _mm_sub_epi16(_mm_mullo_epi16(bb_re, code_re), _mm_mullo_epi16(bb_im, code_im)));
                    early_im = _mm_add_epi16(early_im, _mm_add_epi16(_mm_mullo_epi16(bb_re, code_im), _mm_mullo_epi16(bb_im, code_re)));

                    // Late replica
                    code = _mm_load_si128((__m128i*)late_it);
                    code_im = _mm_and_si128(_mm_srli_si128(code, 1), low_byte_mask);
                    code_re = _mm_and_si128(code, low_byte_mask);
                    late_re = _mm_add_epi16(late_re, _mm_sub_epi16(_mm_mullo_epi16(bb_re, code_re), _mm_mullo_epi16(bb_im, code_im)));
                    late_im = _mm_add_epi16(late_im, _mm_add_epi16(_mm_mullo_epi16(bb_re, code_im), _mm_mullo_epi16(bb_im, code_re)));

                    // Prompt replica
                    code = _mm_load_si128((__m128i*)prompt_it);
                    code_im = _mm_and_si128(_mm_srli_si128(code, 1), low_byte_mask);
                    code_re = _mm_and_si128(code, low_byte_mask);
                    prompt_re = _mm_add_epi16(prompt_re, _mm_sub_epi16(_mm_mullo_epi16(bb_re, code_re), _mm_mullo_epi16(bb_im, code_im)));
                    prompt_im = _mm_add_epi16(prompt_im, _mm_add_epi16(_mm_mullo_epi16(bb_re, code_im), _mm_mullo_epi16(bb_im, code_re)));

                    in_it += 8;
                    carr_it += 8;
                    early_it += 8;
                    late_it += 8;
                    prompt_it += 8;
                }

            __VOLK_ATTR_ALIGNED(16) lv_8sc_t early_lanes[8];
            __VOLK_ATTR_ALIGNED(16) lv_8sc_t late_lanes[8];
            __VOLK_ATTR_ALIGNED(16) lv_8sc_t prompt_lanes[8];

            // Re-interleave: keep the low byte of every real lane in the even byte
            // positions and move the low byte of every imaginary lane into the odd
            // byte positions, then merge the two with a bitwise OR.
            _mm_store_si128((__m128i*)early_lanes,
                _mm_or_si128(_mm_and_si128(early_re, low_byte_mask),
                             _mm_slli_si128(_mm_and_si128(early_im, low_byte_mask), 1)));
            _mm_store_si128((__m128i*)late_lanes,
                _mm_or_si128(_mm_and_si128(late_re, low_byte_mask),
                             _mm_slli_si128(_mm_and_si128(late_im, low_byte_mask), 1)));
            _mm_store_si128((__m128i*)prompt_lanes,
                _mm_or_si128(_mm_and_si128(prompt_re, low_byte_mask),
                             _mm_slli_si128(_mm_and_si128(prompt_im, low_byte_mask), 1)));

            // Fold the 8 partial sums of each correlator into its output.
            for (lane = 0; lane < 8; ++lane)
                {
                    *E_out += early_lanes[lane];
                    *L_out += late_lanes[lane];
                    *P_out += prompt_lanes[lane];
                }
        }

    // Scalar processing of the samples that did not fill a whole vector.
    for (t = 0; t < leftover; ++t)
        {
            const lv_8sc_t bb_sample = (*in_it++) * (*carr_it++);
            *E_out += bb_sample * (*early_it++);
            *P_out += bb_sample * (*prompt_it++);
            *L_out += bb_sample * (*late_it++);
        }
}
#endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_GENERIC
/*!
    \brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (generic version)
    \param E_out Early correlation output
    \param P_out Prompt correlation output
    \param L_out Late correlation output
    \param input The input signal input
    \param carrier The carrier signal input
    \param E_code Early PRN code replica input
    \param P_code Prompt PRN code replica input
    \param L_code Late PRN code replica input
    \param num_points The number of complex values in vectors
*/
static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_a_generic(lv_8sc_t* E_out, lv_8sc_t* P_out, lv_8sc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points)
{
    // The index has the same type as num_points, so the comparison is not
    // mixed-sign and the counter cannot overflow for large vectors.
    unsigned int i;

    *E_out = 0;
    *P_out = 0;
    *L_out = 0;

    // perform Early, Prompt and Late correlation
    for (i = 0; i < num_points; ++i)
        {
            // Perform the carrier wipe-off
            const lv_8sc_t bb_signal_sample = input[i] * carrier[i];
            // Now get early, late, and prompt values for each
            *E_out += bb_signal_sample * E_code[i];
            *P_out += bb_signal_sample * P_code[i];
            *L_out += bb_signal_sample * L_code[i];
        }
}
#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_ORC
/*!
\brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation (ORC version)
\param input The input signal input
\param carrier The carrier signal input
\param E_code Early PRN code replica input
\param P_code Prompt PRN code replica input
\param L_code Late PRN code replica input
\param E_out Early correlation output
\param P_out Prompt correlation output
\param L_out Late correlation output
\param num_points The number of complex values in vectors
*/
// Externally defined ORC kernels. The first one produces the Early and Prompt
// accumulators, the second one the Late accumulator; each receives the input and
// carrier vectors and writes its results into 16-bit (short) accumulators.
extern void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_first_a_orc_impl(short* E_out_real, short* E_out_imag, short* P_out_real, short* P_out_imag, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, unsigned int num_points);
extern void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_second_a_orc_impl(short* L_out_real, short* L_out_imag, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* L_code, unsigned int num_points);
// NOTE(review): this wrapper is named "_u_orc" but lives in the "_a_H" section and
// calls the "_a_orc_impl" kernels - confirm the intended alignment variant.
static inline void volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_u_orc(lv_8sc_t* E_out, lv_8sc_t* P_out, lv_8sc_t* L_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, unsigned int num_points){
// For every 16-bit accumulator a char pointer is created and advanced by one, so
// it addresses the SECOND byte of the short.
// NOTE(review): which byte that is (most or least significant) depends on the
// target's endianness; on little-endian machines it is the most significant byte,
// whereas the SSE kernels of this file keep the low byte of each 16-bit lane.
// Confirm against the ORC implementation that this is the intended byte.
short E_out_real = 0;
short E_out_imag = 0;
char* E_out_real_c = (char*)&E_out_real;
E_out_real_c++;
char* E_out_imag_c = (char*)&E_out_imag;
E_out_imag_c++;
short P_out_real = 0;
short P_out_imag = 0;
char* P_out_real_c = (char*)&P_out_real;
P_out_real_c++;
char* P_out_imag_c = (char*)&P_out_imag;
P_out_imag_c++;
short L_out_real = 0;
short L_out_imag = 0;
char* L_out_real_c = (char*)&L_out_real;
L_out_real_c++;
char* L_out_imag_c = (char*)&L_out_imag;
L_out_imag_c++;
// Early and Prompt are computed by the first kernel, Late by the second one.
volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_first_a_orc_impl( &E_out_real, &E_out_imag, &P_out_real, &P_out_imag, input, carrier, E_code, P_code, num_points);
volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_second_a_orc_impl( &L_out_real, &L_out_imag, input, carrier, L_code, num_points);
// The ORC implementation of 8ic_x5_cw_epl_corr_8ic_x3 is split into two functions because a single
// function with all the code seemed to cause memory problems (bad access, segmentation fault).
// In addition, the maximum number of accumulators that can be used is 4, and 6 are needed.
// As a consequence the "carrier wipe-off" step is performed twice: once in each function.
// Merging all the ORC code into one function would be faster, because the "carrier wipe-off"
// step would then be performed only once.
// Build each complex output from the selected byte of its real and imaginary accumulators.
*E_out = lv_cmake(*E_out_real_c, *E_out_imag_c);
*P_out = lv_cmake(*P_out_real_c, *P_out_imag_c);
*L_out = lv_cmake(*L_out_real_c, *L_out_imag_c);
}
#endif /* LV_HAVE_ORC */
#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3_a_H */

View File

@ -0,0 +1,797 @@
/*!
* \file volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5.h
* \brief Volk protokernel: performs the carrier wipe-off mixing and the Very early, Early, Prompt, Late and very late correlation with 16 bits vectors, and accumulates the results into float32. In order to avoid overflow, if input, carrier and XX_code have the same number of bits, they must be values between -3 and 3 (2 bits).
* \authors <ul>
* <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
* </ul>
*
* Volk protokernel that performs the carrier wipe-off mixing and the
* Very early, Early, Prompt, Late and very late correlation with 16 bits vectors (8 bits the
* real part and 8 bits the imaginary part), and accumulates the result
* in 32 bits single point values, returning float32 values:
* - The carrier wipe-off is done by multiplying the input signal by the
* carrier (multiplication of 16 bits vectors) It returns the input
* signal in base band (BB)
* - Very Early values are calculated by multiplying the input signal in BB by the
* very early code (multiplication of 16 bits vectors), accumulating the results into float32 values
* - Early values are calculated by multiplying the input signal in BB by the
* early code (multiplication of 16 bits vectors), accumulating the results into float32 values
* - Prompt values are calculated by multiplying the input signal in BB by the
* prompt code (multiplication of 16 bits vectors), accumulating the results into float32 values
* - Late values are calculated by multiplying the input signal in BB by the
* late code (multiplication of 16 bits vectors), accumulating the results into float32 values
* - Very Late values are calculated by multiplying the input signal in BB by the
* very late code (multiplication of 16 bits vectors), accumulating the results into float32 values
*
* -------------------------------------------------------------------------
* Bits analysis
*
* input = 8 bits
* carrier = 8 bits
* XX_code = 8 bits
* XX_out = 8 bits
* bb_signal_sample = 8 bits
*
* bb_signal_sample = input*carrier -> 17 bits limited to 8 bits = input and carrier must be values between -7 and 7 to avoid overflow (3 bits)
*
* XX_out16 = XX_code*bb_signal_sample -> 17 bits limited to 8 bits = XX_code and bb_signal_sample must be values between -7 and 7 to avoid overflow (3 bits)
*
* conclusion = input and carrier must be values between -1 and 1 (1 bit) and XX_code must be values between -7 and 7 to avoid overflow (3 bits)
* If input, carrier and XX_code have the same number of bits, they must be values between -3 and 3 to avoid overflow (2 bits).
* -------------------------------------------------------------------------
*
* Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
*
* GNSS-SDR is a software defined Global Navigation
* Satellite Systems receiver
*
* This file is part of GNSS-SDR.
*
* GNSS-SDR is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* GNSS-SDR is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
*
* -------------------------------------------------------------------------
*/
#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_u_H
#define INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_u_H
#include <inttypes.h>
#include <stdio.h>
#include <volk_gnsssdr/volk_gnsssdr_complex.h>
#include <float.h>
#include <string.h>
#ifdef LV_HAVE_SSE4_1
#include "smmintrin.h"
#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
#include "CommonMacros/CommonMacros.h"
/*!
\brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation (SSE4.1 version, unaligned loads)
\param input The input signal input
\param carrier The carrier signal input
\param VE_code Very Early PRN code replica input
\param E_code Early PRN code replica input
\param P_code Prompt PRN code replica input
\param L_code Late PRN code replica input
\param VL_code Very Late PRN code replica input
\param VE_out Very Early correlation output
\param E_out Early correlation output
\param P_out Prompt correlation output
\param L_out Late correlation output
\param VL_out Very Late correlation output
\param num_points The number of complex values in vectors
*/
static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_u_sse4_1(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
{
// Each vector iteration consumes 8 complex samples of every input vector.
const unsigned int sse_iters = num_points / 8;
__m128i x, y, real_bb_signal_sample, imag_bb_signal_sample;
__m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output;
// One float accumulator register per correlator.
__m128 VE_code_acc, E_code_acc, P_code_acc, L_code_acc, VL_code_acc;
// Scratch registers handed to the CM_* macros below.
__m128i input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2;
__m128 output_ps;
const lv_8sc_t* input_ptr = input;
const lv_8sc_t* carrier_ptr = carrier;
const lv_8sc_t* VE_code_ptr = VE_code;
lv_32fc_t* VE_out_ptr = VE_out;
const lv_8sc_t* E_code_ptr = E_code;
lv_32fc_t* E_out_ptr = E_out;
const lv_8sc_t* P_code_ptr = P_code;
lv_32fc_t* P_out_ptr = P_out;
const lv_8sc_t* L_code_ptr = L_code;
lv_32fc_t* L_out_ptr = L_out;
const lv_8sc_t* VL_code_ptr = VL_code;
lv_32fc_t* VL_out_ptr = VL_out;
// The outputs are accumulated in place, so they are cleared first.
*VE_out_ptr = 0;
*E_out_ptr = 0;
*P_out_ptr = 0;
*L_out_ptr = 0;
*VL_out_ptr = 0;
// Byte mask with 0xFF in the low byte of every 16-bit lane.
mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
VE_code_acc = _mm_setzero_ps();
E_code_acc = _mm_setzero_ps();
P_code_acc = _mm_setzero_ps();
L_code_acc = _mm_setzero_ps();
VL_code_acc = _mm_setzero_ps();
if (sse_iters>0)
{
for(int number = 0;number < sse_iters; number++){
//Perform the carrier wipe-off
x = _mm_lddqu_si128((__m128i*)input_ptr);
y = _mm_lddqu_si128((__m128i*)carrier_ptr);
// NOTE(review): the CM_* macros come from the CommonMacros headers and are not
// visible here. Judging by their names and arguments, the two REARRANGE calls
// split x and y into real/imaginary 16-bit vectors and SCALAR_PRODUCT forms the
// complex product into real_bb_signal_sample / imag_bb_signal_sample - confirm
// against the macro definitions.
CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx)
CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)
CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)
// For every code replica: load 8 code samples, let the CORR macro produce
// output_ps (presumably the float partial correlation of the base-band samples
// with that code - TODO confirm), and add it to the replica's accumulator.
//Get very early values
y = _mm_lddqu_si128((__m128i*)VE_code_ptr);
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
VE_code_acc = _mm_add_ps (VE_code_acc, output_ps);
//Get early values
y = _mm_lddqu_si128((__m128i*)E_code_ptr);
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
E_code_acc = _mm_add_ps (E_code_acc, output_ps);
//Get prompt values
y = _mm_lddqu_si128((__m128i*)P_code_ptr);
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
P_code_acc = _mm_add_ps (P_code_acc, output_ps);
//Get late values
y = _mm_lddqu_si128((__m128i*)L_code_ptr);
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
L_code_acc = _mm_add_ps (L_code_acc, output_ps);
//Get very late values
y = _mm_lddqu_si128((__m128i*)VL_code_ptr);
CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
VL_code_acc = _mm_add_ps (VL_code_acc, output_ps);
// Advance every input by the 8 complex samples just processed.
input_ptr += 8;
carrier_ptr += 8;
VE_code_ptr += 8;
E_code_ptr += 8;
P_code_ptr += 8;
L_code_ptr += 8;
VL_code_ptr += 8;
}
// Each accumulator register (four floats) is spilled into a two-element
// lv_32fc_t array, whose elements are then summed into the output.
__VOLK_ATTR_ALIGNED(16) lv_32fc_t VE_dotProductVector[2];
__VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2];
__VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2];
__VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2];
__VOLK_ATTR_ALIGNED(16) lv_32fc_t VL_dotProductVector[2];
_mm_storeu_ps((float*)VE_dotProductVector,VE_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)VL_dotProductVector,VL_code_acc); // Store the results back into the dot product vector
for (int i = 0; i<2; ++i)
{
*VE_out_ptr += VE_dotProductVector[i];
*E_out_ptr += E_dotProductVector[i];
*P_out_ptr += P_dotProductVector[i];
*L_out_ptr += L_dotProductVector[i];
*VL_out_ptr += VL_dotProductVector[i];
}
}
// Scalar tail: the remaining (num_points % 8) samples. The product of the two
// lv_8sc_t values is cast to lv_32fc_t before being accumulated.
lv_8sc_t bb_signal_sample;
for(int i=0; i < num_points%8; ++i)
{
//Perform the carrier wipe-off
bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
// Now get very early, early, prompt, late and very late values for each
*VE_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VE_code_ptr++));
*E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
*P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
*L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
*VL_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VL_code_ptr++));
}
}
#endif /* LV_HAVE_SSE4_1 */
#ifdef LV_HAVE_SSE2
#include "emmintrin.h"
#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
#include "CommonMacros/CommonMacros.h"
/*!
    \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation (SSE2 kernel, unaligned loads)
    \param VE_out Very Early correlation output (a single accumulated value, overwritten)
    \param E_out Early correlation output (a single accumulated value, overwritten)
    \param P_out Prompt correlation output (a single accumulated value, overwritten)
    \param L_out Late correlation output (a single accumulated value, overwritten)
    \param VL_out Very Late correlation output (a single accumulated value, overwritten)
    \param input The input signal input
    \param carrier The carrier signal input
    \param VE_code Very Early PRN code replica input
    \param E_code Early PRN code replica input
    \param P_code Prompt PRN code replica input
    \param L_code Late PRN code replica input
    \param VL_code Very Late PRN code replica input
    \param num_points The number of complex values in vectors
*/
static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_u_sse2(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
{
    // Each vector iteration consumes 8 complex samples; the remainder is handled by the scalar loop below.
    const unsigned int sse_iters = num_points / 8;

    __m128i x, y, real_bb_signal_sample, imag_bb_signal_sample;
    __m128i mult1, realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, output, real_output, imag_output;

    __m128 VE_code_acc, E_code_acc, P_code_acc, L_code_acc, VL_code_acc;
    __m128i input_i_1, input_i_2, output_i32;
    __m128 output_ps_1, output_ps_2;

    const lv_8sc_t* input_ptr = input;
    const lv_8sc_t* carrier_ptr = carrier;

    const lv_8sc_t* VE_code_ptr = VE_code;
    lv_32fc_t* VE_out_ptr = VE_out;
    const lv_8sc_t* E_code_ptr = E_code;
    lv_32fc_t* E_out_ptr = E_out;
    const lv_8sc_t* P_code_ptr = P_code;
    lv_32fc_t* P_out_ptr = P_out;
    const lv_8sc_t* L_code_ptr = L_code;
    lv_32fc_t* L_out_ptr = L_out;
    const lv_8sc_t* VL_code_ptr = VL_code;
    lv_32fc_t* VL_out_ptr = VL_out;

    // The outputs are accumulators: start every one of them from zero.
    *VE_out_ptr = 0;
    *E_out_ptr = 0;
    *P_out_ptr = 0;
    *L_out_ptr = 0;
    *VL_out_ptr = 0;

    // 0x00FF in every 16-bit lane; handed to the CM_* helper macros (defined in the CommonMacros headers).
    mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);

    VE_code_acc = _mm_setzero_ps();
    E_code_acc = _mm_setzero_ps();
    P_code_acc = _mm_setzero_ps();
    L_code_acc = _mm_setzero_ps();
    VL_code_acc = _mm_setzero_ps();

    if (sse_iters > 0)
        {
            for (unsigned int number = 0; number < sse_iters; number++)
                {
                    // Perform the carrier wipe-off.
                    // NOTE: _mm_loadu_si128 is the SSE2 unaligned load. The SSE3-only
                    // _mm_lddqu_si128 must not be used in a kernel that only requires SSE2.
                    x = _mm_loadu_si128((const __m128i*)input_ptr);
                    y = _mm_loadu_si128((const __m128i*)carrier_ptr);

                    CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx)
                    CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)

                    CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)

                    // Get very early values
                    y = _mm_loadu_si128((const __m128i*)VE_code_ptr);

                    CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)

                    VE_code_acc = _mm_add_ps (VE_code_acc, output_ps_1);
                    VE_code_acc = _mm_add_ps (VE_code_acc, output_ps_2);

                    // Get early values
                    y = _mm_loadu_si128((const __m128i*)E_code_ptr);

                    CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)

                    E_code_acc = _mm_add_ps (E_code_acc, output_ps_1);
                    E_code_acc = _mm_add_ps (E_code_acc, output_ps_2);

                    // Get prompt values
                    y = _mm_loadu_si128((const __m128i*)P_code_ptr);

                    CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)

                    P_code_acc = _mm_add_ps (P_code_acc, output_ps_1);
                    P_code_acc = _mm_add_ps (P_code_acc, output_ps_2);

                    // Get late values
                    y = _mm_loadu_si128((const __m128i*)L_code_ptr);

                    CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)

                    L_code_acc = _mm_add_ps (L_code_acc, output_ps_1);
                    L_code_acc = _mm_add_ps (L_code_acc, output_ps_2);

                    // Get very late values
                    y = _mm_loadu_si128((const __m128i*)VL_code_ptr);

                    CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)

                    VL_code_acc = _mm_add_ps (VL_code_acc, output_ps_1);
                    VL_code_acc = _mm_add_ps (VL_code_acc, output_ps_2);

                    // Advance every cursor by the 8 complex samples just consumed.
                    input_ptr += 8;
                    carrier_ptr += 8;
                    VE_code_ptr += 8;
                    E_code_ptr += 8;
                    P_code_ptr += 8;
                    L_code_ptr += 8;
                    VL_code_ptr += 8;
                }

            // Each __m128 accumulator is spilled as two lv_32fc_t partial sums,
            // which are then folded into the scalar outputs.
            __VOLK_ATTR_ALIGNED(16) lv_32fc_t VE_dotProductVector[2];
            __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2];
            __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2];
            __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2];
            __VOLK_ATTR_ALIGNED(16) lv_32fc_t VL_dotProductVector[2];

            _mm_storeu_ps((float*)VE_dotProductVector,VE_code_acc); // Store the results back into the dot product vector
            _mm_storeu_ps((float*)E_dotProductVector,E_code_acc); // Store the results back into the dot product vector
            _mm_storeu_ps((float*)P_dotProductVector,P_code_acc); // Store the results back into the dot product vector
            _mm_storeu_ps((float*)L_dotProductVector,L_code_acc); // Store the results back into the dot product vector
            _mm_storeu_ps((float*)VL_dotProductVector,VL_code_acc); // Store the results back into the dot product vector

            for (unsigned int i = 0; i < 2; ++i)
                {
                    *VE_out_ptr += VE_dotProductVector[i];
                    *E_out_ptr += E_dotProductVector[i];
                    *P_out_ptr += P_dotProductVector[i];
                    *L_out_ptr += L_dotProductVector[i];
                    *VL_out_ptr += VL_dotProductVector[i];
                }
        }

    // Scalar tail: the num_points % 8 samples that do not fill a whole vector.
    lv_8sc_t bb_signal_sample;
    for (unsigned int i = 0; i < num_points % 8; ++i)
        {
            // Perform the carrier wipe-off
            bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
            // Now get very early, early, prompt, late and very late values for each
            *VE_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VE_code_ptr++));
            *E_out_ptr += (lv_32fc_t) (bb_signal_sample * (*E_code_ptr++));
            *P_out_ptr += (lv_32fc_t) (bb_signal_sample * (*P_code_ptr++));
            *L_out_ptr += (lv_32fc_t) (bb_signal_sample * (*L_code_ptr++));
            *VL_out_ptr += (lv_32fc_t) (bb_signal_sample * (*VL_code_ptr++));
        }
}
#endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_GENERIC
/*!
    \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation (portable scalar version)
    \param VE_out Very Early correlation output (a single accumulated value, overwritten)
    \param E_out Early correlation output (a single accumulated value, overwritten)
    \param P_out Prompt correlation output (a single accumulated value, overwritten)
    \param L_out Late correlation output (a single accumulated value, overwritten)
    \param VL_out Very Late correlation output (a single accumulated value, overwritten)
    \param input The input signal input
    \param carrier The carrier signal input
    \param VE_code Very Early PRN code replica input
    \param E_code Early PRN code replica input
    \param P_code Prompt PRN code replica input
    \param L_code Late PRN code replica input
    \param VL_code Very Late PRN code replica input
    \param num_points The number of complex values in vectors
*/
static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
{
    lv_8sc_t bb_signal_sample;

    // The outputs are accumulators: start every one of them from zero.
    *VE_out = 0;
    *E_out = 0;
    *P_out = 0;
    *L_out = 0;
    *VL_out = 0;

    // Perform very early, early, prompt, late and very late correlation.
    // The index is unsigned to match num_points: a signed int index would
    // overflow (undefined behavior) for num_points above INT_MAX.
    for (unsigned int i = 0; i < num_points; ++i)
        {
            // Perform the carrier wipe-off
            bb_signal_sample = input[i] * carrier[i];

            *VE_out += (lv_32fc_t) (bb_signal_sample * VE_code[i]);
            *E_out += (lv_32fc_t) (bb_signal_sample * E_code[i]);
            *P_out += (lv_32fc_t) (bb_signal_sample * P_code[i]);
            *L_out += (lv_32fc_t) (bb_signal_sample * L_code[i]);
            *VL_out += (lv_32fc_t) (bb_signal_sample * VL_code[i]);
        }
}
#endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_u_H */
#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_a_H
#define INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_a_H
#include <inttypes.h>
#include <stdio.h>
#include <volk_gnsssdr/volk_gnsssdr_complex.h>
#include <float.h>
#include <string.h>
#ifdef LV_HAVE_SSE4_1
#include "smmintrin.h"
#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
#include "CommonMacros/CommonMacros.h"
/*!
    \brief Carrier wipe-off mixing followed by the Very Early, Early, Prompt, Late and Very Late correlation (SSE4.1 kernel, aligned loads)

    All seven input vectors are read with _mm_load_si128, which requires
    16-byte aligned addresses.

    \param VE_out Very Early correlation output
    \param E_out Early correlation output
    \param P_out Prompt correlation output
    \param L_out Late correlation output
    \param VL_out Very Late correlation output
    \param input The input signal input
    \param carrier The carrier signal input
    \param VE_code Very Early PRN code replica input
    \param E_code Early PRN code replica input
    \param P_code Prompt PRN code replica input
    \param L_code Late PRN code replica input
    \param VL_code Very Late PRN code replica input
    \param num_points The number of complex values in vectors
*/
static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_a_sse4_1(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
{
    // Work split: whole groups of 8 complex samples go through the vector
    // path, the leftover samples through the scalar path at the end.
    const unsigned int sse_iters = num_points / 8;
    const unsigned int tail_points = num_points % 8;

    // Scratch registers filled in by the CM_* helper macros.
    __m128i x, y;
    __m128i real_bb_signal_sample, imag_bb_signal_sample;
    __m128i mult1, realx, imagx, realy, imagy;
    __m128i realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy;
    __m128i output, real_output, imag_output;
    __m128i input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2;
    __m128 output_ps;

    // One float accumulator register per correlator arm.
    __m128 VE_code_acc = _mm_setzero_ps();
    __m128 E_code_acc = _mm_setzero_ps();
    __m128 P_code_acc = _mm_setzero_ps();
    __m128 L_code_acc = _mm_setzero_ps();
    __m128 VL_code_acc = _mm_setzero_ps();

    // Read cursors over the input vectors.
    const lv_8sc_t* input_ptr = input;
    const lv_8sc_t* carrier_ptr = carrier;
    const lv_8sc_t* VE_code_ptr = VE_code;
    const lv_8sc_t* E_code_ptr = E_code;
    const lv_8sc_t* P_code_ptr = P_code;
    const lv_8sc_t* L_code_ptr = L_code;
    const lv_8sc_t* VL_code_ptr = VL_code;

    // 0x00FF in every 16-bit lane; handed to the CM_* helper macros.
    mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);

    // Clear the results before accumulating into them.
    *VE_out = 0;
    *E_out = 0;
    *P_out = 0;
    *L_out = 0;
    *VL_out = 0;

    if (sse_iters > 0)
        {
            unsigned int remaining = sse_iters;

            while (remaining > 0)
                {
                    // Carrier wipe-off: mix the input with the carrier
                    x = _mm_load_si128((__m128i*)input_ptr);
                    y = _mm_load_si128((__m128i*)carrier_ptr);

                    CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx)
                    CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)

                    CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)

                    // Very early arm
                    y = _mm_load_si128((__m128i*)VE_code_ptr);
                    CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
                    VE_code_acc = _mm_add_ps(VE_code_acc, output_ps);

                    // Early arm
                    y = _mm_load_si128((__m128i*)E_code_ptr);
                    CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
                    E_code_acc = _mm_add_ps(E_code_acc, output_ps);

                    // Prompt arm
                    y = _mm_load_si128((__m128i*)P_code_ptr);
                    CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
                    P_code_acc = _mm_add_ps(P_code_acc, output_ps);

                    // Late arm
                    y = _mm_load_si128((__m128i*)L_code_ptr);
                    CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
                    L_code_acc = _mm_add_ps(L_code_acc, output_ps);

                    // Very late arm
                    y = _mm_load_si128((__m128i*)VL_code_ptr);
                    CM_8IC_X2_CW_CORR_32FC_X2_U_SSE4_1(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_i32_1, output_i32_2, output_ps)
                    VL_code_acc = _mm_add_ps(VL_code_acc, output_ps);

                    // Step every cursor past the 8 samples just processed.
                    input_ptr += 8;
                    carrier_ptr += 8;
                    VE_code_ptr += 8;
                    E_code_ptr += 8;
                    P_code_ptr += 8;
                    L_code_ptr += 8;
                    VL_code_ptr += 8;

                    --remaining;
                }

            // Spill each accumulator register as two complex partial sums ...
            __VOLK_ATTR_ALIGNED(16) lv_32fc_t VE_dotProductVector[2];
            __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2];
            __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2];
            __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2];
            __VOLK_ATTR_ALIGNED(16) lv_32fc_t VL_dotProductVector[2];

            _mm_store_ps((float*)VE_dotProductVector, VE_code_acc);
            _mm_store_ps((float*)E_dotProductVector, E_code_acc);
            _mm_store_ps((float*)P_dotProductVector, P_code_acc);
            _mm_store_ps((float*)L_dotProductVector, L_code_acc);
            _mm_store_ps((float*)VL_dotProductVector, VL_code_acc);

            // ... and fold those partial sums into the scalar results.
            for (unsigned int k = 0; k < 2; ++k)
                {
                    *VE_out += VE_dotProductVector[k];
                    *E_out += E_dotProductVector[k];
                    *P_out += P_dotProductVector[k];
                    *L_out += L_dotProductVector[k];
                    *VL_out += VL_dotProductVector[k];
                }
        }

    // Scalar path for the samples that did not fill a whole vector.
    for (unsigned int k = 0; k < tail_points; ++k)
        {
            // Carrier wipe-off for a single sample
            lv_8sc_t bb_signal_sample = input_ptr[k] * carrier_ptr[k];

            // Correlate against each of the five code replicas
            *VE_out += (lv_32fc_t) (bb_signal_sample * VE_code_ptr[k]);
            *E_out += (lv_32fc_t) (bb_signal_sample * E_code_ptr[k]);
            *P_out += (lv_32fc_t) (bb_signal_sample * P_code_ptr[k]);
            *L_out += (lv_32fc_t) (bb_signal_sample * L_code_ptr[k]);
            *VL_out += (lv_32fc_t) (bb_signal_sample * VL_code_ptr[k]);
        }
}
#endif /* LV_HAVE_SSE4_1 */
#ifdef LV_HAVE_SSE2
#include "emmintrin.h"
#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
#include "CommonMacros/CommonMacros.h"
/*!
    \brief Carrier wipe-off mixing followed by the Very Early, Early, Prompt, Late and Very Late correlation (SSE2 kernel, aligned loads)

    All seven input vectors are read with _mm_load_si128, which requires
    16-byte aligned addresses.

    \param VE_out Very Early correlation output
    \param E_out Early correlation output
    \param P_out Prompt correlation output
    \param L_out Late correlation output
    \param VL_out Very Late correlation output
    \param input The input signal input
    \param carrier The carrier signal input
    \param VE_code Very Early PRN code replica input
    \param E_code Early PRN code replica input
    \param P_code Prompt PRN code replica input
    \param L_code Late PRN code replica input
    \param VL_code Very Late PRN code replica input
    \param num_points The number of complex values in vectors
*/
static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_a_sse2(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
{
    // Whole groups of 8 complex samples are vectorized; the rest is scalar.
    const unsigned int sse_iters = num_points / 8;
    const unsigned int tail_points = num_points % 8;

    // Scratch registers filled in by the CM_* helper macros.
    __m128i x, y;
    __m128i real_bb_signal_sample, imag_bb_signal_sample;
    __m128i mult1, realx, imagx, realy, imagy;
    __m128i realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy;
    __m128i output, real_output, imag_output;
    __m128i input_i_1, input_i_2, output_i32;
    __m128 output_ps_1, output_ps_2;

    // One float accumulator register per correlator arm.
    __m128 VE_code_acc = _mm_setzero_ps();
    __m128 E_code_acc = _mm_setzero_ps();
    __m128 P_code_acc = _mm_setzero_ps();
    __m128 L_code_acc = _mm_setzero_ps();
    __m128 VL_code_acc = _mm_setzero_ps();

    // Read cursors over the input vectors.
    const lv_8sc_t* input_ptr = input;
    const lv_8sc_t* carrier_ptr = carrier;
    const lv_8sc_t* VE_code_ptr = VE_code;
    const lv_8sc_t* E_code_ptr = E_code;
    const lv_8sc_t* P_code_ptr = P_code;
    const lv_8sc_t* L_code_ptr = L_code;
    const lv_8sc_t* VL_code_ptr = VL_code;

    // 0x00FF in every 16-bit lane; handed to the CM_* helper macros.
    mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);

    // Clear the results before accumulating into them.
    *VE_out = 0;
    *E_out = 0;
    *P_out = 0;
    *L_out = 0;
    *VL_out = 0;

    if (sse_iters > 0)
        {
            unsigned int remaining = sse_iters;

            while (remaining > 0)
                {
                    // Carrier wipe-off: mix the input with the carrier
                    x = _mm_load_si128((__m128i*)input_ptr);
                    y = _mm_load_si128((__m128i*)carrier_ptr);

                    CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(x, mult1, realx, imagx)
                    CM_8IC_REARRANGE_VECTOR_INTO_REAL_IMAG_16IC_X2_U_SSE2(y, mult1, realy, imagy)

                    CM_16IC_X4_SCALAR_PRODUCT_16IC_X2_U_SSE2(realx, imagx, realy, imagy, realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_bb_signal_sample, imag_bb_signal_sample)

                    // Very early arm: the macro yields two float vectors, added in order
                    y = _mm_load_si128((__m128i*)VE_code_ptr);
                    CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
                    VE_code_acc = _mm_add_ps(_mm_add_ps(VE_code_acc, output_ps_1), output_ps_2);

                    // Early arm
                    y = _mm_load_si128((__m128i*)E_code_ptr);
                    CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
                    E_code_acc = _mm_add_ps(_mm_add_ps(E_code_acc, output_ps_1), output_ps_2);

                    // Prompt arm
                    y = _mm_load_si128((__m128i*)P_code_ptr);
                    CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
                    P_code_acc = _mm_add_ps(_mm_add_ps(P_code_acc, output_ps_1), output_ps_2);

                    // Late arm
                    y = _mm_load_si128((__m128i*)L_code_ptr);
                    CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
                    L_code_acc = _mm_add_ps(_mm_add_ps(L_code_acc, output_ps_1), output_ps_2);

                    // Very late arm
                    y = _mm_load_si128((__m128i*)VL_code_ptr);
                    CM_8IC_X2_CW_CORR_32FC_X2_U_SSE2(y, mult1, realy, imagy, real_bb_signal_sample, imag_bb_signal_sample,realx_mult_realy, imagx_mult_imagy, realx_mult_imagy, imagx_mult_realy, real_output, imag_output, input_i_1, input_i_2, output_i32, output_ps_1, output_ps_2)
                    VL_code_acc = _mm_add_ps(_mm_add_ps(VL_code_acc, output_ps_1), output_ps_2);

                    // Step every cursor past the 8 samples just processed.
                    input_ptr += 8;
                    carrier_ptr += 8;
                    VE_code_ptr += 8;
                    E_code_ptr += 8;
                    P_code_ptr += 8;
                    L_code_ptr += 8;
                    VL_code_ptr += 8;

                    --remaining;
                }

            // Spill each accumulator register as two complex partial sums ...
            __VOLK_ATTR_ALIGNED(16) lv_32fc_t VE_dotProductVector[2];
            __VOLK_ATTR_ALIGNED(16) lv_32fc_t E_dotProductVector[2];
            __VOLK_ATTR_ALIGNED(16) lv_32fc_t P_dotProductVector[2];
            __VOLK_ATTR_ALIGNED(16) lv_32fc_t L_dotProductVector[2];
            __VOLK_ATTR_ALIGNED(16) lv_32fc_t VL_dotProductVector[2];

            _mm_store_ps((float*)VE_dotProductVector, VE_code_acc);
            _mm_store_ps((float*)E_dotProductVector, E_code_acc);
            _mm_store_ps((float*)P_dotProductVector, P_code_acc);
            _mm_store_ps((float*)L_dotProductVector, L_code_acc);
            _mm_store_ps((float*)VL_dotProductVector, VL_code_acc);

            // ... and fold those partial sums into the scalar results.
            for (unsigned int k = 0; k < 2; ++k)
                {
                    *VE_out += VE_dotProductVector[k];
                    *E_out += E_dotProductVector[k];
                    *P_out += P_dotProductVector[k];
                    *L_out += L_dotProductVector[k];
                    *VL_out += VL_dotProductVector[k];
                }
        }

    // Scalar path for the samples that did not fill a whole vector.
    for (unsigned int k = 0; k < tail_points; ++k)
        {
            // Carrier wipe-off for a single sample
            lv_8sc_t bb_signal_sample = input_ptr[k] * carrier_ptr[k];

            // Correlate against each of the five code replicas
            *VE_out += (lv_32fc_t) (bb_signal_sample * VE_code_ptr[k]);
            *E_out += (lv_32fc_t) (bb_signal_sample * E_code_ptr[k]);
            *P_out += (lv_32fc_t) (bb_signal_sample * P_code_ptr[k]);
            *L_out += (lv_32fc_t) (bb_signal_sample * L_code_ptr[k]);
            *VL_out += (lv_32fc_t) (bb_signal_sample * VL_code_ptr[k]);
        }
}
#endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_GENERIC
/*!
    \brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation (portable scalar version)
    \param VE_out Very Early correlation output (a single accumulated value, overwritten)
    \param E_out Early correlation output (a single accumulated value, overwritten)
    \param P_out Prompt correlation output (a single accumulated value, overwritten)
    \param L_out Late correlation output (a single accumulated value, overwritten)
    \param VL_out Very Late correlation output (a single accumulated value, overwritten)
    \param input The input signal input
    \param carrier The carrier signal input
    \param VE_code Very Early PRN code replica input
    \param E_code Early PRN code replica input
    \param P_code Prompt PRN code replica input
    \param L_code Late PRN code replica input
    \param VL_code Very Late PRN code replica input
    \param num_points The number of complex values in vectors
*/
static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_a_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
{
    lv_8sc_t bb_signal_sample;

    // The outputs are accumulators: start every one of them from zero.
    *VE_out = 0;
    *E_out = 0;
    *P_out = 0;
    *L_out = 0;
    *VL_out = 0;

    // Perform very early, early, prompt, late and very late correlation.
    // The index is unsigned to match num_points: a signed int index would
    // overflow (undefined behavior) for num_points above INT_MAX.
    for (unsigned int i = 0; i < num_points; ++i)
        {
            // Perform the carrier wipe-off
            bb_signal_sample = input[i] * carrier[i];

            *VE_out += (lv_32fc_t) (bb_signal_sample * VE_code[i]);
            *E_out += (lv_32fc_t) (bb_signal_sample * E_code[i]);
            *P_out += (lv_32fc_t) (bb_signal_sample * P_code[i]);
            *L_out += (lv_32fc_t) (bb_signal_sample * L_code[i]);
            *VL_out += (lv_32fc_t) (bb_signal_sample * VL_code[i]);
        }
}
#endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5_a_H */

View File

@ -0,0 +1,772 @@
/*!
* \file volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5.h
* \brief Volk protokernel: performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation with 16 bits vectors, and accumulates the results into float32. This protokernel is called "safe" because it checks when the inputs have a -128 value, and replaces it with a -127 value. By doing this it avoids malfunctioning, but it takes more time than the "unsafe" implementation. In order to avoid overflow, "input" and "carrier" must be values between -7 and 7 and "XX_code inputs" must be values between -127 and 127.
* \authors <ul>
* <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
* </ul>
*
* Volk protokernel that performs the carrier wipe-off mixing and the
* Very early, Early, Prompt, Late and very late correlation with 16 bits vectors (8 bits the
* real part and 8 bits the imaginary part), and accumulates the result
* in 32-bit single-precision floating-point values, returning float32 values:
* - The carrier wipe-off is done by multiplying the input signal by the
* carrier (multiplication of 16 bits vectors) It returns the input
* signal in base band (BB)
* - Very Early values are calculated by multiplying the input signal in BB by the
* very early code (multiplication of 16 bits vectors), accumulating the results into float32 values
* - Early values are calculated by multiplying the input signal in BB by the
* early code (multiplication of 16 bits vectors), accumulating the results into float32 values
* - Prompt values are calculated by multiplying the input signal in BB by the
* prompt code (multiplication of 16 bits vectors), accumulating the results into float32 values
* - Late values are calculated by multiplying the input signal in BB by the
* late code (multiplication of 16 bits vectors), accumulating the results into float32 values
* - Very Late values are calculated by multiplying the input signal in BB by the
* very late code (multiplication of 16 bits vectors), accumulating the results into float32 values
*
* -------------------------------------------------------------------------
* Bits analysis
*
* input = 8 bits
* carrier = 8 bits
* XX_code = 8 bits
* XX_out16 = 16 bits
* bb_signal_sample = 8 bits
*
* bb_signal_sample = input*carrier -> 17 bits limited to 8 bits = input and carrier must be values between -7 and 7 to avoid overflow (3 bits)
*
* XX_out16 = XX_code*bb_signal_sample -> 17 bits limited to 16 bits = XX_code must be values between -127 and 127 to avoid overflow (7 bits)
* -------------------------------------------------------------------------
*
* Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
*
* GNSS-SDR is a software defined Global Navigation
* Satellite Systems receiver
*
* This file is part of GNSS-SDR.
*
* GNSS-SDR is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* GNSS-SDR is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
*
* -------------------------------------------------------------------------
*/
#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_u_H
#define INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_u_H
#include <inttypes.h>
#include <stdio.h>
#include <volk_gnsssdr/volk_gnsssdr_complex.h>
#include <float.h>
#include <string.h>
#ifdef LV_HAVE_SSE4_1
#include "smmintrin.h"
#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
#include "CommonMacros/CommonMacros.h"
/*!
\brief Performs the carrier wipe-off mixing and the Early, Prompt, and Late correlation
\param input The input signal input
\param carrier The carrier signal input
\param VE_code Very Early PRN code replica input
\param E_code Early PRN code replica input
\param P_code Prompt PRN code replica input
\param L_code Late PRN code replica input
\param VL_code Very Late PRN code replica input
\param VE_out Very Early correlation output
\param E_out Early correlation output
\param P_out Prompt correlation output
\param L_out Late correlation output
\param VL_out Very Late correlation output
\param num_points The number of complex values in vectors
*/
static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_u_sse4_1(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
{
const unsigned int sse_iters = num_points / 8;
__m128i x, x_abs, y, y_aux, bb_signal_sample_aux, bb_signal_sample_aux_abs;;
__m128i real_output, imag_output;
__m128 real_VE_code_acc, imag_VE_code_acc, real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc, real_VL_code_acc, imag_VL_code_acc;
__m128i input_i_1, input_i_2, output_i32;
__m128 real_output_ps, imag_output_ps;
__m128i minus128control;
__m128i minus128 = _mm_set1_epi8 (-128);
__m128i check_sign_sequence = _mm_set_epi8 (255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1);
__m128i rearrange_sequence = _mm_set_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
__m128i mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
const lv_8sc_t* input_ptr = input;
const lv_8sc_t* carrier_ptr = carrier;
const lv_8sc_t* VE_code_ptr = VE_code;
lv_32fc_t* VE_out_ptr = VE_out;
const lv_8sc_t* E_code_ptr = E_code;
lv_32fc_t* E_out_ptr = E_out;
const lv_8sc_t* P_code_ptr = P_code;
lv_32fc_t* P_out_ptr = P_out;
const lv_8sc_t* L_code_ptr = L_code;
lv_32fc_t* L_out_ptr = L_out;
const lv_8sc_t* VL_code_ptr = VL_code;
lv_32fc_t* VL_out_ptr = VL_out;
float VE_out_real = 0;
float VE_out_imag = 0;
float E_out_real = 0;
float E_out_imag = 0;
float P_out_real = 0;
float P_out_imag = 0;
float L_out_real = 0;
float L_out_imag = 0;
float VL_out_real = 0;
float VL_out_imag = 0;
real_VE_code_acc = _mm_setzero_ps();
imag_VE_code_acc = _mm_setzero_ps();
real_E_code_acc = _mm_setzero_ps();
imag_E_code_acc = _mm_setzero_ps();
real_P_code_acc = _mm_setzero_ps();
imag_P_code_acc = _mm_setzero_ps();
real_L_code_acc = _mm_setzero_ps();
imag_L_code_acc = _mm_setzero_ps();
real_VL_code_acc = _mm_setzero_ps();
imag_VL_code_acc = _mm_setzero_ps();
if (sse_iters>0)
{
for(int number = 0;number < sse_iters; number++){
//Perform the carrier wipe-off
x = _mm_lddqu_si128((__m128i*)input_ptr);
y = _mm_lddqu_si128((__m128i*)carrier_ptr);
x_abs = _mm_abs_epi8 (x);
CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3(y, x, check_sign_sequence, rearrange_sequence, y_aux, x_abs, real_output, imag_output)
imag_output = _mm_slli_si128 (imag_output, 1);
bb_signal_sample_aux = _mm_blendv_epi8 (imag_output, real_output, mult1);
bb_signal_sample_aux_abs = _mm_abs_epi8 (bb_signal_sample_aux);
//Get very early values
y = _mm_lddqu_si128((__m128i*)VE_code_ptr);
CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps);
imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps);
//Get early values
y = _mm_lddqu_si128((__m128i*)E_code_ptr);
CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps);
imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps);
//Get prompt values
y = _mm_lddqu_si128((__m128i*)P_code_ptr);
CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps);
imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps);
//Get late values
y = _mm_lddqu_si128((__m128i*)L_code_ptr);
CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps);
imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps);
//Get very late values
y = _mm_lddqu_si128((__m128i*)VL_code_ptr);
CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
real_VL_code_acc = _mm_add_ps (real_VL_code_acc, real_output_ps);
imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps);
input_ptr += 8;
carrier_ptr += 8;
VE_code_ptr += 8;
E_code_ptr += 8;
P_code_ptr += 8;
L_code_ptr += 8;
VL_code_ptr += 8;
}
__VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4];
_mm_storeu_ps((float*)real_VE_dotProductVector,real_VE_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)real_VL_dotProductVector,real_VL_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc); // Store the results back into the dot product vector
for (int i = 0; i<4; ++i)
{
VE_out_real += real_VE_dotProductVector[i];
VE_out_imag += imag_VE_dotProductVector[i];
E_out_real += real_E_dotProductVector[i];
E_out_imag += imag_E_dotProductVector[i];
P_out_real += real_P_dotProductVector[i];
P_out_imag += imag_P_dotProductVector[i];
L_out_real += real_L_dotProductVector[i];
L_out_imag += imag_L_dotProductVector[i];
VL_out_real += real_VL_dotProductVector[i];
VL_out_imag += imag_VL_dotProductVector[i];
}
*VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag);
*E_out_ptr = lv_cmake(E_out_real, E_out_imag);
*P_out_ptr = lv_cmake(P_out_real, P_out_imag);
*L_out_ptr = lv_cmake(L_out_real, L_out_imag);
*VL_out_ptr = lv_cmake(VL_out_real, VL_out_imag);
}
if(num_points%8!=0)
{
lv_16sc_t bb_signal_sample;
lv_16sc_t VE_code_value;
lv_16sc_t E_code_value;
lv_16sc_t P_code_value;
lv_16sc_t L_code_value;
lv_16sc_t VL_code_value;
for(int i=0; i < num_points%8; ++i)
{
VE_code_value = *VE_code_ptr++;
E_code_value = *E_code_ptr++;
P_code_value = *P_code_ptr++;
L_code_value = *L_code_ptr++;
VL_code_value = *VL_code_ptr++;
if(lv_creal(VE_code_value) == -128)
{
VE_code_value = lv_cmake(-127, lv_cimag(VE_code_value));
}
if(lv_cimag(VE_code_value) == -128)
{
VE_code_value = lv_cmake(lv_creal(VE_code_value), -127);
}
if(lv_creal(E_code_value) == -128)
{
E_code_value = lv_cmake(-127, lv_cimag(E_code_value));
}
if(lv_cimag(E_code_value) == -128)
{
E_code_value = lv_cmake(lv_creal(E_code_value), -127);
}
if(lv_creal(P_code_value) == -128)
{
P_code_value = lv_cmake(-127, lv_cimag(P_code_value));
}
if(lv_cimag(P_code_value) == -128)
{
P_code_value = lv_cmake(lv_creal(P_code_value), -127);
}
if(lv_creal(L_code_value) == -128)
{
L_code_value = lv_cmake(-127, lv_cimag(L_code_value));
}
if(lv_cimag(L_code_value) == -128)
{
L_code_value = lv_cmake(lv_creal(L_code_value), -127);
}
//Perform the carrier wipe-off
bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
// Now get very early, early, prompt, late and very late values for each
*VE_out_ptr += (lv_32fc_t) (bb_signal_sample * VE_code_value);
*E_out_ptr += (lv_32fc_t) (bb_signal_sample * E_code_value);
*P_out_ptr += (lv_32fc_t) (bb_signal_sample * P_code_value);
*L_out_ptr += (lv_32fc_t) (bb_signal_sample * L_code_value);
*VL_out_ptr += (lv_32fc_t) (bb_signal_sample * VL_code_value);
}
}
}
#endif /* LV_HAVE_SSE4_1 */
#ifdef LV_HAVE_GENERIC
#include <stdio.h>
#include <tmmintrin.h>
/*!
\brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation
\param input The input signal input
\param carrier The carrier signal input
\param VE_code Very Early PRN code replica input
\param E_code Early PRN code replica input
\param P_code Prompt PRN code replica input
\param L_code Late PRN code replica input
\param VL_code Very Late PRN code replica input
\param VE_out Very Early correlation output
\param E_out Early correlation output
\param P_out Prompt correlation output
\param L_out Late correlation output
\param VL_out Very Late correlation output
\param num_points The number of complex values in vectors
*/
static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
{
    /*
     * Portable implementation of the five-tap correlator.
     *
     * For every sample i:
     *   1. each code replica value is read and any component equal to -128
     *      is replaced by -127 (the "safe" part of this kernel),
     *   2. the input is mixed with the carrier (carrier wipe-off),
     *   3. the mixed sample is multiplied by each code value and the product
     *      is accumulated into the matching output.
     *
     * All five outputs are reset to zero first, so they hold exactly the
     * correlation over num_points samples on return (zero if num_points == 0).
     *
     * NOTE(review): the -128 clamp presumably mirrors what the SIMD variants
     * must do because |-128| is not representable in a signed 8-bit lane;
     * confirm against the CommonMacros definitions.
     */
    enum { n_taps = 5 }; /* very early, early, prompt, late, very late */

    const lv_8sc_t* const code[n_taps] = { VE_code, E_code, P_code, L_code, VL_code };
    lv_32fc_t* const out[n_taps] = { VE_out, E_out, P_out, L_out, VL_out };
    lv_16sc_t code_value[n_taps];
    lv_16sc_t bb_signal_sample;
    unsigned int i; /* same type as num_points: avoids a signed/unsigned comparison */
    int k;

    for (k = 0; k < n_taps; ++k)
        {
            *out[k] = 0;
        }

    for (i = 0; i < num_points; ++i)
        {
            /* Fetch the code replicas (widened to lv_16sc_t) and clamp -128 to -127 */
            for (k = 0; k < n_taps; ++k)
                {
                    code_value[k] = code[k][i];
                    if (lv_creal(code_value[k]) == -128)
                        {
                            code_value[k] = lv_cmake(-127, lv_cimag(code_value[k]));
                        }
                    if (lv_cimag(code_value[k]) == -128)
                        {
                            code_value[k] = lv_cmake(lv_creal(code_value[k]), -127);
                        }
                }

            /* Perform the carrier wipe-off */
            bb_signal_sample = input[i] * carrier[i];

            /* Accumulate very early, early, prompt, late and very late values */
            for (k = 0; k < n_taps; ++k)
                {
                    *out[k] += (lv_32fc_t) (bb_signal_sample * code_value[k]);
                }
        }
}
#endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_u_H */
#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_a_H
#define INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_a_H
#include <inttypes.h>
#include <stdio.h>
#include <volk_gnsssdr/volk_gnsssdr_complex.h>
#include <float.h>
#include <string.h>
#ifdef LV_HAVE_SSE4_1
#include "smmintrin.h"
#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
#include "CommonMacros/CommonMacros.h"
/*!
\brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation
\param input The input signal input
\param carrier The carrier signal input
\param VE_code Very Early PRN code replica input
\param E_code Early PRN code replica input
\param P_code Prompt PRN code replica input
\param L_code Late PRN code replica input
\param VL_code Very Late PRN code replica input
\param VE_out Very Early correlation output
\param E_out Early correlation output
\param P_out Prompt correlation output
\param L_out Late correlation output
\param VL_out Very Late correlation output
\param num_points The number of complex values in vectors
*/
static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_a_sse4_1(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
{
    /*
     * Aligned SSE4.1 implementation of the five-tap "safe" correlator.
     *
     * The vector loop consumes 8 complex samples per iteration; the remaining
     * num_points % 8 samples are handled by a scalar tail loop. The input,
     * carrier and code buffers are read with _mm_load_si128, so they must be
     * 16-byte aligned.
     *
     * The outputs are always overwritten: the vector partial sums (zero when
     * fewer than 8 points are given) are stored first and the tail loop then
     * accumulates on top of them, so no stale caller data leaks into the
     * result.
     */
    const unsigned int sse_iters = num_points / 8;
    const unsigned int tail_points = num_points % 8;
    unsigned int number;
    unsigned int i;
    int k;

    /* Work registers; several of them are written by the CommonMacros helpers */
    __m128i x, x_abs, y, y_aux, bb_signal_sample_aux, bb_signal_sample_aux_abs;
    __m128i real_output, imag_output;
    __m128i input_i_1, input_i_2, output_i32;
    __m128 real_output_ps, imag_output_ps;
    __m128i minus128control;

    /* Constants handed to the CommonMacros helpers */
    __m128i minus128 = _mm_set1_epi8 (-128);
    __m128i check_sign_sequence = _mm_set_epi8 (255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1);
    __m128i rearrange_sequence = _mm_set_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
    /* Blend mask: even bytes are taken from the real part, odd bytes from the imaginary part */
    __m128i mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);

    const lv_8sc_t* input_ptr = input;
    const lv_8sc_t* carrier_ptr = carrier;
    const lv_8sc_t* VE_code_ptr = VE_code;
    const lv_8sc_t* E_code_ptr = E_code;
    const lv_8sc_t* P_code_ptr = P_code;
    const lv_8sc_t* L_code_ptr = L_code;
    const lv_8sc_t* VL_code_ptr = VL_code;

    /* Per-tap float accumulators, four partial sums each */
    __m128 real_VE_code_acc = _mm_setzero_ps();
    __m128 imag_VE_code_acc = _mm_setzero_ps();
    __m128 real_E_code_acc = _mm_setzero_ps();
    __m128 imag_E_code_acc = _mm_setzero_ps();
    __m128 real_P_code_acc = _mm_setzero_ps();
    __m128 imag_P_code_acc = _mm_setzero_ps();
    __m128 real_L_code_acc = _mm_setzero_ps();
    __m128 imag_L_code_acc = _mm_setzero_ps();
    __m128 real_VL_code_acc = _mm_setzero_ps();
    __m128 imag_VL_code_acc = _mm_setzero_ps();

    float VE_out_real = 0;
    float VE_out_imag = 0;
    float E_out_real = 0;
    float E_out_imag = 0;
    float P_out_real = 0;
    float P_out_imag = 0;
    float L_out_real = 0;
    float L_out_imag = 0;
    float VL_out_real = 0;
    float VL_out_imag = 0;

    for (number = 0; number < sse_iters; number++)
        {
            /* Perform the carrier wipe-off */
            x = _mm_load_si128((const __m128i*)input_ptr);
            y = _mm_load_si128((const __m128i*)carrier_ptr);

            x_abs = _mm_abs_epi8 (x);

            CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3(y, x, check_sign_sequence, rearrange_sequence, y_aux, x_abs, real_output, imag_output)

            /* Re-interleave real/imaginary bytes into one baseband sample vector */
            imag_output = _mm_slli_si128 (imag_output, 1);
            bb_signal_sample_aux = _mm_blendv_epi8 (imag_output, real_output, mult1);
            bb_signal_sample_aux_abs = _mm_abs_epi8 (bb_signal_sample_aux);

            /* Get very early values */
            y = _mm_load_si128((const __m128i*)VE_code_ptr);

            CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)

            real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps);
            imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps);

            /* Get early values */
            y = _mm_load_si128((const __m128i*)E_code_ptr);

            CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)

            real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps);
            imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps);

            /* Get prompt values */
            y = _mm_load_si128((const __m128i*)P_code_ptr);

            CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)

            real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps);
            imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps);

            /* Get late values */
            y = _mm_load_si128((const __m128i*)L_code_ptr);

            CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)

            real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps);
            imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps);

            /* Get very late values */
            y = _mm_load_si128((const __m128i*)VL_code_ptr);

            CM_8IC_X2_CW_CORR_SAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, minus128, minus128control, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)

            real_VL_code_acc = _mm_add_ps (real_VL_code_acc, real_output_ps);
            imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps);

            input_ptr += 8;
            carrier_ptr += 8;
            VE_code_ptr += 8;
            E_code_ptr += 8;
            P_code_ptr += 8;
            L_code_ptr += 8;
            VL_code_ptr += 8;
        }

    /*
     * Horizontal reduction of the vector accumulators. This runs even when
     * sse_iters == 0 so that the outputs are always initialised (to zero in
     * that case) before the scalar tail accumulates into them.
     */
    __VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4];
    __VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4];
    __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
    __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
    __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
    __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
    __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
    __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
    __VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4];
    __VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4];

    _mm_store_ps((float*)real_VE_dotProductVector, real_VE_code_acc);
    _mm_store_ps((float*)imag_VE_dotProductVector, imag_VE_code_acc);
    _mm_store_ps((float*)real_E_dotProductVector, real_E_code_acc);
    _mm_store_ps((float*)imag_E_dotProductVector, imag_E_code_acc);
    _mm_store_ps((float*)real_P_dotProductVector, real_P_code_acc);
    _mm_store_ps((float*)imag_P_dotProductVector, imag_P_code_acc);
    _mm_store_ps((float*)real_L_dotProductVector, real_L_code_acc);
    _mm_store_ps((float*)imag_L_dotProductVector, imag_L_code_acc);
    _mm_store_ps((float*)real_VL_dotProductVector, real_VL_code_acc);
    _mm_store_ps((float*)imag_VL_dotProductVector, imag_VL_code_acc);

    for (i = 0; i < 4; ++i)
        {
            VE_out_real += real_VE_dotProductVector[i];
            VE_out_imag += imag_VE_dotProductVector[i];
            E_out_real += real_E_dotProductVector[i];
            E_out_imag += imag_E_dotProductVector[i];
            P_out_real += real_P_dotProductVector[i];
            P_out_imag += imag_P_dotProductVector[i];
            L_out_real += real_L_dotProductVector[i];
            L_out_imag += imag_L_dotProductVector[i];
            VL_out_real += real_VL_dotProductVector[i];
            VL_out_imag += imag_VL_dotProductVector[i];
        }

    *VE_out = lv_cmake(VE_out_real, VE_out_imag);
    *E_out = lv_cmake(E_out_real, E_out_imag);
    *P_out = lv_cmake(P_out_real, P_out_imag);
    *L_out = lv_cmake(L_out_real, L_out_imag);
    *VL_out = lv_cmake(VL_out_real, VL_out_imag);

    /* Scalar tail: the last num_points % 8 samples, processed like the generic kernel */
    if (tail_points != 0)
        {
            const lv_8sc_t* const tail_code[5] = { VE_code_ptr, E_code_ptr, P_code_ptr, L_code_ptr, VL_code_ptr };
            lv_32fc_t* const out[5] = { VE_out, E_out, P_out, L_out, VL_out };
            lv_16sc_t code_value[5];
            lv_16sc_t bb_signal_sample;

            for (i = 0; i < tail_points; ++i)
                {
                    /* Clamp -128 to -127 on every tap, very late included */
                    for (k = 0; k < 5; ++k)
                        {
                            code_value[k] = tail_code[k][i];
                            if (lv_creal(code_value[k]) == -128)
                                {
                                    code_value[k] = lv_cmake(-127, lv_cimag(code_value[k]));
                                }
                            if (lv_cimag(code_value[k]) == -128)
                                {
                                    code_value[k] = lv_cmake(lv_creal(code_value[k]), -127);
                                }
                        }

                    /* Perform the carrier wipe-off */
                    bb_signal_sample = input_ptr[i] * carrier_ptr[i];

                    /* Accumulate very early, early, prompt, late and very late values */
                    for (k = 0; k < 5; ++k)
                        {
                            *out[k] += (lv_32fc_t) (bb_signal_sample * code_value[k]);
                        }
                }
        }
}
#endif /* LV_HAVE_SSE4_1 */
#ifdef LV_HAVE_GENERIC
#include <stdio.h>
#include <tmmintrin.h>
/*!
\brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation
\param input The input signal input
\param carrier The carrier signal input
\param VE_code Very Early PRN code replica input
\param E_code Early PRN code replica input
\param P_code Prompt PRN code replica input
\param L_code Late PRN code replica input
\param VL_code Very Late PRN code replica input
\param VE_out Very Early correlation output
\param E_out Early correlation output
\param P_out Prompt correlation output
\param L_out Late correlation output
\param VL_out Very Late correlation output
\param num_points The number of complex values in vectors
*/
static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_a_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
{
    /*
     * Portable implementation of the five-tap correlator (aligned entry
     * point; the code itself makes no use of alignment).
     *
     * For every sample i:
     *   1. each code replica value is read and any component equal to -128
     *      is replaced by -127 (the "safe" part of this kernel),
     *   2. the input is mixed with the carrier (carrier wipe-off),
     *   3. the mixed sample is multiplied by each code value and the product
     *      is accumulated into the matching output.
     *
     * All five outputs are reset to zero first, so they hold exactly the
     * correlation over num_points samples on return (zero if num_points == 0).
     *
     * NOTE(review): the -128 clamp presumably mirrors what the SIMD variants
     * must do because |-128| is not representable in a signed 8-bit lane;
     * confirm against the CommonMacros definitions.
     */
    enum { n_taps = 5 }; /* very early, early, prompt, late, very late */

    const lv_8sc_t* const code[n_taps] = { VE_code, E_code, P_code, L_code, VL_code };
    lv_32fc_t* const out[n_taps] = { VE_out, E_out, P_out, L_out, VL_out };
    lv_16sc_t code_value[n_taps];
    lv_16sc_t bb_signal_sample;
    unsigned int i; /* same type as num_points: avoids a signed/unsigned comparison */
    int k;

    for (k = 0; k < n_taps; ++k)
        {
            *out[k] = 0;
        }

    for (i = 0; i < num_points; ++i)
        {
            /* Fetch the code replicas (widened to lv_16sc_t) and clamp -128 to -127 */
            for (k = 0; k < n_taps; ++k)
                {
                    code_value[k] = code[k][i];
                    if (lv_creal(code_value[k]) == -128)
                        {
                            code_value[k] = lv_cmake(-127, lv_cimag(code_value[k]));
                        }
                    if (lv_cimag(code_value[k]) == -128)
                        {
                            code_value[k] = lv_cmake(lv_creal(code_value[k]), -127);
                        }
                }

            /* Perform the carrier wipe-off */
            bb_signal_sample = input[i] * carrier[i];

            /* Accumulate very early, early, prompt, late and very late values */
            for (k = 0; k < n_taps; ++k)
                {
                    *out[k] += (lv_32fc_t) (bb_signal_sample * code_value[k]);
                }
        }
}
#endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5_a_H */

View File

@ -0,0 +1,554 @@
/*!
* \file volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5.h
* \brief Volk protokernel: performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation with 16 bits vectors, and accumulates the results into float32. This protokernel is called "unsafe" because it does NOT check whether the inputs contain a -128 value. If you introduce a -128 value the protokernel will NOT operate properly (the generic implementation will give different results than the volk implementation). In order to avoid overflow, "input" and "carrier" must be values between -7 and 7 and the "XX_code" inputs must be values between -127 and 127.
* \authors <ul>
* <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
* </ul>
*
* Volk protokernel that performs the carrier wipe-off mixing and the
* Very early, Early, Prompt, Late and very late correlation with 16 bits vectors (8 bits the
* real part and 8 bits the imaginary part), and accumulates the result
* in 32 bits single point values, returning float32 values:
* - The carrier wipe-off is done by multiplying the input signal by the
* carrier (multiplication of 16 bits vectors) It returns the input
* signal in base band (BB)
* - Very Early values are calculated by multiplying the input signal in BB by the
* very early code (multiplication of 16 bits vectors), accumulating the results into float32 values
* - Early values are calculated by multiplying the input signal in BB by the
* early code (multiplication of 16 bits vectors), accumulating the results into float32 values
* - Prompt values are calculated by multiplying the input signal in BB by the
* prompt code (multiplication of 16 bits vectors), accumulating the results into float32 values
* - Late values are calculated by multiplying the input signal in BB by the
* late code (multiplication of 16 bits vectors), accumulating the results into float32 values
* - Very Late values are calculated by multiplying the input signal in BB by the
* very late code (multiplication of 16 bits vectors), accumulating the results into float32 values
*
* -------------------------------------------------------------------------
* Bits analysis
*
* input = 8 bits
* carrier = 8 bits
* XX_code = 8 bits
* XX_out16 = 16 bits
* bb_signal_sample = 8 bits
*
* bb_signal_sample = input*carrier -> 17 bits limited to 8 bits = input and carrier must be values between -7 and 7 to avoid overflow (3 bits)
*
* XX_out16 = XX_code*bb_signal_sample -> 17 bits limited to 16 bits = XX_code must be values between -127 and 127 to avoid overflow (7 bits)
* -------------------------------------------------------------------------
*
* Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
*
* GNSS-SDR is a software defined Global Navigation
* Satellite Systems receiver
*
* This file is part of GNSS-SDR.
*
* GNSS-SDR is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* GNSS-SDR is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
*
* -------------------------------------------------------------------------
*/
#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_u_H
#define INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_u_H
#include <inttypes.h>
#include <stdio.h>
#include <volk_gnsssdr/volk_gnsssdr_complex.h>
#include <float.h>
#include <string.h>
#ifdef LV_HAVE_SSE4_1
#include "smmintrin.h"
#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
#include "CommonMacros/CommonMacros.h"
/*!
\brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation
\param input The input signal input
\param carrier The carrier signal input
\param VE_code Very Early PRN code replica input
\param E_code Early PRN code replica input
\param P_code Prompt PRN code replica input
\param L_code Late PRN code replica input
\param VL_code Very Late PRN code replica input
\param VE_out Very Early correlation output
\param E_out Early correlation output
\param P_out Prompt correlation output
\param L_out Late correlation output
\param VL_out Very Late correlation output
\param num_points The number of complex values in vectors
*/
static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_u_sse4_1(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
{
const unsigned int sse_iters = num_points / 8;
__m128i x, x_abs, y, y_aux, bb_signal_sample_aux, bb_signal_sample_aux_abs;;
__m128i real_output, imag_output;
__m128 real_VE_code_acc, imag_VE_code_acc, real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc, real_VL_code_acc, imag_VL_code_acc;
__m128i input_i_1, input_i_2, output_i32;
__m128 real_output_ps, imag_output_ps;
__m128i check_sign_sequence = _mm_set_epi8 (255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1);
__m128i rearrange_sequence = _mm_set_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
__m128i mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
const lv_8sc_t* input_ptr = input;
const lv_8sc_t* carrier_ptr = carrier;
const lv_8sc_t* VE_code_ptr = VE_code;
lv_32fc_t* VE_out_ptr = VE_out;
const lv_8sc_t* E_code_ptr = E_code;
lv_32fc_t* E_out_ptr = E_out;
const lv_8sc_t* P_code_ptr = P_code;
lv_32fc_t* P_out_ptr = P_out;
const lv_8sc_t* L_code_ptr = L_code;
lv_32fc_t* L_out_ptr = L_out;
const lv_8sc_t* VL_code_ptr = VL_code;
lv_32fc_t* VL_out_ptr = VL_out;
float VE_out_real = 0;
float VE_out_imag = 0;
float E_out_real = 0;
float E_out_imag = 0;
float P_out_real = 0;
float P_out_imag = 0;
float L_out_real = 0;
float L_out_imag = 0;
float VL_out_real = 0;
float VL_out_imag = 0;
real_VE_code_acc = _mm_setzero_ps();
imag_VE_code_acc = _mm_setzero_ps();
real_E_code_acc = _mm_setzero_ps();
imag_E_code_acc = _mm_setzero_ps();
real_P_code_acc = _mm_setzero_ps();
imag_P_code_acc = _mm_setzero_ps();
real_L_code_acc = _mm_setzero_ps();
imag_L_code_acc = _mm_setzero_ps();
real_VL_code_acc = _mm_setzero_ps();
imag_VL_code_acc = _mm_setzero_ps();
if (sse_iters>0)
{
for(int number = 0;number < sse_iters; number++){
//Perform the carrier wipe-off
x = _mm_lddqu_si128((__m128i*)input_ptr);
y = _mm_lddqu_si128((__m128i*)carrier_ptr);
x_abs = _mm_abs_epi8 (x);
CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3(y, x, check_sign_sequence, rearrange_sequence, y_aux, x_abs, real_output, imag_output)
imag_output = _mm_slli_si128 (imag_output, 1);
bb_signal_sample_aux = _mm_blendv_epi8 (imag_output, real_output, mult1);
bb_signal_sample_aux_abs = _mm_abs_epi8 (bb_signal_sample_aux);
//Get very early values
y = _mm_lddqu_si128((__m128i*)VE_code_ptr);
CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps);
imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps);
//Get early values
y = _mm_lddqu_si128((__m128i*)E_code_ptr);
CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps);
imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps);
//Get prompt values
y = _mm_lddqu_si128((__m128i*)P_code_ptr);
CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps);
imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps);
//Get late values
y = _mm_lddqu_si128((__m128i*)L_code_ptr);
CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps);
imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps);
//Get very late values
y = _mm_lddqu_si128((__m128i*)VL_code_ptr);
CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
real_VL_code_acc = _mm_add_ps (real_VL_code_acc, real_output_ps);
imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps);
input_ptr += 8;
carrier_ptr += 8;
VE_code_ptr += 8;
E_code_ptr += 8;
P_code_ptr += 8;
L_code_ptr += 8;
VL_code_ptr += 8;
}
__VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4];
__VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4];
_mm_storeu_ps((float*)real_VE_dotProductVector,real_VE_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)real_E_dotProductVector,real_E_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)imag_E_dotProductVector,imag_E_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)real_P_dotProductVector,real_P_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)imag_P_dotProductVector,imag_P_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)real_L_dotProductVector,real_L_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)imag_L_dotProductVector,imag_L_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)real_VL_dotProductVector,real_VL_code_acc); // Store the results back into the dot product vector
_mm_storeu_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc); // Store the results back into the dot product vector
for (int i = 0; i<4; ++i)
{
VE_out_real += real_VE_dotProductVector[i];
VE_out_imag += imag_VE_dotProductVector[i];
E_out_real += real_E_dotProductVector[i];
E_out_imag += imag_E_dotProductVector[i];
P_out_real += real_P_dotProductVector[i];
P_out_imag += imag_P_dotProductVector[i];
L_out_real += real_L_dotProductVector[i];
L_out_imag += imag_L_dotProductVector[i];
VL_out_real += real_VL_dotProductVector[i];
VL_out_imag += imag_VL_dotProductVector[i];
}
*VE_out_ptr = lv_cmake(VE_out_real, VE_out_imag);
*E_out_ptr = lv_cmake(E_out_real, E_out_imag);
*P_out_ptr = lv_cmake(P_out_real, P_out_imag);
*L_out_ptr = lv_cmake(L_out_real, L_out_imag);
*VL_out_ptr = lv_cmake(VL_out_real, VL_out_imag);
}
lv_16sc_t bb_signal_sample;
for(int i=0; i < num_points%8; ++i)
{
//Perform the carrier wipe-off
bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);
// Now get very early, early, prompt, late and very late values for each
*VE_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VE_code_ptr++));
*E_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*E_code_ptr++));
*P_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*P_code_ptr++));
*L_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*L_code_ptr++));
*VL_out_ptr += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VL_code_ptr++));
}
}
#endif /* LV_HAVE_SSE4_1 */
#ifdef LV_HAVE_GENERIC
#include <stdio.h>
#include <tmmintrin.h>
/*!
\brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation
\param input The input signal input
\param carrier The carrier signal input
\param VE_code Very Early PRN code replica input
\param E_code Early PRN code replica input
\param P_code Prompt PRN code replica input
\param L_code Late PRN code replica input
\param VL_code Very Late PRN code replica input
\param VE_out Very Early correlation output
\param E_out Early correlation output
\param P_out Prompt correlation output
\param L_out Late correlation output
\param VL_out Very Late correlation output
\param num_points The number of complex values in vectors
*/
static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
{
    /* The five outputs are accumulators: every correlator starts from zero. */
    *VE_out = 0;
    *E_out = 0;
    *P_out = 0;
    *L_out = 0;
    *VL_out = 0;

    lv_16sc_t bb_signal_sample;

    /* The counter has the same type as num_points: no signed/unsigned
       comparison and no signed overflow for counts above INT_MAX. */
    for (unsigned int i = 0; i < num_points; ++i)
        {
            /* Carrier wipe-off: mix the input sample with the local carrier sample. */
            bb_signal_sample = input[i] * carrier[i];

            /* Multiply the mixed sample by each code replica and accumulate. */
            *VE_out += (lv_32fc_t) (bb_signal_sample * VE_code[i]);
            *E_out += (lv_32fc_t) (bb_signal_sample * E_code[i]);
            *P_out += (lv_32fc_t) (bb_signal_sample * P_code[i]);
            *L_out += (lv_32fc_t) (bb_signal_sample * L_code[i]);
            *VL_out += (lv_32fc_t) (bb_signal_sample * VL_code[i]);
        }
}
#endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_u_H */
#ifndef INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_a_H
#define INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_a_H
#include <inttypes.h>
#include <stdio.h>
#include <volk_gnsssdr/volk_gnsssdr_complex.h>
#include <float.h>
#include <string.h>
#ifdef LV_HAVE_SSE4_1
#include "smmintrin.h"
#include "CommonMacros/CommonMacros_8ic_cw_epl_corr_32fc.h"
#include "CommonMacros/CommonMacros.h"
/*!
\brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation
\param input The input signal input
\param carrier The carrier signal input
\param VE_code Very Early PRN code replica input
\param E_code Early PRN code replica input
\param P_code Prompt PRN code replica input
\param L_code Late PRN code replica input
\param VL_code Very Late PRN code replica input
\param VE_out Very Early correlation output
\param E_out Early correlation output
\param P_out Prompt correlation output
\param L_out Late correlation output
\param VL_out Very Late correlation output
\param num_points The number of complex values in vectors
*/
static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_a_sse4_1(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
{
    /* Every vector iteration loads one 128-bit register per input stream and
       advances each lv_8sc_t pointer by 8 elements. */
    const unsigned int sse_iters = num_points / 8;

    __m128i x, x_abs, y, y_aux, bb_signal_sample_aux, bb_signal_sample_aux_abs;
    __m128i real_output, imag_output;
    __m128 real_VE_code_acc, imag_VE_code_acc, real_E_code_acc, imag_E_code_acc, real_P_code_acc, imag_P_code_acc, real_L_code_acc, imag_L_code_acc, real_VL_code_acc, imag_VL_code_acc;
    __m128i input_i_1, input_i_2, output_i32;
    __m128 real_output_ps, imag_output_ps;

    /* Shuffle/blend control constants consumed by the CommonMacros helpers. */
    __m128i check_sign_sequence = _mm_set_epi8 (255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1, 255, 1);
    __m128i rearrange_sequence = _mm_set_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
    __m128i mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);

    const lv_8sc_t* input_ptr = input;
    const lv_8sc_t* carrier_ptr = carrier;
    const lv_8sc_t* VE_code_ptr = VE_code;
    const lv_8sc_t* E_code_ptr = E_code;
    const lv_8sc_t* P_code_ptr = P_code;
    const lv_8sc_t* L_code_ptr = L_code;
    const lv_8sc_t* VL_code_ptr = VL_code;

    float VE_out_real = 0;
    float VE_out_imag = 0;
    float E_out_real = 0;
    float E_out_imag = 0;
    float P_out_real = 0;
    float P_out_imag = 0;
    float L_out_real = 0;
    float L_out_imag = 0;
    float VL_out_real = 0;
    float VL_out_imag = 0;

    /* Start every correlator from zero, exactly like the generic kernel does.
       Without this, a call with num_points < 8 skips the vector path and the
       scalar tail below would accumulate onto whatever the caller left in the
       output locations. */
    *VE_out = 0;
    *E_out = 0;
    *P_out = 0;
    *L_out = 0;
    *VL_out = 0;

    real_VE_code_acc = _mm_setzero_ps();
    imag_VE_code_acc = _mm_setzero_ps();
    real_E_code_acc = _mm_setzero_ps();
    imag_E_code_acc = _mm_setzero_ps();
    real_P_code_acc = _mm_setzero_ps();
    imag_P_code_acc = _mm_setzero_ps();
    real_L_code_acc = _mm_setzero_ps();
    imag_L_code_acc = _mm_setzero_ps();
    real_VL_code_acc = _mm_setzero_ps();
    imag_VL_code_acc = _mm_setzero_ps();

    if (sse_iters > 0)
        {
            for (unsigned int number = 0; number < sse_iters; number++)
                {
                    /* Carrier wipe-off. Aligned loads: this is the _a_ kernel,
                       so all inputs must be 16-byte aligned. */
                    x = _mm_load_si128((__m128i*)input_ptr);
                    y = _mm_load_si128((__m128i*)carrier_ptr);
                    x_abs = _mm_abs_epi8 (x);
                    CM_8IC_X2_SCALAR_PRODUCT_16IC_X2_U_SSSE3(y, x, check_sign_sequence, rearrange_sequence, y_aux, x_abs, real_output, imag_output)

                    /* Re-interleave the real and imaginary bytes of the mixed signal. */
                    imag_output = _mm_slli_si128 (imag_output, 1);
                    bb_signal_sample_aux = _mm_blendv_epi8 (imag_output, real_output, mult1);
                    bb_signal_sample_aux_abs = _mm_abs_epi8 (bb_signal_sample_aux);

                    /* Very early correlator */
                    y = _mm_load_si128((__m128i*)VE_code_ptr);
                    CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
                    real_VE_code_acc = _mm_add_ps (real_VE_code_acc, real_output_ps);
                    imag_VE_code_acc = _mm_add_ps (imag_VE_code_acc, imag_output_ps);

                    /* Early correlator */
                    y = _mm_load_si128((__m128i*)E_code_ptr);
                    CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
                    real_E_code_acc = _mm_add_ps (real_E_code_acc, real_output_ps);
                    imag_E_code_acc = _mm_add_ps (imag_E_code_acc, imag_output_ps);

                    /* Prompt correlator */
                    y = _mm_load_si128((__m128i*)P_code_ptr);
                    CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
                    real_P_code_acc = _mm_add_ps (real_P_code_acc, real_output_ps);
                    imag_P_code_acc = _mm_add_ps (imag_P_code_acc, imag_output_ps);

                    /* Late correlator */
                    y = _mm_load_si128((__m128i*)L_code_ptr);
                    CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
                    real_L_code_acc = _mm_add_ps (real_L_code_acc, real_output_ps);
                    imag_L_code_acc = _mm_add_ps (imag_L_code_acc, imag_output_ps);

                    /* Very late correlator */
                    y = _mm_load_si128((__m128i*)VL_code_ptr);
                    CM_8IC_X2_CW_CORR_UNSAFE_32FC_X2_U_SSE4_1(y, bb_signal_sample_aux, check_sign_sequence, rearrange_sequence, y_aux, bb_signal_sample_aux_abs, real_output, imag_output, input_i_1, input_i_2, output_i32, real_output_ps, imag_output_ps)
                    real_VL_code_acc = _mm_add_ps (real_VL_code_acc, real_output_ps);
                    imag_VL_code_acc = _mm_add_ps (imag_VL_code_acc, imag_output_ps);

                    input_ptr += 8;
                    carrier_ptr += 8;
                    VE_code_ptr += 8;
                    E_code_ptr += 8;
                    P_code_ptr += 8;
                    L_code_ptr += 8;
                    VL_code_ptr += 8;
                }

            /* Spill the four partial sums of every accumulator to memory ... */
            __VOLK_ATTR_ALIGNED(16) float real_VE_dotProductVector[4];
            __VOLK_ATTR_ALIGNED(16) float imag_VE_dotProductVector[4];
            __VOLK_ATTR_ALIGNED(16) float real_E_dotProductVector[4];
            __VOLK_ATTR_ALIGNED(16) float imag_E_dotProductVector[4];
            __VOLK_ATTR_ALIGNED(16) float real_P_dotProductVector[4];
            __VOLK_ATTR_ALIGNED(16) float imag_P_dotProductVector[4];
            __VOLK_ATTR_ALIGNED(16) float real_L_dotProductVector[4];
            __VOLK_ATTR_ALIGNED(16) float imag_L_dotProductVector[4];
            __VOLK_ATTR_ALIGNED(16) float real_VL_dotProductVector[4];
            __VOLK_ATTR_ALIGNED(16) float imag_VL_dotProductVector[4];

            _mm_store_ps((float*)real_VE_dotProductVector,real_VE_code_acc);
            _mm_store_ps((float*)imag_VE_dotProductVector,imag_VE_code_acc);
            _mm_store_ps((float*)real_E_dotProductVector,real_E_code_acc);
            _mm_store_ps((float*)imag_E_dotProductVector,imag_E_code_acc);
            _mm_store_ps((float*)real_P_dotProductVector,real_P_code_acc);
            _mm_store_ps((float*)imag_P_dotProductVector,imag_P_code_acc);
            _mm_store_ps((float*)real_L_dotProductVector,real_L_code_acc);
            _mm_store_ps((float*)imag_L_dotProductVector,imag_L_code_acc);
            _mm_store_ps((float*)real_VL_dotProductVector,real_VL_code_acc);
            _mm_store_ps((float*)imag_VL_dotProductVector,imag_VL_code_acc);

            /* ... and reduce them horizontally into scalars. */
            for (int i = 0; i < 4; ++i)
                {
                    VE_out_real += real_VE_dotProductVector[i];
                    VE_out_imag += imag_VE_dotProductVector[i];
                    E_out_real += real_E_dotProductVector[i];
                    E_out_imag += imag_E_dotProductVector[i];
                    P_out_real += real_P_dotProductVector[i];
                    P_out_imag += imag_P_dotProductVector[i];
                    L_out_real += real_L_dotProductVector[i];
                    L_out_imag += imag_L_dotProductVector[i];
                    VL_out_real += real_VL_dotProductVector[i];
                    VL_out_imag += imag_VL_dotProductVector[i];
                }

            *VE_out = lv_cmake(VE_out_real, VE_out_imag);
            *E_out = lv_cmake(E_out_real, E_out_imag);
            *P_out = lv_cmake(P_out_real, P_out_imag);
            *L_out = lv_cmake(L_out_real, L_out_imag);
            *VL_out = lv_cmake(VL_out_real, VL_out_imag);
        }

    /* Scalar tail: the num_points % 8 samples that do not fill a register. */
    lv_16sc_t bb_signal_sample;
    for (unsigned int i = 0; i < num_points % 8; ++i)
        {
            /* Carrier wipe-off */
            bb_signal_sample = (*input_ptr++) * (*carrier_ptr++);

            /* Accumulate the product with each code replica. */
            *VE_out += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VE_code_ptr++));
            *E_out += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*E_code_ptr++));
            *P_out += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*P_code_ptr++));
            *L_out += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*L_code_ptr++));
            *VL_out += (lv_32fc_t) (bb_signal_sample * ((lv_16sc_t)*VL_code_ptr++));
        }
}
#endif /* LV_HAVE_SSE4_1 */
#ifdef LV_HAVE_GENERIC
#include <stdio.h>
#include <tmmintrin.h>
/*!
\brief Performs the carrier wipe-off mixing and the Very Early, Early, Prompt, Late and Very Late correlation
\param input The input signal input
\param carrier The carrier signal input
\param VE_code Very Early PRN code replica input
\param E_code Early PRN code replica input
\param P_code Prompt PRN code replica input
\param L_code Late PRN code replica input
\param VL_code Very Late PRN code replica input
\param VE_out Very Early correlation output
\param E_out Early correlation output
\param P_out Prompt correlation output
\param L_out Late correlation output
\param VL_out Very Late correlation output
\param num_points The number of complex values in vectors
*/
static inline void volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_a_generic(lv_32fc_t* VE_out, lv_32fc_t* E_out, lv_32fc_t* P_out, lv_32fc_t* L_out, lv_32fc_t* VL_out, const lv_8sc_t* input, const lv_8sc_t* carrier, const lv_8sc_t* VE_code, const lv_8sc_t* E_code, const lv_8sc_t* P_code, const lv_8sc_t* L_code, const lv_8sc_t* VL_code, unsigned int num_points)
{
    /* The five outputs are accumulators: every correlator starts from zero. */
    *VE_out = 0;
    *E_out = 0;
    *P_out = 0;
    *L_out = 0;
    *VL_out = 0;

    lv_16sc_t bb_signal_sample;

    /* The counter has the same type as num_points: no signed/unsigned
       comparison and no signed overflow for counts above INT_MAX. */
    for (unsigned int i = 0; i < num_points; ++i)
        {
            /* Carrier wipe-off: mix the input sample with the local carrier sample. */
            bb_signal_sample = input[i] * carrier[i];

            /* Multiply the mixed sample by each code replica and accumulate. */
            *VE_out += (lv_32fc_t) (bb_signal_sample * VE_code[i]);
            *E_out += (lv_32fc_t) (bb_signal_sample * E_code[i]);
            *P_out += (lv_32fc_t) (bb_signal_sample * P_code[i]);
            *L_out += (lv_32fc_t) (bb_signal_sample * L_code[i]);
            *VL_out += (lv_32fc_t) (bb_signal_sample * VL_code[i]);
        }
}
#endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_gnsssdr_volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5_a_H */

View File

@ -0,0 +1,210 @@
/*!
* \file volk_gnsssdr_8u_x2_multiply_8u.h
* \brief Volk protokernel: multiplies unsigned char values
* \authors <ul>
* <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
* </ul>
*
* Volk protokernel that multiplies unsigned char values (8 bits data)
*
* -------------------------------------------------------------------------
*
* Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
*
* GNSS-SDR is a software defined Global Navigation
* Satellite Systems receiver
*
* This file is part of GNSS-SDR.
*
* GNSS-SDR is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* GNSS-SDR is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
*
* -------------------------------------------------------------------------
*/
#ifndef INCLUDED_volk_gnsssdr_8u_x2_multiply_8u_u_H
#define INCLUDED_volk_gnsssdr_8u_x2_multiply_8u_u_H
#include <inttypes.h>
#include <stdio.h>
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
#include <emmintrin.h>
/*!
\brief Multiplies the two input unsigned char values and stores their results in the third unsigned char
\param cChar The unsigned char where the results will be stored
\param aChar One of the unsigned char to be multiplied
\param bChar One of the unsigned char to be multiplied
\param num_points The number of unsigned char values in aChar and bChar to be multiplied together and stored into cChar
*/
static inline void volk_gnsssdr_8u_x2_multiply_8u_u_sse3(unsigned char* cChar, const unsigned char* aChar, const unsigned char* bChar, unsigned int num_points){
const unsigned int sse_iters = num_points / 16;
__m128i x, y, x1, x2, y1, y2, mult1, x1_mult_y1, x2_mult_y2, tmp, tmp1, tmp2, totalc;
unsigned char* c = cChar;
const unsigned char* a = aChar;
const unsigned char* b = bChar;
for(int number = 0;number < sse_iters; number++){
x = _mm_lddqu_si128((__m128i*)a);
y = _mm_lddqu_si128((__m128i*)b);
mult1 = _mm_set_epi8(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
x1 = _mm_srli_si128 (x, 1);
x1 = _mm_and_si128 (x1, mult1);
x2 = _mm_and_si128 (x, mult1);
y1 = _mm_srli_si128 (y, 1);
y1 = _mm_and_si128 (y1, mult1);
y2 = _mm_and_si128 (y, mult1);
x1_mult_y1 = _mm_mullo_epi16 (x1, y1);
x2_mult_y2 = _mm_mullo_epi16 (x2, y2);
tmp = _mm_and_si128 (x1_mult_y1, mult1);
tmp1 = _mm_slli_si128 (tmp, 1);
tmp2 = _mm_and_si128 (x2_mult_y2, mult1);
totalc = _mm_or_si128 (tmp1, tmp2);
_mm_storeu_si128((__m128i*)c, totalc);
a += 16;
b += 16;
c += 16;
}
for (int i = 0; i<(num_points % 16); ++i)
{
*c++ = (*a++) * (*b++);
}
}
#endif /* LV_HAVE_SSE3 */
#ifdef LV_HAVE_GENERIC
/*!
\brief Multiplies the two input unsigned char values and stores their results in the third unsigned char
\param cChar The unsigned char where the results will be stored
\param aChar One of the unsigned char to be multiplied
\param bChar One of the unsigned char to be multiplied
\param num_points The number of unsigned char values in aChar and bChar to be multiplied together and stored into cChar
*/
static inline void volk_gnsssdr_8u_x2_multiply_8u_generic(unsigned char* cChar, const unsigned char* aChar, const unsigned char* bChar, unsigned int num_points){
    unsigned char* cPtr = cChar;
    const unsigned char* aPtr = aChar;
    const unsigned char* bPtr = bChar;

    /* The counter has the same type as num_points, so the loop is well defined
       for every representable count (a signed int would overflow above INT_MAX). */
    for (unsigned int number = 0; number < num_points; number++){
        /* The product is truncated to 8 bits when stored (wraps modulo 256). */
        *cPtr++ = (*aPtr++) * (*bPtr++);
    }
}
#endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_volk_gnsssdr_8u_x2_multiply_8u_u_H */
#ifndef INCLUDED_volk_gnsssdr_8u_x2_multiply_8u_a_H
#define INCLUDED_volk_gnsssdr_8u_x2_multiply_8u_a_H
#include <inttypes.h>
#include <stdio.h>
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
#include <emmintrin.h>
/*!
\brief Multiplies the two input unsigned char values and stores their results in the third unsigned char
\param cChar The unsigned char where the results will be stored
\param aChar One of the unsigned char to be multiplied
\param bChar One of the unsigned char to be multiplied
\param num_points The number of unsigned char values in aChar and bChar to be multiplied together and stored into cChar
*/
static inline void volk_gnsssdr_8u_x2_multiply_8u_a_sse3(unsigned char* cChar, const unsigned char* aChar, const unsigned char* bChar, unsigned int num_points){
    /* 16 bytes are processed per vector iteration. */
    const unsigned int sse_iters = num_points / 16;

    /* 0x00FF in every 16-bit lane: keeps the low byte of each lane.
       Loop-invariant, so it is built once instead of on every iteration. */
    const __m128i mult1 = _mm_set1_epi16(0x00FF);

    __m128i x, y, x1, x2, y1, y2, x1_mult_y1, x2_mult_y2, tmp, tmp1, tmp2, totalc;

    unsigned char* c = cChar;
    const unsigned char* a = aChar;
    const unsigned char* b = bChar;

    for (unsigned int number = 0; number < sse_iters; number++){
        /* Aligned loads: aChar and bChar must be 16-byte aligned. */
        x = _mm_load_si128((__m128i*)a);
        y = _mm_load_si128((__m128i*)b);

        /* Split each operand into its odd-indexed bytes (x1, y1) and its
           even-indexed bytes (x2, y2), each zero-extended into a 16-bit lane. */
        x1 = _mm_srli_si128 (x, 1);
        x1 = _mm_and_si128 (x1, mult1);
        x2 = _mm_and_si128 (x, mult1);

        y1 = _mm_srli_si128 (y, 1);
        y1 = _mm_and_si128 (y1, mult1);
        y2 = _mm_and_si128 (y, mult1);

        /* 16-bit products; only the low byte of each is kept (wrap modulo 256). */
        x1_mult_y1 = _mm_mullo_epi16 (x1, y1);
        x2_mult_y2 = _mm_mullo_epi16 (x2, y2);

        /* Move the odd-byte results back to odd positions and merge. */
        tmp = _mm_and_si128 (x1_mult_y1, mult1);
        tmp1 = _mm_slli_si128 (tmp, 1);
        tmp2 = _mm_and_si128 (x2_mult_y2, mult1);
        totalc = _mm_or_si128 (tmp1, tmp2);

        /* Aligned store: cChar must be 16-byte aligned as well. */
        _mm_store_si128((__m128i*)c, totalc);

        a += 16;
        b += 16;
        c += 16;
    }

    /* Scalar tail for the remaining num_points % 16 bytes. */
    for (unsigned int i = 0; i < (num_points % 16); ++i)
        {
            *c++ = (*a++) * (*b++);
        }
}
#endif /* LV_HAVE_SSE3 */
#ifdef LV_HAVE_GENERIC
/*!
\brief Multiplies the two input unsigned char values and stores their results in the third unsigned char
\param cChar The unsigned char where the results will be stored
\param aChar One of the unsigned char to be multiplied
\param bChar One of the unsigned char to be multiplied
\param num_points The number of unsigned char values in aChar and bChar to be multiplied together and stored into cChar
*/
static inline void volk_gnsssdr_8u_x2_multiply_8u_a_generic(unsigned char* cChar, const unsigned char* aChar, const unsigned char* bChar, unsigned int num_points){
    unsigned char* cPtr = cChar;
    const unsigned char* aPtr = aChar;
    const unsigned char* bPtr = bChar;

    /* The counter has the same type as num_points, so the loop is well defined
       for every representable count (a signed int would overflow above INT_MAX). */
    for (unsigned int number = 0; number < num_points; number++){
        /* The product is truncated to 8 bits when stored (wraps modulo 256). */
        *cPtr++ = (*aPtr++) * (*bPtr++);
    }
}
#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_ORC
/*!
\brief Multiplies the two input unsigned char values and stores their results in the third unsigned char
\param cChar The unsigned char where the results will be stored
\param aChar One of the unsigned char to be multiplied
\param bChar One of the unsigned char to be multiplied
\param num_points The number of unsigned char values in aChar and bChar to be multiplied together and stored into cChar
*/
/* Declared extern: the definition is not in this header (presumably the
   ORC-generated implementation -- confirm against the build). */
extern void volk_gnsssdr_8u_x2_multiply_8u_a_orc_impl(unsigned char* cVector, const unsigned char* aVector, const unsigned char* bVector, unsigned int num_points);
/* Thin wrapper: forwards all four arguments unchanged to
   volk_gnsssdr_8u_x2_multiply_8u_a_orc_impl and does nothing else.
   NOTE(review): the wrapper is named _u_orc while the implementation it calls
   is named _a_orc_impl -- confirm this naming is intentional. */
static inline void volk_gnsssdr_8u_x2_multiply_8u_u_orc(unsigned char* cVector, const unsigned char* aVector, const unsigned char* bVector, unsigned int num_points){
volk_gnsssdr_8u_x2_multiply_8u_a_orc_impl(cVector, aVector, bVector, num_points);
}
#endif /* LV_HAVE_ORC */
#endif /* INCLUDED_volk_gnsssdr_8u_x2_multiply_8u_a_H */

View File

@ -0,0 +1,866 @@
/*!
* \file volk_gnsssdr_32fc_s32f_x2_update_local_carrier_32fc
* \brief Volk protokernel: replaces the tracking function for update_local_carrier. Algorithm by Julien Pommier and Giovanni Garberoglio, modified by Andrés Cecilia.
* \authors <ul>
* <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
* </ul>
*
* Volk protokernel that replaces the tracking function for update_local_carrier. Algorithm by Julien Pommier and Giovanni Garberoglio, modified by Andrés Cecilia.
*
* -------------------------------------------------------------------------
*
* Copyright (C) 2007 Julien Pommier
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*
*(this is the zlib license)
*
* -------------------------------------------------------------------------
*
* Copyright (C) 2012 Giovanni Garberoglio
* Interdisciplinary Laboratory for Computational Science (LISC)
* Fondazione Bruno Kessler and University of Trento
* via Sommarive, 18
* I-38123 Trento (Italy)
*
* -------------------------------------------------------------------------
*
* Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
*
* GNSS-SDR is a software defined Global Navigation
* Satellite Systems receiver
*
* This file is part of GNSS-SDR.
*
* GNSS-SDR is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* GNSS-SDR is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
*
* -------------------------------------------------------------------------
*/
#ifndef INCLUDED_volk_gnsssdr_32fc_s32f_x2_update_local_carrier_32fc_u_H
#define INCLUDED_volk_gnsssdr_32fc_s32f_x2_update_local_carrier_32fc_u_H
#include <volk_gnsssdr/volk_gnsssdr_common.h>
#include <inttypes.h>
#include <stdio.h>
#ifdef LV_HAVE_AVX
#include <immintrin.h>
#include <math.h>
#include <tmmintrin.h>
/*!
\brief Fills d_carr_sign with the local carrier replica lv_cmake(cos(phase), -sin(phase)), where the phase starts at phase_rad_init and advances by phase_step_rad per sample
\param d_carr_sign Output buffer; num_points complex samples are written
\param phase_rad_init Phase of the first generated sample (radians)
\param phase_step_rad Phase increment between consecutive samples (radians)
\param num_points The number of complex samples to generate
*/
static inline void volk_gnsssdr_s32f_x2_update_local_carrier_32fc_u_avx(lv_32fc_t* d_carr_sign, const float phase_rad_init, const float phase_step_rad, unsigned int num_points){
    /* Eight phases are evaluated per vector iteration. */
    const unsigned int sse_iters = num_points / 8;

    /* Constants of the polynomial sine/cosine approximation (named after the
       Cephes library they come from). */
    __m256 _ps256_minus_cephes_DP1 = _mm256_set1_ps(-0.78515625f);
    __m256 _ps256_minus_cephes_DP2 = _mm256_set1_ps(-2.4187564849853515625e-4f);
    __m256 _ps256_minus_cephes_DP3 = _mm256_set1_ps(-3.77489497744594108e-8f);
    __m256 _ps256_sign_mask = _mm256_set1_ps(-0.f);
    __m128i _pi32avx_1 = _mm_set1_epi32(1);
    __m128i _pi32avx_inv1 = _mm_set1_epi32(~1);
    __m128i _pi32avx_2 = _mm_set1_epi32(2);
    __m128i _pi32avx_4 = _mm_set1_epi32(4);
    __m256 _ps256_cephes_FOPI = _mm256_set1_ps(1.27323954473516f); // 4 / PI
    __m256 _ps256_sincof_p0 = _mm256_set1_ps(-1.9515295891E-4f);
    __m256 _ps256_sincof_p1 = _mm256_set1_ps( 8.3321608736E-3f);
    __m256 _ps256_sincof_p2 = _mm256_set1_ps(-1.6666654611E-1f);
    __m256 _ps256_coscof_p0 = _mm256_set1_ps( 2.443315711809948E-005f);
    __m256 _ps256_coscof_p1 = _mm256_set1_ps(-1.388731625493765E-003f);
    __m256 _ps256_coscof_p2 = _mm256_set1_ps( 4.166664568298827E-002f);
    __m256 _ps256_1 = _mm256_set1_ps(1.f);
    __m256 _ps256_0p5 = _mm256_set1_ps(0.5f);

    /* Each vector iteration advances all eight lanes by 8 phase steps. */
    __m256 phase_step_rad_array = _mm256_set1_ps(8*phase_step_rad);

    __m256 phase_rad_array, x, s, c, swap_sign_bit_sin, sign_bit_cos, poly_mask, z, tmp, y, y2, ysin1, ysin2;
    __m256 xmm1, xmm2, xmm3, sign_bit_sin;
    __m256i imm0, imm2, imm4;
    __m128i imm0_1, imm0_2, imm2_1, imm2_2, imm4_1, imm4_2;

    __VOLK_ATTR_ALIGNED(32) float sin_value[8];
    __VOLK_ATTR_ALIGNED(32) float cos_value[8];

    /* Lane k holds the phase of sample k (lane 0 is the last argument). */
    phase_rad_array = _mm256_set_ps (phase_rad_init+7*phase_step_rad, phase_rad_init+6*phase_step_rad, phase_rad_init+5*phase_step_rad, phase_rad_init+4*phase_step_rad, phase_rad_init+3*phase_step_rad, phase_rad_init+2*phase_step_rad, phase_rad_init+phase_step_rad, phase_rad_init);

    for (unsigned int iter = 0; iter < sse_iters; iter++)
        {
            x = phase_rad_array;

            /* extract the sign bit (upper one) */
            sign_bit_sin = _mm256_and_ps(x, _ps256_sign_mask);
            /* take the absolute value */
            x = _mm256_xor_ps(x, sign_bit_sin);

            /* scale by 4/Pi */
            y = _mm256_mul_ps(x, _ps256_cephes_FOPI);

            /* Plain AVX has no 256-bit integer arithmetic, so the integer part
               is processed as two 128-bit halves with SSE2 instructions.
               The truncated value is kept in an integer register and split with
               the integer extract intrinsic, so no float/int vector types are
               mixed. */
            imm2 = _mm256_cvttps_epi32(y);
            imm2_1 = _mm256_extractf128_si256 (imm2, 0);
            imm2_2 = _mm256_extractf128_si256 (imm2, 1);

            /* j = (j + 1) & ~1 */
            imm2_1 = _mm_add_epi32(imm2_1, _pi32avx_1);
            imm2_2 = _mm_add_epi32(imm2_2, _pi32avx_1);
            imm2_1 = _mm_and_si128(imm2_1, _pi32avx_inv1);
            imm2_2 = _mm_and_si128(imm2_2, _pi32avx_inv1);

            /* _mm256_set_m128i is not defined in some versions of immintrin.h,
               so the halves are recombined with cast + insert. */
            imm2 = _mm256_insertf128_si256(_mm256_castsi128_si256(imm2_1),(imm2_2),1);
            y = _mm256_cvtepi32_ps(imm2);

            imm4_1 = imm2_1;
            imm4_2 = imm2_2;

            /* Bit 2 of j, moved to the sign-bit position: sign swap for the sine. */
            imm0_1 = _mm_and_si128(imm2_1, _pi32avx_4);
            imm0_2 = _mm_and_si128(imm2_2, _pi32avx_4);
            imm0_1 = _mm_slli_epi32(imm0_1, 29);
            imm0_2 = _mm_slli_epi32(imm0_2, 29);
            imm0 = _mm256_insertf128_si256(_mm256_castsi128_si256(imm0_1),(imm0_2),1);

            /* All-ones lanes where bit 1 of j is clear: polynomial selection mask. */
            imm2_1 = _mm_and_si128(imm2_1, _pi32avx_2);
            imm2_2 = _mm_and_si128(imm2_2, _pi32avx_2);
            imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
            imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());
            imm2 = _mm256_insertf128_si256(_mm256_castsi128_si256(imm2_1),(imm2_2),1);

            swap_sign_bit_sin = _mm256_castsi256_ps(imm0);
            poly_mask = _mm256_castsi256_ps(imm2);

            /* The magic pass: "Extended precision modular arithmetic"
               x = ((x - y * DP1) - y * DP2) - y * DP3; */
            xmm1 = _ps256_minus_cephes_DP1;
            xmm2 = _ps256_minus_cephes_DP2;
            xmm3 = _ps256_minus_cephes_DP3;
            xmm1 = _mm256_mul_ps(y, xmm1);
            xmm2 = _mm256_mul_ps(y, xmm2);
            xmm3 = _mm256_mul_ps(y, xmm3);
            x = _mm256_add_ps(x, xmm1);
            x = _mm256_add_ps(x, xmm2);
            x = _mm256_add_ps(x, xmm3);

            /* Sign of the cosine: bit 2 of ~(j - 2), moved to the sign-bit position. */
            imm4_1 = _mm_sub_epi32(imm4_1, _pi32avx_2);
            imm4_2 = _mm_sub_epi32(imm4_2, _pi32avx_2);
            imm4_1 = _mm_andnot_si128(imm4_1, _pi32avx_4);
            imm4_2 = _mm_andnot_si128(imm4_2, _pi32avx_4);
            imm4_1 = _mm_slli_epi32(imm4_1, 29);
            imm4_2 = _mm_slli_epi32(imm4_2, 29);
            imm4 = _mm256_insertf128_si256(_mm256_castsi128_si256(imm4_1),(imm4_2),1);

            sign_bit_cos = _mm256_castsi256_ps(imm4);
            sign_bit_sin = _mm256_xor_ps(sign_bit_sin, swap_sign_bit_sin);

            /* Evaluate the first polynomial (cosine coefficients) */
            z = _mm256_mul_ps(x,x);
            y = _ps256_coscof_p0;
            y = _mm256_mul_ps(y, z);
            y = _mm256_add_ps(y, _ps256_coscof_p1);
            y = _mm256_mul_ps(y, z);
            y = _mm256_add_ps(y, _ps256_coscof_p2);
            y = _mm256_mul_ps(y, z);
            y = _mm256_mul_ps(y, z);
            tmp = _mm256_mul_ps(z, _ps256_0p5);
            y = _mm256_sub_ps(y, tmp);
            y = _mm256_add_ps(y, _ps256_1);

            /* Evaluate the second polynomial (sine coefficients) */
            y2 = _ps256_sincof_p0;
            y2 = _mm256_mul_ps(y2, z);
            y2 = _mm256_add_ps(y2, _ps256_sincof_p1);
            y2 = _mm256_mul_ps(y2, z);
            y2 = _mm256_add_ps(y2, _ps256_sincof_p2);
            y2 = _mm256_mul_ps(y2, z);
            y2 = _mm256_mul_ps(y2, x);
            y2 = _mm256_add_ps(y2, x);

            /* select the correct result from the two polynomials */
            xmm3 = poly_mask;
            ysin2 = _mm256_and_ps(xmm3, y2);
            ysin1 = _mm256_andnot_ps(xmm3, y);
            y2 = _mm256_sub_ps(y2,ysin2);
            y = _mm256_sub_ps(y, ysin1);

            xmm1 = _mm256_add_ps(ysin1,ysin2);
            xmm2 = _mm256_add_ps(y,y2);

            /* update the sign */
            s = _mm256_xor_ps(xmm1, sign_bit_sin);
            c = _mm256_xor_ps(xmm2, sign_bit_cos);

            /* GNSS-SDR needs to return -sin */
            s = _mm256_xor_ps(s, _ps256_sign_mask);

            _mm256_storeu_ps ((float*)sin_value, s);
            _mm256_storeu_ps ((float*)cos_value, c);

            /* Interleave the cosine/sine lanes into the complex output. */
            for (int k = 0; k < 8; k++)
                {
                    d_carr_sign[k] = lv_cmake(cos_value[k], sin_value[k]);
                }
            d_carr_sign += 8;

            phase_rad_array = _mm256_add_ps (phase_rad_array, phase_step_rad_array);
        }

    /* Scalar tail for the remaining num_points % 8 samples, continuing from
       the phase held in lane 0 of the vector phase register. */
    if (num_points%8!=0)
        {
            __VOLK_ATTR_ALIGNED(32) float phase_rad_store[8];
            /* Float store: the register holds packed floats, not integers. */
            _mm256_storeu_ps ((float*)phase_rad_store, phase_rad_array);

            float phase_rad = phase_rad_store[0];

            for (unsigned int k = 0; k < num_points%8; k++)
                {
                    *d_carr_sign = lv_cmake(cos(phase_rad), -sin(phase_rad));
                    d_carr_sign++;
                    phase_rad += phase_step_rad;
                }
        }
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>
/*!
  \brief Generates the local carrier replica using SSE2 (unaligned-store variant)

  Writes num_points samples to d_carr_sign. Each sample is
  lv_cmake(cos(p), -sin(p)), where p starts at phase_rad_init and advances by
  phase_step_rad per sample. Full groups of four samples are evaluated with a
  Cephes-style polynomial sine/cosine; the remaining num_points % 4 samples are
  computed with cos()/sin().

  \param d_carr_sign Output buffer; must have room for num_points samples
  \param phase_rad_init Phase of the first output sample, in radians
  \param phase_step_rad Phase increment between consecutive samples, in radians
  \param num_points Number of samples to generate
*/
static inline void volk_gnsssdr_s32f_x2_update_local_carrier_32fc_u_sse2(lv_32fc_t* d_carr_sign, const float phase_rad_init, const float phase_step_rad, unsigned int num_points){
    const unsigned int sse_iters = num_points / 4;
    const unsigned int tail_points = num_points % 4;

    /* Constants of the Cephes sine/cosine approximation */
    __m128 _ps_minus_cephes_DP1 = _mm_set1_ps(-0.78515625f);
    __m128 _ps_minus_cephes_DP2 = _mm_set1_ps(-2.4187564849853515625e-4f);
    __m128 _ps_minus_cephes_DP3 = _mm_set1_ps(-3.77489497744594108e-8f);
    __m128 _ps_sign_mask = _mm_set1_ps(-0.f);
    __m128i _pi32_1 = _mm_set1_epi32(1);
    __m128i _pi32_inv1 = _mm_set1_epi32(~1);
    __m128i _pi32_2 = _mm_set1_epi32(2);
    __m128i _pi32_4 = _mm_set1_epi32(4);
    __m128 _ps_cephes_FOPI = _mm_set1_ps(1.27323954473516f); // 4 / PI
    __m128 _ps_sincof_p0 = _mm_set1_ps(-1.9515295891E-4f);
    __m128 _ps_sincof_p1 = _mm_set1_ps( 8.3321608736E-3f);
    __m128 _ps_sincof_p2 = _mm_set1_ps(-1.6666654611E-1f);
    __m128 _ps_coscof_p0 = _mm_set1_ps( 2.443315711809948E-005f);
    __m128 _ps_coscof_p1 = _mm_set1_ps(-1.388731625493765E-003f);
    __m128 _ps_coscof_p2 = _mm_set1_ps( 4.166664568298827E-002f);
    __m128 _ps_1 = _mm_set1_ps(1.f);
    __m128 _ps_0p5 = _mm_set1_ps(0.5f);

    /* Each vector iteration advances every lane by four samples */
    __m128 phase_step_rad_array = _mm_set1_ps(4*phase_step_rad);

    __m128 phase_rad_array, x, s, c, swap_sign_bit_sin, sign_bit_cos, poly_mask, z, tmp, y, y2, ysin1, ysin2;
    __m128 xmm1, xmm2, xmm3, sign_bit_sin;
    __m128i emm0, emm2, emm4;

    __VOLK_ATTR_ALIGNED(16) float sin_value[4];
    __VOLK_ATTR_ALIGNED(16) float cos_value[4];

    /* Lane k holds the phase of sample k (note _mm_set_ps takes the highest lane first) */
    phase_rad_array = _mm_set_ps (phase_rad_init+3*phase_step_rad, phase_rad_init+2*phase_step_rad, phase_rad_init+phase_step_rad, phase_rad_init);

    for(unsigned int iter = 0; iter < sse_iters; iter++)
        {
            x = phase_rad_array;

            /* extract the sign bit (upper one) */
            sign_bit_sin = _mm_and_ps(x, _ps_sign_mask);
            /* take the absolute value */
            x = _mm_xor_ps(x, sign_bit_sin);

            /* scale by 4/Pi */
            y = _mm_mul_ps(x, _ps_cephes_FOPI);

            /* store the integer part of y in emm2 */
            emm2 = _mm_cvttps_epi32(y);

            /* j=(j+1) & (~1) (see the cephes sources) */
            emm2 = _mm_add_epi32(emm2, _pi32_1);
            emm2 = _mm_and_si128(emm2, _pi32_inv1);
            y = _mm_cvtepi32_ps(emm2);

            emm4 = emm2;

            /* get the swap sign flag for the sine */
            emm0 = _mm_and_si128(emm2, _pi32_4);
            emm0 = _mm_slli_epi32(emm0, 29);
            swap_sign_bit_sin = _mm_castsi128_ps(emm0);

            /* get the polynom selection mask for the sine */
            emm2 = _mm_and_si128(emm2, _pi32_2);
            emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
            poly_mask = _mm_castsi128_ps(emm2);

            /* The magic pass: "Extended precision modular arithmetic"
               x = ((x - y * DP1) - y * DP2) - y * DP3; */
            xmm1 = _mm_mul_ps(y, _ps_minus_cephes_DP1);
            xmm2 = _mm_mul_ps(y, _ps_minus_cephes_DP2);
            xmm3 = _mm_mul_ps(y, _ps_minus_cephes_DP3);
            x = _mm_add_ps(_mm_add_ps(x, xmm1), _mm_add_ps(xmm2, xmm3));

            /* sign flag for the cosine */
            emm4 = _mm_sub_epi32(emm4, _pi32_2);
            emm4 = _mm_andnot_si128(emm4, _pi32_4);
            emm4 = _mm_slli_epi32(emm4, 29);
            sign_bit_cos = _mm_castsi128_ps(emm4);

            sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);

            /* Evaluate the first polynom (0 <= x <= Pi/4) */
            z = _mm_mul_ps(x,x);
            y = _ps_coscof_p0;
            y = _mm_mul_ps(y, z);
            y = _mm_add_ps(y, _ps_coscof_p1);
            y = _mm_mul_ps(y, z);
            y = _mm_add_ps(y, _ps_coscof_p2);
            y = _mm_mul_ps(y, _mm_mul_ps(z, z));
            tmp = _mm_mul_ps(z, _ps_0p5);
            y = _mm_sub_ps(y, tmp);
            y = _mm_add_ps(y, _ps_1);

            /* Evaluate the second polynom (Pi/4 <= x <= 0) */
            y2 = _ps_sincof_p0;
            y2 = _mm_mul_ps(y2, z);
            y2 = _mm_add_ps(y2, _ps_sincof_p1);
            y2 = _mm_mul_ps(y2, z);
            y2 = _mm_add_ps(y2, _ps_sincof_p2);
            y2 = _mm_mul_ps(y2, _mm_mul_ps(z, x));
            y2 = _mm_add_ps(y2, x);

            /* select the correct result from the two polynoms */
            xmm3 = poly_mask;
            ysin2 = _mm_and_ps(xmm3, y2);
            ysin1 = _mm_andnot_ps(xmm3, y);
            y2 = _mm_sub_ps(y2,ysin2);
            y = _mm_sub_ps(y, ysin1);

            xmm1 = _mm_add_ps(ysin1,ysin2);
            xmm2 = _mm_add_ps(y,y2);

            /* update the sign */
            s = _mm_xor_ps(xmm1, sign_bit_sin);
            c = _mm_xor_ps(xmm2, sign_bit_cos);

            //GNSS-SDR needs to return -sin
            s = _mm_xor_ps(s, _ps_sign_mask);

            _mm_storeu_ps (sin_value, s);
            _mm_storeu_ps (cos_value, c);

            for(unsigned int lane = 0; lane < 4; lane++)
                {
                    d_carr_sign[lane] = lv_cmake(cos_value[lane], sin_value[lane]);
                }
            d_carr_sign += 4;

            phase_rad_array = _mm_add_ps (phase_rad_array, phase_step_rad_array);
        }

    if (tail_points != 0)
        {
            /* Lane 0 of the phase vector is the phase of the next sample to emit.
               The vector holds floats, so it is stored with the float store. */
            __VOLK_ATTR_ALIGNED(16) float phase_rad_store[4];
            _mm_storeu_ps (phase_rad_store, phase_rad_array);

            float phase_rad = phase_rad_store[0];

            for(unsigned int k = 0; k < tail_points; k++)
                {
                    *d_carr_sign = lv_cmake(cos(phase_rad), -sin(phase_rad));
                    d_carr_sign++;
                    phase_rad += phase_step_rad;
                }
        }
}
#endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_GENERIC
/*!
  \brief Generates the local carrier replica (portable C version)

  Writes num_points samples to d_carr_sign. Each sample is
  lv_cmake(cos(p), -sin(p)), where p starts at phase_rad_init and is
  incremented by phase_step_rad after every sample.

  \param d_carr_sign Output buffer; must have room for num_points samples
  \param phase_rad_init Phase of the first output sample, in radians
  \param phase_step_rad Phase increment between consecutive samples, in radians
  \param num_points Number of samples to generate
*/
static inline void volk_gnsssdr_s32f_x2_update_local_carrier_32fc_generic(lv_32fc_t* d_carr_sign, const float phase_rad_init, const float phase_step_rad, unsigned int num_points){
    float phase_rad = phase_rad_init;

    /* The counter is unsigned to match num_points; a signed int would
       overflow for counts above INT_MAX. */
    for(unsigned int k = 0; k < num_points; k++)
        {
            *d_carr_sign = lv_cmake(cos(phase_rad), -sin(phase_rad));
            d_carr_sign++;
            phase_rad += phase_step_rad;
        }
}
#endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_volk_gnsssdr_32fc_s32f_x2_update_local_carrier_32fc_u_H */
#ifndef INCLUDED_volk_gnsssdr_32fc_s32f_x2_update_local_carrier_32fc_a_H
#define INCLUDED_volk_gnsssdr_32fc_s32f_x2_update_local_carrier_32fc_a_H
#include <volk_gnsssdr/volk_gnsssdr_common.h>
#include <inttypes.h>
#include <stdio.h>
#ifdef LV_HAVE_AVX
#include <immintrin.h>
/*!
  \brief Generates the local carrier replica using AVX (aligned-store variant)

  Writes num_points samples to d_carr_sign. Each sample is
  lv_cmake(cos(p), -sin(p)), where p starts at phase_rad_init and advances by
  phase_step_rad per sample. Full groups of eight samples are evaluated with a
  Cephes-style polynomial sine/cosine (integer steps are done on 128-bit
  halves with SSE2 instructions); the remaining num_points % 8 samples are
  computed with cos()/sin().

  \param d_carr_sign Output buffer; must have room for num_points samples
  \param phase_rad_init Phase of the first output sample, in radians
  \param phase_step_rad Phase increment between consecutive samples, in radians
  \param num_points Number of samples to generate
*/
static inline void volk_gnsssdr_s32f_x2_update_local_carrier_32fc_a_avx(lv_32fc_t* d_carr_sign, const float phase_rad_init, const float phase_step_rad, unsigned int num_points){
    const unsigned int avx_iters = num_points / 8;
    const unsigned int tail_points = num_points % 8;

    /* Constants of the Cephes sine/cosine approximation */
    __m256 _ps256_minus_cephes_DP1 = _mm256_set1_ps(-0.78515625f);
    __m256 _ps256_minus_cephes_DP2 = _mm256_set1_ps(-2.4187564849853515625e-4f);
    __m256 _ps256_minus_cephes_DP3 = _mm256_set1_ps(-3.77489497744594108e-8f);
    __m256 _ps256_sign_mask = _mm256_set1_ps(-0.f);
    __m128i _pi32avx_1 = _mm_set1_epi32(1);
    __m128i _pi32avx_inv1 = _mm_set1_epi32(~1);
    __m128i _pi32avx_2 = _mm_set1_epi32(2);
    __m128i _pi32avx_4 = _mm_set1_epi32(4);
    __m256 _ps256_cephes_FOPI = _mm256_set1_ps(1.27323954473516f); // 4 / PI
    __m256 _ps256_sincof_p0 = _mm256_set1_ps(-1.9515295891E-4f);
    __m256 _ps256_sincof_p1 = _mm256_set1_ps( 8.3321608736E-3f);
    __m256 _ps256_sincof_p2 = _mm256_set1_ps(-1.6666654611E-1f);
    __m256 _ps256_coscof_p0 = _mm256_set1_ps( 2.443315711809948E-005f);
    __m256 _ps256_coscof_p1 = _mm256_set1_ps(-1.388731625493765E-003f);
    __m256 _ps256_coscof_p2 = _mm256_set1_ps( 4.166664568298827E-002f);
    __m256 _ps256_1 = _mm256_set1_ps(1.f);
    __m256 _ps256_0p5 = _mm256_set1_ps(0.5f);

    /* Each vector iteration advances every lane by eight samples */
    __m256 phase_step_rad_array = _mm256_set1_ps(8*phase_step_rad);

    __m256 phase_rad_array, x, s, c, swap_sign_bit_sin, sign_bit_cos, poly_mask, z, tmp, y, y2, ysin1, ysin2;
    __m256 xmm1, xmm2, xmm3, sign_bit_sin;
    __m256i imm0, imm2, imm4;
    __m128i imm0_1, imm0_2, imm2_1, imm2_2, imm4_1, imm4_2;

    __VOLK_ATTR_ALIGNED(32) float sin_value[8];
    __VOLK_ATTR_ALIGNED(32) float cos_value[8];

    /* Lane k holds the phase of sample k (note _mm256_set_ps takes the highest lane first) */
    phase_rad_array = _mm256_set_ps (phase_rad_init+7*phase_step_rad, phase_rad_init+6*phase_step_rad, phase_rad_init+5*phase_step_rad, phase_rad_init+4*phase_step_rad, phase_rad_init+3*phase_step_rad, phase_rad_init+2*phase_step_rad, phase_rad_init+phase_step_rad, phase_rad_init);

    for(unsigned int iter = 0; iter < avx_iters; iter++)
        {
            x = phase_rad_array;

            /* extract the sign bit (upper one) */
            sign_bit_sin = _mm256_and_ps(x, _ps256_sign_mask);
            /* take the absolute value */
            x = _mm256_xor_ps(x, sign_bit_sin);

            /* scale by 4/Pi */
            y = _mm256_mul_ps(x, _ps256_cephes_FOPI);

            /* we use SSE2 routines to perform the integer ops:
               truncate to integers, then split into two 128-bit halves.
               The integer vector is kept in integer-typed variables. */
            imm2 = _mm256_cvttps_epi32(y);
            imm2_1 = _mm256_extractf128_si256 (imm2, 0);
            imm2_2 = _mm256_extractf128_si256 (imm2, 1);

            /* j=(j+1) & (~1) (see the cephes sources) */
            imm2_1 = _mm_add_epi32(imm2_1, _pi32avx_1);
            imm2_2 = _mm_add_epi32(imm2_2, _pi32avx_1);
            imm2_1 = _mm_and_si128(imm2_1, _pi32avx_inv1);
            imm2_2 = _mm_and_si128(imm2_2, _pi32avx_inv1);

            //_mm256_set_m128i not defined in some versions of immintrin.h
            imm2 = _mm256_insertf128_si256(_mm256_castsi128_si256(imm2_1),(imm2_2),1);
            y = _mm256_cvtepi32_ps(imm2);

            imm4_1 = imm2_1;
            imm4_2 = imm2_2;

            /* get the swap sign flag for the sine */
            imm0_1 = _mm_and_si128(imm2_1, _pi32avx_4);
            imm0_2 = _mm_and_si128(imm2_2, _pi32avx_4);
            imm0_1 = _mm_slli_epi32(imm0_1, 29);
            imm0_2 = _mm_slli_epi32(imm0_2, 29);
            imm0 = _mm256_insertf128_si256(_mm256_castsi128_si256(imm0_1),(imm0_2),1);

            /* get the polynom selection mask for the sine */
            imm2_1 = _mm_and_si128(imm2_1, _pi32avx_2);
            imm2_2 = _mm_and_si128(imm2_2, _pi32avx_2);
            imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
            imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());
            imm2 = _mm256_insertf128_si256(_mm256_castsi128_si256(imm2_1),(imm2_2),1);

            swap_sign_bit_sin = _mm256_castsi256_ps(imm0);
            poly_mask = _mm256_castsi256_ps(imm2);

            /* The magic pass: "Extended precision modular arithmetic"
               x = ((x - y * DP1) - y * DP2) - y * DP3; */
            xmm1 = _ps256_minus_cephes_DP1;
            xmm2 = _ps256_minus_cephes_DP2;
            xmm3 = _ps256_minus_cephes_DP3;
            xmm1 = _mm256_mul_ps(y, xmm1);
            xmm2 = _mm256_mul_ps(y, xmm2);
            xmm3 = _mm256_mul_ps(y, xmm3);
            x = _mm256_add_ps(x, xmm1);
            x = _mm256_add_ps(x, xmm2);
            x = _mm256_add_ps(x, xmm3);

            /* sign flag for the cosine */
            imm4_1 = _mm_sub_epi32(imm4_1, _pi32avx_2);
            imm4_2 = _mm_sub_epi32(imm4_2, _pi32avx_2);
            imm4_1 = _mm_andnot_si128(imm4_1, _pi32avx_4);
            imm4_2 = _mm_andnot_si128(imm4_2, _pi32avx_4);
            imm4_1 = _mm_slli_epi32(imm4_1, 29);
            imm4_2 = _mm_slli_epi32(imm4_2, 29);
            imm4 = _mm256_insertf128_si256(_mm256_castsi128_si256(imm4_1),(imm4_2),1);

            sign_bit_cos = _mm256_castsi256_ps(imm4);
            sign_bit_sin = _mm256_xor_ps(sign_bit_sin, swap_sign_bit_sin);

            /* Evaluate the first polynom (0 <= x <= Pi/4) */
            z = _mm256_mul_ps(x,x);
            y = _ps256_coscof_p0;
            y = _mm256_mul_ps(y, z);
            y = _mm256_add_ps(y, _ps256_coscof_p1);
            y = _mm256_mul_ps(y, z);
            y = _mm256_add_ps(y, _ps256_coscof_p2);
            y = _mm256_mul_ps(y, z);
            y = _mm256_mul_ps(y, z);
            tmp = _mm256_mul_ps(z, _ps256_0p5);
            y = _mm256_sub_ps(y, tmp);
            y = _mm256_add_ps(y, _ps256_1);

            /* Evaluate the second polynom (Pi/4 <= x <= 0) */
            y2 = _ps256_sincof_p0;
            y2 = _mm256_mul_ps(y2, z);
            y2 = _mm256_add_ps(y2, _ps256_sincof_p1);
            y2 = _mm256_mul_ps(y2, z);
            y2 = _mm256_add_ps(y2, _ps256_sincof_p2);
            y2 = _mm256_mul_ps(y2, z);
            y2 = _mm256_mul_ps(y2, x);
            y2 = _mm256_add_ps(y2, x);

            /* select the correct result from the two polynoms */
            xmm3 = poly_mask;
            ysin2 = _mm256_and_ps(xmm3, y2);
            ysin1 = _mm256_andnot_ps(xmm3, y);
            y2 = _mm256_sub_ps(y2,ysin2);
            y = _mm256_sub_ps(y, ysin1);

            xmm1 = _mm256_add_ps(ysin1,ysin2);
            xmm2 = _mm256_add_ps(y,y2);

            /* update the sign */
            s = _mm256_xor_ps(xmm1, sign_bit_sin);
            c = _mm256_xor_ps(xmm2, sign_bit_cos);

            //GNSS-SDR needs to return -sin
            s = _mm256_xor_ps(s, _ps256_sign_mask);

            _mm256_store_ps (sin_value, s);
            _mm256_store_ps (cos_value, c);

            for(unsigned int lane = 0; lane < 8; lane++)
                {
                    d_carr_sign[lane] = lv_cmake(cos_value[lane], sin_value[lane]);
                }
            d_carr_sign += 8;

            phase_rad_array = _mm256_add_ps (phase_rad_array, phase_step_rad_array);
        }

    if (tail_points != 0)
        {
            /* Lane 0 of the phase vector is the phase of the next sample to emit */
            __VOLK_ATTR_ALIGNED(32) float phase_rad_store[8];
            _mm256_store_ps (phase_rad_store, phase_rad_array);

            float phase_rad = phase_rad_store[0];

            for(unsigned int k = 0; k < tail_points; k++)
                {
                    *d_carr_sign = lv_cmake(cos(phase_rad), -sin(phase_rad));
                    d_carr_sign++;
                    phase_rad += phase_step_rad;
                }
        }
}
#endif /* LV_HAVE_AVX */
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>
/*!
  \brief Generates the local carrier replica using SSE2 (aligned-store variant)

  Writes num_points samples to d_carr_sign. Each sample is
  lv_cmake(cos(p), -sin(p)), where p starts at phase_rad_init and advances by
  phase_step_rad per sample. Full groups of four samples are evaluated with a
  Cephes-style polynomial sine/cosine; the remaining num_points % 4 samples are
  computed with cos()/sin().

  \param d_carr_sign Output buffer; must have room for num_points samples
  \param phase_rad_init Phase of the first output sample, in radians
  \param phase_step_rad Phase increment between consecutive samples, in radians
  \param num_points Number of samples to generate
*/
static inline void volk_gnsssdr_s32f_x2_update_local_carrier_32fc_a_sse2(lv_32fc_t* d_carr_sign, const float phase_rad_init, const float phase_step_rad, unsigned int num_points){
    const unsigned int sse_iters = num_points / 4;
    const unsigned int tail_points = num_points % 4;

    /* Constants of the Cephes sine/cosine approximation */
    __m128 _ps_minus_cephes_DP1 = _mm_set1_ps(-0.78515625f);
    __m128 _ps_minus_cephes_DP2 = _mm_set1_ps(-2.4187564849853515625e-4f);
    __m128 _ps_minus_cephes_DP3 = _mm_set1_ps(-3.77489497744594108e-8f);
    __m128 _ps_sign_mask = _mm_set1_ps(-0.f);
    __m128i _pi32_1 = _mm_set1_epi32(1);
    __m128i _pi32_inv1 = _mm_set1_epi32(~1);
    __m128i _pi32_2 = _mm_set1_epi32(2);
    __m128i _pi32_4 = _mm_set1_epi32(4);
    __m128 _ps_cephes_FOPI = _mm_set1_ps(1.27323954473516f); // 4 / PI
    __m128 _ps_sincof_p0 = _mm_set1_ps(-1.9515295891E-4f);
    __m128 _ps_sincof_p1 = _mm_set1_ps( 8.3321608736E-3f);
    __m128 _ps_sincof_p2 = _mm_set1_ps(-1.6666654611E-1f);
    __m128 _ps_coscof_p0 = _mm_set1_ps( 2.443315711809948E-005f);
    __m128 _ps_coscof_p1 = _mm_set1_ps(-1.388731625493765E-003f);
    __m128 _ps_coscof_p2 = _mm_set1_ps( 4.166664568298827E-002f);
    __m128 _ps_1 = _mm_set1_ps(1.f);
    __m128 _ps_0p5 = _mm_set1_ps(0.5f);

    /* Each vector iteration advances every lane by four samples */
    __m128 phase_step_rad_array = _mm_set1_ps(4*phase_step_rad);

    __m128 phase_rad_array, x, s, c, swap_sign_bit_sin, sign_bit_cos, poly_mask, z, tmp, y, y2, ysin1, ysin2;
    __m128 xmm1, xmm2, xmm3, sign_bit_sin;
    __m128i emm0, emm2, emm4;

    __VOLK_ATTR_ALIGNED(16) float sin_value[4];
    __VOLK_ATTR_ALIGNED(16) float cos_value[4];

    /* Lane k holds the phase of sample k (note _mm_set_ps takes the highest lane first) */
    phase_rad_array = _mm_set_ps (phase_rad_init+3*phase_step_rad, phase_rad_init+2*phase_step_rad, phase_rad_init+phase_step_rad, phase_rad_init);

    for(unsigned int iter = 0; iter < sse_iters; iter++)
        {
            x = phase_rad_array;

            /* extract the sign bit (upper one) */
            sign_bit_sin = _mm_and_ps(x, _ps_sign_mask);
            /* take the absolute value */
            x = _mm_xor_ps(x, sign_bit_sin);

            /* scale by 4/Pi */
            y = _mm_mul_ps(x, _ps_cephes_FOPI);

            /* store the integer part of y in emm2 */
            emm2 = _mm_cvttps_epi32(y);

            /* j=(j+1) & (~1) (see the cephes sources) */
            emm2 = _mm_add_epi32(emm2, _pi32_1);
            emm2 = _mm_and_si128(emm2, _pi32_inv1);
            y = _mm_cvtepi32_ps(emm2);

            emm4 = emm2;

            /* get the swap sign flag for the sine */
            emm0 = _mm_and_si128(emm2, _pi32_4);
            emm0 = _mm_slli_epi32(emm0, 29);
            swap_sign_bit_sin = _mm_castsi128_ps(emm0);

            /* get the polynom selection mask for the sine */
            emm2 = _mm_and_si128(emm2, _pi32_2);
            emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
            poly_mask = _mm_castsi128_ps(emm2);

            /* The magic pass: "Extended precision modular arithmetic"
               x = ((x - y * DP1) - y * DP2) - y * DP3; */
            xmm1 = _mm_mul_ps(y, _ps_minus_cephes_DP1);
            xmm2 = _mm_mul_ps(y, _ps_minus_cephes_DP2);
            xmm3 = _mm_mul_ps(y, _ps_minus_cephes_DP3);
            x = _mm_add_ps(_mm_add_ps(x, xmm1), _mm_add_ps(xmm2, xmm3));

            /* sign flag for the cosine */
            emm4 = _mm_sub_epi32(emm4, _pi32_2);
            emm4 = _mm_andnot_si128(emm4, _pi32_4);
            emm4 = _mm_slli_epi32(emm4, 29);
            sign_bit_cos = _mm_castsi128_ps(emm4);

            sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);

            /* Evaluate the first polynom (0 <= x <= Pi/4) */
            z = _mm_mul_ps(x,x);
            y = _ps_coscof_p0;
            y = _mm_mul_ps(y, z);
            y = _mm_add_ps(y, _ps_coscof_p1);
            y = _mm_mul_ps(y, z);
            y = _mm_add_ps(y, _ps_coscof_p2);
            y = _mm_mul_ps(y, _mm_mul_ps(z, z));
            tmp = _mm_mul_ps(z, _ps_0p5);
            y = _mm_sub_ps(y, tmp);
            y = _mm_add_ps(y, _ps_1);

            /* Evaluate the second polynom (Pi/4 <= x <= 0) */
            y2 = _ps_sincof_p0;
            y2 = _mm_mul_ps(y2, z);
            y2 = _mm_add_ps(y2, _ps_sincof_p1);
            y2 = _mm_mul_ps(y2, z);
            y2 = _mm_add_ps(y2, _ps_sincof_p2);
            y2 = _mm_mul_ps(y2, _mm_mul_ps(z, x));
            y2 = _mm_add_ps(y2, x);

            /* select the correct result from the two polynoms */
            xmm3 = poly_mask;
            ysin2 = _mm_and_ps(xmm3, y2);
            ysin1 = _mm_andnot_ps(xmm3, y);
            y2 = _mm_sub_ps(y2,ysin2);
            y = _mm_sub_ps(y, ysin1);

            xmm1 = _mm_add_ps(ysin1,ysin2);
            xmm2 = _mm_add_ps(y,y2);

            /* update the sign */
            s = _mm_xor_ps(xmm1, sign_bit_sin);
            c = _mm_xor_ps(xmm2, sign_bit_cos);

            //GNSS-SDR needs to return -sin
            s = _mm_xor_ps(s, _ps_sign_mask);

            _mm_store_ps (sin_value, s);
            _mm_store_ps (cos_value, c);

            for(unsigned int lane = 0; lane < 4; lane++)
                {
                    d_carr_sign[lane] = lv_cmake(cos_value[lane], sin_value[lane]);
                }
            d_carr_sign += 4;

            phase_rad_array = _mm_add_ps (phase_rad_array, phase_step_rad_array);
        }

    if (tail_points != 0)
        {
            /* Lane 0 of the phase vector is the phase of the next sample to emit.
               The vector holds floats, so it is stored with the float store. */
            __VOLK_ATTR_ALIGNED(16) float phase_rad_store[4];
            _mm_store_ps (phase_rad_store, phase_rad_array);

            float phase_rad = phase_rad_store[0];

            for(unsigned int k = 0; k < tail_points; k++)
                {
                    *d_carr_sign = lv_cmake(cos(phase_rad), -sin(phase_rad));
                    d_carr_sign++;
                    phase_rad += phase_step_rad;
                }
        }
}
#endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_GENERIC
/*!
  \brief Generates the local carrier replica (portable C version)

  Writes num_points samples to d_carr_sign. Each sample is
  lv_cmake(cos(p), -sin(p)), where p starts at phase_rad_init and is
  incremented by phase_step_rad after every sample.

  \param d_carr_sign Output buffer; must have room for num_points samples
  \param phase_rad_init Phase of the first output sample, in radians
  \param phase_step_rad Phase increment between consecutive samples, in radians
  \param num_points Number of samples to generate
*/
static inline void volk_gnsssdr_s32f_x2_update_local_carrier_32fc_a_generic(lv_32fc_t* d_carr_sign, const float phase_rad_init, const float phase_step_rad, unsigned int num_points){
    float phase_rad = phase_rad_init;

    /* The counter is unsigned to match num_points; a signed int would
       overflow for counts above INT_MAX. */
    for(unsigned int k = 0; k < num_points; k++)
        {
            *d_carr_sign = lv_cmake(cos(phase_rad), -sin(phase_rad));
            d_carr_sign++;
            phase_rad += phase_step_rad;
        }
}
#endif /* LV_HAVE_GENERIC */
#endif /* INCLUDED_volk_gnsssdr_32fc_s32f_x2_update_local_carrier_32fc_a_H */

View File

@ -0,0 +1,578 @@
#
# Copyright 2011-2012,2014 Free Software Foundation, Inc.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
########################################################################
# header file detection
########################################################################
include(CheckIncludeFile)

# Probe each optional system header. For every header that is found, the
# result variable HAVE_<NAME>_H is set and the same name is passed to the
# compiler as a -D definition.
foreach(volk_gnsssdr_hdr cpuid intrin fenv dlfcn)
    string(TOUPPER "HAVE_${volk_gnsssdr_hdr}_H" volk_gnsssdr_hdr_var)
    check_include_file(${volk_gnsssdr_hdr}.h ${volk_gnsssdr_hdr_var})
    if(${volk_gnsssdr_hdr_var})
        add_definitions(-D${volk_gnsssdr_hdr_var})
    endif()
endforeach()
unset(volk_gnsssdr_hdr_var)

# When dlfcn.h is available, also add the platform's dynamic-loading
# libraries to the list of libraries volk_gnsssdr links against.
if(HAVE_DLFCN_H)
    list(APPEND volk_gnsssdr_libraries ${CMAKE_DL_LIBS})
endif()
########################################################################
# Setup the compiler name
########################################################################
# COMPILER_NAME is the compiler id reported by CMake, except under MSVC
# where the name is forced to "MSVC".
if(MSVC)
    set(COMPILER_NAME MSVC)
else()
    set(COMPILER_NAME ${CMAKE_C_COMPILER_ID})
endif()
message(STATUS "Compiler name: ${COMPILER_NAME}")

# An empty compiler id leaves COMPILER_NAME undefined; stop the configure
# step here because the arch-flag lookup below needs a compiler name.
if(NOT DEFINED COMPILER_NAME)
    message(FATAL_ERROR "COMPILER_NAME undefined. Volk build may not support this compiler.")
endif()
########################################################################
# Special clang flag so flag checks can fail
########################################################################
# If the compiler accepts -Werror=unused-command-line-argument, remember it in
# VOLK_FLAG_CHECK_FLAGS so later flag probes can fail on unsupported flags.
# NOTE(review): the section heading talks about clang, but this guard matches
# compiler names containing "GNU" -- confirm which compilers are intended.
if(COMPILER_NAME MATCHES "GNU")
    include(CheckCXXCompilerFlag)
    check_cxx_compiler_flag(
        "-Werror=unused-command-line-argument"
        HAVE_WERROR_UNUSED_CMD_LINE_ARG
    )
    if(HAVE_WERROR_UNUSED_CMD_LINE_ARG)
        set(VOLK_FLAG_CHECK_FLAGS "-Werror=unused-command-line-argument")
    endif()
endif()
########################################################################
# check for posix_memalign, since some OSs do not internally define
# _XOPEN_SOURCE or _POSIX_C_SOURCE; they leave this to the user.
########################################################################
include(CheckFunctionExists)

# Define HAVE_POSIX_MEMALIGN for the sources when the C library provides
# posix_memalign().
check_function_exists(posix_memalign HAVE_POSIX_MEMALIGN)
if(HAVE_POSIX_MEMALIGN)
    add_definitions(-DHAVE_POSIX_MEMALIGN)
endif()
########################################################################
# detect x86 flavor of CPU
########################################################################
# Set CPU_IS_x86 when the target processor name is one of the x86 family
# spellings. The expansion is quoted so that an empty
# CMAKE_SYSTEM_PROCESSOR yields a false condition instead of a malformed
# if() expression.
if("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "^(i.86|x86|x86_64|amd64)$")
    message(STATUS "x86* CPU detected")
    set(CPU_IS_x86 TRUE)
endif()
########################################################################
# determine passing architectures based on compile flag tests
########################################################################
# Run the generator helper in "arch_flags" mode for the detected compiler and
# capture its output in arch_flag_lines.
# The script is located relative to PROJECT_SOURCE_DIR rather than
# CMAKE_SOURCE_DIR so that it is still found when volk_gnsssdr is pulled into
# a larger build with add_subdirectory().
execute_process(
    COMMAND ${PYTHON_EXECUTABLE} ${PYTHON_DASH_B}
    ${PROJECT_SOURCE_DIR}/gen/volk_gnsssdr_compile_utils.py
    --mode "arch_flags" --compiler "${COMPILER_NAME}"
    OUTPUT_VARIABLE arch_flag_lines OUTPUT_STRIP_TRAILING_WHITESPACE
)
# check_arch(<arch_name> [<flag>...])
#
# Tests every compiler flag listed after <arch_name>. If all of them are
# accepted by the C++ compiler (or no flags are given), <arch_name> is
# appended to the caller's available_archs list. Also sets have_<arch_name>
# to TRUE/FALSE in the caller's scope.
#
# This stays a macro because it must modify available_archs in the calling
# scope.
macro(check_arch arch_name)
    set(flags ${ARGN})
    set(have_${arch_name} TRUE)
    include(CheckCXXCompilerFlag)
    foreach(flag ${flags})
        # Build the result-variable name for this flag: "have" + flag with
        # every non-alphanumeric character replaced by "_". Done with a CMake
        # regex instead of spawning Python once per flag.
        string(REGEX REPLACE "[^A-Za-z0-9_]" "_" have_flag "have${flag}")
        if(VOLK_FLAG_CHECK_FLAGS)
            set(CMAKE_REQUIRED_FLAGS ${VOLK_FLAG_CHECK_FLAGS})
        endif()
        CHECK_CXX_COMPILER_FLAG(${flag} ${have_flag})
        unset(CMAKE_REQUIRED_FLAGS)
        if(NOT ${have_flag})
            set(have_${arch_name} FALSE)
        endif()
    endforeach()
    if(have_${arch_name})
        list(APPEND available_archs ${arch_name})
    endif()
endmacro(check_arch)
# Each entry of arch_flag_lines is a comma-separated record; turn it into a
# CMake list and hand it to check_arch() (first element is the arch name,
# the rest are the flags to test).
foreach(arch_flag_line ${arch_flag_lines})
    string(REPLACE "," ";" arch_flags "${arch_flag_line}")
    check_arch(${arch_flags})
endforeach()
# OVERRULE_ARCH(<arch> <reason>)
#
# Logs <reason> and removes <arch> from the caller's available_archs list.
# The removal is skipped when available_archs was never defined, because
# list(REMOVE_ITEM) on a non-existent variable is an error in CMake releases
# older than 3.14.
macro(OVERRULE_ARCH arch reason)
    message(STATUS "${reason}, Overruled arch ${arch}")
    if(DEFINED available_archs)
        list(REMOVE_ITEM available_archs ${arch})
    endif()
endmacro(OVERRULE_ARCH)
########################################################################
# eliminate AVX on if not on x86, or if the compiler does not accept
# the xgetbv instruction, or {if not cross-compiling and the xgetbv
# executable does not function correctly}.
########################################################################
set(HAVE_XGETBV 0)
set(HAVE_AVX_CVTPI32_PS 0)
if(CPU_IS_x86)
    # check to see if the compiler/linker works with xgetbv instruction
    file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_xgetbv.c "unsigned long long _xgetbv(unsigned int index) { unsigned int eax, edx; __asm__ __volatile__(\"xgetbv\" : \"=a\"(eax), \"=d\"(edx) : \"c\"(index)); return ((unsigned long long)edx << 32) | eax; } int main (void) { (void) _xgetbv(0); return (0); }")
    execute_process(COMMAND ${CMAKE_C_COMPILER} -o
        ${CMAKE_CURRENT_BINARY_DIR}/test_xgetbv
        ${CMAKE_CURRENT_BINARY_DIR}/test_xgetbv.c
        OUTPUT_QUIET ERROR_QUIET
        RESULT_VARIABLE avx_compile_result)
    # RESULT_VARIABLE may hold an error string (e.g. when the compiler could
    # not be started), so it is quoted to keep the if() expression well-formed.
    if(NOT "${avx_compile_result}" EQUAL 0)
        OVERRULE_ARCH(avx "Compiler or linker missing xgetbv instruction")
    elseif(NOT CROSSCOMPILE_MULTILIB)
        # native build: the test program must also run successfully
        execute_process(COMMAND ${CMAKE_CURRENT_BINARY_DIR}/test_xgetbv
            OUTPUT_QUIET ERROR_QUIET
            RESULT_VARIABLE avx_exe_result)
        if(NOT "${avx_exe_result}" EQUAL 0)
            OVERRULE_ARCH(avx "CPU missing xgetbv")
        else()
            set(HAVE_XGETBV 1)
        endif()
    else()
        # cross compiling and compiler/linker seems to work; assume working
        set(HAVE_XGETBV 1)
    endif()
    file(REMOVE ${CMAKE_CURRENT_BINARY_DIR}/test_xgetbv
        ${CMAKE_CURRENT_BINARY_DIR}/test_xgetbv.c)

    #########################################################################
    # eliminate AVX if cvtpi32_ps intrinsic fails like some versions of clang
    #########################################################################
    # check to see if the compiler/linker works with cvtpi32_ps intrinsic when using AVX
    file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_cvtpi32_ps.c "#include <immintrin.h>\nint main (void) {__m128 __a; __m64 __b; __m128 foo = _mm_cvtpi32_ps(__a, __b); return (0); }")
    execute_process(COMMAND ${CMAKE_C_COMPILER} -mavx -o
        ${CMAKE_CURRENT_BINARY_DIR}/test_cvtpi32_ps
        ${CMAKE_CURRENT_BINARY_DIR}/test_cvtpi32_ps.c
        OUTPUT_QUIET ERROR_QUIET
        RESULT_VARIABLE avx_compile_result)
    if(NOT "${avx_compile_result}" EQUAL 0)
        OVERRULE_ARCH(avx "Compiler missing cvtpi32_ps instrinsic")
    elseif(NOT CROSSCOMPILE_MULTILIB)
        execute_process(COMMAND ${CMAKE_CURRENT_BINARY_DIR}/test_cvtpi32_ps
            OUTPUT_QUIET ERROR_QUIET
            RESULT_VARIABLE avx_exe_result)
        if(NOT "${avx_exe_result}" EQUAL 0)
            OVERRULE_ARCH(avx "CPU missing cvtpi32_ps")
        else()
            set(HAVE_AVX_CVTPI32_PS 1)
        endif()
    else()
        # cross compiling: the compile succeeded, assume the intrinsic works
        set(HAVE_AVX_CVTPI32_PS 1)
    endif()
    file(REMOVE ${CMAKE_CURRENT_BINARY_DIR}/test_cvtpi32_ps
        ${CMAKE_CURRENT_BINARY_DIR}/test_cvtpi32_ps.c)

    # Disable SSE4a if Clang is less than version 3.2
    if("${CMAKE_C_COMPILER_ID}" STREQUAL "Clang")
        # Figure out the version of Clang
        if(CMAKE_VERSION VERSION_LESS "2.8.10")
            # Extract the Clang version from the --version string.
            # In cmake 2.8.10, we can just use CMAKE_C_COMPILER_VERSION
            # without having to go through these string manipulations
            execute_process(COMMAND ${CMAKE_C_COMPILER} --version
                OUTPUT_VARIABLE clang_version)
            # Match "<major>.<minor>" with a literal dot and multi-digit
            # components, so that e.g. "10.0" is not read as "0.0".
            string(REGEX MATCH "[0-9]+\\.[0-9]+" CMAKE_C_COMPILER_VERSION "${clang_version}")
        endif()
        if(CMAKE_C_COMPILER_VERSION VERSION_LESS "3.2")
            OVERRULE_ARCH(sse4_a "Clang >= 3.2 required for SSE4a")
        endif()
    endif()
endif()

# Publish the results of the two probes as preprocessor definitions.
if(HAVE_XGETBV)
    add_definitions(-DHAVE_XGETBV)
endif()
if(HAVE_AVX_CVTPI32_PS)
    add_definitions(-DHAVE_AVX_CVTPI32_PS)
endif()
########################################################################
# if the CPU is not x86, eliminate all Intel SIMD
########################################################################
# On anything that was not detected as x86, drop every Intel SIMD
# architecture from the available list, one OVERRULE_ARCH call per entry.
if(NOT CPU_IS_x86)
    foreach(intel_simd_arch
            3dnow mmx sse sse2 sse3 ssse3 sse4_a sse4_1 sse4_2 avx)
        OVERRULE_ARCH(${intel_simd_arch} "Architecture is not x86 or x86_64")
    endforeach()
endif()

########################################################################
# implement overruling in the ORC case,
# since ORC always passes flag detection
########################################################################
if(NOT ORC_FOUND)
    OVERRULE_ARCH(orc "ORC support not found")
endif()
########################################################################
# implement overruling in the non-multilib case
# this makes things work when both -m32 and -m64 pass
########################################################################
if(NOT CROSSCOMPILE_MULTILIB AND CPU_IS_x86)
    include(CheckTypeSize)
    # sizeof(void*[8]) is eight pointers in bytes, i.e. the pointer width
    # expressed in bits.
    check_type_size("void*[8]" SIZEOF_CPU BUILTIN_TYPES_ONLY)
    # SIZEOF_CPU is quoted: if the size check fails it is empty, and an
    # unquoted expansion would leave a malformed if() expression.
    if("${SIZEOF_CPU}" EQUAL 64)
        OVERRULE_ARCH(32 "CPU width is 64 bits")
        # MSVC 64 bit does not have MMX, overrule it
        if(MSVC)
            OVERRULE_ARCH(mmx "No MMX for Win64")
        endif()
    elseif("${SIZEOF_CPU}" EQUAL 32)
        OVERRULE_ARCH(64 "CPU width is 32 bits")
    endif()
endif()
########################################################################
# done overrules! print the result
########################################################################
message(STATUS "Available architectures: ${available_archs}")

########################################################################
# determine available machines given the available architectures
########################################################################
# Run the generator helper in "machines" mode with the surviving
# architectures and capture its output in available_machines.
# The script is located relative to PROJECT_SOURCE_DIR rather than
# CMAKE_SOURCE_DIR so that it is still found when volk_gnsssdr is pulled into
# a larger build with add_subdirectory().
execute_process(
    COMMAND ${PYTHON_EXECUTABLE} ${PYTHON_DASH_B}
    ${PROJECT_SOURCE_DIR}/gen/volk_gnsssdr_compile_utils.py
    --mode "machines" --archs "${available_archs}"
    OUTPUT_VARIABLE available_machines OUTPUT_STRIP_TRAILING_WHITESPACE
)
########################################################################
# Implement machine overruling for redundant machines:
# A machine is redundant when expansion rules occur,
# and the arch superset passes configuration checks.
# When this occurs, eliminate the redundant machines
# to avoid unnecessary compilation of subset machines.
########################################################################
# For each of the expansion archs, look for machines whose name contains
# "_<arch>". When one exists, the same machine name without that suffix is
# removed from the list.
foreach(arch mmx orc 64 32)
    foreach(machine_name ${available_machines})
        string(REPLACE "_${arch}" "" machine_name_no_arch "${machine_name}")
        # Names differ only when machine_name actually contained "_<arch>".
        if(NOT "${machine_name}" STREQUAL "${machine_name_no_arch}")
            list(REMOVE_ITEM available_machines "${machine_name_no_arch}")
        endif()
    endforeach()
endforeach()

########################################################################
# done overrules! print the result
########################################################################
message(STATUS "Available machines: ${available_machines}")
########################################################################
# Rules that run the volk_gnsssdr generator
########################################################################
# Every generated file depends on all generator scripts, XML descriptions
# and kernel headers, plus its own template.
file(GLOB xml_files ${CMAKE_SOURCE_DIR}/gen/*.xml)
file(GLOB py_files ${CMAKE_SOURCE_DIR}/gen/*.py)
file(GLOB h_files ${CMAKE_SOURCE_DIR}/kernels/volk_gnsssdr/*.h)

# gen_template(<template> <output> [extra args...])
#   Adds the build rule that renders <output> from <template> with
#   volk_gnsssdr_tmpl_utils.py (extra arguments are forwarded to the script)
#   and records <output> in volk_gnsssdr_gen_sources.
#   This is a macro, so the list is updated in the caller's scope.
macro(gen_template tmpl output)
    add_custom_command(
        OUTPUT ${output}
        COMMAND ${PYTHON_EXECUTABLE} ${PYTHON_DASH_B}
            ${CMAKE_SOURCE_DIR}/gen/volk_gnsssdr_tmpl_utils.py
            --input ${tmpl} --output ${output} ${ARGN}
        DEPENDS ${xml_files} ${py_files} ${h_files} ${tmpl}
    )
    list(APPEND volk_gnsssdr_gen_sources ${output})
endmacro()

# The generated public headers need their directory to exist up front.
file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/include/volk_gnsssdr)

gen_template(
    ${CMAKE_SOURCE_DIR}/tmpl/volk_gnsssdr.tmpl.h
    ${CMAKE_BINARY_DIR}/include/volk_gnsssdr/volk_gnsssdr.h)
gen_template(
    ${CMAKE_SOURCE_DIR}/tmpl/volk_gnsssdr.tmpl.c
    ${CMAKE_BINARY_DIR}/lib/volk_gnsssdr.c)
gen_template(
    ${CMAKE_SOURCE_DIR}/tmpl/volk_gnsssdr_typedefs.tmpl.h
    ${CMAKE_BINARY_DIR}/include/volk_gnsssdr/volk_gnsssdr_typedefs.h)
gen_template(
    ${CMAKE_SOURCE_DIR}/tmpl/volk_gnsssdr_cpu.tmpl.h
    ${CMAKE_BINARY_DIR}/include/volk_gnsssdr/volk_gnsssdr_cpu.h)
gen_template(
    ${CMAKE_SOURCE_DIR}/tmpl/volk_gnsssdr_cpu.tmpl.c
    ${CMAKE_BINARY_DIR}/lib/volk_gnsssdr_cpu.c)
gen_template(
    ${CMAKE_SOURCE_DIR}/tmpl/volk_gnsssdr_config_fixed.tmpl.h
    ${CMAKE_BINARY_DIR}/include/volk_gnsssdr/volk_gnsssdr_config_fixed.h)
gen_template(
    ${CMAKE_SOURCE_DIR}/tmpl/volk_gnsssdr_machines.tmpl.h
    ${CMAKE_BINARY_DIR}/lib/volk_gnsssdr_machines.h)
gen_template(
    ${CMAKE_SOURCE_DIR}/tmpl/volk_gnsssdr_machines.tmpl.c
    ${CMAKE_BINARY_DIR}/lib/volk_gnsssdr_machines.c)
########################################################################
# Collect compiler identification and base flags.
# Produces:
#   CBTU                      - upper-cased CMAKE_BUILD_TYPE (may be empty)
#   cmake_c_compiler_version  - human readable compiler version
#   COMPILER_INFO             - "<compiler>:::<flags>" lines for C and C++
########################################################################
set(BASE_CFLAGS NONE)

# Quoted on purpose: CMAKE_BUILD_TYPE can be empty, and string(TOUPPER)
# fails when it receives no input argument.
string(TOUPPER "${CMAKE_BUILD_TYPE}" CBTU)
# Quoted so the words are not glued together by message().
message(STATUS "BUILT TYPE ${CBTU}")
message(STATUS "Base cflags = ${CMAKE_C_FLAGS_${CBTU}} ${CMAKE_C_FLAGS}")

set(COMPILER_INFO "")
if(MSVC)
    # One branch per supported Visual Studio release. These must be
    # elseif(): arguments to else() are not evaluated as conditions.
    if(MSVC90) #Visual Studio 9
        set(cmake_c_compiler_version "Microsoft Visual Studio 9.0")
    elseif(MSVC10) #Visual Studio 10
        set(cmake_c_compiler_version "Microsoft Visual Studio 10.0")
    elseif(MSVC11) #Visual Studio 11
        set(cmake_c_compiler_version "Microsoft Visual Studio 11.0")
    elseif(MSVC12) #Visual Studio 12
        set(cmake_c_compiler_version "Microsoft Visual Studio 12.0")
    endif()
else()
    execute_process(COMMAND ${CMAKE_C_COMPILER} --version
        OUTPUT_VARIABLE cmake_c_compiler_version)
endif()

# Use CBTU (computed above) for the per-configuration flags; the previous
# GRCBTU spelling is not defined in the visible part of this file and
# expanded to nothing.
set(COMPILER_INFO "${CMAKE_C_COMPILER}:::${CMAKE_C_FLAGS_${CBTU}} ${CMAKE_C_FLAGS}\n${CMAKE_CXX_COMPILER}:::${CMAKE_CXX_FLAGS_${CBTU}} ${CMAKE_CXX_FLAGS}\n" )
########################################################################
# Per-machine setup: generate the machine source, query its compile
# flags, record them in COMPILER_INFO and collect the LV_MACHINE_* defs.
########################################################################
foreach(machine_name ${available_machines})
    #generate machine source
    set(machine_source ${CMAKE_CURRENT_BINARY_DIR}/volk_gnsssdr_machine_${machine_name}.c)
    gen_template(${CMAKE_SOURCE_DIR}/tmpl/volk_gnsssdr_machine_xxx.tmpl.c ${machine_source} ${machine_name})

    #determine machine flags
    execute_process(
        COMMAND ${PYTHON_EXECUTABLE} ${PYTHON_DASH_B}
            ${CMAKE_SOURCE_DIR}/gen/volk_gnsssdr_compile_utils.py
            --mode "machine_flags" --machine "${machine_name}" --compiler "${COMPILER_NAME}"
        OUTPUT_VARIABLE ${machine_name}_flags OUTPUT_STRIP_TRAILING_WHITESPACE
    )
    message(STATUS "BUILD INFO ::: ${machine_name} ::: ${COMPILER_NAME} ::: ${CMAKE_C_FLAGS_${CBTU}} ${CMAKE_C_FLAGS} ${${machine_name}_flags}")
    set(COMPILER_INFO "${COMPILER_INFO}${machine_name}:::${COMPILER_NAME}:::${CMAKE_C_FLAGS_${CBTU}} ${CMAKE_C_FLAGS} ${${machine_name}_flags}\n" )

    # Only the machine's own source is compiled with the machine flags.
    if(${machine_name}_flags)
        set_source_files_properties(${machine_source} PROPERTIES COMPILE_FLAGS "${${machine_name}_flags}")
    endif()

    #add to available machine defs
    string(TOUPPER LV_MACHINE_${machine_name} machine_def)
    list(APPEND machine_defs ${machine_def})
endforeach()

# Convert to a C string to compile and display properly.
# The values are quoted so that an embedded ';' is kept as text instead of
# splitting the value into several arguments (which would break
# string(STRIP) and silently drop the ';' in string(REPLACE)).
string(STRIP "${cmake_c_compiler_version}" cmake_c_compiler_version)
string(STRIP "${COMPILER_INFO}" COMPILER_INFO)
message(STATUS "Compiler Version: ${cmake_c_compiler_version}")
string(REPLACE "\n" " \\n" cmake_c_compiler_version "${cmake_c_compiler_version}")
string(REPLACE "\n" " \\n" COMPILER_INFO "${COMPILER_INFO}")
########################################################################
# Local include directories come first
########################################################################
include_directories(
    ${CMAKE_BINARY_DIR}/include
    ${CMAKE_SOURCE_DIR}/include
    ${CMAKE_SOURCE_DIR}/kernels
    ${CMAKE_CURRENT_BINARY_DIR}
    ${CMAKE_CURRENT_SOURCE_DIR}
)

########################################################################
# ASM support
# Enabled whenever the CMake version allows it; NEON machines cannot be
# built without it.
########################################################################
if(${CMAKE_VERSION} VERSION_GREATER "2.8.9")
    set(ASM_ARCHS_AVAILABLE "armv7")
    set(FULL_C_FLAGS "${CMAKE_C_FLAGS}" "${CMAKE_CXX_COMPILER_ARG1}")

    # Walk the architectures we ship assembly for; when one of them shows
    # up in the compiler flags, set the assembler flags and add its files.
    foreach(ARCH ${ASM_ARCHS_AVAILABLE})
        string(REGEX MATCH "${ARCH}" ASM_ARCH "${FULL_C_FLAGS}")
        if(ASM_ARCH STREQUAL "armv7")
            message(STATUS "---- Adding ASM files") # we always use ATT syntax
            message(STATUS "-- Detected armv7 architecture; enabling ASM")
            # architecture specific assembler flags
            set(ARCH_ASM_FLAGS "-mfpu=neon -g")
            # the NEON assembly sources and their include directory
            include_directories(${CMAKE_SOURCE_DIR}/kernels/volk_gnsssdr/asm/neon)
            file(GLOB asm_files ${CMAKE_SOURCE_DIR}/kernels/volk_gnsssdr/asm/neon/*.s)
            foreach(asm_file ${asm_files})
                list(APPEND volk_gnsssdr_sources ${asm_file})
                message(STATUS "Adding source file: ${asm_file}")
            endforeach()
        endif()

        # The ASM language is enabled for every listed architecture,
        # whether or not it matched above.
        enable_language(ASM)
        set(CMAKE_ASM_FLAGS ${ARCH_ASM_FLAGS})
        message(STATUS "c flags: ${FULL_C_FLAGS}")
        message(STATUS "asm flags: ${CMAKE_ASM_FLAGS}")
    endforeach()
else()
    message(STATUS "Not enabling ASM support. CMake >= 2.8.10 required.")
    # A NEON machine needs the assembly sources, so refuse to continue.
    foreach(machine_name ${available_machines})
        string(REGEX MATCH "neon" NEON_MACHINE ${machine_name})
        if(NEON_MACHINE STREQUAL "neon")
            message(FATAL_ERROR "CMake >= 2.8.10 is required for ARM NEON support")
        endif()
    endforeach()
endif()
########################################################################
# ORC support
########################################################################
if(ORC_FOUND)
    # make the orc headers and library visible to the build
    include_directories(${ORC_INCLUDE_DIRS})
    link_directories(${ORC_LIBRARY_DIRS})
    list(APPEND volk_gnsssdr_libraries ${ORC_LIBRARIES})

    # one generated C source per .orc file
    file(GLOB orc_files ${CMAKE_SOURCE_DIR}/orc/*.orc)
    foreach(orc_file ${orc_files})
        # the generated source is named after the .orc file
        get_filename_component(orc_file_name_we ${orc_file} NAME_WE)
        set(orcc_gen ${CMAKE_CURRENT_BINARY_DIR}/${orc_file_name_we}.c)

        # rule that runs orcc, then register the result as a library source
        add_custom_command(
            OUTPUT ${orcc_gen}
            COMMAND ${ORCC_EXECUTABLE} --include math.h --implementation -o ${orcc_gen} ${orc_file}
            DEPENDS ${orc_file}
        )
        list(APPEND volk_gnsssdr_sources ${orcc_gen})
    endforeach()
else()
    message(STATUS "Did not find liborc and orcc, disabling orc support...")
endif()
########################################################################
# Handle the generated constants
# constants.c.in is configured with BUILD_DATE, VERSION, prefix,
# cmake_c_compiler_version, COMPILER_INFO and available_machines.
########################################################################
# print(...) with a single argument behaves the same under Python 2 and
# Python 3; the bare "print x" statement is a syntax error in Python 3
# and would leave BUILD_DATE empty.
execute_process(COMMAND ${PYTHON_EXECUTABLE} -c
    "import time;print(time.strftime('%a, %d %b %Y %H:%M:%S', time.gmtime()))"
    OUTPUT_VARIABLE BUILD_DATE OUTPUT_STRIP_TRAILING_WHITESPACE
)
message(STATUS "Loading build date ${BUILD_DATE} into constants...")
message(STATUS "Loading version ${VERSION} into constants...")

#double escape for windows backslash path separators
# (quoted so a path containing ';' is treated as one value)
string(REPLACE "\\" "\\\\" prefix "${prefix}")

configure_file(
    ${CMAKE_CURRENT_SOURCE_DIR}/constants.c.in
    ${CMAKE_CURRENT_BINARY_DIR}/constants.c
    @ONLY)
list(APPEND volk_gnsssdr_sources ${CMAKE_CURRENT_BINARY_DIR}/constants.c)
########################################################################
# Sources list and library targets for volk_gnsssdr
########################################################################
# Hide symbols by default everywhere except Windows.
if(NOT WIN32)
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=hidden")
    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fvisibility=hidden")
endif()

# Hand-written sources plus everything registered through gen_template().
list(APPEND volk_gnsssdr_sources
    ${CMAKE_CURRENT_SOURCE_DIR}/volk_gnsssdr_prefs.c
    ${CMAKE_CURRENT_SOURCE_DIR}/volk_gnsssdr_rank_archs.c
    ${CMAKE_CURRENT_SOURCE_DIR}/volk_gnsssdr_malloc.c
    ${volk_gnsssdr_gen_sources}
)

# The dispatcher sources are compiled with the LV_MACHINE_* definitions.
set_source_files_properties(
    ${CMAKE_CURRENT_BINARY_DIR}/volk_gnsssdr.c
    ${CMAKE_CURRENT_BINARY_DIR}/volk_gnsssdr_machines.c
    PROPERTIES COMPILE_DEFINITIONS "${machine_defs}"
)

if(MSVC)
    # compatibility includes for stdint types
    include_directories(${CMAKE_SOURCE_DIR}/cmake/msvc)
    add_definitions(-DHAVE_CONFIG_H)
    # MSVC lacks complex.h, so the sources are built as C++
    set_source_files_properties(${volk_gnsssdr_sources} PROPERTIES LANGUAGE CXX)
endif()

# Runtime (shared) library.
#MODIFICATIONS BY GNSS-SDR
# Kernel headers, common macros and .orc files are attached to the target
# and sorted into source groups.
file(GLOB orc ${CMAKE_SOURCE_DIR}/orc/*.orc)
file(GLOB CommonMacros
    ${CMAKE_SOURCE_DIR}/kernels/CommonMacros/*.h
    ${CMAKE_SOURCE_DIR}/kernels/CommonMacros/README.txt
)
add_library(volk_gnsssdr SHARED
    ${volk_gnsssdr_sources}
    ${h_files}
    ${CommonMacros}
    ${orc}
)
source_group("Kernels" FILES ${h_files})
source_group("Common Macros" FILES ${CommonMacros})
source_group("ORC Files" FILES ${orc})
#END OF MODIFICATIONS

target_link_libraries(volk_gnsssdr ${volk_gnsssdr_libraries})
set_target_properties(volk_gnsssdr PROPERTIES SOVERSION ${LIBVER})
set_target_properties(volk_gnsssdr PROPERTIES DEFINE_SYMBOL "volk_gnsssdr_EXPORTS")

install(TARGETS volk_gnsssdr
    LIBRARY DESTINATION lib${LIB_SUFFIX} COMPONENT "volk_gnsssdr_runtime" # .so file
    ARCHIVE DESTINATION lib${LIB_SUFFIX} COMPONENT "volk_gnsssdr_devel" # .lib file
    RUNTIME DESTINATION bin COMPONENT "volk_gnsssdr_runtime" # .dll file
)

# Optional static flavour of the same sources.
if(ENABLE_STATIC_LIBS)
    add_library(volk_gnsssdr_static STATIC ${volk_gnsssdr_sources})
    # Outside Windows the static archive shares the shared library's base name.
    if(NOT WIN32)
        set_target_properties(volk_gnsssdr_static
            PROPERTIES OUTPUT_NAME volk_gnsssdr)
    endif()
    install(TARGETS volk_gnsssdr_static
        ARCHIVE DESTINATION lib${LIB_SUFFIX} COMPONENT "volk_gnsssdr_devel" # .lib file
    )
endif()
########################################################################
# QA test application (only when Boost is available)
########################################################################
if(Boost_FOUND)
    # testqa.cc provides the Boost.Test main and links it dynamically.
    set_source_files_properties(
        ${CMAKE_CURRENT_SOURCE_DIR}/testqa.cc
        PROPERTIES COMPILE_DEFINITIONS "BOOST_TEST_DYN_LINK;BOOST_TEST_MAIN"
    )

    include_directories(${Boost_INCLUDE_DIRS})
    link_directories(${Boost_LIBRARY_DIRS})

    add_executable(test_all
        ${CMAKE_CURRENT_SOURCE_DIR}/testqa.cc
        ${CMAKE_CURRENT_SOURCE_DIR}/qa_utils.cc
    )
    target_link_libraries(test_all volk_gnsssdr ${Boost_LIBRARIES})

    add_test(qa_volk_gnsssdr_test_all test_all)
endif()

View File

@ -0,0 +1,63 @@
/* -*- c++ -*- */
/*
* Copyright 2013 Free Software Foundation, Inc.
*
* This file is part of GNU Radio
*
* GNU Radio is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3, or (at your option)
* any later version.
*
* GNU Radio is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with GNU Radio; see the file COPYING. If not, write to
* the Free Software Foundation, Inc., 51 Franklin Street,
* Boston, MA 02110-1301, USA.
*/
#if HAVE_CONFIG_H
#include <config.h>
#endif
#include <volk_gnsssdr/constants.h>
/*
 * Build-time constants.
 * Each accessor returns a string literal whose @...@ placeholder is
 * substituted when this template is configured.
 */

/* Value substituted for the "prefix" placeholder. */
char* volk_gnsssdr_prefix()
{
    return "@prefix@";
}

/* Value substituted for the "BUILD_DATE" placeholder. */
char* volk_gnsssdr_build_date()
{
    return "@BUILD_DATE@";
}

/* Value substituted for the "VERSION" placeholder. */
char* volk_gnsssdr_version()
{
    return "@VERSION@";
}

/* Value substituted for the "cmake_c_compiler_version" placeholder. */
char* volk_gnsssdr_c_compiler()
{
    return "@cmake_c_compiler_version@";
}

/* Value substituted for the "COMPILER_INFO" placeholder. */
char* volk_gnsssdr_compiler_flags()
{
    return "@COMPILER_INFO@";
}

/* Value substituted for the "available_machines" placeholder. */
char* volk_gnsssdr_available_machines()
{
    return "@available_machines@";
}

View File

@ -0,0 +1,188 @@
/*
* Copyright (C) 2007, 2008, 2009, 2010 Free Software Foundation, Inc.
*
* This file is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the
* Free Software Foundation; either version 3, or (at your option) any
* later version.
*
* This file is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* Under Section 7 of GPL version 3, you are granted additional
* permissions described in the GCC Runtime Library Exception, version
* 3.1, as published by the Free Software Foundation.
*
* You should have received a copy of the GNU General Public License and
* a copy of the GCC Runtime Library Exception along with this program;
* see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
* <http://www.gnu.org/licenses/>.
*/
/* Feature-flag bit masks for the values returned by cpuid.
   The masks are grouped by the register they are meant to be tested
   against, as named in the section comments below; several groups reuse
   the same bit position for different features.
   NOTE(review): the cpuid level each group belongs to is only stated for
   the last group (%eax == 7) -- confirm the others against the CPU vendor
   documentation before relying on them. */
/* %ecx */
#define bit_SSE3 (1 << 0)
#define bit_PCLMUL (1 << 1)
#define bit_SSSE3 (1 << 9)
#define bit_FMA (1 << 12)
#define bit_CMPXCHG16B (1 << 13)
#define bit_SSE4_1 (1 << 19)
#define bit_SSE4_2 (1 << 20)
#define bit_MOVBE (1 << 22)
#define bit_POPCNT (1 << 23)
#define bit_AES (1 << 25)
#define bit_XSAVE (1 << 26)
#define bit_OSXSAVE (1 << 27)
#define bit_AVX (1 << 28)
#define bit_F16C (1 << 29)
#define bit_RDRND (1 << 30)
/* %edx */
#define bit_CMPXCHG8B (1 << 8)
#define bit_CMOV (1 << 15)
#define bit_MMX (1 << 23)
#define bit_FXSAVE (1 << 24)
#define bit_SSE (1 << 25)
#define bit_SSE2 (1 << 26)
/* Extended Features */
/* %ecx */
#define bit_LAHF_LM (1 << 0)
#define bit_ABM (1 << 5)
#define bit_SSE4a (1 << 6)
#define bit_XOP (1 << 11)
#define bit_LWP (1 << 15)
#define bit_FMA4 (1 << 16)
#define bit_TBM (1 << 21)
/* %edx */
#define bit_MMXEXT (1 << 22)
#define bit_LM (1 << 29)
#define bit_3DNOWP (1 << 30)
#define bit_3DNOW (1 << 31)
/* Extended Features (%eax == 7) */
#define bit_FSGSBASE (1 << 0)
#define bit_BMI (1 << 3)
/* __cpuid(level, a, b, c, d)
     Executes cpuid with `level` loaded into eax and stores the resulting
     eax, ebx, ecx and edx into the lvalues a, b, c and d.
   __cpuid_count(level, count, a, b, c, d)
     Same, but additionally loads `count` into ecx before the instruction.

   Three variants are provided:
     - i386 PIC code with GCC >= 3: ebx is not listed as an output; it is
       exchanged with the register chosen for `b` before and after cpuid,
       so its original value is restored.
     - i386 PIC code with older GCC: same idea, written in plain AT&T
       syntax without the {...} dialect alternatives.
     - everything else: ebx is used directly as an output operand. */
#if defined(__i386__) && defined(__PIC__)
/* %ebx may be the PIC register. */
#if __GNUC__ >= 3
#define __cpuid(level, a, b, c, d) \
__asm__ ("xchg{l}\t{%%}ebx, %1\n\t" \
"cpuid\n\t" \
"xchg{l}\t{%%}ebx, %1\n\t" \
: "=a" (a), "=r" (b), "=c" (c), "=d" (d) \
: "0" (level))
#define __cpuid_count(level, count, a, b, c, d) \
__asm__ ("xchg{l}\t{%%}ebx, %1\n\t" \
"cpuid\n\t" \
"xchg{l}\t{%%}ebx, %1\n\t" \
: "=a" (a), "=r" (b), "=c" (c), "=d" (d) \
: "0" (level), "2" (count))
#else
/* Host GCCs older than 3.0 weren't supporting Intel asm syntax
nor alternatives in i386 code. */
#define __cpuid(level, a, b, c, d) \
__asm__ ("xchgl\t%%ebx, %1\n\t" \
"cpuid\n\t" \
"xchgl\t%%ebx, %1\n\t" \
: "=a" (a), "=r" (b), "=c" (c), "=d" (d) \
: "0" (level))
#define __cpuid_count(level, count, a, b, c, d) \
__asm__ ("xchgl\t%%ebx, %1\n\t" \
"cpuid\n\t" \
"xchgl\t%%ebx, %1\n\t" \
: "=a" (a), "=r" (b), "=c" (c), "=d" (d) \
: "0" (level), "2" (count))
#endif
#else
/* ebx is free to use here, so it is a regular output operand. */
#define __cpuid(level, a, b, c, d) \
__asm__ ("cpuid\n\t" \
: "=a" (a), "=b" (b), "=c" (c), "=d" (d) \
: "0" (level))
#define __cpuid_count(level, count, a, b, c, d) \
__asm__ ("cpuid\n\t" \
: "=a" (a), "=b" (b), "=c" (c), "=d" (d) \
: "0" (level), "2" (count))
#endif
/* Return highest supported input value for cpuid instruction. ext can
be either 0x0 or 0x80000000 to return highest supported value for
basic or extended cpuid information. Function returns 0 if cpuid
is not supported or whatever cpuid returns in eax register. If sig
pointer is non-null, then first four bytes of the signature
(as found in ebx register) are returned in location pointed by sig. */
static __inline unsigned int
__get_cpuid_max (unsigned int __ext, unsigned int *__sig)
{
unsigned int __eax, __ebx, __ecx, __edx;
#ifndef __x86_64__
/* See if we can use cpuid. On AMD64 we always can. */
/* Probe: save the flags, flip bit 0x00200000 in a copy of them, write the
   copy back, read the flags again and finally restore the saved flags.
   Afterwards __eax holds the re-read flags and __ebx the original ones. */
#if __GNUC__ >= 3
__asm__ ("pushf{l|d}\n\t"
"pushf{l|d}\n\t"
"pop{l}\t%0\n\t"
"mov{l}\t{%0, %1|%1, %0}\n\t"
"xor{l}\t{%2, %0|%0, %2}\n\t"
"push{l}\t%0\n\t"
"popf{l|d}\n\t"
"pushf{l|d}\n\t"
"pop{l}\t%0\n\t"
"popf{l|d}\n\t"
: "=&r" (__eax), "=&r" (__ebx)
: "i" (0x00200000));
#else
/* Host GCCs older than 3.0 weren't supporting Intel asm syntax
nor alternatives in i386 code. */
__asm__ ("pushfl\n\t"
"pushfl\n\t"
"popl\t%0\n\t"
"movl\t%0, %1\n\t"
"xorl\t%2, %0\n\t"
"pushl\t%0\n\t"
"popfl\n\t"
"pushfl\n\t"
"popl\t%0\n\t"
"popfl\n\t"
: "=&r" (__eax), "=&r" (__ebx)
: "i" (0x00200000));
#endif
/* If the flipped bit did not stick, cpuid is treated as unavailable. */
if (!((__eax ^ __ebx) & 0x00200000))
return 0;
#endif
/* Host supports cpuid. Return highest supported cpuid input value. */
__cpuid (__ext, __eax, __ebx, __ecx, __edx);
/* The signature is optional; callers may pass a null pointer. */
if (__sig)
*__sig = __ebx;
return __eax;
}
/* Return cpuid data for requested cpuid level, as found in returned
   eax, ebx, ecx and edx registers.  The function checks if cpuid is
   supported and returns 1 for valid cpuid information or 0 for
   unsupported cpuid level.  All pointers are required to be non-null.

   __level  cpuid level to query; its top bit (0x80000000) selects the
            extended range when asking for the highest supported level.
   __eax, __ebx, __ecx, __edx
            receive the register values; left untouched when 0 is
            returned.  */
static __inline int
__get_cpuid (unsigned int __level,
             unsigned int *__eax, unsigned int *__ebx,
             unsigned int *__ecx, unsigned int *__edx)
{
  unsigned int __ext = __level & 0x80000000;
  unsigned int __maxlevel = __get_cpuid_max (__ext, 0);

  /* __get_cpuid_max returns 0 when cpuid is not supported.  That case has
     to be rejected explicitly: for __level == 0 the plain "max < level"
     comparison is false and cpuid would be executed anyway.  */
  if (__maxlevel == 0 || __maxlevel < __level)
    return 0;

  __cpuid (__level, *__eax, *__ebx, *__ecx, *__edx);
  return 1;
}

View File

@ -0,0 +1,89 @@
#include <volk_gnsssdr/volk_gnsssdr.h>
#include <qa_16s_add_quad_aligned16.h>
#include <volk_gnsssdr/volk_gnsssdr_16s_add_quad_aligned16.h>
#include <cstdlib>
#include <ctime>
//test for sse2
#ifndef LV_HAVE_SSE2
// Built without SSE2: there is no second implementation to compare with.
void qa_16s_add_quad_aligned16::t1() {
  printf("sse2 not available... no test performed\n");
}
#else
// Runs the "generic" and the "sse2" variants of the 16s_add_quad kernel on
// the same random inputs, prints how long ITERS calls of each take, and
// asserts that all four output vectors are identical.
void qa_16s_add_quad_aligned16::t1() {
  volk_gnsssdr_environment_init();

  const int vlen = 3200;
  const int ITERS = 100000;

  __VOLK_ATTR_ALIGNED(16) short input0[vlen];
  __VOLK_ATTR_ALIGNED(16) short input1[vlen];
  __VOLK_ATTR_ALIGNED(16) short input2[vlen];
  __VOLK_ATTR_ALIGNED(16) short input3[vlen];
  __VOLK_ATTR_ALIGNED(16) short input4[vlen];

  // results of the generic implementation
  __VOLK_ATTR_ALIGNED(16) short ref0[vlen];
  __VOLK_ATTR_ALIGNED(16) short ref1[vlen];
  __VOLK_ATTR_ALIGNED(16) short ref2[vlen];
  __VOLK_ATTR_ALIGNED(16) short ref3[vlen];

  // results of the sse2 implementation
  __VOLK_ATTR_ALIGNED(16) short simd0[vlen];
  __VOLK_ATTR_ALIGNED(16) short simd1[vlen];
  __VOLK_ATTR_ALIGNED(16) short simd2[vlen];
  __VOLK_ATTR_ALIGNED(16) short simd3[vlen];

  // Each sample is the difference of two random shorts that were shifted
  // right by two. The draws happen in the order input0..input4 per index.
  short* const inputs[5] = {input0, input1, input2, input3, input4};
  for(int i = 0; i < vlen; ++i) {
    for(int k = 0; k < 5; ++k) {
      const short plus = ((short) (rand() - (RAND_MAX/2))) >> 2;
      const short minus = ((short) (rand() - (RAND_MAX/2))) >> 2;
      inputs[k][i] = plus - minus;
    }
  }

  printf("16s_add_quad_aligned\n");

  // generic implementation
  clock_t start = clock();
  for(int count = 0; count < ITERS; ++count) {
    volk_gnsssdr_16s_add_quad_aligned16_manual(ref0, ref1, ref2, ref3, input0, input1, input2, input3, input4, vlen << 1 , "generic");
  }
  clock_t end = clock();
  double total = (double)(end-start)/(double)CLOCKS_PER_SEC;
  printf("generic_time: %f\n", total);

  // sse2 implementation
  start = clock();
  for(int count = 0; count < ITERS; ++count) {
    volk_gnsssdr_16s_add_quad_aligned16_manual(simd0, simd1, simd2, simd3, input0, input1, input2, input3, input4, vlen << 1 , "sse2");
  }
  end = clock();
  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
  printf("sse2_time: %f\n", total);

  // both implementations must agree element by element
  for(int i = 0; i < vlen; ++i) {
    CPPUNIT_ASSERT_EQUAL(ref0[i], simd0[i]);
    CPPUNIT_ASSERT_EQUAL(ref1[i], simd1[i]);
    CPPUNIT_ASSERT_EQUAL(ref2[i], simd2[i]);
    CPPUNIT_ASSERT_EQUAL(ref3[i], simd3[i]);
  }
}
#endif

View File

@ -0,0 +1,18 @@
#ifndef INCLUDED_QA_16S_ADD_QUAD_ALIGNED16_H
#define INCLUDED_QA_16S_ADD_QUAD_ALIGNED16_H
#include <cppunit/extensions/HelperMacros.h>
#include <cppunit/TestCase.h>
// CppUnit test case for the 16s_add_quad_aligned16 kernel.
// The suite macros below register t1 as its only test.
class qa_16s_add_quad_aligned16 : public CppUnit::TestCase {
CPPUNIT_TEST_SUITE (qa_16s_add_quad_aligned16);
CPPUNIT_TEST (t1);
CPPUNIT_TEST_SUITE_END ();
private:
// Test body; declared here and defined out of line.
void t1 ();
};
#endif /* INCLUDED_QA_16S_ADD_QUAD_ALIGNED16_H */

View File

@ -0,0 +1,106 @@
#include <volk_gnsssdr/volk_gnsssdr.h>
#include <qa_16s_branch_4_state_8_aligned16.h>
#include <cstdlib>
#include <ctime>
//test for ssse3
#ifndef LV_HAVE_SSSE3
// Built without SSSE3: there is no second implementation to compare with.
void qa_16s_branch_4_state_8_aligned16::t1() {
  printf("ssse3 not available... no test performed\n");
}
#else
// Times three kernels on the same 32-sample input and checks that they
// produce the same result:
//   target  - 16s_permute_and_scalar_add ("sse2"), used as the reference
//   target2 - 16s_branch_4_state_8 ("ssse3")
//   target3 - 16s_branch_4_state_8 ("generic")
void qa_16s_branch_4_state_8_aligned16::t1() {
  const int num_iters = 1000000;
  const int vlen = 32;

  // Byte-shuffle tables handed to the branch_4_state_8 kernel.
  static char permute0[16]__attribute__((aligned(16))) = {0x0e, 0x0f, 0x0a, 0x0b, 0x04, 0x05, 0x00, 0x01, 0x0c, 0x0d, 0x08, 0x09, 0x06, 0x07, 0x02, 0x03};
  static char permute1[16]__attribute__((aligned(16))) = {0x0c, 0x0d, 0x08, 0x09, 0x06, 0x07, 0x02, 0x03, 0x0e, 0x0f, 0x0a, 0x0b, 0x04, 0x05, 0x00, 0x01};
  static char permute2[16]__attribute__((aligned(16))) = {0x02, 0x03, 0x06, 0x07, 0x08, 0x09, 0x0c, 0x0d, 0x00, 0x01, 0x04, 0x05, 0x0a, 0x0b, 0x0e, 0x0f};
  static char permute3[16]__attribute__((aligned(16))) = {0x00, 0x01, 0x04, 0x05, 0x0a, 0x0b, 0x0e, 0x0f, 0x02, 0x03, 0x06, 0x07, 0x08, 0x09, 0x0c, 0x0d};
  static char* permuters[4] = {permute0, permute1, permute2, permute3};

  // The permute_and_scalar_add kernel takes its length in bytes.
  unsigned int num_bytes = vlen << 1;

  volk_gnsssdr_environment_init();
  clock_t start, end;
  double total;

  __VOLK_ATTR_ALIGNED(16) short target[vlen];
  __VOLK_ATTR_ALIGNED(16) short target2[vlen];
  __VOLK_ATTR_ALIGNED(16) short target3[vlen];
  __VOLK_ATTR_ALIGNED(16) short src0[vlen];

  // Index and control-mask vectors for the permute_and_scalar_add reference.
  __VOLK_ATTR_ALIGNED(16) short permute_indexes[vlen] = {
  7, 5, 2, 0, 6, 4, 3, 1, 6, 4, 3, 1, 7, 5, 2, 0, 1, 3, 4, 6, 0, 2, 5, 7, 0, 2, 5, 7, 1, 3, 4, 6 };
  __VOLK_ATTR_ALIGNED(16) short cntl0[vlen] = {
  0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 };
  __VOLK_ATTR_ALIGNED(16) short cntl1[vlen] = {
  0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 };
  __VOLK_ATTR_ALIGNED(16) short cntl2[vlen] = {
  0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000 };
  __VOLK_ATTR_ALIGNED(16) short cntl3[vlen] = {
  0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, 0xffff, 0xffff, 0x0000, 0x0000, 0xffff, 0xffff };
  __VOLK_ATTR_ALIGNED(16) short scalars[4] = {1, 2, 3, 4};

  // The source vector is simply 0, 1, 2, ...
  for(int i = 0; i < vlen; ++i) {
    src0[i] = i;
  }

  printf("16s_branch_4_state_8_aligned\n");

  // reference: permute_and_scalar_add
  start = clock();
  for(int i = 0; i < num_iters; ++i) {
    volk_gnsssdr_16s_permute_and_scalar_add_aligned16_manual(target, src0, permute_indexes, cntl0, cntl1, cntl2, cntl3, scalars, num_bytes, "sse2");
  }
  end = clock();
  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
  printf("permute_and_scalar_add_time: %f\n", total);

  // branch_4_state_8, ssse3
  start = clock();
  for(int i = 0; i < num_iters; ++i) {
    volk_gnsssdr_16s_branch_4_state_8_aligned16_manual(target2, src0, permuters, cntl2, cntl3, scalars, "ssse3");
  }
  end = clock();
  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
  printf("branch_4_state_8_time, ssse3: %f\n", total);

  // branch_4_state_8, generic
  start = clock();
  for(int i = 0; i < num_iters; ++i) {
    volk_gnsssdr_16s_branch_4_state_8_aligned16_manual(target3, src0, permuters, cntl2, cntl3, scalars, "generic");
  }
  end = clock();
  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
  // This interval timed the branch_4_state_8 kernel, so label it that way
  // (it used to be reported as a permute_and_scalar_add time).
  printf("branch_4_state_8_time, generic: %f\n", total);

  for(int i = 0; i < vlen; ++i) {
    printf("psa... %d, b4s8... %d\n", target[i], target3[i]);
  }

  // Both branch_4_state_8 variants must match the reference.
  for(int i = 0; i < vlen; ++i) {
    CPPUNIT_ASSERT(target[i] == target2[i]);
    CPPUNIT_ASSERT(target[i] == target3[i]);
  }
}
#endif

View File

@ -0,0 +1,18 @@
#ifndef INCLUDED_QA_16S_BRANCH_4_STATE_8_ALIGNED16_H
#define INCLUDED_QA_16S_BRANCH_4_STATE_8_ALIGNED16_H
#include <cppunit/extensions/HelperMacros.h>
#include <cppunit/TestCase.h>
// CppUnit test case for the 16s_branch_4_state_8_aligned16 kernel.
// The suite macros below register t1 as its only test.
class qa_16s_branch_4_state_8_aligned16 : public CppUnit::TestCase {
CPPUNIT_TEST_SUITE (qa_16s_branch_4_state_8_aligned16);
CPPUNIT_TEST (t1);
CPPUNIT_TEST_SUITE_END ();
private:
// Test body; declared here and defined out of line.
void t1 ();
};
#endif /* INCLUDED_QA_16S_BRANCH_4_STATE_8_ALIGNED16_H */

View File

@ -0,0 +1,78 @@
#include <volk_gnsssdr/volk_gnsssdr.h>
#include <qa_16s_permute_and_scalar_add_aligned16.h>
#include <volk_gnsssdr/volk_gnsssdr_16s_permute_and_scalar_add_aligned16.h>
#include <cstdlib>
#include <ctime>
//test for sse2
#ifndef LV_HAVE_SSE2
// Built without SSE2: there is no second implementation to compare with.
void qa_16s_permute_and_scalar_add_aligned16::t1() {
  printf("sse2 not available... no test performed\n");
}
#else
// Runs the "generic" and "sse2" variants of the permute_and_scalar_add
// kernel on identical inputs, prints the time taken by 100000 calls of
// each, and asserts that both outputs match.
void qa_16s_permute_and_scalar_add_aligned16::t1() {
  const int vlen = 64;
  // the kernel takes its length in bytes
  unsigned int num_bytes = vlen << 1;

  volk_gnsssdr_environment_init();

  __VOLK_ATTR_ALIGNED(16) short out_generic[vlen];
  __VOLK_ATTR_ALIGNED(16) short out_sse2[vlen];
  __VOLK_ATTR_ALIGNED(16) short src0[vlen];
  __VOLK_ATTR_ALIGNED(16) short permute_indexes[vlen];
  __VOLK_ATTR_ALIGNED(16) short cntl0[vlen];
  __VOLK_ATTR_ALIGNED(16) short cntl1[vlen];
  __VOLK_ATTR_ALIGNED(16) short cntl2[vlen];
  __VOLK_ATTR_ALIGNED(16) short cntl3[vlen];
  __VOLK_ATTR_ALIGNED(16) short scalars[4] = {1, 2, 3, 4};

  // Deterministic inputs: a ramp, a stride-3 permutation, and four masks
  // that are 0xff always / on odd i / when bit 1 of i is set / when i%4==3.
  for(int idx = 0; idx < vlen; ++idx) {
    src0[idx] = idx;
    permute_indexes[idx] = (3 * idx)%vlen;
    cntl0[idx] = 0xff;
    cntl1[idx] = (idx%2) ? 0xff : 0;
    cntl2[idx] = ((idx>>1)%2) ? 0xff : 0;
    cntl3[idx] = ((idx%4) == 3) ? 0xff : 0;
  }

  printf("16s_permute_and_scalar_add_aligned\n");

  // generic implementation
  clock_t start = clock();
  for(int rep = 0; rep < 100000; ++rep) {
    volk_gnsssdr_16s_permute_and_scalar_add_aligned16_manual(out_generic, src0, permute_indexes, cntl0, cntl1, cntl2, cntl3, scalars, num_bytes, "generic");
  }
  clock_t end = clock();
  double total = (double)(end-start)/(double)CLOCKS_PER_SEC;
  printf("generic_time: %f\n", total);

  // sse2 implementation
  start = clock();
  for(int rep = 0; rep < 100000; ++rep) {
    volk_gnsssdr_16s_permute_and_scalar_add_aligned16_manual(out_sse2, src0, permute_indexes, cntl0, cntl1, cntl2, cntl3, scalars, num_bytes, "sse2");
  }
  end = clock();
  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
  printf("sse2_time: %f\n", total);

  // both implementations must agree element by element
  for(int idx = 0; idx < vlen; ++idx) {
    CPPUNIT_ASSERT(out_generic[idx] == out_sse2[idx]);
  }
}
#endif

View File

@ -0,0 +1,18 @@
#ifndef INCLUDED_QA_16S_PERMUTE_AND_SCALAR_ADD_ALIGNED16_H
#define INCLUDED_QA_16S_PERMUTE_AND_SCALAR_ADD_ALIGNED16_H
#include <cppunit/extensions/HelperMacros.h>
#include <cppunit/TestCase.h>
// CppUnit test case for the 16s_permute_and_scalar_add_aligned16 kernel.
// The suite macros below register t1 as its only test.
class qa_16s_permute_and_scalar_add_aligned16 : public CppUnit::TestCase {
CPPUNIT_TEST_SUITE (qa_16s_permute_and_scalar_add_aligned16);
CPPUNIT_TEST (t1);
CPPUNIT_TEST_SUITE_END ();
private:
// Test body; declared here and defined out of line.
void t1 ();
};
#endif /* INCLUDED_QA_16S_PERMUTE_AND_SCALAR_ADD_ALIGNED16_H */

View File

@ -0,0 +1,60 @@
#include <volk_gnsssdr/volk_gnsssdr.h>
#include <qa_16s_quad_max_star_aligned16.h>
#include <volk_gnsssdr/volk_gnsssdr_16s_quad_max_star_aligned16.h>
#include <cstdlib>
#include <ctime>
//test for sse2
#ifndef LV_HAVE_SSE2
// Built without SSE2: there is no second implementation to compare with.
void qa_16s_quad_max_star_aligned16::t1() {
  printf("sse2 not available... no test performed\n");
}
#else
// Feeds the same random inputs to the "generic" and "sse2" variants of the
// quad_max_star kernel, prints every result next to its inputs and asserts
// that both outputs are equal.
void qa_16s_quad_max_star_aligned16::t1() {
  const int vlen = 34;

  __VOLK_ATTR_ALIGNED(16) short input0[vlen];
  __VOLK_ATTR_ALIGNED(16) short input1[vlen];
  __VOLK_ATTR_ALIGNED(16) short input2[vlen];
  __VOLK_ATTR_ALIGNED(16) short input3[vlen];

  __VOLK_ATTR_ALIGNED(16) short out_generic[vlen];
  __VOLK_ATTR_ALIGNED(16) short out_sse2[vlen];

  // Each sample is the difference of two random shorts. Per index the four
  // "plus" terms are drawn first, then the four "minus" terms.
  short* const inputs[4] = {input0, input1, input2, input3};
  for(int i = 0; i < vlen; ++i) {
    short plus[4];
    short minus[4];
    for(int k = 0; k < 4; ++k) {
      plus[k] = (short) (rand() - (RAND_MAX/2));
    }
    for(int k = 0; k < 4; ++k) {
      minus[k] = (short) (rand() - (RAND_MAX/2));
    }
    for(int k = 0; k < 4; ++k) {
      inputs[k][i] = plus[k] - minus[k];
    }
  }

  // the kernel takes its length in bytes
  volk_gnsssdr_16s_quad_max_star_aligned16_manual(out_generic, input0, input1, input2, input3, 2*vlen, "generic");
  volk_gnsssdr_16s_quad_max_star_aligned16_manual(out_sse2, input0, input1, input2, input3, 2*vlen, "sse2");

  printf("16s_quad_max_star_aligned\n");
  for(int i = 0; i < vlen; ++i) {
    printf("generic... %d, sse2... %d, inputs: %d, %d, %d, %d\n", out_generic[i], out_sse2[i], input0[i], input1[i], input2[i], input3[i]);
  }

  // both implementations must agree element by element
  for(int i = 0; i < vlen; ++i) {
    CPPUNIT_ASSERT_EQUAL(out_generic[i], out_sse2[i]);
  }
}
#endif

View File

@ -0,0 +1,18 @@
#ifndef INCLUDED_QA_16S_QUAD_MAX_STAR_ALIGNED16_H
#define INCLUDED_QA_16S_QUAD_MAX_STAR_ALIGNED16_H
#include <cppunit/extensions/HelperMacros.h>
#include <cppunit/TestCase.h>
// CppUnit test case for the 16s_quad_max_star_aligned16 kernel.
// The suite macros below register t1 as its only test.
class qa_16s_quad_max_star_aligned16 : public CppUnit::TestCase {
CPPUNIT_TEST_SUITE (qa_16s_quad_max_star_aligned16);
CPPUNIT_TEST (t1);
CPPUNIT_TEST_SUITE_END ();
private:
// Test body; declared here and defined out of line.
void t1 ();
};
#endif /* INCLUDED_QA_16S_QUAD_MAX_STAR_ALIGNED16_H */

View File

@ -0,0 +1,61 @@
#include <volk_gnsssdr/volk_gnsssdr.h>
#include <qa_32f_fm_detect_aligned16.h>
#include <volk_gnsssdr/volk_gnsssdr_32f_fm_detect_aligned16.h>
#include <cstdlib>
#include <ctime>
//test for sse
#ifndef LV_HAVE_SSE
// Built without SSE: there is no second implementation to compare with.
void qa_32f_fm_detect_aligned16::t1() {
  printf("sse not available... no test performed\n");
}
#else
// Runs the "generic" and "sse" variants of the fm_detect kernel on the same
// random input, prints the time taken by ITERS calls of each, and asserts
// that the outputs agree within a relative tolerance of 1e-4.
void qa_32f_fm_detect_aligned16::t1() {
  volk_gnsssdr_environment_init();

  const int vlen = 3201;
  const int ITERS = 10000;

  __VOLK_ATTR_ALIGNED(16) float input0[vlen];
  __VOLK_ATTR_ALIGNED(16) float out_generic[vlen];
  __VOLK_ATTR_ALIGNED(16) float out_sse[vlen];

  // random samples, centred on zero and divided by half of RAND_MAX
  for(int i = 0; i < vlen; ++i) {
    const float centred = (float) (rand() - (RAND_MAX/2));
    const float half_range = static_cast<float>((RAND_MAX/2));
    input0[i] = centred / half_range;
  }

  printf("32f_fm_detect_aligned\n");

  // generic implementation; `save` is passed by address on every call and
  // starts from 0.1 for each implementation
  clock_t start = clock();
  float save = 0.1;
  for(int count = 0; count < ITERS; ++count) {
    volk_gnsssdr_32f_fm_detect_aligned16_manual(out_generic, input0, 1.0, &save, vlen, "generic");
  }
  clock_t end = clock();
  double total = (double)(end-start)/(double)CLOCKS_PER_SEC;
  printf("generic_time: %f\n", total);

  // sse implementation
  start = clock();
  save = 0.1;
  for(int count = 0; count < ITERS; ++count) {
    volk_gnsssdr_32f_fm_detect_aligned16_manual(out_sse, input0, 1.0, &save, vlen, "sse");
  }
  end = clock();
  total = (double)(end-start)/(double)CLOCKS_PER_SEC;
  printf("sse_time: %f\n", total);

  // compare with a tolerance proportional to the generic result
  for(int i = 0; i < vlen; ++i) {
    CPPUNIT_ASSERT_DOUBLES_EQUAL(out_generic[i], out_sse[i], fabs(out_generic[i]) * 1e-4);
  }
}
#endif

View File

@ -0,0 +1,18 @@
#ifndef INCLUDED_QA_32F_FM_DETECT_ALIGNED16_H
#define INCLUDED_QA_32F_FM_DETECT_ALIGNED16_H
#include <cppunit/extensions/HelperMacros.h>
#include <cppunit/TestCase.h>
// CppUnit test case for the 32f_fm_detect_aligned16 kernel.
// The suite macros below register t1 as its only test.
class qa_32f_fm_detect_aligned16 : public CppUnit::TestCase {
CPPUNIT_TEST_SUITE (qa_32f_fm_detect_aligned16);
CPPUNIT_TEST (t1);
CPPUNIT_TEST_SUITE_END ();
private:
// Test body; declared here and defined out of line.
void t1 ();
};
#endif /* INCLUDED_QA_32F_FM_DETECT_ALIGNED16_H */

View File

@ -0,0 +1,103 @@
#include <volk_gnsssdr/volk_gnsssdr_runtime.h>
#include <volk_gnsssdr/volk_gnsssdr.h>
#include <qa_32f_index_max_aligned16.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define ERR_DELTA (1e-4)
#define NUM_ITERS 1000000
#define VEC_LEN 3097
// Draws one value from rand() and maps it linearly onto the closed range [-1, 1].
static float uniform() {
    const float unit = (float) rand() / RAND_MAX;  // in [0, 1]
    return 2.0 * (unit - 0.5);
}
// Fills buf[0..n) with values drawn from uniform() and scaled by 32767.
static void
random_floats (float *buf, unsigned n)
{
    float *const stop = buf + n;
    for (float *cursor = buf; cursor != stop; ++cursor) {
        *cursor = uniform () * 32767;
    }
}
// Benchmarks three ways of running the 32f_index_max kernel ("generic", "sse2"
// and the runtime-dispatched entry point) on the same random input and checks
// that all three report the same index. When SSE support is not compiled in,
// the test only reports that nothing was run.
#ifndef LV_HAVE_SSE
void qa_32f_index_max_aligned16::t1(){
    printf("sse not available... no test performed\n");
}
#else
void qa_32f_index_max_aligned16::t1(){
    const int vlen = VEC_LEN;
    volk_gnsssdr_runtime_init();
    volk_gnsssdr_environment_init();

    unsigned int i_target_sse4_1;
    unsigned int i_target_sse;
    unsigned int i_target_generic;
    unsigned int* target_sse4_1 = &i_target_sse4_1;
    unsigned int* target_sse = &i_target_sse;
    unsigned int* target_generic = &i_target_generic;

    // posix_memalign leaves the pointer unspecified on failure, so the return
    // code must be checked before the buffer is touched.
    float* src0 = NULL;
    const int ret = posix_memalign((void**)&src0, 16, vlen *sizeof(float));
    CPPUNIT_ASSERT_EQUAL(0, ret);

    random_floats((float*)src0, vlen);
    printf("32f_index_max_aligned16\n");

    clock_t start, end;
    double total;

    start = clock();
    for(int k = 0; k < NUM_ITERS; ++k) {
        volk_gnsssdr_32f_index_max_aligned16_manual(target_generic, src0, vlen, "generic");
    }
    end = clock();
    total = (double)(end-start)/(double)CLOCKS_PER_SEC;
    printf("generic time: %f\n", total);

    // NOTE(review): the guard is LV_HAVE_SSE but the implementation requested
    // here is "sse2" - confirm this matches the kernel's available archs.
    start = clock();
    for(int k = 0; k < NUM_ITERS; ++k) {
        volk_gnsssdr_32f_index_max_aligned16_manual(target_sse, src0, vlen, "sse2");
    }
    end = clock();
    total = (double)(end-start)/(double)CLOCKS_PER_SEC;
    printf("sse time: %f\n", total);

    // Runtime-dispatched call; labelled "sse4.1" in the output below.
    start = clock();
    for(int k = 0; k < NUM_ITERS; ++k) {
        get_volk_gnsssdr_runtime()->volk_gnsssdr_32f_index_max_aligned16(target_sse4_1, src0, vlen);
    }
    end = clock();
    total = (double)(end-start)/(double)CLOCKS_PER_SEC;
    printf("sse4.1 time: %f\n", total);

    printf("generic: %u, sse: %u, sse4.1: %u\n", target_generic[0], target_sse[0], target_sse4_1[0]);

    // Release the input before asserting: a failed CppUnit assertion throws,
    // which would otherwise skip the free() and leak the buffer.
    free(src0);

    CPPUNIT_ASSERT_EQUAL(target_generic[0], target_sse[0]);
    CPPUNIT_ASSERT_EQUAL(target_generic[0], target_sse4_1[0]);
}
#endif /*LV_HAVE_SSE*/

View File

@ -0,0 +1,18 @@
#ifndef INCLUDED_QA_32F_INDEX_MAX_ALIGNED16_H
#define INCLUDED_QA_32F_INDEX_MAX_ALIGNED16_H
#include <cppunit/extensions/HelperMacros.h>
#include <cppunit/TestCase.h>
// CppUnit test case for the 32f_index_max_aligned16 kernel.
class qa_32f_index_max_aligned16 : public CppUnit::TestCase {
// Declare the suite for this class and register t1 as its only test.
CPPUNIT_TEST_SUITE (qa_32f_index_max_aligned16);
CPPUNIT_TEST (t1);
CPPUNIT_TEST_SUITE_END ();
private:
// Test body; defined out of line.
void t1 ();
};
#endif /* INCLUDED_QA_32F_INDEX_MAX_ALIGNED16_H */

View File

@ -0,0 +1,89 @@
#include <volk_gnsssdr/volk_gnsssdr.h>
#include <qa_32fc_index_max_aligned16.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define ERR_DELTA (1e-4)
#define NUM_ITERS 1000000
#define VEC_LEN 3096
// Returns a pseudo-random float in the closed range [-1, 1], derived from rand().
static float uniform() {
    const float fraction = (float) rand() / RAND_MAX;  // in [0, 1]
    const double centred = fraction - 0.5;             // in [-0.5, 0.5]
    return 2.0 * centred;
}
// Writes n values into buf, each one a uniform() sample multiplied by 32767.
static void
random_floats (float *buf, unsigned n)
{
    unsigned int remaining = n;
    float *out = buf;
    while (remaining != 0) {
        *out = uniform () * 32767;
        ++out;
        --remaining;
    }
}
// Benchmarks the "generic" and "sse3" implementations of the 32fc_index_max
// kernel on the same random complex input and checks that the reported indices
// agree (within the tolerance used below). When SSE3 support is not compiled
// in, the test only reports that nothing was run.
#ifndef LV_HAVE_SSE3
void qa_32fc_index_max_aligned16::t1(){
    printf("sse3 not available... no test performed\n");
}
#else
void qa_32fc_index_max_aligned16::t1(){
    const int vlen = VEC_LEN;
    volk_gnsssdr_environment_init();

    unsigned int i_target;
    unsigned int i_target_generic;
    unsigned int* target = &i_target;
    unsigned int* target_generic = &i_target_generic;

    // vlen << 3 is vlen * 8 bytes, i.e. two 4-byte floats per complex sample.
    // posix_memalign leaves the pointer unspecified on failure, so the return
    // code must be checked before the buffer is touched.
    std::complex<float>* src0 = NULL;
    const int ret = posix_memalign((void**)&src0, 16, vlen << 3);
    CPPUNIT_ASSERT_EQUAL(0, ret);

    random_floats((float*)src0, vlen * 2);
    printf("32fc_index_max_aligned16\n");

    clock_t start, end;
    double total;

    // NOTE(review): the length argument passed to the kernel is vlen << 3,
    // the same value as the allocation size in bytes - presumably the kernel
    // expects a byte count rather than a sample count; confirm.
    start = clock();
    for(int k = 0; k < NUM_ITERS; ++k) {
        volk_gnsssdr_32fc_index_max_aligned16_manual(target_generic, src0, vlen << 3, "generic");
    }
    end = clock();
    total = (double)(end-start)/(double)CLOCKS_PER_SEC;
    printf("generic time: %f\n", total);

    start = clock();
    for(int k = 0; k < NUM_ITERS; ++k) {
        volk_gnsssdr_32fc_index_max_aligned16_manual(target, src0, vlen << 3, "sse3");
    }
    end = clock();
    total = (double)(end-start)/(double)CLOCKS_PER_SEC;
    printf("sse3 time: %f\n", total);

    printf("generic: %u, sse3: %u\n", target_generic[0], target[0]);

    // Release the input before asserting: a failed CppUnit assertion throws,
    // which would otherwise skip the free() and leak the buffer.
    free(src0);

    CPPUNIT_ASSERT_DOUBLES_EQUAL(target_generic[0], target[0], 1.1);
}
#endif /*LV_HAVE_SSE3*/

View File

@ -0,0 +1,18 @@
#ifndef INCLUDED_QA_32FC_INDEX_MAX_ALIGNED16_H
#define INCLUDED_QA_32FC_INDEX_MAX_ALIGNED16_H
#include <cppunit/extensions/HelperMacros.h>
#include <cppunit/TestCase.h>
// CppUnit test case for the 32fc_index_max_aligned16 kernel.
class qa_32fc_index_max_aligned16 : public CppUnit::TestCase {
// Declare the suite for this class and register t1 as its only test.
CPPUNIT_TEST_SUITE (qa_32fc_index_max_aligned16);
CPPUNIT_TEST (t1);
CPPUNIT_TEST_SUITE_END ();
private:
// Test body; defined out of line.
void t1 ();
};
#endif /* INCLUDED_QA_32FC_INDEX_MAX_ALIGNED16_H */

View File

@ -0,0 +1,64 @@
#include <volk_gnsssdr/volk_gnsssdr.h>
#include <qa_32fc_power_spectral_density_32f_aligned16.h>
#include <volk_gnsssdr/volk_gnsssdr_32fc_power_spectral_density_32f_aligned16.h>
#include <cstdlib>
#include <ctime>
//test for sse3
// Benchmarks the "generic" and "sse3" implementations of the power spectral
// density kernel on the same random complex input and checks that their
// outputs agree. When SSE3 support is not compiled in, the test only reports
// that nothing was run.
#ifndef LV_HAVE_SSE3
void qa_32fc_power_spectral_density_32f_aligned16::t1() {
    printf("sse3 not available... no test performed\n");
}
#else
void qa_32fc_power_spectral_density_32f_aligned16::t1() {
    volk_gnsssdr_environment_init();

    const int vlen = 3201;
    const int ITERS = 10000;
    __VOLK_ATTR_ALIGNED(16) std::complex<float> input0[vlen];
    __VOLK_ATTR_ALIGNED(16) float output_generic[vlen];
    __VOLK_ATTR_ALIGNED(16) float output_sse3[vlen];
    const float scalar = vlen;
    const float rbw = 1.7;

    // View the complex input as 2*vlen consecutive floats and fill it with
    // pseudo-random samples centred on zero.
    float* inputLoad = (float*)input0;
    const float half_range = static_cast<float>((RAND_MAX/2));
    for(int n = 0; n < 2*vlen; ++n) {
        inputLoad[n] = (((float) (rand() - (RAND_MAX/2))) / half_range);
    }

    printf("32fc_power_spectral_density_32f_aligned\n");

    // Time the generic implementation.
    clock_t tic = clock();
    for(int pass = 0; pass < ITERS; ++pass) {
        volk_gnsssdr_32fc_power_spectral_density_32f_aligned16_manual(output_generic, input0, scalar, rbw, vlen, "generic");
    }
    clock_t toc = clock();
    double elapsed = (double)(toc-tic)/(double)CLOCKS_PER_SEC;
    printf("generic_time: %f\n", elapsed);

    // Time the SSE3 implementation.
    tic = clock();
    for(int pass = 0; pass < ITERS; ++pass) {
        volk_gnsssdr_32fc_power_spectral_density_32f_aligned16_manual(output_sse3, input0, scalar, rbw, vlen, "sse3");
    }
    toc = clock();
    elapsed = (double)(toc-tic)/(double)CLOCKS_PER_SEC;
    printf("sse3_time: %f\n", elapsed);

    // Both implementations must agree within a relative tolerance of 1e-4.
    for(int n = 0; n < vlen; ++n) {
        const double tolerance = fabs(output_generic[n]*1e-4);
        CPPUNIT_ASSERT_DOUBLES_EQUAL(output_generic[n], output_sse3[n], tolerance);
    }
}
#endif

View File

@ -0,0 +1,18 @@
#ifndef INCLUDED_QA_32FC_POWER_SPECTRAL_DENSITY_32F_ALIGNED16_H
#define INCLUDED_QA_32FC_POWER_SPECTRAL_DENSITY_32F_ALIGNED16_H
#include <cppunit/extensions/HelperMacros.h>
#include <cppunit/TestCase.h>
// CppUnit test case for the 32fc_power_spectral_density_32f_aligned16 kernel.
class qa_32fc_power_spectral_density_32f_aligned16 : public CppUnit::TestCase {
// Declare the suite for this class and register t1 as its only test.
CPPUNIT_TEST_SUITE (qa_32fc_power_spectral_density_32f_aligned16);
CPPUNIT_TEST (t1);
CPPUNIT_TEST_SUITE_END ();
private:
// Test body; defined out of line.
void t1 ();
};
#endif /* INCLUDED_QA_32FC_POWER_SPECTRAL_DENSITY_32F_ALIGNED16_H */

View File

@ -0,0 +1,720 @@
#include "qa_utils.h"
#include <cstring>
#include <boost/foreach.hpp>
#include <boost/assign/list_of.hpp>
#include <boost/tokenizer.hpp>
#include <boost/xpressive/xpressive.hpp>
#include <iostream>
#include <fstream>
#include <vector>
#include <map>
#include <list>
#include <ctime>
#include <cmath>
#include <limits>
#include <boost/lexical_cast.hpp>
#include <volk_gnsssdr/volk_gnsssdr.h>
#include <volk_gnsssdr/volk_gnsssdr_cpu.h>
#include <volk_gnsssdr/volk_gnsssdr_common.h>
#include <volk_gnsssdr/volk_gnsssdr_malloc.h>
#include <boost/typeof/typeof.hpp>
#include <boost/type_traits.hpp>
#include <stdio.h>
// Maps one rand() draw linearly onto the closed range [-1, 1].
float uniform() {
    const float unit = (float) rand() / RAND_MAX;  // in [0, 1]
    return 2.0 * (unit - 0.5);
}
// Fills buf[0..n) with uniform() samples, converted to the element type t.
template <class t>
void random_floats (t *buf, unsigned n)
{
    t *const stop = buf + n;
    for (t *cursor = buf; cursor != stop; ++cursor) {
        *cursor = uniform ();
    }
}
// Converts value to int_t, clamping to the limits of int_t first so that the
// floating-to-integer conversion is always defined (NaN maps to the minimum).
template <class int_t>
static int_t load_random_data_saturate(double value) {
    const double lowest = static_cast<double>(std::numeric_limits<int_t>::min());
    const double highest = static_cast<double>(std::numeric_limits<int_t>::max());
    if(!(value > lowest)) return std::numeric_limits<int_t>::min();
    if(value >= highest) return std::numeric_limits<int_t>::max();
    return static_cast<int_t>(value);
}

// Fills the buffer at data with n pseudo-random elements of the given type
// (2*n scalars when the type is complex).
//
// Floating-point types are filled through random_floats(). Integer types are
// filled with values spread over the representable range of the element type:
// [-2^(bits-1), 2^(bits-1)-1] for signed types and [0, 2^bits-1] for unsigned
// ones.
//
// The previous implementation scaled by uint64_t(2) << (size*8), which is an
// undefined shift for 8-byte types and twice the representable range for all
// others, and it fed negative values to unsigned types; the resulting
// out-of-range float-to-integer casts are undefined behaviour.
//
// Throws a string literal (const char *) when an integer type has a size other
// than 1, 2, 4 or 8 bytes and at least one element is requested.
void load_random_data(void *data, volk_gnsssdr_type_t type, unsigned int n) {
    if(type.is_complex) n *= 2;

    if(type.is_float) {
        if(type.size == 8) random_floats<double>((double *)data, n);
        else random_floats<float>((float *)data, n);
        return;
    }

    const int bits = static_cast<int>(type.size) * 8;
    // 2^(bits-1) for signed types, 2^bits for unsigned ones; computed with
    // ldexp so that no integer shift can overflow.
    const double full_scale = std::ldexp(1.0, type.is_signed ? bits - 1 : bits);

    for(unsigned int i=0; i<n; i++) {
        // Roughly uniform in [-1, 1].
        const double unit = (static_cast<double>(rand()) - (RAND_MAX/2)) / static_cast<double>(RAND_MAX/2);
        // Signed: [-full_scale, full_scale]; unsigned: [0, full_scale]. The
        // upper edge is clamped by load_random_data_saturate().
        const double value = type.is_signed ? unit * full_scale
                                            : (unit + 1.0) * 0.5 * full_scale;
        switch(type.size) {
        case 8:
            if(type.is_signed) ((int64_t *)data)[i] = load_random_data_saturate<int64_t>(value);
            else ((uint64_t *)data)[i] = load_random_data_saturate<uint64_t>(value);
            break;
        case 4:
            if(type.is_signed) ((int32_t *)data)[i] = load_random_data_saturate<int32_t>(value);
            else ((uint32_t *)data)[i] = load_random_data_saturate<uint32_t>(value);
            break;
        case 2:
            if(type.is_signed) ((int16_t *)data)[i] = load_random_data_saturate<int16_t>(value);
            else ((uint16_t *)data)[i] = load_random_data_saturate<uint16_t>(value);
            break;
        case 1:
            if(type.is_signed) ((int8_t *)data)[i] = load_random_data_saturate<int8_t>(value);
            else ((uint8_t *)data)[i] = load_random_data_saturate<uint8_t>(value);
            break;
        default:
            throw "load_random_data: no support for data size > 8 or < 1"; //no shenanigans here
        }
    }
}
// Collects the names of all implementations listed in desc, in order.
// No filtering by what the current machine supports is done here.
static std::vector<std::string> get_arch_list(volk_gnsssdr_func_desc_t desc) {
    std::vector<std::string> names;
    size_t idx = 0;
    while(idx < desc.n_impls) {
        const std::string impl_name(desc.impl_names[idx]);
        names.push_back(impl_name);
        ++idx;
    }
    return names;
}
// Parses a type token such as "32fc", "16i" or "s32f" into a
// volk_gnsssdr_type_t.
//
// Token layout: an optional leading 's' (marks a scalar), a decimal bit width,
// then any mix of the qualifiers 'f' (float), 'i' (signed), 'u' (unsigned) and
// 'c' (complex). The bit width is stored in bytes in type.size, and type.str
// keeps the token exactly as passed in.
//
// Throws std::string when the token is shorter than two characters, contains
// no digit, or contains an unknown qualifier. boost::lexical_cast throws when
// the text up to the last digit is not a number. The width itself is only
// validated by an assert, which is compiled out when NDEBUG is defined.
volk_gnsssdr_type_t volk_gnsssdr_type_from_string(std::string name) {
    volk_gnsssdr_type_t type;
    type.is_float = false;
    type.is_scalar = false;
    type.is_complex = false;
    type.is_signed = false;
    type.size = 0;
    type.str = name;

    if(name.size() < 2) throw std::string("name too short to be a datatype");

    //is it a scalar?
    if(name[0] == 's') {
        type.is_scalar = true;
        name = name.substr(1, name.size()-1);
    }

    //get the data size
    size_t last_size_pos = name.find_last_of("0123456789");
    if(last_size_pos == std::string::npos)
        throw std::string("no size spec in type ").append(name);
    //will throw if malformed
    int size = boost::lexical_cast<int>(name.substr(0, last_size_pos+1));

    assert(((size % 8) == 0) && (size <= 64) && (size != 0));
    type.size = size/8; //in bytes

    for(size_t i=last_size_pos+1; i < name.size(); i++) {
        switch (name[i]) {
        case 'f':
            type.is_float = true;
            break;
        case 'i':
            type.is_signed = true;
            break;
        case 'c':
            type.is_complex = true;
            break;
        case 'u':
            type.is_signed = false;
            break;
        default:
            // A bare `throw;` here would have no exception to rethrow and
            // would call std::terminate instead of reaching the caller's
            // catch block, so throw a real exception object.
            throw std::string("unknown type qualifier '").append(1, name[i]).append("' in type ").append(name);
        }
    }

    return type;
}
// Derives the input and output type signatures of a kernel from its name.
//
// The name is split on '_'. The first token must be "volk" (checked by an
// assert); it and the token after it are discarded. Every remaining token is
// offered to volk_gnsssdr_type_from_string():
//   - tokens that parse as a type are appended to inputsig until the first
//     non-type, non-multiplier token (the function name) has been seen, and to
//     outputsig afterwards;
//   - tokens that fail to parse and start with 'x' are multipliers: "xN"
//     repeats the most recently added type so that it appears N times;
//   - any other unparsable token switches from the input side to the name
//     side; further such tokens seen before the first output type are ignored.
//
// inputsig and outputsig are appended to, not cleared. fn_name is built up but
// not used after the loop. An unparsable token on the output side that is not
// equal to the final token rethrows the parse exception; an invalid multiplier
// makes boost::lexical_cast throw.
static void get_signatures_from_name(std::vector<volk_gnsssdr_type_t> &inputsig,
std::vector<volk_gnsssdr_type_t> &outputsig,
std::string name) {
boost::char_separator<char> sep("_");
boost::tokenizer<boost::char_separator<char> > tok(name, sep);
std::vector<std::string> toked;
tok.assign(name);
toked.assign(tok.begin(), tok.end());
assert(toked[0] == "volk");
// Drop the first two tokens: "volk" and the one that follows it
// (presumably "gnsssdr" - only the first is actually checked).
toked.erase(toked.begin());
toked.erase(toked.begin());
//ok. we're assuming a string in the form
//(sig)_(multiplier-opt)_..._(name)_(sig)_(multiplier-opt)_..._(alignment)
enum { SIDE_INPUT, SIDE_NAME, SIDE_OUTPUT } side = SIDE_INPUT;
std::string fn_name;
volk_gnsssdr_type_t type;
BOOST_FOREACH(std::string token, toked) {
try {
type = volk_gnsssdr_type_from_string(token);
if(side == SIDE_NAME) side = SIDE_OUTPUT; //if this is the first one after the name...
if(side == SIDE_INPUT) inputsig.push_back(type);
else outputsig.push_back(type);
} catch (...){
// The token is not a type: it is a multiplier, part of the function name,
// or the trailing alignment marker.
if(token[0] == 'x') { //it's a multiplier
if(side == SIDE_INPUT) assert(inputsig.size() > 0);
else assert(outputsig.size() > 0);
int multiplier = boost::lexical_cast<int>(token.substr(1, token.size()-1)); //will throw if invalid
// Start at 1: the type being multiplied is already in the list once.
for(int i=1; i<multiplier; i++) {
if(side == SIDE_INPUT) inputsig.push_back(inputsig.back());
else outputsig.push_back(outputsig.back());
}
}
else if(side == SIDE_INPUT) { //it's the function name, at least it better be
side = SIDE_NAME;
fn_name.append("_");
fn_name.append(token);
}
else if(side == SIDE_OUTPUT) {
// Rethrows the exception currently being handled by this catch block.
if(token != toked.back()) throw; //the last token in the name is the alignment
}
}
}
//we don't need an output signature (some fn's operate on the input data, "in place"), but we do need at least one input!
assert(inputsig.size() != 0);
}
inline void run_cast_test1(volk_gnsssdr_fn_1arg func, std::vector<void *> &buffs, unsigned int vlen, unsigned int iter, std::string arch) {
while(iter--) func(buffs[0], vlen, arch.c_str());
}
inline void run_cast_test2(volk_gnsssdr_fn_2arg func, std::vector<void *> &buffs, unsigned int vlen, unsigned int iter, std::string arch) {
while(iter--) func(buffs[0], buffs[1], vlen, arch.c_str());
}
inline void run_cast_test3(volk_gnsssdr_fn_3arg func, std::vector<void *> &buffs, unsigned int vlen, unsigned int iter, std::string arch) {
while(iter--) func(buffs[0], buffs[1], buffs[2], vlen, arch.c_str());
}
inline void run_cast_test4(volk_gnsssdr_fn_4arg func, std::vector<void *> &buffs, unsigned int vlen, unsigned int iter, std::string arch) {
while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], vlen, arch.c_str());
}
inline void run_cast_test1_s32f(volk_gnsssdr_fn_1arg_s32f func, std::vector<void *> &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) {
while(iter--) func(buffs[0], scalar, vlen, arch.c_str());
}
inline void run_cast_test2_s32f(volk_gnsssdr_fn_2arg_s32f func, std::vector<void *> &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) {
while(iter--) func(buffs[0], buffs[1], scalar, vlen, arch.c_str());
}
inline void run_cast_test3_s32f(volk_gnsssdr_fn_3arg_s32f func, std::vector<void *> &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) {
while(iter--) func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str());
}
inline void run_cast_test1_s32fc(volk_gnsssdr_fn_1arg_s32fc func, std::vector<void *> &buffs, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) {
while(iter--) func(buffs[0], scalar, vlen, arch.c_str());
}
inline void run_cast_test2_s32fc(volk_gnsssdr_fn_2arg_s32fc func, std::vector<void *> &buffs, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) {
while(iter--) func(buffs[0], buffs[1], scalar, vlen, arch.c_str());
}
inline void run_cast_test3_s32fc(volk_gnsssdr_fn_3arg_s32fc func, std::vector<void *> &buffs, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) {
while(iter--) func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str());
}
//ADDED BY GNSS-SDR. START
// Runners for kernels taking one 8-bit scalar after the buffers: each calls
// func `iter` times with the first N pointers of buffs, the scalar, the vector
// length and the implementation name.

// char scalar, one to three buffers.
inline void run_cast_test1_s8i(volk_gnsssdr_fn_1arg_s8i func, std::vector<void *> &buffs, char scalar, unsigned int vlen, unsigned int iter, std::string arch) {
    for(unsigned int remaining = iter; remaining != 0; --remaining) {
        func(buffs[0], scalar, vlen, arch.c_str());
    }
}
inline void run_cast_test2_s8i(volk_gnsssdr_fn_2arg_s8i func, std::vector<void *> &buffs, char scalar, unsigned int vlen, unsigned int iter, std::string arch) {
    for(unsigned int remaining = iter; remaining != 0; --remaining) {
        func(buffs[0], buffs[1], scalar, vlen, arch.c_str());
    }
}
inline void run_cast_test3_s8i(volk_gnsssdr_fn_3arg_s8i func, std::vector<void *> &buffs, char scalar, unsigned int vlen, unsigned int iter, std::string arch) {
    for(unsigned int remaining = iter; remaining != 0; --remaining) {
        func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str());
    }
}

// lv_8sc_t scalar, one to three buffers.
inline void run_cast_test1_s8ic(volk_gnsssdr_fn_1arg_s8ic func, std::vector<void *> &buffs, lv_8sc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) {
    for(unsigned int remaining = iter; remaining != 0; --remaining) {
        func(buffs[0], scalar, vlen, arch.c_str());
    }
}
inline void run_cast_test2_s8ic(volk_gnsssdr_fn_2arg_s8ic func, std::vector<void *> &buffs, lv_8sc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) {
    for(unsigned int remaining = iter; remaining != 0; --remaining) {
        func(buffs[0], buffs[1], scalar, vlen, arch.c_str());
    }
}
inline void run_cast_test3_s8ic(volk_gnsssdr_fn_3arg_s8ic func, std::vector<void *> &buffs, lv_8sc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) {
    for(unsigned int remaining = iter; remaining != 0; --remaining) {
        func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str());
    }
}
// Eight-buffer runners: each calls func `iter` times with buffs[0..7], the
// optional scalar, the vector length and the implementation name.

// No scalar.
inline void run_cast_test8(volk_gnsssdr_fn_8arg func, std::vector<void *> &buffs, unsigned int vlen, unsigned int iter, std::string arch) {
    for(unsigned int remaining = iter; remaining != 0; --remaining) {
        func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], vlen, arch.c_str());
    }
}

// char scalar.
inline void run_cast_test8_s8i(volk_gnsssdr_fn_8arg_s8i func, std::vector<void *> &buffs, char scalar, unsigned int vlen, unsigned int iter, std::string arch) {
    for(unsigned int remaining = iter; remaining != 0; --remaining) {
        func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], scalar, vlen, arch.c_str());
    }
}

// lv_8sc_t scalar.
inline void run_cast_test8_s8ic(volk_gnsssdr_fn_8arg_s8ic func, std::vector<void *> &buffs, lv_8sc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) {
    for(unsigned int remaining = iter; remaining != 0; --remaining) {
        func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], scalar, vlen, arch.c_str());
    }
}

// float scalar.
inline void run_cast_test8_s32f(volk_gnsssdr_fn_8arg_s32f func, std::vector<void *> &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) {
    for(unsigned int remaining = iter; remaining != 0; --remaining) {
        func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], scalar, vlen, arch.c_str());
    }
}

// lv_32fc_t scalar.
inline void run_cast_test8_s32fc(volk_gnsssdr_fn_8arg_s32fc func, std::vector<void *> &buffs, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) {
    for(unsigned int remaining = iter; remaining != 0; --remaining) {
        func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], scalar, vlen, arch.c_str());
    }
}
// Twelve-buffer runners: each calls func `iter` times with buffs[0..11], the
// optional scalar, the vector length and the implementation name.

// No scalar.
inline void run_cast_test12(volk_gnsssdr_fn_12arg func, std::vector<void *> &buffs, unsigned int vlen, unsigned int iter, std::string arch) {
    for(unsigned int remaining = iter; remaining != 0; --remaining) {
        func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], buffs[8], buffs[9], buffs[10], buffs[11], vlen, arch.c_str());
    }
}

// char scalar.
inline void run_cast_test12_s8i(volk_gnsssdr_fn_12arg_s8i func, std::vector<void *> &buffs, char scalar, unsigned int vlen, unsigned int iter, std::string arch) {
    for(unsigned int remaining = iter; remaining != 0; --remaining) {
        func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], buffs[8], buffs[9], buffs[10], buffs[11], scalar, vlen, arch.c_str());
    }
}

// lv_8sc_t scalar.
inline void run_cast_test12_s8ic(volk_gnsssdr_fn_12arg_s8ic func, std::vector<void *> &buffs, lv_8sc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) {
    for(unsigned int remaining = iter; remaining != 0; --remaining) {
        func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], buffs[8], buffs[9], buffs[10], buffs[11], scalar, vlen, arch.c_str());
    }
}

// float scalar.
inline void run_cast_test12_s32f(volk_gnsssdr_fn_12arg_s32f func, std::vector<void *> &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) {
    for(unsigned int remaining = iter; remaining != 0; --remaining) {
        func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], buffs[8], buffs[9], buffs[10], buffs[11], scalar, vlen, arch.c_str());
    }
}

// lv_32fc_t scalar.
inline void run_cast_test12_s32fc(volk_gnsssdr_fn_12arg_s32fc func, std::vector<void *> &buffs, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) {
    for(unsigned int remaining = iter; remaining != 0; --remaining) {
        func(buffs[0], buffs[1], buffs[2], buffs[3], buffs[4], buffs[5], buffs[6], buffs[7], buffs[8], buffs[9], buffs[10], buffs[11], scalar, vlen, arch.c_str());
    }
}
//ADDED BY GNSS-SDR. END
// This function is a nop that helps resolve GNU Radio bugs 582 and 583.
// Without this the cast in run_volk_gnsssdr_tests for tol_i = static_cast<int>(float tol)
// won't happen on armhf (reported on cortex A9 and A15).
//
// It compares each tolerance against the constant 1 and, when the tolerance is
// smaller, streams an empty string to std::cout. Nothing visible is printed;
// the statements exist only so that both arguments are actually used. Do not
// simplify or remove them without re-checking the bugs referenced above.
void lv_force_cast_hf( int tol_i, float tol_f)
{
int diff_i = 1;
float diff_f = 1;
// Use the integer tolerance.
if( diff_i > tol_i )
std::cout << "" ;
// Use the floating-point tolerance.
if( diff_f > tol_f )
std::cout << "" ;
}
// Compares two real-valued vectors element by element.
//
// in1 is the reference, in2 the candidate. An element mismatches when
//   - |in1[i]| < 1e-30 and |in2[i]| > tol (absolute check, used because a
//     relative error is meaningless next to a near-zero reference), or
//   - otherwise |in1[i] - in2[i]| / |in1[i]| > tol (relative check).
//
// The relative error is normalised by the magnitude of the reference; dividing
// by the signed reference value made the quotient negative for every negative
// reference, so such elements could never be reported as mismatches.
//
// Up to ten mismatches are printed to std::cout. Returns true when at least
// one element mismatches (i.e. true means FAILURE), false otherwise.
template <class t>
bool fcompare(t *in1, t *in2, unsigned int vlen, float tol) {
    bool fail = false;
    int print_max_errs = 10;
    for(unsigned int i=0; i<vlen; i++) {
        const t expected = in1[i];
        const t actual = in2[i];

        bool mismatch;
        if(fabs(expected) < 1e-30) {
            // for very small numbers we'll see round off errors due to limited
            // precision. So a special test case...
            mismatch = fabs(actual) > tol;
        } else {
            // the primary test is the percent different greater than given tol
            mismatch = fabs(expected - actual)/fabs(expected) > tol;
        }

        if(mismatch) {
            fail = true;
            if(print_max_errs-- > 0) {
                std::cout << "offset " << i << " in1: " << expected << " in2: " << actual << std::endl;
            }
        }
    }
    return fail;
}
// Compares two complex vectors stored as interleaved (real, imaginary) pairs.
//
// in1 is the reference, in2 the candidate; vlen counts complex samples, so
// 2*vlen scalars are read from each array. A sample mismatches when the
// magnitude of the difference exceeds tol (if the reference magnitude is below
// 1e-30) or when that magnitude divided by the reference magnitude exceeds tol
// (otherwise).
//
// Up to ten mismatches are printed to std::cout. Returns true when at least
// one sample mismatches (i.e. true means FAILURE), false otherwise.
template <class t>
bool ccompare(t *in1, t *in2, unsigned int vlen, float tol) {
    bool fail = false;
    int print_max_errs = 10;
    for(unsigned int i=0; i<2*vlen; i+=2) {
        const t delta_re = in1[i] - in2[i];
        const t delta_im = in1[i+1] - in2[i+1];
        const t err = std::sqrt(delta_re * delta_re + delta_im * delta_im);
        const t norm = std::sqrt(in1[i] * in1[i] + in1[i+1] * in1[i+1]);

        // Near-zero references are judged on absolute error because round-off
        // dominates there; everything else is judged on relative error.
        const bool mismatch = (norm < 1e-30) ? (err > tol) : ((err / norm) > tol);
        if(!mismatch) continue;

        fail = true;
        if(print_max_errs-- > 0) {
            std::cout << "offset " << i/2 << " in1: " << in1[i] << " + " << in1[i+1] << "j in2: " << in2[i] << " + " << in2[i+1] << "j" << std::endl;
        }
    }
    return fail;
}
// Compares two integer vectors element by element.
//
// in1 is the reference, in2 the candidate. An element mismatches when the
// absolute difference between the two values exceeds tol.
//
// The difference is computed as an unsigned 64-bit magnitude of the values in
// their own type. Converting both values to int first truncated 64-bit (and
// large unsigned 32-bit) elements, so differences in the high bits went
// unnoticed, and the int subtraction itself could overflow.
//
// Up to ten mismatches are printed to std::cout. Returns true when at least
// one element mismatches (i.e. true means FAILURE), false otherwise.
template <class t>
bool icompare(t *in1, t *in2, unsigned int vlen, unsigned int tol) {
    bool fail = false;
    int print_max_errs = 10;
    for(unsigned int i=0; i<vlen; i++) {
        const t expected = in1[i];
        const t actual = in2[i];

        // larger - smaller, evaluated modulo 2^64: exact for every integer
        // type up to 64 bits, signed or unsigned, and free of overflow.
        const uint64_t distance = (expected > actual)
            ? static_cast<uint64_t>(expected) - static_cast<uint64_t>(actual)
            : static_cast<uint64_t>(actual) - static_cast<uint64_t>(expected);

        if(distance > tol) {
            fail = true;
            if(print_max_errs-- > 0) {
                // Unary plus promotes 8-bit values to int so they print as
                // numbers rather than characters; wider types are unchanged.
                std::cout << "offset " << i << " in1: " << +expected << " in2: " << +actual << std::endl;
            }
        }
    }
    return fail;
}
// Owns every buffer handed out by get_new() and releases them all with
// volk_gnsssdr_free() when the pool is destroyed.
class volk_gnsssdr_qa_aligned_mem_pool{
public:
    volk_gnsssdr_qa_aligned_mem_pool() {}

    // Allocates `size` bytes through volk_gnsssdr_malloc() using the alignment
    // reported by volk_gnsssdr_get_alignment(), zero-fills them, records the
    // pointer for later release and returns it.
    //
    // Throws std::string when a non-empty allocation fails, instead of handing
    // a null pointer to memset. A null result for a zero-byte request is
    // returned as-is and not recorded.
    void *get_new(size_t size){
        size_t alignment = volk_gnsssdr_get_alignment();
        void* ptr = volk_gnsssdr_malloc(size, alignment);
        if(ptr == NULL) {
            if(size == 0) return NULL;
            throw std::string("volk_gnsssdr_qa_aligned_mem_pool: allocation failed");
        }
        memset(ptr, 0x00, size);
        _mems.push_back(ptr);
        return ptr;
    }

    ~volk_gnsssdr_qa_aligned_mem_pool() {
        for(unsigned int ii = 0; ii < _mems.size(); ++ii) {
            volk_gnsssdr_free(_mems[ii]);
        }
    }

private:
    // Not copyable: a copy would release every buffer a second time.
    // Declared but intentionally left undefined.
    volk_gnsssdr_qa_aligned_mem_pool(const volk_gnsssdr_qa_aligned_mem_pool &);
    volk_gnsssdr_qa_aligned_mem_pool &operator=(const volk_gnsssdr_qa_aligned_mem_pool &);

    std::vector<void * > _mems;
};
bool run_volk_gnsssdr_tests(volk_gnsssdr_func_desc_t desc,
void (*manual_func)(),
std::string name,
float tol,
lv_32fc_t scalar,
int vlen,
int iter,
std::vector<volk_gnsssdr_test_results_t> *results,
std::string puppet_master_name,
bool benchmark_mode,
std::string kernel_regex
) {
boost::xpressive::sregex kernel_expression = boost::xpressive::sregex::compile(kernel_regex);
if( !boost::xpressive::regex_search(name, kernel_expression) ) {
// in this case we have a regex and are only looking to test one kernel
return false;
}
if(results) {
results->push_back(volk_gnsssdr_test_results_t());
results->back().name = name;
results->back().vlen = vlen;
results->back().iter = iter;
}
std::cout << "RUN_VOLK_TESTS: " << name << "(" << vlen << "," << iter << ")" << std::endl;
// The multiply and lv_force_cast_hf are work arounds for GNU Radio bugs 582 and 583
// The bug is the casting/assignment below do not happen, which results in false
// positives when testing for errors in fcompare and icompare.
// Since this only happens on armhf (reported for Cortex A9 and A15) combined with
// the following fixes it is suspected to be a compiler bug.
// Bug 1272024 on launchpad has been filed with Linaro GCC.
const float tol_f = tol*1.0000001;
const unsigned int tol_i = static_cast<const unsigned int>(tol);
lv_force_cast_hf( tol_i, tol_f );
//first let's get a list of available architectures for the test
std::vector<std::string> arch_list = get_arch_list(desc);
if((!benchmark_mode) && (arch_list.size() < 2)) {
std::cout << "no architectures to test" << std::endl;
return false;
}
//something that can hang onto memory and cleanup when this function exits
volk_gnsssdr_qa_aligned_mem_pool mem_pool;
//now we have to get a function signature by parsing the name
std::vector<volk_gnsssdr_type_t> inputsig, outputsig;
get_signatures_from_name(inputsig, outputsig, name);
//pull the input scalars into their own vector
std::vector<volk_gnsssdr_type_t> inputsc;
for(size_t i=0; i<inputsig.size(); i++) {
if(inputsig[i].is_scalar) {
inputsc.push_back(inputsig[i]);
inputsig.erase(inputsig.begin() + i);
i -= 1;
}
}
//for(int i=0; i<inputsig.size(); i++) std::cout << "Input: " << inputsig[i].str << std::endl;
//for(int i=0; i<outputsig.size(); i++) std::cout << "Output: " << outputsig[i].str << std::endl;
std::vector<void *> inbuffs;
BOOST_FOREACH(volk_gnsssdr_type_t sig, inputsig) {
if(!sig.is_scalar) //we don't make buffers for scalars
inbuffs.push_back(mem_pool.get_new(vlen*sig.size*(sig.is_complex ? 2 : 1)));
}
for(size_t i=0; i<inbuffs.size(); i++) {
load_random_data(inbuffs[i], inputsig[i], vlen);
}
//ok let's make a vector of vector of void buffers, which holds the input/output vectors for each arch
std::vector<std::vector<void *> > test_data;
for(size_t i=0; i<arch_list.size(); i++) {
std::vector<void *> arch_buffs;
for(size_t j=0; j<outputsig.size(); j++) {
arch_buffs.push_back(mem_pool.get_new(vlen*outputsig[j].size*(outputsig[j].is_complex ? 2 : 1)));
}
for(size_t j=0; j<inputsig.size(); j++) {
arch_buffs.push_back(inbuffs[j]);
}
test_data.push_back(arch_buffs);
}
std::vector<volk_gnsssdr_type_t> both_sigs;
both_sigs.insert(both_sigs.end(), outputsig.begin(), outputsig.end());
both_sigs.insert(both_sigs.end(), inputsig.begin(), inputsig.end());
//now run the test
clock_t start, end;
std::vector<double> profile_times;
for(size_t i = 0; i < arch_list.size(); i++) {
start = clock();
switch(both_sigs.size()) {
case 1:
if(inputsc.size() == 0) {
run_cast_test1((volk_gnsssdr_fn_1arg)(manual_func), test_data[i], vlen, iter, arch_list[i]);
} else if(inputsc.size() == 1 && inputsc[0].is_float) {
if(inputsc[0].is_complex) {
run_cast_test1_s32fc((volk_gnsssdr_fn_1arg_s32fc)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]);
} else {
run_cast_test1_s32f((volk_gnsssdr_fn_1arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]);
}
}
//ADDED BY GNSS-SDR. START
else if(inputsc.size() == 1 && !inputsc[0].is_float) {
if(inputsc[0].is_complex) {
run_cast_test1_s8ic((volk_gnsssdr_fn_1arg_s8ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]);
} else {
run_cast_test1_s8i((volk_gnsssdr_fn_1arg_s8i)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]);
}
}
//ADDED BY GNSS-SDR. END
else throw "unsupported 1 arg function >1 scalars";
break;
case 2:
if(inputsc.size() == 0) {
run_cast_test2((volk_gnsssdr_fn_2arg)(manual_func), test_data[i], vlen, iter, arch_list[i]);
} else if(inputsc.size() == 1 && inputsc[0].is_float) {
if(inputsc[0].is_complex) {
run_cast_test2_s32fc((volk_gnsssdr_fn_2arg_s32fc)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]);
} else {
run_cast_test2_s32f((volk_gnsssdr_fn_2arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]);
}
}
//ADDED BY GNSS-SDR. START
else if(inputsc.size() == 1 && !inputsc[0].is_float) {
if(inputsc[0].is_complex) {
run_cast_test2_s8ic((volk_gnsssdr_fn_2arg_s8ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]);
} else {
run_cast_test2_s8i((volk_gnsssdr_fn_2arg_s8i)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]);
}
}
//ADDED BY GNSS-SDR. END
else throw "unsupported 2 arg function >1 scalars";
break;
case 3:
if(inputsc.size() == 0) {
run_cast_test3((volk_gnsssdr_fn_3arg)(manual_func), test_data[i], vlen, iter, arch_list[i]);
} else if(inputsc.size() == 1 && inputsc[0].is_float) {
if(inputsc[0].is_complex) {
run_cast_test3_s32fc((volk_gnsssdr_fn_3arg_s32fc)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]);
} else {
run_cast_test3_s32f((volk_gnsssdr_fn_3arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]);
}
}
//ADDED BY GNSS-SDR. START
else if(inputsc.size() == 1 && !inputsc[0].is_float) {
if(inputsc[0].is_complex) {
run_cast_test3_s8ic((volk_gnsssdr_fn_3arg_s8ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]);
} else {
run_cast_test3_s8i((volk_gnsssdr_fn_3arg_s8i)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]);
}
}
//ADDED BY GNSS-SDR. END
else throw "unsupported 3 arg function >1 scalars";
break;
case 4:
run_cast_test4((volk_gnsssdr_fn_4arg)(manual_func), test_data[i], vlen, iter, arch_list[i]);
break;
//ADDED BY GNSS-SDR. START
case 8:
if(inputsc.size() == 0) {
run_cast_test8((volk_gnsssdr_fn_8arg)(manual_func), test_data[i], vlen, iter, arch_list[i]);
} else if(inputsc.size() == 1 && inputsc[0].is_float) {
if(inputsc[0].is_complex) {
run_cast_test8_s32fc((volk_gnsssdr_fn_8arg_s32fc)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]);
} else {
run_cast_test8_s32f((volk_gnsssdr_fn_8arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]);
}
}
else if(inputsc.size() == 1 && !inputsc[0].is_float) {
if(inputsc[0].is_complex) {
run_cast_test8_s8ic((volk_gnsssdr_fn_8arg_s8ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]);
} else {
run_cast_test8_s8i((volk_gnsssdr_fn_8arg_s8i)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]);
}
}
else throw "unsupported 8 arg function >1 scalars";
break;
case 12:
if(inputsc.size() == 0) {
run_cast_test12((volk_gnsssdr_fn_12arg)(manual_func), test_data[i], vlen, iter, arch_list[i]);
} else if(inputsc.size() == 1 && inputsc[0].is_float) {
if(inputsc[0].is_complex) {
run_cast_test12_s32fc((volk_gnsssdr_fn_12arg_s32fc)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]);
} else {
run_cast_test12_s32f((volk_gnsssdr_fn_12arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]);
}
}
else if(inputsc.size() == 1 && !inputsc[0].is_float) {
if(inputsc[0].is_complex) {
run_cast_test12_s8ic((volk_gnsssdr_fn_12arg_s8ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]);
} else {
run_cast_test12_s8i((volk_gnsssdr_fn_12arg_s8i)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]);
}
}
else throw "unsupported 12 arg function >1 scalars";
break;
//ADDED BY GNSS-SDR. END
default:
throw "no function handler for this signature";
break;
}
end = clock();
double arch_time = 1000.0 * (double)(end-start)/(double)CLOCKS_PER_SEC;
std::cout << arch_list[i] << " completed in " << arch_time << "ms" << std::endl;
if(results) {
volk_gnsssdr_test_time_t result;
result.name = arch_list[i];
result.time = arch_time;
result.units = "ms";
results->back().results[result.name] = result;
}
profile_times.push_back(arch_time);
}
//and now compare each output to the generic output
//first we have to know which output is the generic one, they aren't in order...
size_t generic_offset=0;
for(size_t i=0; i<arch_list.size(); i++)
if(arch_list[i] == "generic") generic_offset=i;
//now compare
//if(outputsig.size() == 0) outputsig = inputsig; //a hack, i know
bool fail = false;
bool fail_global = false;
std::vector<bool> arch_results;
for(size_t i=0; i<arch_list.size(); i++) {
fail = false;
if(i != generic_offset) {
for(size_t j=0; j<both_sigs.size(); j++) {
if(both_sigs[j].is_float) {
if(both_sigs[j].size == 8) {
if (both_sigs[j].is_complex) {
fail = ccompare((double *) test_data[generic_offset][j], (double *) test_data[i][j], vlen, tol_f);
} else {
fail = fcompare((double *) test_data[generic_offset][j], (double *) test_data[i][j], vlen, tol_f);
}
} else {
if (both_sigs[j].is_complex) {
fail = ccompare((float *) test_data[generic_offset][j], (float *) test_data[i][j], vlen, tol_f);
} else {
fail = fcompare((float *) test_data[generic_offset][j], (float *) test_data[i][j], vlen, tol_f);
}
}
} else {
//i could replace this whole switch statement with a memcmp if i wasn't interested in printing the outputs where they differ
switch(both_sigs[j].size) {
case 8:
if(both_sigs[j].is_signed) {
fail = icompare((int64_t *) test_data[generic_offset][j], (int64_t *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol_i);
} else {
fail = icompare((uint64_t *) test_data[generic_offset][j], (uint64_t *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol_i);
}
break;
case 4:
if(both_sigs[j].is_signed) {
fail = icompare((int32_t *) test_data[generic_offset][j], (int32_t *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol_i);
} else {
fail = icompare((uint32_t *) test_data[generic_offset][j], (uint32_t *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol_i);
}
break;
case 2:
if(both_sigs[j].is_signed) {
fail = icompare((int16_t *) test_data[generic_offset][j], (int16_t *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol_i);
} else {
fail = icompare((uint16_t *) test_data[generic_offset][j], (uint16_t *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol_i);
}
break;
case 1:
if(both_sigs[j].is_signed) {
fail = icompare((int8_t *) test_data[generic_offset][j], (int8_t *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol_i);
} else {
fail = icompare((uint8_t *) test_data[generic_offset][j], (uint8_t *) test_data[i][j], vlen*(both_sigs[j].is_complex ? 2 : 1), tol_i);
}
break;
default:
fail=1;
}
}
if(fail) {
fail_global = true;
std::cout << name << ": fail on arch " << arch_list[i] << std::endl;
}
//fail = memcmp(outbuffs[generic_offset], outbuffs[i], outputsig[0].size * vlen * (outputsig[0].is_complex ? 2:1));
}
}
arch_results.push_back(!fail);
}
double best_time_a = std::numeric_limits<double>::max();
double best_time_u = std::numeric_limits<double>::max();
std::string best_arch_a = "generic";
std::string best_arch_u = "generic";
for(size_t i=0; i < arch_list.size(); i++)
{
if((profile_times[i] < best_time_u) && arch_results[i] && desc.impl_alignment[i] == 0)
{
best_time_u = profile_times[i];
best_arch_u = arch_list[i];
}
if((profile_times[i] < best_time_a) && arch_results[i])
{
best_time_a = profile_times[i];
best_arch_a = arch_list[i];
}
}
std::cout << "Best aligned arch: " << best_arch_a << std::endl;
std::cout << "Best unaligned arch: " << best_arch_u << std::endl;
if(results) {
if(puppet_master_name == "NULL") {
results->back().config_name = name;
} else {
results->back().config_name = puppet_master_name;
}
results->back().best_arch_a = best_arch_a;
results->back().best_arch_u = best_arch_u;
}
return fail_global;
}

View File

@ -0,0 +1,102 @@
#ifndef VOLK_QA_UTILS_H
#define VOLK_QA_UTILS_H
#include <cstdlib>
#include <string>
#include <iostream>
#include <fstream>
#include <vector>
#include <map>
#include <volk_gnsssdr/volk_gnsssdr.h>
#include <volk_gnsssdr/volk_gnsssdr_common.h>
// Describes the element type of one argument in a kernel signature, as consumed by the QA harness.
struct volk_gnsssdr_type_t {
// true for floating-point data; the harness then compares results with the float/double comparators
bool is_float;
// presumably marks a scalar (non-vector) argument -- TODO confirm against volk_gnsssdr_type_from_string
bool is_scalar;
// for integer data, selects the signed or the unsigned fixed-width type used when comparing results
bool is_signed;
// complex data carries two components per item (integer comparisons double the element count)
bool is_complex;
// size of one component in bytes; the harness handles 1, 2, 4 and 8
int size;
// NOTE(review): looks like the textual type token this entry was parsed from -- confirm in the implementation
std::string str;
};
// Builds a volk_gnsssdr_type_t from a type name string.
// NOTE(review): the accepted string format is defined in the implementation file, not in this header -- confirm there.
volk_gnsssdr_type_t volk_gnsssdr_type_from_string(std::string);
// Returns a random float. NOTE(review): presumably uniformly distributed; the range is not visible from this header.
float uniform(void);
// Fills buf with n floats. NOTE(review): presumably drawn from uniform() -- confirm in the implementation.
void random_floats(float *buf, unsigned n);
// Timing record for one implementation (arch) of a kernel.
class volk_gnsssdr_test_time_t {
public:
// name of the implementation/arch that was timed
std::string name;
// elapsed time, expressed in the unit named by `units`
double time;
// unit label for `time`; the test runner stores "ms"
std::string units;
};
// Aggregated profiling outcome for one kernel across all of its implementations.
class volk_gnsssdr_test_results_t {
public:
// NOTE(review): presumably the kernel name under test -- it is assigned outside the code visible here
std::string name;
// name written to the configuration: the kernel name, or the puppet master name when one is given
std::string config_name;
// NOTE(review): presumably the vector length used for the run -- confirm where it is assigned
int vlen;
// NOTE(review): presumably the iteration count used for the run -- confirm where it is assigned
int iter;
// per-implementation timings, keyed by implementation name
std::map<std::string, volk_gnsssdr_test_time_t> results;
// fastest implementation that produced correct results, regardless of alignment
std::string best_arch_a;
// fastest correct implementation among those flagged as unaligned
std::string best_arch_u;
};
// Runs every implementation of one kernel, compares each output against the
// "generic" implementation and optionally records timings.
// Returns true when at least one implementation FAILED the comparison
// (VOLK_RUN_TESTS therefore checks the result against 0).
// The unnamed parameters are, in order (see the VOLK_* macros for how they are filled in):
bool run_volk_gnsssdr_tests(
// kernel descriptor, obtained from <kernel>_get_func_desc()
volk_gnsssdr_func_desc_t,
// the kernel's <kernel>_manual entry point, type-erased; it is cast back to a volk_gnsssdr_fn_* type
void(*)(),
// kernel name
std::string,
// comparison tolerance
float,
// scalar argument handed to kernels that take one
lv_32fc_t,
// vector length
int,
// iteration count
int,
// optional sink for timing results; may be NULL
std::vector<volk_gnsssdr_test_results_t> *results = NULL,
// the literal string "NULL" means "no puppet master"; otherwise it replaces the kernel name in config_name
std::string puppet_master_name = "NULL",
// NOTE(review): exact effect is defined in the implementation -- confirm there
bool benchmark_mode=false,
// NOTE(review): presumably restricts which kernels are run -- confirm in the implementation
std::string kernel_regex=""
);
// Registers a Boost test case named <func>_test that runs the QA harness on kernel `func`
// and expects it to report no failure (run_volk_gnsssdr_tests returns 0/false).
// No results sink is passed (0) and no puppet master is used ("NULL").
#define VOLK_RUN_TESTS(func, tol, scalar, len, iter) \
BOOST_AUTO_TEST_CASE(func##_test) { \
BOOST_CHECK_EQUAL(run_volk_gnsssdr_tests( \
func##_get_func_desc(), (void (*)())func##_manual, \
std::string(#func), tol, scalar, len, iter, 0, "NULL"), \
0); \
}
// Runs the QA harness on kernel `func` outside Boost, collecting timings into `results`.
#define VOLK_PROFILE(func, tol, scalar, len, iter, results, bnmode, kernel_regex) run_volk_gnsssdr_tests(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), tol, scalar, len, iter, results, "NULL", bnmode, kernel_regex)
// Same as VOLK_PROFILE, but the results are recorded under the name of `puppet_master_func` instead of `func`.
#define VOLK_PUPPET_PROFILE(func, puppet_master_func, tol, scalar, len, iter, results, bnmode, kernel_regex) run_volk_gnsssdr_tests(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), tol, scalar, len, iter, results, std::string(#puppet_master_func), bnmode, kernel_regex)
// Function-pointer types used to cast a kernel's type-erased "manual" entry point back to a callable signature.
// Naming: volk_gnsssdr_fn_<N>arg[_<scalar>] takes N buffer pointers, an optional scalar,
// an unsigned int, and a const char*.
// NOTE(review): the trailing unsigned int / const char* are presumably the item count and the
// implementation name -- confirm against the generated *_manual functions.
typedef void (*volk_gnsssdr_fn_1arg)(void *, unsigned int, const char*); //one input, operate in place
typedef void (*volk_gnsssdr_fn_2arg)(void *, void *, unsigned int, const char*);
typedef void (*volk_gnsssdr_fn_3arg)(void *, void *, void *, unsigned int, const char*);
typedef void (*volk_gnsssdr_fn_4arg)(void *, void *, void *, void *, unsigned int, const char*);
typedef void (*volk_gnsssdr_fn_1arg_s32f)(void *, float, unsigned int, const char*); //one input vector, one scalar float input
typedef void (*volk_gnsssdr_fn_2arg_s32f)(void *, void *, float, unsigned int, const char*);
typedef void (*volk_gnsssdr_fn_3arg_s32f)(void *, void *, void *, float, unsigned int, const char*);
typedef void (*volk_gnsssdr_fn_1arg_s32fc)(void *, lv_32fc_t, unsigned int, const char*); //one input vector, one scalar complex float (lv_32fc_t) input
typedef void (*volk_gnsssdr_fn_2arg_s32fc)(void *, void *, lv_32fc_t, unsigned int, const char*);
typedef void (*volk_gnsssdr_fn_3arg_s32fc)(void *, void *, void *, lv_32fc_t, unsigned int, const char*);
//ADDED BY GNSS-SDR. START
typedef void (*volk_gnsssdr_fn_1arg_s8i)(void *, char, unsigned int, const char*); //one input vector, one scalar char input
typedef void (*volk_gnsssdr_fn_2arg_s8i)(void *, void *, char, unsigned int, const char*);
typedef void (*volk_gnsssdr_fn_3arg_s8i)(void *, void *, void *, char, unsigned int, const char*);
typedef void (*volk_gnsssdr_fn_1arg_s8ic)(void *, lv_8sc_t, unsigned int, const char*); //one input vector, one scalar complex char (lv_8sc_t) input
typedef void (*volk_gnsssdr_fn_2arg_s8ic)(void *, void *, lv_8sc_t, unsigned int, const char*);
typedef void (*volk_gnsssdr_fn_3arg_s8ic)(void *, void *, void *, lv_8sc_t, unsigned int, const char*);
// 8-buffer variants
typedef void (*volk_gnsssdr_fn_8arg)(void *, void *, void *, void *, void *, void *, void *, void *, unsigned int, const char*);
typedef void (*volk_gnsssdr_fn_8arg_s32f)(void *, void *, void *, void *, void *, void *, void *, void *, float, unsigned int, const char*);
typedef void (*volk_gnsssdr_fn_8arg_s32fc)(void *, void *, void *, void *, void *, void *, void *, void *, lv_32fc_t, unsigned int, const char*);
typedef void (*volk_gnsssdr_fn_8arg_s8i)(void *, void *, void *, void *, void *, void *, void *, void *, char, unsigned int, const char*);
typedef void (*volk_gnsssdr_fn_8arg_s8ic)(void *, void *, void *, void *, void *, void *, void *, void *, lv_8sc_t, unsigned int, const char*);
// 12-buffer variants
typedef void (*volk_gnsssdr_fn_12arg)(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, unsigned int, const char*);
typedef void (*volk_gnsssdr_fn_12arg_s32f)(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, float, unsigned int, const char*);
typedef void (*volk_gnsssdr_fn_12arg_s32fc)(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, lv_32fc_t, unsigned int, const char*);
typedef void (*volk_gnsssdr_fn_12arg_s8i)(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, char, unsigned int, const char*);
typedef void (*volk_gnsssdr_fn_12arg_s8ic)(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, lv_8sc_t, unsigned int, const char*);
//ADDED BY GNSS-SDR. END
#endif //VOLK_QA_UTILS_H

View File

@ -0,0 +1,90 @@
/* -*- c++ -*- */
/*
* Copyright 2012-2014 Free Software Foundation, Inc.
*
* This file is part of GNU Radio
*
* GNU Radio is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3, or (at your option)
* any later version.
*
* GNU Radio is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with GNU Radio; see the file COPYING. If not, write to
* the Free Software Foundation, Inc., 51 Franklin Street,
* Boston, MA 02110-1301, USA.
*/
#include "qa_utils.h"
#include <volk_gnsssdr/volk_gnsssdr.h>
#include <boost/test/unit_test.hpp>
//VOLK PROTOKERNELS OBTAINED FROM THE GNURADIO BASE
// Each line registers one Boost test case via VOLK_RUN_TESTS(func, tol, scalar, len, iter):
// func = kernel name, tol = comparison tolerance, scalar = scalar argument passed to the kernel,
// len = vector length, iter = number of iterations.
VOLK_RUN_TESTS(volk_gnsssdr_32fc_x2_multiply_32fc, 1e-4, 0, 20462, 1);
VOLK_RUN_TESTS(volk_gnsssdr_32fc_x2_dot_prod_32fc, 1e-4, 0, 204603, 1);
VOLK_RUN_TESTS(volk_gnsssdr_32fc_s32fc_multiply_32fc, 1e-4, 0, 20462, 1);
VOLK_RUN_TESTS(volk_gnsssdr_32fc_conjugate_32fc, 1e-4, 0, 20462, 1);
VOLK_RUN_TESTS(volk_gnsssdr_32f_x2_add_32f, 1e-4, 0, 20462, 1);
VOLK_RUN_TESTS(volk_gnsssdr_32f_index_max_16u, 3, 0, 20462, 1);
VOLK_RUN_TESTS(volk_gnsssdr_32f_accumulator_s32f, 1e-4, 0, 20462, 1);
VOLK_RUN_TESTS(volk_gnsssdr_32fc_magnitude_squared_32f, 1e-4, 0, 20462, 1);
VOLK_RUN_TESTS(volk_gnsssdr_32f_s32f_convert_16i, 3, 0, 20462, 1);
//GNSS-SDR PROTO-KERNELS
// Same macro arguments as for every VOLK_RUN_TESTS call: (func, tol, scalar, len, iter).
// Basic 8-bit arithmetic kernels
VOLK_RUN_TESTS(volk_gnsssdr_8ic_x2_multiply_8ic, 1e-4, 0, 20462, 1);
VOLK_RUN_TESTS(volk_gnsssdr_8u_x2_multiply_8u, 1e-4, 0, 20462, 1);
VOLK_RUN_TESTS(volk_gnsssdr_8ic_x2_dot_prod_8ic, 1e-4, 0, 204603, 1);
VOLK_RUN_TESTS(volk_gnsssdr_8ic_s8ic_multiply_8ic, 1e-4, 0, 20462, 1);
VOLK_RUN_TESTS(volk_gnsssdr_8ic_conjugate_8ic, 1e-4, 0, 20462, 1);
VOLK_RUN_TESTS(volk_gnsssdr_8i_x2_add_8i, 1e-4, 0, 20462, 1);
VOLK_RUN_TESTS(volk_gnsssdr_8i_index_max_16u, 3, 0, 20462, 1);
VOLK_RUN_TESTS(volk_gnsssdr_8i_accumulator_s8i, 1e-4, 0, 20462, 1);
VOLK_RUN_TESTS(volk_gnsssdr_8ic_magnitude_squared_8i, 1e-4, 0, 20462, 1);
VOLK_RUN_TESTS(volk_gnsssdr_8i_max_s8i, 3, 0, 20462, 1);
VOLK_RUN_TESTS(volk_gnsssdr_64f_accumulator_64f, 3, 0, 20462, 1);
// Type conversion kernels
VOLK_RUN_TESTS(volk_gnsssdr_32fc_convert_16ic, 3, 0, 20462, 1);
VOLK_RUN_TESTS(volk_gnsssdr_32fc_s32f_convert_8ic, 3, 0, 20462, 1);
VOLK_RUN_TESTS(volk_gnsssdr_32fc_convert_8ic, 3, 0, 20462, 1);
VOLK_RUN_TESTS(volk_gnsssdr_16i_s32f_convert_32f, 3, 0, 20462, 1);
// Correlator kernels (cw_epl_corr / cw_vepl_corr families)
VOLK_RUN_TESTS(volk_gnsssdr_32fc_x5_cw_epl_corr_32fc_x3, 1e-4, 0, 20462, 1);
VOLK_RUN_TESTS(volk_gnsssdr_8ic_x5_cw_epl_corr_8ic_x3, 1e-4, 0, 20462, 1);
VOLK_RUN_TESTS(volk_gnsssdr_16ic_x5_cw_epl_corr_32fc_x3, 1e-4, 0, 20462, 1);
VOLK_RUN_TESTS(volk_gnsssdr_16ic_x5_cw_epl_corr_TEST_32fc_x3, 1e-4, 0, 20462, 1);
VOLK_RUN_TESTS(volk_gnsssdr_8ic_x5_cw_epl_corr_32fc_x3, 1e-4, 0, 20462, 1);
VOLK_RUN_TESTS(volk_gnsssdr_32fc_x7_cw_vepl_corr_32fc_x5, 1e-4, 0, 20462, 1);
VOLK_RUN_TESTS(volk_gnsssdr_16ic_x7_cw_vepl_corr_32fc_x5, 1e-4, 0, 20462, 1);
VOLK_RUN_TESTS(volk_gnsssdr_8ic_x7_cw_vepl_corr_safe_32fc_x5, 1e-4, 0, 20462, 1);
VOLK_RUN_TESTS(volk_gnsssdr_8ic_x7_cw_vepl_corr_unsafe_32fc_x5, 1e-4, 0, 20462, 1);
VOLK_RUN_TESTS(volk_gnsssdr_8ic_x7_cw_vepl_corr_32fc_x5, 1e-4, 0, 20462, 1);
VOLK_RUN_TESTS(volk_gnsssdr_8ic_x7_cw_vepl_corr_TEST_32fc_x5, 1e-4, 0, 20462, 1);
// Local code / carrier update kernels
VOLK_RUN_TESTS(volk_gnsssdr_32fc_s32f_x4_update_local_code_32fc, 1e-4, 0, 20462, 1);
VOLK_RUN_TESTS(volk_gnsssdr_s32f_x2_update_local_carrier_32fc, 1e-4, 0, 20462, 1);
//VOLK_RUN_TESTS(volk_gnsssdr_16i_x5_add_quad_16i_x4, 1e-4, 2046, 10000);
//VOLK_RUN_TESTS(volk_gnsssdr_16i_branch_4_state_8, 1e-4, 2046, 10000);
//VOLK_RUN_TESTS(volk_gnsssdr_16i_max_star_16i, 0, 0, 20462, 10000);
//VOLK_RUN_TESTS(volk_gnsssdr_16i_max_star_horizontal_16i, 0, 0, 20462, 10000);
//VOLK_RUN_TESTS(volk_gnsssdr_16i_permute_and_scalar_add, 1e-4, 0, 2046, 1000);
//VOLK_RUN_TESTS(volk_gnsssdr_16i_x4_quad_max_star_16i, 1e-4, 0, 2046, 1000);
//VOLK_RUN_TESTS(volk_gnsssdr_16i_32fc_dot_prod_32fc, 1e-4, 0, 204602, 1);
//VOLK_RUN_TESTS(volk_gnsssdr_32fc_x2_conjugate_dot_prod_32fc, 1e-4, 0, 2046, 10000);
//VOLK_RUN_TESTS(volk_gnsssdr_32fc_s32f_x2_power_spectral_density_32f, 1e-4, 2046, 10000);
//VOLK_RUN_TESTS(volk_gnsssdr_32f_s32f_32f_fm_detect_32f, 1e-4, 2046, 10000);
//VOLK_RUN_TESTS(volk_gnsssdr_32u_popcnt, 0, 0, 2046, 10000);
//VOLK_RUN_TESTS(volk_gnsssdr_64u_popcnt, 0, 0, 2046, 10000);

View File

@ -0,0 +1,142 @@
/* -*- c -*- */
/*
* Copyright 2014 Free Software Foundation, Inc.
*
* This file is part of GNU Radio
*
* GNU Radio is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3, or (at your option)
* any later version.
*
* GNU Radio is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with GNU Radio; see the file COPYING. If not, write to
* the Free Software Foundation, Inc., 51 Franklin Street,
* Boston, MA 02110-1301, USA.
*/
#include <pthread.h>
#include <volk_gnsssdr/volk_gnsssdr_malloc.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
/*
* For #defines used to determine support for allocation functions,
* see: http://linux.die.net/man/3/aligned_alloc
*/
// Disabling use of aligned_alloc. This function requires that size be
// a multiple of alignment, which is too restrictive for many uses of
// VOLK.
//// If we are using C11 standard, use the aligned_alloc
//#ifdef _ISOC11_SOURCE
//
//void *volk_gnsssdr_malloc(size_t size, size_t alignment)
//{
// void *ptr = aligned_alloc(alignment, size);
// if(ptr == NULL) {
// fprintf(stderr, "VOLK: Error allocating memory (aligned_alloc)\n");
// }
// return ptr;
//}
//
//void volk_gnsssdr_free(void *ptr)
//{
// free(ptr);
//}
//
//#else // _ISOC11_SOURCE
// Otherwise, test if we are a POSIX or X/Open system
// This only has a restriction that alignment be a power of 2.
#if _POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600 || HAVE_POSIX_MEMALIGN
/*
 * Allocate `size` bytes aligned to `alignment` using posix_memalign().
 *
 * alignment must be a power of two. posix_memalign() additionally requires
 * the alignment to be a multiple of sizeof(void *); smaller requests (for
 * example an alignment of 1) are therefore raised to sizeof(void *), which
 * still satisfies the caller's weaker requirement.
 *
 * Returns the aligned pointer, or NULL (after printing a diagnostic to
 * stderr) when the allocation fails. Release the memory with
 * volk_gnsssdr_free().
 */
void *volk_gnsssdr_malloc(size_t size, size_t alignment)
{
    void *ptr = NULL;
    int err;

    /* posix_memalign() fails with EINVAL for alignments below sizeof(void *). */
    if(alignment < sizeof(void *)) {
        alignment = sizeof(void *);
    }

    err = posix_memalign(&ptr, alignment, size);
    if(err != 0) {
        fprintf(stderr, "VOLK: Error allocating memory (posix_memalign: %d)\n", err);
        return NULL;
    }
    return ptr;
}
/* Release memory obtained from the posix_memalign()-based volk_gnsssdr_malloc(); it is handed straight to free(). */
void volk_gnsssdr_free(void *ptr)
{
free(ptr);
}
// _aligned_malloc has no restriction on size,
// available on Windows since Visual C++ 2005
#elif _MSC_VER >= 1400
/*
 * Allocate `size` bytes aligned to `alignment` through _aligned_malloc().
 * On failure a diagnostic is written to stderr and NULL is returned.
 * The result must be released with volk_gnsssdr_free().
 */
void *volk_gnsssdr_malloc(size_t size, size_t alignment)
{
    void *block = _aligned_malloc(size, alignment);

    if(!block) {
        fprintf(stderr, "VOLK: Error allocating memory (_aligned_malloc)\n");
    }

    /* NULL is passed through unchanged so callers can detect the failure. */
    return block;
}
/* Release memory obtained from the _aligned_malloc()-based volk_gnsssdr_malloc(); it is handed to _aligned_free(). */
void volk_gnsssdr_free(void *ptr)
{
_aligned_free(ptr);
}
// No standard handlers; we'll do it ourselves.
#else // _POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600 || HAVE_POSIX_MEMALIGN
/*
 * Book-keeping header stored immediately in front of every pointer handed
 * out by the fallback allocator. It remembers the address returned by
 * malloc() so that volk_gnsssdr_free() can release the real block.
 */
struct block_info
{
    void *real;
};

/*
 * Portable fallback: allocate `size` bytes aligned to `alignment` on top of
 * plain malloc().
 *
 * alignment must be a power of two (the address is rounded with a bit mask);
 * values smaller than sizeof(struct block_info) are raised to that size so
 * the header always fits in front of the user pointer.
 *
 * Returns the aligned pointer, or NULL (after printing a diagnostic to
 * stderr) when the request overflows or malloc() fails.
 */
void *
volk_gnsssdr_malloc(size_t size, size_t alignment)
{
    void *real, *user;
    struct block_info *info;
    size_t padding;

    /* At least align to sizeof our struct */
    if (alignment < sizeof(struct block_info))
        alignment = sizeof(struct block_info);

    /* Room for the header plus the worst-case alignment slack. */
    padding = 2 * alignment - 1;
    if (size > SIZE_MAX - padding) {
        fprintf(stderr, "VOLK: Error allocating memory (malloc)\n");
        return NULL;
    }

    /* Alloc */
    real = malloc(size + padding);
    if (real == NULL) {
        fprintf(stderr, "VOLK: Error allocating memory (malloc)\n");
        return NULL;
    }

    /* Get pointer to the various zones */
    user = (void *)((((uintptr_t) real) + sizeof(struct block_info) + alignment - 1) & ~(alignment - 1));
    info = (struct block_info *)(((uintptr_t)user) - sizeof(struct block_info));

    /* Store the info for the free */
    info->real = real;

    /* Return pointer to user */
    return user;
}

/*
 * Release a pointer obtained from the fallback volk_gnsssdr_malloc().
 * Passing NULL is a no-op, mirroring free().
 */
void
volk_gnsssdr_free(void *ptr)
{
    struct block_info *info;

    if (ptr == NULL)
        return;

    /* Get the real pointer */
    info = (struct block_info *)(((uintptr_t)ptr) - sizeof(struct block_info));

    /* Release real pointer */
    free(info->real);
}
#endif // _POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600 || HAVE_POSIX_MEMALIGN
//#endif // _ISOC11_SOURCE

View File

@ -0,0 +1,50 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <volk_gnsssdr/volk_gnsssdr_prefs.h>
//#if defined(_WIN32)
//#include <Windows.h>
//#endif
/*
 * Write the location of the per-user preferences file into `path`:
 * "<home>/.volk_gnsssdr/volk_gnsssdr_config", where <home> is taken from the
 * HOME environment variable or, failing that, from APPDATA.
 *
 * When neither variable is set, `path` is set to the empty string so the
 * caller can detect the failure (assigning NULL to the parameter would only
 * change the local copy and leave the caller's buffer untouched).
 *
 * The caller must supply a buffer large enough for the home directory plus
 * the suffix; this function has no way to know the buffer size.
 */
void volk_gnsssdr_get_config_path(char *path)
{
    const char *suffix = "/.volk_gnsssdr/volk_gnsssdr_config";
    const char *home;

    if (path == NULL) return;

    home = getenv("HOME");
    if (home == NULL) home = getenv("APPDATA");
    if (home == NULL) {
        /* No usable base directory: report "no path" through the buffer itself. */
        path[0] = '\0';
        return;
    }

    strcpy(path, home);
    strcat(path, suffix);
}
/*
 * Read the per-user preferences file and return its entries.
 *
 * Each accepted line has the form "<kernel> <aligned impl> <unaligned impl>"
 * where <kernel> starts with "volk_gnsssdr_". Lines that do not match are
 * skipped.
 *
 * On success *prefs_res receives a heap array (grown with realloc) holding
 * the accepted entries and the number of entries is returned. When no config
 * path can be determined or the file cannot be opened, 0 is returned and
 * *prefs_res is left untouched.
 */
size_t volk_gnsssdr_load_preferences(volk_gnsssdr_arch_pref_t **prefs_res)
{
    static const char kernel_prefix[] = "volk_gnsssdr_";
    FILE *config_file;
    char path[512], line[512];
    char scan_fmt[64];
    size_t n_arch_prefs = 0;
    volk_gnsssdr_arch_pref_t *prefs = NULL;

    /* Get the config path. The buffer starts out empty so that a failed
     * lookup is detectable: `path` is an array, comparing it against NULL
     * can never be true. */
    path[0] = '\0';
    volk_gnsssdr_get_config_path(path);
    if (path[0] == '\0') return n_arch_prefs; //no prefs found

    config_file = fopen(path, "r");
    if(!config_file) return n_arch_prefs; //no prefs found

    /* Bound every %s conversion by the size of its destination field so an
     * over-long token in the config file cannot overflow the entry.
     * NOTE(review): this relies on name/impl_a/impl_u being char arrays in
     * volk_gnsssdr_arch_pref_t, as the in-place sscanf() already implies. */
    sprintf(scan_fmt, "%%%us %%%us %%%us",
            (unsigned) (sizeof(prefs->name) - 1),
            (unsigned) (sizeof(prefs->impl_a) - 1),
            (unsigned) (sizeof(prefs->impl_u) - 1));

    /* Parse the file line by line into the growing prefs array. */
    while(fgets(line, sizeof(line), config_file) != NULL)
    {
        volk_gnsssdr_arch_pref_t *p;
        volk_gnsssdr_arch_pref_t *grown =
            (volk_gnsssdr_arch_pref_t *) realloc(prefs, (n_arch_prefs+1) * sizeof(*prefs));
        if(grown == NULL)
        {
            /* Keep the entries parsed so far instead of dereferencing NULL. */
            fprintf(stderr, "VOLK: Error allocating memory (realloc)\n");
            break;
        }
        prefs = grown;
        p = prefs + n_arch_prefs;

        /* The slot is only kept when the line parses and names one of our kernels;
         * otherwise it is overwritten by the next line. */
        if(sscanf(line, scan_fmt, p->name, p->impl_a, p->impl_u) == 3
           && !strncmp(p->name, kernel_prefix, sizeof(kernel_prefix) - 1))
        {
            n_arch_prefs++;
        }
    }
    fclose(config_file);
    *prefs_res = prefs;
    return n_arch_prefs;
}

View File

@ -0,0 +1,119 @@
/*
* Copyright 2011-2012 Free Software Foundation, Inc.
*
* This file is part of GNU Radio
*
* GNU Radio is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3, or (at your option)
* any later version.
*
* GNU Radio is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with GNU Radio; see the file COPYING. If not, write to
* the Free Software Foundation, Inc., 51 Franklin Street,
* Boston, MA 02110-1301, USA.
*/
#include <volk_gnsssdr_rank_archs.h>
#include <volk_gnsssdr/volk_gnsssdr_prefs.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * __popcnt(x): number of set bits in x.
 * GCC >= 3.4 provides a builtin; every other compiler gets a portable
 * fallback. The fallback is `static inline` so that each translation unit
 * has its own definition: a plain C99 `inline` function provides no external
 * definition and can fail to link when the call is not inlined.
 */
#if __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4)
#define __popcnt __builtin_popcount
#else
static inline unsigned __popcnt(unsigned num)
{
    unsigned pop = 0;
    while(num)
    {
        if (num & 0x1) pop++;
        num >>= 1;
    }
    return pop;
}
#endif
/*
 * Look up an implementation by name.
 *
 * Returns the index of the first entry of impl_names whose first 20
 * characters match impl_name. When there is no match a warning is printed
 * and the index of the "generic" implementation is returned instead.
 * If "generic" itself is not in the list, -1 is returned (previously this
 * case recursed without end).
 */
int volk_gnsssdr_get_index(
    const char *impl_names[], //list of implementations by name
    const size_t n_impls,     //number of implementations available
    const char *impl_name     //the implementation name to find
){
    size_t i;
    for (i = 0; i < n_impls; i++) {
        if(!strncmp(impl_names[i], impl_name, 20)) {
            return (int) i;
        }
    }

    /* The fallback lookup itself failed: stop here instead of recursing forever. */
    if(!strncmp(impl_name, "generic", 20)) {
        printf("Volk warning: no generic impl found\n");
        return -1;
    }

    //TODO return -1;
    //something terrible should happen here
    printf("Volk warning: no arch found, returning generic impl\n");
    return volk_gnsssdr_get_index(impl_names, n_impls, "generic"); //but we'll fake it for now
}
/*
 * Choose which implementation of a kernel to use and return its index.
 *
 * Selection order:
 *   1. if the VOLK_GENERIC environment variable is set, the "generic"
 *      implementation;
 *   2. otherwise, the implementation named for this kernel in the user's
 *      preferences (loaded once, on the first call);
 *   3. otherwise, the implementation whose dependency mask has the most bits
 *      set -- the best aligned one when `align` is true and one exists, else
 *      the best unaligned one.
 */
int volk_gnsssdr_rank_archs(
    const char *kern_name,    //name of the kernel to rank
    const char *impl_names[], //list of implementations by name
    const int* impl_deps,     //requirement mask per implementation
    const bool* alignment,    //alignment status of each implementation
    size_t n_impls,           //number of implementations available
    const bool align          //if false, filter aligned implementations
){
    static volk_gnsssdr_arch_pref_t *volk_gnsssdr_arch_prefs;
    static size_t n_arch_prefs = 0;
    static int prefs_loaded = 0;
    size_t idx;

    /* The preferences file is read only once per process. */
    if(prefs_loaded == 0) {
        n_arch_prefs = volk_gnsssdr_load_preferences(&volk_gnsssdr_arch_prefs);
        prefs_loaded = 1;
    }

    /* Defining VOLK_GENERIC (to any value) forces the 'generic' kernel.
     * Used in GR's QA code. */
    if(getenv("VOLK_GENERIC") != NULL) {
        return volk_gnsssdr_get_index(impl_names, n_impls, "generic");
    }

    /* An explicit user preference for this kernel wins over the ranking. */
    for(idx = 0; idx < n_arch_prefs; ++idx)
    {
        const volk_gnsssdr_arch_pref_t *pref = &volk_gnsssdr_arch_prefs[idx];
        if(strncmp(kern_name, pref->name, sizeof(pref->name)) != 0) {
            continue;
        }
        return volk_gnsssdr_get_index(impl_names, n_impls,
                                      align ? pref->impl_a : pref->impl_u);
    }

    /* No preference: rank by the number of bits set in the dependency mask. */
    size_t winner_aligned = 0;
    size_t winner_unaligned = 0;
    int score_aligned = -1;
    int score_unaligned = -1;
    for(idx = 0; idx < n_impls; ++idx)
    {
        const signed score = __popcnt(impl_deps[idx]);
        if(alignment[idx]) {
            if(score > score_aligned) {
                winner_aligned = idx;
                score_aligned = score;
            }
        } else if(score > score_unaligned) {
            winner_unaligned = idx;
            score_unaligned = score;
        }
    }

    /* Use the aligned winner only when alignment was requested and one exists. */
    if(align && score_aligned != -1) {
        return winner_aligned;
    }
    return winner_unaligned;
}

View File

@ -0,0 +1,50 @@
/*
* Copyright 2011-2012 Free Software Foundation, Inc.
*
* This file is part of GNU Radio
*
* GNU Radio is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3, or (at your option)
* any later version.
*
* GNU Radio is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with GNU Radio; see the file COPYING. If not, write to
* the Free Software Foundation, Inc., 51 Franklin Street,
* Boston, MA 02110-1301, USA.
*/
#ifndef INCLUDED_VOLK_RANK_ARCHS_H
#define INCLUDED_VOLK_RANK_ARCHS_H
#include <stdlib.h>
#include <stdbool.h>
#ifdef __cplusplus
extern "C" {
#endif
/*
 * Returns the index of impl_name within impl_names. When the name is not
 * present, the implementation prints a warning and falls back to looking up
 * the "generic" entry.
 */
int volk_gnsssdr_get_index(
const char *impl_names[], //list of implementations by name
const size_t n_impls, //number of implementations available
const char *impl_name //the implementation name to find
);
/*
 * Returns the index of the implementation to use for kern_name: the
 * "generic" one when the VOLK_GENERIC environment variable is set, else the
 * one named in the user's preferences, else the one whose requirement mask
 * has the most bits set (aligned or unaligned depending on `align`).
 */
int volk_gnsssdr_rank_archs(
const char *kern_name, //name of the kernel to rank
const char *impl_names[], //list of implementations by name
const int* impl_deps, //requirement mask per implementation
const bool* alignment, //alignment status of each implementation
size_t n_impls, //number of implementations available
const bool align //if false, filter aligned implementations
);
#ifdef __cplusplus
}
#endif
#endif /*INCLUDED_VOLK_RANK_ARCHS_H*/

View File

@ -0,0 +1,25 @@
# Magnitude of complex 16-bit integer samples, written as 32-bit floats.
# Both components are divided by the float parameter `scalar` before being squared.
.function volk_gnsssdr_16ic_magnitude_32f_a_orc_impl
.source 4 src
.dest 4 dst
.floatparam 4 scalar
.temp 4 reall
.temp 4 imagl
.temp 2 reals
.temp 2 imags
.temp 4 realf
.temp 4 imagf
.temp 4 sumf
# Split each 32-bit item into its two 16-bit halves.
splitlw reals, imags, src
# Widen both halves to signed 32-bit integers, then convert them to float.
convswl reall, reals
convswl imagl, imags
convlf realf, reall
convlf imagf, imagl
# Scale both components.
divf realf, realf, scalar
divf imagf, imagf, scalar
# dst = sqrt(real^2 + imag^2)
mulf realf, realf, realf
mulf imagf, imagf, imagf
addf sumf, realf, imagf
sqrtf dst, sumf

View File

@ -0,0 +1,5 @@
# Element-wise sum of two 32-bit float vectors: dst = src1 + src2.
.function volk_gnsssdr_32f_x2_add_32f_a_orc_impl
.dest 4 dst
.source 4 src1
.source 4 src2
addf dst, src1, src2

View File

@ -0,0 +1,18 @@
# Multiplies every complex float item of src1 by the complex parameter `scalar`.
# Each 8-byte item holds two 32-bit floats.
.function volk_gnsssdr_32fc_s32fc_multiply_32fc_a_orc_impl
.source 8 src1
.floatparam 8 scalar
.dest 8 dst
.temp 8 iqprod
.temp 4 real
.temp 4 imag
.temp 4 ac
.temp 4 bd
.temp 8 swapped
# Products of matching halves (a*c and b*d); real part = ac - bd.
x2 mulf iqprod, src1, scalar
splitql bd, ac, iqprod
subf real, ac, bd
# Swap the halves of src1 to get the cross products; imaginary part = their sum.
swaplq swapped, src1
x2 mulf iqprod, swapped, scalar
splitql bd, ac, iqprod
addf imag, ac, bd
# Pack (real, imag) back into one 8-byte item.
mergelq dst, real, imag

View File

@ -0,0 +1,18 @@
# Element-wise product of two complex float vectors: dst = src1 * src2.
# Each 8-byte item holds two 32-bit floats.
.function volk_gnsssdr_32fc_x2_multiply_32fc_a_orc_impl
.source 8 src1
.source 8 src2
.dest 8 dst
.temp 8 iqprod
.temp 4 real
.temp 4 imag
.temp 4 ac
.temp 4 bd
.temp 8 swapped
# Products of matching halves (a*c and b*d); real part = ac - bd.
x2 mulf iqprod, src1, src2
splitql bd, ac, iqprod
subf real, ac, bd
# Swap the halves of src1 to get the cross products; imaginary part = their sum.
swaplq swapped, src1
x2 mulf iqprod, swapped, src2
splitql bd, ac, iqprod
addf imag, ac, bd
# Pack (real, imag) back into one 8-byte item.
mergelq dst, real, imag

View File

@ -0,0 +1,40 @@
#/*!
# * \file volk_gnsssdr_8i_accumulator_s8i.orc
# * \brief ORC implementation: 8 bits (char) scalar accumulator
# * \authors <ul>
# * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
# * </ul>
# *
# * ORC code that implements an accumulator of char values
# *
# * -------------------------------------------------------------------------
# *
# * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
# *
# * GNSS-SDR is a software defined Global Navigation
# * Satellite Systems receiver
# *
# * This file is part of GNSS-SDR.
# *
# * GNSS-SDR is free software: you can redistribute it and/or modify
# * it under the terms of the GNU General Public License as published by
# * the Free Software Foundation, either version 3 of the License, or
# * (at your option) any later version.
# *
# * GNSS-SDR is distributed in the hope that it will be useful,
# * but WITHOUT ANY WARRANTY; without even the implied warranty of
# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# * GNU General Public License for more details.
# *
# * You should have received a copy of the GNU General Public License
# * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
# *
# * -------------------------------------------------------------------------
# */
# Accumulates 8-bit source items into a 16-bit accumulator.
.function volk_gnsssdr_8i_accumulator_s8i_a_orc_impl
.source 1 src1
.accumulator 2 acc
.temp 2 sum
# Build a 16-bit word from a zero byte and the source byte so it can be fed to the word accumulator.
# NOTE(review): confirm against the ORC mergebw definition which half of the word receives src1,
# and that the sign of src1 is handled as intended.
mergebw sum, 0, src1
accw acc, sum

View File

@ -0,0 +1,39 @@
#/*!
# * \file volk_gnsssdr_8i_x2_add_8i.orc
# * \brief ORC implementation: adds pairs of 8 bits (char) scalars
# * \authors <ul>
# * <li> Andrés Cecilia, 2014. a.cecilia.luque(at)gmail.com
# * </ul>
# *
# * ORC code that adds pairs of 8 bits (char) scalars
# *
# * -------------------------------------------------------------------------
# *
# * Copyright (C) 2010-2014 (see AUTHORS file for a list of contributors)
# *
# * GNSS-SDR is a software defined Global Navigation
# * Satellite Systems receiver
# *
# * This file is part of GNSS-SDR.
# *
# * GNSS-SDR is free software: you can redistribute it and/or modify
# * it under the terms of the GNU General Public License as published by
# * the Free Software Foundation, either version 3 of the License, or
# * (at your option) any later version.
# *
# * GNSS-SDR is distributed in the hope that it will be useful,
# * but WITHOUT ANY WARRANTY; without even the implied warranty of
# * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# * GNU General Public License for more details.
# *
# * You should have received a copy of the GNU General Public License
# * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
# *
# * -------------------------------------------------------------------------
# */
# Element-wise sum of two 8-bit vectors: dst = src1 + src2.
.function volk_gnsssdr_8i_x2_add_8i_a_orc_impl
.dest 1 dst
.source 1 src1
.source 1 src2
addb dst, src1, src2

Some files were not shown because too many files have changed in this diff Show More