From 891478cf2c2097010571e55ad9b3c588fd68caff Mon Sep 17 00:00:00 2001
From: Carles Fernandez
Date: Sat, 3 Mar 2018 12:09:45 +0100
Subject: [PATCH] Apply automated code formatting to volk-gnsssdr

See http://gnss-sdr.org/coding-style/#use-tools-for-automated-code-formatting
---
 .../apps/volk_gnsssdr-config-info.cc | 52 +-
 .../apps/volk_gnsssdr_option_helpers.cc | 269 ++-
 .../apps/volk_gnsssdr_option_helpers.h | 9 +-
 .../volk_gnsssdr/apps/volk_gnsssdr_profile.cc | 378 +--
 .../volk_gnsssdr/apps/volk_gnsssdr_profile.h | 8 +-
 .../volk_gnsssdr/saturation_arithmetic.h | 4 +-
 .../volk_gnsssdr_avx_intrinsics.h | 51 +-
 .../volk_gnsssdr/volk_gnsssdr_common.h | 140 +-
 .../volk_gnsssdr/volk_gnsssdr_complex.h | 36 +-
 .../volk_gnsssdr_neon_intrinsics.h | 28 +-
 .../include/volk_gnsssdr/volk_gnsssdr_prefs.h | 6 +-
 .../volk_gnsssdr/volk_gnsssdr_sine_table.h | 2048 ++++++++----------
 .../volk_gnsssdr_sse3_intrinsics.h | 34 +-
 .../volk_gnsssdr_sse_intrinsics.h | 24 +-
 .../volk_gnsssdr_16i_resamplerxnpuppet_16i.h | 181 +-
 .../volk_gnsssdr_16i_xn_resampler_16i_xn.h | 77 +-
 ...nsssdr_16ic_16i_rotator_dot_prod_16ic_xn.h | 1610 ++++++-------
 ...dr_16ic_16i_rotator_dotprodxnpuppet_16ic.h | 261 ++-
 .../volk_gnsssdr_16ic_conjugate_16ic.h | 3 +-
 .../volk_gnsssdr_16ic_convert_32fc.h | 24 +-
 .../volk_gnsssdr_16ic_resampler_fast_16ic.h | 169 +-
 ...lk_gnsssdr_16ic_resamplerfastpuppet_16ic.h | 10 +-
 ..._gnsssdr_16ic_resamplerfastxnpuppet_16ic.h | 88 +-
 ...volk_gnsssdr_16ic_resamplerxnpuppet_16ic.h | 180 +-
 .../volk_gnsssdr_16ic_s32fc_x2_rotator_16ic.h | 562 ++---
 .../volk_gnsssdr_16ic_x2_dot_prod_16ic.h | 115 +-
 .../volk_gnsssdr_16ic_x2_dot_prod_16ic_xn.h | 215 +-
 ...olk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic.h | 192 +-
 .../volk_gnsssdr_16ic_x2_multiply_16ic.h | 111 +-
 ...gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn.h | 914 ++++----
 ...sdr_16ic_x2_rotator_dotprodxnpuppet_16ic.h | 90 +-
 .../volk_gnsssdr_16ic_xn_resampler_16ic_xn.h | 77 +-
 ...k_gnsssdr_16ic_xn_resampler_fast_16ic_xn.h | 185 +-
 .../volk_gnsssdr_32f_index_max_32u.h | 202 +-
 .../volk_gnsssdr_32f_resamplerxnpuppet_32f.h | 180 +-
 .../volk_gnsssdr_32f_sincos_32fc.h | 200 +-
 .../volk_gnsssdr_32f_xn_resampler_32f_xn.h | 94 +-
 ...nsssdr_32fc_32f_rotator_dot_prod_32fc_xn.h | 207 +-
 ...dr_32fc_32f_rotator_dotprodxnpuppet_32fc.h | 49 +-
 .../volk_gnsssdr_32fc_convert_16ic.h | 112 +-
 .../volk_gnsssdr_32fc_convert_8ic.h | 106 +-
 ...volk_gnsssdr_32fc_resamplerxnpuppet_32fc.h | 223 +-
 ...gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn.h | 148 +-
 ...sdr_32fc_x2_rotator_dotprodxnpuppet_32fc.h | 56 +-
 .../volk_gnsssdr_32fc_xn_resampler_32fc_xn.h | 114 +-
 .../volk_gnsssdr_64f_accumulator_64f.h | 46 +-
 .../volk_gnsssdr_8i_accumulator_s8i.h | 42 +-
 .../volk_gnsssdr_8i_index_max_16u.h | 146 +-
 .../volk_gnsssdr/volk_gnsssdr_8i_max_s8i.h | 108 +-
 .../volk_gnsssdr/volk_gnsssdr_8i_x2_add_8i.h | 26 +-
 .../volk_gnsssdr_8ic_conjugate_8ic.h | 13 +-
 .../volk_gnsssdr_8ic_magnitude_squared_8i.h | 48 +-
 .../volk_gnsssdr_8ic_s8ic_multiply_8ic.h | 6 +-
 .../volk_gnsssdr_8ic_x2_dot_prod_8ic.h | 79 +-
 .../volk_gnsssdr_8ic_x2_multiply_8ic.h | 12 +-
 .../volk_gnsssdr_8u_x2_multiply_8u.h | 16 +-
 .../volk_gnsssdr_s32f_sincos_32fc.h | 446 ++--
 .../volk_gnsssdr_s32f_sincospuppet_32fc.h | 16 +-
 .../volk_gnsssdr/lib/kernel_tests.h | 26 +-
 .../volk_gnsssdr/lib/qa_utils.cc | 1012 ++++----
 .../volk_gnsssdr/lib/qa_utils.h | 226 +-
 .../volk_gnsssdr/lib/testqa.cc | 112 +-
 .../volk_gnsssdr/lib/volk_gnsssdr_malloc.c | 18 +-
 .../volk_gnsssdr/lib/volk_gnsssdr_prefs.c | 23 +-
 .../lib/volk_gnsssdr_rank_archs.c | 38 +-
 .../lib/volk_gnsssdr_rank_archs.h | 29 +-
 .../volk_gnsssdr/tmpl/volk_gnsssdr.tmpl.c | 193 +-
 .../volk_gnsssdr/tmpl/volk_gnsssdr.tmpl.h | 14 +-
 .../tmpl/volk_gnsssdr_config_fixed.tmpl.h | 3 +-
 .../volk_gnsssdr/tmpl/volk_gnsssdr_cpu.tmpl.c | 165 +-
 .../volk_gnsssdr/tmpl/volk_gnsssdr_cpu.tmpl.h | 9 +-
 .../tmpl/volk_gnsssdr_machine_xxx.tmpl.c | 10 +-
 .../tmpl/volk_gnsssdr_machines.tmpl.c | 4 +-
 .../tmpl/volk_gnsssdr_machines.tmpl.h | 32 +-
 .../tmpl/volk_gnsssdr_typedefs.tmpl.h | 2 +-
 75 files changed, 6642 insertions(+), 6120 deletions(-)
 mode change 100755 => 100644 src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_8ic.h

diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/apps/volk_gnsssdr-config-info.cc b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/apps/volk_gnsssdr-config-info.cc
index 3a2c7c39f..60c421be3 100644
--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/apps/volk_gnsssdr-config-info.cc
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/apps/volk_gnsssdr-config-info.cc
@@ -20,30 +20,30 @@ #include
 #endif

-#include "volk_gnsssdr/volk_gnsssdr.h"   // for volk_gnsssdr_get_alignment, volk_gnsssdr_get_machine
-#include "volk_gnsssdr_option_helpers.h" // for option_list, option_t
-#include   // for volk_gnsssdr_available_machines, volk_gnsssdr_c_compiler ...
-#include <iostream>  // for operator<<, endl, cout, ostream
-#include <string>    // for string
+#include "volk_gnsssdr/volk_gnsssdr.h"    // for volk_gnsssdr_get_alignment, volk_gnsssdr_get_machine
+#include "volk_gnsssdr_option_helpers.h"  // for option_list, option_t
+#include   // for volk_gnsssdr_available_machines, volk_gnsssdr_c_compiler ...
+#include <iostream>  // for operator<<, endl, cout, ostream
+#include <string>    // for string

 void print_alignment()
 {
-    std::cout << "Alignment in bytes: " << volk_gnsssdr_get_alignment() << std::endl;
+    std::cout << "Alignment in bytes: " << volk_gnsssdr_get_alignment() << std::endl;
 }

 void print_malloc()
 {
-    // You don't want to change the volk_malloc code, so just copy the if/else
-    // structure from there and give an explanation for the implementations
-    std::cout << "Used malloc implementation: ";
-    #if _POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600 || HAVE_POSIX_MEMALIGN
-    std::cout << "posix_memalign" << std::endl;
-    #elif _MSC_VER >= 1400
-    std::cout << "aligned_malloc" << std::endl;
-    #else
-    std::cout << "No standard handler available, using own implementation." << std::endl;
-    #endif
+    // You don't want to change the volk_malloc code, so just copy the if/else
+    // structure from there and give an explanation for the implementations
+    std::cout << "Used malloc implementation: ";
+#if _POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600 || HAVE_POSIX_MEMALIGN
+    std::cout << "posix_memalign" << std::endl;
+#elif _MSC_VER >= 1400
+    std::cout << "aligned_malloc" << std::endl;
+#else
+    std::cout << "No standard handler available, using own implementation."
<< std::endl; +#endif } @@ -54,22 +54,24 @@ int main(int argc, char **argv) our_options.add(option_t("cc", "", "print the VOLK_GNSSDR C compiler version", volk_gnsssdr_c_compiler())); our_options.add(option_t("cflags", "", "print the VOLK_GNSSSDR CFLAGS", volk_gnsssdr_compiler_flags())); our_options.add(option_t("all-machines", "", "print VOLK_GNSSSDR machines built", volk_gnsssdr_available_machines())); - our_options.add(option_t("avail-machines", "", "print VOLK_GNSSSDR machines on the current " - "platform", volk_gnsssdr_list_machines)); + our_options.add(option_t("avail-machines", "", + "print VOLK_GNSSSDR machines on the current " + "platform", + volk_gnsssdr_list_machines)); our_options.add(option_t("machine", "", "print the current VOLK_GNSSSDR machine that will be used", - volk_gnsssdr_get_machine())); + volk_gnsssdr_get_machine())); our_options.add(option_t("alignment", "", "print the memory alignment", print_alignment)); our_options.add(option_t("malloc", "", "print the malloc implementation used in volk_gnsssdr_malloc", - print_malloc)); + print_malloc)); our_options.add(option_t("version", "v", "print the VOLK_GNSSSDR version", volk_gnsssdr_version())); try - { + { our_options.parse(argc, argv); - } - catch(...) - { + } + catch (...) + { return 1; - } + } return 0; } diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/apps/volk_gnsssdr_option_helpers.cc b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/apps/volk_gnsssdr_option_helpers.cc index 61e085423..a6a263a20 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/apps/volk_gnsssdr_option_helpers.cc +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/apps/volk_gnsssdr_option_helpers.cc @@ -17,157 +17,182 @@ */ #include "volk_gnsssdr_option_helpers.h" -#include // IWYU pragma: keep -#include // IWYU pragma: keep -#include // IWYU pragma: keep -#include // for exception -#include // for operator<<, endl, basic_ostream, cout, ostream -#include // for pair - +#include // IWYU pragma: keep +#include // IWYU pragma: keep +#include // IWYU pragma: keep +#include // for exception +#include // for operator<<, endl, basic_ostream, cout, ostream +#include // for pair /* * Option type */ option_t::option_t(std::string longform, std::string shortform, std::string msg, void (*callback)()) - : longform("--" + longform), - shortform("-" + shortform), - msg(msg), - callback(callback) { option_type = VOID_CALLBACK; } + : longform("--" + longform), + shortform("-" + shortform), + msg(msg), + callback(callback) { option_type = VOID_CALLBACK; } option_t::option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(int)) - : longform("--" + longform), - shortform("-" + shortform), - msg(msg), - callback((void (*)()) callback) { option_type = INT_CALLBACK; } + : longform("--" + longform), + shortform("-" + shortform), + msg(msg), + callback((void (*)())callback) { option_type = INT_CALLBACK; } option_t::option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(float)) - : longform("--" + longform), - shortform("-" + shortform), - msg(msg), - callback((void (*)()) callback) { option_type = FLOAT_CALLBACK; } + : longform("--" + longform), + shortform("-" + shortform), + msg(msg), + callback((void (*)())callback) { option_type = FLOAT_CALLBACK; } option_t::option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(bool)) - : longform("--" + longform), - shortform("-" + shortform), - msg(msg), - callback((void (*)()) callback) { 
option_type = BOOL_CALLBACK; } + : longform("--" + longform), + shortform("-" + shortform), + msg(msg), + callback((void (*)())callback) { option_type = BOOL_CALLBACK; } option_t::option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(std::string)) - : longform("--" + longform), - shortform("-" + shortform), - msg(msg), - callback((void (*)()) callback) { option_type = STRING_CALLBACK; } + : longform("--" + longform), + shortform("-" + shortform), + msg(msg), + callback((void (*)())callback) { option_type = STRING_CALLBACK; } option_t::option_t(std::string longform, std::string shortform, std::string msg, std::string printval) - : longform("--" + longform), - shortform("-" + shortform), - msg(msg), - printval(printval) { option_type = STRING; } + : longform("--" + longform), + shortform("-" + shortform), + msg(msg), + printval(printval) { option_type = STRING; } /* * Option List */ -option_list::option_list(std::string program_name) : - program_name(program_name) { - { internal_list = std::vector(); } -} - -void option_list::add(const option_t & opt) { internal_list.push_back(opt); } - -void option_list::parse(int argc, char **argv) { - for (int arg_number = 0; arg_number < argc; ++arg_number) { - for (std::vector::iterator this_option = internal_list.begin(); - this_option != internal_list.end(); - this_option++) { - if (this_option->longform == std::string(argv[arg_number]) || - this_option->shortform == std::string(argv[arg_number])) { - switch (this_option->option_type) { - case VOID_CALLBACK: - this_option->callback(); - break; - case INT_CALLBACK: - try { - int int_val = std::stoi(argv[++arg_number]); - ((void (*)(int)) this_option->callback)(int_val); - } catch (std::exception &exc) { - std::cout << "An int option can only receive a number" << std::endl; - throw std::exception(); - }; - break; - case FLOAT_CALLBACK: - try { - int int_val = std::stof(argv[++arg_number]); - ((void (*)(float)) this_option->callback)(int_val); - } catch (std::exception &exc) { - std::cout << "A float option can only receive a number" << std::endl; - throw std::exception(); - }; - break; - case BOOL_CALLBACK: - try { - bool int_val = (bool) std::stoi(argv[++arg_number]); - ((void (*)(bool)) this_option->callback)(int_val); - } catch (std::exception &exc) { - std::cout << "A bool option can only receive 0 or 1" << std::endl; - throw std::exception(); - }; - break; - case STRING_CALLBACK: - try { - ((void (*)(std::string)) this_option->callback)(argv[++arg_number]); - } catch (std::exception &exc) { - throw std::exception(); - }; - break; - case STRING: - std::cout << this_option->printval << std::endl; - break; - default: - this_option->callback(); - break; - } - } - } - if (std::string("--help") == std::string(argv[arg_number]) || - std::string("-h") == std::string(argv[arg_number])) { - help(); - } +option_list::option_list(std::string program_name) : program_name(program_name) +{ + { + internal_list = std::vector(); } } -void option_list::help() { +void option_list::add(const option_t &opt) { internal_list.push_back(opt); } + +void option_list::parse(int argc, char **argv) +{ + for (int arg_number = 0; arg_number < argc; ++arg_number) + { + for (std::vector::iterator this_option = internal_list.begin(); + this_option != internal_list.end(); + this_option++) + { + if (this_option->longform == std::string(argv[arg_number]) || + this_option->shortform == std::string(argv[arg_number])) + { + switch (this_option->option_type) + { + case VOID_CALLBACK: + 
this_option->callback(); + break; + case INT_CALLBACK: + try + { + int int_val = std::stoi(argv[++arg_number]); + ((void (*)(int))this_option->callback)(int_val); + } + catch (std::exception &exc) + { + std::cout << "An int option can only receive a number" << std::endl; + throw std::exception(); + }; + break; + case FLOAT_CALLBACK: + try + { + int int_val = std::stof(argv[++arg_number]); + ((void (*)(float))this_option->callback)(int_val); + } + catch (std::exception &exc) + { + std::cout << "A float option can only receive a number" << std::endl; + throw std::exception(); + }; + break; + case BOOL_CALLBACK: + try + { + bool int_val = (bool)std::stoi(argv[++arg_number]); + ((void (*)(bool))this_option->callback)(int_val); + } + catch (std::exception &exc) + { + std::cout << "A bool option can only receive 0 or 1" << std::endl; + throw std::exception(); + }; + break; + case STRING_CALLBACK: + try + { + ((void (*)(std::string))this_option->callback)(argv[++arg_number]); + } + catch (std::exception &exc) + { + throw std::exception(); + }; + break; + case STRING: + std::cout << this_option->printval << std::endl; + break; + default: + this_option->callback(); + break; + } + } + } + if (std::string("--help") == std::string(argv[arg_number]) || + std::string("-h") == std::string(argv[arg_number])) + { + help(); + } + } +} + +void option_list::help() +{ std::cout << program_name << std::endl; std::cout << " -h [ --help ] \t\tDisplay this help message" << std::endl; for (std::vector::iterator this_option = internal_list.begin(); this_option != internal_list.end(); - this_option++) { - std::string help_line(" "); - if (this_option->shortform == "-") { - help_line += this_option->longform + " "; - } else { - help_line += this_option->shortform + " [ " + this_option->longform + " ]"; - } + this_option++) + { + std::string help_line(" "); + if (this_option->shortform == "-") + { + help_line += this_option->longform + " "; + } + else + { + help_line += this_option->shortform + " [ " + this_option->longform + " ]"; + } - switch (help_line.size() / 8) { - case 0: - help_line += "\t\t\t\t"; - break; - case 1: - help_line += "\t\t\t"; - break; - case 2: - help_line += "\t\t"; - break; - case 3: - help_line += "\t"; - break; - default: - break; + switch (help_line.size() / 8) + { + case 0: + help_line += "\t\t\t\t"; + break; + case 1: + help_line += "\t\t\t"; + break; + case 2: + help_line += "\t\t"; + break; + case 3: + help_line += "\t"; + break; + default: + break; + } + help_line += this_option->msg; + std::cout << help_line << std::endl; } - help_line += this_option->msg; - std::cout << help_line << std::endl; - } } diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/apps/volk_gnsssdr_option_helpers.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/apps/volk_gnsssdr_option_helpers.h index 30cb98210..da1e12821 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/apps/volk_gnsssdr_option_helpers.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/apps/volk_gnsssdr_option_helpers.h @@ -36,7 +36,8 @@ typedef enum STRING, } VOLK_OPTYPE; -class option_t { +class option_t +{ public: option_t(std::string longform, std::string shortform, std::string msg, void (*callback)()); option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(int)); @@ -51,7 +52,6 @@ public: VOLK_OPTYPE option_type; std::string printval; void (*callback)(); - }; class option_list @@ -59,15 +59,16 @@ class option_list public: option_list(std::string program_name); - 
void add(const option_t & opt); + void add(const option_t &opt); void parse(int argc, char **argv); void help(); + private: std::string program_name; std::vector internal_list; }; -#endif //VOLK_VOLK_OPTION_HELPERS_H +#endif //VOLK_VOLK_OPTION_HELPERS_H diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/apps/volk_gnsssdr_profile.cc b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/apps/volk_gnsssdr_profile.cc index 5b9a1a653..f59c0cb60 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/apps/volk_gnsssdr_profile.cc +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/apps/volk_gnsssdr_profile.cc @@ -16,23 +16,22 @@ * along with GNSS-SDR. If not, see . */ -#include "kernel_tests.h" // for init_test_list -#include "qa_utils.h" // for volk_gnsssdr_test_results_t -#include "volk_gnsssdr/volk_gnsssdr_complex.h" // for lv_32fc_t -#include "volk_gnsssdr_option_helpers.h" // for option_list, option_t +#include "kernel_tests.h" // for init_test_list +#include "qa_utils.h" // for volk_gnsssdr_test_results_t +#include "volk_gnsssdr/volk_gnsssdr_complex.h" // for lv_32fc_t +#include "volk_gnsssdr_option_helpers.h" // for option_list, option_t #include "volk_gnsssdr_profile.h" -#include "volk_gnsssdr/volk_gnsssdr_prefs.h" // for volk_gnsssdr_get_config_path -#include // for create_directories, exists -#include // for path, operator<< -#include // for filesystem -#include // for stat -#include // for size_t -#include // for operator<<, basic_ostream -#include // IWYU pragma: keep -#include // for map, map<>::iterator -#include // for pair -#include // for vector, vector<>::const_.. - +#include "volk_gnsssdr/volk_gnsssdr_prefs.h" // for volk_gnsssdr_get_config_path +#include // for create_directories, exists +#include // for path, operator<< +#include // for filesystem +#include // for stat +#include // for size_t +#include // for operator<<, basic_ostream +#include // IWYU pragma: keep +#include // for map, map<>::iterator +#include // for pair +#include // for vector, vector<>::const_.. namespace fs = boost::filesystem; @@ -67,92 +66,112 @@ int main(int argc, char *argv[]) profile_options.add((option_t("path", "p", "Specify the volk_config path", set_volk_config))); try - { + { profile_options.parse(argc, argv); - } - catch(...) - { - return 1; - } + } + catch (...) 
+ { + return 1; + } - for (int arg_number = 0; arg_number < argc; ++arg_number) { + for (int arg_number = 0; arg_number < argc; ++arg_number) + { if (std::string("--help") == std::string(argv[arg_number]) || - std::string("-h") == std::string(argv[arg_number])) { + std::string("-h") == std::string(argv[arg_number])) + { return 0; - } - } + } + } // Adding program options std::ofstream json_file; std::string config_file; - if ( json_filename != "" ) { - json_file.open( json_filename.c_str() ); - } + if (json_filename != "") + { + json_file.open(json_filename.c_str()); + } - if ( volk_config_path != "" ) { - config_file = volk_config_path + "/volk_config"; - } + if (volk_config_path != "") + { + config_file = volk_config_path + "/volk_config"; + } // Run tests std::vector results; - if(update_mode) { - if( config_file != "" ) read_results(&results, config_file); - else read_results(&results); - } + if (update_mode) + { + if (config_file != "") + read_results(&results, config_file); + else + read_results(&results); + } // Initialize the list of tests std::vector test_cases = init_test_list(test_params); // Iterate through list of tests running each one std::string substr_to_match(test_params.kernel_regex()); - for(unsigned int ii = 0; ii < test_cases.size(); ++ii) { - bool regex_match = true; + for (unsigned int ii = 0; ii < test_cases.size(); ++ii) + { + bool regex_match = true; - volk_gnsssdr_test_case_t test_case = test_cases[ii]; - // if the kernel name matches regex then do the test - std::string test_case_name = test_case.name(); - if(test_case_name.find(substr_to_match) == std::string::npos) { - regex_match = false; - } - - // if we are in update mode check if we've already got results - // if we have any, then no need to test that kernel - bool update = true; - if(update_mode) { - for(unsigned int jj=0; jj < results.size(); ++jj) { - if(results[jj].name == test_case.name() || - results[jj].name == test_case.puppet_master_name()) { - update = false; - break; + volk_gnsssdr_test_case_t test_case = test_cases[ii]; + // if the kernel name matches regex then do the test + std::string test_case_name = test_case.name(); + if (test_case_name.find(substr_to_match) == std::string::npos) + { + regex_match = false; } - } - } - if( regex_match && update ) { - try { - run_volk_gnsssdr_tests(test_case.desc(), test_case.kernel_ptr(), test_case.name(), - test_case.test_parameters(), &results, test_case.puppet_master_name()); - } - catch (std::string &error) { - std::cerr << "Caught Exception in 'run_volk_gnssdr_tests': " << error << std::endl; - } + // if we are in update mode check if we've already got results + // if we have any, then no need to test that kernel + bool update = true; + if (update_mode) + { + for (unsigned int jj = 0; jj < results.size(); ++jj) + { + if (results[jj].name == test_case.name() || + results[jj].name == test_case.puppet_master_name()) + { + update = false; + break; + } + } + } + + if (regex_match && update) + { + try + { + run_volk_gnsssdr_tests(test_case.desc(), test_case.kernel_ptr(), test_case.name(), + test_case.test_parameters(), &results, test_case.puppet_master_name()); + } + catch (std::string &error) + { + std::cerr << "Caught Exception in 'run_volk_gnssdr_tests': " << error << std::endl; + } + } } - } // Output results according to provided options - if(json_filename != "") { - write_json(json_file, results); - json_file.close(); - } + if (json_filename != "") + { + write_json(json_file, results); + json_file.close(); + } - if(!dry_run) { - if(config_file != "") 
write_results(&results, false, config_file); - else write_results(&results, false); - } - else { - std::cout << "Warning: this was a dry-run. Config not generated" << std::endl; - } + if (!dry_run) + { + if (config_file != "") + write_results(&results, false, config_file); + else + write_results(&results, false); + } + else + { + std::cout << "Warning: this was a dry-run. Config not generated" << std::endl; + } } @@ -167,51 +186,55 @@ void read_results(std::vector *results) void read_results(std::vector *results, std::string path) { struct stat buffer; - bool config_status = (stat (path.c_str(), &buffer) == 0); + bool config_status = (stat(path.c_str(), &buffer) == 0); - if( config_status ) { - // a config exists and we are reading results from it - std::ifstream config(path.c_str()); - char config_line[256]; - while(config.getline(config_line, 255)) { - // tokenize the input line by kernel_name unaligned aligned - // then push back in the results vector with fields filled in + if (config_status) + { + // a config exists and we are reading results from it + std::ifstream config(path.c_str()); + char config_line[256]; + while (config.getline(config_line, 255)) + { + // tokenize the input line by kernel_name unaligned aligned + // then push back in the results vector with fields filled in - std::vector single_kernel_result; - std::string config_str(config_line); - std::size_t str_size = config_str.size(); - std::size_t found = 1; + std::vector single_kernel_result; + std::string config_str(config_line); + std::size_t str_size = config_str.size(); + std::size_t found = 1; - found = config_str.find(' '); - // Split line by spaces - while(found && found < str_size) { found = config_str.find(' '); - // kernel names MUST be less than 128 chars, which is - // a length restricted by volk/volk_prefs.c - // on the last token in the parsed string we won't find a space - // so make sure we copy at most 128 chars. - if(found > 127) { - found = 127; - } - str_size = config_str.size(); - char buffer[128] = {'\0'}; - config_str.copy(buffer, found + 1, 0); - buffer[found] = '\0'; - single_kernel_result.push_back(std::string(buffer)); - config_str.erase(0, found+1); - } + // Split line by spaces + while (found && found < str_size) + { + found = config_str.find(' '); + // kernel names MUST be less than 128 chars, which is + // a length restricted by volk/volk_prefs.c + // on the last token in the parsed string we won't find a space + // so make sure we copy at most 128 chars. 
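/* Illustration, not from this patch: the hand-rolled find/copy/erase loop here
 * splits each volk_config line into three space-separated tokens (the kernel
 * name followed by the two preferred architectures). A minimal sketch of the
 * same parse, assuming <sstream>, <string>, <vector> and a hypothetical
 * helper name:
 *
 *     std::vector<std::string> tokenize_config_line(const std::string &line)
 *     {
 *         std::istringstream iss(line);
 *         std::vector<std::string> tokens;
 *         std::string token;
 *         while (iss >> token) tokens.push_back(token);  // split on whitespace
 *         return tokens;  // a valid entry yields exactly three tokens
 *     }
 */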
+ if (found > 127) + { + found = 127; + } + str_size = config_str.size(); + char buffer[128] = {'\0'}; + config_str.copy(buffer, found + 1, 0); + buffer[found] = '\0'; + single_kernel_result.push_back(std::string(buffer)); + config_str.erase(0, found + 1); + } - if(single_kernel_result.size() == 3) { - volk_gnsssdr_test_results_t kernel_result; - kernel_result.name = std::string(single_kernel_result[0]); - kernel_result.config_name = std::string(single_kernel_result[0]); - kernel_result.best_arch_u = std::string(single_kernel_result[1]); - kernel_result.best_arch_a = std::string(single_kernel_result[2]); - results->push_back(kernel_result); - } + if (single_kernel_result.size() == 3) + { + volk_gnsssdr_test_results_t kernel_result; + kernel_result.name = std::string(single_kernel_result[0]); + kernel_result.config_name = std::string(single_kernel_result[0]); + kernel_result.best_arch_u = std::string(single_kernel_result[1]); + kernel_result.best_arch_a = std::string(single_kernel_result[2]); + results->push_back(kernel_result); + } + } } - } - } void write_results(const std::vector *results, bool update_result) @@ -219,7 +242,7 @@ void write_results(const std::vector *results, bool char path[1024]; volk_gnsssdr_get_config_path(path); - write_results( results, update_result, std::string(path)); + write_results(results, update_result, std::string(path)); } void write_results(const std::vector *results, bool update_result, const std::string path) @@ -227,39 +250,44 @@ void write_results(const std::vector *results, bool const fs::path config_path(path); // Until we can update the config on a kernel by kernel basis // do not overwrite volk_gnsssdr_config when using a regex. - if (! fs::exists(config_path.branch_path())) - { - std::cout << "Creating " << config_path.branch_path() << " ..." << std::endl; - fs::create_directories(config_path.branch_path()); - } + if (!fs::exists(config_path.branch_path())) + { + std::cout << "Creating " << config_path.branch_path() << " ..." << std::endl; + fs::create_directories(config_path.branch_path()); + } std::ofstream config; - if(update_result) { - std::cout << "Updating " << path << " ..." << std::endl; - config.open(path.c_str(), std::ofstream::app); - if (!config.is_open()) { //either we don't have write access or we don't have the dir yet - std::cout << "Error opening file " << path << std::endl; - } - } - else { - std::cout << "Writing " << path << " ..." << std::endl; - config.open(path.c_str()); - if (!config.is_open()) { //either we don't have write access or we don't have the dir yet - std::cout << "Error opening file " << path << std::endl; + if (update_result) + { + std::cout << "Updating " << path << " ..." << std::endl; + config.open(path.c_str(), std::ofstream::app); + if (!config.is_open()) + { //either we don't have write access or we don't have the dir yet + std::cout << "Error opening file " << path << std::endl; + } } + else + { + std::cout << "Writing " << path << " ..." 
<< std::endl; + config.open(path.c_str()); + if (!config.is_open()) + { //either we don't have write access or we don't have the dir yet + std::cout << "Error opening file " << path << std::endl; + } - config << "\ + config << "\ #this file is generated by volk_gnsssdr_profile.\n\ #the function name is followed by the preferred architecture.\n\ "; - } + } std::vector::const_iterator profile_results; - for(profile_results = results->begin(); profile_results != results->end(); ++profile_results) { - config << profile_results->config_name << " " - << profile_results->best_arch_a << " " - << profile_results->best_arch_u << std::endl; - } + for (profile_results = results->begin(); profile_results != results->end(); ++profile_results) + { + config << profile_results->config_name << " " + << profile_results->best_arch_a << " " + << profile_results->best_arch_u << std::endl; + } config.close(); } @@ -270,43 +298,45 @@ void write_json(std::ofstream &json_file, std::vector::iterator result; - for(result = results.begin(); result != results.end(); ++result) { - json_file << " {" << std::endl; - json_file << " \"name\": \"" << result->name << "\"," << std::endl; - json_file << " \"vlen\": " << (int)(result->vlen) << "," << std::endl; - json_file << " \"iter\": " << result->iter << "," << std::endl; - json_file << " \"best_arch_a\": \"" << result->best_arch_a - << "\"," << std::endl; - json_file << " \"best_arch_u\": \"" << result->best_arch_u - << "\"," << std::endl; - json_file << " \"results\": {" << std::endl; - size_t results_len = result->results.size(); - size_t ri = 0; + for (result = results.begin(); result != results.end(); ++result) + { + json_file << " {" << std::endl; + json_file << " \"name\": \"" << result->name << "\"," << std::endl; + json_file << " \"vlen\": " << (int)(result->vlen) << "," << std::endl; + json_file << " \"iter\": " << result->iter << "," << std::endl; + json_file << " \"best_arch_a\": \"" << result->best_arch_a + << "\"," << std::endl; + json_file << " \"best_arch_u\": \"" << result->best_arch_u + << "\"," << std::endl; + json_file << " \"results\": {" << std::endl; + size_t results_len = result->results.size(); + size_t ri = 0; - std::map::iterator kernel_time_pair; - for(kernel_time_pair = result->results.begin(); kernel_time_pair != result->results.end(); ++kernel_time_pair) { - volk_gnsssdr_test_time_t time = kernel_time_pair->second; - json_file << " \"" << time.name << "\": {" << std::endl; - json_file << " \"name\": \"" << time.name << "\"," << std::endl; - json_file << " \"time\": " << time.time << "," << std::endl; - json_file << " \"units\": \"" << time.units << "\"" << std::endl; - json_file << " }" ; - if(ri+1 != results_len) { - json_file << ","; - } + std::map::iterator kernel_time_pair; + for (kernel_time_pair = result->results.begin(); kernel_time_pair != result->results.end(); ++kernel_time_pair) + { + volk_gnsssdr_test_time_t time = kernel_time_pair->second; + json_file << " \"" << time.name << "\": {" << std::endl; + json_file << " \"name\": \"" << time.name << "\"," << std::endl; + json_file << " \"time\": " << time.time << "," << std::endl; + json_file << " \"units\": \"" << time.units << "\"" << std::endl; + json_file << " }"; + if (ri + 1 != results_len) + { + json_file << ","; + } + json_file << std::endl; + ri++; + } + json_file << " }" << std::endl; + json_file << " }"; + if (i + 1 != len) + { + json_file << ","; + } json_file << std::endl; - ri++; + i++; } - json_file << " }" << std::endl; - json_file << " }"; - if(i+1 != len) { - json_file 
<< ","; - } - json_file << std::endl; - i++; - } json_file << " ]" << std::endl; json_file << "}" << std::endl; } - - diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/apps/volk_gnsssdr_profile.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/apps/volk_gnsssdr_profile.h index 26ff1249b..0b1a6a46e 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/apps/volk_gnsssdr_profile.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/apps/volk_gnsssdr_profile.h @@ -27,10 +27,10 @@ * ------------------------------------------------------------------------- */ -#include // for bool -#include // for ofstream -#include // for string -#include // for vector +#include // for bool +#include // for ofstream +#include // for string +#include // for vector class volk_test_results_t; diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/saturation_arithmetic.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/saturation_arithmetic.h index 194bb46e3..77a6cc84d 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/saturation_arithmetic.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/saturation_arithmetic.h @@ -29,7 +29,7 @@ static inline int16_t sat_adds16i(int16_t x, int16_t y) { - int32_t res = (int32_t) x + (int32_t) y; + int32_t res = (int32_t)x + (int32_t)y; if (res < SHRT_MIN) res = SHRT_MIN; if (res > SHRT_MAX) res = SHRT_MAX; @@ -39,7 +39,7 @@ static inline int16_t sat_adds16i(int16_t x, int16_t y) static inline int16_t sat_muls16i(int16_t x, int16_t y) { - int32_t res = (int32_t) x * (int32_t) y; + int32_t res = (int32_t)x * (int32_t)y; if (res < SHRT_MIN) res = SHRT_MIN; if (res > SHRT_MAX) res = SHRT_MAX; diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_avx_intrinsics.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_avx_intrinsics.h index 809aa98f9..dbb67f986 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_avx_intrinsics.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_avx_intrinsics.h @@ -30,38 +30,42 @@ static inline __m256 _mm256_complexmul_ps(__m256 x, __m256 y) { - __m256 yl, yh, tmp1, tmp2; - yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr ... - yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di ... - tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ... - x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br ... - tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - return _mm256_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + __m256 yl, yh, tmp1, tmp2; + yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr ... + yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di ... + tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ... + x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br ... 
+ tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + return _mm256_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di } static inline __m256 -_mm256_conjugate_ps(__m256 x){ - const __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f); - return _mm256_xor_ps(x, conjugator); // conjugate y +_mm256_conjugate_ps(__m256 x) +{ + const __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f); + return _mm256_xor_ps(x, conjugator); // conjugate y } static inline __m256 -_mm256_complexconjugatemul_ps(__m256 x, __m256 y){ - y = _mm256_conjugate_ps(y); - return _mm256_complexmul_ps(x, y); +_mm256_complexconjugatemul_ps(__m256 x, __m256 y) +{ + y = _mm256_conjugate_ps(y); + return _mm256_complexmul_ps(x, y); } static inline __m256 -_mm256_magnitudesquared_ps(__m256 cplxValue1, __m256 cplxValue2){ - __m256 complex1, complex2; - cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values - cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values - complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20); - complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31); - return _mm256_hadd_ps(complex1, complex2); // Add the I2 and Q2 values +_mm256_magnitudesquared_ps(__m256 cplxValue1, __m256 cplxValue2) +{ + __m256 complex1, complex2; + cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values + cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values + complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20); + complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31); + return _mm256_hadd_ps(complex1, complex2); // Add the I2 and Q2 values } -static inline __m256 _mm256_complexnormalise_ps( __m256 z ){ +static inline __m256 _mm256_complexnormalise_ps(__m256 z) +{ __m256 tmp1 = _mm256_mul_ps(z, z); __m256 tmp2 = _mm256_hadd_ps(tmp1, tmp1); tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8); @@ -70,8 +74,9 @@ static inline __m256 _mm256_complexnormalise_ps( __m256 z ){ } static inline __m256 -_mm256_magnitude_ps(__m256 cplxValue1, __m256 cplxValue2){ - return _mm256_sqrt_ps(_mm256_magnitudesquared_ps(cplxValue1, cplxValue2)); +_mm256_magnitude_ps(__m256 cplxValue1, __m256 cplxValue2) +{ + return _mm256_sqrt_ps(_mm256_magnitudesquared_ps(cplxValue1, cplxValue2)); } #endif /* INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_ */ diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_common.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_common.h index 24b6501b8..d97ce89b1 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_common.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_common.h @@ -28,14 +28,14 @@ // Cross-platform attribute macros not included in VOLK //////////////////////////////////////////////////////////////////////// #if defined __GNUC__ -# define __VOLK_GNSSSDR_PREFETCH(addr) __builtin_prefetch(addr) -# define __VOLK_GNSSSDR_PREFETCH_LOCALITY(addr, rw, locality) __builtin_prefetch(addr, rw, locality) +#define __VOLK_GNSSSDR_PREFETCH(addr) __builtin_prefetch(addr) +#define __VOLK_GNSSSDR_PREFETCH_LOCALITY(addr, rw, locality) __builtin_prefetch(addr, rw, locality) #elif _MSC_VER -# define __VOLK_GNSSSDR_PREFETCH(addr) -# define __VOLK_GNSSSDR_PREFETCH_LOCALITY(addr, rw, locality) +#define __VOLK_GNSSSDR_PREFETCH(addr) +#define __VOLK_GNSSSDR_PREFETCH_LOCALITY(addr, rw, locality) 
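/* Illustration, not from this patch: kernels call these macros a few elements
 * ahead of the data they are about to read; the distance of 8 below is an
 * assumed example, not a project constant:
 *
 *     for (unsigned int i = 0; i < num_points; ++i)
 *         {
 *             __VOLK_GNSSSDR_PREFETCH(in + i + 8);  // advisory cache hint
 *             out[i] = in[i];
 *         }
 *
 * On compilers defining __GNUC__ (GCC, Clang) this expands to
 * __builtin_prefetch(); on MSVC and unknown compilers it expands to nothing,
 * so correctness never depends on the hint.
 */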
#else -# define __VOLK_GNSSSDR_PREFETCH(addr) -# define __VOLK_GNSSSDR_PREFETCH_LOCALITY(addr, rw, locality) +#define __VOLK_GNSSSDR_PREFETCH(addr) +#define __VOLK_GNSSSDR_PREFETCH_LOCALITY(addr, rw, locality) #endif #ifndef INCLUDED_LIBVOLK_COMMON_H @@ -45,45 +45,45 @@ // Cross-platform attribute macros //////////////////////////////////////////////////////////////////////// #if defined __GNUC__ -# define __VOLK_ATTR_ALIGNED(x) __attribute__((aligned(x))) -# define __VOLK_ATTR_UNUSED __attribute__((unused)) -# define __VOLK_ATTR_INLINE __attribute__((always_inline)) -# define __VOLK_ATTR_DEPRECATED __attribute__((deprecated)) -# define __VOLK_ASM __asm__ -# define __VOLK_VOLATILE __volatile__ -# if __GNUC__ >= 4 -# define __VOLK_ATTR_EXPORT __attribute__((visibility("default"))) -# define __VOLK_ATTR_IMPORT __attribute__((visibility("default"))) -# else -# define __VOLK_ATTR_EXPORT -# define __VOLK_ATTR_IMPORT -# endif -#elif _MSC_VER -# define __VOLK_ATTR_ALIGNED(x) __declspec(align(x)) -# define __VOLK_ATTR_UNUSED -# define __VOLK_ATTR_INLINE __forceinline -# define __VOLK_ATTR_DEPRECATED __declspec(deprecated) -# define __VOLK_ATTR_EXPORT __declspec(dllexport) -# define __VOLK_ATTR_IMPORT __declspec(dllimport) -# define __VOLK_ASM __asm -# define __VOLK_VOLATILE +#define __VOLK_ATTR_ALIGNED(x) __attribute__((aligned(x))) +#define __VOLK_ATTR_UNUSED __attribute__((unused)) +#define __VOLK_ATTR_INLINE __attribute__((always_inline)) +#define __VOLK_ATTR_DEPRECATED __attribute__((deprecated)) +#define __VOLK_ASM __asm__ +#define __VOLK_VOLATILE __volatile__ +#if __GNUC__ >= 4 +#define __VOLK_ATTR_EXPORT __attribute__((visibility("default"))) +#define __VOLK_ATTR_IMPORT __attribute__((visibility("default"))) #else -# define __VOLK_ATTR_ALIGNED(x) -# define __VOLK_ATTR_UNUSED -# define __VOLK_ATTR_INLINE -# define __VOLK_ATTR_DEPRECATED -# define __VOLK_ATTR_EXPORT -# define __VOLK_ATTR_IMPORT -# define __VOLK_ASM __asm__ -# define __VOLK_VOLATILE __volatile__ +#define __VOLK_ATTR_EXPORT +#define __VOLK_ATTR_IMPORT +#endif +#elif _MSC_VER +#define __VOLK_ATTR_ALIGNED(x) __declspec(align(x)) +#define __VOLK_ATTR_UNUSED +#define __VOLK_ATTR_INLINE __forceinline +#define __VOLK_ATTR_DEPRECATED __declspec(deprecated) +#define __VOLK_ATTR_EXPORT __declspec(dllexport) +#define __VOLK_ATTR_IMPORT __declspec(dllimport) +#define __VOLK_ASM __asm +#define __VOLK_VOLATILE +#else +#define __VOLK_ATTR_ALIGNED(x) +#define __VOLK_ATTR_UNUSED +#define __VOLK_ATTR_INLINE +#define __VOLK_ATTR_DEPRECATED +#define __VOLK_ATTR_EXPORT +#define __VOLK_ATTR_IMPORT +#define __VOLK_ASM __asm__ +#define __VOLK_VOLATILE __volatile__ #endif //////////////////////////////////////////////////////////////////////// // Ignore annoying warnings in MSVC //////////////////////////////////////////////////////////////////////// #if defined(_MSC_VER) -# pragma warning(disable: 4244) //'conversion' conversion from 'type1' to 'type2', possible loss of data -# pragma warning(disable: 4305) //'identifier' : truncation from 'type1' to 'type2' +#pragma warning(disable : 4244) //'conversion' conversion from 'type1' to 'type2', possible loss of data +#pragma warning(disable : 4305) //'identifier' : truncation from 'type1' to 'type2' #endif //////////////////////////////////////////////////////////////////////// @@ -91,11 +91,13 @@ // FIXME: due to the usage of complex.h, require gcc for c-linkage //////////////////////////////////////////////////////////////////////// #if defined(__cplusplus) && (__GNUC__) -# define __VOLK_DECL_BEGIN 
extern "C" { -# define __VOLK_DECL_END } +#define __VOLK_DECL_BEGIN \ + extern "C" \ + { +#define __VOLK_DECL_END } #else -# define __VOLK_DECL_BEGIN -# define __VOLK_DECL_END +#define __VOLK_DECL_BEGIN +#define __VOLK_DECL_END #endif //////////////////////////////////////////////////////////////////////// @@ -103,9 +105,9 @@ // http://gcc.gnu.org/wiki/Visibility //////////////////////////////////////////////////////////////////////// #ifdef volk_gnsssdr_EXPORTS -# define VOLK_API __VOLK_ATTR_EXPORT +#define VOLK_API __VOLK_ATTR_EXPORT #else -# define VOLK_API __VOLK_ATTR_IMPORT +#define VOLK_API __VOLK_ATTR_IMPORT #endif //////////////////////////////////////////////////////////////////////// @@ -121,35 +123,37 @@ #endif #endif -union bit128{ - uint8_t i8[16]; - uint16_t i16[8]; - uint32_t i[4]; - float f[4]; - double d[2]; +union bit128 +{ + uint8_t i8[16]; + uint16_t i16[8]; + uint32_t i[4]; + float f[4]; + double d[2]; - #ifdef LV_HAVE_SSE - __m128 float_vec; - #endif +#ifdef LV_HAVE_SSE + __m128 float_vec; +#endif - #ifdef LV_HAVE_SSE2 - __m128i int_vec; - __m128d double_vec; - #endif +#ifdef LV_HAVE_SSE2 + __m128i int_vec; + __m128d double_vec; +#endif }; -union bit256{ - uint8_t i8[32]; - uint16_t i16[16]; - uint32_t i[8]; - float f[8]; - double d[4]; +union bit256 +{ + uint8_t i8[32]; + uint16_t i16[16]; + uint32_t i[8]; + float f[8]; + double d[4]; - #ifdef LV_HAVE_AVX - __m256 float_vec; - __m256i int_vec; - __m256d double_vec; - #endif +#ifdef LV_HAVE_AVX + __m256 float_vec; + __m256i int_vec; + __m256d double_vec; +#endif }; #define bit128_p(x) ((union bit128 *)(x)) diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_complex.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_complex.h index 237266679..648eb26f9 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_complex.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_complex.h @@ -48,26 +48,34 @@ #include #include -typedef std::complex lv_8sc_t; +typedef std::complex lv_8sc_t; typedef std::complex lv_16sc_t; typedef std::complex lv_32sc_t; typedef std::complex lv_64sc_t; -typedef std::complex lv_32fc_t; -typedef std::complex lv_64fc_t; +typedef std::complex lv_32fc_t; +typedef std::complex lv_64fc_t; -template inline std::complex lv_cmake(const T &r, const T &i){ +template +inline std::complex lv_cmake(const T &r, const T &i) +{ return std::complex(r, i); } -template inline typename T::value_type lv_creal(const T &x){ +template +inline typename T::value_type lv_creal(const T &x) +{ return x.real(); } -template inline typename T::value_type lv_cimag(const T &x){ +template +inline typename T::value_type lv_cimag(const T &x) +{ return x.imag(); } -template inline T lv_conj(const T &x){ +template +inline T lv_conj(const T &x) +{ return std::conj(x); } @@ -80,14 +88,14 @@ template inline T lv_conj(const T &x){ #include -typedef char complex lv_8sc_t; -typedef short complex lv_16sc_t; -typedef long complex lv_32sc_t; -typedef long long complex lv_64sc_t; -typedef float complex lv_32fc_t; -typedef double complex lv_64fc_t; +typedef char complex lv_8sc_t; +typedef short complex lv_16sc_t; +typedef long complex lv_32sc_t; +typedef long long complex lv_64sc_t; +typedef float complex lv_32fc_t; +typedef double complex lv_64fc_t; -#define lv_cmake(r, i) ((r) + _Complex_I*(i)) +#define lv_cmake(r, i) ((r) + _Complex_I * (i)) // When GNUC is available, use the 
complex extensions. // The extensions always return the correct value type. diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_neon_intrinsics.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_neon_intrinsics.h index 49aa561d1..0de07d600 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_neon_intrinsics.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_neon_intrinsics.h @@ -27,30 +27,30 @@ #include -static inline float32x4_t vdivq_f32( float32x4_t num, float32x4_t den ) +static inline float32x4_t vdivq_f32(float32x4_t num, float32x4_t den) { - const float32x4_t q_inv0 = vrecpeq_f32( den ); - const float32x4_t q_step0 = vrecpsq_f32( q_inv0, den ); + const float32x4_t q_inv0 = vrecpeq_f32(den); + const float32x4_t q_step0 = vrecpsq_f32(q_inv0, den); - const float32x4_t q_inv1 = vmulq_f32( q_step0, q_inv0 ); - return vmulq_f32( num, q_inv1 ); + const float32x4_t q_inv1 = vmulq_f32(q_step0, q_inv0); + return vmulq_f32(num, q_inv1); } -static inline float32x4_t vsqrtq_f32( float32x4_t q_x ) +static inline float32x4_t vsqrtq_f32(float32x4_t q_x) { - const float32x4_t q_step_0 = vrsqrteq_f32( q_x ); + const float32x4_t q_step_0 = vrsqrteq_f32(q_x); // step - const float32x4_t q_step_parm0 = vmulq_f32( q_x, q_step_0 ); - const float32x4_t q_step_result0 = vrsqrtsq_f32( q_step_parm0, q_step_0 ); + const float32x4_t q_step_parm0 = vmulq_f32(q_x, q_step_0); + const float32x4_t q_step_result0 = vrsqrtsq_f32(q_step_parm0, q_step_0); // step - const float32x4_t q_step_1 = vmulq_f32( q_step_0, q_step_result0 ); - const float32x4_t q_step_parm1 = vmulq_f32( q_x, q_step_1 ); - const float32x4_t q_step_result1 = vrsqrtsq_f32( q_step_parm1, q_step_1 ); + const float32x4_t q_step_1 = vmulq_f32(q_step_0, q_step_result0); + const float32x4_t q_step_parm1 = vmulq_f32(q_x, q_step_1); + const float32x4_t q_step_result1 = vrsqrtsq_f32(q_step_parm1, q_step_1); // take the res - const float32x4_t q_step_2 = vmulq_f32( q_step_1, q_step_result1 ); + const float32x4_t q_step_2 = vmulq_f32(q_step_1, q_step_result1); // mul by x to get sqrt, not rsqrt - return vmulq_f32( q_x, q_step_2 ); + return vmulq_f32(q_x, q_step_2); } #endif /* INCLUDED_VOLK_GNSSSDR_NEON_INTRINSICS_H_ */ diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_prefs.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_prefs.h index bb03e4407..372079450 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_prefs.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_prefs.h @@ -32,9 +32,9 @@ __VOLK_DECL_BEGIN typedef struct volk_gnsssdr_arch_pref { - char name[128]; //name of the kernel - char impl_a[128]; //best aligned impl - char impl_u[128]; //best unaligned impl + char name[128]; //name of the kernel + char impl_a[128]; //best aligned impl + char impl_u[128]; //best unaligned impl } volk_gnsssdr_arch_pref_t; //////////////////////////////////////////////////////////////////////// diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_sine_table.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_sine_table.h index 90bd78569..4ba0bb631 100644 --- 
a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_sine_table.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_sine_table.h @@ -29,1030 +29,1030 @@ * max_error = 2.353084136763606e-06 */ static const float sine_table_10bits[1 << 10][2] = { -{ 2.925817799165007e-09, 7.219194364267018e-09 }, - { 2.925707643778599e-09, 2.526699001579799e-07 }, - { 2.925487337153070e-09, 1.191140162167675e-06 }, - { 2.925156887582842e-09, 3.284585035595589e-06 }, - { 2.924716307509151e-09, 6.994872605695784e-06 }, - { 2.924165613519592e-09, 1.278374920658798e-05 }, - { 2.923504826347475e-09, 2.111280464718590e-05 }, - { 2.922733970871080e-09, 3.244343744537165e-05 }, - { 2.921853076112655e-09, 4.723682007436170e-05 }, - { 2.920862175237416e-09, 6.595386421935634e-05 }, - { 2.919761305552202e-09, 8.905518605213658e-05 }, - { 2.918550508504146e-09, 1.170010715193098e-04 }, - { 2.917229829679050e-09, 1.502514416517192e-04 }, - { 2.915799318799769e-09, 1.892658178912071e-04 }, - { 2.914259029724184e-09, 2.345032874456615e-04 }, - { 2.912609020443340e-09, 2.864224686607020e-04 }, - { 2.910849353079123e-09, 3.454814764261432e-04 }, - { 2.908980093882049e-09, 4.121378876027343e-04 }, - { 2.907001313228646e-09, 4.868487064877691e-04 }, - { 2.904913085618902e-09, 5.700703303049837e-04 }, - { 2.902715489673383e-09, 6.622585147355725e-04 }, - { 2.900408608130373e-09, 7.638683394782519e-04 }, - { 2.897992527842612e-09, 8.753541738578119e-04 }, - { 2.895467339774186e-09, 9.971696424604937e-04 }, - { 2.892833138996999e-09, 1.129767590823255e-03 }, - { 2.890090024687216e-09, 1.273600051161478e-03 }, - { 2.887238100121550e-09, 1.429118208142094e-03 }, - { 2.884277472673313e-09, 1.596772364709564e-03 }, - { 2.881208253808507e-09, 1.777011907950626e-03 }, - { 2.878030559081432e-09, 1.970285275029487e-03 }, - { 2.874744508130554e-09, 2.177039919152579e-03 }, - { 2.871350224673798e-09, 2.397722275614272e-03 }, - { 2.867847836504030e-09, 2.632777727878843e-03 }, - { 2.864237475484149e-09, 2.882650573737405e-03 }, - { 2.860519277542297e-09, 3.147783991507308e-03 }, - { 2.856693382666432e-09, 3.428620006328931e-03 }, - { 2.852759934899389e-09, 3.725599456482154e-03 }, - { 2.848719082333207e-09, 4.039161959812243e-03 }, - { 2.844570977103752e-09, 4.369745880190706e-03 }, - { 2.840315775384800e-09, 4.717788294077374e-03 }, - { 2.835953637382310e-09, 5.083724957128360e-03 }, - { 2.831484727328322e-09, 5.467990270896617e-03 }, - { 2.826909213474759e-09, 5.871017249604038e-03 }, - { 2.822227268087134e-09, 6.293237486988512e-03 }, - { 2.817439067438018e-09, 6.735081123237729e-03 }, - { 2.812544791800534e-09, 7.196976811989608e-03 }, - { 2.807544625441273e-09, 7.679351687456759e-03 }, - { 2.802438756613836e-09, 8.182631331563162e-03 }, - { 2.797227377551135e-09, 8.707239741274575e-03 }, - { 2.791910684458716e-09, 9.253599295902304e-03 }, - { 2.786488877507140e-09, 9.822130724578715e-03 }, - { 2.780962160824228e-09, 1.041325307382490e-02 }, - { 2.775330742487884e-09, 1.102738367513773e-02 }, - { 2.769594834517682e-09, 1.166493811278924e-02 }, - { 2.763754652867477e-09, 1.232633019159818e-02 }, - { 2.757810417416620e-09, 1.301197190494069e-02 }, - { 2.751762351962413e-09, 1.372227340270610e-02 }, - { 2.745610684210923e-09, 1.445764295952962e-02 }, - { 2.739355645769094e-09, 1.521848694296229e-02 }, - { 2.732997472135539e-09, 1.600520978188769e-02 }, - { 2.726536402691907e-09, 1.681821393496225e-02 }, - { 2.719972680693777e-09, 1.765789985920713e-02 }, - { 
2.713306553261610e-09, 1.852466597868779e-02 }, - { 2.706538271371373e-09, 1.941890865333146e-02 }, - { 2.699668089844909e-09, 2.034102214787814e-02 }, - { 2.692696267340880e-09, 2.129139860085272e-02 }, - { 2.685623066344263e-09, 2.227042799383416e-02 }, - { 2.678448753157212e-09, 2.327849812064098e-02 }, - { 2.671173597888530e-09, 2.431599455681316e-02 }, - { 2.663797874443630e-09, 2.538330062913108e-02 }, - { 2.656321860514457e-09, 2.648079738524795e-02 }, - { 2.648745837568575e-09, 2.760886356354952e-02 }, - { 2.641070090839117e-09, 2.876787556300114e-02 }, - { 2.633294909313421e-09, 2.995820741329835e-02 }, - { 2.625420585722845e-09, 3.118023074495535e-02 }, - { 2.617447416531143e-09, 3.243431475972608e-02 }, - { 2.609375701923643e-09, 3.372082620101990e-02 }, - { 2.601205745795833e-09, 3.504012932452527e-02 }, - { 2.592937855741933e-09, 3.639258586895711e-02 }, - { 2.584572343043400e-09, 3.777855502693250e-02 }, - { 2.576109522656942e-09, 3.919839341605197e-02 }, - { 2.567549713203028e-09, 4.065245505002102e-02 }, - { 2.558893236953688e-09, 4.214109131001403e-02 }, - { 2.550140419820252e-09, 4.366465091617666e-02 }, - { 2.541291591341445e-09, 4.522347989919473e-02 }, - { 2.532347084670572e-09, 4.681792157215026e-02 }, - { 2.523307236563343e-09, 4.844831650239501e-02 }, - { 2.514172387364900e-09, 5.011500248369893e-02 }, - { 2.504942880997064e-09, 5.181831450849345e-02 }, - { 2.495619064945627e-09, 5.355858474024022e-02 }, - { 2.486201290246928e-09, 5.533614248606705e-02 }, - { 2.476689911475047e-09, 5.715131416942842e-02 }, - { 2.467085286727668e-09, 5.900442330315692e-02 }, - { 2.457387777613798e-09, 6.089579046229943e-02 }, - { 2.447597749239101e-09, 6.282573325755320e-02 }, - { 2.437715570192557e-09, 6.479456630859221e-02 }, - { 2.427741612532542e-09, 6.680260121764925e-02 }, - { 2.417676251773166e-09, 6.885014654319160e-02 }, - { 2.407519866869294e-09, 7.093750777401114e-02 }, - { 2.397272840203310e-09, 7.306498730310884e-02 }, - { 2.386935557569868e-09, 7.523288440214027e-02 }, - { 2.376508408161815e-09, 7.744149519577415e-02 }, - { 2.365991784555363e-09, 7.969111263635709e-02 }, - { 2.355386082695641e-09, 8.198202647865405e-02 }, - { 2.344691701881232e-09, 8.431452325495814e-02 }, - { 2.333909044749407e-09, 8.668888625021409e-02 }, - { 2.323038517261246e-09, 8.910539547731611e-02 }, - { 2.312080528685971e-09, 9.156432765274414e-02 }, - { 2.301035491585642e-09, 9.406595617227698e-02 }, - { 2.289903821799651e-09, 9.661055108691619e-02 }, - { 2.278685938428940e-09, 9.919837907903295e-02 }, - { 2.267382263820762e-09, 1.018297034385580e-01 }, - { 2.255993223551837e-09, 1.045047840397028e-01 }, - { 2.244519246413220e-09, 1.072238773174577e-01 }, - { 2.232960764393620e-09, 1.099872362446146e-01 }, - { 2.221318212663309e-09, 1.127951103088245e-01 }, - { 2.209592029557811e-09, 1.156477454898748e-01 }, - { 2.197782656561395e-09, 1.185453842371912e-01 }, - { 2.185890538290176e-09, 1.214882654476019e-01 }, - { 2.173916122475606e-09, 1.244766244431883e-01 }, - { 2.161859859947797e-09, 1.275106929493488e-01 }, - { 2.149722204618256e-09, 1.305906990731841e-01 }, - { 2.137503613462743e-09, 1.337168672820376e-01 }, - { 2.125204546504321e-09, 1.368894183821595e-01 }, - { 2.112825466795944e-09, 1.401085694976751e-01 }, - { 2.100366840402933e-09, 1.433745340497602e-01 }, - { 2.087829136385612e-09, 1.466875217359607e-01 }, - { 2.075212826781308e-09, 1.500477385098620e-01 }, - { 2.062518386587093e-09, 1.534553865607503e-01 }, - { 2.049746293741359e-09, 1.569106642937665e-01 }, - { 
2.036897029106193e-09, 1.604137663100403e-01 }, - { 2.023971076449323e-09, 1.639648833871233e-01 }, - { 2.010968922425217e-09, 1.675642024598467e-01 }, - { 1.997891056557933e-09, 1.712119066008896e-01 }, - { 1.984737971221581e-09, 1.749081750021970e-01 }, - { 1.971510161622434e-09, 1.786531829561379e-01 }, - { 1.958208125780130e-09, 1.824471018371070e-01 }, - { 1.944832364508511e-09, 1.862900990834311e-01 }, - { 1.931383381397782e-09, 1.901823381790926e-01 }, - { 1.917861682794392e-09, 1.941239786363039e-01 }, - { 1.904267777782611e-09, 1.981151759777950e-01 }, - { 1.890602178165317e-09, 2.021560817195309e-01 }, - { 1.876865398444616e-09, 2.062468433536743e-01 }, - { 1.863057955802572e-09, 2.103876043317229e-01 }, - { 1.849180370081465e-09, 2.145785040479915e-01 }, - { 1.835233163764673e-09, 2.188196778231083e-01 }, - { 1.821216861956509e-09, 2.231112568880342e-01 }, - { 1.807131992362945e-09, 2.274533683680190e-01 }, - { 1.792979085271234e-09, 2.318461352671018e-01 }, - { 1.778758673530482e-09, 2.362896764525300e-01 }, - { 1.764471292530943e-09, 2.407841066397789e-01 }, - { 1.750117480184598e-09, 2.453295363773890e-01 }, - { 1.735697776904342e-09, 2.499260720324433e-01 }, - { 1.721212725583874e-09, 2.545738157760434e-01 }, - { 1.706662871577097e-09, 2.592728655691494e-01 }, - { 1.692048762677849e-09, 2.640233151485341e-01 }, - { 1.677370949099090e-09, 2.688252540131204e-01 }, - { 1.662629983452104e-09, 2.736787674105404e-01 }, - { 1.647826420726167e-09, 2.785839363237506e-01 }, - { 1.632960818266680e-09, 2.835408374583758e-01 }, - { 1.618033735755429e-09, 2.885495432295704e-01 }, - { 1.603045735188609e-09, 2.936101217498361e-01 }, - { 1.587997380855918e-09, 2.987226368167127e-01 }, - { 1.572889239319430e-09, 3.038871479007593e-01 }, - { 1.557721879392051e-09, 3.091037101339017e-01 }, - { 1.542495872116447e-09, 3.143723742978435e-01 }, - { 1.527211790743024e-09, 3.196931868130269e-01 }, - { 1.511870210708909e-09, 3.250661897274744e-01 }, - { 1.496471709615926e-09, 3.304914207062036e-01 }, - { 1.481016867208896e-09, 3.359689130207621e-01 }, - { 1.465506265353924e-09, 3.414986955389885e-01 }, - { 1.449940488016384e-09, 3.470807927151147e-01 }, - { 1.434320121238994e-09, 3.527152245800635e-01 }, - { 1.418645753119802e-09, 3.584020067320109e-01 }, - { 1.402917973789838e-09, 3.641411503272979e-01 }, - { 1.387137375391042e-09, 3.699326620714776e-01 }, - { 1.371304552054134e-09, 3.757765442106153e-01 }, - { 1.355420099875958e-09, 3.816727945230153e-01 }, - { 1.339484616897137e-09, 3.876214063110671e-01 }, - { 1.323498703079580e-09, 3.936223683933865e-01 }, - { 1.307462960283922e-09, 3.996756650972121e-01 }, - { 1.291377992246768e-09, 4.057812762511174e-01 }, - { 1.275244404558188e-09, 4.119391771778626e-01 }, - { 1.259062804638585e-09, 4.181493386877248e-01 }, - { 1.242833801715929e-09, 4.244117270719281e-01 }, - { 1.226558006803155e-09, 4.307263040962509e-01 }, - { 1.210236032674760e-09, 4.370930269951803e-01 }, - { 1.193868493843725e-09, 4.435118484661861e-01 }, - { 1.177456006538695e-09, 4.499827166641340e-01 }, - { 1.160999188680582e-09, 4.565055751961679e-01 }, - { 1.144498659859216e-09, 4.630803631168164e-01 }, - { 1.127955041310214e-09, 4.697070149232604e-01 }, - { 1.111368955891417e-09, 4.763854605510119e-01 }, - { 1.094741028059551e-09, 4.831156253697562e-01 }, - { 1.078071883846871e-09, 4.898974301794375e-01 }, - { 1.061362150836978e-09, 4.967307912069362e-01 }, - { 1.044612458142151e-09, 5.036156201023686e-01 }, - { 1.027823436378632e-09, 5.105518239364775e-01 }, - { 
1.010995717643647e-09, 5.175393051975563e-01 }, - { 9.941299354913699e-10, 5.245779617890562e-01 }, - { 9.772267249089968e-10, 5.316676870274011e-01 }, - { 9.602867222926046e-10, 5.388083696401416e-01 }, - { 9.433105654240147e-10, 5.459998937639375e-01 }, - { 9.262988934458084e-10, 5.532421389435711e-01 }, - { 9.092523468378193e-10, 5.605349801305876e-01 }, - { 8.921715673928355e-10, 5.678782876825250e-01 }, - { 8.750571981926701e-10, 5.752719273622372e-01 }, - { 8.579098835836508e-10, 5.827157603377209e-01 }, - { 8.407302691522673e-10, 5.902096431821322e-01 }, - { 8.235190017016133e-10, 5.977534278737073e-01 }, - { 8.062767292259225e-10, 6.053469617967722e-01 }, - { 7.890041008871165e-10, 6.129900877421282e-01 }, - { 7.717017669898175e-10, 6.206826439083659e-01 }, - { 7.543703789572603e-10, 6.284244639030392e-01 }, - { 7.370105893063053e-10, 6.362153767444958e-01 }, - { 7.196230516231919e-10, 6.440552068636356e-01 }, - { 7.022084205389746e-10, 6.519437741060674e-01 }, - { 6.847673517046416e-10, 6.598808937346672e-01 }, - { 6.673005017664976e-10, 6.678663764322770e-01 }, - { 6.498085283416530e-10, 6.759000283046127e-01 }, - { 6.322920899929834e-10, 6.839816508836737e-01 }, - { 6.147518462045659e-10, 6.921110411311926e-01 }, - { 5.971884573565851e-10, 7.002879914425926e-01 }, - { 5.796025847007168e-10, 7.085122896509806e-01 }, - { 5.619948903351406e-10, 7.167837190315758e-01 }, - { 5.443660371796048e-10, 7.251020583063744e-01 }, - { 5.267166889504394e-10, 7.334670816491009e-01 }, - { 5.090475101356742e-10, 7.418785586903696e-01 }, - { 4.913591659698399e-10, 7.503362545232619e-01 }, - { 4.736523224091392e-10, 7.588399297089872e-01 }, - { 4.559276461062478e-10, 7.673893402829834e-01 }, - { 4.381858043851147e-10, 7.759842377612828e-01 }, - { 4.204274652161870e-10, 7.846243691469355e-01 }, - { 4.026532971908398e-10, 7.933094769370790e-01 }, - { 3.848639694963359e-10, 8.020392991300200e-01 }, - { 3.670601518910503e-10, 8.108135692324444e-01 }, - { 3.492425146784233e-10, 8.196320162675177e-01 }, - { 3.314117286825031e-10, 8.284943647824689e-01 }, - { 3.135684652223755e-10, 8.374003348569865e-01 }, - { 2.957133960867535e-10, 8.463496421118015e-01 }, - { 2.778471935089361e-10, 8.553419977173513e-01 }, - { 2.599705301412391e-10, 8.643771084029740e-01 }, - { 2.420840790301135e-10, 8.734546764660205e-01 }, - { 2.241885135902046e-10, 8.825743997817682e-01 }, - { 2.062845075795238e-10, 8.917359718130367e-01 }, - { 1.883727350736140e-10, 9.009390816205823e-01 }, - { 1.704538704408269e-10, 9.101834138731877e-01 }, - { 1.525285883160648e-10, 9.194686488588080e-01 }, - { 1.345975635762696e-10, 9.287944624950824e-01 }, - { 1.166614713141648e-10, 9.381605263410157e-01 }, - { 9.872098681369190e-11, 9.475665076080466e-01 }, - { 8.077678552380464e-11, 9.570120691722380e-01 }, - { 6.282954303364090e-11, 9.664968695860140e-01 }, - { 4.487993504668797e-11, 9.760205630906909e-01 }, - { 2.692863735553042e-11, 9.855827996289697e-01 }, - { 8.976325816439114e-12, 9.951832248577780e-01 }, - { -8.976323676304494e-12, 1.004821480161519e+00 }, - { -2.692863521550168e-11, 1.014497202665280e+00 }, - { -4.487993290681805e-11, 1.024210025248670e+00 }, - { -6.282954089398273e-11, 1.033959576559617e+00 }, - { -8.077678338451706e-11, 1.043745481028715e+00 }, - { -9.872098467477489e-11, 1.053567358883467e+00 }, - { -1.166614691757772e-10, 1.063424826163223e+00 }, - { -1.345975614383584e-10, 1.073317494734013e+00 }, - { -1.525285861788948e-10, 1.083244972303963e+00 }, - { -1.704538683042922e-10, 1.093206862438572e+00 }, - { 
-1.883727329379793e-10, 1.103202764576806e+00 }, - { -2.062845054446831e-10, 1.113232274046796e+00 }, - { -2.241885114563697e-10, 1.123294982082432e+00 }, - { -2.420840768973375e-10, 1.133390475839767e+00 }, - { -2.599705280096278e-10, 1.143518338413855e+00 }, - { -2.778471913784365e-10, 1.153678148855860e+00 }, - { -2.957133939575774e-10, 1.163869482190458e+00 }, - { -3.135684630945758e-10, 1.174091909433296e+00 }, - { -3.314117265561857e-10, 1.184344997608959e+00 }, - { -3.492425125535882e-10, 1.194628309769018e+00 }, - { -3.670601497678034e-10, 1.204941405010466e+00 }, - { -3.848639673748360e-10, 1.215283838494269e+00 }, - { -4.026532950710339e-10, 1.225655161464298e+00 }, - { -4.204274630982869e-10, 1.236054921266445e+00 }, - { -4.381858022691734e-10, 1.246482661367958e+00 }, - { -4.559276439922654e-10, 1.256937921377146e+00 }, - { -4.736523202972214e-10, 1.267420237063216e+00 }, - { -4.913591638600925e-10, 1.277929140376502e+00 }, - { -5.090475080282032e-10, 1.288464159468706e+00 }, - { -5.267166868452449e-10, 1.299024818713528e+00 }, - { -5.443660350768455e-10, 1.309610638727845e+00 }, - { -5.619948882348695e-10, 1.320221136392390e+00 }, - { -5.796025826029868e-10, 1.330855824873457e+00 }, - { -5.971884552615020e-10, 1.341514213644420e+00 }, - { -6.147518441122357e-10, 1.352195808507556e+00 }, - { -6.322920879034590e-10, 1.362900111616144e+00 }, - { -6.498085262549874e-10, 1.373626621496939e+00 }, - { -6.673004996827436e-10, 1.384374833072571e+00 }, - { -6.847673496239581e-10, 1.395144237684605e+00 }, - { -7.022084184613616e-10, 1.405934323116231e+00 }, - { -7.196230495488082e-10, 1.416744573616104e+00 }, - { -7.370105872352039e-10, 1.427574469921397e+00 }, - { -7.543703768894941e-10, 1.438423489281758e+00 }, - { -7.717017649255453e-10, 1.449291105483472e+00 }, - { -7.890040988262324e-10, 1.460176788873383e+00 }, - { -8.062767271686383e-10, 1.471080006383765e+00 }, - { -8.235189996479819e-10, 1.482000221556656e+00 }, - { -8.407302671024475e-10, 1.492936894569018e+00 }, - { -8.579098815375368e-10, 1.503889482257845e+00 }, - { -8.750571961505266e-10, 1.514857438145604e+00 }, - { -8.921715653546624e-10, 1.525840212465756e+00 }, - { -9.092523448036167e-10, 1.536837252188703e+00 }, - { -9.262988914157881e-10, 1.547848001047890e+00 }, - { -9.433105633981766e-10, 1.558871899565883e+00 }, - { -9.602867202711075e-10, 1.569908385081254e+00 }, - { -9.772267228916820e-10, 1.580956891774897e+00 }, - { -9.941299334786078e-10, 1.592016850697478e+00 }, - { -1.010995715635332e-09, 1.603087689796053e+00 }, - { -1.027823434374870e-09, 1.614168833942028e+00 }, - { -1.044612456143047e-09, 1.625259704958335e+00 }, - { -1.061362148842745e-09, 1.636359721647526e+00 }, - { -1.078071881857297e-09, 1.647468299819543e+00 }, - { -1.094741026074900e-09, 1.658584852320419e+00 }, - { -1.111368953911690e-09, 1.669708789060341e+00 }, - { -1.127955039335462e-09, 1.680839517042381e+00 }, - { -1.144498657889600e-09, 1.691976440391624e+00 }, - { -1.160999186716154e-09, 1.703118960383971e+00 }, - { -1.177456004579561e-09, 1.714266475475616e+00 }, - { -1.193868491889832e-09, 1.725418381332405e+00 }, - { -1.210236030726319e-09, 1.736574070859850e+00 }, - { -1.226558004860220e-09, 1.747732934232508e+00 }, - { -1.242833799778447e-09, 1.758894358924547e+00 }, - { -1.259062802706714e-09, 1.770057729740021e+00 }, - { -1.275244402631982e-09, 1.781222428842935e+00 }, - { -1.291377990326492e-09, 1.792387835788660e+00 }, - { -1.307462958369363e-09, 1.803553327553897e+00 }, - { -1.323498701170897e-09, 1.814718278568759e+00 }, - { 
-1.339484614994490e-09, 1.825882060747428e+00 }, - { -1.355420097979292e-09, 1.837044043519582e+00 }, - { -1.371304550163662e-09, 1.848203593862598e+00 }, - { -1.387137373506711e-09, 1.859360076332671e+00 }, - { -1.402917971911754e-09, 1.870512853097495e+00 }, - { -1.418645751248018e-09, 1.881661283967967e+00 }, - { -1.434320119373722e-09, 1.892804726431080e+00 }, - { -1.449940486157623e-09, 1.903942535681972e+00 }, - { -1.465506263501516e-09, 1.915074064656886e+00 }, - { -1.481016865363264e-09, 1.926198664066737e+00 }, - { -1.496471707776859e-09, 1.937315682428795e+00 }, - { -1.511870208876724e-09, 1.948424466101625e+00 }, - { -1.527211788917509e-09, 1.959524359317042e+00 }, - { -1.542495870297867e-09, 1.970614704215133e+00 }, - { -1.557721877580406e-09, 1.981694840876775e+00 }, - { -1.572889237514880e-09, 1.992764107358707e+00 }, - { -1.587997379058514e-09, 2.003821839726753e+00 }, - { -1.603045733398246e-09, 2.014867372090665e+00 }, - { -1.618033733972424e-09, 2.025900036638798e+00 }, - { -1.632960816490822e-09, 2.036919163671778e+00 }, - { -1.647826418957721e-09, 2.047924081638631e+00 }, - { -1.662629981691070e-09, 2.058914117170269e+00 }, - { -1.677370947345626e-09, 2.069888595116115e+00 }, - { -1.692048760931849e-09, 2.080846838577820e+00 }, - { -1.706662869838827e-09, 2.091788168946183e+00 }, - { -1.721212723853279e-09, 2.102711905935372e+00 }, - { -1.735697775181424e-09, 2.113617367619504e+00 }, - { -1.750117478469621e-09, 2.124503870468520e+00 }, - { -1.764471290823748e-09, 2.135370729383332e+00 }, - { -1.778758671831281e-09, 2.146217257733207e+00 }, - { -1.792979083579974e-09, 2.157042767390815e+00 }, - { -1.807131990679890e-09, 2.167846568770014e+00 }, - { -1.821216860281448e-09, 2.178627970860822e+00 }, - { -1.835233162097977e-09, 2.189386281268046e+00 }, - { -1.849180368423027e-09, 2.200120806246095e+00 }, - { -1.863057954152340e-09, 2.210830850737588e+00 }, - { -1.876865396802907e-09, 2.221515718409926e+00 }, - { -1.890602176531920e-09, 2.232174711691990e+00 }, - { -1.904267776157843e-09, 2.242807131812679e+00 }, - { -1.917861681178094e-09, 2.253412278837029e+00 }, - { -1.931383379790273e-09, 2.263989451705295e+00 }, - { -1.944832362909578e-09, 2.274537948269257e+00 }, - { -1.958208124189984e-09, 2.285057065331676e+00 }, - { -1.971510160041235e-09, 2.295546098682665e+00 }, - { -1.984737969649064e-09, 2.306004343138794e+00 }, - { -1.997891054994522e-09, 2.316431092581699e+00 }, - { -2.010968920870647e-09, 2.326825639994779e+00 }, - { -2.023971074903858e-09, 2.337187277503834e+00 }, - { -2.036897027569834e-09, 2.347515296413520e+00 }, - { -2.049746292214264e-09, 2.357808987247877e+00 }, - { -2.062518385069210e-09, 2.368067639787542e+00 }, - { -2.075212825272584e-09, 2.378290543109652e+00 }, - { -2.087829134886364e-09, 2.388476985626922e+00 }, - { -2.100366838912949e-09, 2.398626255125417e+00 }, - { -2.112825465315542e-09, 2.408737638805759e+00 }, - { -2.125204545033289e-09, 2.418810423320288e+00 }, - { -2.137503612001452e-09, 2.428843894814472e+00 }, - { -2.149722203166389e-09, 2.438837338964302e+00 }, - { -2.161859858505829e-09, 2.448790041018174e+00 }, - { -2.173916121043380e-09, 2.458701285834241e+00 }, - { -2.185890536867478e-09, 2.468570357921585e+00 }, - { -2.197782655148702e-09, 2.478396541480230e+00 }, - { -2.209592028154913e-09, 2.488179120439544e+00 }, - { -2.221318211270522e-09, 2.497917378500214e+00 }, - { -2.232960763010574e-09, 2.507610599172123e+00 }, - { -2.244519245040444e-09, 2.517258065817044e+00 }, - { -2.255993222189014e-09, 2.526859061686102e+00 }, - { 
-2.267382262468209e-09, 2.536412869962689e+00 }, - { -2.278685937086658e-09, 2.545918773800664e+00 }, - { -2.289903820467374e-09, 2.555376056366064e+00 }, - { -2.301035490263848e-09, 2.564784000877677e+00 }, - { -2.312080527374447e-09, 2.574141890646339e+00 }, - { -2.323038515960257e-09, 2.583449009117307e+00 }, - { -2.333909043458635e-09, 2.592704639909166e+00 }, - { -2.344691700601153e-09, 2.601908066856634e+00 }, - { -2.355386081425938e-09, 2.611058574048749e+00 }, - { -2.365991783296513e-09, 2.620155445872768e+00 }, - { -2.376508406913500e-09, 2.629197967052127e+00 }, - { -2.386935556332088e-09, 2.638185422689490e+00 }, - { -2.397272838976436e-09, 2.647117098307332e+00 }, - { -2.407519865653114e-09, 2.655992279887846e+00 }, - { -2.417676250567891e-09, 2.664810253915885e+00 }, - { -2.427741611338014e-09, 2.673570307418169e+00 }, - { -2.437715569009093e-09, 2.682271728006635e+00 }, - { -2.447597748066437e-09, 2.690913803917100e+00 }, - { -2.457387776452357e-09, 2.699495824053297e+00 }, - { -2.467085285577292e-09, 2.708017078025636e+00 }, - { -2.476689910335470e-09, 2.716476856194105e+00 }, - { -2.486201289118733e-09, 2.724874449709689e+00 }, - { -2.495619063828443e-09, 2.733209150554255e+00 }, - { -2.504942879891263e-09, 2.741480251583985e+00 }, - { -2.514172386270163e-09, 2.749687046568741e+00 }, - { -2.523307235480146e-09, 2.757828830235740e+00 }, - { -2.532347083598520e-09, 2.765904898308531e+00 }, - { -2.541291590280960e-09, 2.773914547551261e+00 }, - { -2.550140418771202e-09, 2.781857075807392e+00 }, - { -2.558893235915887e-09, 2.789731782043156e+00 }, - { -2.567549712176927e-09, 2.797537966388929e+00 }, - { -2.576109521642196e-09, 2.805274930179221e+00 }, - { -2.584572342040407e-09, 2.812941975996573e+00 }, - { -2.592937854750428e-09, 2.820538407710556e+00 }, - { -2.601205744816134e-09, 2.828063530521908e+00 }, - { -2.609375700955458e-09, 2.835516651001539e+00 }, - { -2.617447415574869e-09, 2.842897077134583e+00 }, - { -2.625420584778350e-09, 2.850204118359573e+00 }, - { -2.633294908380520e-09, 2.857437085611509e+00 }, - { -2.641070089918234e-09, 2.864595291363663e+00 }, - { -2.648745836659391e-09, 2.871678049666939e+00 }, - { -2.656321859617343e-09, 2.878684676194483e+00 }, - { -2.663797873558322e-09, 2.885614488280000e+00 }, - { -2.671173597015318e-09, 2.892466804962122e+00 }, - { -2.678448752295859e-09, 2.899240947023252e+00 }, - { -2.685623065495139e-09, 2.905936237033475e+00 }, - { -2.692696266503800e-09, 2.912551999389617e+00 }, - { -2.699668089019767e-09, 2.919087560358171e+00 }, - { -2.706538270558513e-09, 2.925542248116882e+00 }, - { -2.713306552460767e-09, 2.931915392794031e+00 }, - { -2.719972679905295e-09, 2.938206326512581e+00 }, - { -2.726536401915442e-09, 2.944414383428562e+00 }, - { -2.732997471371516e-09, 2.950538899775061e+00 }, - { -2.739355645017194e-09, 2.956579213900666e+00 }, - { -2.745610683471516e-09, 2.962534666313284e+00 }, - { -2.751762351235315e-09, 2.968404599718795e+00 }, - { -2.757810416701751e-09, 2.974188359063684e+00 }, - { -2.763754652165128e-09, 2.979885291576143e+00 }, - { -2.769594833827588e-09, 2.985494746805227e+00 }, - { -2.775330741810390e-09, 2.991016076664491e+00 }, - { -2.780962160159068e-09, 2.996448635469842e+00 }, - { -2.786488876854607e-09, 3.001791779983262e+00 }, - { -2.791910683818570e-09, 3.007044869450794e+00 }, - { -2.797227376923695e-09, 3.012207265645876e+00 }, - { -2.802438755998943e-09, 3.017278332907412e+00 }, - { -2.807544624838820e-09, 3.022257438182037e+00 }, - { -2.812544791210840e-09, 3.027143951064684e+00 }, - { 
-2.817439066860792e-09, 3.031937243837070e+00 }, - { -2.822227267522746e-09, 3.036636691510884e+00 }, - { -2.826909212922864e-09, 3.041241671864994e+00 }, - { -2.831484726789317e-09, 3.045751565488710e+00 }, - { -2.835953636855826e-09, 3.050165755818853e+00 }, - { -2.840315774871260e-09, 3.054483629182857e+00 }, - { -2.844570976602957e-09, 3.058704574835744e+00 }, - { -2.848719081844986e-09, 3.062827985002047e+00 }, - { -2.852759934424164e-09, 3.066853254915581e+00 }, - { -2.856693382203833e-09, 3.070779782857041e+00 }, - { -2.860519277092708e-09, 3.074606970196721e+00 }, - { -2.864237475047239e-09, 3.078334221430809e+00 }, - { -2.867847836080156e-09, 3.081960944223928e+00 }, - { -2.871350224262603e-09, 3.085486549445314e+00 }, - { -2.874744507732462e-09, 3.088910451211251e+00 }, - { -2.878030558696270e-09, 3.092232066921130e+00 }, - { -2.881208253436038e-09, 3.095450817298478e+00 }, - { -2.884277472313999e-09, 3.098566126429974e+00 }, - { -2.887238099774968e-09, 3.101577421802070e+00 }, - { -2.890090024353816e-09, 3.104484134342861e+00 }, - { -2.892833138676371e-09, 3.107285698457308e+00 }, - { -2.895467339466766e-09, 3.109981552069083e+00 }, - { -2.897992527547963e-09, 3.112571136655481e+00 }, - { -2.900408607848946e-09, 3.115053897289195e+00 }, - { -2.902715489404992e-09, 3.117429282673042e+00 }, - { -2.904913085363323e-09, 3.119696745180238e+00 }, - { -2.907001312986328e-09, 3.121855740892224e+00 }, - { -2.908980093652563e-09, 3.123905729634218e+00 }, - { -2.910849352862924e-09, 3.125846175016163e+00 }, - { -2.912609020239985e-09, 3.127676544466606e+00 }, - { -2.914259029534118e-09, 3.129396309273659e+00 }, - { -2.915799318622574e-09, 3.131004944618667e+00 }, - { -2.917229829515169e-09, 3.132501929616775e+00 }, - { -2.918550508353347e-09, 3.133886747350606e+00 }, - { -2.919761305414294e-09, 3.135158884909254e+00 }, - { -2.920862175112829e-09, 3.136317833424958e+00 }, - { -2.921853076000972e-09, 3.137363088107359e+00 }, - { -2.922733970772719e-09, 3.138294148283254e+00 }, - { -2.923504826262027e-09, 3.139110517429204e+00 }, - { -2.924165613447473e-09, 3.139811703211207e+00 }, - { -2.924716307449950e-09, 3.140397217517018e+00 }, - { -2.925156887536978e-09, 3.140866576495489e+00 }, - { -2.925487337120335e-09, 3.141219300588825e+00 }, - { -2.925707643758784e-09, 3.141454914570261e+00 }, - { -2.925817799158535e-09, 3.141572947579352e+00 }, - { -2.925817799171455e-09, 3.141572933154836e+00 }, - { -2.925707643798390e-09, 3.141454409272987e+00 }, - { -2.925487337185779e-09, 3.141216918378770e+00 }, - { -2.925156887628892e-09, 3.140860007424112e+00 }, - { -2.924716307568119e-09, 3.140383227898687e+00 }, - { -2.924165613591896e-09, 3.139786135867868e+00 }, - { -2.923504826432903e-09, 3.139068292003385e+00 }, - { -2.922733970969412e-09, 3.138229261619561e+00 }, - { -2.921853076224321e-09, 3.137268614707029e+00 }, - { -2.920862175361976e-09, 3.136185925964038e+00 }, - { -2.919761305690083e-09, 3.134980774833275e+00 }, - { -2.918550508654911e-09, 3.133652745531368e+00 }, - { -2.917229829843137e-09, 3.132201427085629e+00 }, - { -2.915799318976726e-09, 3.130626413363146e+00 }, - { -2.914259029914435e-09, 3.128927303107136e+00 }, - { -2.912609020646661e-09, 3.127103699965947e+00 }, - { -2.910849353295315e-09, 3.125155212527586e+00 }, - { -2.908980094111509e-09, 3.123081454351802e+00 }, - { -2.907001313470937e-09, 3.120882043999591e+00 }, - { -2.904913085874448e-09, 3.118556605068443e+00 }, - { -2.902715489941767e-09, 3.116104766219928e+00 }, - { -2.900408608411958e-09, 3.113526161214776e+00 }, - { 
-2.897992528137022e-09, 3.110820428940251e+00 }, - { -2.895467340081818e-09, 3.107987213444579e+00 }, - { -2.892833139317615e-09, 3.105026163964191e+00 }, - { -2.890090025020589e-09, 3.101936934956479e+00 }, - { -2.887238100468092e-09, 3.098719186130021e+00 }, - { -2.884277473032614e-09, 3.095372582472161e+00 }, - { -2.881208254180937e-09, 3.091896794282404e+00 }, - { -2.878030559466594e-09, 3.088291497198199e+00 }, - { -2.874744508528832e-09, 3.084556372228054e+00 }, - { -2.871350225084755e-09, 3.080691105776848e+00 }, - { -2.867847836928063e-09, 3.076695389678615e+00 }, - { -2.864237475921086e-09, 3.072568921221621e+00 }, - { -2.860519277991847e-09, 3.068311403179147e+00 }, - { -2.856693383129018e-09, 3.063922543837792e+00 }, - { -2.852759935374575e-09, 3.059402057023109e+00 }, - { -2.848719082821403e-09, 3.054749662130841e+00 }, - { -2.844570977604520e-09, 3.049965084150782e+00 }, - { -2.840315775898525e-09, 3.045048053697736e+00 }, - { -2.835953637908582e-09, 3.039998307034967e+00 }, - { -2.831484727867511e-09, 3.034815586104635e+00 }, - { -2.826909214026628e-09, 3.029499638550941e+00 }, - { -2.822227268651470e-09, 3.024050217748861e+00 }, - { -2.817439068015245e-09, 3.018467082830179e+00 }, - { -2.812544792390175e-09, 3.012749998707001e+00 }, - { -2.807544626043751e-09, 3.006898736100911e+00 }, - { -2.802438757228650e-09, 3.000913071564665e+00 }, - { -2.797227378178760e-09, 2.994792787510961e+00 }, - { -2.791910685098702e-09, 2.988537672233504e+00 }, - { -2.786488878159805e-09, 2.982147519935565e+00 }, - { -2.780962161489413e-09, 2.975622130750641e+00 }, - { -2.775330743165298e-09, 2.968961310769028e+00 }, - { -2.769594835207775e-09, 2.962164872061613e+00 }, - { -2.763754653569747e-09, 2.955232632701135e+00 }, - { -2.757810418131543e-09, 2.948164416789036e+00 }, - { -2.751762352689432e-09, 2.940960054474719e+00 }, - { -2.745610684950541e-09, 2.933619381982341e+00 }, - { -2.739355646520809e-09, 2.926142241629213e+00 }, - { -2.732997472899722e-09, 2.918528481852205e+00 }, - { -2.726536403468318e-09, 2.910777957226018e+00 }, - { -2.719972681482232e-09, 2.902890528487386e+00 }, - { -2.713306554062453e-09, 2.894866062556452e+00 }, - { -2.706538272184154e-09, 2.886704432555728e+00 }, - { -2.699668090670078e-09, 2.878405517834426e+00 }, - { -2.692696268177908e-09, 2.869969203985464e+00 }, - { -2.685623067193599e-09, 2.861395382869544e+00 }, - { -2.678448754018380e-09, 2.852683952631486e+00 }, - { -2.671173598761847e-09, 2.843834817723832e+00 }, - { -2.663797875328991e-09, 2.834847888922988e+00 }, - { -2.656321861411517e-09, 2.825723083350459e+00 }, - { -2.648745838477759e-09, 2.816460324492298e+00 }, - { -2.641070091759922e-09, 2.807059542215146e+00 }, - { -2.633294910246296e-09, 2.797520672788269e+00 }, - { -2.625420586667340e-09, 2.787843658897949e+00 }, - { -2.617447417487602e-09, 2.778028449668942e+00 }, - { -2.609375702891616e-09, 2.768075000678399e+00 }, - { -2.601205746775692e-09, 2.757983273976943e+00 }, - { -2.592937856733464e-09, 2.747753238101915e+00 }, - { -2.584572344046340e-09, 2.737384868096553e+00 }, - { -2.576109523671634e-09, 2.726878145526201e+00 }, - { -2.567549714229129e-09, 2.716233058492422e+00 }, - { -2.558893237991435e-09, 2.705449601651722e+00 }, - { -2.550140420869302e-09, 2.694527776227857e+00 }, - { -2.541291592402089e-09, 2.683467590030445e+00 }, - { -2.532347085742440e-09, 2.672269057466213e+00 }, - { -2.523307237646751e-09, 2.660932199557362e+00 }, - { -2.514172388459584e-09, 2.649457043952206e+00 }, - { -2.504942882102813e-09, 2.637843624941622e+00 }, - { 
-2.495619066062810e-09, 2.626091983472908e+00 }, - { -2.486201291375123e-09, 2.614202167160335e+00 }, - { -2.476689912614465e-09, 2.602174230302269e+00 }, - { -2.467085287878098e-09, 2.590008233889805e+00 }, - { -2.457387778775451e-09, 2.577704245623143e+00 }, - { -2.447597750411553e-09, 2.565262339920002e+00 }, - { -2.437715571376127e-09, 2.552682597931055e+00 }, - { -2.427741613727123e-09, 2.539965107548168e+00 }, - { -2.417676252978335e-09, 2.527109963417675e+00 }, - { -2.407519868085581e-09, 2.514117266951687e+00 }, - { -2.397272841430131e-09, 2.500987126335739e+00 }, - { -2.386935558807595e-09, 2.487719656543254e+00 }, - { -2.376508409410024e-09, 2.474314979341178e+00 }, - { -2.365991785814531e-09, 2.460773223303822e+00 }, - { -2.355386083965131e-09, 2.447094523817833e+00 }, - { -2.344691703161363e-09, 2.433279023095734e+00 }, - { -2.333909046040126e-09, 2.419326870180582e+00 }, - { -2.323038518562289e-09, 2.405238220956597e+00 }, - { -2.312080529997549e-09, 2.391013238157397e+00 }, - { -2.301035492907384e-09, 2.376652091371587e+00 }, - { -2.289903823131822e-09, 2.362154957053137e+00 }, - { -2.278685939771276e-09, 2.347522018525197e+00 }, - { -2.267382265173420e-09, 2.332753465990296e+00 }, - { -2.255993224914501e-09, 2.317849496533128e+00 }, - { -2.244519247786155e-09, 2.302810314130351e+00 }, - { -2.232960765776561e-09, 2.287636129652823e+00 }, - { -2.221318214056095e-09, 2.272327160873552e+00 }, - { -2.209592030960763e-09, 2.256883632472565e+00 }, - { -2.197782657974034e-09, 2.241305776039511e+00 }, - { -2.185890539712767e-09, 2.225593830081461e+00 }, - { -2.173916123907886e-09, 2.209748040023618e+00 }, - { -2.161859861389976e-09, 2.193768658216360e+00 }, - { -2.149722206070124e-09, 2.177655943935795e+00 }, - { -2.137503614923981e-09, 2.161410163388424e+00 }, - { -2.125204547975352e-09, 2.145031589714984e+00 }, - { -2.112825468276292e-09, 2.128520502989477e+00 }, - { -2.100366841892917e-09, 2.111877190225612e+00 }, - { -2.087829137884807e-09, 2.095101945374541e+00 }, - { -2.075212828290086e-09, 2.078195069329960e+00 }, - { -2.062518388104923e-09, 2.061156869925600e+00 }, - { -2.049746295268559e-09, 2.043987661939897e+00 }, - { -2.036897030642658e-09, 2.026687767092888e+00 }, - { -2.023971077994576e-09, 2.009257514048162e+00 }, - { -2.010968923979840e-09, 1.991697238413571e+00 }, - { -1.997891058121344e-09, 1.974007282737320e+00 }, - { -1.984737972794098e-09, 1.956187996511354e+00 }, - { -1.971510163203686e-09, 1.938239736166060e+00 }, - { -1.958208127370276e-09, 1.920162865072273e+00 }, - { -1.944832366107339e-09, 1.901957753535934e+00 }, - { -1.931383383005451e-09, 1.883624778799427e+00 }, - { -1.917861684410531e-09, 1.865164325035177e+00 }, - { -1.904267779407432e-09, 1.846576783346324e+00 }, - { -1.890602179798714e-09, 1.827862551760622e+00 }, - { -1.876865400086483e-09, 1.809022035228338e+00 }, - { -1.863057957452539e-09, 1.790055645617624e+00 }, - { -1.849180371740008e-09, 1.770963801711725e+00 }, - { -1.835233165431475e-09, 1.751746929201178e+00 }, - { -1.821216863631569e-09, 1.732405460681919e+00 }, - { -1.807131994045840e-09, 1.712939835648088e+00 }, - { -1.792979086962494e-09, 1.693350500488565e+00 }, - { -1.778758675229683e-09, 1.673637908477153e+00 }, - { -1.764471294238191e-09, 1.653802519770021e+00 }, - { -1.750117481899733e-09, 1.633844801396848e+00 }, - { -1.735697778626995e-09, 1.613765227254186e+00 }, - { -1.721212727314574e-09, 1.593564278099856e+00 }, - { -1.706662873315474e-09, 1.573242441540939e+00 }, - { -1.692048764423848e-09, 1.552800212030258e+00 }, - { 
-1.677370950852395e-09, 1.532238090855187e+00 }, - { -1.662629985213192e-09, 1.511556586131055e+00 }, - { -1.647826422494560e-09, 1.490756212788764e+00 }, - { -1.632960820042537e-09, 1.469837492568651e+00 }, - { -1.618033737538645e-09, 1.448800954008929e+00 }, - { -1.603045736978760e-09, 1.427647132435469e+00 }, - { -1.587997382653428e-09, 1.406376569953373e+00 }, - { -1.572889241124034e-09, 1.384989815432507e+00 }, - { -1.557721881203696e-09, 1.363487424499449e+00 }, - { -1.542495873934815e-09, 1.341869959524515e+00 }, - { -1.527211792568486e-09, 1.320137989611176e+00 }, - { -1.511870212541253e-09, 1.298292090581491e+00 }, - { -1.496471711454994e-09, 1.276332844965754e+00 }, - { -1.481016869054634e-09, 1.254260841988828e+00 }, - { -1.465506267206068e-09, 1.232076677556547e+00 }, - { -1.449940489875303e-09, 1.209780954243628e+00 }, - { -1.434320123104372e-09, 1.187374281276747e+00 }, - { -1.418645754991533e-09, 1.164857274523495e+00 }, - { -1.402917975667710e-09, 1.142230556475749e+00 }, - { -1.387137377275425e-09, 1.119494756236361e+00 }, - { -1.371304553944712e-09, 1.096650509501278e+00 }, - { -1.355420101772623e-09, 1.073698458546610e+00 }, - { -1.339484618799891e-09, 1.050639252211352e+00 }, - { -1.323498704988051e-09, 1.027473545880543e+00 }, - { -1.307462962198534e-09, 1.004202001471034e+00 }, - { -1.291377994167204e-09, 9.808252874104182e-01 }, - { -1.275244406484394e-09, 9.573440786237052e-01 }, - { -1.259062806570190e-09, 9.337590565128454e-01 }, - { -1.242833803653464e-09, 9.100709089414796e-01 }, - { -1.226558008746195e-09, 8.862803302125812e-01 }, - { -1.210236034623253e-09, 8.623880210538113e-01 }, - { -1.193868495797618e-09, 8.383946885959868e-01 }, - { -1.177456008497777e-09, 8.143010463544786e-01 }, - { -1.160999190645010e-09, 7.901078142102129e-01 }, - { -1.144498661828833e-09, 7.658157183877095e-01 }, - { -1.127955043284965e-09, 7.414254914366063e-01 }, - { -1.111368957870986e-09, 7.169378722095157e-01 }, - { -1.094741030044308e-09, 6.923536058430697e-01 }, - { -1.078071885836393e-09, 6.676734437331688e-01 }, - { -1.061362152831423e-09, 6.428981435165511e-01 }, - { -1.044612460141255e-09, 6.180284690466404e-01 }, - { -1.027823438382183e-09, 5.930651903718045e-01 }, - { -1.010995719652015e-09, 5.680090837138436e-01 }, - { -9.941299375042378e-10, 5.428609314418970e-01 }, - { -9.772267269262058e-10, 5.176215220520872e-01 }, - { -9.602867243141016e-10, 4.922916501421032e-01 }, - { -9.433105674499058e-10, 4.668721163885412e-01 }, - { -9.262988954758817e-10, 4.413637275202624e-01 }, - { -9.092523488719689e-10, 4.157672962958654e-01 }, - { -8.921715694311144e-10, 3.900836414778084e-01 }, - { -8.750572002347607e-10, 3.643135878065193e-01 }, - { -8.579098856296589e-10, 3.384579659762392e-01 }, - { -8.407302712022458e-10, 3.125176126069478e-01 }, - { -8.235190037551917e-10, 2.864933702193017e-01 }, - { -8.062767312831008e-10, 2.603860872080448e-01 }, - { -7.890041029479477e-10, 2.341966178147619e-01 }, - { -7.717017690542486e-10, 2.079258220999725e-01 }, - { -7.543703810250266e-10, 1.815745659161734e-01 }, - { -7.370105913774597e-10, 1.551437208801425e-01 }, - { -7.196230536974697e-10, 1.286341643433767e-01 }, - { -7.022084226165876e-10, 1.020467793657360e-01 }, - { -6.847673537853251e-10, 7.538245468350446e-02 }, - { -6.673005038502516e-10, 4.864208468284503e-02 }, - { -6.498085304282128e-10, 2.182656936863137e-02 }, - { -6.322920920826137e-10, -5.063185663820913e-03 }, - { -6.147518482969490e-10, -3.202626926150343e-02 }, - { -5.971884594516681e-10, -5.906176474160862e-02 }, - { 
-5.796025867984469e-10, -8.616874992366363e-02 }, - { -5.619948924353588e-10, -1.133462971605448e-01 }, - { -5.443660392823640e-10, -1.405934733692621e-01 }, - { -5.267166910556339e-10, -1.679093400638023e-01 }, - { -5.090475122431451e-10, -1.952929533862739e-01 }, - { -4.913591680795342e-10, -2.227433641394564e-01 }, - { -4.736523245210571e-10, -2.502596178194491e-01 }, - { -4.559276482202303e-10, -2.778407546490776e-01 }, - { -4.381858065011618e-10, -3.054858096104932e-01 }, - { -4.204274673340870e-10, -3.331938124792702e-01 }, - { -4.026532993105397e-10, -3.609637878577768e-01 }, - { -3.848639716178888e-10, -3.887947552098022e-01 }, - { -3.670601540142443e-10, -4.166857288948674e-01 }, - { -3.492425168032583e-10, -4.446357182029681e-01 }, - { -3.314117308088734e-10, -4.726437273896633e-01 }, - { -3.135684673501752e-10, -5.007087557112619e-01 }, - { -2.957133982159296e-10, -5.288297974607742e-01 }, - { -2.778471956393828e-10, -5.570058420037128e-01 }, - { -2.599705322729564e-10, -5.852358738143247e-01 }, - { -2.420840811628366e-10, -6.135188725122560e-01 }, - { -2.241885157240923e-10, -6.418538128986450e-01 }, - { -2.062845097142585e-10, -6.702396649949099e-01 }, - { -1.883727372093546e-10, -6.986753940779493e-01 }, - { -1.704538725773087e-10, -7.271599607197149e-01 }, - { -1.525285904532877e-10, -7.556923208240308e-01 }, - { -1.345975657140748e-10, -7.842714256651911e-01 }, - { -1.166614734526054e-10, -8.128962219265712e-01 }, - { -9.872098895260891e-11, -8.415656517393372e-01 }, - { -8.077678766314517e-11, -8.702786527215916e-01 }, - { -6.282954517324612e-11, -8.990341580176152e-01 }, - { -4.487993718655790e-11, -9.278310963373758e-01 }, - { -2.692863949561210e-11, -9.566683919968972e-01 }, - { -8.976327956520795e-12, -9.855449649582175e-01 }, - { 8.976321536169872e-12, -1.014459730869357e+00 }, - { 2.692863307547294e-11, -1.043411601105914e+00 }, - { 4.487993076694813e-11, -1.072399482811314e+00 }, - { 6.282953875437751e-11, -1.101422278938424e+00 }, - { 8.077678124517653e-11, -1.130478888291020e+00 }, - { 9.872098253591082e-11, -1.159568205565684e+00 }, - { 1.166614670373367e-10, -1.188689121393192e+00 }, - { 1.345975593005002e-10, -1.217840522381901e+00 }, - { 1.525285840416718e-10, -1.247021291159495e+00 }, - { 1.704538661678104e-10, -1.276230306415868e+00 }, - { 1.883727308022916e-10, -1.305466442946703e+00 }, - { 2.062845033098954e-10, -1.334728571696106e+00 }, - { 2.241885093225349e-10, -1.364015559800721e+00 }, - { 2.420840747645085e-10, -1.393326270633325e+00 }, - { 2.599705258779635e-10, -1.422659563847049e+00 }, - { 2.778471892479898e-10, -1.452014295419243e+00 }, - { 2.957133918284542e-10, -1.481389317696831e+00 }, - { 3.135684609667761e-10, -1.510783479440191e+00 }, - { 3.314117244297624e-10, -1.540195625869043e+00 }, - { 3.492425104288060e-10, -1.569624598707558e+00 }, - { 3.670601476445565e-10, -1.599069236228850e+00 }, - { 3.848639652533361e-10, -1.628528373302631e+00 }, - { 4.026532929512281e-10, -1.658000841439269e+00 }, - { 4.204274609803869e-10, -1.687485468837799e+00 }, - { 4.381858001531792e-10, -1.716981080430596e+00 }, - { 4.559276418782829e-10, -1.746486497931567e+00 }, - { 4.736523181853565e-10, -1.776000539882225e+00 }, - { 4.913591617503452e-10, -1.805522021699094e+00 }, - { 5.090475059206794e-10, -1.835049755721194e+00 }, - { 5.267166847401562e-10, -1.864582551257262e+00 }, - { 5.443660329740862e-10, -1.894119214633676e+00 }, - { 5.619948861345454e-10, -1.923658549242818e+00 }, - { 5.796025805053097e-10, -1.953199355591180e+00 }, - { 5.971884531664190e-10, 
-1.982740431347091e+00 }, - { 6.147518420199055e-10, -2.012280571390674e+00 }, - { 6.322920858139346e-10, -2.041818567861395e+00 }, - { 6.498085241682158e-10, -2.071353210208005e+00 }, - { 6.673004975990425e-10, -2.100883285238127e+00 }, - { 6.847673475432746e-10, -2.130407577166309e+00 }, - { 7.022084163838545e-10, -2.159924867664933e+00 }, - { 7.196230474743716e-10, -2.189433935913779e+00 }, - { 7.370105851640495e-10, -2.218933558650552e+00 }, - { 7.543703748217808e-10, -2.248422510220072e+00 }, - { 7.717017628611672e-10, -2.277899562625407e+00 }, - { 7.890040967654542e-10, -2.307363485579104e+00 }, - { 8.062767251113011e-10, -2.336813046552684e+00 }, - { 8.235189975944034e-10, -2.366247010829556e+00 }, - { 8.407302650525749e-10, -2.395664141553858e+00 }, - { 8.579098794915287e-10, -2.425063199784153e+00 }, - { 8.750571941082773e-10, -2.454442944543319e+00 }, - { 8.921715633164894e-10, -2.483802132872044e+00 }, - { 9.092523427695200e-10, -2.513139519878584e+00 }, - { 9.262988893857148e-10, -2.542453858792682e+00 }, - { 9.433105613723914e-10, -2.571743901017465e+00 }, - { 9.602867182493987e-10, -2.601008396180870e+00 }, - { 9.772267208744730e-10, -2.630246092190425e+00 }, - { 9.941299314658458e-10, -2.659455735283526e+00 }, - { 1.010995713627070e-09, -2.688636070081818e+00 }, - { 1.027823432371055e-09, -2.717785839644439e+00 }, - { 1.044612454143997e-09, -2.746903785521352e+00 }, - { 1.061362146848353e-09, -2.775988647805256e+00 }, - { 1.078071879867828e-09, -2.805039165187255e+00 }, - { 1.094741024090249e-09, -2.834054075009077e+00 }, - { 1.111368951931856e-09, -2.863032113318052e+00 }, - { 1.127955037360817e-09, -2.891972014920939e+00 }, - { 1.144498655920037e-09, -2.920872513436805e+00 }, - { 1.160999184751779e-09, -2.949732341353290e+00 }, - { 1.177456002620215e-09, -2.978550230079517e+00 }, - { 1.193868489936097e-09, -3.007324910002949e+00 }, - { 1.210236028777826e-09, -3.036055110540183e+00 }, - { 1.226558002917232e-09, -3.064739560196251e+00 }, - { 1.242833797841123e-09, -3.093376986616735e+00 }, - { 1.259062800774685e-09, -3.121966116643377e+00 }, - { 1.275244400705935e-09, -3.150505676371791e+00 }, - { 1.291377988406056e-09, -3.178994391202159e+00 }, - { 1.307462956454857e-09, -3.207430985899192e+00 }, - { 1.323498699262108e-09, -3.235814184645077e+00 }, - { 1.339484613091842e-09, -3.264142711097884e+00 }, - { 1.355420096082785e-09, -3.292415288443373e+00 }, - { 1.371304548273191e-09, -3.320630639454825e+00 }, - { 1.387137371622433e-09, -3.348787486547389e+00 }, - { 1.402917970033511e-09, -3.376884551834256e+00 }, - { 1.418645749376393e-09, -3.404920557184582e+00 }, - { 1.434320117508396e-09, -3.432894224276359e+00 }, - { 1.449940484298756e-09, -3.460804274656981e+00 }, - { 1.465506261649108e-09, -3.488649429796768e+00 }, - { 1.481016863517580e-09, -3.516428411149154e+00 }, - { 1.496471705937951e-09, -3.544139940202303e+00 }, - { 1.511870207044433e-09, -3.571782738540999e+00 }, - { 1.527211787092206e-09, -3.599355527901174e+00 }, - { 1.542495868479076e-09, -3.626857030226671e+00 }, - { 1.557721875768920e-09, -3.654285967729458e+00 }, - { 1.572889235710329e-09, -3.681641062941412e+00 }, - { 1.587997377261005e-09, -3.708921038776707e+00 }, - { 1.603045731607830e-09, -3.736124618586623e+00 }, - { 1.618033732189314e-09, -3.763250526218862e+00 }, - { 1.632960814715177e-09, -3.790297486071938e+00 }, - { 1.647826417189275e-09, -3.817264223155802e+00 }, - { 1.662629979930247e-09, -3.844149463148589e+00 }, - { 1.677370945591844e-09, -3.870951932452996e+00 }, - { 1.692048759186008e-09, 
-3.897670358257890e+00 }, - { 1.706662868100504e-09, -3.924303468590212e+00 }, - { 1.721212722122685e-09, -3.950849992378278e+00 }, - { 1.735697773458400e-09, -3.977308659506432e+00 }, - { 1.750117476754591e-09, -4.003678200876669e+00 }, - { 1.764471289116712e-09, -4.029957348461003e+00 }, - { 1.778758670132079e-09, -4.056144835364877e+00 }, - { 1.792979081888926e-09, -4.082239395882965e+00 }, - { 1.807131988996465e-09, -4.108239765556996e+00 }, - { 1.821216858606652e-09, -4.134144681236933e+00 }, - { 1.835233160431175e-09, -4.159952881133585e+00 }, - { 1.849180366764537e-09, -4.185663104882633e+00 }, - { 1.863057952502055e-09, -4.211274093599509e+00 }, - { 1.876865395161145e-09, -4.236784589940537e+00 }, - { 1.890602174898734e-09, -4.262193338157148e+00 }, - { 1.904267774533022e-09, -4.287499084158302e+00 }, - { 1.917861679562008e-09, -4.312700575567174e+00 }, - { 1.931383378182392e-09, -4.337796561778708e+00 }, - { 1.944832361310856e-09, -4.362785794021793e+00 }, - { 1.958208122599839e-09, -4.387667025411434e+00 }, - { 1.971510158459931e-09, -4.412439011013396e+00 }, - { 1.984737968076495e-09, -4.437100507898339e+00 }, - { 1.997891053431005e-09, -4.461650275204912e+00 }, - { 2.010968919316289e-09, -4.486087074191693e+00 }, - { 2.023971073358447e-09, -4.510409668301784e+00 }, - { 2.036897026033634e-09, -4.534616823217992e+00 }, - { 2.049746290686799e-09, -4.558707306921882e+00 }, - { 2.062518383551274e-09, -4.582679889754607e+00 }, - { 2.075212823764071e-09, -4.606533344469879e+00 }, - { 2.087829133387063e-09, -4.630266446298172e+00 }, - { 2.100366837422912e-09, -4.653877973001258e+00 }, - { 2.112825463835087e-09, -4.677366704934605e+00 }, - { 2.125204543562522e-09, -4.700731425099899e+00 }, - { 2.137503610540056e-09, -4.723970919208608e+00 }, - { 2.149722201714786e-09, -4.747083975738060e+00 }, - { 2.161859857063438e-09, -4.770069385989595e+00 }, - { 2.173916119610994e-09, -4.792925944149308e+00 }, - { 2.185890535445098e-09, -4.815652447340950e+00 }, - { 2.197782653735957e-09, -4.838247695689436e+00 }, - { 2.209592026751962e-09, -4.860710492376411e+00 }, - { 2.221318209877576e-09, -4.883039643700314e+00 }, - { 2.232960761627846e-09, -4.905233959130168e+00 }, - { 2.244519243667616e-09, -4.927292251368517e+00 }, - { 2.255993220826402e-09, -4.949213336406265e+00 }, - { 2.267382261115285e-09, -4.970996033581527e+00 }, - { 2.278685935744269e-09, -4.992639165639563e+00 }, - { 2.289903819135414e-09, -5.014141558784778e+00 }, - { 2.301035488942000e-09, -5.035502042744443e+00 }, - { 2.312080526062763e-09, -5.056719450823151e+00 }, - { 2.323038514659161e-09, -5.077792619963239e+00 }, - { 2.333909042168180e-09, -5.098720390796817e+00 }, - { 2.344691699320969e-09, -5.119501607709159e+00 }, - { 2.355386080156553e-09, -5.140135118892792e+00 }, - { 2.365991782037187e-09, -5.160619776404897e+00 }, - { 2.376508405665132e-09, -5.180954436227641e+00 }, - { 2.386935555094626e-09, -5.201137958319343e+00 }, - { 2.397272837749508e-09, -5.221169206676762e+00 }, - { 2.407519864436774e-09, -5.241047049389645e+00 }, - { 2.417676249362563e-09, -5.260770358700167e+00 }, - { 2.427741610143750e-09, -5.280338011053974e+00 }, - { 2.437715567825576e-09, -5.299748887163106e+00 }, - { 2.447597746894037e-09, -5.319001872058887e+00 }, - { 2.457387775290440e-09, -5.338095855149190e+00 }, - { 2.467085284426756e-09, -5.357029730277389e+00 }, - { 2.476689909196263e-09, -5.375802395772283e+00 }, - { 2.486201287990485e-09, -5.394412754510426e+00 }, - { 2.495619062711154e-09, -5.412859713968929e+00 }, - { 2.504942878785408e-09, 
-5.431142186284682e+00 }, - { 2.514172385175743e-09, -5.449259088303476e+00 }, - { 2.523307234396791e-09, -5.467209341642627e+00 }, - { 2.532347082526785e-09, -5.484991872743321e+00 }, - { 2.541291589219998e-09, -5.502605612925014e+00 }, - { 2.550140417722072e-09, -5.520049498445633e+00 }, - { 2.558893234878378e-09, -5.537322470548212e+00 }, - { 2.567549711150773e-09, -5.554423475524196e+00 }, - { 2.576109520627371e-09, -5.571351464763084e+00 }, - { 2.584572341037361e-09, -5.588105394812198e+00 }, - { 2.592937853759161e-09, -5.604684227423386e+00 }, - { 2.601205743836355e-09, -5.621086929615246e+00 }, - { 2.609375699987564e-09, -5.637312473723475e+00 }, - { 2.617447414618146e-09, -5.653359837454964e+00 }, - { 2.625420583833750e-09, -5.669228003945694e+00 }, - { 2.633294907447937e-09, -5.684915961806963e+00 }, - { 2.641070088997271e-09, -5.700422705186584e+00 }, - { 2.648745835750128e-09, -5.715747233817712e+00 }, - { 2.656321858720176e-09, -5.730888553077074e+00 }, - { 2.663797872673252e-09, -5.745845674030161e+00 }, - { 2.671173596142054e-09, -5.760617613492118e+00 }, - { 2.678448751434797e-09, -5.775203394076705e+00 }, - { 2.685623064645538e-09, -5.789602044248679e+00 }, - { 2.692696265666640e-09, -5.803812598380606e+00 }, - { 2.699668088194915e-09, -5.817834096797069e+00 }, - { 2.706538269745573e-09, -5.831665585834668e+00 }, - { 2.713306551659817e-09, -5.845306117889361e+00 }, - { 2.719972679116734e-09, -5.858754751472542e+00 }, - { 2.726536401139295e-09, -5.872010551255358e+00 }, - { 2.732997470607439e-09, -5.885072588127400e+00 }, - { 2.739355644265558e-09, -5.897939939244211e+00 }, - { 2.745610682731633e-09, -5.910611688078208e+00 }, - { 2.751762350508137e-09, -5.923086924473290e+00 }, - { 2.757810415987146e-09, -5.935364744687794e+00 }, - { 2.763754651462700e-09, -5.947444251452243e+00 }, - { 2.769594833137415e-09, -5.959324554015538e+00 }, - { 2.775330741132843e-09, -5.971004768198829e+00 }, - { 2.780962159494174e-09, -5.982484016437981e+00 }, - { 2.786488876202047e-09, -5.993761427840588e+00 }, - { 2.791910683178690e-09, -6.004836138231525e+00 }, - { 2.797227376295779e-09, -6.015707290202086e+00 }, - { 2.802438755383971e-09, -6.026374033162623e+00 }, - { 2.807544624236659e-09, -6.036835523383457e+00 }, - { 2.812544790621093e-09, -6.047090924050914e+00 }, - { 2.817439066283459e-09, -6.057139405311101e+00 }, - { 2.822227266958278e-09, -6.066980144322601e+00 }, - { 2.826909212371261e-09, -6.076612325295799e+00 }, - { 2.831484726250221e-09, -6.086035139548830e+00 }, - { 2.835953636329660e-09, -6.095247785550617e+00 }, - { 2.840315774357203e-09, -6.104249468967751e+00 }, - { 2.844570976102082e-09, -6.113039402715685e+00 }, - { 2.848719081357095e-09, -6.121616806996519e+00 }, - { 2.852759933948860e-09, -6.129980909353977e+00 }, - { 2.856693381741114e-09, -6.138130944714082e+00 }, - { 2.860519276643053e-09, -6.146066155436312e+00 }, - { 2.864237474610633e-09, -6.153785791350256e+00 }, - { 2.867847835656203e-09, -6.161289109809551e+00 }, - { 2.871350223851726e-09, -6.168575375732642e+00 }, - { 2.874744507333867e-09, -6.175643861647406e+00 }, - { 2.878030558310989e-09, -6.182493847739853e+00 }, - { 2.881208253063899e-09, -6.189124621889823e+00 }, - { 2.884277471954592e-09, -6.195535479723423e+00 }, - { 2.887238099428306e-09, -6.201725724651554e+00 }, - { 2.890090024020323e-09, -6.207694667918394e+00 }, - { 2.892833138356060e-09, -6.213441628635915e+00 }, - { 2.895467339159240e-09, -6.218965933835304e+00 }, - { 2.897992527253659e-09, -6.224266918505075e+00 }, - { 2.900408607567016e-09, 
-6.229343925633495e+00 }, - { 2.902715489136496e-09, -6.234196306254763e+00 }, - { 2.904913085108075e-09, -6.238823419482017e+00 }, - { 2.907001312743911e-09, -6.243224632557377e+00 }, - { 2.908980093422997e-09, -6.247399320887848e+00 }, - { 2.910849352646620e-09, -6.251346868091392e+00 }, - { 2.912609020036956e-09, -6.255066666028537e+00 }, - { 2.914259029343965e-09, -6.258558114851525e+00 }, - { 2.915799318445710e-09, -6.261820623039620e+00 }, - { 2.917229829350759e-09, -6.264853607438842e+00 }, - { 2.918550508202463e-09, -6.267656493305673e+00 }, - { 2.919761305276718e-09, -6.270228714337005e+00 }, - { 2.920862174988150e-09, -6.272569712717951e+00 }, - { 2.921853075889193e-09, -6.274678939154603e+00 }, - { 2.922733970674264e-09, -6.276555852917634e+00 }, - { 2.923504826176907e-09, -6.278199921870962e+00 }, - { 2.924165613375264e-09, -6.279610622518139e+00 }, - { 2.924716307391075e-09, -6.280787440034993e+00 }, - { 2.925156887490598e-09, -6.281729868306345e+00 }, - { 2.925487337087508e-09, -6.282437409966992e+00 }, - { 2.925707643739298e-09, -6.282909576428774e+00 }, - { 2.925817799151970e-09, -6.283145887925411e+00 }, + {2.925817799165007e-09, 7.219194364267018e-09}, + {2.925707643778599e-09, 2.526699001579799e-07}, + {2.925487337153070e-09, 1.191140162167675e-06}, + {2.925156887582842e-09, 3.284585035595589e-06}, + {2.924716307509151e-09, 6.994872605695784e-06}, + {2.924165613519592e-09, 1.278374920658798e-05}, + {2.923504826347475e-09, 2.111280464718590e-05}, + {2.922733970871080e-09, 3.244343744537165e-05}, + {2.921853076112655e-09, 4.723682007436170e-05}, + {2.920862175237416e-09, 6.595386421935634e-05}, + {2.919761305552202e-09, 8.905518605213658e-05}, + {2.918550508504146e-09, 1.170010715193098e-04}, + {2.917229829679050e-09, 1.502514416517192e-04}, + {2.915799318799769e-09, 1.892658178912071e-04}, + {2.914259029724184e-09, 2.345032874456615e-04}, + {2.912609020443340e-09, 2.864224686607020e-04}, + {2.910849353079123e-09, 3.454814764261432e-04}, + {2.908980093882049e-09, 4.121378876027343e-04}, + {2.907001313228646e-09, 4.868487064877691e-04}, + {2.904913085618902e-09, 5.700703303049837e-04}, + {2.902715489673383e-09, 6.622585147355725e-04}, + {2.900408608130373e-09, 7.638683394782519e-04}, + {2.897992527842612e-09, 8.753541738578119e-04}, + {2.895467339774186e-09, 9.971696424604937e-04}, + {2.892833138996999e-09, 1.129767590823255e-03}, + {2.890090024687216e-09, 1.273600051161478e-03}, + {2.887238100121550e-09, 1.429118208142094e-03}, + {2.884277472673313e-09, 1.596772364709564e-03}, + {2.881208253808507e-09, 1.777011907950626e-03}, + {2.878030559081432e-09, 1.970285275029487e-03}, + {2.874744508130554e-09, 2.177039919152579e-03}, + {2.871350224673798e-09, 2.397722275614272e-03}, + {2.867847836504030e-09, 2.632777727878843e-03}, + {2.864237475484149e-09, 2.882650573737405e-03}, + {2.860519277542297e-09, 3.147783991507308e-03}, + {2.856693382666432e-09, 3.428620006328931e-03}, + {2.852759934899389e-09, 3.725599456482154e-03}, + {2.848719082333207e-09, 4.039161959812243e-03}, + {2.844570977103752e-09, 4.369745880190706e-03}, + {2.840315775384800e-09, 4.717788294077374e-03}, + {2.835953637382310e-09, 5.083724957128360e-03}, + {2.831484727328322e-09, 5.467990270896617e-03}, + {2.826909213474759e-09, 5.871017249604038e-03}, + {2.822227268087134e-09, 6.293237486988512e-03}, + {2.817439067438018e-09, 6.735081123237729e-03}, + {2.812544791800534e-09, 7.196976811989608e-03}, + {2.807544625441273e-09, 7.679351687456759e-03}, + {2.802438756613836e-09, 8.182631331563162e-03}, + 
{2.797227377551135e-09, 8.707239741274575e-03}, + {2.791910684458716e-09, 9.253599295902304e-03}, + {2.786488877507140e-09, 9.822130724578715e-03}, + {2.780962160824228e-09, 1.041325307382490e-02}, + {2.775330742487884e-09, 1.102738367513773e-02}, + {2.769594834517682e-09, 1.166493811278924e-02}, + {2.763754652867477e-09, 1.232633019159818e-02}, + {2.757810417416620e-09, 1.301197190494069e-02}, + {2.751762351962413e-09, 1.372227340270610e-02}, + {2.745610684210923e-09, 1.445764295952962e-02}, + {2.739355645769094e-09, 1.521848694296229e-02}, + {2.732997472135539e-09, 1.600520978188769e-02}, + {2.726536402691907e-09, 1.681821393496225e-02}, + {2.719972680693777e-09, 1.765789985920713e-02}, + {2.713306553261610e-09, 1.852466597868779e-02}, + {2.706538271371373e-09, 1.941890865333146e-02}, + {2.699668089844909e-09, 2.034102214787814e-02}, + {2.692696267340880e-09, 2.129139860085272e-02}, + {2.685623066344263e-09, 2.227042799383416e-02}, + {2.678448753157212e-09, 2.327849812064098e-02}, + {2.671173597888530e-09, 2.431599455681316e-02}, + {2.663797874443630e-09, 2.538330062913108e-02}, + {2.656321860514457e-09, 2.648079738524795e-02}, + {2.648745837568575e-09, 2.760886356354952e-02}, + {2.641070090839117e-09, 2.876787556300114e-02}, + {2.633294909313421e-09, 2.995820741329835e-02}, + {2.625420585722845e-09, 3.118023074495535e-02}, + {2.617447416531143e-09, 3.243431475972608e-02}, + {2.609375701923643e-09, 3.372082620101990e-02}, + {2.601205745795833e-09, 3.504012932452527e-02}, + {2.592937855741933e-09, 3.639258586895711e-02}, + {2.584572343043400e-09, 3.777855502693250e-02}, + {2.576109522656942e-09, 3.919839341605197e-02}, + {2.567549713203028e-09, 4.065245505002102e-02}, + {2.558893236953688e-09, 4.214109131001403e-02}, + {2.550140419820252e-09, 4.366465091617666e-02}, + {2.541291591341445e-09, 4.522347989919473e-02}, + {2.532347084670572e-09, 4.681792157215026e-02}, + {2.523307236563343e-09, 4.844831650239501e-02}, + {2.514172387364900e-09, 5.011500248369893e-02}, + {2.504942880997064e-09, 5.181831450849345e-02}, + {2.495619064945627e-09, 5.355858474024022e-02}, + {2.486201290246928e-09, 5.533614248606705e-02}, + {2.476689911475047e-09, 5.715131416942842e-02}, + {2.467085286727668e-09, 5.900442330315692e-02}, + {2.457387777613798e-09, 6.089579046229943e-02}, + {2.447597749239101e-09, 6.282573325755320e-02}, + {2.437715570192557e-09, 6.479456630859221e-02}, + {2.427741612532542e-09, 6.680260121764925e-02}, + {2.417676251773166e-09, 6.885014654319160e-02}, + {2.407519866869294e-09, 7.093750777401114e-02}, + {2.397272840203310e-09, 7.306498730310884e-02}, + {2.386935557569868e-09, 7.523288440214027e-02}, + {2.376508408161815e-09, 7.744149519577415e-02}, + {2.365991784555363e-09, 7.969111263635709e-02}, + {2.355386082695641e-09, 8.198202647865405e-02}, + {2.344691701881232e-09, 8.431452325495814e-02}, + {2.333909044749407e-09, 8.668888625021409e-02}, + {2.323038517261246e-09, 8.910539547731611e-02}, + {2.312080528685971e-09, 9.156432765274414e-02}, + {2.301035491585642e-09, 9.406595617227698e-02}, + {2.289903821799651e-09, 9.661055108691619e-02}, + {2.278685938428940e-09, 9.919837907903295e-02}, + {2.267382263820762e-09, 1.018297034385580e-01}, + {2.255993223551837e-09, 1.045047840397028e-01}, + {2.244519246413220e-09, 1.072238773174577e-01}, + {2.232960764393620e-09, 1.099872362446146e-01}, + {2.221318212663309e-09, 1.127951103088245e-01}, + {2.209592029557811e-09, 1.156477454898748e-01}, + {2.197782656561395e-09, 1.185453842371912e-01}, + {2.185890538290176e-09, 1.214882654476019e-01}, + 
{2.173916122475606e-09, 1.244766244431883e-01}, + {2.161859859947797e-09, 1.275106929493488e-01}, + {2.149722204618256e-09, 1.305906990731841e-01}, + {2.137503613462743e-09, 1.337168672820376e-01}, + {2.125204546504321e-09, 1.368894183821595e-01}, + {2.112825466795944e-09, 1.401085694976751e-01}, + {2.100366840402933e-09, 1.433745340497602e-01}, + {2.087829136385612e-09, 1.466875217359607e-01}, + {2.075212826781308e-09, 1.500477385098620e-01}, + {2.062518386587093e-09, 1.534553865607503e-01}, + {2.049746293741359e-09, 1.569106642937665e-01}, + {2.036897029106193e-09, 1.604137663100403e-01}, + {2.023971076449323e-09, 1.639648833871233e-01}, + {2.010968922425217e-09, 1.675642024598467e-01}, + {1.997891056557933e-09, 1.712119066008896e-01}, + {1.984737971221581e-09, 1.749081750021970e-01}, + {1.971510161622434e-09, 1.786531829561379e-01}, + {1.958208125780130e-09, 1.824471018371070e-01}, + {1.944832364508511e-09, 1.862900990834311e-01}, + {1.931383381397782e-09, 1.901823381790926e-01}, + {1.917861682794392e-09, 1.941239786363039e-01}, + {1.904267777782611e-09, 1.981151759777950e-01}, + {1.890602178165317e-09, 2.021560817195309e-01}, + {1.876865398444616e-09, 2.062468433536743e-01}, + {1.863057955802572e-09, 2.103876043317229e-01}, + {1.849180370081465e-09, 2.145785040479915e-01}, + {1.835233163764673e-09, 2.188196778231083e-01}, + {1.821216861956509e-09, 2.231112568880342e-01}, + {1.807131992362945e-09, 2.274533683680190e-01}, + {1.792979085271234e-09, 2.318461352671018e-01}, + {1.778758673530482e-09, 2.362896764525300e-01}, + {1.764471292530943e-09, 2.407841066397789e-01}, + {1.750117480184598e-09, 2.453295363773890e-01}, + {1.735697776904342e-09, 2.499260720324433e-01}, + {1.721212725583874e-09, 2.545738157760434e-01}, + {1.706662871577097e-09, 2.592728655691494e-01}, + {1.692048762677849e-09, 2.640233151485341e-01}, + {1.677370949099090e-09, 2.688252540131204e-01}, + {1.662629983452104e-09, 2.736787674105404e-01}, + {1.647826420726167e-09, 2.785839363237506e-01}, + {1.632960818266680e-09, 2.835408374583758e-01}, + {1.618033735755429e-09, 2.885495432295704e-01}, + {1.603045735188609e-09, 2.936101217498361e-01}, + {1.587997380855918e-09, 2.987226368167127e-01}, + {1.572889239319430e-09, 3.038871479007593e-01}, + {1.557721879392051e-09, 3.091037101339017e-01}, + {1.542495872116447e-09, 3.143723742978435e-01}, + {1.527211790743024e-09, 3.196931868130269e-01}, + {1.511870210708909e-09, 3.250661897274744e-01}, + {1.496471709615926e-09, 3.304914207062036e-01}, + {1.481016867208896e-09, 3.359689130207621e-01}, + {1.465506265353924e-09, 3.414986955389885e-01}, + {1.449940488016384e-09, 3.470807927151147e-01}, + {1.434320121238994e-09, 3.527152245800635e-01}, + {1.418645753119802e-09, 3.584020067320109e-01}, + {1.402917973789838e-09, 3.641411503272979e-01}, + {1.387137375391042e-09, 3.699326620714776e-01}, + {1.371304552054134e-09, 3.757765442106153e-01}, + {1.355420099875958e-09, 3.816727945230153e-01}, + {1.339484616897137e-09, 3.876214063110671e-01}, + {1.323498703079580e-09, 3.936223683933865e-01}, + {1.307462960283922e-09, 3.996756650972121e-01}, + {1.291377992246768e-09, 4.057812762511174e-01}, + {1.275244404558188e-09, 4.119391771778626e-01}, + {1.259062804638585e-09, 4.181493386877248e-01}, + {1.242833801715929e-09, 4.244117270719281e-01}, + {1.226558006803155e-09, 4.307263040962509e-01}, + {1.210236032674760e-09, 4.370930269951803e-01}, + {1.193868493843725e-09, 4.435118484661861e-01}, + {1.177456006538695e-09, 4.499827166641340e-01}, + {1.160999188680582e-09, 4.565055751961679e-01}, + 
{1.144498659859216e-09, 4.630803631168164e-01}, + {1.127955041310214e-09, 4.697070149232604e-01}, + {1.111368955891417e-09, 4.763854605510119e-01}, + {1.094741028059551e-09, 4.831156253697562e-01}, + {1.078071883846871e-09, 4.898974301794375e-01}, + {1.061362150836978e-09, 4.967307912069362e-01}, + {1.044612458142151e-09, 5.036156201023686e-01}, + {1.027823436378632e-09, 5.105518239364775e-01}, + {1.010995717643647e-09, 5.175393051975563e-01}, + {9.941299354913699e-10, 5.245779617890562e-01}, + {9.772267249089968e-10, 5.316676870274011e-01}, + {9.602867222926046e-10, 5.388083696401416e-01}, + {9.433105654240147e-10, 5.459998937639375e-01}, + {9.262988934458084e-10, 5.532421389435711e-01}, + {9.092523468378193e-10, 5.605349801305876e-01}, + {8.921715673928355e-10, 5.678782876825250e-01}, + {8.750571981926701e-10, 5.752719273622372e-01}, + {8.579098835836508e-10, 5.827157603377209e-01}, + {8.407302691522673e-10, 5.902096431821322e-01}, + {8.235190017016133e-10, 5.977534278737073e-01}, + {8.062767292259225e-10, 6.053469617967722e-01}, + {7.890041008871165e-10, 6.129900877421282e-01}, + {7.717017669898175e-10, 6.206826439083659e-01}, + {7.543703789572603e-10, 6.284244639030392e-01}, + {7.370105893063053e-10, 6.362153767444958e-01}, + {7.196230516231919e-10, 6.440552068636356e-01}, + {7.022084205389746e-10, 6.519437741060674e-01}, + {6.847673517046416e-10, 6.598808937346672e-01}, + {6.673005017664976e-10, 6.678663764322770e-01}, + {6.498085283416530e-10, 6.759000283046127e-01}, + {6.322920899929834e-10, 6.839816508836737e-01}, + {6.147518462045659e-10, 6.921110411311926e-01}, + {5.971884573565851e-10, 7.002879914425926e-01}, + {5.796025847007168e-10, 7.085122896509806e-01}, + {5.619948903351406e-10, 7.167837190315758e-01}, + {5.443660371796048e-10, 7.251020583063744e-01}, + {5.267166889504394e-10, 7.334670816491009e-01}, + {5.090475101356742e-10, 7.418785586903696e-01}, + {4.913591659698399e-10, 7.503362545232619e-01}, + {4.736523224091392e-10, 7.588399297089872e-01}, + {4.559276461062478e-10, 7.673893402829834e-01}, + {4.381858043851147e-10, 7.759842377612828e-01}, + {4.204274652161870e-10, 7.846243691469355e-01}, + {4.026532971908398e-10, 7.933094769370790e-01}, + {3.848639694963359e-10, 8.020392991300200e-01}, + {3.670601518910503e-10, 8.108135692324444e-01}, + {3.492425146784233e-10, 8.196320162675177e-01}, + {3.314117286825031e-10, 8.284943647824689e-01}, + {3.135684652223755e-10, 8.374003348569865e-01}, + {2.957133960867535e-10, 8.463496421118015e-01}, + {2.778471935089361e-10, 8.553419977173513e-01}, + {2.599705301412391e-10, 8.643771084029740e-01}, + {2.420840790301135e-10, 8.734546764660205e-01}, + {2.241885135902046e-10, 8.825743997817682e-01}, + {2.062845075795238e-10, 8.917359718130367e-01}, + {1.883727350736140e-10, 9.009390816205823e-01}, + {1.704538704408269e-10, 9.101834138731877e-01}, + {1.525285883160648e-10, 9.194686488588080e-01}, + {1.345975635762696e-10, 9.287944624950824e-01}, + {1.166614713141648e-10, 9.381605263410157e-01}, + {9.872098681369190e-11, 9.475665076080466e-01}, + {8.077678552380464e-11, 9.570120691722380e-01}, + {6.282954303364090e-11, 9.664968695860140e-01}, + {4.487993504668797e-11, 9.760205630906909e-01}, + {2.692863735553042e-11, 9.855827996289697e-01}, + {8.976325816439114e-12, 9.951832248577780e-01}, + {-8.976323676304494e-12, 1.004821480161519e+00}, + {-2.692863521550168e-11, 1.014497202665280e+00}, + {-4.487993290681805e-11, 1.024210025248670e+00}, + {-6.282954089398273e-11, 1.033959576559617e+00}, + {-8.077678338451706e-11, 1.043745481028715e+00}, + 
{-9.872098467477489e-11, 1.053567358883467e+00}, + {-1.166614691757772e-10, 1.063424826163223e+00}, + {-1.345975614383584e-10, 1.073317494734013e+00}, + {-1.525285861788948e-10, 1.083244972303963e+00}, + {-1.704538683042922e-10, 1.093206862438572e+00}, + {-1.883727329379793e-10, 1.103202764576806e+00}, + {-2.062845054446831e-10, 1.113232274046796e+00}, + {-2.241885114563697e-10, 1.123294982082432e+00}, + {-2.420840768973375e-10, 1.133390475839767e+00}, + {-2.599705280096278e-10, 1.143518338413855e+00}, + {-2.778471913784365e-10, 1.153678148855860e+00}, + {-2.957133939575774e-10, 1.163869482190458e+00}, + {-3.135684630945758e-10, 1.174091909433296e+00}, + {-3.314117265561857e-10, 1.184344997608959e+00}, + {-3.492425125535882e-10, 1.194628309769018e+00}, + {-3.670601497678034e-10, 1.204941405010466e+00}, + {-3.848639673748360e-10, 1.215283838494269e+00}, + {-4.026532950710339e-10, 1.225655161464298e+00}, + {-4.204274630982869e-10, 1.236054921266445e+00}, + {-4.381858022691734e-10, 1.246482661367958e+00}, + {-4.559276439922654e-10, 1.256937921377146e+00}, + {-4.736523202972214e-10, 1.267420237063216e+00}, + {-4.913591638600925e-10, 1.277929140376502e+00}, + {-5.090475080282032e-10, 1.288464159468706e+00}, + {-5.267166868452449e-10, 1.299024818713528e+00}, + {-5.443660350768455e-10, 1.309610638727845e+00}, + {-5.619948882348695e-10, 1.320221136392390e+00}, + {-5.796025826029868e-10, 1.330855824873457e+00}, + {-5.971884552615020e-10, 1.341514213644420e+00}, + {-6.147518441122357e-10, 1.352195808507556e+00}, + {-6.322920879034590e-10, 1.362900111616144e+00}, + {-6.498085262549874e-10, 1.373626621496939e+00}, + {-6.673004996827436e-10, 1.384374833072571e+00}, + {-6.847673496239581e-10, 1.395144237684605e+00}, + {-7.022084184613616e-10, 1.405934323116231e+00}, + {-7.196230495488082e-10, 1.416744573616104e+00}, + {-7.370105872352039e-10, 1.427574469921397e+00}, + {-7.543703768894941e-10, 1.438423489281758e+00}, + {-7.717017649255453e-10, 1.449291105483472e+00}, + {-7.890040988262324e-10, 1.460176788873383e+00}, + {-8.062767271686383e-10, 1.471080006383765e+00}, + {-8.235189996479819e-10, 1.482000221556656e+00}, + {-8.407302671024475e-10, 1.492936894569018e+00}, + {-8.579098815375368e-10, 1.503889482257845e+00}, + {-8.750571961505266e-10, 1.514857438145604e+00}, + {-8.921715653546624e-10, 1.525840212465756e+00}, + {-9.092523448036167e-10, 1.536837252188703e+00}, + {-9.262988914157881e-10, 1.547848001047890e+00}, + {-9.433105633981766e-10, 1.558871899565883e+00}, + {-9.602867202711075e-10, 1.569908385081254e+00}, + {-9.772267228916820e-10, 1.580956891774897e+00}, + {-9.941299334786078e-10, 1.592016850697478e+00}, + {-1.010995715635332e-09, 1.603087689796053e+00}, + {-1.027823434374870e-09, 1.614168833942028e+00}, + {-1.044612456143047e-09, 1.625259704958335e+00}, + {-1.061362148842745e-09, 1.636359721647526e+00}, + {-1.078071881857297e-09, 1.647468299819543e+00}, + {-1.094741026074900e-09, 1.658584852320419e+00}, + {-1.111368953911690e-09, 1.669708789060341e+00}, + {-1.127955039335462e-09, 1.680839517042381e+00}, + {-1.144498657889600e-09, 1.691976440391624e+00}, + {-1.160999186716154e-09, 1.703118960383971e+00}, + {-1.177456004579561e-09, 1.714266475475616e+00}, + {-1.193868491889832e-09, 1.725418381332405e+00}, + {-1.210236030726319e-09, 1.736574070859850e+00}, + {-1.226558004860220e-09, 1.747732934232508e+00}, + {-1.242833799778447e-09, 1.758894358924547e+00}, + {-1.259062802706714e-09, 1.770057729740021e+00}, + {-1.275244402631982e-09, 1.781222428842935e+00}, + {-1.291377990326492e-09, 
1.792387835788660e+00}, + {-1.307462958369363e-09, 1.803553327553897e+00}, + {-1.323498701170897e-09, 1.814718278568759e+00}, + {-1.339484614994490e-09, 1.825882060747428e+00}, + {-1.355420097979292e-09, 1.837044043519582e+00}, + {-1.371304550163662e-09, 1.848203593862598e+00}, + {-1.387137373506711e-09, 1.859360076332671e+00}, + {-1.402917971911754e-09, 1.870512853097495e+00}, + {-1.418645751248018e-09, 1.881661283967967e+00}, + {-1.434320119373722e-09, 1.892804726431080e+00}, + {-1.449940486157623e-09, 1.903942535681972e+00}, + {-1.465506263501516e-09, 1.915074064656886e+00}, + {-1.481016865363264e-09, 1.926198664066737e+00}, + {-1.496471707776859e-09, 1.937315682428795e+00}, + {-1.511870208876724e-09, 1.948424466101625e+00}, + {-1.527211788917509e-09, 1.959524359317042e+00}, + {-1.542495870297867e-09, 1.970614704215133e+00}, + {-1.557721877580406e-09, 1.981694840876775e+00}, + {-1.572889237514880e-09, 1.992764107358707e+00}, + {-1.587997379058514e-09, 2.003821839726753e+00}, + {-1.603045733398246e-09, 2.014867372090665e+00}, + {-1.618033733972424e-09, 2.025900036638798e+00}, + {-1.632960816490822e-09, 2.036919163671778e+00}, + {-1.647826418957721e-09, 2.047924081638631e+00}, + {-1.662629981691070e-09, 2.058914117170269e+00}, + {-1.677370947345626e-09, 2.069888595116115e+00}, + {-1.692048760931849e-09, 2.080846838577820e+00}, + {-1.706662869838827e-09, 2.091788168946183e+00}, + {-1.721212723853279e-09, 2.102711905935372e+00}, + {-1.735697775181424e-09, 2.113617367619504e+00}, + {-1.750117478469621e-09, 2.124503870468520e+00}, + {-1.764471290823748e-09, 2.135370729383332e+00}, + {-1.778758671831281e-09, 2.146217257733207e+00}, + {-1.792979083579974e-09, 2.157042767390815e+00}, + {-1.807131990679890e-09, 2.167846568770014e+00}, + {-1.821216860281448e-09, 2.178627970860822e+00}, + {-1.835233162097977e-09, 2.189386281268046e+00}, + {-1.849180368423027e-09, 2.200120806246095e+00}, + {-1.863057954152340e-09, 2.210830850737588e+00}, + {-1.876865396802907e-09, 2.221515718409926e+00}, + {-1.890602176531920e-09, 2.232174711691990e+00}, + {-1.904267776157843e-09, 2.242807131812679e+00}, + {-1.917861681178094e-09, 2.253412278837029e+00}, + {-1.931383379790273e-09, 2.263989451705295e+00}, + {-1.944832362909578e-09, 2.274537948269257e+00}, + {-1.958208124189984e-09, 2.285057065331676e+00}, + {-1.971510160041235e-09, 2.295546098682665e+00}, + {-1.984737969649064e-09, 2.306004343138794e+00}, + {-1.997891054994522e-09, 2.316431092581699e+00}, + {-2.010968920870647e-09, 2.326825639994779e+00}, + {-2.023971074903858e-09, 2.337187277503834e+00}, + {-2.036897027569834e-09, 2.347515296413520e+00}, + {-2.049746292214264e-09, 2.357808987247877e+00}, + {-2.062518385069210e-09, 2.368067639787542e+00}, + {-2.075212825272584e-09, 2.378290543109652e+00}, + {-2.087829134886364e-09, 2.388476985626922e+00}, + {-2.100366838912949e-09, 2.398626255125417e+00}, + {-2.112825465315542e-09, 2.408737638805759e+00}, + {-2.125204545033289e-09, 2.418810423320288e+00}, + {-2.137503612001452e-09, 2.428843894814472e+00}, + {-2.149722203166389e-09, 2.438837338964302e+00}, + {-2.161859858505829e-09, 2.448790041018174e+00}, + {-2.173916121043380e-09, 2.458701285834241e+00}, + {-2.185890536867478e-09, 2.468570357921585e+00}, + {-2.197782655148702e-09, 2.478396541480230e+00}, + {-2.209592028154913e-09, 2.488179120439544e+00}, + {-2.221318211270522e-09, 2.497917378500214e+00}, + {-2.232960763010574e-09, 2.507610599172123e+00}, + {-2.244519245040444e-09, 2.517258065817044e+00}, + {-2.255993222189014e-09, 2.526859061686102e+00}, + 
{-2.267382262468209e-09, 2.536412869962689e+00}, + {-2.278685937086658e-09, 2.545918773800664e+00}, + {-2.289903820467374e-09, 2.555376056366064e+00}, + {-2.301035490263848e-09, 2.564784000877677e+00}, + {-2.312080527374447e-09, 2.574141890646339e+00}, + {-2.323038515960257e-09, 2.583449009117307e+00}, + {-2.333909043458635e-09, 2.592704639909166e+00}, + {-2.344691700601153e-09, 2.601908066856634e+00}, + {-2.355386081425938e-09, 2.611058574048749e+00}, + {-2.365991783296513e-09, 2.620155445872768e+00}, + {-2.376508406913500e-09, 2.629197967052127e+00}, + {-2.386935556332088e-09, 2.638185422689490e+00}, + {-2.397272838976436e-09, 2.647117098307332e+00}, + {-2.407519865653114e-09, 2.655992279887846e+00}, + {-2.417676250567891e-09, 2.664810253915885e+00}, + {-2.427741611338014e-09, 2.673570307418169e+00}, + {-2.437715569009093e-09, 2.682271728006635e+00}, + {-2.447597748066437e-09, 2.690913803917100e+00}, + {-2.457387776452357e-09, 2.699495824053297e+00}, + {-2.467085285577292e-09, 2.708017078025636e+00}, + {-2.476689910335470e-09, 2.716476856194105e+00}, + {-2.486201289118733e-09, 2.724874449709689e+00}, + {-2.495619063828443e-09, 2.733209150554255e+00}, + {-2.504942879891263e-09, 2.741480251583985e+00}, + {-2.514172386270163e-09, 2.749687046568741e+00}, + {-2.523307235480146e-09, 2.757828830235740e+00}, + {-2.532347083598520e-09, 2.765904898308531e+00}, + {-2.541291590280960e-09, 2.773914547551261e+00}, + {-2.550140418771202e-09, 2.781857075807392e+00}, + {-2.558893235915887e-09, 2.789731782043156e+00}, + {-2.567549712176927e-09, 2.797537966388929e+00}, + {-2.576109521642196e-09, 2.805274930179221e+00}, + {-2.584572342040407e-09, 2.812941975996573e+00}, + {-2.592937854750428e-09, 2.820538407710556e+00}, + {-2.601205744816134e-09, 2.828063530521908e+00}, + {-2.609375700955458e-09, 2.835516651001539e+00}, + {-2.617447415574869e-09, 2.842897077134583e+00}, + {-2.625420584778350e-09, 2.850204118359573e+00}, + {-2.633294908380520e-09, 2.857437085611509e+00}, + {-2.641070089918234e-09, 2.864595291363663e+00}, + {-2.648745836659391e-09, 2.871678049666939e+00}, + {-2.656321859617343e-09, 2.878684676194483e+00}, + {-2.663797873558322e-09, 2.885614488280000e+00}, + {-2.671173597015318e-09, 2.892466804962122e+00}, + {-2.678448752295859e-09, 2.899240947023252e+00}, + {-2.685623065495139e-09, 2.905936237033475e+00}, + {-2.692696266503800e-09, 2.912551999389617e+00}, + {-2.699668089019767e-09, 2.919087560358171e+00}, + {-2.706538270558513e-09, 2.925542248116882e+00}, + {-2.713306552460767e-09, 2.931915392794031e+00}, + {-2.719972679905295e-09, 2.938206326512581e+00}, + {-2.726536401915442e-09, 2.944414383428562e+00}, + {-2.732997471371516e-09, 2.950538899775061e+00}, + {-2.739355645017194e-09, 2.956579213900666e+00}, + {-2.745610683471516e-09, 2.962534666313284e+00}, + {-2.751762351235315e-09, 2.968404599718795e+00}, + {-2.757810416701751e-09, 2.974188359063684e+00}, + {-2.763754652165128e-09, 2.979885291576143e+00}, + {-2.769594833827588e-09, 2.985494746805227e+00}, + {-2.775330741810390e-09, 2.991016076664491e+00}, + {-2.780962160159068e-09, 2.996448635469842e+00}, + {-2.786488876854607e-09, 3.001791779983262e+00}, + {-2.791910683818570e-09, 3.007044869450794e+00}, + {-2.797227376923695e-09, 3.012207265645876e+00}, + {-2.802438755998943e-09, 3.017278332907412e+00}, + {-2.807544624838820e-09, 3.022257438182037e+00}, + {-2.812544791210840e-09, 3.027143951064684e+00}, + {-2.817439066860792e-09, 3.031937243837070e+00}, + {-2.822227267522746e-09, 3.036636691510884e+00}, + {-2.826909212922864e-09, 
3.041241671864994e+00}, + {-2.831484726789317e-09, 3.045751565488710e+00}, + {-2.835953636855826e-09, 3.050165755818853e+00}, + {-2.840315774871260e-09, 3.054483629182857e+00}, + {-2.844570976602957e-09, 3.058704574835744e+00}, + {-2.848719081844986e-09, 3.062827985002047e+00}, + {-2.852759934424164e-09, 3.066853254915581e+00}, + {-2.856693382203833e-09, 3.070779782857041e+00}, + {-2.860519277092708e-09, 3.074606970196721e+00}, + {-2.864237475047239e-09, 3.078334221430809e+00}, + {-2.867847836080156e-09, 3.081960944223928e+00}, + {-2.871350224262603e-09, 3.085486549445314e+00}, + {-2.874744507732462e-09, 3.088910451211251e+00}, + {-2.878030558696270e-09, 3.092232066921130e+00}, + {-2.881208253436038e-09, 3.095450817298478e+00}, + {-2.884277472313999e-09, 3.098566126429974e+00}, + {-2.887238099774968e-09, 3.101577421802070e+00}, + {-2.890090024353816e-09, 3.104484134342861e+00}, + {-2.892833138676371e-09, 3.107285698457308e+00}, + {-2.895467339466766e-09, 3.109981552069083e+00}, + {-2.897992527547963e-09, 3.112571136655481e+00}, + {-2.900408607848946e-09, 3.115053897289195e+00}, + {-2.902715489404992e-09, 3.117429282673042e+00}, + {-2.904913085363323e-09, 3.119696745180238e+00}, + {-2.907001312986328e-09, 3.121855740892224e+00}, + {-2.908980093652563e-09, 3.123905729634218e+00}, + {-2.910849352862924e-09, 3.125846175016163e+00}, + {-2.912609020239985e-09, 3.127676544466606e+00}, + {-2.914259029534118e-09, 3.129396309273659e+00}, + {-2.915799318622574e-09, 3.131004944618667e+00}, + {-2.917229829515169e-09, 3.132501929616775e+00}, + {-2.918550508353347e-09, 3.133886747350606e+00}, + {-2.919761305414294e-09, 3.135158884909254e+00}, + {-2.920862175112829e-09, 3.136317833424958e+00}, + {-2.921853076000972e-09, 3.137363088107359e+00}, + {-2.922733970772719e-09, 3.138294148283254e+00}, + {-2.923504826262027e-09, 3.139110517429204e+00}, + {-2.924165613447473e-09, 3.139811703211207e+00}, + {-2.924716307449950e-09, 3.140397217517018e+00}, + {-2.925156887536978e-09, 3.140866576495489e+00}, + {-2.925487337120335e-09, 3.141219300588825e+00}, + {-2.925707643758784e-09, 3.141454914570261e+00}, + {-2.925817799158535e-09, 3.141572947579352e+00}, + {-2.925817799171455e-09, 3.141572933154836e+00}, + {-2.925707643798390e-09, 3.141454409272987e+00}, + {-2.925487337185779e-09, 3.141216918378770e+00}, + {-2.925156887628892e-09, 3.140860007424112e+00}, + {-2.924716307568119e-09, 3.140383227898687e+00}, + {-2.924165613591896e-09, 3.139786135867868e+00}, + {-2.923504826432903e-09, 3.139068292003385e+00}, + {-2.922733970969412e-09, 3.138229261619561e+00}, + {-2.921853076224321e-09, 3.137268614707029e+00}, + {-2.920862175361976e-09, 3.136185925964038e+00}, + {-2.919761305690083e-09, 3.134980774833275e+00}, + {-2.918550508654911e-09, 3.133652745531368e+00}, + {-2.917229829843137e-09, 3.132201427085629e+00}, + {-2.915799318976726e-09, 3.130626413363146e+00}, + {-2.914259029914435e-09, 3.128927303107136e+00}, + {-2.912609020646661e-09, 3.127103699965947e+00}, + {-2.910849353295315e-09, 3.125155212527586e+00}, + {-2.908980094111509e-09, 3.123081454351802e+00}, + {-2.907001313470937e-09, 3.120882043999591e+00}, + {-2.904913085874448e-09, 3.118556605068443e+00}, + {-2.902715489941767e-09, 3.116104766219928e+00}, + {-2.900408608411958e-09, 3.113526161214776e+00}, + {-2.897992528137022e-09, 3.110820428940251e+00}, + {-2.895467340081818e-09, 3.107987213444579e+00}, + {-2.892833139317615e-09, 3.105026163964191e+00}, + {-2.890090025020589e-09, 3.101936934956479e+00}, + {-2.887238100468092e-09, 3.098719186130021e+00}, + 
{-2.884277473032614e-09, 3.095372582472161e+00}, + {-2.881208254180937e-09, 3.091896794282404e+00}, + {-2.878030559466594e-09, 3.088291497198199e+00}, + {-2.874744508528832e-09, 3.084556372228054e+00}, + {-2.871350225084755e-09, 3.080691105776848e+00}, + {-2.867847836928063e-09, 3.076695389678615e+00}, + {-2.864237475921086e-09, 3.072568921221621e+00}, + {-2.860519277991847e-09, 3.068311403179147e+00}, + {-2.856693383129018e-09, 3.063922543837792e+00}, + {-2.852759935374575e-09, 3.059402057023109e+00}, + {-2.848719082821403e-09, 3.054749662130841e+00}, + {-2.844570977604520e-09, 3.049965084150782e+00}, + {-2.840315775898525e-09, 3.045048053697736e+00}, + {-2.835953637908582e-09, 3.039998307034967e+00}, + {-2.831484727867511e-09, 3.034815586104635e+00}, + {-2.826909214026628e-09, 3.029499638550941e+00}, + {-2.822227268651470e-09, 3.024050217748861e+00}, + {-2.817439068015245e-09, 3.018467082830179e+00}, + {-2.812544792390175e-09, 3.012749998707001e+00}, + {-2.807544626043751e-09, 3.006898736100911e+00}, + {-2.802438757228650e-09, 3.000913071564665e+00}, + {-2.797227378178760e-09, 2.994792787510961e+00}, + {-2.791910685098702e-09, 2.988537672233504e+00}, + {-2.786488878159805e-09, 2.982147519935565e+00}, + {-2.780962161489413e-09, 2.975622130750641e+00}, + {-2.775330743165298e-09, 2.968961310769028e+00}, + {-2.769594835207775e-09, 2.962164872061613e+00}, + {-2.763754653569747e-09, 2.955232632701135e+00}, + {-2.757810418131543e-09, 2.948164416789036e+00}, + {-2.751762352689432e-09, 2.940960054474719e+00}, + {-2.745610684950541e-09, 2.933619381982341e+00}, + {-2.739355646520809e-09, 2.926142241629213e+00}, + {-2.732997472899722e-09, 2.918528481852205e+00}, + {-2.726536403468318e-09, 2.910777957226018e+00}, + {-2.719972681482232e-09, 2.902890528487386e+00}, + {-2.713306554062453e-09, 2.894866062556452e+00}, + {-2.706538272184154e-09, 2.886704432555728e+00}, + {-2.699668090670078e-09, 2.878405517834426e+00}, + {-2.692696268177908e-09, 2.869969203985464e+00}, + {-2.685623067193599e-09, 2.861395382869544e+00}, + {-2.678448754018380e-09, 2.852683952631486e+00}, + {-2.671173598761847e-09, 2.843834817723832e+00}, + {-2.663797875328991e-09, 2.834847888922988e+00}, + {-2.656321861411517e-09, 2.825723083350459e+00}, + {-2.648745838477759e-09, 2.816460324492298e+00}, + {-2.641070091759922e-09, 2.807059542215146e+00}, + {-2.633294910246296e-09, 2.797520672788269e+00}, + {-2.625420586667340e-09, 2.787843658897949e+00}, + {-2.617447417487602e-09, 2.778028449668942e+00}, + {-2.609375702891616e-09, 2.768075000678399e+00}, + {-2.601205746775692e-09, 2.757983273976943e+00}, + {-2.592937856733464e-09, 2.747753238101915e+00}, + {-2.584572344046340e-09, 2.737384868096553e+00}, + {-2.576109523671634e-09, 2.726878145526201e+00}, + {-2.567549714229129e-09, 2.716233058492422e+00}, + {-2.558893237991435e-09, 2.705449601651722e+00}, + {-2.550140420869302e-09, 2.694527776227857e+00}, + {-2.541291592402089e-09, 2.683467590030445e+00}, + {-2.532347085742440e-09, 2.672269057466213e+00}, + {-2.523307237646751e-09, 2.660932199557362e+00}, + {-2.514172388459584e-09, 2.649457043952206e+00}, + {-2.504942882102813e-09, 2.637843624941622e+00}, + {-2.495619066062810e-09, 2.626091983472908e+00}, + {-2.486201291375123e-09, 2.614202167160335e+00}, + {-2.476689912614465e-09, 2.602174230302269e+00}, + {-2.467085287878098e-09, 2.590008233889805e+00}, + {-2.457387778775451e-09, 2.577704245623143e+00}, + {-2.447597750411553e-09, 2.565262339920002e+00}, + {-2.437715571376127e-09, 2.552682597931055e+00}, + {-2.427741613727123e-09, 
2.539965107548168e+00}, + {-2.417676252978335e-09, 2.527109963417675e+00}, + {-2.407519868085581e-09, 2.514117266951687e+00}, + {-2.397272841430131e-09, 2.500987126335739e+00}, + {-2.386935558807595e-09, 2.487719656543254e+00}, + {-2.376508409410024e-09, 2.474314979341178e+00}, + {-2.365991785814531e-09, 2.460773223303822e+00}, + {-2.355386083965131e-09, 2.447094523817833e+00}, + {-2.344691703161363e-09, 2.433279023095734e+00}, + {-2.333909046040126e-09, 2.419326870180582e+00}, + {-2.323038518562289e-09, 2.405238220956597e+00}, + {-2.312080529997549e-09, 2.391013238157397e+00}, + {-2.301035492907384e-09, 2.376652091371587e+00}, + {-2.289903823131822e-09, 2.362154957053137e+00}, + {-2.278685939771276e-09, 2.347522018525197e+00}, + {-2.267382265173420e-09, 2.332753465990296e+00}, + {-2.255993224914501e-09, 2.317849496533128e+00}, + {-2.244519247786155e-09, 2.302810314130351e+00}, + {-2.232960765776561e-09, 2.287636129652823e+00}, + {-2.221318214056095e-09, 2.272327160873552e+00}, + {-2.209592030960763e-09, 2.256883632472565e+00}, + {-2.197782657974034e-09, 2.241305776039511e+00}, + {-2.185890539712767e-09, 2.225593830081461e+00}, + {-2.173916123907886e-09, 2.209748040023618e+00}, + {-2.161859861389976e-09, 2.193768658216360e+00}, + {-2.149722206070124e-09, 2.177655943935795e+00}, + {-2.137503614923981e-09, 2.161410163388424e+00}, + {-2.125204547975352e-09, 2.145031589714984e+00}, + {-2.112825468276292e-09, 2.128520502989477e+00}, + {-2.100366841892917e-09, 2.111877190225612e+00}, + {-2.087829137884807e-09, 2.095101945374541e+00}, + {-2.075212828290086e-09, 2.078195069329960e+00}, + {-2.062518388104923e-09, 2.061156869925600e+00}, + {-2.049746295268559e-09, 2.043987661939897e+00}, + {-2.036897030642658e-09, 2.026687767092888e+00}, + {-2.023971077994576e-09, 2.009257514048162e+00}, + {-2.010968923979840e-09, 1.991697238413571e+00}, + {-1.997891058121344e-09, 1.974007282737320e+00}, + {-1.984737972794098e-09, 1.956187996511354e+00}, + {-1.971510163203686e-09, 1.938239736166060e+00}, + {-1.958208127370276e-09, 1.920162865072273e+00}, + {-1.944832366107339e-09, 1.901957753535934e+00}, + {-1.931383383005451e-09, 1.883624778799427e+00}, + {-1.917861684410531e-09, 1.865164325035177e+00}, + {-1.904267779407432e-09, 1.846576783346324e+00}, + {-1.890602179798714e-09, 1.827862551760622e+00}, + {-1.876865400086483e-09, 1.809022035228338e+00}, + {-1.863057957452539e-09, 1.790055645617624e+00}, + {-1.849180371740008e-09, 1.770963801711725e+00}, + {-1.835233165431475e-09, 1.751746929201178e+00}, + {-1.821216863631569e-09, 1.732405460681919e+00}, + {-1.807131994045840e-09, 1.712939835648088e+00}, + {-1.792979086962494e-09, 1.693350500488565e+00}, + {-1.778758675229683e-09, 1.673637908477153e+00}, + {-1.764471294238191e-09, 1.653802519770021e+00}, + {-1.750117481899733e-09, 1.633844801396848e+00}, + {-1.735697778626995e-09, 1.613765227254186e+00}, + {-1.721212727314574e-09, 1.593564278099856e+00}, + {-1.706662873315474e-09, 1.573242441540939e+00}, + {-1.692048764423848e-09, 1.552800212030258e+00}, + {-1.677370950852395e-09, 1.532238090855187e+00}, + {-1.662629985213192e-09, 1.511556586131055e+00}, + {-1.647826422494560e-09, 1.490756212788764e+00}, + {-1.632960820042537e-09, 1.469837492568651e+00}, + {-1.618033737538645e-09, 1.448800954008929e+00}, + {-1.603045736978760e-09, 1.427647132435469e+00}, + {-1.587997382653428e-09, 1.406376569953373e+00}, + {-1.572889241124034e-09, 1.384989815432507e+00}, + {-1.557721881203696e-09, 1.363487424499449e+00}, + {-1.542495873934815e-09, 1.341869959524515e+00}, + 
{-1.527211792568486e-09, 1.320137989611176e+00}, + {-1.511870212541253e-09, 1.298292090581491e+00}, + {-1.496471711454994e-09, 1.276332844965754e+00}, + {-1.481016869054634e-09, 1.254260841988828e+00}, + {-1.465506267206068e-09, 1.232076677556547e+00}, + {-1.449940489875303e-09, 1.209780954243628e+00}, + {-1.434320123104372e-09, 1.187374281276747e+00}, + {-1.418645754991533e-09, 1.164857274523495e+00}, + {-1.402917975667710e-09, 1.142230556475749e+00}, + {-1.387137377275425e-09, 1.119494756236361e+00}, + {-1.371304553944712e-09, 1.096650509501278e+00}, + {-1.355420101772623e-09, 1.073698458546610e+00}, + {-1.339484618799891e-09, 1.050639252211352e+00}, + {-1.323498704988051e-09, 1.027473545880543e+00}, + {-1.307462962198534e-09, 1.004202001471034e+00}, + {-1.291377994167204e-09, 9.808252874104182e-01}, + {-1.275244406484394e-09, 9.573440786237052e-01}, + {-1.259062806570190e-09, 9.337590565128454e-01}, + {-1.242833803653464e-09, 9.100709089414796e-01}, + {-1.226558008746195e-09, 8.862803302125812e-01}, + {-1.210236034623253e-09, 8.623880210538113e-01}, + {-1.193868495797618e-09, 8.383946885959868e-01}, + {-1.177456008497777e-09, 8.143010463544786e-01}, + {-1.160999190645010e-09, 7.901078142102129e-01}, + {-1.144498661828833e-09, 7.658157183877095e-01}, + {-1.127955043284965e-09, 7.414254914366063e-01}, + {-1.111368957870986e-09, 7.169378722095157e-01}, + {-1.094741030044308e-09, 6.923536058430697e-01}, + {-1.078071885836393e-09, 6.676734437331688e-01}, + {-1.061362152831423e-09, 6.428981435165511e-01}, + {-1.044612460141255e-09, 6.180284690466404e-01}, + {-1.027823438382183e-09, 5.930651903718045e-01}, + {-1.010995719652015e-09, 5.680090837138436e-01}, + {-9.941299375042378e-10, 5.428609314418970e-01}, + {-9.772267269262058e-10, 5.176215220520872e-01}, + {-9.602867243141016e-10, 4.922916501421032e-01}, + {-9.433105674499058e-10, 4.668721163885412e-01}, + {-9.262988954758817e-10, 4.413637275202624e-01}, + {-9.092523488719689e-10, 4.157672962958654e-01}, + {-8.921715694311144e-10, 3.900836414778084e-01}, + {-8.750572002347607e-10, 3.643135878065193e-01}, + {-8.579098856296589e-10, 3.384579659762392e-01}, + {-8.407302712022458e-10, 3.125176126069478e-01}, + {-8.235190037551917e-10, 2.864933702193017e-01}, + {-8.062767312831008e-10, 2.603860872080448e-01}, + {-7.890041029479477e-10, 2.341966178147619e-01}, + {-7.717017690542486e-10, 2.079258220999725e-01}, + {-7.543703810250266e-10, 1.815745659161734e-01}, + {-7.370105913774597e-10, 1.551437208801425e-01}, + {-7.196230536974697e-10, 1.286341643433767e-01}, + {-7.022084226165876e-10, 1.020467793657360e-01}, + {-6.847673537853251e-10, 7.538245468350446e-02}, + {-6.673005038502516e-10, 4.864208468284503e-02}, + {-6.498085304282128e-10, 2.182656936863137e-02}, + {-6.322920920826137e-10, -5.063185663820913e-03}, + {-6.147518482969490e-10, -3.202626926150343e-02}, + {-5.971884594516681e-10, -5.906176474160862e-02}, + {-5.796025867984469e-10, -8.616874992366363e-02}, + {-5.619948924353588e-10, -1.133462971605448e-01}, + {-5.443660392823640e-10, -1.405934733692621e-01}, + {-5.267166910556339e-10, -1.679093400638023e-01}, + {-5.090475122431451e-10, -1.952929533862739e-01}, + {-4.913591680795342e-10, -2.227433641394564e-01}, + {-4.736523245210571e-10, -2.502596178194491e-01}, + {-4.559276482202303e-10, -2.778407546490776e-01}, + {-4.381858065011618e-10, -3.054858096104932e-01}, + {-4.204274673340870e-10, -3.331938124792702e-01}, + {-4.026532993105397e-10, -3.609637878577768e-01}, + {-3.848639716178888e-10, -3.887947552098022e-01}, + 
{-3.670601540142443e-10, -4.166857288948674e-01}, + {-3.492425168032583e-10, -4.446357182029681e-01}, + {-3.314117308088734e-10, -4.726437273896633e-01}, + {-3.135684673501752e-10, -5.007087557112619e-01}, + {-2.957133982159296e-10, -5.288297974607742e-01}, + {-2.778471956393828e-10, -5.570058420037128e-01}, + {-2.599705322729564e-10, -5.852358738143247e-01}, + {-2.420840811628366e-10, -6.135188725122560e-01}, + {-2.241885157240923e-10, -6.418538128986450e-01}, + {-2.062845097142585e-10, -6.702396649949099e-01}, + {-1.883727372093546e-10, -6.986753940779493e-01}, + {-1.704538725773087e-10, -7.271599607197149e-01}, + {-1.525285904532877e-10, -7.556923208240308e-01}, + {-1.345975657140748e-10, -7.842714256651911e-01}, + {-1.166614734526054e-10, -8.128962219265712e-01}, + {-9.872098895260891e-11, -8.415656517393372e-01}, + {-8.077678766314517e-11, -8.702786527215916e-01}, + {-6.282954517324612e-11, -8.990341580176152e-01}, + {-4.487993718655790e-11, -9.278310963373758e-01}, + {-2.692863949561210e-11, -9.566683919968972e-01}, + {-8.976327956520795e-12, -9.855449649582175e-01}, + {8.976321536169872e-12, -1.014459730869357e+00}, + {2.692863307547294e-11, -1.043411601105914e+00}, + {4.487993076694813e-11, -1.072399482811314e+00}, + {6.282953875437751e-11, -1.101422278938424e+00}, + {8.077678124517653e-11, -1.130478888291020e+00}, + {9.872098253591082e-11, -1.159568205565684e+00}, + {1.166614670373367e-10, -1.188689121393192e+00}, + {1.345975593005002e-10, -1.217840522381901e+00}, + {1.525285840416718e-10, -1.247021291159495e+00}, + {1.704538661678104e-10, -1.276230306415868e+00}, + {1.883727308022916e-10, -1.305466442946703e+00}, + {2.062845033098954e-10, -1.334728571696106e+00}, + {2.241885093225349e-10, -1.364015559800721e+00}, + {2.420840747645085e-10, -1.393326270633325e+00}, + {2.599705258779635e-10, -1.422659563847049e+00}, + {2.778471892479898e-10, -1.452014295419243e+00}, + {2.957133918284542e-10, -1.481389317696831e+00}, + {3.135684609667761e-10, -1.510783479440191e+00}, + {3.314117244297624e-10, -1.540195625869043e+00}, + {3.492425104288060e-10, -1.569624598707558e+00}, + {3.670601476445565e-10, -1.599069236228850e+00}, + {3.848639652533361e-10, -1.628528373302631e+00}, + {4.026532929512281e-10, -1.658000841439269e+00}, + {4.204274609803869e-10, -1.687485468837799e+00}, + {4.381858001531792e-10, -1.716981080430596e+00}, + {4.559276418782829e-10, -1.746486497931567e+00}, + {4.736523181853565e-10, -1.776000539882225e+00}, + {4.913591617503452e-10, -1.805522021699094e+00}, + {5.090475059206794e-10, -1.835049755721194e+00}, + {5.267166847401562e-10, -1.864582551257262e+00}, + {5.443660329740862e-10, -1.894119214633676e+00}, + {5.619948861345454e-10, -1.923658549242818e+00}, + {5.796025805053097e-10, -1.953199355591180e+00}, + {5.971884531664190e-10, -1.982740431347091e+00}, + {6.147518420199055e-10, -2.012280571390674e+00}, + {6.322920858139346e-10, -2.041818567861395e+00}, + {6.498085241682158e-10, -2.071353210208005e+00}, + {6.673004975990425e-10, -2.100883285238127e+00}, + {6.847673475432746e-10, -2.130407577166309e+00}, + {7.022084163838545e-10, -2.159924867664933e+00}, + {7.196230474743716e-10, -2.189433935913779e+00}, + {7.370105851640495e-10, -2.218933558650552e+00}, + {7.543703748217808e-10, -2.248422510220072e+00}, + {7.717017628611672e-10, -2.277899562625407e+00}, + {7.890040967654542e-10, -2.307363485579104e+00}, + {8.062767251113011e-10, -2.336813046552684e+00}, + {8.235189975944034e-10, -2.366247010829556e+00}, + {8.407302650525749e-10, -2.395664141553858e+00}, + 
{8.579098794915287e-10, -2.425063199784153e+00}, + {8.750571941082773e-10, -2.454442944543319e+00}, + {8.921715633164894e-10, -2.483802132872044e+00}, + {9.092523427695200e-10, -2.513139519878584e+00}, + {9.262988893857148e-10, -2.542453858792682e+00}, + {9.433105613723914e-10, -2.571743901017465e+00}, + {9.602867182493987e-10, -2.601008396180870e+00}, + {9.772267208744730e-10, -2.630246092190425e+00}, + {9.941299314658458e-10, -2.659455735283526e+00}, + {1.010995713627070e-09, -2.688636070081818e+00}, + {1.027823432371055e-09, -2.717785839644439e+00}, + {1.044612454143997e-09, -2.746903785521352e+00}, + {1.061362146848353e-09, -2.775988647805256e+00}, + {1.078071879867828e-09, -2.805039165187255e+00}, + {1.094741024090249e-09, -2.834054075009077e+00}, + {1.111368951931856e-09, -2.863032113318052e+00}, + {1.127955037360817e-09, -2.891972014920939e+00}, + {1.144498655920037e-09, -2.920872513436805e+00}, + {1.160999184751779e-09, -2.949732341353290e+00}, + {1.177456002620215e-09, -2.978550230079517e+00}, + {1.193868489936097e-09, -3.007324910002949e+00}, + {1.210236028777826e-09, -3.036055110540183e+00}, + {1.226558002917232e-09, -3.064739560196251e+00}, + {1.242833797841123e-09, -3.093376986616735e+00}, + {1.259062800774685e-09, -3.121966116643377e+00}, + {1.275244400705935e-09, -3.150505676371791e+00}, + {1.291377988406056e-09, -3.178994391202159e+00}, + {1.307462956454857e-09, -3.207430985899192e+00}, + {1.323498699262108e-09, -3.235814184645077e+00}, + {1.339484613091842e-09, -3.264142711097884e+00}, + {1.355420096082785e-09, -3.292415288443373e+00}, + {1.371304548273191e-09, -3.320630639454825e+00}, + {1.387137371622433e-09, -3.348787486547389e+00}, + {1.402917970033511e-09, -3.376884551834256e+00}, + {1.418645749376393e-09, -3.404920557184582e+00}, + {1.434320117508396e-09, -3.432894224276359e+00}, + {1.449940484298756e-09, -3.460804274656981e+00}, + {1.465506261649108e-09, -3.488649429796768e+00}, + {1.481016863517580e-09, -3.516428411149154e+00}, + {1.496471705937951e-09, -3.544139940202303e+00}, + {1.511870207044433e-09, -3.571782738540999e+00}, + {1.527211787092206e-09, -3.599355527901174e+00}, + {1.542495868479076e-09, -3.626857030226671e+00}, + {1.557721875768920e-09, -3.654285967729458e+00}, + {1.572889235710329e-09, -3.681641062941412e+00}, + {1.587997377261005e-09, -3.708921038776707e+00}, + {1.603045731607830e-09, -3.736124618586623e+00}, + {1.618033732189314e-09, -3.763250526218862e+00}, + {1.632960814715177e-09, -3.790297486071938e+00}, + {1.647826417189275e-09, -3.817264223155802e+00}, + {1.662629979930247e-09, -3.844149463148589e+00}, + {1.677370945591844e-09, -3.870951932452996e+00}, + {1.692048759186008e-09, -3.897670358257890e+00}, + {1.706662868100504e-09, -3.924303468590212e+00}, + {1.721212722122685e-09, -3.950849992378278e+00}, + {1.735697773458400e-09, -3.977308659506432e+00}, + {1.750117476754591e-09, -4.003678200876669e+00}, + {1.764471289116712e-09, -4.029957348461003e+00}, + {1.778758670132079e-09, -4.056144835364877e+00}, + {1.792979081888926e-09, -4.082239395882965e+00}, + {1.807131988996465e-09, -4.108239765556996e+00}, + {1.821216858606652e-09, -4.134144681236933e+00}, + {1.835233160431175e-09, -4.159952881133585e+00}, + {1.849180366764537e-09, -4.185663104882633e+00}, + {1.863057952502055e-09, -4.211274093599509e+00}, + {1.876865395161145e-09, -4.236784589940537e+00}, + {1.890602174898734e-09, -4.262193338157148e+00}, + {1.904267774533022e-09, -4.287499084158302e+00}, + {1.917861679562008e-09, -4.312700575567174e+00}, + {1.931383378182392e-09, 
-4.337796561778708e+00}, + {1.944832361310856e-09, -4.362785794021793e+00}, + {1.958208122599839e-09, -4.387667025411434e+00}, + {1.971510158459931e-09, -4.412439011013396e+00}, + {1.984737968076495e-09, -4.437100507898339e+00}, + {1.997891053431005e-09, -4.461650275204912e+00}, + {2.010968919316289e-09, -4.486087074191693e+00}, + {2.023971073358447e-09, -4.510409668301784e+00}, + {2.036897026033634e-09, -4.534616823217992e+00}, + {2.049746290686799e-09, -4.558707306921882e+00}, + {2.062518383551274e-09, -4.582679889754607e+00}, + {2.075212823764071e-09, -4.606533344469879e+00}, + {2.087829133387063e-09, -4.630266446298172e+00}, + {2.100366837422912e-09, -4.653877973001258e+00}, + {2.112825463835087e-09, -4.677366704934605e+00}, + {2.125204543562522e-09, -4.700731425099899e+00}, + {2.137503610540056e-09, -4.723970919208608e+00}, + {2.149722201714786e-09, -4.747083975738060e+00}, + {2.161859857063438e-09, -4.770069385989595e+00}, + {2.173916119610994e-09, -4.792925944149308e+00}, + {2.185890535445098e-09, -4.815652447340950e+00}, + {2.197782653735957e-09, -4.838247695689436e+00}, + {2.209592026751962e-09, -4.860710492376411e+00}, + {2.221318209877576e-09, -4.883039643700314e+00}, + {2.232960761627846e-09, -4.905233959130168e+00}, + {2.244519243667616e-09, -4.927292251368517e+00}, + {2.255993220826402e-09, -4.949213336406265e+00}, + {2.267382261115285e-09, -4.970996033581527e+00}, + {2.278685935744269e-09, -4.992639165639563e+00}, + {2.289903819135414e-09, -5.014141558784778e+00}, + {2.301035488942000e-09, -5.035502042744443e+00}, + {2.312080526062763e-09, -5.056719450823151e+00}, + {2.323038514659161e-09, -5.077792619963239e+00}, + {2.333909042168180e-09, -5.098720390796817e+00}, + {2.344691699320969e-09, -5.119501607709159e+00}, + {2.355386080156553e-09, -5.140135118892792e+00}, + {2.365991782037187e-09, -5.160619776404897e+00}, + {2.376508405665132e-09, -5.180954436227641e+00}, + {2.386935555094626e-09, -5.201137958319343e+00}, + {2.397272837749508e-09, -5.221169206676762e+00}, + {2.407519864436774e-09, -5.241047049389645e+00}, + {2.417676249362563e-09, -5.260770358700167e+00}, + {2.427741610143750e-09, -5.280338011053974e+00}, + {2.437715567825576e-09, -5.299748887163106e+00}, + {2.447597746894037e-09, -5.319001872058887e+00}, + {2.457387775290440e-09, -5.338095855149190e+00}, + {2.467085284426756e-09, -5.357029730277389e+00}, + {2.476689909196263e-09, -5.375802395772283e+00}, + {2.486201287990485e-09, -5.394412754510426e+00}, + {2.495619062711154e-09, -5.412859713968929e+00}, + {2.504942878785408e-09, -5.431142186284682e+00}, + {2.514172385175743e-09, -5.449259088303476e+00}, + {2.523307234396791e-09, -5.467209341642627e+00}, + {2.532347082526785e-09, -5.484991872743321e+00}, + {2.541291589219998e-09, -5.502605612925014e+00}, + {2.550140417722072e-09, -5.520049498445633e+00}, + {2.558893234878378e-09, -5.537322470548212e+00}, + {2.567549711150773e-09, -5.554423475524196e+00}, + {2.576109520627371e-09, -5.571351464763084e+00}, + {2.584572341037361e-09, -5.588105394812198e+00}, + {2.592937853759161e-09, -5.604684227423386e+00}, + {2.601205743836355e-09, -5.621086929615246e+00}, + {2.609375699987564e-09, -5.637312473723475e+00}, + {2.617447414618146e-09, -5.653359837454964e+00}, + {2.625420583833750e-09, -5.669228003945694e+00}, + {2.633294907447937e-09, -5.684915961806963e+00}, + {2.641070088997271e-09, -5.700422705186584e+00}, + {2.648745835750128e-09, -5.715747233817712e+00}, + {2.656321858720176e-09, -5.730888553077074e+00}, + {2.663797872673252e-09, -5.745845674030161e+00}, + 
{2.671173596142054e-09, -5.760617613492118e+00}, + {2.678448751434797e-09, -5.775203394076705e+00}, + {2.685623064645538e-09, -5.789602044248679e+00}, + {2.692696265666640e-09, -5.803812598380606e+00}, + {2.699668088194915e-09, -5.817834096797069e+00}, + {2.706538269745573e-09, -5.831665585834668e+00}, + {2.713306551659817e-09, -5.845306117889361e+00}, + {2.719972679116734e-09, -5.858754751472542e+00}, + {2.726536401139295e-09, -5.872010551255358e+00}, + {2.732997470607439e-09, -5.885072588127400e+00}, + {2.739355644265558e-09, -5.897939939244211e+00}, + {2.745610682731633e-09, -5.910611688078208e+00}, + {2.751762350508137e-09, -5.923086924473290e+00}, + {2.757810415987146e-09, -5.935364744687794e+00}, + {2.763754651462700e-09, -5.947444251452243e+00}, + {2.769594833137415e-09, -5.959324554015538e+00}, + {2.775330741132843e-09, -5.971004768198829e+00}, + {2.780962159494174e-09, -5.982484016437981e+00}, + {2.786488876202047e-09, -5.993761427840588e+00}, + {2.791910683178690e-09, -6.004836138231525e+00}, + {2.797227376295779e-09, -6.015707290202086e+00}, + {2.802438755383971e-09, -6.026374033162623e+00}, + {2.807544624236659e-09, -6.036835523383457e+00}, + {2.812544790621093e-09, -6.047090924050914e+00}, + {2.817439066283459e-09, -6.057139405311101e+00}, + {2.822227266958278e-09, -6.066980144322601e+00}, + {2.826909212371261e-09, -6.076612325295799e+00}, + {2.831484726250221e-09, -6.086035139548830e+00}, + {2.835953636329660e-09, -6.095247785550617e+00}, + {2.840315774357203e-09, -6.104249468967751e+00}, + {2.844570976102082e-09, -6.113039402715685e+00}, + {2.848719081357095e-09, -6.121616806996519e+00}, + {2.852759933948860e-09, -6.129980909353977e+00}, + {2.856693381741114e-09, -6.138130944714082e+00}, + {2.860519276643053e-09, -6.146066155436312e+00}, + {2.864237474610633e-09, -6.153785791350256e+00}, + {2.867847835656203e-09, -6.161289109809551e+00}, + {2.871350223851726e-09, -6.168575375732642e+00}, + {2.874744507333867e-09, -6.175643861647406e+00}, + {2.878030558310989e-09, -6.182493847739853e+00}, + {2.881208253063899e-09, -6.189124621889823e+00}, + {2.884277471954592e-09, -6.195535479723423e+00}, + {2.887238099428306e-09, -6.201725724651554e+00}, + {2.890090024020323e-09, -6.207694667918394e+00}, + {2.892833138356060e-09, -6.213441628635915e+00}, + {2.895467339159240e-09, -6.218965933835304e+00}, + {2.897992527253659e-09, -6.224266918505075e+00}, + {2.900408607567016e-09, -6.229343925633495e+00}, + {2.902715489136496e-09, -6.234196306254763e+00}, + {2.904913085108075e-09, -6.238823419482017e+00}, + {2.907001312743911e-09, -6.243224632557377e+00}, + {2.908980093422997e-09, -6.247399320887848e+00}, + {2.910849352646620e-09, -6.251346868091392e+00}, + {2.912609020036956e-09, -6.255066666028537e+00}, + {2.914259029343965e-09, -6.258558114851525e+00}, + {2.915799318445710e-09, -6.261820623039620e+00}, + {2.917229829350759e-09, -6.264853607438842e+00}, + {2.918550508202463e-09, -6.267656493305673e+00}, + {2.919761305276718e-09, -6.270228714337005e+00}, + {2.920862174988150e-09, -6.272569712717951e+00}, + {2.921853075889193e-09, -6.274678939154603e+00}, + {2.922733970674264e-09, -6.276555852917634e+00}, + {2.923504826176907e-09, -6.278199921870962e+00}, + {2.924165613375264e-09, -6.279610622518139e+00}, + {2.924716307391075e-09, -6.280787440034993e+00}, + {2.925156887490598e-09, -6.281729868306345e+00}, + {2.925487337087508e-09, -6.282437409966992e+00}, + {2.925707643739298e-09, -6.282909576428774e+00}, + {2.925817799151970e-09, -6.283145887925411e+00}, }; #endif diff --git 
a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_sse3_intrinsics.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_sse3_intrinsics.h index f48e84aa1..6f5b25673 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_sse3_intrinsics.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_sse3_intrinsics.h @@ -30,33 +30,35 @@ static inline __m128 _mm_complexmul_ps(__m128 x, __m128 y) { - __m128 yl, yh, tmp1, tmp2; - yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - return _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + __m128 yl, yh, tmp1, tmp2; + yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + return _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di } static inline __m128 _mm_complexconjugatemul_ps(__m128 x, __m128 y) { - const __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f); - y = _mm_xor_ps(y, conjugator); // conjugate y - return _mm_complexmul_ps(x, y); + const __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f); + y = _mm_xor_ps(y, conjugator); // conjugate y + return _mm_complexmul_ps(x, y); } static inline __m128 -_mm_magnitudesquared_ps_sse3(__m128 cplxValue1, __m128 cplxValue2){ - cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values - cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values - return _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values +_mm_magnitudesquared_ps_sse3(__m128 cplxValue1, __m128 cplxValue2) +{ + cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values + cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values + return _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values } static inline __m128 -_mm_magnitude_ps_sse3(__m128 cplxValue1, __m128 cplxValue2){ - return _mm_sqrt_ps(_mm_magnitudesquared_ps_sse3(cplxValue1, cplxValue2)); +_mm_magnitude_ps_sse3(__m128 cplxValue1, __m128 cplxValue2) +{ + return _mm_sqrt_ps(_mm_magnitudesquared_ps_sse3(cplxValue1, cplxValue2)); } #endif /* INCLUDE_VOLK_VOLK_SSE3_INTRINSICS_H_ */ diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_sse_intrinsics.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_sse_intrinsics.h index 6136efba3..9de170708 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_sse_intrinsics.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_sse_intrinsics.h @@ -27,20 +27,22 @@ #include static inline __m128 -_mm_magnitudesquared_ps(__m128 cplxValue1, __m128 cplxValue2){ - __m128 iValue, qValue; - // Arrange in i1i2i3i4 format - iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0)); - // Arrange in q1q2q3q4 format - qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1)); - iValue = _mm_mul_ps(iValue, 
iValue); // Square the I values - qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values - return _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values +_mm_magnitudesquared_ps(__m128 cplxValue1, __m128 cplxValue2) +{ + __m128 iValue, qValue; + // Arrange in i1i2i3i4 format + iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0)); + // Arrange in q1q2q3q4 format + qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1)); + iValue = _mm_mul_ps(iValue, iValue); // Square the I values + qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values + return _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values } static inline __m128 -_mm_magnitude_ps(__m128 cplxValue1, __m128 cplxValue2){ - return _mm_sqrt_ps(_mm_magnitudesquared_ps(cplxValue1, cplxValue2)); +_mm_magnitude_ps(__m128 cplxValue1, __m128 cplxValue2) +{ + return _mm_sqrt_ps(_mm_magnitudesquared_ps(cplxValue1, cplxValue2)); } #endif /* INCLUDED_VOLK_VOLK_SSE_INTRINSICS_H_ */ diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16i_resamplerxnpuppet_16i.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16i_resamplerxnpuppet_16i.h index 3c1c0f817..ffce85d32 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16i_resamplerxnpuppet_16i.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16i_resamplerxnpuppet_16i.h @@ -45,55 +45,55 @@ static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_generic(int16_t* result, const int16_t* local_code, unsigned int num_points) { int code_length_chips = 2046; - float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points); int num_out_vectors = 3; unsigned int n; float rem_code_phase_chips = -0.234; - float shifts_chips[3] = { -0.1, 0.0, 0.1 }; - int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); + float shifts_chips[3] = {-0.1, 0.0, 0.1}; + int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_out_vectors; n++) - { - result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); - } + for (n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); + } volk_gnsssdr_16i_xn_resampler_16i_xn_generic(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); memcpy((int16_t*)result, (int16_t*)result_aux[0], sizeof(int16_t) * num_points); - for(n = 0; n < num_out_vectors; n++) - { - volk_gnsssdr_free(result_aux[n]); - } + for (n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } volk_gnsssdr_free(result_aux); } #endif /* LV_HAVE_GENERIC */ - + #ifdef LV_HAVE_SSE3 static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_a_sse3(int16_t* result, const int16_t* local_code, unsigned int num_points) { int code_length_chips = 2046; - float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; - float 
shifts_chips[3] = { -0.1, 0.0, 0.1 }; - int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); + float shifts_chips[3] = {-0.1, 0.0, 0.1}; + int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_out_vectors; n++) - { - result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); - } + for (n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); + } volk_gnsssdr_16i_xn_resampler_16i_xn_a_sse3(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); memcpy((int16_t*)result, (int16_t*)result_aux[0], sizeof(int16_t) * num_points); - for(n = 0; n < num_out_vectors; n++) - { - volk_gnsssdr_free(result_aux[n]); - } + for (n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } volk_gnsssdr_free(result_aux); } @@ -103,26 +103,26 @@ static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_a_sse3(int16_t* result static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_u_sse3(int16_t* result, const int16_t* local_code, unsigned int num_points) { int code_length_chips = 2046; - float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; - float shifts_chips[3] = { -0.1, 0.0, 0.1 }; - int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); + float shifts_chips[3] = {-0.1, 0.0, 0.1}; + int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_out_vectors; n++) - { - result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); - } + for (n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); + } volk_gnsssdr_16i_xn_resampler_16i_xn_u_sse3(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); memcpy((int16_t*)result, (int16_t*)result_aux[0], sizeof(int16_t) * num_points); - for(n = 0; n < num_out_vectors; n++) - { - volk_gnsssdr_free(result_aux[n]); - } + for (n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } volk_gnsssdr_free(result_aux); } @@ -133,26 +133,26 @@ static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_u_sse3(int16_t* result static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_u_sse4_1(int16_t* result, const int16_t* local_code, unsigned int num_points) { int code_length_chips = 2046; - float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; - float shifts_chips[3] = { -0.1, 0.0, 0.1 }; - int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); + float shifts_chips[3] = {-0.1, 0.0, 0.1}; + int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * 
num_out_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_out_vectors; n++) - { - result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); - } + for (n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); + } volk_gnsssdr_16i_xn_resampler_16i_xn_u_sse4_1(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); memcpy((int16_t*)result, (int16_t*)result_aux[0], sizeof(int16_t) * num_points); - for(n = 0; n < num_out_vectors; n++) - { - volk_gnsssdr_free(result_aux[n]); - } + for (n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } volk_gnsssdr_free(result_aux); } @@ -163,26 +163,26 @@ static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_u_sse4_1(int16_t* resu static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_a_sse4_1(int16_t* result, const int16_t* local_code, unsigned int num_points) { int code_length_chips = 2046; - float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; - float shifts_chips[3] = { -0.1, 0.0, 0.1 }; - int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); + float shifts_chips[3] = {-0.1, 0.0, 0.1}; + int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_out_vectors; n++) - { - result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); - } + for (n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); + } volk_gnsssdr_16i_xn_resampler_16i_xn_a_sse4_1(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); memcpy((int16_t*)result, (int16_t*)result_aux[0], sizeof(int16_t) * num_points); - for(n = 0; n < num_out_vectors; n++) - { - volk_gnsssdr_free(result_aux[n]); - } + for (n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } volk_gnsssdr_free(result_aux); } @@ -193,26 +193,26 @@ static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_a_sse4_1(int16_t* resu static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_u_avx(int16_t* result, const int16_t* local_code, unsigned int num_points) { int code_length_chips = 2046; - float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; - float shifts_chips[3] = { -0.1, 0.0, 0.1 }; - int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); + float shifts_chips[3] = {-0.1, 0.0, 0.1}; + int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_out_vectors; n++) - { - result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); - } + for (n = 0; n < num_out_vectors; n++) + { + result_aux[n] = 
(int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); + } volk_gnsssdr_16i_xn_resampler_16i_xn_u_avx(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); memcpy((int16_t*)result, (int16_t*)result_aux[0], sizeof(int16_t) * num_points); - for(n = 0; n < num_out_vectors; n++) - { - volk_gnsssdr_free(result_aux[n]); - } + for (n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } volk_gnsssdr_free(result_aux); } @@ -223,26 +223,26 @@ static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_u_avx(int16_t* result, static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_a_avx(int16_t* result, const int16_t* local_code, unsigned int num_points) { int code_length_chips = 2046; - float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; - float shifts_chips[3] = { -0.1, 0.0, 0.1 }; - int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); + float shifts_chips[3] = {-0.1, 0.0, 0.1}; + int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_out_vectors; n++) - { - result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); - } + for (n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); + } volk_gnsssdr_16i_xn_resampler_16i_xn_a_avx(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); memcpy((int16_t*)result, (int16_t*)result_aux[0], sizeof(int16_t) * num_points); - for(n = 0; n < num_out_vectors; n++) - { - volk_gnsssdr_free(result_aux[n]); - } + for (n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } volk_gnsssdr_free(result_aux); } @@ -253,30 +253,29 @@ static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_a_avx(int16_t* result, static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_neon(int16_t* result, const int16_t* local_code, unsigned int num_points) { int code_length_chips = 2046; - float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; - float shifts_chips[3] = { -0.1, 0.0, 0.1 }; - int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); + float shifts_chips[3] = {-0.1, 0.0, 0.1}; + int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_out_vectors; n++) - { - result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); - } + for (n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); + } volk_gnsssdr_16i_xn_resampler_16i_xn_neon(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); 
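The _resamplerxnpuppet_ variants reformatted in this file all follow the harness shape shown here: allocate one aligned buffer per output vector, run the multi-output *_xn kernel, keep only the first vector, and free the rest, so that the single-output QA comparison can drive a multi-output kernel. A minimal sketch of that adapter pattern, where my_xn_kernel is a hypothetical stand-in for any of the real *_xn implementations:

#include <stdint.h>
#include <string.h>
#include <volk_gnsssdr/volk_gnsssdr.h>        /* volk_gnsssdr_get_alignment */
#include <volk_gnsssdr/volk_gnsssdr_malloc.h> /* volk_gnsssdr_malloc/free   */

static void puppet_harness_sketch(int16_t* result, const int16_t* local_code, unsigned int num_points)
{
    int num_out_vectors = 3;
    unsigned int n;
    int16_t** out = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
    for (n = 0; n < (unsigned int)num_out_vectors; n++)
        {
            out[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
        }
    my_xn_kernel(out, local_code, num_out_vectors, num_points); /* hypothetical multi-output kernel */
    memcpy(result, out[0], sizeof(int16_t) * num_points);       /* only the first vector is compared */
    for (n = 0; n < (unsigned int)num_out_vectors; n++)
        {
            volk_gnsssdr_free(out[n]);
        }
    volk_gnsssdr_free(out);
}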
memcpy((int16_t*)result, (int16_t*)result_aux[0], sizeof(int16_t) * num_points); - for(n = 0; n < num_out_vectors; n++) - { - volk_gnsssdr_free(result_aux[n]); - } + for (n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } volk_gnsssdr_free(result_aux); } #endif -#endif // INCLUDED_volk_gnsssdr_16i_resamplerpuppet_16i_H - +#endif // INCLUDED_volk_gnsssdr_16i_resamplerpuppet_16i_H diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16i_xn_resampler_16i_xn.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16i_xn_resampler_16i_xn.h index 0d09df273..3628ccf8c 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16i_xn_resampler_16i_xn.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16i_xn_resampler_16i_xn.h @@ -107,7 +107,8 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_a_sse4_1(int16_t** resul const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips); const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips); - __VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; + __VOLK_ATTR_ALIGNED(16) + int local_code_chip_index[4]; int local_code_chip_index_; const __m128i zeros = _mm_setzero_si128(); @@ -121,7 +122,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_a_sse4_1(int16_t** resul shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]); aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); __m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f); - for(n = 0; n < quarterPoints; n++) + for (n = 0; n < quarterPoints; n++) { aux = _mm_mul_ps(code_phase_step_chips_reg, indexn); aux = _mm_add_ps(aux, aux2); @@ -139,13 +140,13 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_a_sse4_1(int16_t** resul aux_i = _mm_and_si128(code_length_chips_reg_i, negatives); local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i); _mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg); - for(k = 0; k < 4; ++k) + for (k = 0; k < 4; ++k) { _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]]; } indexn = _mm_add_ps(indexn, fours); } - for(n = quarterPoints * 4; n < num_points; n++) + for (n = quarterPoints * 4; n < num_points; n++) { // resample code for current tap local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); @@ -157,7 +158,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_a_sse4_1(int16_t** resul } } -#endif +#endif #ifdef LV_HAVE_SSE4_1 @@ -173,7 +174,8 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_u_sse4_1(int16_t** resul const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips); const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips); - __VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; + __VOLK_ATTR_ALIGNED(16) + int local_code_chip_index[4]; int local_code_chip_index_; const __m128i zeros = _mm_setzero_si128(); @@ -187,7 +189,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_u_sse4_1(int16_t** resul shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]); aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); __m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f); - for(n = 0; n < quarterPoints; n++) + for (n = 0; n < quarterPoints; n++) { aux = 
_mm_mul_ps(code_phase_step_chips_reg, indexn); aux = _mm_add_ps(aux, aux2); @@ -205,13 +207,13 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_u_sse4_1(int16_t** resul aux_i = _mm_and_si128(code_length_chips_reg_i, negatives); local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i); _mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg); - for(k = 0; k < 4; ++k) + for (k = 0; k < 4; ++k) { _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]]; } indexn = _mm_add_ps(indexn, fours); } - for(n = quarterPoints * 4; n < num_points; n++) + for (n = quarterPoints * 4; n < num_points; n++) { // resample code for current tap local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); @@ -240,7 +242,8 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_a_sse3(int16_t** result, const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips); const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips); - __VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; + __VOLK_ATTR_ALIGNED(16) + int local_code_chip_index[4]; int local_code_chip_index_; const __m128i zeros = _mm_setzero_si128(); @@ -254,7 +257,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_a_sse3(int16_t** result, shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]); aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); __m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f); - for(n = 0; n < quarterPoints; n++) + for (n = 0; n < quarterPoints; n++) { aux = _mm_mul_ps(code_phase_step_chips_reg, indexn); aux = _mm_add_ps(aux, aux2); @@ -275,13 +278,13 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_a_sse3(int16_t** result, aux_i = _mm_and_si128(code_length_chips_reg_i, negatives); local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i); _mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg); - for(k = 0; k < 4; ++k) + for (k = 0; k < 4; ++k) { _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]]; } indexn = _mm_add_ps(indexn, fours); } - for(n = quarterPoints * 4; n < num_points; n++) + for (n = quarterPoints * 4; n < num_points; n++) { // resample code for current tap local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); @@ -310,7 +313,8 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_u_sse3(int16_t** result, const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips); const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips); - __VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; + __VOLK_ATTR_ALIGNED(16) + int local_code_chip_index[4]; int local_code_chip_index_; const __m128i zeros = _mm_setzero_si128(); @@ -324,7 +328,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_u_sse3(int16_t** result, shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]); aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); __m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f); - for(n = 0; n < quarterPoints; n++) + for (n = 0; n < quarterPoints; n++) { aux = _mm_mul_ps(code_phase_step_chips_reg, indexn); aux = _mm_add_ps(aux, aux2); @@ -345,13 +349,13 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_u_sse3(int16_t** result, aux_i = _mm_and_si128(code_length_chips_reg_i, negatives); 
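The negatives mask built at this point drives a branch-free wrap: lanes whose floored index came out below zero get code_length_chips ANDed into aux_i and then added back, keeping every index inside the local code. The scalar tail loops of these same kernels do the equivalent with plain control flow; a standalone sketch of that correction (illustrative, not lifted verbatim from the patch):

#include <math.h>

/* Scalar equivalent of the compare-and-mask wrap in the SIMD paths. */
static int resampled_chip_index(float code_phase_step_chips, unsigned int n,
    float shift_chips, float rem_code_phase_chips, int code_length_chips)
{
    int idx = (int)floor(code_phase_step_chips * (float)n + shift_chips - rem_code_phase_chips);
    idx = idx % code_length_chips; /* C's % truncates toward zero...      */
    if (idx < 0)
        {
            idx += code_length_chips; /* ...so negative phases need one wrap */
        }
    return idx;
}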
local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i); _mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg); - for(k = 0; k < 4; ++k) + for (k = 0; k < 4; ++k) { _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]]; } indexn = _mm_add_ps(indexn, fours); } - for(n = quarterPoints * 4; n < num_points; n++) + for (n = quarterPoints * 4; n < num_points; n++) { // resample code for current tap local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); @@ -379,7 +383,8 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_a_avx(int16_t** result, const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips); const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips); - __VOLK_ATTR_ALIGNED(32) int local_code_chip_index[8]; + __VOLK_ATTR_ALIGNED(32) + int local_code_chip_index[8]; int local_code_chip_index_; const __m256 zeros = _mm256_setzero_ps(); @@ -394,7 +399,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_a_avx(int16_t** result, shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]); aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); indexn = n0; - for(n = 0; n < avx_iters; n++) + for (n = 0; n < avx_iters; n++) { __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0); __VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3); @@ -412,13 +417,13 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_a_avx(int16_t** result, // no negatives c = _mm256_cvtepi32_ps(local_code_chip_index_reg); - negatives = _mm256_cmp_ps(c, zeros, 0x01 ); + negatives = _mm256_cmp_ps(c, zeros, 0x01); aux3 = _mm256_and_ps(code_length_chips_reg_f, negatives); aux = _mm256_add_ps(c, aux3); local_code_chip_index_reg = _mm256_cvttps_epi32(aux); _mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg); - for(k = 0; k < 8; ++k) + for (k = 0; k < 8; ++k) { _result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]]; } @@ -428,7 +433,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_a_avx(int16_t** result, _mm256_zeroupper(); for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++) { - for(n = avx_iters * 8; n < num_points; n++) + for (n = avx_iters * 8; n < num_points; n++) { // resample code for current tap local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); @@ -456,7 +461,8 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_u_avx(int16_t** result, const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips); const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips); - __VOLK_ATTR_ALIGNED(32) int local_code_chip_index[8]; + __VOLK_ATTR_ALIGNED(32) + int local_code_chip_index[8]; int local_code_chip_index_; const __m256 zeros = _mm256_setzero_ps(); @@ -471,7 +477,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_u_avx(int16_t** result, shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]); aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); indexn = n0; - for(n = 0; n < avx_iters; n++) + for (n = 0; n < avx_iters; n++) { __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0); __VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3); @@ 
-489,13 +495,13 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_u_avx(int16_t** result, // no negatives c = _mm256_cvtepi32_ps(local_code_chip_index_reg); - negatives = _mm256_cmp_ps(c, zeros, 0x01 ); + negatives = _mm256_cmp_ps(c, zeros, 0x01); aux3 = _mm256_and_ps(code_length_chips_reg_f, negatives); aux = _mm256_add_ps(c, aux3); local_code_chip_index_reg = _mm256_cvttps_epi32(aux); _mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg); - for(k = 0; k < 8; ++k) + for (k = 0; k < 8; ++k) { _result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]]; } @@ -505,7 +511,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_u_avx(int16_t** result, _mm256_zeroupper(); for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++) { - for(n = avx_iters * 8; n < num_points; n++) + for (n = avx_iters * 8; n < num_points; n++) { // resample code for current tap local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); @@ -531,7 +537,8 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_neon(int16_t** result, c const float32x4_t rem_code_phase_chips_reg = vdupq_n_f32(rem_code_phase_chips); const float32x4_t code_phase_step_chips_reg = vdupq_n_f32(code_phase_step_chips); - __VOLK_ATTR_ALIGNED(16) int32_t local_code_chip_index[4]; + __VOLK_ATTR_ALIGNED(16) + int32_t local_code_chip_index[4]; int32_t local_code_chip_index_; const int32x4_t zeros = vdupq_n_s32(0); @@ -539,11 +546,12 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_neon(int16_t** result, c const int32x4_t code_length_chips_reg_i = vdupq_n_s32((int32_t)code_length_chips); int32x4_t local_code_chip_index_reg, aux_i, negatives, i; float32x4_t aux, aux2, shifts_chips_reg, fi, c, j, cTrunc, base, indexn, reciprocal; - __VOLK_ATTR_ALIGNED(16) const float vec[4] = { 0.0f, 1.0f, 2.0f, 3.0f }; + __VOLK_ATTR_ALIGNED(16) + const float vec[4] = {0.0f, 1.0f, 2.0f, 3.0f}; uint32x4_t igx; reciprocal = vrecpeq_f32(code_length_chips_reg_f); reciprocal = vmulq_f32(vrecpsq_f32(code_length_chips_reg_f, reciprocal), reciprocal); - reciprocal = vmulq_f32(vrecpsq_f32(code_length_chips_reg_f, reciprocal), reciprocal); // this refinement is required! + reciprocal = vmulq_f32(vrecpsq_f32(code_length_chips_reg_f, reciprocal), reciprocal); // this refinement is required! 
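The NEON variant just below replaces the division inside fmod with multiplication by a reciprocal. vrecpeq_f32 alone yields only a coarse (roughly 8-bit) estimate, hence the two vrecpsq_f32 Newton-Raphson steps and the "this refinement is required!" comment. A standalone sketch of that refinement, assuming only arm_neon.h; the helper name is illustrative:

    #include <arm_neon.h>

    /* Two Newton-Raphson refinements of the NEON reciprocal estimate.
       vrecpsq_f32(d, x) computes (2 - d*x); multiplying the result by x
       roughly doubles the number of correct mantissa bits per step. */
    static inline float32x4_t refined_reciprocal_f32(float32x4_t d)
    {
        float32x4_t x = vrecpeq_f32(d);       /* coarse initial estimate   */
        x = vmulq_f32(vrecpsq_f32(d, x), x);  /* first refinement          */
        x = vmulq_f32(vrecpsq_f32(d, x), x);  /* second: needed for fmod   */
        return x;
    }
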
float32x4_t n0 = vld1q_f32((float*)vec); int current_correlator_tap; unsigned int n; @@ -553,7 +561,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_neon(int16_t** result, c shifts_chips_reg = vdupq_n_f32((float)shifts_chips[current_correlator_tap]); aux2 = vsubq_f32(shifts_chips_reg, rem_code_phase_chips_reg); indexn = n0; - for(n = 0; n < neon_iters; n++) + for (n = 0; n < neon_iters; n++) { __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][4 * n + 3], 1, 0); __VOLK_GNSSSDR_PREFETCH(&local_code_chip_index[4]); @@ -569,7 +577,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_neon(int16_t** result, c // fmod c = vmulq_f32(aux, reciprocal); - i = vcvtq_s32_f32(c); + i = vcvtq_s32_f32(c); cTrunc = vcvtq_f32_s32(i); base = vmulq_f32(cTrunc, code_length_chips_reg_f); aux = vsubq_f32(aux, base); @@ -581,13 +589,13 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_neon(int16_t** result, c vst1q_s32((int32_t*)local_code_chip_index, local_code_chip_index_reg); - for(k = 0; k < 4; ++k) + for (k = 0; k < 4; ++k) { _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]]; } indexn = vaddq_f32(indexn, fours); } - for(n = neon_iters * 4; n < num_points; n++) + for (n = neon_iters * 4; n < num_points; n++) { __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][n], 1, 0); // resample code for current tap @@ -605,4 +613,3 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_neon(int16_t** result, c #endif /*INCLUDED_volk_gnsssdr_16i_xn_resampler_16i_xn_H*/ - diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn.h index 230401ccb..fbf7e31f1 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn.h @@ -86,11 +86,11 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_generic(lv_16s unsigned int n; for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - result[n_vec] = lv_cmake(0,0); + result[n_vec] = lv_cmake(0, 0); } for (n = 0; n < num_points; n++) { - tmp16 = *in_common++; //if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); + tmp16 = *in_common++; //if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase); tmp16 = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32))); @@ -131,14 +131,14 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_generic_reload const unsigned int ROTATOR_RELOAD = 256; for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - result[n_vec] = lv_cmake(0,0); + result[n_vec] = lv_cmake(0, 0); } for (n = 0; n < num_points / ROTATOR_RELOAD; n++) { for (j = 0; j < ROTATOR_RELOAD; j++) { - tmp16 = *in_common++; //if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); + tmp16 = *in_common++; //if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase); tmp16 = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32))); 
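The generic rotator's inner step, visible in the hunk above, lifts each 16-bit complex sample to float, multiplies it by the running phasor, and rounds back with rintf. A scalar sketch of one such step; plain structs stand in for lv_16sc_t and lv_32fc_t, and the helper name is hypothetical:

    #include <math.h>
    #include <stdint.h>

    typedef struct { int16_t r, i; } c16;  /* stand-in for lv_16sc_t */
    typedef struct { float r, i; } c32f;   /* stand-in for lv_32fc_t */

    /* Rotate one sample by *phase, round to 16 bits, advance the phasor. */
    static c16 rotate_once(c16 in, c32f *phase, c32f inc)
    {
        c32f t = {in.r * phase->r - in.i * phase->i,
                  in.r * phase->i + in.i * phase->r};
        c16 out = {(int16_t)rintf(t.r), (int16_t)rintf(t.i)};
        c32f p = {phase->r * inc.r - phase->i * inc.i,
                  phase->r * inc.i + phase->i * inc.r};
        *phase = p;
        return out;
    }

Because the phasor magnitude drifts under repeated finite-precision multiplies, the _reload variants renormalise it every ROTATOR_RELOAD samples, which is what the "Regenerate phase" block just below does.
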
(*phase) *= phase_inc; @@ -149,7 +149,7 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_generic_reload result[n_vec] = lv_cmake(sat_adds16i(lv_creal(result[n_vec]), lv_creal(tmp)), sat_adds16i(lv_cimag(result[n_vec]), lv_cimag(tmp))); } } - /* Regenerate phase */ + /* Regenerate phase */ #ifdef __cplusplus (*phase) /= std::abs((*phase)); #else @@ -160,13 +160,13 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_generic_reload for (j = 0; j < num_points % ROTATOR_RELOAD; j++) { - tmp16 = *in_common++; //if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); + tmp16 = *in_common++; //if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase); tmp16 = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32))); (*phase) *= phase_inc; for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - lv_16sc_t tmp = tmp16 * in_a[n_vec][ (num_points / ROTATOR_RELOAD) * ROTATOR_RELOAD + j ]; + lv_16sc_t tmp = tmp16 * in_a[n_vec][(num_points / ROTATOR_RELOAD) * ROTATOR_RELOAD + j]; //lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n])))); result[n_vec] = lv_cmake(sat_adds16i(lv_creal(result[n_vec]), lv_creal(tmp)), sat_adds16i(lv_cimag(result[n_vec]), lv_cimag(tmp))); } @@ -179,9 +179,9 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_generic_reload #ifdef LV_HAVE_SSE3 #include -static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const int16_t** in_a, int num_a_vectors, unsigned int num_points) +static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const int16_t** in_a, int num_a_vectors, unsigned int num_points) { - lv_16sc_t dotProduct = lv_cmake(0,0); + lv_16sc_t dotProduct = lv_cmake(0, 0); const unsigned int sse_iters = num_points / 4; int n_vec; @@ -192,7 +192,8 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc const lv_16sc_t* _in_common = in_common; lv_16sc_t* _out = result; - __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) + lv_16sc_t dotProductVector[4]; __m128i* cacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment()); @@ -206,11 +207,13 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc // phase rotation registers __m128 pa, pb, two_phase_acc_reg, two_phase_inc_reg; __m128i pc1, pc2; - __VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_inc[2]; + __VOLK_ATTR_ALIGNED(16) + lv_32fc_t two_phase_inc[2]; two_phase_inc[0] = phase_inc * phase_inc; two_phase_inc[1] = phase_inc * phase_inc; - two_phase_inc_reg = _mm_load_ps((float*) two_phase_inc); - __VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_acc[2]; + two_phase_inc_reg = _mm_load_ps((float*)two_phase_inc); + __VOLK_ATTR_ALIGNED(16) + lv_32fc_t two_phase_acc[2]; two_phase_acc[0] = (*phase); two_phase_acc[1] = (*phase) * phase_inc; two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc); @@ -218,62 +221,62 @@ static inline void 
volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc lv_16sc_t tmp16; lv_32fc_t tmp32; - for(number = 0; number < sse_iters; number++) + for (number = 0; number < sse_iters; number++) { // Phase rotation on operand in_common starts here: //printf("generic phase %i: %f,%f\n", n*4,lv_creal(*phase),lv_cimag(*phase)); - pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg //complex 32fc multiplication b=a*two_phase_acc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - pc1 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + pc1 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di //next two samples _in_common += 2; - pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg __VOLK_GNSSSDR_PREFETCH(_in_common + 8); //complex 32fc multiplication b=a*two_phase_acc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = 
_mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - pc2 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + pc2 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di // store four rotated in_common samples in the register b - b = _mm_packs_epi32(pc1, pc2);// convert from 32ic to 16ic + b = _mm_packs_epi32(pc1, pc2); // convert from 32ic to 16ic //next two samples _in_common += 2; for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - a = _mm_loadl_epi64((__m128i*)&(_in_a[n_vec][number*4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg + a = _mm_loadl_epi64((__m128i*)&(_in_a[n_vec][number * 4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg - a = _mm_unpacklo_epi16( a, a ); + a = _mm_unpacklo_epi16(a, a); - c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... + c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... 
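The moveldup/movehdup/addsub sequence annotated above is the standard SSE3 packed complex multiply, repeated in this file for every phase update. Collected into one helper for reference (the kernels inline these steps directly, and the project keeps similar helpers in volk_gnsssdr_sse3_intrinsics.h; this name is illustrative):

    #include <pmmintrin.h>

    /* SSE3 multiply of two packed complex floats laid out re,im,re,im. */
    static inline __m128 complexmul_ps_sketch(__m128 x, __m128 y)
    {
        __m128 yl = _mm_moveldup_ps(y);          /* cr,cr,dr,dr          */
        __m128 yh = _mm_movehdup_ps(y);          /* ci,ci,di,di          */
        __m128 t1 = _mm_mul_ps(x, yl);           /* ar*cr, ai*cr, ...    */
        __m128 xs = _mm_shuffle_ps(x, x, 0xB1);  /* ai,ar,bi,br          */
        __m128 t2 = _mm_mul_ps(xs, yh);          /* ai*ci, ar*ci, ...    */
        return _mm_addsub_ps(t1, t2);            /* ar*cr-ai*ci,
                                                    ai*cr+ar*ci, ...     */
    }
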
cacc[n_vec] = _mm_adds_epi16(cacc[n_vec], c); } @@ -290,14 +293,13 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - a = cacc[n_vec]; - _mm_store_si128((__m128i*)dotProductVector, a); // Store the results back into the dot product vector - dotProduct = lv_cmake(0,0); + _mm_store_si128((__m128i*)dotProductVector, cacc[n_vec]); // Store the results back into the dot product vector + dotProduct = lv_cmake(0, 0); for (i = 0; i < 4; ++i) { dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[i])), - sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i]))); + sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i]))); } _out[n_vec] = dotProduct; } @@ -313,7 +315,7 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc //(*phase) = lv_cmake((float*)two_phase_acc[0], (float*)two_phase_acc[1]); (*phase) = two_phase_acc[0]; - for(n = sse_iters * 4; n < num_points; n++) + for (n = sse_iters * 4; n < num_points; n++) { tmp16 = in_common[n]; //printf("a_sse phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase); @@ -325,7 +327,7 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc lv_16sc_t tmp = tmp16 * in_a[n_vec][n]; //lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n])))); _out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), - sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); + sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); } } } @@ -337,245 +339,245 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc //static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_sse3_reload(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const int16_t** in_a, int num_a_vectors, unsigned int num_points) //{ - //lv_16sc_t dotProduct = lv_cmake(0,0); +//lv_16sc_t dotProduct = lv_cmake(0,0); - //const unsigned int sse_iters = num_points / 4; - //const unsigned int ROTATOR_RELOAD = 128; - //int n_vec; - //int i; - //unsigned int number; - //unsigned int j; - //unsigned int n; +//const unsigned int sse_iters = num_points / 4; +//const unsigned int ROTATOR_RELOAD = 128; +//int n_vec; +//int i; +//unsigned int number; +//unsigned int j; +//unsigned int n; - //const int16_t** _in_a = in_a; - //const lv_16sc_t* _in_common = in_common; - //lv_16sc_t* _out = result; +//const int16_t** _in_a = in_a; +//const lv_16sc_t* _in_common = in_common; +//lv_16sc_t* _out = result; - //__VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; +//__VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; - //__m128i* realcacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment()); - //__m128i* imagcacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment()); +//__m128i* realcacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment()); +//__m128i* imagcacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment()); - //for (n_vec = 0; n_vec < num_a_vectors; n_vec++) - //{ - //realcacc[n_vec] = 
_mm_setzero_si128(); - //imagcacc[n_vec] = _mm_setzero_si128(); - //} +//for (n_vec = 0; n_vec < num_a_vectors; n_vec++) +//{ +//realcacc[n_vec] = _mm_setzero_si128(); +//imagcacc[n_vec] = _mm_setzero_si128(); +//} - //__m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl; +//__m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl; - //mask_imag = _mm_set_epi8(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0); - //mask_real = _mm_set_epi8(0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255); +//mask_imag = _mm_set_epi8(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0); +//mask_real = _mm_set_epi8(0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255); - //// phase rotation registers - //__m128 pa, pb, two_phase_acc_reg, two_phase_inc_reg; - //__m128i pc1, pc2; - //__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_inc[2]; - //two_phase_inc[0] = phase_inc * phase_inc; - //two_phase_inc[1] = phase_inc * phase_inc; - //two_phase_inc_reg = _mm_load_ps((float*) two_phase_inc); - //__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_acc[2]; - //two_phase_acc[0] = (*phase); - //two_phase_acc[1] = (*phase) * phase_inc; - //two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc); - //__m128 yl, yh, tmp1, tmp2, tmp3; - //lv_16sc_t tmp16; - //lv_32fc_t tmp32; +//// phase rotation registers +//__m128 pa, pb, two_phase_acc_reg, two_phase_inc_reg; +//__m128i pc1, pc2; +//__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_inc[2]; +//two_phase_inc[0] = phase_inc * phase_inc; +//two_phase_inc[1] = phase_inc * phase_inc; +//two_phase_inc_reg = _mm_load_ps((float*) two_phase_inc); +//__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_acc[2]; +//two_phase_acc[0] = (*phase); +//two_phase_acc[1] = (*phase) * phase_inc; +//two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc); +//__m128 yl, yh, tmp1, tmp2, tmp3; +//lv_16sc_t tmp16; +//lv_32fc_t tmp32; - //for (number = 0; number < sse_iters / ROTATOR_RELOAD; ++number) - //{ - //for (j = 0; j < ROTATOR_RELOAD; j++) - //{ - //// Phase rotation on operand in_common starts here: - ////printf("generic phase %i: %f,%f\n", n*4,lv_creal(*phase),lv_cimag(*phase)); - //pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg - ////complex 32fc multiplication b=a*two_phase_acc_reg - //yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - //yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - //tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - //pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br - //tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - //pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - //pc1 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic +//for (number = 0; number < sse_iters / ROTATOR_RELOAD; ++number) +//{ +//for (j = 0; j < ROTATOR_RELOAD; j++) +//{ +//// Phase rotation on operand in_common starts here: +////printf("generic phase %i: %f,%f\n", n*4,lv_creal(*phase),lv_cimag(*phase)); +//pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg +////complex 32fc multiplication b=a*two_phase_acc_reg +//yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with 
cr,cr,dr,dr +//yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di +//tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr +//pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br +//tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di +//pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +//pc1 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic - ////complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - //yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - //yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - //tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - //tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br - //tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - //two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +////complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg +//yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr +//yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di +//tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr +//tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br +//tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di +//two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - ////next two samples - //_in_common += 2; - //pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg - //__VOLK_GNSSSDR_PREFETCH(_in_common + 8); - ////complex 32fc multiplication b=a*two_phase_acc_reg - //yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - //yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - //tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - //pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br - //tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - //pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - //pc2 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic +////next two samples +//_in_common += 2; +//pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg +//__VOLK_GNSSSDR_PREFETCH(_in_common + 8); +////complex 32fc multiplication b=a*two_phase_acc_reg +//yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr +//yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di +//tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr +//pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br +//tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di +//pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +//pc2 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic - ////complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - //yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - //yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - 
//tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - //tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br - //tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - //two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +////complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg +//yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr +//yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di +//tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr +//tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br +//tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di +//two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - //// store four rotated in_common samples in the register b - //b = _mm_packs_epi32(pc1, pc2);// convert from 32ic to 16ic +//// store four rotated in_common samples in the register b +//b = _mm_packs_epi32(pc1, pc2);// convert from 32ic to 16ic - ////next two samples - //_in_common += 2; +////next two samples +//_in_common += 2; - //for (n_vec = 0; n_vec < num_a_vectors; n_vec++) - //{ - //a = _mm_load_si128((__m128i*)&(_in_a[n_vec][(number * ROTATOR_RELOAD + j) * 4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg +//for (n_vec = 0; n_vec < num_a_vectors; n_vec++) +//{ +//a = _mm_load_si128((__m128i*)&(_in_a[n_vec][(number * ROTATOR_RELOAD + j) * 4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg - //c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... +//c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... - //c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. - //real = _mm_subs_epi16(c, c_sr); +//c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. +//real = _mm_subs_epi16(c, c_sr); - //b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... - //a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... +//b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... +//a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... - //imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... - //imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... +//imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... +//imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... 
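For reference while reading this commented-out reload variant: its mullo/shift sequence computes the 16-bit complex product one lane pair at a time, with the real part taken from c minus its 2-byte right shift and the imaginary part from the two cross products. In scalar form (hypothetical helper; the SIMD code additionally saturates via _mm_subs_epi16 and _mm_adds_epi16, while this sketch wraps):

    #include <stdint.h>

    /* Scalar view of one lane pair: real = c - c_sr,
       imag = imag1 + imag2, as in the commented code above. */
    static inline void cmul16_sketch(int16_t ar, int16_t ai,
                                     int16_t br, int16_t bi,
                                     int16_t *re, int16_t *im)
    {
        *re = (int16_t)(ar * br - ai * bi);  /* _mm_subs_epi16(c, c_sr)      */
        *im = (int16_t)(ai * br + ar * bi);  /* _mm_adds_epi16(imag1, imag2) */
    }
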
- //imag = _mm_adds_epi16(imag1, imag2); +//imag = _mm_adds_epi16(imag1, imag2); - //realcacc[n_vec] = _mm_adds_epi16(realcacc[n_vec], real); - //imagcacc[n_vec] = _mm_adds_epi16(imagcacc[n_vec], imag); - //} - //} - //// regenerate phase - //tmp1 = _mm_mul_ps(two_phase_acc_reg, two_phase_acc_reg); - //tmp2 = _mm_hadd_ps(tmp1, tmp1); - //tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8); - //tmp2 = _mm_sqrt_ps(tmp1); - //two_phase_acc_reg = _mm_div_ps(two_phase_acc_reg, tmp2); - //} +//realcacc[n_vec] = _mm_adds_epi16(realcacc[n_vec], real); +//imagcacc[n_vec] = _mm_adds_epi16(imagcacc[n_vec], imag); +//} +//} +//// regenerate phase +//tmp1 = _mm_mul_ps(two_phase_acc_reg, two_phase_acc_reg); +//tmp2 = _mm_hadd_ps(tmp1, tmp1); +//tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8); +//tmp2 = _mm_sqrt_ps(tmp1); +//two_phase_acc_reg = _mm_div_ps(two_phase_acc_reg, tmp2); +//} - //for (j = 0; j < sse_iters % ROTATOR_RELOAD; j++) - //{ - //pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg - ////complex 32fc multiplication b=a*two_phase_acc_reg - //yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - //yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - //tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - //pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br - //tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - //pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - //pc1 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic +//for (j = 0; j < sse_iters % ROTATOR_RELOAD; j++) +//{ +//pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg +////complex 32fc multiplication b=a*two_phase_acc_reg +//yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr +//yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di +//tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr +//pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br +//tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di +//pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +//pc1 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic - ////complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - //yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - //yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - //tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - //tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br - //tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - //two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +////complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg +//yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr +//yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di +//tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr +//tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br +//tmp2 = _mm_mul_ps(tmp3, yh); // 
tmp2 = ai*ci,ar*ci,bi*di,br*di +//two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - ////next two samples - //_in_common += 2; - //pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg - //__VOLK_GNSSSDR_PREFETCH(_in_common + 8); - ////complex 32fc multiplication b=a*two_phase_acc_reg - //yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - //yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - //tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - //pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br - //tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - //pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - //pc2 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic +////next two samples +//_in_common += 2; +//pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg +//__VOLK_GNSSSDR_PREFETCH(_in_common + 8); +////complex 32fc multiplication b=a*two_phase_acc_reg +//yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr +//yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di +//tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr +//pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br +//tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di +//pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +//pc2 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic - ////complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - //yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - //yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - //tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - //tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br - //tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - //two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +////complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg +//yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr +//yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di +//tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr +//tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br +//tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di +//two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - //// store four rotated in_common samples in the register b - //b = _mm_packs_epi32(pc1, pc2);// convert from 32ic to 16ic +//// store four rotated in_common samples in the register b +//b = _mm_packs_epi32(pc1, pc2);// convert from 32ic to 16ic - ////next two samples - //_in_common += 2; +////next two samples +//_in_common += 2; - //for (n_vec = 0; n_vec < num_a_vectors; n_vec++) - //{ - //a = _mm_load_si128((__m128i*)&(_in_a[n_vec][((sse_iters / ROTATOR_RELOAD) * ROTATOR_RELOAD + j) * 4])); //load (2 byte imag, 2 byte real) x 
4 into 128 bits reg +//for (n_vec = 0; n_vec < num_a_vectors; n_vec++) +//{ +//a = _mm_load_si128((__m128i*)&(_in_a[n_vec][((sse_iters / ROTATOR_RELOAD) * ROTATOR_RELOAD + j) * 4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg - //c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... +//c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... - //c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. - //real = _mm_subs_epi16(c, c_sr); +//c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. +//real = _mm_subs_epi16(c, c_sr); - //b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... - //a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... +//b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... +//a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... - //imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... - //imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... +//imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... +//imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... - //imag = _mm_adds_epi16(imag1, imag2); +//imag = _mm_adds_epi16(imag1, imag2); - //realcacc[n_vec] = _mm_adds_epi16(realcacc[n_vec], real); - //imagcacc[n_vec] = _mm_adds_epi16(imagcacc[n_vec], imag); - //} - //} +//realcacc[n_vec] = _mm_adds_epi16(realcacc[n_vec], real); +//imagcacc[n_vec] = _mm_adds_epi16(imagcacc[n_vec], imag); +//} +//} - //for (n_vec = 0; n_vec < num_a_vectors; n_vec++) - //{ - //realcacc[n_vec] = _mm_and_si128(realcacc[n_vec], mask_real); - //imagcacc[n_vec] = _mm_and_si128(imagcacc[n_vec], mask_imag); +//for (n_vec = 0; n_vec < num_a_vectors; n_vec++) +//{ +//realcacc[n_vec] = _mm_and_si128(realcacc[n_vec], mask_real); +//imagcacc[n_vec] = _mm_and_si128(imagcacc[n_vec], mask_imag); - //a = _mm_or_si128(realcacc[n_vec], imagcacc[n_vec]); +//a = _mm_or_si128(realcacc[n_vec], imagcacc[n_vec]); - //_mm_store_si128((__m128i*)dotProductVector, a); // Store the results back into the dot product vector - //dotProduct = lv_cmake(0,0); - //for (i = 0; i < 4; ++i) - //{ - //dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[i])), - //sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i]))); - //} - //_out[n_vec] = dotProduct; - //} +//_mm_store_si128((__m128i*)dotProductVector, a); // Store the results back into the dot product vector +//dotProduct = lv_cmake(0,0); +//for (i = 0; i < 4; ++i) +//{ +//dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[i])), +//sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i]))); +//} +//_out[n_vec] = dotProduct; +//} - //volk_gnsssdr_free(realcacc); - //volk_gnsssdr_free(imagcacc); +//volk_gnsssdr_free(realcacc); +//volk_gnsssdr_free(imagcacc); - //tmp1 = _mm_mul_ps(two_phase_acc_reg, two_phase_acc_reg); - //tmp2 = _mm_hadd_ps(tmp1, tmp1); - //tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8); - //tmp2 = _mm_sqrt_ps(tmp1); - //two_phase_acc_reg = _mm_div_ps(two_phase_acc_reg, tmp2); +//tmp1 = _mm_mul_ps(two_phase_acc_reg, two_phase_acc_reg); +//tmp2 = _mm_hadd_ps(tmp1, tmp1); +//tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8); +//tmp2 = _mm_sqrt_ps(tmp1); +//two_phase_acc_reg = _mm_div_ps(two_phase_acc_reg, tmp2); - //_mm_store_ps((float*)two_phase_acc, two_phase_acc_reg); - ////(*phase) = lv_cmake((float*)two_phase_acc[0], (float*)two_phase_acc[1]); - //(*phase) = two_phase_acc[0]; +//_mm_store_ps((float*)two_phase_acc, two_phase_acc_reg); +////(*phase) = lv_cmake((float*)two_phase_acc[0], 
(float*)two_phase_acc[1]); +//(*phase) = two_phase_acc[0]; - //for(n = sse_iters * 4; n < num_points; n++) - //{ - //tmp16 = in_common[n]; //printf("a_sse phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); - //tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase); - //tmp16 = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32))); - //(*phase) *= phase_inc; +//for(n = sse_iters * 4; n < num_points; n++) +//{ +//tmp16 = in_common[n]; //printf("a_sse phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); +//tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase); +//tmp16 = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32))); +//(*phase) *= phase_inc; - //for (n_vec = 0; n_vec < num_a_vectors; n_vec++) - //{ - //lv_16sc_t tmp = tmp16 * in_a[n_vec][n]; - ////lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n])))); - //_out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), - //sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); - //} - //} +//for (n_vec = 0; n_vec < num_a_vectors; n_vec++) +//{ +//lv_16sc_t tmp = tmp16 * in_a[n_vec][n]; +////lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n])))); +//_out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), +//sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); +//} +//} //} //#endif [> LV_HAVE_SSE3 <] @@ -584,9 +586,9 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc #ifdef LV_HAVE_SSE3 #include -static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const int16_t** in_a, int num_a_vectors, unsigned int num_points) +static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const int16_t** in_a, int num_a_vectors, unsigned int num_points) { - lv_16sc_t dotProduct = lv_cmake(0,0); + lv_16sc_t dotProduct = lv_cmake(0, 0); const unsigned int sse_iters = num_points / 4; int n_vec; @@ -597,7 +599,8 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc const lv_16sc_t* _in_common = in_common; lv_16sc_t* _out = result; - __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) + lv_16sc_t dotProductVector[4]; __m128i* cacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment()); @@ -611,11 +614,13 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc // phase rotation registers __m128 pa, pb, two_phase_acc_reg, two_phase_inc_reg; __m128i pc1, pc2; - __VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_inc[2]; + __VOLK_ATTR_ALIGNED(16) + lv_32fc_t two_phase_inc[2]; two_phase_inc[0] = phase_inc * phase_inc; two_phase_inc[1] = phase_inc * phase_inc; - two_phase_inc_reg = _mm_load_ps((float*) two_phase_inc); - __VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_acc[2]; + two_phase_inc_reg = _mm_load_ps((float*)two_phase_inc); + 
__VOLK_ATTR_ALIGNED(16) + lv_32fc_t two_phase_acc[2]; two_phase_acc[0] = (*phase); two_phase_acc[1] = (*phase) * phase_inc; two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc); @@ -623,62 +628,62 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc lv_16sc_t tmp16; lv_32fc_t tmp32; - for(number = 0; number < sse_iters; number++) + for (number = 0; number < sse_iters; number++) { // Phase rotation on operand in_common starts here: //printf("generic phase %i: %f,%f\n", n*4,lv_creal(*phase),lv_cimag(*phase)); - pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg //complex 32fc multiplication b=a*two_phase_acc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - pc1 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + pc1 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di //next two samples _in_common += 2; - pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg 
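The _mm_set_ps line above widens two interleaved 16-bit I/Q samples into float lanes for the rotation; note that _mm_set_ps takes its arguments from the highest lane down. A sketch of the same load with a hypothetical helper name:

    #include <xmmintrin.h>
    #include <stdint.h>

    /* Lanes 0..3 = I0, Q0, I1, Q1: two 16-bit complex samples
       widened to floats, as done inline in the kernel. */
    static inline __m128 widen_2x16ic(const int16_t *iq)
    {
        return _mm_set_ps((float)iq[3], (float)iq[2],
                          (float)iq[1], (float)iq[0]);
    }
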
__VOLK_GNSSSDR_PREFETCH(_in_common + 8); //complex 32fc multiplication b=a*two_phase_acc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - pc2 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + pc2 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di // store four rotated in_common samples in the register b - b = _mm_packs_epi32(pc1, pc2);// convert from 32ic to 16ic + b = _mm_packs_epi32(pc1, pc2); // convert from 32ic to 16ic //next two samples _in_common += 2; for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - a = _mm_loadl_epi64((__m128i*)&(_in_a[n_vec][number*4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg + a = _mm_loadl_epi64((__m128i*)&(_in_a[n_vec][number * 4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg - a = _mm_unpacklo_epi16( a, a ); + a = _mm_unpacklo_epi16(a, a); - c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... + c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... 
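Both the vector accumulators (the _mm_adds_epi16 just below) and the scalar tail loops (sat_adds16i) add with saturation, so correlator sums clamp at the 16-bit limits instead of wrapping. A scalar sketch of those semantics (cf. sat_adds16i in saturation_arithmetic.h; this name is illustrative):

    #include <stdint.h>

    /* Saturating 16-bit add: clamp to [INT16_MIN, INT16_MAX],
       matching the per-lane behaviour of _mm_adds_epi16. */
    static inline int16_t sat_adds16i_sketch(int16_t x, int16_t y)
    {
        int32_t s = (int32_t)x + (int32_t)y;
        if (s > INT16_MAX) return INT16_MAX;
        if (s < INT16_MIN) return INT16_MIN;
        return (int16_t)s;
    }
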
cacc[n_vec] = _mm_adds_epi16(cacc[n_vec], c); } @@ -695,14 +700,13 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - a = cacc[n_vec]; - _mm_store_si128((__m128i*)dotProductVector, a); // Store the results back into the dot product vector - dotProduct = lv_cmake(0,0); + _mm_store_si128((__m128i*)dotProductVector, cacc[n_vec]); // Store the results back into the dot product vector + dotProduct = lv_cmake(0, 0); for (i = 0; i < 4; ++i) { dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[i])), - sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i]))); + sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i]))); } _out[n_vec] = dotProduct; } @@ -718,7 +722,7 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc //(*phase) = lv_cmake((float*)two_phase_acc[0], (float*)two_phase_acc[1]); (*phase) = two_phase_acc[0]; - for(n = sse_iters * 4; n < num_points; n++) + for (n = sse_iters * 4; n < num_points; n++) { tmp16 = in_common[n]; //printf("a_sse phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase); @@ -730,7 +734,7 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc lv_16sc_t tmp = tmp16 * in_a[n_vec][n]; //lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n])))); _out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), - sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); + sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); } } } @@ -742,7 +746,7 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc #include #include -static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const int16_t** in_a, int num_a_vectors, unsigned int num_points) +static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const int16_t** in_a, int num_a_vectors, unsigned int num_points) { const unsigned int avx2_iters = num_points / 8; const int16_t** _in_a = in_a; @@ -755,8 +759,9 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc lv_16sc_t tmp16; lv_32fc_t tmp32; - __VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8]; - lv_16sc_t dotProduct = lv_cmake(0,0); + __VOLK_ATTR_ALIGNED(32) + lv_16sc_t dotProductVector[8]; + lv_16sc_t dotProduct = lv_cmake(0, 0); __m256i* cacc = (__m256i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256i), volk_gnsssdr_get_alignment()); @@ -771,7 +776,7 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc __m256 four_phase_acc_reg, four_phase_inc_reg; - lv_32fc_t _phase_inc = phase_inc*phase_inc*phase_inc*phase_inc; + lv_32fc_t _phase_inc = phase_inc * phase_inc * phase_inc * phase_inc; // Normalise the 4*phase increment #ifdef __cplusplus @@ -780,55 +785,57 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc _phase_inc /= hypotf(lv_creal(_phase_inc), lv_cimag(_phase_inc)); #endif - __VOLK_ATTR_ALIGNED(32) lv_32fc_t four_phase_inc[4]; - 
__VOLK_ATTR_ALIGNED(32) lv_32fc_t four_phase_acc[4]; - for( n = 0; n < 4; ++n ) - { - four_phase_inc[n] = _phase_inc; - four_phase_acc[n] = *phase; - *phase *= phase_inc; - } - four_phase_acc_reg = _mm256_load_ps((float*) four_phase_acc); - four_phase_inc_reg = _mm256_load_ps((float*) four_phase_inc); + __VOLK_ATTR_ALIGNED(32) + lv_32fc_t four_phase_inc[4]; + __VOLK_ATTR_ALIGNED(32) + lv_32fc_t four_phase_acc[4]; + for (n = 0; n < 4; ++n) + { + four_phase_inc[n] = _phase_inc; + four_phase_acc[n] = *phase; + *phase *= phase_inc; + } + four_phase_acc_reg = _mm256_load_ps((float*)four_phase_acc); + four_phase_inc_reg = _mm256_load_ps((float*)four_phase_inc); __m256i a2, b2, c, c1, c2, perm_idx; - perm_idx = _mm256_set_epi32( 7, 6, 3, 2, 5, 4, 1, 0); + perm_idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); //perm_idx = _mm256_set_epi32( 0, 1, 4, 5, 2, 3, 6, 7); - for(number = 0; number < avx2_iters; number++) + for (number = 0; number < avx2_iters; number++) { - a128 = _mm_load_si128( (__m128i *)_in_common ); - ai = _mm256_cvtepi16_epi32( a128 ); - a = _mm256_cvtepi32_ps( ai ); + a128 = _mm_load_si128((__m128i*)_in_common); + ai = _mm256_cvtepi16_epi32(a128); + a = _mm256_cvtepi32_ps(ai); //complex 32fc multiplication b=a*two_phase_acc_reg - b = _mm256_complexmul_ps( a, four_phase_acc_reg ); - c1 = _mm256_cvtps_epi32(b); // convert from 32fc to 32ic + b = _mm256_complexmul_ps(a, four_phase_acc_reg); + c1 = _mm256_cvtps_epi32(b); // convert from 32fc to 32ic //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - four_phase_acc_reg = _mm256_complexmul_ps( four_phase_inc_reg, four_phase_acc_reg ); + four_phase_acc_reg = _mm256_complexmul_ps(four_phase_inc_reg, four_phase_acc_reg); //next four samples _in_common += 4; - a128 = _mm_load_si128( (__m128i *)_in_common ); - ai = _mm256_cvtepi16_epi32( a128 ); - a = _mm256_cvtepi32_ps( ai ); + a128 = _mm_load_si128((__m128i*)_in_common); + ai = _mm256_cvtepi16_epi32(a128); + a = _mm256_cvtepi32_ps(ai); //complex 32fc multiplication b=a*two_phase_acc_reg - b = _mm256_complexmul_ps( a, four_phase_acc_reg ); - c2 = _mm256_cvtps_epi32(b); // convert from 32fc to 32ic + b = _mm256_complexmul_ps(a, four_phase_acc_reg); + c2 = _mm256_cvtps_epi32(b); // convert from 32fc to 32ic //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - four_phase_acc_reg = _mm256_complexmul_ps( four_phase_inc_reg, four_phase_acc_reg ); + four_phase_acc_reg = _mm256_complexmul_ps(four_phase_inc_reg, four_phase_acc_reg); __VOLK_GNSSSDR_PREFETCH(_in_common + 16); // Store and convert 32ic to 16ic: - b2 = _mm256_packs_epi32( c1, c2 ); + b2 = _mm256_packs_epi32(c1, c2); - b2 = _mm256_permutevar8x32_epi32( b2, perm_idx ); + b2 = _mm256_permutevar8x32_epi32(b2, perm_idx); _in_common += 4; @@ -836,10 +843,10 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc { ain_128 = _mm_load_si128((__m128i*)&(_in_a[n_vec][number * 8])); - ain_128_lo = _mm_unpacklo_epi16( ain_128, ain_128 ); - ain_128_hi = _mm_unpackhi_epi16( ain_128, ain_128 ); + ain_128_lo = _mm_unpacklo_epi16(ain_128, ain_128); + ain_128_hi = _mm_unpackhi_epi16(ain_128, ain_128); - a2 = _mm256_insertf128_si256( _mm256_castsi128_si256(ain_128_lo), ain_128_hi, 1); + a2 = _mm256_insertf128_si256(_mm256_castsi128_si256(ain_128_lo), ain_128_hi, 1); c = _mm256_mullo_epi16(a2, b2); @@ -856,12 +863,12 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc { a2 = cacc[n_vec]; - 
_mm256_store_si256((__m256i*)dotProductVector, a2); // Store the results back into the dot product vector - dotProduct = lv_cmake(0,0); + _mm256_store_si256((__m256i*)dotProductVector, a2); // Store the results back into the dot product vector + dotProduct = lv_cmake(0, 0); for (number = 0; number < 8; ++number) { dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])), - sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number]))); + sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number]))); } _out[n_vec] = dotProduct; } @@ -872,7 +879,7 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc _mm256_store_ps((float*)four_phase_acc, four_phase_acc_reg); (*phase) = four_phase_acc[0]; - for(n = avx2_iters * 8; n < num_points; n++) + for (n = avx2_iters * 8; n < num_points; n++) { tmp16 = in_common[n]; tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase); @@ -882,10 +889,9 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc { lv_16sc_t tmp = tmp16 * in_a[n_vec][n]; _out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), - sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); + sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); } } - } #endif /* LV_HAVE_AVX2 */ @@ -894,7 +900,7 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc #include #include -static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_u_avx2(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const int16_t** in_a, int num_a_vectors, unsigned int num_points) +static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_u_avx2(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const int16_t** in_a, int num_a_vectors, unsigned int num_points) { const unsigned int avx2_iters = num_points / 8; const int16_t** _in_a = in_a; @@ -907,8 +913,9 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_u_avx2(lv_16sc lv_16sc_t tmp16; lv_32fc_t tmp32; - __VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8]; - lv_16sc_t dotProduct = lv_cmake(0,0); + __VOLK_ATTR_ALIGNED(32) + lv_16sc_t dotProductVector[8]; + lv_16sc_t dotProduct = lv_cmake(0, 0); __m256i* cacc = (__m256i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256i), volk_gnsssdr_get_alignment()); @@ -923,7 +930,7 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_u_avx2(lv_16sc __m256 four_phase_acc_reg, four_phase_inc_reg; - lv_32fc_t _phase_inc = phase_inc*phase_inc*phase_inc*phase_inc; + lv_32fc_t _phase_inc = phase_inc * phase_inc * phase_inc * phase_inc; // Normalise the 4*phase increment #ifdef __cplusplus @@ -932,55 +939,57 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_u_avx2(lv_16sc _phase_inc /= hypotf(lv_creal(_phase_inc), lv_cimag(_phase_inc)); #endif - __VOLK_ATTR_ALIGNED(32) lv_32fc_t four_phase_inc[4]; - __VOLK_ATTR_ALIGNED(32) lv_32fc_t four_phase_acc[4]; - for( n = 0; n < 4; ++n ) - { - four_phase_inc[n] = _phase_inc; - four_phase_acc[n] = *phase; - *phase *= phase_inc; - } - four_phase_acc_reg = _mm256_load_ps((float*) four_phase_acc); - four_phase_inc_reg = _mm256_load_ps((float*) four_phase_inc); + __VOLK_ATTR_ALIGNED(32) + lv_32fc_t four_phase_inc[4]; + __VOLK_ATTR_ALIGNED(32) + lv_32fc_t four_phase_acc[4]; + for (n = 0; n < 4; ++n) + { + four_phase_inc[n] = _phase_inc; + four_phase_acc[n] = *phase; + *phase *= phase_inc; + } + 
four_phase_acc_reg = _mm256_load_ps((float*)four_phase_acc); + four_phase_inc_reg = _mm256_load_ps((float*)four_phase_inc); __m256i a2, b2, c, c1, c2, perm_idx; - perm_idx = _mm256_set_epi32( 7, 6, 3, 2, 5, 4, 1, 0); + perm_idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); //perm_idx = _mm256_set_epi32( 0, 1, 4, 5, 2, 3, 6, 7); - for(number = 0; number < avx2_iters; number++) + for (number = 0; number < avx2_iters; number++) { - a128 = _mm_loadu_si128( (__m128i *)_in_common ); - ai = _mm256_cvtepi16_epi32( a128 ); - a = _mm256_cvtepi32_ps( ai ); + a128 = _mm_loadu_si128((__m128i*)_in_common); + ai = _mm256_cvtepi16_epi32(a128); + a = _mm256_cvtepi32_ps(ai); //complex 32fc multiplication b=a*two_phase_acc_reg - b = _mm256_complexmul_ps( a, four_phase_acc_reg ); - c1 = _mm256_cvtps_epi32(b); // convert from 32fc to 32ic + b = _mm256_complexmul_ps(a, four_phase_acc_reg); + c1 = _mm256_cvtps_epi32(b); // convert from 32fc to 32ic //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - four_phase_acc_reg = _mm256_complexmul_ps( four_phase_inc_reg, four_phase_acc_reg ); + four_phase_acc_reg = _mm256_complexmul_ps(four_phase_inc_reg, four_phase_acc_reg); //next four samples _in_common += 4; - a128 = _mm_loadu_si128( (__m128i *)_in_common ); - ai = _mm256_cvtepi16_epi32( a128 ); - a = _mm256_cvtepi32_ps( ai ); + a128 = _mm_loadu_si128((__m128i*)_in_common); + ai = _mm256_cvtepi16_epi32(a128); + a = _mm256_cvtepi32_ps(ai); //complex 32fc multiplication b=a*two_phase_acc_reg - b = _mm256_complexmul_ps( a, four_phase_acc_reg ); - c2 = _mm256_cvtps_epi32(b); // convert from 32fc to 32ic + b = _mm256_complexmul_ps(a, four_phase_acc_reg); + c2 = _mm256_cvtps_epi32(b); // convert from 32fc to 32ic //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - four_phase_acc_reg = _mm256_complexmul_ps( four_phase_inc_reg, four_phase_acc_reg ); + four_phase_acc_reg = _mm256_complexmul_ps(four_phase_inc_reg, four_phase_acc_reg); __VOLK_GNSSSDR_PREFETCH(_in_common + 16); // Store and convert 32ic to 16ic: - b2 = _mm256_packs_epi32( c1, c2 ); + b2 = _mm256_packs_epi32(c1, c2); - b2 = _mm256_permutevar8x32_epi32( b2, perm_idx ); + b2 = _mm256_permutevar8x32_epi32(b2, perm_idx); _in_common += 4; @@ -988,10 +997,10 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_u_avx2(lv_16sc { ain_128 = _mm_loadu_si128((__m128i*)&(_in_a[n_vec][number * 8])); - ain_128_lo = _mm_unpacklo_epi16( ain_128, ain_128 ); - ain_128_hi = _mm_unpackhi_epi16( ain_128, ain_128 ); + ain_128_lo = _mm_unpacklo_epi16(ain_128, ain_128); + ain_128_hi = _mm_unpackhi_epi16(ain_128, ain_128); - a2 = _mm256_insertf128_si256( _mm256_castsi128_si256(ain_128_lo), ain_128_hi, 1); + a2 = _mm256_insertf128_si256(_mm256_castsi128_si256(ain_128_lo), ain_128_hi, 1); c = _mm256_mullo_epi16(a2, b2); @@ -1008,12 +1017,12 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_u_avx2(lv_16sc { a2 = cacc[n_vec]; - _mm256_store_si256((__m256i*)dotProductVector, a2); // Store the results back into the dot product vector - dotProduct = lv_cmake(0,0); + _mm256_store_si256((__m256i*)dotProductVector, a2); // Store the results back into the dot product vector + dotProduct = lv_cmake(0, 0); for (number = 0; number < 8; ++number) { dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])), - sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number]))); + sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number]))); } _out[n_vec] = 
dotProduct; } @@ -1024,7 +1033,7 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_u_avx2(lv_16sc _mm256_store_ps((float*)four_phase_acc, four_phase_acc_reg); (*phase) = four_phase_acc[0]; - for(n = avx2_iters * 8; n < num_points; n++) + for (n = avx2_iters * 8; n < num_points; n++) { tmp16 = in_common[n]; tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase); @@ -1034,10 +1043,9 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_u_avx2(lv_16sc { lv_16sc_t tmp = tmp16 * in_a[n_vec][n]; _out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), - sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); + sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); } } - } #endif /* LV_HAVE_AVX2 */ @@ -1046,178 +1054,178 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_u_avx2(lv_16sc //static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_neon(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const int16_t** in_a, int num_a_vectors, unsigned int num_points) //{ - //const unsigned int neon_iters = num_points / 4; +//const unsigned int neon_iters = num_points / 4; - //const int16_t** _in_a = in_a; - //const lv_16sc_t* _in_common = in_common; - //lv_16sc_t* _out = result; - //int n_vec; - //int i; - //unsigned int number; - //unsigned int n; - //lv_16sc_t tmp16_, tmp; - //lv_32fc_t tmp32_; +//const int16_t** _in_a = in_a; +//const lv_16sc_t* _in_common = in_common; +//lv_16sc_t* _out = result; +//int n_vec; +//int i; +//unsigned int number; +//unsigned int n; +//lv_16sc_t tmp16_, tmp; +//lv_32fc_t tmp32_; - //if (neon_iters > 0) - //{ - //lv_16sc_t dotProduct = lv_cmake(0,0); - //float arg_phase0 = cargf(*phase); - //float arg_phase_inc = cargf(phase_inc); - //float phase_est; +//if (neon_iters > 0) +//{ +//lv_16sc_t dotProduct = lv_cmake(0,0); +//float arg_phase0 = cargf(*phase); +//float arg_phase_inc = cargf(phase_inc); +//float phase_est; - //lv_32fc_t ___phase4 = phase_inc * phase_inc * phase_inc * phase_inc; - //__VOLK_ATTR_ALIGNED(16) float32_t __phase4_real[4] = { lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4) }; - //__VOLK_ATTR_ALIGNED(16) float32_t __phase4_imag[4] = { lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4) }; +//lv_32fc_t ___phase4 = phase_inc * phase_inc * phase_inc * phase_inc; +//__VOLK_ATTR_ALIGNED(16) float32_t __phase4_real[4] = { lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4) }; +//__VOLK_ATTR_ALIGNED(16) float32_t __phase4_imag[4] = { lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4) }; - //float32x4_t _phase4_real = vld1q_f32(__phase4_real); - //float32x4_t _phase4_imag = vld1q_f32(__phase4_imag); +//float32x4_t _phase4_real = vld1q_f32(__phase4_real); +//float32x4_t _phase4_imag = vld1q_f32(__phase4_imag); - //lv_32fc_t phase2 = (lv_32fc_t)(*phase) * phase_inc; - //lv_32fc_t phase3 = phase2 * phase_inc; - //lv_32fc_t phase4 = phase3 * phase_inc; +//lv_32fc_t phase2 = (lv_32fc_t)(*phase) * phase_inc; +//lv_32fc_t phase3 = phase2 * phase_inc; +//lv_32fc_t phase4 = phase3 * phase_inc; - //__VOLK_ATTR_ALIGNED(16) float32_t __phase_real[4] = { lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) }; - //__VOLK_ATTR_ALIGNED(16) float32_t __phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; +//__VOLK_ATTR_ALIGNED(16) float32_t __phase_real[4] 
= { lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) }; +//__VOLK_ATTR_ALIGNED(16) float32_t __phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; - //float32x4_t _phase_real = vld1q_f32(__phase_real); - //float32x4_t _phase_imag = vld1q_f32(__phase_imag); +//float32x4_t _phase_real = vld1q_f32(__phase_real); +//float32x4_t _phase_imag = vld1q_f32(__phase_imag); - //int16x4x2_t a_val, b_val, c_val; - //__VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; - //float32x4_t half = vdupq_n_f32(0.5f); - //int16x4x2_t tmp16; - //int32x4x2_t tmp32i; +//int16x4x2_t a_val, b_val, c_val; +//__VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; +//float32x4_t half = vdupq_n_f32(0.5f); +//int16x4x2_t tmp16; +//int32x4x2_t tmp32i; - //float32x4x2_t tmp32f, tmp32_real, tmp32_imag; - //float32x4_t sign, PlusHalf, Round; +//float32x4x2_t tmp32f, tmp32_real, tmp32_imag; +//float32x4_t sign, PlusHalf, Round; - //int16x4x2_t* accumulator = (int16x4x2_t*)volk_gnsssdr_malloc(num_a_vectors * sizeof(int16x4x2_t), volk_gnsssdr_get_alignment()); +//int16x4x2_t* accumulator = (int16x4x2_t*)volk_gnsssdr_malloc(num_a_vectors * sizeof(int16x4x2_t), volk_gnsssdr_get_alignment()); - //for(n_vec = 0; n_vec < num_a_vectors; n_vec++) - //{ - //accumulator[n_vec].val[0] = vdup_n_s16(0); - //accumulator[n_vec].val[1] = vdup_n_s16(0); - //} +//for(n_vec = 0; n_vec < num_a_vectors; n_vec++) +//{ +//accumulator[n_vec].val[0] = vdup_n_s16(0); +//accumulator[n_vec].val[1] = vdup_n_s16(0); +//} - //for(number = 0; number < neon_iters; number++) - //{ - //[> load 4 complex numbers (int 16 bits each component) <] - //tmp16 = vld2_s16((int16_t*)_in_common); - //__VOLK_GNSSSDR_PREFETCH(_in_common + 8); - //_in_common += 4; +//for(number = 0; number < neon_iters; number++) +//{ +//[> load 4 complex numbers (int 16 bits each component) <] +//tmp16 = vld2_s16((int16_t*)_in_common); +//__VOLK_GNSSSDR_PREFETCH(_in_common + 8); +//_in_common += 4; - //[> promote them to int 32 bits <] - //tmp32i.val[0] = vmovl_s16(tmp16.val[0]); - //tmp32i.val[1] = vmovl_s16(tmp16.val[1]); +//[> promote them to int 32 bits <] +//tmp32i.val[0] = vmovl_s16(tmp16.val[0]); +//tmp32i.val[1] = vmovl_s16(tmp16.val[1]); - //[> promote them to float 32 bits <] - //tmp32f.val[0] = vcvtq_f32_s32(tmp32i.val[0]); - //tmp32f.val[1] = vcvtq_f32_s32(tmp32i.val[1]); +//[> promote them to float 32 bits <] +//tmp32f.val[0] = vcvtq_f32_s32(tmp32i.val[0]); +//tmp32f.val[1] = vcvtq_f32_s32(tmp32i.val[1]); - //[> complex multiplication of four complex samples (float 32 bits each component) <] - //tmp32_real.val[0] = vmulq_f32(tmp32f.val[0], _phase_real); - //tmp32_real.val[1] = vmulq_f32(tmp32f.val[1], _phase_imag); - //tmp32_imag.val[0] = vmulq_f32(tmp32f.val[0], _phase_imag); - //tmp32_imag.val[1] = vmulq_f32(tmp32f.val[1], _phase_real); +//[> complex multiplication of four complex samples (float 32 bits each component) <] +//tmp32_real.val[0] = vmulq_f32(tmp32f.val[0], _phase_real); +//tmp32_real.val[1] = vmulq_f32(tmp32f.val[1], _phase_imag); +//tmp32_imag.val[0] = vmulq_f32(tmp32f.val[0], _phase_imag); +//tmp32_imag.val[1] = vmulq_f32(tmp32f.val[1], _phase_real); - //tmp32f.val[0] = vsubq_f32(tmp32_real.val[0], tmp32_real.val[1]); - //tmp32f.val[1] = vaddq_f32(tmp32_imag.val[0], tmp32_imag.val[1]); +//tmp32f.val[0] = vsubq_f32(tmp32_real.val[0], tmp32_real.val[1]); +//tmp32f.val[1] = vaddq_f32(tmp32_imag.val[0], tmp32_imag.val[1]); - //[> downcast results to int32 <] - //[> in __aarch64__ we can do that with 
vcvtaq_s32_f32(ret1); vcvtaq_s32_f32(ret2); <] - //sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(tmp32f.val[0]), 31))); - //PlusHalf = vaddq_f32(tmp32f.val[0], half); - //Round = vsubq_f32(PlusHalf, sign); - //tmp32i.val[0] = vcvtq_s32_f32(Round); +//[> downcast results to int32 <] +//[> in __aarch64__ we can do that with vcvtaq_s32_f32(ret1); vcvtaq_s32_f32(ret2); <] +//sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(tmp32f.val[0]), 31))); +//PlusHalf = vaddq_f32(tmp32f.val[0], half); +//Round = vsubq_f32(PlusHalf, sign); +//tmp32i.val[0] = vcvtq_s32_f32(Round); - //sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(tmp32f.val[1]), 31))); - //PlusHalf = vaddq_f32(tmp32f.val[1], half); - //Round = vsubq_f32(PlusHalf, sign); - //tmp32i.val[1] = vcvtq_s32_f32(Round); +//sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(tmp32f.val[1]), 31))); +//PlusHalf = vaddq_f32(tmp32f.val[1], half); +//Round = vsubq_f32(PlusHalf, sign); +//tmp32i.val[1] = vcvtq_s32_f32(Round); - //[> downcast results to int16 <] - //tmp16.val[0] = vqmovn_s32(tmp32i.val[0]); - //tmp16.val[1] = vqmovn_s32(tmp32i.val[1]); +//[> downcast results to int16 <] +//tmp16.val[0] = vqmovn_s32(tmp32i.val[0]); +//tmp16.val[1] = vqmovn_s32(tmp32i.val[1]); - //[> compute next four phases <] - //tmp32_real.val[0] = vmulq_f32(_phase_real, _phase4_real); - //tmp32_real.val[1] = vmulq_f32(_phase_imag, _phase4_imag); - //tmp32_imag.val[0] = vmulq_f32(_phase_real, _phase4_imag); - //tmp32_imag.val[1] = vmulq_f32(_phase_imag, _phase4_real); +//[> compute next four phases <] +//tmp32_real.val[0] = vmulq_f32(_phase_real, _phase4_real); +//tmp32_real.val[1] = vmulq_f32(_phase_imag, _phase4_imag); +//tmp32_imag.val[0] = vmulq_f32(_phase_real, _phase4_imag); +//tmp32_imag.val[1] = vmulq_f32(_phase_imag, _phase4_real); - //_phase_real = vsubq_f32(tmp32_real.val[0], tmp32_real.val[1]); - //_phase_imag = vaddq_f32(tmp32_imag.val[0], tmp32_imag.val[1]); +//_phase_real = vsubq_f32(tmp32_real.val[0], tmp32_real.val[1]); +//_phase_imag = vaddq_f32(tmp32_imag.val[0], tmp32_imag.val[1]); - //for (n_vec = 0; n_vec < num_a_vectors; n_vec++) - //{ - //a_val = vld2_s16((int16_t*)&(_in_a[n_vec][number*4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg - ////__VOLK_GNSSSDR_PREFETCH(&_in_a[n_vec][number*4] + 8); +//for (n_vec = 0; n_vec < num_a_vectors; n_vec++) +//{ +//a_val = vld2_s16((int16_t*)&(_in_a[n_vec][number*4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg +////__VOLK_GNSSSDR_PREFETCH(&_in_a[n_vec][number*4] + 8); - //// multiply the real*real and imag*imag to get real result - //// a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r - //b_val.val[0] = vmul_s16(a_val.val[0], tmp16.val[0]); - //// a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i - //b_val.val[1] = vmul_s16(a_val.val[1], tmp16.val[1]); - //c_val.val[0] = vqsub_s16(b_val.val[0], b_val.val[1]); +//// multiply the real*real and imag*imag to get real result +//// a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r +//b_val.val[0] = vmul_s16(a_val.val[0], tmp16.val[0]); +//// a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i +//b_val.val[1] = vmul_s16(a_val.val[1], tmp16.val[1]); +//c_val.val[0] = vqsub_s16(b_val.val[0], b_val.val[1]); - //// Multiply cross terms to get the imaginary result - //// a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i - //b_val.val[0] = vmul_s16(a_val.val[0], tmp16.val[1]); - //// a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r - //b_val.val[1] = vmul_s16(a_val.val[1], tmp16.val[0]); - //c_val.val[1] = vqadd_s16(b_val.val[0], b_val.val[1]); +//// Multiply cross terms to get the imaginary result +//// 
a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i +//b_val.val[0] = vmul_s16(a_val.val[0], tmp16.val[1]); +//// a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r +//b_val.val[1] = vmul_s16(a_val.val[1], tmp16.val[0]); +//c_val.val[1] = vqadd_s16(b_val.val[0], b_val.val[1]); - //accumulator[n_vec].val[0] = vqadd_s16(accumulator[n_vec].val[0], c_val.val[0]); - //accumulator[n_vec].val[1] = vqadd_s16(accumulator[n_vec].val[1], c_val.val[1]); - //} - //// Regenerate phase - //if ((number % 256) == 0) - //{ - //phase_est = arg_phase0 + (number + 1) * 4 * arg_phase_inc; +//accumulator[n_vec].val[0] = vqadd_s16(accumulator[n_vec].val[0], c_val.val[0]); +//accumulator[n_vec].val[1] = vqadd_s16(accumulator[n_vec].val[1], c_val.val[1]); +//} +//// Regenerate phase +//if ((number % 256) == 0) +//{ +//phase_est = arg_phase0 + (number + 1) * 4 * arg_phase_inc; - //*phase = lv_cmake(cos(phase_est), sin(phase_est)); - //phase2 = (lv_32fc_t)(*phase) * phase_inc; - //phase3 = phase2 * phase_inc; - //phase4 = phase3 * phase_inc; +//*phase = lv_cmake(cos(phase_est), sin(phase_est)); +//phase2 = (lv_32fc_t)(*phase) * phase_inc; +//phase3 = phase2 * phase_inc; +//phase4 = phase3 * phase_inc; - //__VOLK_ATTR_ALIGNED(16) float32_t ____phase_real[4] = { lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) }; - //__VOLK_ATTR_ALIGNED(16) float32_t ____phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; +//__VOLK_ATTR_ALIGNED(16) float32_t ____phase_real[4] = { lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) }; +//__VOLK_ATTR_ALIGNED(16) float32_t ____phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; - //_phase_real = vld1q_f32(____phase_real); - //_phase_imag = vld1q_f32(____phase_imag); - //} - //} +//_phase_real = vld1q_f32(____phase_real); +//_phase_imag = vld1q_f32(____phase_imag); +//} +//} - //for (n_vec = 0; n_vec < num_a_vectors; n_vec++) - //{ - //vst2_s16((int16_t*)dotProductVector, accumulator[n_vec]); // Store the results back into the dot product vector - //dotProduct = lv_cmake(0,0); - //for (i = 0; i < 4; ++i) - //{ - //dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[i])), - //sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i]))); - //} - //_out[n_vec] = dotProduct; - //} - //volk_gnsssdr_free(accumulator); - //vst1q_f32((float32_t*)__phase_real, _phase_real); - //vst1q_f32((float32_t*)__phase_imag, _phase_imag); +//for (n_vec = 0; n_vec < num_a_vectors; n_vec++) +//{ +//vst2_s16((int16_t*)dotProductVector, accumulator[n_vec]); // Store the results back into the dot product vector +//dotProduct = lv_cmake(0,0); +//for (i = 0; i < 4; ++i) +//{ +//dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[i])), +//sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i]))); +//} +//_out[n_vec] = dotProduct; +//} +//volk_gnsssdr_free(accumulator); +//vst1q_f32((float32_t*)__phase_real, _phase_real); +//vst1q_f32((float32_t*)__phase_imag, _phase_imag); - //(*phase) = lv_cmake((float32_t)__phase_real[0], (float32_t)__phase_imag[0]); - //} +//(*phase) = lv_cmake((float32_t)__phase_real[0], (float32_t)__phase_imag[0]); +//} - //for (n = neon_iters * 4; n < num_points; n++) - //{ - //tmp16_ = in_common[n]; //printf("neon phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); - //tmp32_ = lv_cmake((float32_t)lv_creal(tmp16_), (float32_t)lv_cimag(tmp16_)) * (*phase); - //tmp16_ = lv_cmake((int16_t)rintf(lv_creal(tmp32_)), 
(int16_t)rintf(lv_cimag(tmp32_))); - //(*phase) *= phase_inc; - //for (n_vec = 0; n_vec < num_a_vectors; n_vec++) - //{ - //tmp = tmp16_ * in_a[n_vec][n]; - //_out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); - //} - //} +//for (n = neon_iters * 4; n < num_points; n++) +//{ +//tmp16_ = in_common[n]; //printf("neon phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); +//tmp32_ = lv_cmake((float32_t)lv_creal(tmp16_), (float32_t)lv_cimag(tmp16_)) * (*phase); +//tmp16_ = lv_cmake((int16_t)rintf(lv_creal(tmp32_)), (int16_t)rintf(lv_cimag(tmp32_))); +//(*phase) *= phase_inc; +//for (n_vec = 0; n_vec < num_a_vectors; n_vec++) +//{ +//tmp = tmp16_ * in_a[n_vec][n]; +//_out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); +//} +//} //} //#endif [> LV_HAVE_NEON <] @@ -1229,186 +1237,186 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_u_avx2(lv_16sc //static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_neon_vma(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const int16_t** in_a, int num_a_vectors, unsigned int num_points) //{ - //const unsigned int neon_iters = num_points / 4; +//const unsigned int neon_iters = num_points / 4; - //const int16_t** _in_a = in_a; - //const lv_16sc_t* _in_common = in_common; - //lv_16sc_t* _out = result; - //int n_vec; - //int i; - //unsigned int number; - //unsigned int n; - //lv_16sc_t tmp16_, tmp; - //lv_32fc_t tmp32_; +//const int16_t** _in_a = in_a; +//const lv_16sc_t* _in_common = in_common; +//lv_16sc_t* _out = result; +//int n_vec; +//int i; +//unsigned int number; +//unsigned int n; +//lv_16sc_t tmp16_, tmp; +//lv_32fc_t tmp32_; - //if (neon_iters > 0) - //{ - //lv_16sc_t dotProduct = lv_cmake(0,0); - //float arg_phase0 = cargf(*phase); - //float arg_phase_inc = cargf(phase_inc); - //float phase_est; - ////printf("arg phase0: %f", arg_phase0); - //lv_32fc_t ___phase4 = phase_inc * phase_inc * phase_inc * phase_inc; - //__VOLK_ATTR_ALIGNED(16) float32_t __phase4_real[4] = { lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4) }; - //__VOLK_ATTR_ALIGNED(16) float32_t __phase4_imag[4] = { lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4) }; +//if (neon_iters > 0) +//{ +//lv_16sc_t dotProduct = lv_cmake(0,0); +//float arg_phase0 = cargf(*phase); +//float arg_phase_inc = cargf(phase_inc); +//float phase_est; +////printf("arg phase0: %f", arg_phase0); +//lv_32fc_t ___phase4 = phase_inc * phase_inc * phase_inc * phase_inc; +//__VOLK_ATTR_ALIGNED(16) float32_t __phase4_real[4] = { lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4) }; +//__VOLK_ATTR_ALIGNED(16) float32_t __phase4_imag[4] = { lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4) }; - //float32x4_t _phase4_real = vld1q_f32(__phase4_real); - //float32x4_t _phase4_imag = vld1q_f32(__phase4_imag); +//float32x4_t _phase4_real = vld1q_f32(__phase4_real); +//float32x4_t _phase4_imag = vld1q_f32(__phase4_imag); - //lv_32fc_t phase2 = (lv_32fc_t)(*phase) * phase_inc; - //lv_32fc_t phase3 = phase2 * phase_inc; - //lv_32fc_t phase4 = phase3 * phase_inc; +//lv_32fc_t phase2 = (lv_32fc_t)(*phase) * phase_inc; +//lv_32fc_t phase3 = phase2 * phase_inc; +//lv_32fc_t phase4 = phase3 * phase_inc; - //__VOLK_ATTR_ALIGNED(16) float32_t __phase_real[4] = { lv_creal((*phase)), 
lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) }; - //__VOLK_ATTR_ALIGNED(16) float32_t __phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; +//__VOLK_ATTR_ALIGNED(16) float32_t __phase_real[4] = { lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) }; +//__VOLK_ATTR_ALIGNED(16) float32_t __phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; - //float32x4_t _phase_real = vld1q_f32(__phase_real); - //float32x4_t _phase_imag = vld1q_f32(__phase_imag); +//float32x4_t _phase_real = vld1q_f32(__phase_real); +//float32x4_t _phase_imag = vld1q_f32(__phase_imag); - //int16x4x2_t a_val, b_val; - //__VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; - //float32x4_t half = vdupq_n_f32(0.5f); - //int16x4x2_t tmp16; - //int32x4x2_t tmp32i; +//int16x4x2_t a_val, b_val; +//__VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; +//float32x4_t half = vdupq_n_f32(0.5f); +//int16x4x2_t tmp16; +//int32x4x2_t tmp32i; - //float32x4x2_t tmp32f, tmp32_real, tmp32_imag; - //float32x4_t sign, PlusHalf, Round; +//float32x4x2_t tmp32f, tmp32_real, tmp32_imag; +//float32x4_t sign, PlusHalf, Round; - //int16x4x2_t* accumulator = (int16x4x2_t*)volk_gnsssdr_malloc(num_a_vectors * sizeof(int16x4x2_t), volk_gnsssdr_get_alignment()); +//int16x4x2_t* accumulator = (int16x4x2_t*)volk_gnsssdr_malloc(num_a_vectors * sizeof(int16x4x2_t), volk_gnsssdr_get_alignment()); - //for(n_vec = 0; n_vec < num_a_vectors; n_vec++) - //{ - //accumulator[n_vec].val[0] = vdup_n_s16(0); - //accumulator[n_vec].val[1] = vdup_n_s16(0); - //} +//for(n_vec = 0; n_vec < num_a_vectors; n_vec++) +//{ +//accumulator[n_vec].val[0] = vdup_n_s16(0); +//accumulator[n_vec].val[1] = vdup_n_s16(0); +//} - //for(number = 0; number < neon_iters; number++) - //{ - //[> load 4 complex numbers (int 16 bits each component) <] - //tmp16 = vld2_s16((int16_t*)_in_common); - //__VOLK_GNSSSDR_PREFETCH(_in_common + 8); - //_in_common += 4; +//for(number = 0; number < neon_iters; number++) +//{ +//[> load 4 complex numbers (int 16 bits each component) <] +//tmp16 = vld2_s16((int16_t*)_in_common); +//__VOLK_GNSSSDR_PREFETCH(_in_common + 8); +//_in_common += 4; - //[> promote them to int 32 bits <] - //tmp32i.val[0] = vmovl_s16(tmp16.val[0]); - //tmp32i.val[1] = vmovl_s16(tmp16.val[1]); +//[> promote them to int 32 bits <] +//tmp32i.val[0] = vmovl_s16(tmp16.val[0]); +//tmp32i.val[1] = vmovl_s16(tmp16.val[1]); - //[> promote them to float 32 bits <] - //tmp32f.val[0] = vcvtq_f32_s32(tmp32i.val[0]); - //tmp32f.val[1] = vcvtq_f32_s32(tmp32i.val[1]); +//[> promote them to float 32 bits <] +//tmp32f.val[0] = vcvtq_f32_s32(tmp32i.val[0]); +//tmp32f.val[1] = vcvtq_f32_s32(tmp32i.val[1]); - //[> complex multiplication of four complex samples (float 32 bits each component) <] - //tmp32_real.val[0] = vmulq_f32(tmp32f.val[0], _phase_real); - //tmp32_real.val[1] = vmulq_f32(tmp32f.val[1], _phase_imag); - //tmp32_imag.val[0] = vmulq_f32(tmp32f.val[0], _phase_imag); - //tmp32_imag.val[1] = vmulq_f32(tmp32f.val[1], _phase_real); +//[> complex multiplication of four complex samples (float 32 bits each component) <] +//tmp32_real.val[0] = vmulq_f32(tmp32f.val[0], _phase_real); +//tmp32_real.val[1] = vmulq_f32(tmp32f.val[1], _phase_imag); +//tmp32_imag.val[0] = vmulq_f32(tmp32f.val[0], _phase_imag); +//tmp32_imag.val[1] = vmulq_f32(tmp32f.val[1], _phase_real); - //tmp32f.val[0] = vsubq_f32(tmp32_real.val[0], tmp32_real.val[1]); - //tmp32f.val[1] = vaddq_f32(tmp32_imag.val[0], 
tmp32_imag.val[1]); +//tmp32f.val[0] = vsubq_f32(tmp32_real.val[0], tmp32_real.val[1]); +//tmp32f.val[1] = vaddq_f32(tmp32_imag.val[0], tmp32_imag.val[1]); - //[> downcast results to int32 <] - //[> in __aarch64__ we can do that with vcvtaq_s32_f32(ret1); vcvtaq_s32_f32(ret2); <] - //sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(tmp32f.val[0]), 31))); - //PlusHalf = vaddq_f32(tmp32f.val[0], half); - //Round = vsubq_f32(PlusHalf, sign); - //tmp32i.val[0] = vcvtq_s32_f32(Round); +//[> downcast results to int32 <] +//[> in __aarch64__ we can do that with vcvtaq_s32_f32(ret1); vcvtaq_s32_f32(ret2); <] +//sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(tmp32f.val[0]), 31))); +//PlusHalf = vaddq_f32(tmp32f.val[0], half); +//Round = vsubq_f32(PlusHalf, sign); +//tmp32i.val[0] = vcvtq_s32_f32(Round); - //sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(tmp32f.val[1]), 31))); - //PlusHalf = vaddq_f32(tmp32f.val[1], half); - //Round = vsubq_f32(PlusHalf, sign); - //tmp32i.val[1] = vcvtq_s32_f32(Round); +//sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(tmp32f.val[1]), 31))); +//PlusHalf = vaddq_f32(tmp32f.val[1], half); +//Round = vsubq_f32(PlusHalf, sign); +//tmp32i.val[1] = vcvtq_s32_f32(Round); - //[> downcast results to int16 <] - //tmp16.val[0] = vqmovn_s32(tmp32i.val[0]); - //tmp16.val[1] = vqmovn_s32(tmp32i.val[1]); +//[> downcast results to int16 <] +//tmp16.val[0] = vqmovn_s32(tmp32i.val[0]); +//tmp16.val[1] = vqmovn_s32(tmp32i.val[1]); - //[> compute next four phases <] - //tmp32_real.val[0] = vmulq_f32(_phase_real, _phase4_real); - //tmp32_real.val[1] = vmulq_f32(_phase_imag, _phase4_imag); - //tmp32_imag.val[0] = vmulq_f32(_phase_real, _phase4_imag); - //tmp32_imag.val[1] = vmulq_f32(_phase_imag, _phase4_real); +//[> compute next four phases <] +//tmp32_real.val[0] = vmulq_f32(_phase_real, _phase4_real); +//tmp32_real.val[1] = vmulq_f32(_phase_imag, _phase4_imag); +//tmp32_imag.val[0] = vmulq_f32(_phase_real, _phase4_imag); +//tmp32_imag.val[1] = vmulq_f32(_phase_imag, _phase4_real); - //_phase_real = vsubq_f32(tmp32_real.val[0], tmp32_real.val[1]); - //_phase_imag = vaddq_f32(tmp32_imag.val[0], tmp32_imag.val[1]); +//_phase_real = vsubq_f32(tmp32_real.val[0], tmp32_real.val[1]); +//_phase_imag = vaddq_f32(tmp32_imag.val[0], tmp32_imag.val[1]); - //// Regenerate phase - //if ((number % 256) == 0) - //{ - ////printf("computed phase: %f\n", cos(cargf(lv_cmake(_phase_real[0],_phase_imag[0])))); - //phase_est = arg_phase0 + (number + 1) * 4 * arg_phase_inc; - ////printf("Estimated phase: %f\n\n", cos(phase_est)); +//// Regenerate phase +//if ((number % 256) == 0) +//{ +////printf("computed phase: %f\n", cos(cargf(lv_cmake(_phase_real[0],_phase_imag[0])))); +//phase_est = arg_phase0 + (number + 1) * 4 * arg_phase_inc; +////printf("Estimated phase: %f\n\n", cos(phase_est)); - //*phase = lv_cmake(cos(phase_est), sin(phase_est)); - //phase2 = (lv_32fc_t)(*phase) * phase_inc; - //phase3 = phase2 * phase_inc; - //phase4 = phase3 * phase_inc; +//*phase = lv_cmake(cos(phase_est), sin(phase_est)); +//phase2 = (lv_32fc_t)(*phase) * phase_inc; +//phase3 = phase2 * phase_inc; +//phase4 = phase3 * phase_inc; - //__VOLK_ATTR_ALIGNED(16) float32_t ____phase_real[4] = { lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) }; - //__VOLK_ATTR_ALIGNED(16) float32_t ____phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; +//__VOLK_ATTR_ALIGNED(16) float32_t ____phase_real[4] = { lv_creal((*phase)), lv_creal(phase2), 
lv_creal(phase3), lv_creal(phase4) }; +//__VOLK_ATTR_ALIGNED(16) float32_t ____phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; - //_phase_real = vld1q_f32(____phase_real); - //_phase_imag = vld1q_f32(____phase_imag); +//_phase_real = vld1q_f32(____phase_real); +//_phase_imag = vld1q_f32(____phase_imag); - //// Round = vmulq_f32(_phase_real, _phase_real); - //// Round = vmlaq_f32(Round, _phase_imag, _phase_imag); - //// Round = vsqrtq_f32(Round);//printf("sqrt: %f \n", Round[0]); - ////Round = vrsqrteq_f32(Round);printf("1/sqtr: %f \n",Round[0]); - ////Round = vrecpeq_f32((Round); - //// _phase_real = vdivq_f32(_phase_real, Round); - //// _phase_imag = vdivq_f32(_phase_imag, Round); - ////_phase_real = vmulq_f32(_phase_real, Round); - ////_phase_imag = vmulq_f32(_phase_imag, Round); - ////printf("After %i: %f,%f, %f\n\n", number, _phase_real[0], _phase_imag[0], sqrt(_phase_real[0]*_phase_real[0]+_phase_imag[0]*_phase_imag[0])); +//// Round = vmulq_f32(_phase_real, _phase_real); +//// Round = vmlaq_f32(Round, _phase_imag, _phase_imag); +//// Round = vsqrtq_f32(Round);//printf("sqrt: %f \n", Round[0]); +////Round = vrsqrteq_f32(Round);printf("1/sqtr: %f \n",Round[0]); +////Round = vrecpeq_f32((Round); +//// _phase_real = vdivq_f32(_phase_real, Round); +//// _phase_imag = vdivq_f32(_phase_imag, Round); +////_phase_real = vmulq_f32(_phase_real, Round); +////_phase_imag = vmulq_f32(_phase_imag, Round); +////printf("After %i: %f,%f, %f\n\n", number, _phase_real[0], _phase_imag[0], sqrt(_phase_real[0]*_phase_real[0]+_phase_imag[0]*_phase_imag[0])); - //} +//} - //for (n_vec = 0; n_vec < num_a_vectors; n_vec++) - //{ - //a_val = vld2_s16((int16_t*)&(_in_a[n_vec][number*4])); +//for (n_vec = 0; n_vec < num_a_vectors; n_vec++) +//{ +//a_val = vld2_s16((int16_t*)&(_in_a[n_vec][number*4])); - //b_val.val[0] = vmul_s16(a_val.val[0], tmp16.val[0]); - //b_val.val[1] = vmul_s16(a_val.val[1], tmp16.val[0]); +//b_val.val[0] = vmul_s16(a_val.val[0], tmp16.val[0]); +//b_val.val[1] = vmul_s16(a_val.val[1], tmp16.val[0]); - //// use multiply accumulate/subtract to get result - //b_val.val[0] = vmls_s16(b_val.val[0], a_val.val[1], tmp16.val[1]); - //b_val.val[1] = vmla_s16(b_val.val[1], a_val.val[0], tmp16.val[1]); +//// use multiply accumulate/subtract to get result +//b_val.val[0] = vmls_s16(b_val.val[0], a_val.val[1], tmp16.val[1]); +//b_val.val[1] = vmla_s16(b_val.val[1], a_val.val[0], tmp16.val[1]); - //accumulator[n_vec].val[0] = vqadd_s16(accumulator[n_vec].val[0], b_val.val[0]); - //accumulator[n_vec].val[1] = vqadd_s16(accumulator[n_vec].val[1], b_val.val[1]); - //} - //} +//accumulator[n_vec].val[0] = vqadd_s16(accumulator[n_vec].val[0], b_val.val[0]); +//accumulator[n_vec].val[1] = vqadd_s16(accumulator[n_vec].val[1], b_val.val[1]); +//} +//} - //for (n_vec = 0; n_vec < num_a_vectors; n_vec++) - //{ - //vst2_s16((int16_t*)dotProductVector, accumulator[n_vec]); // Store the results back into the dot product vector - //dotProduct = lv_cmake(0,0); - //for (i = 0; i < 4; ++i) - //{ - //dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[i])), - //sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i]))); - //} - //_out[n_vec] = dotProduct; - //} - //volk_gnsssdr_free(accumulator); +//for (n_vec = 0; n_vec < num_a_vectors; n_vec++) +//{ +//vst2_s16((int16_t*)dotProductVector, accumulator[n_vec]); // Store the results back into the dot product vector +//dotProduct = lv_cmake(0,0); +//for (i = 0; i < 4; ++i) +//{ +//dotProduct 
= lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[i])), +//sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i]))); +//} +//_out[n_vec] = dotProduct; +//} +//volk_gnsssdr_free(accumulator); - //vst1q_f32((float32_t*)__phase_real, _phase_real); - //vst1q_f32((float32_t*)__phase_imag, _phase_imag); +//vst1q_f32((float32_t*)__phase_real, _phase_real); +//vst1q_f32((float32_t*)__phase_imag, _phase_imag); - //(*phase) = lv_cmake((float32_t)__phase_real[0], (float32_t)__phase_imag[0]); - //} +//(*phase) = lv_cmake((float32_t)__phase_real[0], (float32_t)__phase_imag[0]); +//} - //for (n = neon_iters * 4; n < num_points; n++) - //{ - //tmp16_ = in_common[n]; //printf("neon phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); - //tmp32_ = lv_cmake((float32_t)lv_creal(tmp16_), (float32_t)lv_cimag(tmp16_)) * (*phase); - //tmp16_ = lv_cmake((int16_t)rintf(lv_creal(tmp32_)), (int16_t)rintf(lv_cimag(tmp32_))); - //(*phase) *= phase_inc; - //for (n_vec = 0; n_vec < num_a_vectors; n_vec++) - //{ - //tmp = tmp16_ * in_a[n_vec][n]; - //_out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); - //} - //} +//for (n = neon_iters * 4; n < num_points; n++) +//{ +//tmp16_ = in_common[n]; //printf("neon phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); +//tmp32_ = lv_cmake((float32_t)lv_creal(tmp16_), (float32_t)lv_cimag(tmp16_)) * (*phase); +//tmp16_ = lv_cmake((int16_t)rintf(lv_creal(tmp32_)), (int16_t)rintf(lv_cimag(tmp32_))); +//(*phase) *= phase_inc; +//for (n_vec = 0; n_vec < num_a_vectors; n_vec++) +//{ +//tmp = tmp16_ * in_a[n_vec][n]; +//_out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); +//} +//} //} //#endif [> LV_HAVE_NEON <] @@ -1420,181 +1428,179 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_u_avx2(lv_16sc //static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_neon_optvma(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const int16_t** in_a, int num_a_vectors, unsigned int num_points) //{ - //const unsigned int neon_iters = num_points / 4; +//const unsigned int neon_iters = num_points / 4; - //const int16_t** _in_a = in_a; - //const lv_16sc_t* _in_common = in_common; - //lv_16sc_t* _out = result; - //int n_vec; - //int i; - //unsigned int number; - //unsigned int n; - //lv_16sc_t tmp16_, tmp; - //lv_32fc_t tmp32_; +//const int16_t** _in_a = in_a; +//const lv_16sc_t* _in_common = in_common; +//lv_16sc_t* _out = result; +//int n_vec; +//int i; +//unsigned int number; +//unsigned int n; +//lv_16sc_t tmp16_, tmp; +//lv_32fc_t tmp32_; - //if (neon_iters > 0) - //{ - //lv_16sc_t dotProduct = lv_cmake(0,0); - //float arg_phase0 = cargf(*phase); - //float arg_phase_inc = cargf(phase_inc); - //float phase_est; +//if (neon_iters > 0) +//{ +//lv_16sc_t dotProduct = lv_cmake(0,0); +//float arg_phase0 = cargf(*phase); +//float arg_phase_inc = cargf(phase_inc); +//float phase_est; - //lv_32fc_t ___phase4 = phase_inc * phase_inc * phase_inc * phase_inc; - //__VOLK_ATTR_ALIGNED(16) float32_t __phase4_real[4] = { lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4) }; - //__VOLK_ATTR_ALIGNED(16) float32_t __phase4_imag[4] = { lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4) }; +//lv_32fc_t ___phase4 = phase_inc * phase_inc * phase_inc * phase_inc; +//__VOLK_ATTR_ALIGNED(16) float32_t 
__phase4_real[4] = { lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4) }; +//__VOLK_ATTR_ALIGNED(16) float32_t __phase4_imag[4] = { lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4) }; - //float32x4_t _phase4_real = vld1q_f32(__phase4_real); - //float32x4_t _phase4_imag = vld1q_f32(__phase4_imag); +//float32x4_t _phase4_real = vld1q_f32(__phase4_real); +//float32x4_t _phase4_imag = vld1q_f32(__phase4_imag); - //lv_32fc_t phase2 = (lv_32fc_t)(*phase) * phase_inc; - //lv_32fc_t phase3 = phase2 * phase_inc; - //lv_32fc_t phase4 = phase3 * phase_inc; +//lv_32fc_t phase2 = (lv_32fc_t)(*phase) * phase_inc; +//lv_32fc_t phase3 = phase2 * phase_inc; +//lv_32fc_t phase4 = phase3 * phase_inc; - //__VOLK_ATTR_ALIGNED(16) float32_t __phase_real[4] = { lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) }; - //__VOLK_ATTR_ALIGNED(16) float32_t __phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; +//__VOLK_ATTR_ALIGNED(16) float32_t __phase_real[4] = { lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) }; +//__VOLK_ATTR_ALIGNED(16) float32_t __phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; - //float32x4_t _phase_real = vld1q_f32(__phase_real); - //float32x4_t _phase_imag = vld1q_f32(__phase_imag); +//float32x4_t _phase_real = vld1q_f32(__phase_real); +//float32x4_t _phase_imag = vld1q_f32(__phase_imag); - //int16x4x2_t a_val, b_val; - //__VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; - //float32x4_t half = vdupq_n_f32(0.5f); - //int32x4x2_t tmp32i; +//int16x4x2_t a_val, b_val; +//__VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; +//float32x4_t half = vdupq_n_f32(0.5f); +//int32x4x2_t tmp32i; - //float32x4x2_t tmp32f, tmp32_real, tmp32_imag; - //float32x4_t sign, PlusHalf, Round; +//float32x4x2_t tmp32f, tmp32_real, tmp32_imag; +//float32x4_t sign, PlusHalf, Round; - //int16x4x2_t* accumulator1 = (int16x4x2_t*)volk_gnsssdr_malloc(num_a_vectors * sizeof(int16x4x2_t), volk_gnsssdr_get_alignment()); - //int16x4x2_t* accumulator2 = (int16x4x2_t*)volk_gnsssdr_malloc(num_a_vectors * sizeof(int16x4x2_t), volk_gnsssdr_get_alignment()); +//int16x4x2_t* accumulator1 = (int16x4x2_t*)volk_gnsssdr_malloc(num_a_vectors * sizeof(int16x4x2_t), volk_gnsssdr_get_alignment()); +//int16x4x2_t* accumulator2 = (int16x4x2_t*)volk_gnsssdr_malloc(num_a_vectors * sizeof(int16x4x2_t), volk_gnsssdr_get_alignment()); - //for(n_vec = 0; n_vec < num_a_vectors; n_vec++) - //{ - //accumulator1[n_vec].val[0] = vdup_n_s16(0); - //accumulator1[n_vec].val[1] = vdup_n_s16(0); - //accumulator2[n_vec].val[0] = vdup_n_s16(0); - //accumulator2[n_vec].val[1] = vdup_n_s16(0); - //} +//for(n_vec = 0; n_vec < num_a_vectors; n_vec++) +//{ +//accumulator1[n_vec].val[0] = vdup_n_s16(0); +//accumulator1[n_vec].val[1] = vdup_n_s16(0); +//accumulator2[n_vec].val[0] = vdup_n_s16(0); +//accumulator2[n_vec].val[1] = vdup_n_s16(0); +//} - //for(number = 0; number < neon_iters; number++) - //{ - //[> load 4 complex numbers (int 16 bits each component) <] - //b_val = vld2_s16((int16_t*)_in_common); - //__VOLK_GNSSSDR_PREFETCH(_in_common + 8); - //_in_common += 4; +//for(number = 0; number < neon_iters; number++) +//{ +//[> load 4 complex numbers (int 16 bits each component) <] +//b_val = vld2_s16((int16_t*)_in_common); +//__VOLK_GNSSSDR_PREFETCH(_in_common + 8); +//_in_common += 4; - //[> promote them to int 32 bits <] - //tmp32i.val[0] = 
vmovl_s16(b_val.val[0]); - //tmp32i.val[1] = vmovl_s16(b_val.val[1]); +//[> promote them to int 32 bits <] +//tmp32i.val[0] = vmovl_s16(b_val.val[0]); +//tmp32i.val[1] = vmovl_s16(b_val.val[1]); - //[> promote them to float 32 bits <] - //tmp32f.val[0] = vcvtq_f32_s32(tmp32i.val[0]); - //tmp32f.val[1] = vcvtq_f32_s32(tmp32i.val[1]); +//[> promote them to float 32 bits <] +//tmp32f.val[0] = vcvtq_f32_s32(tmp32i.val[0]); +//tmp32f.val[1] = vcvtq_f32_s32(tmp32i.val[1]); - //[> complex multiplication of four complex samples (float 32 bits each component) <] - //tmp32_real.val[0] = vmulq_f32(tmp32f.val[0], _phase_real); - //tmp32_real.val[1] = vmulq_f32(tmp32f.val[1], _phase_imag); - //tmp32_imag.val[0] = vmulq_f32(tmp32f.val[0], _phase_imag); - //tmp32_imag.val[1] = vmulq_f32(tmp32f.val[1], _phase_real); +//[> complex multiplication of four complex samples (float 32 bits each component) <] +//tmp32_real.val[0] = vmulq_f32(tmp32f.val[0], _phase_real); +//tmp32_real.val[1] = vmulq_f32(tmp32f.val[1], _phase_imag); +//tmp32_imag.val[0] = vmulq_f32(tmp32f.val[0], _phase_imag); +//tmp32_imag.val[1] = vmulq_f32(tmp32f.val[1], _phase_real); - //tmp32f.val[0] = vsubq_f32(tmp32_real.val[0], tmp32_real.val[1]); - //tmp32f.val[1] = vaddq_f32(tmp32_imag.val[0], tmp32_imag.val[1]); +//tmp32f.val[0] = vsubq_f32(tmp32_real.val[0], tmp32_real.val[1]); +//tmp32f.val[1] = vaddq_f32(tmp32_imag.val[0], tmp32_imag.val[1]); - //[> downcast results to int32 <] - //[> in __aarch64__ we can do that with vcvtaq_s32_f32(ret1); vcvtaq_s32_f32(ret2); <] - //sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(tmp32f.val[0]), 31))); - //PlusHalf = vaddq_f32(tmp32f.val[0], half); - //Round = vsubq_f32(PlusHalf, sign); - //tmp32i.val[0] = vcvtq_s32_f32(Round); +//[> downcast results to int32 <] +//[> in __aarch64__ we can do that with vcvtaq_s32_f32(ret1); vcvtaq_s32_f32(ret2); <] +//sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(tmp32f.val[0]), 31))); +//PlusHalf = vaddq_f32(tmp32f.val[0], half); +//Round = vsubq_f32(PlusHalf, sign); +//tmp32i.val[0] = vcvtq_s32_f32(Round); - //sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(tmp32f.val[1]), 31))); - //PlusHalf = vaddq_f32(tmp32f.val[1], half); - //Round = vsubq_f32(PlusHalf, sign); - //tmp32i.val[1] = vcvtq_s32_f32(Round); +//sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(tmp32f.val[1]), 31))); +//PlusHalf = vaddq_f32(tmp32f.val[1], half); +//Round = vsubq_f32(PlusHalf, sign); +//tmp32i.val[1] = vcvtq_s32_f32(Round); - //[> downcast results to int16 <] - //b_val.val[0] = vqmovn_s32(tmp32i.val[0]); - //b_val.val[1] = vqmovn_s32(tmp32i.val[1]); +//[> downcast results to int16 <] +//b_val.val[0] = vqmovn_s32(tmp32i.val[0]); +//b_val.val[1] = vqmovn_s32(tmp32i.val[1]); - //[> compute next four phases <] - //tmp32_real.val[0] = vmulq_f32(_phase_real, _phase4_real); - //tmp32_real.val[1] = vmulq_f32(_phase_imag, _phase4_imag); - //tmp32_imag.val[0] = vmulq_f32(_phase_real, _phase4_imag); - //tmp32_imag.val[1] = vmulq_f32(_phase_imag, _phase4_real); +//[> compute next four phases <] +//tmp32_real.val[0] = vmulq_f32(_phase_real, _phase4_real); +//tmp32_real.val[1] = vmulq_f32(_phase_imag, _phase4_imag); +//tmp32_imag.val[0] = vmulq_f32(_phase_real, _phase4_imag); +//tmp32_imag.val[1] = vmulq_f32(_phase_imag, _phase4_real); - //_phase_real = vsubq_f32(tmp32_real.val[0], tmp32_real.val[1]); - //_phase_imag = vaddq_f32(tmp32_imag.val[0], tmp32_imag.val[1]); +//_phase_real = vsubq_f32(tmp32_real.val[0], tmp32_real.val[1]); +//_phase_imag = 
vaddq_f32(tmp32_imag.val[0], tmp32_imag.val[1]); - //// Regenerate phase - //if ((number % 256) == 0) - //{ - ////printf("computed phase: %f\n", cos(cargf(lv_cmake(_phase_real[0],_phase_imag[0])))); - //phase_est = arg_phase0 + (number + 1) * 4 * arg_phase_inc; - ////printf("Estimated phase: %f\n\n", cos(phase_est)); +//// Regenerate phase +//if ((number % 256) == 0) +//{ +////printf("computed phase: %f\n", cos(cargf(lv_cmake(_phase_real[0],_phase_imag[0])))); +//phase_est = arg_phase0 + (number + 1) * 4 * arg_phase_inc; +////printf("Estimated phase: %f\n\n", cos(phase_est)); - //*phase = lv_cmake(cos(phase_est), sin(phase_est)); - //phase2 = (lv_32fc_t)(*phase) * phase_inc; - //phase3 = phase2 * phase_inc; - //phase4 = phase3 * phase_inc; +//*phase = lv_cmake(cos(phase_est), sin(phase_est)); +//phase2 = (lv_32fc_t)(*phase) * phase_inc; +//phase3 = phase2 * phase_inc; +//phase4 = phase3 * phase_inc; - //__VOLK_ATTR_ALIGNED(16) float32_t ____phase_real[4] = { lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) }; - //__VOLK_ATTR_ALIGNED(16) float32_t ____phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; +//__VOLK_ATTR_ALIGNED(16) float32_t ____phase_real[4] = { lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) }; +//__VOLK_ATTR_ALIGNED(16) float32_t ____phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; - //_phase_real = vld1q_f32(____phase_real); - //_phase_imag = vld1q_f32(____phase_imag); - //} +//_phase_real = vld1q_f32(____phase_real); +//_phase_imag = vld1q_f32(____phase_imag); +//} - //for (n_vec = 0; n_vec < num_a_vectors; n_vec++) - //{ - //a_val = vld2_s16((int16_t*)&(_in_a[n_vec][number*4])); +//for (n_vec = 0; n_vec < num_a_vectors; n_vec++) +//{ +//a_val = vld2_s16((int16_t*)&(_in_a[n_vec][number*4])); - //// use 2 accumulators to remove inter-instruction data dependencies - //accumulator1[n_vec].val[0] = vmla_s16(accumulator1[n_vec].val[0], a_val.val[0], b_val.val[0]); - //accumulator1[n_vec].val[1] = vmla_s16(accumulator1[n_vec].val[1], a_val.val[0], b_val.val[1]); - //accumulator2[n_vec].val[0] = vmls_s16(accumulator2[n_vec].val[0], a_val.val[1], b_val.val[1]); - //accumulator2[n_vec].val[1] = vmla_s16(accumulator2[n_vec].val[1], a_val.val[1], b_val.val[0]); - //} - //} - //for (n_vec = 0; n_vec < num_a_vectors; n_vec++) - //{ - //accumulator1[n_vec].val[0] = vqadd_s16(accumulator1[n_vec].val[0], accumulator2[n_vec].val[0]); - //accumulator1[n_vec].val[1] = vqadd_s16(accumulator1[n_vec].val[1], accumulator2[n_vec].val[1]); - //} - //for (n_vec = 0; n_vec < num_a_vectors; n_vec++) - //{ - //vst2_s16((int16_t*)dotProductVector, accumulator1[n_vec]); // Store the results back into the dot product vector - //dotProduct = lv_cmake(0,0); - //for (i = 0; i < 4; ++i) - //{ - //dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[i])), - //sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i]))); - //} - //_out[n_vec] = dotProduct; - //} - //volk_gnsssdr_free(accumulator1); - //volk_gnsssdr_free(accumulator2); +//// use 2 accumulators to remove inter-instruction data dependencies +//accumulator1[n_vec].val[0] = vmla_s16(accumulator1[n_vec].val[0], a_val.val[0], b_val.val[0]); +//accumulator1[n_vec].val[1] = vmla_s16(accumulator1[n_vec].val[1], a_val.val[0], b_val.val[1]); +//accumulator2[n_vec].val[0] = vmls_s16(accumulator2[n_vec].val[0], a_val.val[1], b_val.val[1]); +//accumulator2[n_vec].val[1] = 
vmla_s16(accumulator2[n_vec].val[1], a_val.val[1], b_val.val[0]); +//} +//} +//for (n_vec = 0; n_vec < num_a_vectors; n_vec++) +//{ +//accumulator1[n_vec].val[0] = vqadd_s16(accumulator1[n_vec].val[0], accumulator2[n_vec].val[0]); +//accumulator1[n_vec].val[1] = vqadd_s16(accumulator1[n_vec].val[1], accumulator2[n_vec].val[1]); +//} +//for (n_vec = 0; n_vec < num_a_vectors; n_vec++) +//{ +//vst2_s16((int16_t*)dotProductVector, accumulator1[n_vec]); // Store the results back into the dot product vector +//dotProduct = lv_cmake(0,0); +//for (i = 0; i < 4; ++i) +//{ +//dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[i])), +//sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i]))); +//} +//_out[n_vec] = dotProduct; +//} +//volk_gnsssdr_free(accumulator1); +//volk_gnsssdr_free(accumulator2); - //vst1q_f32((float32_t*)__phase_real, _phase_real); - //vst1q_f32((float32_t*)__phase_imag, _phase_imag); +//vst1q_f32((float32_t*)__phase_real, _phase_real); +//vst1q_f32((float32_t*)__phase_imag, _phase_imag); - //(*phase) = lv_cmake((float32_t)__phase_real[0], (float32_t)__phase_imag[0]); - //} +//(*phase) = lv_cmake((float32_t)__phase_real[0], (float32_t)__phase_imag[0]); +//} - //for (n = neon_iters * 4; n < num_points; n++) - //{ - //tmp16_ = in_common[n]; //printf("neon phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); - //tmp32_ = lv_cmake((float32_t)lv_creal(tmp16_), (float32_t)lv_cimag(tmp16_)) * (*phase); - //tmp16_ = lv_cmake((int16_t)rintf(lv_creal(tmp32_)), (int16_t)rintf(lv_cimag(tmp32_))); - //(*phase) *= phase_inc; - //for (n_vec = 0; n_vec < num_a_vectors; n_vec++) - //{ - //tmp = tmp16_ * in_a[n_vec][n]; - //_out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); - //} - //} +//for (n = neon_iters * 4; n < num_points; n++) +//{ +//tmp16_ = in_common[n]; //printf("neon phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); +//tmp32_ = lv_cmake((float32_t)lv_creal(tmp16_), (float32_t)lv_cimag(tmp16_)) * (*phase); +//tmp16_ = lv_cmake((int16_t)rintf(lv_creal(tmp32_)), (int16_t)rintf(lv_cimag(tmp32_))); +//(*phase) *= phase_inc; +//for (n_vec = 0; n_vec < num_a_vectors; n_vec++) +//{ +//tmp = tmp16_ * in_a[n_vec][n]; +//_out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); +//} +//} //} //#endif [> LV_HAVE_NEON <] #endif /*INCLUDED_volk_gnsssdr_16ic_16i_dot_prod_16ic_xn_H*/ - - diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic.h index a666c0270..6880b8d11 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic.h @@ -41,7 +41,7 @@ #include #ifdef LV_HAVE_GENERIC -static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_generic(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points) +static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_generic(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points) { // phases must be normalized. 
Phase rotator expects a complex exponential input! float rem_carrier_phase_in_rad = 0.345; @@ -53,14 +53,14 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_generic(lv unsigned int n; int num_a_vectors = 3; int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points); } - volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_generic(result, local_code, phase_inc[0], phase,(const int16_t**) in_a, num_a_vectors, num_points); + volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_generic(result, local_code, phase_inc[0], phase, (const int16_t**)in_a, num_a_vectors, num_points); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { volk_gnsssdr_free(in_a[n]); } @@ -71,7 +71,7 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_generic(lv #ifdef LV_HAVE_GENERIC -static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_generic_reload(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points) +static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_generic_reload(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points) { // phases must be normalized. Phase rotator expects a complex exponential input! float rem_carrier_phase_in_rad = 0.345; @@ -83,14 +83,14 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_generic_re unsigned int n; int num_a_vectors = 3; int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points); } - volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_generic_reload(result, local_code, phase_inc[0], phase,(const int16_t**) in_a, num_a_vectors, num_points); + volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_generic_reload(result, local_code, phase_inc[0], phase, (const int16_t**)in_a, num_a_vectors, num_points); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { volk_gnsssdr_free(in_a[n]); } @@ -113,50 +113,50 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_a_sse3(lv_ unsigned int n; int num_a_vectors = 3; int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points); } - volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_sse3(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points); + volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_sse3(result, local_code, phase_inc[0], phase, (const int16_t**)in_a, num_a_vectors, num_points); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { volk_gnsssdr_free(in_a[n]); } volk_gnsssdr_free(in_a); } -#endif // SSE3 +#endif // SSE3 //#ifdef LV_HAVE_SSE3 //static inline void 
volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_a_sse3_reload(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points) //{ - //// phases must be normalized. Phase rotator expects a complex exponential input! - //float rem_carrier_phase_in_rad = 0.345; - //float phase_step_rad = 0.1; - //lv_32fc_t phase[1]; - //phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad)); - //lv_32fc_t phase_inc[1]; - //phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad)); - //unsigned int n; - //int num_a_vectors = 3; - //int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); - //for(n = 0; n < num_a_vectors; n++) - //{ - //in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); - //memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points); - //} +//// phases must be normalized. Phase rotator expects a complex exponential input! +//float rem_carrier_phase_in_rad = 0.345; +//float phase_step_rad = 0.1; +//lv_32fc_t phase[1]; +//phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad)); +//lv_32fc_t phase_inc[1]; +//phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad)); +//unsigned int n; +//int num_a_vectors = 3; +//int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); +//for(n = 0; n < num_a_vectors; n++) +//{ +//in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); +//memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points); +//} - //volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_sse3_reload(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points); +//volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_sse3_reload(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points); - //for(n = 0; n < num_a_vectors; n++) - //{ - //volk_gnsssdr_free(in_a[n]); - //} - //volk_gnsssdr_free(in_a); +//for(n = 0; n < num_a_vectors; n++) +//{ +//volk_gnsssdr_free(in_a[n]); +//} +//volk_gnsssdr_free(in_a); //} //#endif // SSE3 @@ -175,22 +175,22 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_u_sse3(lv_ unsigned int n; int num_a_vectors = 3; int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points); } - volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_u_sse3(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points); + volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_u_sse3(result, local_code, phase_inc[0], phase, (const int16_t**)in_a, num_a_vectors, num_points); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { volk_gnsssdr_free(in_a[n]); } volk_gnsssdr_free(in_a); } -#endif // SSE3 +#endif // SSE3 #ifdef LV_HAVE_AVX2 @@ -206,50 +206,50 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_a_avx2(lv_ unsigned int n; int num_a_vectors = 3; int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) 
{ in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points); } - volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_avx2(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points); + volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_avx2(result, local_code, phase_inc[0], phase, (const int16_t**)in_a, num_a_vectors, num_points); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { volk_gnsssdr_free(in_a[n]); } volk_gnsssdr_free(in_a); } -#endif // AVX2 +#endif // AVX2 //#ifdef LV_HAVE_AVX2 //static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_a_avx2_reload(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points) //{ - //// phases must be normalized. Phase rotator expects a complex exponential input! - //float rem_carrier_phase_in_rad = 0.345; - //float phase_step_rad = 0.1; - //lv_32fc_t phase[1]; - //phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad)); - //lv_32fc_t phase_inc[1]; - //phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad)); - //unsigned int n; - //int num_a_vectors = 3; - //int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); - //for(n = 0; n < num_a_vectors; n++) - //{ - //in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); - //memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points); - //} +//// phases must be normalized. Phase rotator expects a complex exponential input! +//float rem_carrier_phase_in_rad = 0.345; +//float phase_step_rad = 0.1; +//lv_32fc_t phase[1]; +//phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad)); +//lv_32fc_t phase_inc[1]; +//phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad)); +//unsigned int n; +//int num_a_vectors = 3; +//int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); +//for(n = 0; n < num_a_vectors; n++) +//{ +//in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); +//memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points); +//} - //volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_avx2_reload(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points); +//volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_avx2_reload(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points); - //for(n = 0; n < num_a_vectors; n++) - //{ - //volk_gnsssdr_free(in_a[n]); - //} - //volk_gnsssdr_free(in_a); +//for(n = 0; n < num_a_vectors; n++) +//{ +//volk_gnsssdr_free(in_a[n]); +//} +//volk_gnsssdr_free(in_a); //} //#endif // AVX2 @@ -268,50 +268,50 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_u_avx2(lv_ unsigned int n; int num_a_vectors = 3; int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points); } - volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_u_avx2(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, 
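
The reason every variant in this header repeats the same allocate/copy/free scaffolding is that the QA framework only knows one signature per kernel: each puppet adapts a multi-vector kernel by fanning the single test input out into num_a_vectors aligned copies. A skeleton of that pattern, with kernel_under_test as a placeholder name:

#include <stdint.h>
#include <string.h>
#include "volk_gnsssdr/volk_gnsssdr.h" /* volk_gnsssdr_malloc/free/get_alignment */

static void puppet_skeleton(const int16_t* in, unsigned int num_points)
{
    const int num_a_vectors = 3; /* fixed fan-out used by these puppets */
    int n;
    int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors,
        volk_gnsssdr_get_alignment());
    for (n = 0; n < num_a_vectors; n++)
        {
            in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points,
                volk_gnsssdr_get_alignment());
            memcpy(in_a[n], in, sizeof(int16_t) * num_points);
        }
    /* kernel_under_test(..., (const int16_t**)in_a, num_a_vectors, num_points); */
    for (n = 0; n < num_a_vectors; n++)
        {
            volk_gnsssdr_free(in_a[n]);
        }
    volk_gnsssdr_free(in_a);
}
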
num_a_vectors, num_points); + volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_u_avx2(result, local_code, phase_inc[0], phase, (const int16_t**)in_a, num_a_vectors, num_points); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { volk_gnsssdr_free(in_a[n]); } volk_gnsssdr_free(in_a); } -#endif // AVX2 +#endif // AVX2 //#ifdef LV_HAVE_AVX2 //static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_u_avx2_reload(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points) //{ - //// phases must be normalized. Phase rotator expects a complex exponential input! - //float rem_carrier_phase_in_rad = 0.345; - //float phase_step_rad = 0.1; - //lv_32fc_t phase[1]; - //phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad)); - //lv_32fc_t phase_inc[1]; - //phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad)); - //unsigned int n; - //int num_a_vectors = 3; - //int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); - //for(n = 0; n < num_a_vectors; n++) - //{ - //in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); - //memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points); - //} +//// phases must be normalized. Phase rotator expects a complex exponential input! +//float rem_carrier_phase_in_rad = 0.345; +//float phase_step_rad = 0.1; +//lv_32fc_t phase[1]; +//phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad)); +//lv_32fc_t phase_inc[1]; +//phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad)); +//unsigned int n; +//int num_a_vectors = 3; +//int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); +//for(n = 0; n < num_a_vectors; n++) +//{ +//in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); +//memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points); +//} - //volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_avx2_reload(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points); +//volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_avx2_reload(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points); - //for(n = 0; n < num_a_vectors; n++) - //{ - //volk_gnsssdr_free(in_a[n]); - //} - //volk_gnsssdr_free(in_a); +//for(n = 0; n < num_a_vectors; n++) +//{ +//volk_gnsssdr_free(in_a[n]); +//} +//volk_gnsssdr_free(in_a); //} //#endif // AVX2 @@ -320,29 +320,29 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_u_avx2(lv_ //#ifdef LV_HAVE_NEON //static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_neon(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points) //{ - //// phases must be normalized. Phase rotator expects a complex exponential input! 
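
Behind all of these variants sits one operation: rotate the common complex input by a running carrier phasor and correlate the rotated samples against num_a_vectors real-valued code replicas. A scalar floating-point reference of that operation (names are illustrative; the actual 16ic kernels additionally round and saturate to 16-bit fixed point):

#include <complex.h>

static void rotator_dot_prod_ref(float complex* result,
    const float complex* in_common,
    float complex phase_inc, float complex* phase,
    const short** in_a, int num_a_vectors,
    unsigned int num_points)
{
    int v;
    unsigned int n;
    for (v = 0; v < num_a_vectors; v++) result[v] = 0.0f;
    for (n = 0; n < num_points; n++)
        {
            const float complex rotated = in_common[n] * (*phase);
            for (v = 0; v < num_a_vectors; v++)
                {
                    result[v] += rotated * (float)in_a[v][n];
                }
            (*phase) *= phase_inc; /* advance the carrier phasor one step */
        }
}
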
- //float rem_carrier_phase_in_rad = 0.345; - //float phase_step_rad = 0.1; - //lv_32fc_t phase[1]; - //phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad)); - //lv_32fc_t phase_inc[1]; - //phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad)); - //unsigned int n; - //int num_a_vectors = 3; - //int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); - //for(n = 0; n < num_a_vectors; n++) - //{ - //in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); - //memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points); - //} +//// phases must be normalized. Phase rotator expects a complex exponential input! +//float rem_carrier_phase_in_rad = 0.345; +//float phase_step_rad = 0.1; +//lv_32fc_t phase[1]; +//phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad)); +//lv_32fc_t phase_inc[1]; +//phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad)); +//unsigned int n; +//int num_a_vectors = 3; +//int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); +//for(n = 0; n < num_a_vectors; n++) +//{ +//in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); +//memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points); +//} - //volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_neon(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points); +//volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_neon(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points); - //for(n = 0; n < num_a_vectors; n++) - //{ - //volk_gnsssdr_free(in_a[n]); - //} - //volk_gnsssdr_free(in_a); +//for(n = 0; n < num_a_vectors; n++) +//{ +//volk_gnsssdr_free(in_a[n]); +//} +//volk_gnsssdr_free(in_a); //} //#endif // NEON @@ -351,34 +351,31 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_u_avx2(lv_ //#ifdef LV_HAVE_NEON //static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_neon_vma(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points) //{ - //// phases must be normalized. Phase rotator expects a complex exponential input! - //float rem_carrier_phase_in_rad = 0.345; - //float phase_step_rad = 0.1; - //lv_32fc_t phase[1]; - //phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad)); - //lv_32fc_t phase_inc[1]; - //phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad)); - //unsigned int n; - //int num_a_vectors = 3; - //int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); - //for(n = 0; n < num_a_vectors; n++) - //{ - //in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); - //memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points); - //} +//// phases must be normalized. Phase rotator expects a complex exponential input! 
+//float rem_carrier_phase_in_rad = 0.345; +//float phase_step_rad = 0.1; +//lv_32fc_t phase[1]; +//phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad)); +//lv_32fc_t phase_inc[1]; +//phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad)); +//unsigned int n; +//int num_a_vectors = 3; +//int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); +//for(n = 0; n < num_a_vectors; n++) +//{ +//in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); +//memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points); +//} - //volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_neon_vma(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points); +//volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_neon_vma(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points); - //for(n = 0; n < num_a_vectors; n++) - //{ - //volk_gnsssdr_free(in_a[n]); - //} - //volk_gnsssdr_free(in_a); +//for(n = 0; n < num_a_vectors; n++) +//{ +//volk_gnsssdr_free(in_a[n]); +//} +//volk_gnsssdr_free(in_a); //} //#endif // NEON #endif // INCLUDED_volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_H - - - diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_conjugate_16ic.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_conjugate_16ic.h index 5aae17266..b294d5ca9 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_conjugate_16ic.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_conjugate_16ic.h @@ -68,7 +68,7 @@ static inline void volk_gnsssdr_16ic_conjugate_16ic_generic(lv_16sc_t* cVector, const lv_16sc_t* aPtr = aVector; unsigned int number; - for(number = 0; number < num_points; number++) + for (number = 0; number < num_points; number++) { *cPtr++ = lv_conj(*aPtr++); } @@ -231,4 +231,3 @@ static inline void volk_gnsssdr_16ic_conjugate_16ic_u_avx2(lv_16sc_t* cVector, c //#endif /* LV_HAVE_NEON */ #endif /* INCLUDED_volk_gnsssdr_16ic_conjugate_16ic_H */ - diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_convert_32fc.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_convert_32fc.h index fa9517b76..5d66452e0 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_convert_32fc.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_convert_32fc.h @@ -63,7 +63,7 @@ static inline void volk_gnsssdr_16ic_convert_32fc_generic(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, unsigned int num_points) { unsigned int i; - for(i = 0; i < num_points; i++) + for (i = 0; i < num_points; i++) { outputVector[i] = lv_cmake((float)lv_creal(inputVector[i]), (float)lv_cimag(inputVector[i])); } @@ -82,9 +82,9 @@ static inline void volk_gnsssdr_16ic_convert_32fc_a_sse2(lv_32fc_t* outputVector lv_32fc_t* _out = outputVector; __m128 a; - for(i = 0; i < sse_iters; i++) + for (i = 0; i < sse_iters; i++) { - a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // load (2 byte imag, 2 byte real) x 2 into 128 bits reg + a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), 
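
The conversion kernels below all share one loop structure: a SIMD body that consumes the input in groups of two or four complex samples, followed by a scalar pass over the num_points % 4 leftovers. A scalar reference of the widening itself, assuming lv_16sc_t is an interleaved int16 I/Q pair as elsewhere in the library:

#include <complex.h>
#include <stdint.h>

/* 16ic -> 32fc reference: the SIMD variants cover the first 4 * (num_points / 4)
   samples and fall back to exactly this form for the tail. */
static void convert_16ic_32fc_ref(float complex* out, const int16_t* in_iq,
    unsigned int num_points)
{
    unsigned int i;
    for (i = 0; i < num_points; i++)
        {
            out[i] = (float)in_iq[2 * i] + I * (float)in_iq[2 * i + 1];
        }
}
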
(float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // load (2 byte imag, 2 byte real) x 2 into 128 bits reg _mm_store_ps((float*)_out, a); _in += 2; _out += 2; @@ -109,9 +109,9 @@ static inline void volk_gnsssdr_16ic_convert_32fc_u_sse2(lv_32fc_t* outputVector lv_32fc_t* _out = outputVector; __m128 a; - for(i = 0; i < sse_iters; i++) + for (i = 0; i < sse_iters; i++) { - a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg _mm_storeu_ps((float*)_out, a); _in += 2; _out += 2; @@ -136,15 +136,15 @@ static inline void volk_gnsssdr_16ic_convert_32fc_u_axv(lv_32fc_t* outputVector, lv_32fc_t* _out = outputVector; __m256 a; - for(i = 0; i < sse_iters; i++) + for (i = 0; i < sse_iters; i++) { - a = _mm256_set_ps((float)(lv_cimag(_in[3])), (float)(lv_creal(_in[3])), (float)(lv_cimag(_in[2])), (float)(lv_creal(_in[2])), (float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + a = _mm256_set_ps((float)(lv_cimag(_in[3])), (float)(lv_creal(_in[3])), (float)(lv_cimag(_in[2])), (float)(lv_creal(_in[2])), (float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg _mm256_storeu_ps((float*)_out, a); _in += 4; _out += 4; } _mm256_zeroupper(); - for(i = 0; i < (num_points % 4); ++i) + for (i = 0; i < (num_points % 4); ++i) { *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in)); _in++; @@ -163,15 +163,15 @@ static inline void volk_gnsssdr_16ic_convert_32fc_a_axv(lv_32fc_t* outputVector, lv_32fc_t* _out = outputVector; __m256 a; - for(i = 0; i < sse_iters; i++) + for (i = 0; i < sse_iters; i++) { - a = _mm256_set_ps((float)(lv_cimag(_in[3])), (float)(lv_creal(_in[3])), (float)(lv_cimag(_in[2])), (float)(lv_creal(_in[2])), (float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + a = _mm256_set_ps((float)(lv_cimag(_in[3])), (float)(lv_creal(_in[3])), (float)(lv_cimag(_in[2])), (float)(lv_creal(_in[2])), (float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg _mm256_store_ps((float*)_out, a); _in += 4; _out += 4; } _mm256_zeroupper(); - for(i = 0; i < (num_points % 4); ++i) + for (i = 0; i < (num_points % 4); ++i) { *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in)); _in++; @@ -194,7 +194,7 @@ static inline void volk_gnsssdr_16ic_convert_32fc_neon(lv_32fc_t* outputVector, int32x4_t a32x4; float32x4_t f32x4; - for(i = 0; i < sse_iters; i++) + for (i = 0; i < sse_iters; i++) { a16x4 = vld1_s16((const int16_t*)_in); __VOLK_GNSSSDR_PREFETCH(_in + 4); diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_resampler_fast_16ic.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_resampler_fast_16ic.h index 8f35d59b8..cca2efa0d 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_resampler_fast_16ic.h 
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_resampler_fast_16ic.h @@ -78,7 +78,7 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_generic(lv_16sc_t* resu // resample code for current tap local_code_chip_index = round(code_phase_step_chips * (float)n + rem_code_phase_chips - 0.5f); if (local_code_chip_index < 0.0) local_code_chip_index += code_length_chips; - if (local_code_chip_index > (code_length_chips-1)) local_code_chip_index -= code_length_chips; + if (local_code_chip_index > (code_length_chips - 1)) local_code_chip_index -= code_length_chips; result[n] = local_code[local_code_chip_index]; } } @@ -89,61 +89,66 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_generic(lv_16sc_t* resu #ifdef LV_HAVE_SSE2 #include -static inline void volk_gnsssdr_16ic_resampler_fast_16ic_a_sse2(lv_16sc_t* result, const lv_16sc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, int code_length_chips, unsigned int num_output_samples)//, int* scratch_buffer, float* scratch_buffer_float) +static inline void volk_gnsssdr_16ic_resampler_fast_16ic_a_sse2(lv_16sc_t* result, const lv_16sc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, int code_length_chips, unsigned int num_output_samples) //, int* scratch_buffer, float* scratch_buffer_float) { - _MM_SET_ROUNDING_MODE (_MM_ROUND_NEAREST);//_MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO + _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); //_MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO unsigned int number; const unsigned int quarterPoints = num_output_samples / 4; lv_16sc_t* _result = result; - __VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; + __VOLK_ATTR_ALIGNED(16) + int local_code_chip_index[4]; __m128 _rem_code_phase, _code_phase_step_chips; __m128i _code_length_chips, _code_length_chips_minus1; __m128 _code_phase_out, _code_phase_out_with_offset; rem_code_phase_chips = rem_code_phase_chips - 0.5f; - _rem_code_phase = _mm_load1_ps(&rem_code_phase_chips); //load float to all four float values in m128 register - _code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips); //load float to all four float values in m128 register - __VOLK_ATTR_ALIGNED(16) int four_times_code_length_chips_minus1[4]; - four_times_code_length_chips_minus1[0] = code_length_chips-1; - four_times_code_length_chips_minus1[1] = code_length_chips-1; - four_times_code_length_chips_minus1[2] = code_length_chips-1; - four_times_code_length_chips_minus1[3] = code_length_chips-1; + _rem_code_phase = _mm_load1_ps(&rem_code_phase_chips); //load float to all four float values in m128 register + _code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips); //load float to all four float values in m128 register + __VOLK_ATTR_ALIGNED(16) + int four_times_code_length_chips_minus1[4]; + four_times_code_length_chips_minus1[0] = code_length_chips - 1; + four_times_code_length_chips_minus1[1] = code_length_chips - 1; + four_times_code_length_chips_minus1[2] = code_length_chips - 1; + four_times_code_length_chips_minus1[3] = code_length_chips - 1; - __VOLK_ATTR_ALIGNED(16) int four_times_code_length_chips[4]; + __VOLK_ATTR_ALIGNED(16) + int four_times_code_length_chips[4]; four_times_code_length_chips[0] = code_length_chips; four_times_code_length_chips[1] = code_length_chips; four_times_code_length_chips[2] = code_length_chips; four_times_code_length_chips[3] = code_length_chips; - _code_length_chips = 
_mm_load_si128((__m128i*)&four_times_code_length_chips); //load float to all four float values in m128 register - _code_length_chips_minus1 = _mm_load_si128((__m128i*)&four_times_code_length_chips_minus1); //load float to all four float values in m128 register + _code_length_chips = _mm_load_si128((__m128i*)&four_times_code_length_chips); //load float to all four float values in m128 register + _code_length_chips_minus1 = _mm_load_si128((__m128i*)&four_times_code_length_chips_minus1); //load float to all four float values in m128 register __m128i negative_indexes, overflow_indexes, _code_phase_out_int, _code_phase_out_int_neg, _code_phase_out_int_over; __m128i zero = _mm_setzero_si128(); - __VOLK_ATTR_ALIGNED(16) float init_idx_float[4] = { 0.0f, 1.0f, 2.0f, 3.0f }; + __VOLK_ATTR_ALIGNED(16) + float init_idx_float[4] = {0.0f, 1.0f, 2.0f, 3.0f}; __m128 _4output_index = _mm_load_ps(init_idx_float); - __VOLK_ATTR_ALIGNED(16) float init_4constant_float[4] = { 4.0f, 4.0f, 4.0f, 4.0f }; + __VOLK_ATTR_ALIGNED(16) + float init_4constant_float[4] = {4.0f, 4.0f, 4.0f, 4.0f}; __m128 _4constant_float = _mm_load_ps(init_4constant_float); - for(number = 0; number < quarterPoints; number++) + for (number = 0; number < quarterPoints; number++) { - _code_phase_out = _mm_mul_ps(_code_phase_step_chips, _4output_index); //compute the code phase point with the phase step - _code_phase_out_with_offset = _mm_add_ps(_code_phase_out, _rem_code_phase); //add the phase offset - _code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset); //convert to integer + _code_phase_out = _mm_mul_ps(_code_phase_step_chips, _4output_index); //compute the code phase point with the phase step + _code_phase_out_with_offset = _mm_add_ps(_code_phase_out, _rem_code_phase); //add the phase offset + _code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset); //convert to integer - negative_indexes = _mm_cmplt_epi32(_code_phase_out_int, zero); //test for negative values - _code_phase_out_int_neg = _mm_add_epi32(_code_phase_out_int, _code_length_chips); //the negative values branch - _code_phase_out_int_neg = _mm_xor_si128(_code_phase_out_int, _mm_and_si128( negative_indexes, _mm_xor_si128( _code_phase_out_int_neg, _code_phase_out_int ))); + negative_indexes = _mm_cmplt_epi32(_code_phase_out_int, zero); //test for negative values + _code_phase_out_int_neg = _mm_add_epi32(_code_phase_out_int, _code_length_chips); //the negative values branch + _code_phase_out_int_neg = _mm_xor_si128(_code_phase_out_int, _mm_and_si128(negative_indexes, _mm_xor_si128(_code_phase_out_int_neg, _code_phase_out_int))); - overflow_indexes = _mm_cmpgt_epi32(_code_phase_out_int_neg, _code_length_chips_minus1); //test for overflow values - _code_phase_out_int_over = _mm_sub_epi32(_code_phase_out_int_neg, _code_length_chips); //the negative values branch - _code_phase_out_int_over = _mm_xor_si128(_code_phase_out_int_neg, _mm_and_si128( overflow_indexes, _mm_xor_si128( _code_phase_out_int_over, _code_phase_out_int_neg ))); + overflow_indexes = _mm_cmpgt_epi32(_code_phase_out_int_neg, _code_length_chips_minus1); //test for overflow values + _code_phase_out_int_over = _mm_sub_epi32(_code_phase_out_int_neg, _code_length_chips); //the negative values branch + _code_phase_out_int_over = _mm_xor_si128(_code_phase_out_int_neg, _mm_and_si128(overflow_indexes, _mm_xor_si128(_code_phase_out_int_over, _code_phase_out_int_neg))); - _mm_store_si128((__m128i*)local_code_chip_index, _code_phase_out_int_over); // Store the results back + 
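
The negative/overflow handling in this loop avoids branches entirely: a packed compare produces an all-ones or all-zeros mask per lane, and the identity x ^ (mask & (x ^ y)) then selects y where the mask is set and keeps x elsewhere (SSE2 has no blend instruction, so this XOR/AND form stands in for one). The same trick in scalar form, wrapping an index into [0, L):

#include <stdint.h>

static int32_t wrap_index_branchless(int32_t idx, int32_t L)
{
    int32_t neg_mask = -(int32_t)(idx < 0);      /* all ones if idx < 0, else 0 */
    int32_t idx_neg = idx + L;                   /* candidate for negative lanes */
    idx = idx ^ (neg_mask & (idx_neg ^ idx));    /* select wrapped value where negative */

    int32_t over_mask = -(int32_t)(idx > L - 1); /* all ones if idx >= L, else 0 */
    int32_t idx_over = idx - L;                  /* candidate for overflowing lanes */
    return idx ^ (over_mask & (idx_over ^ idx));
}
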
_mm_store_si128((__m128i*)local_code_chip_index, _code_phase_out_int_over); // Store the results back //todo: optimize the local code lookup table with intrinsics, if possible *_result++ = local_code[local_code_chip_index[0]]; @@ -154,7 +159,7 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_a_sse2(lv_16sc_t* resul _4output_index = _mm_add_ps(_4output_index, _4constant_float); } - for(number = quarterPoints * 4; number < num_output_samples; number++) + for (number = quarterPoints * 4; number < num_output_samples; number++) { local_code_chip_index[0] = (int)(code_phase_step_chips * (float)number + rem_code_phase_chips + 0.5f); if (local_code_chip_index[0] < 0.0) local_code_chip_index[0] += code_length_chips - 1; @@ -169,61 +174,66 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_a_sse2(lv_16sc_t* resul #ifdef LV_HAVE_SSE2 #include -static inline void volk_gnsssdr_16ic_resampler_fast_16ic_u_sse2(lv_16sc_t* result, const lv_16sc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, int code_length_chips, unsigned int num_output_samples)//, int* scratch_buffer, float* scratch_buffer_float) +static inline void volk_gnsssdr_16ic_resampler_fast_16ic_u_sse2(lv_16sc_t* result, const lv_16sc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, int code_length_chips, unsigned int num_output_samples) //, int* scratch_buffer, float* scratch_buffer_float) { - _MM_SET_ROUNDING_MODE (_MM_ROUND_NEAREST);//_MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO + _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); //_MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO unsigned int number; const unsigned int quarterPoints = num_output_samples / 4; lv_16sc_t* _result = result; - __VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; + __VOLK_ATTR_ALIGNED(16) + int local_code_chip_index[4]; __m128 _rem_code_phase, _code_phase_step_chips; __m128i _code_length_chips, _code_length_chips_minus1; __m128 _code_phase_out, _code_phase_out_with_offset; rem_code_phase_chips = rem_code_phase_chips - 0.5f; - _rem_code_phase = _mm_load1_ps(&rem_code_phase_chips); //load float to all four float values in m128 register - _code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips); //load float to all four float values in m128 register - __VOLK_ATTR_ALIGNED(16) int four_times_code_length_chips_minus1[4]; - four_times_code_length_chips_minus1[0] = code_length_chips-1; - four_times_code_length_chips_minus1[1] = code_length_chips-1; - four_times_code_length_chips_minus1[2] = code_length_chips-1; - four_times_code_length_chips_minus1[3] = code_length_chips-1; + _rem_code_phase = _mm_load1_ps(&rem_code_phase_chips); //load float to all four float values in m128 register + _code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips); //load float to all four float values in m128 register + __VOLK_ATTR_ALIGNED(16) + int four_times_code_length_chips_minus1[4]; + four_times_code_length_chips_minus1[0] = code_length_chips - 1; + four_times_code_length_chips_minus1[1] = code_length_chips - 1; + four_times_code_length_chips_minus1[2] = code_length_chips - 1; + four_times_code_length_chips_minus1[3] = code_length_chips - 1; - __VOLK_ATTR_ALIGNED(16) int four_times_code_length_chips[4]; + __VOLK_ATTR_ALIGNED(16) + int four_times_code_length_chips[4]; four_times_code_length_chips[0] = code_length_chips; four_times_code_length_chips[1] = code_length_chips; four_times_code_length_chips[2] = code_length_chips; four_times_code_length_chips[3] = 
code_length_chips; - _code_length_chips = _mm_loadu_si128((__m128i*)&four_times_code_length_chips); //load float to all four float values in m128 register - _code_length_chips_minus1 = _mm_loadu_si128((__m128i*)&four_times_code_length_chips_minus1); //load float to all four float values in m128 register + _code_length_chips = _mm_loadu_si128((__m128i*)&four_times_code_length_chips); //load float to all four float values in m128 register + _code_length_chips_minus1 = _mm_loadu_si128((__m128i*)&four_times_code_length_chips_minus1); //load float to all four float values in m128 register __m128i negative_indexes, overflow_indexes, _code_phase_out_int, _code_phase_out_int_neg, _code_phase_out_int_over; __m128i zero = _mm_setzero_si128(); - __VOLK_ATTR_ALIGNED(16) float init_idx_float[4] = { 0.0f, 1.0f, 2.0f, 3.0f }; + __VOLK_ATTR_ALIGNED(16) + float init_idx_float[4] = {0.0f, 1.0f, 2.0f, 3.0f}; __m128 _4output_index = _mm_loadu_ps(init_idx_float); - __VOLK_ATTR_ALIGNED(16) float init_4constant_float[4] = { 4.0f, 4.0f, 4.0f, 4.0f }; + __VOLK_ATTR_ALIGNED(16) + float init_4constant_float[4] = {4.0f, 4.0f, 4.0f, 4.0f}; __m128 _4constant_float = _mm_loadu_ps(init_4constant_float); - for(number = 0; number < quarterPoints; number++) + for (number = 0; number < quarterPoints; number++) { - _code_phase_out = _mm_mul_ps(_code_phase_step_chips, _4output_index); //compute the code phase point with the phase step - _code_phase_out_with_offset = _mm_add_ps(_code_phase_out, _rem_code_phase); //add the phase offset - _code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset); //convert to integer + _code_phase_out = _mm_mul_ps(_code_phase_step_chips, _4output_index); //compute the code phase point with the phase step + _code_phase_out_with_offset = _mm_add_ps(_code_phase_out, _rem_code_phase); //add the phase offset + _code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset); //convert to integer - negative_indexes = _mm_cmplt_epi32(_code_phase_out_int, zero); //test for negative values - _code_phase_out_int_neg = _mm_add_epi32(_code_phase_out_int, _code_length_chips); //the negative values branch - _code_phase_out_int_neg = _mm_xor_si128(_code_phase_out_int, _mm_and_si128( negative_indexes, _mm_xor_si128( _code_phase_out_int_neg, _code_phase_out_int ))); + negative_indexes = _mm_cmplt_epi32(_code_phase_out_int, zero); //test for negative values + _code_phase_out_int_neg = _mm_add_epi32(_code_phase_out_int, _code_length_chips); //the negative values branch + _code_phase_out_int_neg = _mm_xor_si128(_code_phase_out_int, _mm_and_si128(negative_indexes, _mm_xor_si128(_code_phase_out_int_neg, _code_phase_out_int))); - overflow_indexes = _mm_cmpgt_epi32(_code_phase_out_int_neg, _code_length_chips_minus1); //test for overflow values - _code_phase_out_int_over = _mm_sub_epi32(_code_phase_out_int_neg, _code_length_chips); //the negative values branch - _code_phase_out_int_over = _mm_xor_si128(_code_phase_out_int_neg, _mm_and_si128( overflow_indexes, _mm_xor_si128( _code_phase_out_int_over, _code_phase_out_int_neg ))); + overflow_indexes = _mm_cmpgt_epi32(_code_phase_out_int_neg, _code_length_chips_minus1); //test for overflow values + _code_phase_out_int_over = _mm_sub_epi32(_code_phase_out_int_neg, _code_length_chips); //the negative values branch + _code_phase_out_int_over = _mm_xor_si128(_code_phase_out_int_neg, _mm_and_si128(overflow_indexes, _mm_xor_si128(_code_phase_out_int_over, _code_phase_out_int_neg))); - _mm_storeu_si128((__m128i*)local_code_chip_index, _code_phase_out_int_over); // Store 
the results back + _mm_storeu_si128((__m128i*)local_code_chip_index, _code_phase_out_int_over); // Store the results back //todo: optimize the local code lookup table with intrinsics, if possible *_result++ = local_code[local_code_chip_index[0]]; @@ -234,7 +244,7 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_u_sse2(lv_16sc_t* resul _4output_index = _mm_add_ps(_4output_index, _4constant_float); } - for(number = quarterPoints * 4; number < num_output_samples; number++) + for (number = quarterPoints * 4; number < num_output_samples; number++) { local_code_chip_index[0] = (int)(code_phase_step_chips * (float)number + rem_code_phase_chips + 0.5f); if (local_code_chip_index[0] < 0.0) local_code_chip_index[0] += code_length_chips - 1; @@ -249,7 +259,7 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_u_sse2(lv_16sc_t* resul #ifdef LV_HAVE_NEON #include -static inline void volk_gnsssdr_16ic_resampler_fast_16ic_neon(lv_16sc_t* result, const lv_16sc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, int code_length_chips, unsigned int num_output_samples)//, int* scratch_buffer, float* scratch_buffer_float) +static inline void volk_gnsssdr_16ic_resampler_fast_16ic_neon(lv_16sc_t* result, const lv_16sc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, int code_length_chips, unsigned int num_output_samples) //, int* scratch_buffer, float* scratch_buffer_float) { unsigned int number; const unsigned int quarterPoints = num_output_samples / 4; @@ -257,57 +267,62 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_neon(lv_16sc_t* result, lv_16sc_t* _result = result; - __VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; + __VOLK_ATTR_ALIGNED(16) + int local_code_chip_index[4]; float32x4_t _rem_code_phase, _code_phase_step_chips; int32x4_t _code_length_chips, _code_length_chips_minus1; float32x4_t _code_phase_out, _code_phase_out_with_offset; rem_code_phase_chips = rem_code_phase_chips - 0.5f; float32x4_t sign, PlusHalf, Round; - _rem_code_phase = vld1q_dup_f32(&rem_code_phase_chips); //load float to all four float values in m128 register - _code_phase_step_chips = vld1q_dup_f32(&code_phase_step_chips); //load float to all four float values in m128 register - __VOLK_ATTR_ALIGNED(16) int four_times_code_length_chips_minus1[4]; + _rem_code_phase = vld1q_dup_f32(&rem_code_phase_chips); //load float to all four float values in m128 register + _code_phase_step_chips = vld1q_dup_f32(&code_phase_step_chips); //load float to all four float values in m128 register + __VOLK_ATTR_ALIGNED(16) + int four_times_code_length_chips_minus1[4]; four_times_code_length_chips_minus1[0] = code_length_chips - 1; four_times_code_length_chips_minus1[1] = code_length_chips - 1; four_times_code_length_chips_minus1[2] = code_length_chips - 1; four_times_code_length_chips_minus1[3] = code_length_chips - 1; - __VOLK_ATTR_ALIGNED(16) int four_times_code_length_chips[4]; + __VOLK_ATTR_ALIGNED(16) + int four_times_code_length_chips[4]; four_times_code_length_chips[0] = code_length_chips; four_times_code_length_chips[1] = code_length_chips; four_times_code_length_chips[2] = code_length_chips; four_times_code_length_chips[3] = code_length_chips; - _code_length_chips = vld1q_s32((int32_t*)&four_times_code_length_chips); //load float to all four float values in m128 register - _code_length_chips_minus1 = vld1q_s32((int32_t*)&four_times_code_length_chips_minus1); //load float to all four float values in m128 register + _code_length_chips = 
vld1q_s32((int32_t*)&four_times_code_length_chips); //load float to all four float values in m128 register + _code_length_chips_minus1 = vld1q_s32((int32_t*)&four_times_code_length_chips_minus1); //load float to all four float values in m128 register - int32x4_t _code_phase_out_int, _code_phase_out_int_neg, _code_phase_out_int_over; + int32x4_t _code_phase_out_int, _code_phase_out_int_neg, _code_phase_out_int_over; uint32x4_t negative_indexes, overflow_indexes; int32x4_t zero = vmovq_n_s32(0); - __VOLK_ATTR_ALIGNED(16) float init_idx_float[4] = { 0.0f, 1.0f, 2.0f, 3.0f }; + __VOLK_ATTR_ALIGNED(16) + float init_idx_float[4] = {0.0f, 1.0f, 2.0f, 3.0f}; float32x4_t _4output_index = vld1q_f32(init_idx_float); - __VOLK_ATTR_ALIGNED(16) float init_4constant_float[4] = { 4.0f, 4.0f, 4.0f, 4.0f }; + __VOLK_ATTR_ALIGNED(16) + float init_4constant_float[4] = {4.0f, 4.0f, 4.0f, 4.0f}; float32x4_t _4constant_float = vld1q_f32(init_4constant_float); - for(number = 0; number < quarterPoints; number++) + for (number = 0; number < quarterPoints; number++) { - _code_phase_out = vmulq_f32(_code_phase_step_chips, _4output_index); //compute the code phase point with the phase step - _code_phase_out_with_offset = vaddq_f32(_code_phase_out, _rem_code_phase); //add the phase offset + _code_phase_out = vmulq_f32(_code_phase_step_chips, _4output_index); //compute the code phase point with the phase step + _code_phase_out_with_offset = vaddq_f32(_code_phase_out, _rem_code_phase); //add the phase offset sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(_code_phase_out_with_offset), 31))); PlusHalf = vaddq_f32(_code_phase_out_with_offset, half); Round = vsubq_f32(PlusHalf, sign); _code_phase_out_int = vcvtq_s32_f32(Round); - negative_indexes = vcltq_s32(_code_phase_out_int, zero); //test for negative values - _code_phase_out_int_neg = vaddq_s32(_code_phase_out_int, _code_length_chips); //the negative values branch - _code_phase_out_int_neg = veorq_s32(_code_phase_out_int, vandq_s32( (int32x4_t)negative_indexes, veorq_s32( _code_phase_out_int_neg, _code_phase_out_int ))); + negative_indexes = vcltq_s32(_code_phase_out_int, zero); //test for negative values + _code_phase_out_int_neg = vaddq_s32(_code_phase_out_int, _code_length_chips); //the negative values branch + _code_phase_out_int_neg = veorq_s32(_code_phase_out_int, vandq_s32((int32x4_t)negative_indexes, veorq_s32(_code_phase_out_int_neg, _code_phase_out_int))); - overflow_indexes = vcgtq_s32(_code_phase_out_int_neg, _code_length_chips_minus1); //test for overflow values - _code_phase_out_int_over = vsubq_s32(_code_phase_out_int_neg, _code_length_chips); //the negative values branch - _code_phase_out_int_over = veorq_s32(_code_phase_out_int_neg, vandq_s32( (int32x4_t)overflow_indexes, veorq_s32( _code_phase_out_int_over, _code_phase_out_int_neg ))); + overflow_indexes = vcgtq_s32(_code_phase_out_int_neg, _code_length_chips_minus1); //test for overflow values + _code_phase_out_int_over = vsubq_s32(_code_phase_out_int_neg, _code_length_chips); //the negative values branch + _code_phase_out_int_over = veorq_s32(_code_phase_out_int_neg, vandq_s32((int32x4_t)overflow_indexes, veorq_s32(_code_phase_out_int_over, _code_phase_out_int_neg))); - vst1q_s32((int32_t*)local_code_chip_index, _code_phase_out_int_over); // Store the results back + vst1q_s32((int32_t*)local_code_chip_index, _code_phase_out_int_over); // Store the results back //todo: optimize the local code lookup table with intrinsics, if possible *_result++ = local_code[local_code_chip_index[0]]; @@ 
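
The sign/PlusHalf/Round sequence in the NEON loop exists because vcvtq_s32_f32 truncates toward zero rather than rounding to nearest; the code extracts the sign bit with vshrq_n_u32(..., 31) and uses it to turn truncation into round-half-away-from-zero. A scalar equivalent of that three-step dance:

/* Add 0.5, then subtract 1.0 where the sign bit is set, so that plain
   truncation rounds negative values symmetrically to positive ones. */
static int round_like_neon_path(float x)
{
    float sign = (x < 0.0f) ? 1.0f : 0.0f; /* the vshrq_n_u32(..., 31) step */
    return (int)(x + 0.5f - sign);         /* truncation now rounds to nearest */
}
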
-318,7 +333,7 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_neon(lv_16sc_t* result, _4output_index = vaddq_f32(_4output_index, _4constant_float); } - for(number = quarterPoints * 4; number < num_output_samples; number++) + for (number = quarterPoints * 4; number < num_output_samples; number++) { local_code_chip_index[0] = (int)(code_phase_step_chips * (float)number + rem_code_phase_chips + 0.5f); if (local_code_chip_index[0] < 0.0) local_code_chip_index[0] += code_length_chips - 1; diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_resamplerfastpuppet_16ic.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_resamplerfastpuppet_16ic.h index 0b67ce73c..038e70108 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_resamplerfastpuppet_16ic.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_resamplerfastpuppet_16ic.h @@ -44,7 +44,7 @@ static inline void volk_gnsssdr_16ic_resamplerfastpuppet_16ic_generic(lv_16sc_t* float rem_code_phase_chips = -0.123; float code_phase_step_chips = 0.1; int code_length_chips = 1023; - volk_gnsssdr_16ic_resampler_fast_16ic_generic(result, local_code, rem_code_phase_chips, code_phase_step_chips, code_length_chips, num_points); + volk_gnsssdr_16ic_resampler_fast_16ic_generic(result, local_code, rem_code_phase_chips, code_phase_step_chips, code_length_chips, num_points); } #endif /* LV_HAVE_GENERIC */ @@ -55,7 +55,7 @@ static inline void volk_gnsssdr_16ic_resamplerfastpuppet_16ic_a_sse2(lv_16sc_t* float rem_code_phase_chips = -0.123; float code_phase_step_chips = 0.1; int code_length_chips = 1023; - volk_gnsssdr_16ic_resampler_fast_16ic_a_sse2(result, local_code, rem_code_phase_chips, code_phase_step_chips, code_length_chips, num_points ); + volk_gnsssdr_16ic_resampler_fast_16ic_a_sse2(result, local_code, rem_code_phase_chips, code_phase_step_chips, code_length_chips, num_points); } #endif /* LV_HAVE_SSE2 */ @@ -67,7 +67,7 @@ static inline void volk_gnsssdr_16ic_resamplerfastpuppet_16ic_u_sse2(lv_16sc_t* float rem_code_phase_chips = -0.123; float code_phase_step_chips = 0.1; int code_length_chips = 1023; - volk_gnsssdr_16ic_resampler_fast_16ic_u_sse2(result, local_code, rem_code_phase_chips, code_phase_step_chips, code_length_chips, num_points ); + volk_gnsssdr_16ic_resampler_fast_16ic_u_sse2(result, local_code, rem_code_phase_chips, code_phase_step_chips, code_length_chips, num_points); } #endif /* LV_HAVE_SSE2 */ @@ -79,9 +79,9 @@ static inline void volk_gnsssdr_16ic_resamplerfastpuppet_16ic_neon(lv_16sc_t* re float rem_code_phase_chips = -0.123; float code_phase_step_chips = 0.1; int code_length_chips = 1023; - volk_gnsssdr_16ic_resampler_fast_16ic_neon(result, local_code, rem_code_phase_chips, code_phase_step_chips, code_length_chips, num_points ); + volk_gnsssdr_16ic_resampler_fast_16ic_neon(result, local_code, rem_code_phase_chips, code_phase_step_chips, code_length_chips, num_points); } #endif /* LV_HAVE_NEON */ -#endif // INCLUDED_volk_gnsssdr_16ic_resamplerfastpuppet_16ic_H +#endif // INCLUDED_volk_gnsssdr_16ic_resamplerfastpuppet_16ic_H diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_resamplerfastxnpuppet_16ic.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_resamplerfastxnpuppet_16ic.h index bc4c2faa8..934af8e88 100644 --- 
a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_resamplerfastxnpuppet_16ic.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_resamplerfastxnpuppet_16ic.h @@ -49,21 +49,21 @@ static inline void volk_gnsssdr_16ic_resamplerfastxnpuppet_16ic_generic(lv_16sc_ int num_out_vectors = 3; unsigned int n; float* rem_code_phase_chips = (float*)volk_gnsssdr_malloc(sizeof(float) * num_out_vectors, volk_gnsssdr_get_alignment()); - lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); + lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_out_vectors; n++) - { - rem_code_phase_chips[n] = -0.234; - result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); - } + for (n = 0; n < num_out_vectors; n++) + { + rem_code_phase_chips[n] = -0.234; + result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); + } volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_generic(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, code_length_chips, num_out_vectors, num_points); memcpy((lv_16sc_t*)result, (lv_16sc_t*)result_aux[0], sizeof(lv_16sc_t) * num_points); volk_gnsssdr_free(rem_code_phase_chips); - for(n = 0; n < num_out_vectors; n++) - { - volk_gnsssdr_free(result_aux[n]); - } + for (n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } volk_gnsssdr_free(result_aux); } @@ -77,22 +77,22 @@ static inline void volk_gnsssdr_16ic_resamplerfastxnpuppet_16ic_a_sse2(lv_16sc_t int code_length_chips = 2046; int num_out_vectors = 3; unsigned int n; - float * rem_code_phase_chips = (float*)volk_gnsssdr_malloc(sizeof(float) * num_out_vectors, volk_gnsssdr_get_alignment()); - lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); + float* rem_code_phase_chips = (float*)volk_gnsssdr_malloc(sizeof(float) * num_out_vectors, volk_gnsssdr_get_alignment()); + lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_out_vectors; n++) - { - rem_code_phase_chips[n] = -0.234; - result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); - } + for (n = 0; n < num_out_vectors; n++) + { + rem_code_phase_chips[n] = -0.234; + result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); + } volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_a_sse2(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, code_length_chips, num_out_vectors, num_points); memcpy(result, result_aux[0], sizeof(lv_16sc_t) * num_points); volk_gnsssdr_free(rem_code_phase_chips); - for(n = 0; n < num_out_vectors; n++) - { - volk_gnsssdr_free(result_aux[n]); - } + for (n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } volk_gnsssdr_free(result_aux); } @@ -106,22 +106,22 @@ static inline void volk_gnsssdr_16ic_resamplerfastxnpuppet_16ic_u_sse2(lv_16sc_t int code_length_chips = 2046; int num_out_vectors = 3; unsigned int n; - float * rem_code_phase_chips = (float*)volk_gnsssdr_malloc(sizeof(float) * num_out_vectors, volk_gnsssdr_get_alignment()); - lv_16sc_t** result_aux = 
(lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); + float* rem_code_phase_chips = (float*)volk_gnsssdr_malloc(sizeof(float) * num_out_vectors, volk_gnsssdr_get_alignment()); + lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_out_vectors; n++) - { - rem_code_phase_chips[n] = -0.234; - result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); - } + for (n = 0; n < num_out_vectors; n++) + { + rem_code_phase_chips[n] = -0.234; + result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); + } volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_u_sse2(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, code_length_chips, num_out_vectors, num_points); memcpy(result, result_aux[0], sizeof(lv_16sc_t) * num_points); volk_gnsssdr_free(rem_code_phase_chips); - for(n = 0; n < num_out_vectors; n++) - { - volk_gnsssdr_free(result_aux[n]); - } + for (n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } volk_gnsssdr_free(result_aux); } @@ -135,26 +135,26 @@ static inline void volk_gnsssdr_16ic_resamplerfastxnpuppet_16ic_neon(lv_16sc_t* int code_length_chips = 2046; int num_out_vectors = 3; unsigned int n; - float * rem_code_phase_chips = (float*)volk_gnsssdr_malloc(sizeof(float) * num_out_vectors, volk_gnsssdr_get_alignment()); - lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); + float* rem_code_phase_chips = (float*)volk_gnsssdr_malloc(sizeof(float) * num_out_vectors, volk_gnsssdr_get_alignment()); + lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_out_vectors; n++) - { - rem_code_phase_chips[n] = -0.234; - result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); - } + for (n = 0; n < num_out_vectors; n++) + { + rem_code_phase_chips[n] = -0.234; + result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); + } volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_neon(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, code_length_chips, num_out_vectors, num_points); memcpy(result, result_aux[0], sizeof(lv_16sc_t) * num_points); volk_gnsssdr_free(rem_code_phase_chips); - for(n = 0; n < num_out_vectors; n++) - { - volk_gnsssdr_free(result_aux[n]); - } + for (n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } volk_gnsssdr_free(result_aux); } #endif -#endif // INCLUDED_volk_gnsssdr_16ic_resamplerpuppet_16ic_H +#endif // INCLUDED_volk_gnsssdr_16ic_resamplerpuppet_16ic_H diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_resamplerxnpuppet_16ic.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_resamplerxnpuppet_16ic.h index 85e6fcb08..4582d6961 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_resamplerxnpuppet_16ic.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_resamplerxnpuppet_16ic.h @@ -45,56 +45,56 @@ static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_generic(lv_16sc_t* result, const lv_16sc_t* 
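
All the puppets in this file exercise the same multi-replica resampler: code_phase_step_chips is chosen as (code_length_chips + 0.1) / num_points so one call sweeps just over a full code period, and shifts_chips = {-0.1, 0.0, 0.1} yields the early, prompt and late replicas a GNSS correlator needs. A scalar sketch of the per-replica lookup (names and the floor-based index here are illustrative; the kernels wrap branchlessly in their SIMD paths):

#include <math.h>

static void resample_code_ref(short* result, const short* local_code,
    float rem_code_phase_chips, float code_phase_step_chips,
    float shift_chips, int code_length_chips,
    unsigned int num_points)
{
    unsigned int n;
    for (n = 0; n < num_points; n++)
        {
            int idx = (int)floorf(code_phase_step_chips * (float)n +
                                  rem_code_phase_chips + shift_chips);
            idx %= code_length_chips;
            if (idx < 0) idx += code_length_chips; /* C's % can yield negatives */
            result[n] = local_code[idx];
        }
}
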
local_code, unsigned int num_points) { int code_length_chips = 2046; - float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points); int num_out_vectors = 3; unsigned int n; float rem_code_phase_chips = -0.234; - float shifts_chips[3] = { -0.1, 0.0, 0.1 }; - lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); + float shifts_chips[3] = {-0.1, 0.0, 0.1}; + lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_out_vectors; n++) - { - result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); - } + for (n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); + } volk_gnsssdr_16ic_xn_resampler_16ic_xn_generic(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); memcpy((lv_16sc_t*)result, (lv_16sc_t*)result_aux[0], sizeof(lv_16sc_t) * num_points); - for(n = 0; n < num_out_vectors; n++) - { - volk_gnsssdr_free(result_aux[n]); - } + for (n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } volk_gnsssdr_free(result_aux); } #endif /* LV_HAVE_GENERIC */ - + #ifdef LV_HAVE_SSE3 static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_a_sse3(lv_16sc_t* result, const lv_16sc_t* local_code, unsigned int num_points) { int code_length_chips = 2046; - float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; - float shifts_chips[3] = { -0.1, 0.0, 0.1 }; - lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); + float shifts_chips[3] = {-0.1, 0.0, 0.1}; + lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_out_vectors; n++) - { - result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); - } + for (n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); + } volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_sse3(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); memcpy((lv_16sc_t*)result, (lv_16sc_t*)result_aux[0], sizeof(lv_16sc_t) * num_points); - for(n = 0; n < num_out_vectors; n++) - { - volk_gnsssdr_free(result_aux[n]); - } + for (n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } volk_gnsssdr_free(result_aux); } @@ -104,26 +104,26 @@ static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_a_sse3(lv_16sc_t* re static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_u_sse3(lv_16sc_t* result, const lv_16sc_t* local_code, unsigned int num_points) { int code_length_chips = 2046; - float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points); int 
num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; - float shifts_chips[3] = { -0.1, 0.0, 0.1 }; - lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); + float shifts_chips[3] = {-0.1, 0.0, 0.1}; + lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_out_vectors; n++) - { - result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); - } + for (n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); + } volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_sse3(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); memcpy((lv_16sc_t*)result, (lv_16sc_t*)result_aux[0], sizeof(lv_16sc_t) * num_points); - for(n = 0; n < num_out_vectors; n++) - { - volk_gnsssdr_free(result_aux[n]); - } + for (n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } volk_gnsssdr_free(result_aux); } @@ -134,26 +134,26 @@ static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_u_sse3(lv_16sc_t* re static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_u_sse4_1(lv_16sc_t* result, const lv_16sc_t* local_code, unsigned int num_points) { int code_length_chips = 2046; - float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; - float shifts_chips[3] = { -0.1, 0.0, 0.1 }; - lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); + float shifts_chips[3] = {-0.1, 0.0, 0.1}; + lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_out_vectors; n++) - { - result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); - } + for (n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); + } volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_sse4_1(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); memcpy((lv_16sc_t*)result, (lv_16sc_t*)result_aux[0], sizeof(lv_16sc_t) * num_points); - for(n = 0; n < num_out_vectors; n++) - { - volk_gnsssdr_free(result_aux[n]); - } + for (n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } volk_gnsssdr_free(result_aux); } @@ -164,26 +164,26 @@ static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_u_sse4_1(lv_16sc_t* static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_a_sse4_1(lv_16sc_t* result, const lv_16sc_t* local_code, unsigned int num_points) { int code_length_chips = 2046; - float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; - float shifts_chips[3] = { -0.1, 0.0, 0.1 }; - lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * 
num_out_vectors, volk_gnsssdr_get_alignment()); + float shifts_chips[3] = {-0.1, 0.0, 0.1}; + lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_out_vectors; n++) - { - result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); - } + for (n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); + } volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_sse4_1(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); memcpy((lv_16sc_t*)result, (lv_16sc_t*)result_aux[0], sizeof(lv_16sc_t) * num_points); - for(n = 0; n < num_out_vectors; n++) - { - volk_gnsssdr_free(result_aux[n]); - } + for (n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } volk_gnsssdr_free(result_aux); } @@ -194,26 +194,26 @@ static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_a_sse4_1(lv_16sc_t* static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_u_avx(lv_16sc_t* result, const lv_16sc_t* local_code, unsigned int num_points) { int code_length_chips = 2046; - float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; - float shifts_chips[3] = { -0.1, 0.0, 0.1 }; - lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); + float shifts_chips[3] = {-0.1, 0.0, 0.1}; + lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_out_vectors; n++) - { - result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); - } + for (n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); + } volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_avx(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); memcpy((lv_16sc_t*)result, (lv_16sc_t*)result_aux[0], sizeof(lv_16sc_t) * num_points); - for(n = 0; n < num_out_vectors; n++) - { - volk_gnsssdr_free(result_aux[n]); - } + for (n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } volk_gnsssdr_free(result_aux); } @@ -224,26 +224,26 @@ static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_u_avx(lv_16sc_t* res static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_a_avx(lv_16sc_t* result, const lv_16sc_t* local_code, unsigned int num_points) { int code_length_chips = 2046; - float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; - float shifts_chips[3] = { -0.1, 0.0, 0.1 }; - lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); + float shifts_chips[3] = {-0.1, 0.0, 0.1}; + lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, 
volk_gnsssdr_get_alignment()); - for(n = 0; n < num_out_vectors; n++) - { - result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); - } + for (n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); + } volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_avx(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); memcpy((lv_16sc_t*)result, (lv_16sc_t*)result_aux[0], sizeof(lv_16sc_t) * num_points); - for(n = 0; n < num_out_vectors; n++) - { - volk_gnsssdr_free(result_aux[n]); - } + for (n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } volk_gnsssdr_free(result_aux); } @@ -254,29 +254,29 @@ static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_a_avx(lv_16sc_t* res static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_neon(lv_16sc_t* result, const lv_16sc_t* local_code, unsigned int num_points) { int code_length_chips = 2046; - float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; - float shifts_chips[3] = { -0.1, 0.0, 0.1 }; - lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); + float shifts_chips[3] = {-0.1, 0.0, 0.1}; + lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_out_vectors; n++) - { - result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); - } + for (n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); + } volk_gnsssdr_16ic_xn_resampler_16ic_xn_neon(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); memcpy((lv_16sc_t*)result, (lv_16sc_t*)result_aux[0], sizeof(lv_16sc_t) * num_points); - for(n = 0; n < num_out_vectors; n++) - { - volk_gnsssdr_free(result_aux[n]); - } + for (n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } volk_gnsssdr_free(result_aux); } #endif -#endif // INCLUDED_volk_gnsssdr_16ic_resamplerpuppet_16ic_H +#endif // INCLUDED_volk_gnsssdr_16ic_resamplerpuppet_16ic_H diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_s32fc_x2_rotator_16ic.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_s32fc_x2_rotator_16ic.h index 15303ead5..0de39ebc3 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_s32fc_x2_rotator_16ic.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_s32fc_x2_rotator_16ic.h @@ -70,7 +70,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_generic(lv_16sc_t* ou unsigned int i = 0; lv_16sc_t tmp16; lv_32fc_t tmp32; - for(i = 0; i < (unsigned int)(num_points); ++i) + for (i = 0; i < (unsigned int)(num_points); ++i) { tmp16 = *inVector++; tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase); @@ -111,8 +111,8 @@ static inline void 
volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_generic_reload(lv_16s *outVector++ = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32))); (*phase) *= phase_inc; } - // Regenerate phase - //printf("Phase before regeneration %i: %f,%f Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase)); + // Regenerate phase + //printf("Phase before regeneration %i: %f,%f Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase)); #ifdef __cplusplus (*phase) /= std::abs((*phase)); #else @@ -141,11 +141,13 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3(lv_16sc_t* out unsigned int number; __m128 a, b, two_phase_acc_reg, two_phase_inc_reg; __m128i c1, c2, result; - __VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_inc[2]; + __VOLK_ATTR_ALIGNED(16) + lv_32fc_t two_phase_inc[2]; two_phase_inc[0] = phase_inc * phase_inc; two_phase_inc[1] = phase_inc * phase_inc; - two_phase_inc_reg = _mm_load_ps((float*) two_phase_inc); - __VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_acc[2]; + two_phase_inc_reg = _mm_load_ps((float*)two_phase_inc); + __VOLK_ATTR_ALIGNED(16) + lv_32fc_t two_phase_acc[2]; two_phase_acc[0] = (*phase); two_phase_acc[1] = (*phase) * phase_inc; two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc); @@ -157,49 +159,49 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3(lv_16sc_t* out lv_16sc_t tmp16; lv_32fc_t tmp32; - for(number = 0; number < sse_iters; number++) + for (number = 0; number < sse_iters; number++) { - a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg //complex 32fc multiplication b=a*two_phase_acc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = 
_mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di //next two samples _in += 2; - a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg __VOLK_GNSSSDR_PREFETCH(_in + 8); //complex 32fc multiplication b=a*two_phase_acc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di // store four output samples - result = _mm_packs_epi32(c1, c2);// convert from 32ic to 16ic + result = _mm_packs_epi32(c1, c2); // convert from 32ic to 16ic _mm_store_si128((__m128i*)_out, result); // Regenerate phase @@ -232,7 +234,6 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3(lv_16sc_t* out #endif /* LV_HAVE_SSE3 */ - #ifdef LV_HAVE_SSE3 #include <pmmintrin.h> @@ -244,11 +245,13 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3_reload(lv_16sc unsigned int j; __m128 a, b, two_phase_acc_reg, two_phase_inc_reg; __m128i c1, c2, result; - __VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_inc[2]; +
__VOLK_ATTR_ALIGNED(16) + lv_32fc_t two_phase_inc[2]; two_phase_inc[0] = phase_inc * phase_inc; two_phase_inc[1] = phase_inc * phase_inc; - two_phase_inc_reg = _mm_load_ps((float*) two_phase_inc); - __VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_acc[2]; + two_phase_inc_reg = _mm_load_ps((float*)two_phase_inc); + __VOLK_ATTR_ALIGNED(16) + lv_32fc_t two_phase_acc[2]; two_phase_acc[0] = (*phase); two_phase_acc[1] = (*phase) * phase_inc; two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc); @@ -265,47 +268,47 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3_reload(lv_16sc { for (j = 0; j < ROTATOR_RELOAD; j++) { - a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg //complex 32fc multiplication b=a*two_phase_acc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di //next two samples _in += 2; - a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg __VOLK_GNSSSDR_PREFETCH(_in + 8); //complex 32fc multiplication 
b=a*two_phase_acc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di // store four output samples - result = _mm_packs_epi32(c1, c2);// convert from 32ic to 16ic + result = _mm_packs_epi32(c1, c2); // convert from 32ic to 16ic _mm_store_si128((__m128i*)_out, result); //next two samples @@ -322,47 +325,47 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3_reload(lv_16sc for (j = 0; j < sse_iters % ROTATOR_RELOAD; j++) { - a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg //complex 32fc multiplication b=a*two_phase_acc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + a = _mm_shuffle_ps(a, a, 0xB1); // 
Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di //next two samples _in += 2; - a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg __VOLK_GNSSSDR_PREFETCH(_in + 8); //complex 32fc multiplication b=a*two_phase_acc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = 
_mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di // store four output samples - result = _mm_packs_epi32(c1, c2);// convert from 32ic to 16ic + result = _mm_packs_epi32(c1, c2); // convert from 32ic to 16ic _mm_store_si128((__m128i*)_out, result); //next two samples @@ -385,7 +388,6 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3_reload(lv_16sc #endif /* LV_HAVE_SSE3 */ - #ifdef LV_HAVE_SSE3 #include <pmmintrin.h> @@ -395,14 +397,16 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3(lv_16sc_t* out unsigned int number; __m128 a, b, two_phase_acc_reg, two_phase_inc_reg; __m128i c1, c2, result; - __VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_inc[2]; + __VOLK_ATTR_ALIGNED(16) + lv_32fc_t two_phase_inc[2]; two_phase_inc[0] = phase_inc * phase_inc; two_phase_inc[1] = phase_inc * phase_inc; - two_phase_inc_reg = _mm_load_ps((float*) two_phase_inc); - __VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_acc[2]; + two_phase_inc_reg = _mm_load_ps((float*)two_phase_inc); + __VOLK_ATTR_ALIGNED(16) + lv_32fc_t two_phase_acc[2]; two_phase_acc[0] = (*phase); two_phase_acc[1] = (*phase) * phase_inc; - two_phase_acc_reg = _mm_load_ps((float*) two_phase_acc); + two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc); const lv_16sc_t* _in = inVector; @@ -412,49 +416,49 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3(lv_16sc_t* out lv_16sc_t tmp16; lv_32fc_t tmp32; - for(number = 0; number < sse_iters; number++) + for (number = 0; number < sse_iters; number++) { - a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg //complex 32fc multiplication b=a*two_phase_acc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 =
ai*ci,ar*ci,bi*di,br*di - two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di //next two samples _in += 2; - a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg __VOLK_GNSSSDR_PREFETCH(_in + 8); //complex 32fc multiplication b=a*two_phase_acc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di // store four output samples - result = _mm_packs_epi32(c1, c2);// convert from 32ic to 16ic + result = _mm_packs_epi32(c1, c2); // convert from 32ic to 16ic _mm_storeu_si128((__m128i*)_out, result); // Regenerate phase @@ -493,147 +497,149 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3(lv_16sc_t* out static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3_reload(lv_16sc_t* outVector, const 
lv_16sc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points) { const unsigned int sse_iters = num_points / 4; - unsigned int ROTATOR_RELOAD = 512; - unsigned int n; - unsigned int j; - __m128 a, b, two_phase_acc_reg, two_phase_inc_reg; - __m128i c1, c2, result; - __VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_inc[2]; - two_phase_inc[0] = phase_inc * phase_inc; - two_phase_inc[1] = phase_inc * phase_inc; - two_phase_inc_reg = _mm_load_ps((float*) two_phase_inc); - __VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_acc[2]; - two_phase_acc[0] = (*phase); - two_phase_acc[1] = (*phase) * phase_inc; - two_phase_acc_reg = _mm_load_ps((float*) two_phase_acc); + unsigned int ROTATOR_RELOAD = 512; + unsigned int n; + unsigned int j; + __m128 a, b, two_phase_acc_reg, two_phase_inc_reg; + __m128i c1, c2, result; + __VOLK_ATTR_ALIGNED(16) + lv_32fc_t two_phase_inc[2]; + two_phase_inc[0] = phase_inc * phase_inc; + two_phase_inc[1] = phase_inc * phase_inc; + two_phase_inc_reg = _mm_load_ps((float*)two_phase_inc); + __VOLK_ATTR_ALIGNED(16) + lv_32fc_t two_phase_acc[2]; + two_phase_acc[0] = (*phase); + two_phase_acc[1] = (*phase) * phase_inc; + two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc); - const lv_16sc_t* _in = inVector; + const lv_16sc_t* _in = inVector; - lv_16sc_t* _out = outVector; + lv_16sc_t* _out = outVector; - __m128 yl, yh, tmp1, tmp2, tmp3; - lv_16sc_t tmp16; - lv_32fc_t tmp32; + __m128 yl, yh, tmp1, tmp2, tmp3; + lv_16sc_t tmp16; + lv_32fc_t tmp32; - for (n = 0; n < sse_iters / ROTATOR_RELOAD; n++) - { - for (j = 0; j < ROTATOR_RELOAD; j++) - { - a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg - //complex 32fc multiplication b=a*two_phase_acc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic + for (n = 0; n < sse_iters / ROTATOR_RELOAD; n++) + { + for (j = 0; j < ROTATOR_RELOAD; j++) + { + a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + //complex 32fc multiplication b=a*two_phase_acc_reg + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic - //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = 
_mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - //next two samples - _in += 2; - a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg - __VOLK_GNSSSDR_PREFETCH(_in + 8); - //complex 32fc multiplication b=a*two_phase_acc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic + //next two samples + _in += 2; + a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + __VOLK_GNSSSDR_PREFETCH(_in + 8); + //complex 32fc multiplication b=a*two_phase_acc_reg + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic - //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - // store four output samples - result = _mm_packs_epi32(c1, c2);// convert from 32ic to 16ic 
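/* [Editor's annotation, not part of the patch] The "Regenerate phase" block in this
   hunk renormalizes the packed phasor pair every ROTATOR_RELOAD iterations: repeated
   single-precision multiplies let the modulus of the accumulated phase drift away
   from 1, which would slowly scale the output. The SSE3 code squares, horizontally
   adds, square-roots, and divides to correct two complex lanes at once; the scalar
   model (cf. the generic_reload kernel earlier in this file; C99 complex, assumes
   #include <complex.h>) is simply: */
(*phase) /= cabsf(*phase); /* rescale the rotator back onto the unit circle */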
- _mm_storeu_si128((__m128i*)_out, result); + // store four output samples + result = _mm_packs_epi32(c1, c2); // convert from 32ic to 16ic + _mm_storeu_si128((__m128i*)_out, result); - //next two samples - _in += 2; - _out += 4; - } - // Regenerate phase - tmp1 = _mm_mul_ps(two_phase_acc_reg, two_phase_acc_reg); - tmp2 = _mm_hadd_ps(tmp1, tmp1); - tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8); - tmp2 = _mm_sqrt_ps(tmp1); - two_phase_acc_reg = _mm_div_ps(two_phase_acc_reg, tmp2); - } + //next two samples + _in += 2; + _out += 4; + } + // Regenerate phase + tmp1 = _mm_mul_ps(two_phase_acc_reg, two_phase_acc_reg); + tmp2 = _mm_hadd_ps(tmp1, tmp1); + tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8); + tmp2 = _mm_sqrt_ps(tmp1); + two_phase_acc_reg = _mm_div_ps(two_phase_acc_reg, tmp2); + } - for (j = 0; j < sse_iters % ROTATOR_RELOAD; j++) - { - a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg - //complex 32fc multiplication b=a*two_phase_acc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic + for (j = 0; j < sse_iters % ROTATOR_RELOAD; j++) + { + a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + //complex 32fc multiplication b=a*two_phase_acc_reg + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic - //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - //next two samples - _in += 2; - a = _mm_set_ps((float)(lv_cimag(_in[1])), 
(float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg - __VOLK_GNSSSDR_PREFETCH(_in + 8); - //complex 32fc multiplication b=a*two_phase_acc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic + //next two samples + _in += 2; + a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + __VOLK_GNSSSDR_PREFETCH(_in + 8); + //complex 32fc multiplication b=a*two_phase_acc_reg + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic - //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - // store four output samples - result = _mm_packs_epi32(c1, c2);// convert from 32ic to 16ic - _mm_storeu_si128((__m128i*)_out, result); + // store four output samples + result = _mm_packs_epi32(c1, c2); // convert from 32ic to 16ic + _mm_storeu_si128((__m128i*)_out, result); - //next two samples - _in += 2; - _out += 4; - } + //next two samples + _in += 2; + _out += 4; + } - _mm_store_ps((float*)two_phase_acc, two_phase_acc_reg); - (*phase) = two_phase_acc[0]; + _mm_store_ps((float*)two_phase_acc, two_phase_acc_reg); + (*phase) = two_phase_acc[0]; - for (n = sse_iters * 4; n < num_points; ++n) - { - tmp16 = *_in++; - tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase); - *_out++ = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32))); - (*phase) *= phase_inc; - } + for (n = sse_iters * 4; n < num_points; ++n) + { + tmp16 = *_in++; + tmp32 = 
lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase); + *_out++ = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32))); + (*phase) *= phase_inc; + } } #endif /* LV_HAVE_SSE3 */ @@ -657,8 +663,10 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_neon(lv_16sc_t* outVe lv_16sc_t* _out = outVector; lv_32fc_t ___phase4 = phase_inc * phase_inc * phase_inc * phase_inc; - __VOLK_ATTR_ALIGNED(16) float32_t __phase4_real[4] = { lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4) }; - __VOLK_ATTR_ALIGNED(16) float32_t __phase4_imag[4] = { lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4) }; + __VOLK_ATTR_ALIGNED(16) + float32_t __phase4_real[4] = {lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4)}; + __VOLK_ATTR_ALIGNED(16) + float32_t __phase4_imag[4] = {lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4)}; float32x4_t _phase4_real = vld1q_f32(__phase4_real); float32x4_t _phase4_imag = vld1q_f32(__phase4_imag); @@ -667,8 +675,10 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_neon(lv_16sc_t* outVe lv_32fc_t phase3 = phase2 * phase_inc; lv_32fc_t phase4 = phase3 * phase_inc; - __VOLK_ATTR_ALIGNED(16) float32_t __phase_real[4] = { lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) }; - __VOLK_ATTR_ALIGNED(16) float32_t __phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; + __VOLK_ATTR_ALIGNED(16) + float32_t __phase_real[4] = {lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4)}; + __VOLK_ATTR_ALIGNED(16) + float32_t __phase_imag[4] = {lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4)}; float32x4_t _phase_real = vld1q_f32(__phase_real); float32x4_t _phase_imag = vld1q_f32(__phase_imag); @@ -681,7 +691,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_neon(lv_16sc_t* outVe if (neon_iters > 0) { - for(; i < neon_iters; ++i) + for (; i < neon_iters; ++i) { /* load 4 complex numbers (int 16 bits each component) */ tmp16 = vld2_s16((int16_t*)_in); @@ -745,8 +755,10 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_neon(lv_16sc_t* outVe phase3 = phase2 * phase_inc; phase4 = phase3 * phase_inc; - __VOLK_ATTR_ALIGNED(16) float32_t ____phase_real[4] = { lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) }; - __VOLK_ATTR_ALIGNED(16) float32_t ____phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; + __VOLK_ATTR_ALIGNED(16) + float32_t ____phase_real[4] = {lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4)}; + __VOLK_ATTR_ALIGNED(16) + float32_t ____phase_imag[4] = {lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4)}; _phase_real = vld1q_f32(____phase_real); _phase_imag = vld1q_f32(____phase_imag); @@ -757,7 +769,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_neon(lv_16sc_t* outVe (*phase) = lv_cmake((float32_t)__phase_real[0], (float32_t)__phase_imag[0]); } - for(i = 0; i < neon_iters % 4; ++i) + for (i = 0; i < neon_iters % 4; ++i) { tmp16_ = *_in++; tmp32_ = lv_cmake((float32_t)lv_creal(tmp16_), (float32_t)lv_cimag(tmp16_)) * (*phase); @@ -791,8 +803,10 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_neon_reload(lv_16sc_t lv_16sc_t* _out = outVector; lv_32fc_t ___phase4 = phase_inc * phase_inc * phase_inc * phase_inc; - __VOLK_ATTR_ALIGNED(16) float32_t 
__phase4_real[4] = { lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4) }; - __VOLK_ATTR_ALIGNED(16) float32_t __phase4_imag[4] = { lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4) }; + __VOLK_ATTR_ALIGNED(16) + float32_t __phase4_real[4] = {lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4)}; + __VOLK_ATTR_ALIGNED(16) + float32_t __phase4_imag[4] = {lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4)}; float32x4_t _phase4_real = vld1q_f32(__phase4_real); float32x4_t _phase4_imag = vld1q_f32(__phase4_imag); @@ -801,8 +815,10 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_neon_reload(lv_16sc_t lv_32fc_t phase3 = phase2 * phase_inc; lv_32fc_t phase4 = phase3 * phase_inc; - __VOLK_ATTR_ALIGNED(16) float32_t __phase_real[4] = { lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) }; - __VOLK_ATTR_ALIGNED(16) float32_t __phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; + __VOLK_ATTR_ALIGNED(16) + float32_t __phase_real[4] = {lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4)}; + __VOLK_ATTR_ALIGNED(16) + float32_t __phase_imag[4] = {lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4)}; float32x4_t _phase_real = vld1q_f32(__phase_real); float32x4_t _phase_imag = vld1q_f32(__phase_imag); @@ -879,8 +895,10 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_neon_reload(lv_16sc_t phase3 = phase2 * phase_inc; phase4 = phase3 * phase_inc; - __VOLK_ATTR_ALIGNED(16) float32_t ____phase_real[4] = { lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) }; - __VOLK_ATTR_ALIGNED(16) float32_t ____phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; + __VOLK_ATTR_ALIGNED(16) + float32_t ____phase_real[4] = {lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4)}; + __VOLK_ATTR_ALIGNED(16) + float32_t ____phase_imag[4] = {lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4)}; _phase_real = vld1q_f32(____phase_real); _phase_imag = vld1q_f32(____phase_imag); @@ -945,7 +963,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_neon_reload(lv_16sc_t (*phase) = lv_cmake((float32_t)__phase_real[0], (float32_t)__phase_imag[0]); } - for(i = 0; i < neon_iters % 4; ++i) + for (i = 0; i < neon_iters % 4; ++i) { tmp16_ = *_in++; tmp32_ = lv_cmake((float32_t)lv_creal(tmp16_), (float32_t)lv_cimag(tmp16_)) * (*phase); diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_dot_prod_16ic.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_dot_prod_16ic.h index 7f6219468..313824556 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_dot_prod_16ic.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_dot_prod_16ic.h @@ -73,7 +73,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_generic(lv_16sc_t* result, for (n = 0; n < num_points; n++) { lv_16sc_t tmp = in_a[n] * in_b[n]; - result[0] = lv_cmake(sat_adds16i(lv_creal(result[0]), lv_creal(tmp)), sat_adds16i(lv_cimag(result[0]), lv_cimag(tmp) )); + result[0] = lv_cmake(sat_adds16i(lv_creal(result[0]), lv_creal(tmp)), sat_adds16i(lv_cimag(result[0]), lv_cimag(tmp))); } } @@ -96,7 +96,8 @@ static inline void 
volk_gnsssdr_16ic_x2_dot_prod_16ic_a_sse2(lv_16sc_t* out, con if (sse_iters > 0) { __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, realcacc, imagcacc; - __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) + lv_16sc_t dotProductVector[4]; realcacc = _mm_setzero_si128(); imagcacc = _mm_setzero_si128(); @@ -104,25 +105,25 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_a_sse2(lv_16sc_t* out, con mask_imag = _mm_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); mask_real = _mm_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); - for(number = 0; number < sse_iters; number++) + for (number = 0; number < sse_iters; number++) { // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r] - a = _mm_load_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg + a = _mm_load_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg __VOLK_GNSSSDR_PREFETCH(_in_a + 8); b = _mm_load_si128((__m128i*)_in_b); __VOLK_GNSSSDR_PREFETCH(_in_b + 8); - c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... + c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... - c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. + c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. real = _mm_subs_epi16(c, c_sr); - b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... - a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... + b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... + a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... - imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... - imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... + imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... + imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... - imag = _mm_adds_epi16(imag1, imag2); //with saturation arithmetic! + imag = _mm_adds_epi16(imag1, imag2); //with saturation arithmetic! 
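/* [Editor's annotation, not part of the patch] The lane shuffling above is easier to
   audit against a scalar model. Per complex pair (ar + j*ai)*(br + j*bi), one SSE2
   iteration computes the following (hypothetical helper, assumes #include <stdint.h>;
   the 16-bit products wrap as in _mm_mullo_epi16, the adds/subs saturate): */
static inline void lv_16sc_mul_model(int16_t ar, int16_t ai, int16_t br, int16_t bi,
    int16_t* re, int16_t* im)
{
    *re = (int16_t)(ar * br - ai * bi); /* real part: c minus the 2-byte-shifted c_sr */
    *im = (int16_t)(ar * bi + ai * br); /* imag part: imag1 + imag2 from the b_sl/a_sl copies */
}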
realcacc = _mm_adds_epi16(realcacc, real); imagcacc = _mm_adds_epi16(imagcacc, imag); @@ -136,7 +137,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_a_sse2(lv_16sc_t* out, con a = _mm_or_si128(realcacc, imagcacc); - _mm_store_si128((__m128i*)dotProductVector, a); // Store the results back into the dot product vector + _mm_store_si128((__m128i*)dotProductVector, a); // Store the results back into the dot product vector for (number = 0; number < 4; ++number) { @@ -174,7 +175,8 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_u_sse2(lv_16sc_t* out, con if (sse_iters > 0) { __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, realcacc, imagcacc, result; - __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) + lv_16sc_t dotProductVector[4]; realcacc = _mm_setzero_si128(); imagcacc = _mm_setzero_si128(); @@ -182,27 +184,27 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_u_sse2(lv_16sc_t* out, con mask_imag = _mm_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); mask_real = _mm_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); - for(number = 0; number < sse_iters; number++) + for (number = 0; number < sse_iters; number++) { //std::complex<short> memory structure: real part -> reinterpret_cast<int16_t*>(a)[2*i] //imaginary part -> reinterpret_cast<int16_t*>(a)[2*i + 1] // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r] - a = _mm_loadu_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg + a = _mm_loadu_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg __VOLK_GNSSSDR_PREFETCH(_in_a + 8); b = _mm_loadu_si128((__m128i*)_in_b); __VOLK_GNSSSDR_PREFETCH(_in_b + 8); - c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... + c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... - c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. + c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. real = _mm_subs_epi16(c, c_sr); - b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... - a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... + b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... + a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... - imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... - imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... + imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... + imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... - imag = _mm_adds_epi16(imag1, imag2); //with saturation arithmetic! + imag = _mm_adds_epi16(imag1, imag2); //with saturation arithmetic!
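/* [Editor's annotation, not part of the patch] The _u_sse2 kernel above differs from
   _a_sse2 only in using _mm_loadu_si128 instead of _mm_load_si128; VOLK keeps both so
   the aligned path can be taken when buffers come from volk_gnsssdr_malloc(size,
   volk_gnsssdr_get_alignment()). A hand-rolled check (hypothetical helper, assumes
   #include <stdint.h> and #include <stddef.h>) could look like: */
static inline int ptr_is_simd_aligned(const void* p, size_t alignment)
{
    return ((uintptr_t)p % alignment) == 0; /* true when p meets the machine's SIMD alignment */
}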
realcacc = _mm_adds_epi16(realcacc, real); imagcacc = _mm_adds_epi16(imagcacc, imag); @@ -216,7 +218,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_u_sse2(lv_16sc_t* out, con result = _mm_or_si128(realcacc, imagcacc); - _mm_storeu_si128((__m128i*)dotProductVector, result); // Store the results back into the dot product vector + _mm_storeu_si128((__m128i*)dotProductVector, result); // Store the results back into the dot product vector for (i = 0; i < 4; ++i) { @@ -253,7 +255,8 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_u_axv2(lv_16sc_t* out, con if (avx_iters > 0) { __m256i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, realcacc, imagcacc, result; - __VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8]; + __VOLK_ATTR_ALIGNED(32) + lv_16sc_t dotProductVector[8]; realcacc = _mm256_setzero_si256(); imagcacc = _mm256_setzero_si256(); @@ -261,7 +264,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_u_axv2(lv_16sc_t* out, con mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); - for(number = 0; number < avx_iters; number++) + for (number = 0; number < avx_iters; number++) { a = _mm256_loadu_si256((__m256i*)_in_a); __VOLK_GNSSSDR_PREFETCH(_in_a + 16); @@ -269,7 +272,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_u_axv2(lv_16sc_t* out, con __VOLK_GNSSSDR_PREFETCH(_in_b + 16); c = _mm256_mullo_epi16(a, b); - c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. + c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. real = _mm256_subs_epi16(c, c_sr); b_sl = _mm256_slli_si256(b, 2); @@ -278,7 +281,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_u_axv2(lv_16sc_t* out, con imag1 = _mm256_mullo_epi16(a, b_sl); imag2 = _mm256_mullo_epi16(b, a_sl); - imag = _mm256_adds_epi16(imag1, imag2); //with saturation arithmetic! + imag = _mm256_adds_epi16(imag1, imag2); //with saturation arithmetic! 
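/* [Editor's annotation, not part of the patch] The AVX2 variant is the same recipe
   widened to 256-bit registers: 8 complex int16 samples per iteration (hence
   dotProductVector[8]), with _mm256_zeroupper() issued after the wide store to avoid
   AVX-to-SSE transition penalties. Samples that do not fill a whole register are
   folded in by a scalar tail of this shape (sketch, mirroring the generic kernel): */
for (number = avx_iters * 8; number < num_points; ++number)
{
    *out += (*_in_a++) * (*_in_b++); /* lv_16sc_t complex multiply-accumulate */
}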
realcacc = _mm256_adds_epi16(realcacc, real); imagcacc = _mm256_adds_epi16(imagcacc, imag); @@ -292,7 +295,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_u_axv2(lv_16sc_t* out, con result = _mm256_or_si256(realcacc, imagcacc); - _mm256_storeu_si256((__m256i*)dotProductVector, result); // Store the results back into the dot product vector + _mm256_storeu_si256((__m256i*)dotProductVector, result); // Store the results back into the dot product vector _mm256_zeroupper(); for (i = 0; i < 8; ++i) @@ -330,7 +333,8 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_a_axv2(lv_16sc_t* out, con if (avx_iters > 0) { __m256i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, realcacc, imagcacc, result; - __VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8]; + __VOLK_ATTR_ALIGNED(32) + lv_16sc_t dotProductVector[8]; realcacc = _mm256_setzero_si256(); imagcacc = _mm256_setzero_si256(); @@ -338,7 +342,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_a_axv2(lv_16sc_t* out, con mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); - for(number = 0; number < avx_iters; number++) + for (number = 0; number < avx_iters; number++) { a = _mm256_load_si256((__m256i*)_in_a); __VOLK_GNSSSDR_PREFETCH(_in_a + 16); @@ -346,7 +350,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_a_axv2(lv_16sc_t* out, con __VOLK_GNSSSDR_PREFETCH(_in_b + 16); c = _mm256_mullo_epi16(a, b); - c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. + c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. real = _mm256_subs_epi16(c, c_sr); b_sl = _mm256_slli_si256(b, 2); @@ -355,7 +359,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_a_axv2(lv_16sc_t* out, con imag1 = _mm256_mullo_epi16(a, b_sl); imag2 = _mm256_mullo_epi16(b, a_sl); - imag = _mm256_adds_epi16(imag1, imag2); //with saturation arithmetic! + imag = _mm256_adds_epi16(imag1, imag2); //with saturation arithmetic! 
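/* [Editor's annotation, not part of the patch] "with saturation arithmetic!" refers to
   _mm256_adds_epi16 (and _mm_adds_epi16 in the SSE2 kernels): on overflow the sum clamps
   to the int16 range instead of wrapping, matching the scalar sat_adds16i helper from the
   library's saturation_arithmetic.h. A minimal scalar model (assumes #include <stdint.h>): */
static inline int16_t sat_adds16i_model(int16_t x, int16_t y)
{
    int32_t sum = (int32_t)x + (int32_t)y; /* widen so the addition itself cannot overflow */
    if (sum > INT16_MAX) return INT16_MAX; /* clamp positive overflow */
    if (sum < INT16_MIN) return INT16_MIN; /* clamp negative overflow */
    return (int16_t)sum;
}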
realcacc = _mm256_adds_epi16(realcacc, real); imagcacc = _mm256_adds_epi16(imagcacc, imag); @@ -369,7 +373,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_a_axv2(lv_16sc_t* out, con result = _mm256_or_si256(realcacc, imagcacc); - _mm256_store_si256((__m256i*)dotProductVector, result); // Store the results back into the dot product vector + _mm256_store_si256((__m256i*)dotProductVector, result); // Store the results back into the dot product vector _mm256_zeroupper(); for (i = 0; i < 8; ++i) @@ -397,8 +401,8 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_neon(lv_16sc_t* out, const unsigned int quarter_points = num_points / 4; unsigned int number; - lv_16sc_t* a_ptr = (lv_16sc_t*) in_a; - lv_16sc_t* b_ptr = (lv_16sc_t*) in_b; + lv_16sc_t* a_ptr = (lv_16sc_t*)in_a; + lv_16sc_t* b_ptr = (lv_16sc_t*)in_b; *out = lv_cmake((int16_t)0, (int16_t)0); if (quarter_points > 0) @@ -407,15 +411,16 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_neon(lv_16sc_t* out, const // 2nd lane holds the imaginary part int16x4x2_t a_val, b_val, c_val, accumulator; int16x4x2_t tmp_real, tmp_imag; - __VOLK_ATTR_ALIGNED(16) lv_16sc_t accum_result[4]; + __VOLK_ATTR_ALIGNED(16) + lv_16sc_t accum_result[4]; accumulator.val[0] = vdup_n_s16(0); accumulator.val[1] = vdup_n_s16(0); lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0); - for(number = 0; number < quarter_points; ++number) + for (number = 0; number < quarter_points; ++number) { - a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i - b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i + a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i + b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i __VOLK_GNSSSDR_PREFETCH(a_ptr + 8); __VOLK_GNSSSDR_PREFETCH(b_ptr + 8); @@ -451,7 +456,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_neon(lv_16sc_t* out, const } // tail case - for(number = quarter_points * 4; number < num_points; ++number) + for (number = quarter_points * 4; number < num_points; ++number) { *out += (*a_ptr++) * (*b_ptr++); } @@ -468,20 +473,21 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_neon_vma(lv_16sc_t* out, c unsigned int quarter_points = num_points / 4; unsigned int number; - lv_16sc_t* a_ptr = (lv_16sc_t*) in_a; - lv_16sc_t* b_ptr = (lv_16sc_t*) in_b; + lv_16sc_t* a_ptr = (lv_16sc_t*)in_a; + lv_16sc_t* b_ptr = (lv_16sc_t*)in_b; // for 2-lane vectors, 1st lane holds the real part, // 2nd lane holds the imaginary part int16x4x2_t a_val, b_val, accumulator; int16x4x2_t tmp; - __VOLK_ATTR_ALIGNED(16) lv_16sc_t accum_result[4]; + __VOLK_ATTR_ALIGNED(16) + lv_16sc_t accum_result[4]; accumulator.val[0] = vdup_n_s16(0); accumulator.val[1] = vdup_n_s16(0); - for(number = 0; number < quarter_points; ++number) + for (number = 0; number < quarter_points; ++number) { - a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i - b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i + a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i + b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i __VOLK_GNSSSDR_PREFETCH(a_ptr + 8); __VOLK_GNSSSDR_PREFETCH(b_ptr + 8); @@ -503,7 +509,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_neon_vma(lv_16sc_t* out, c *out = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3]; // tail case - for(number = quarter_points * 4; number < num_points; ++number) + for (number = quarter_points * 4; 
number < num_points; ++number) { *out += (*a_ptr++) * (*b_ptr++); } @@ -520,22 +526,23 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_neon_optvma(lv_16sc_t* out unsigned int quarter_points = num_points / 4; unsigned int number; - lv_16sc_t* a_ptr = (lv_16sc_t*) in_a; - lv_16sc_t* b_ptr = (lv_16sc_t*) in_b; + lv_16sc_t* a_ptr = (lv_16sc_t*)in_a; + lv_16sc_t* b_ptr = (lv_16sc_t*)in_b; // for 2-lane vectors, 1st lane holds the real part, // 2nd lane holds the imaginary part int16x4x2_t a_val, b_val, accumulator1, accumulator2; - __VOLK_ATTR_ALIGNED(16) lv_16sc_t accum_result[4]; + __VOLK_ATTR_ALIGNED(16) + lv_16sc_t accum_result[4]; accumulator1.val[0] = vdup_n_s16(0); accumulator1.val[1] = vdup_n_s16(0); accumulator2.val[0] = vdup_n_s16(0); accumulator2.val[1] = vdup_n_s16(0); - for(number = 0; number < quarter_points; ++number) + for (number = 0; number < quarter_points; ++number) { - a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i - b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i + a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i + b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i __VOLK_GNSSSDR_PREFETCH(a_ptr + 8); __VOLK_GNSSSDR_PREFETCH(b_ptr + 8); @@ -556,7 +563,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_neon_optvma(lv_16sc_t* out *out = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3]; // tail case - for(number = quarter_points * 4; number < num_points; ++number) + for (number = quarter_points * 4; number < num_points; ++number) { *out += (*a_ptr++) * (*b_ptr++); } diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_dot_prod_16ic_xn.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_dot_prod_16ic_xn.h index c1beceead..065fc75a8 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_dot_prod_16ic_xn.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_dot_prod_16ic_xn.h @@ -74,7 +74,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_generic(lv_16sc_t* resu unsigned int n; for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - result[n_vec] = lv_cmake(0,0); + result[n_vec] = lv_cmake(0, 0); for (n = 0; n < num_points; n++) { //r*a.r - i*a.i, i*a.r + r*a.i @@ -96,11 +96,11 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_generic_sat(lv_16sc_t* unsigned int n; for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - result[n_vec] = lv_cmake(0,0); + result[n_vec] = lv_cmake(0, 0); for (n = 0; n < num_points; n++) { - lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(in_common[n]), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(in_common[n]), lv_cimag(in_a[n_vec][n]))), - sat_adds16i(sat_muls16i(lv_creal(in_common[n]), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(in_common[n]), lv_creal(in_a[n_vec][n])))); + lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(in_common[n]), lv_creal(in_a[n_vec][n])), -sat_muls16i(lv_cimag(in_common[n]), lv_cimag(in_a[n_vec][n]))), + sat_adds16i(sat_muls16i(lv_creal(in_common[n]), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(in_common[n]), lv_creal(in_a[n_vec][n])))); result[n_vec] = lv_cmake(sat_adds16i(lv_creal(result[n_vec]), lv_creal(tmp)), sat_adds16i(lv_cimag(result[n_vec]), lv_cimag(tmp))); } } @@ -112,9 +112,9 @@ static inline void 
volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_generic_sat(lv_16sc_t* #ifdef LV_HAVE_SSE2 #include -static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_sse2(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points) +static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_sse2(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points) { - lv_16sc_t dotProduct = lv_cmake(0,0); + lv_16sc_t dotProduct = lv_cmake(0, 0); int n_vec; unsigned int index; const unsigned int sse_iters = num_points / 4; @@ -125,7 +125,8 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_sse2(lv_16sc_t* resul if (sse_iters > 0) { - __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) + lv_16sc_t dotProductVector[4]; __m128i* realcacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment()); __m128i* imagcacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment()); @@ -141,25 +142,25 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_sse2(lv_16sc_t* resul mask_imag = _mm_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); mask_real = _mm_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); - for(index = 0; index < sse_iters; index++) + for (index = 0; index < sse_iters; index++) { // b[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r] - b = _mm_load_si128((__m128i*)_in_common); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg + b = _mm_load_si128((__m128i*)_in_common); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg __VOLK_GNSSSDR_PREFETCH(_in_common + 8); for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - a = _mm_load_si128((__m128i*)&(_in_a[n_vec][index*4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg + a = _mm_load_si128((__m128i*)&(_in_a[n_vec][index * 4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg - c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... + c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... - c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. + c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. real = _mm_subs_epi16(c, c_sr); - c_sr = _mm_slli_si128(b, 2); // b3.r, b2.i .... - c = _mm_mullo_epi16(a, c_sr); // a3.i*b3.r, .... + c_sr = _mm_slli_si128(b, 2); // b3.r, b2.i .... + c = _mm_mullo_epi16(a, c_sr); // a3.i*b3.r, .... - c_sr = _mm_slli_si128(a, 2); // a3.r, a2.i .... - imag = _mm_mullo_epi16(b, c_sr); // b3.i*a3.r, .... + c_sr = _mm_slli_si128(a, 2); // a3.r, a2.i .... + imag = _mm_mullo_epi16(b, c_sr); // b3.i*a3.r, .... 
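All the *_xn variants in this file compute the same thing: one common input vector dotted against num_a_vectors replica vectors, with one accumulator per replica (the _a/_u suffixes only select aligned versus unaligned loads). Restating the generic kernel above as a compact sketch makes the contract explicit; lv_16sc_t, lv_cmake and friends come from volk_gnsssdr_complex.h:

    /* Reference semantics of the *_xn kernels, mirroring
       volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_generic above
       (wrapping integer arithmetic, no saturation). */
    static void dot_prod_xn_ref(lv_16sc_t* result, const lv_16sc_t* in_common,
        const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points)
    {
        int n_vec;
        unsigned int n;
        for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
            {
                result[n_vec] = lv_cmake(0, 0);
                for (n = 0; n < num_points; n++)
                    {
                        result[n_vec] += in_common[n] * in_a[n_vec][n];
                    }
            }
    }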
imag = _mm_adds_epi16(c, imag); @@ -176,12 +177,12 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_sse2(lv_16sc_t* resul a = _mm_or_si128(realcacc[n_vec], imagcacc[n_vec]); - _mm_store_si128((__m128i*)dotProductVector, a); // Store the results back into the dot product vector - dotProduct = lv_cmake(0,0); + _mm_store_si128((__m128i*)dotProductVector, a); // Store the results back into the dot product vector + dotProduct = lv_cmake(0, 0); for (index = 0; index < 4; ++index) { dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[index])), - sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[index]))); + sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[index]))); } _out[n_vec] = dotProduct; } @@ -191,12 +192,12 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_sse2(lv_16sc_t* resul for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - for(index = sse_iters * 4; index < num_points; index++) + for (index = sse_iters * 4; index < num_points; index++) { lv_16sc_t tmp = in_common[index] * in_a[n_vec][index]; _out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), - sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); + sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); } } } @@ -206,9 +207,9 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_sse2(lv_16sc_t* resul #ifdef LV_HAVE_SSE2 #include -static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_sse2(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points) +static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_sse2(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points) { - lv_16sc_t dotProduct = lv_cmake(0,0); + lv_16sc_t dotProduct = lv_cmake(0, 0); int n_vec; unsigned int index; const unsigned int sse_iters = num_points / 4; @@ -219,7 +220,8 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_sse2(lv_16sc_t* resul if (sse_iters > 0) { - __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) + lv_16sc_t dotProductVector[4]; __m128i* realcacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment()); __m128i* imagcacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment()); @@ -235,25 +237,25 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_sse2(lv_16sc_t* resul mask_imag = _mm_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); mask_real = _mm_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); - for(index = 0; index < sse_iters; index++) + for (index = 0; index < sse_iters; index++) { // b[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r] - b = _mm_loadu_si128((__m128i*)_in_common); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg + b = _mm_loadu_si128((__m128i*)_in_common); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg __VOLK_GNSSSDR_PREFETCH(_in_common + 8); for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - a = _mm_loadu_si128((__m128i*)&(_in_a[n_vec][index*4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg + a = _mm_loadu_si128((__m128i*)&(_in_a[n_vec][index * 4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg - c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... + c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... 
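Every SIMD variant here ends with the same scalar tail: the vector body covers sse_iters * 4 (or * 8 for AVX2) points, and a plain loop finishes the remaining num_points modulo the lane count with the identical saturating update, which is what keeps results bit-equal to the generic kernel for any num_points. The shape of that split, as a sketch (body_tail_shape is illustrative):

    /* Body/tail split used by every kernel in this file. */
    static void body_tail_shape(unsigned int num_points)
    {
        const unsigned int lanes = 4; /* 8 for the AVX2 paths */
        const unsigned int iters = num_points / lanes;
        unsigned int index;
        for (index = 0; index < iters; index++)
            {
                /* lanes points per iteration, vectorized */
            }
        for (index = iters * lanes; index < num_points; index++)
            {
                /* one point per iteration, scalar, same saturating math */
            }
    }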
- c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. + c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. real = _mm_subs_epi16(c, c_sr); - c_sr = _mm_slli_si128(b, 2); // b3.r, b2.i .... - c = _mm_mullo_epi16(a, c_sr); // a3.i*b3.r, .... + c_sr = _mm_slli_si128(b, 2); // b3.r, b2.i .... + c = _mm_mullo_epi16(a, c_sr); // a3.i*b3.r, .... - c_sr = _mm_slli_si128(a, 2); // a3.r, a2.i .... - imag = _mm_mullo_epi16(b, c_sr); // b3.i*a3.r, .... + c_sr = _mm_slli_si128(a, 2); // a3.r, a2.i .... + imag = _mm_mullo_epi16(b, c_sr); // b3.i*a3.r, .... imag = _mm_adds_epi16(c, imag); @@ -270,12 +272,12 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_sse2(lv_16sc_t* resul a = _mm_or_si128(realcacc[n_vec], imagcacc[n_vec]); - _mm_store_si128((__m128i*)dotProductVector, a); // Store the results back into the dot product vector - dotProduct = lv_cmake(0,0); + _mm_store_si128((__m128i*)dotProductVector, a); // Store the results back into the dot product vector + dotProduct = lv_cmake(0, 0); for (index = 0; index < 4; ++index) { dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[index])), - sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[index]))); + sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[index]))); } _out[n_vec] = dotProduct; } @@ -285,12 +287,12 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_sse2(lv_16sc_t* resul for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - for(index = sse_iters * 4; index < num_points; index++) + for (index = sse_iters * 4; index < num_points; index++) { lv_16sc_t tmp = in_common[index] * in_a[n_vec][index]; _out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), - sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); + sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); } } } @@ -300,9 +302,9 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_sse2(lv_16sc_t* resul #ifdef LV_HAVE_AVX2 #include -static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_avx2(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points) +static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_avx2(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points) { - lv_16sc_t dotProduct = lv_cmake(0,0); + lv_16sc_t dotProduct = lv_cmake(0, 0); int n_vec; unsigned int index; const unsigned int sse_iters = num_points / 8; @@ -313,7 +315,8 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_avx2(lv_16sc_t* resul if (sse_iters > 0) { - __VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8]; + __VOLK_ATTR_ALIGNED(32) + lv_16sc_t dotProductVector[8]; __m256i* realcacc = (__m256i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256i), volk_gnsssdr_get_alignment()); __m256i* imagcacc = (__m256i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256i), volk_gnsssdr_get_alignment()); @@ -329,24 +332,24 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_avx2(lv_16sc_t* resul mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); - for(index = 0; index < sse_iters; 
index++) + for (index = 0; index < sse_iters; index++) { b = _mm256_load_si256((__m256i*)_in_common); __VOLK_GNSSSDR_PREFETCH(_in_common + 16); for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - a = _mm256_load_si256((__m256i*)&(_in_a[n_vec][index*8])); + a = _mm256_load_si256((__m256i*)&(_in_a[n_vec][index * 8])); c = _mm256_mullo_epi16(a, b); - c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. + c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. real = _mm256_subs_epi16(c, c_sr); - c_sr = _mm256_slli_si256(b, 2); // b3.r, b2.i .... - c = _mm256_mullo_epi16(a, c_sr); // a3.i*b3.r, .... + c_sr = _mm256_slli_si256(b, 2); // b3.r, b2.i .... + c = _mm256_mullo_epi16(a, c_sr); // a3.i*b3.r, .... - c_sr = _mm256_slli_si256(a, 2); // a3.r, a2.i .... - imag = _mm256_mullo_epi16(b, c_sr); // b3.i*a3.r, .... + c_sr = _mm256_slli_si256(a, 2); // a3.r, a2.i .... + imag = _mm256_mullo_epi16(b, c_sr); // b3.i*a3.r, .... imag = _mm256_adds_epi16(c, imag); @@ -363,12 +366,12 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_avx2(lv_16sc_t* resul a = _mm256_or_si256(realcacc[n_vec], imagcacc[n_vec]); - _mm256_store_si256((__m256i*)dotProductVector, a); // Store the results back into the dot product vector - dotProduct = lv_cmake(0,0); + _mm256_store_si256((__m256i*)dotProductVector, a); // Store the results back into the dot product vector + dotProduct = lv_cmake(0, 0); for (index = 0; index < 8; ++index) { dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[index])), - sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[index]))); + sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[index]))); } _out[n_vec] = dotProduct; } @@ -379,12 +382,12 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_avx2(lv_16sc_t* resul for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - for(index = sse_iters * 8; index < num_points; index++) + for (index = sse_iters * 8; index < num_points; index++) { lv_16sc_t tmp = in_common[index] * in_a[n_vec][index]; _out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), - sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); + sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); } } } @@ -394,9 +397,9 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_avx2(lv_16sc_t* resul #ifdef LV_HAVE_AVX2 #include -static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_avx2(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points) +static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_avx2(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points) { - lv_16sc_t dotProduct = lv_cmake(0,0); + lv_16sc_t dotProduct = lv_cmake(0, 0); const unsigned int sse_iters = num_points / 8; int n_vec; @@ -407,7 +410,8 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_avx2(lv_16sc_t* resul if (sse_iters > 0) { - __VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8]; + __VOLK_ATTR_ALIGNED(32) + lv_16sc_t dotProductVector[8]; __m256i* realcacc = (__m256i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256i), volk_gnsssdr_get_alignment()); __m256i* imagcacc = (__m256i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256i), volk_gnsssdr_get_alignment()); @@ -423,24 +427,24 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_avx2(lv_16sc_t* resul 
mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); - for(index = 0; index < sse_iters; index++) + for (index = 0; index < sse_iters; index++) { b = _mm256_loadu_si256((__m256i*)_in_common); __VOLK_GNSSSDR_PREFETCH(_in_common + 16); for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - a = _mm256_loadu_si256((__m256i*)&(_in_a[n_vec][index*8])); + a = _mm256_loadu_si256((__m256i*)&(_in_a[n_vec][index * 8])); c = _mm256_mullo_epi16(a, b); - c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. + c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. real = _mm256_subs_epi16(c, c_sr); - c_sr = _mm256_slli_si256(b, 2); // b3.r, b2.i .... - c = _mm256_mullo_epi16(a, c_sr); // a3.i*b3.r, .... + c_sr = _mm256_slli_si256(b, 2); // b3.r, b2.i .... + c = _mm256_mullo_epi16(a, c_sr); // a3.i*b3.r, .... - c_sr = _mm256_slli_si256(a, 2); // a3.r, a2.i .... - imag = _mm256_mullo_epi16(b, c_sr); // b3.i*a3.r, .... + c_sr = _mm256_slli_si256(a, 2); // a3.r, a2.i .... + imag = _mm256_mullo_epi16(b, c_sr); // b3.i*a3.r, .... imag = _mm256_adds_epi16(c, imag); @@ -457,12 +461,12 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_avx2(lv_16sc_t* resul a = _mm256_or_si256(realcacc[n_vec], imagcacc[n_vec]); - _mm256_store_si256((__m256i*)dotProductVector, a); // Store the results back into the dot product vector - dotProduct = lv_cmake(0,0); + _mm256_store_si256((__m256i*)dotProductVector, a); // Store the results back into the dot product vector + dotProduct = lv_cmake(0, 0); for (index = 0; index < 8; ++index) { dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[index])), - sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[index]))); + sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[index]))); } _out[n_vec] = dotProduct; } @@ -473,12 +477,12 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_avx2(lv_16sc_t* resul for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - for(index = sse_iters * 8; index < num_points; index++) + for (index = sse_iters * 8; index < num_points; index++) { lv_16sc_t tmp = in_common[index] * in_a[n_vec][index]; _out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), - sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); + sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); } } } @@ -488,9 +492,9 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_avx2(lv_16sc_t* resul #ifdef LV_HAVE_NEON #include -static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points) +static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points) { - lv_16sc_t dotProduct = lv_cmake(0,0); + lv_16sc_t dotProduct = lv_cmake(0, 0); int n_vec; unsigned int index; const unsigned int neon_iters = num_points / 4; @@ -501,7 +505,8 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon(lv_16sc_t* result, if (neon_iters > 0) { - __VOLK_ATTR_ALIGNED(16) lv_16sc_t 
dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) + lv_16sc_t dotProductVector[4]; int16x4x2_t a_val, b_val, c_val; @@ -509,19 +514,19 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon(lv_16sc_t* result, int16x4x2_t tmp_real, tmp_imag; - for(n_vec = 0; n_vec < num_a_vectors; n_vec++) + for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { accumulator[n_vec].val[0] = vdup_n_s16(0); accumulator[n_vec].val[1] = vdup_n_s16(0); } - for(index = 0; index < neon_iters; index++) + for (index = 0; index < neon_iters; index++) { - b_val = vld2_s16((int16_t*)_in_common); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg + b_val = vld2_s16((int16_t*)_in_common); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg __VOLK_GNSSSDR_PREFETCH(_in_common + 8); for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - a_val = vld2_s16((int16_t*)&(_in_a[n_vec][index*4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg + a_val = vld2_s16((int16_t*)&(_in_a[n_vec][index * 4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg //__VOLK_GNSSSDR_PREFETCH(&_in_a[n_vec][index*4] + 8); // multiply the real*real and imag*imag to get real result @@ -547,12 +552,12 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon(lv_16sc_t* result, for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - vst2_s16((int16_t*)dotProductVector, accumulator[n_vec]); // Store the results back into the dot product vector - dotProduct = lv_cmake(0,0); + vst2_s16((int16_t*)dotProductVector, accumulator[n_vec]); // Store the results back into the dot product vector + dotProduct = lv_cmake(0, 0); for (index = 0; index < 4; ++index) { dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[index])), - sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[index]))); + sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[index]))); } _out[n_vec] = dotProduct; } @@ -561,12 +566,12 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon(lv_16sc_t* result, for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - for(index = neon_iters * 4; index < num_points; index++) + for (index = neon_iters * 4; index < num_points; index++) { lv_16sc_t tmp = in_common[index] * in_a[n_vec][index]; _out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), - sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); + sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); } } } @@ -576,9 +581,9 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon(lv_16sc_t* result, #ifdef LV_HAVE_NEON #include -static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_vma(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points) +static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_vma(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points) { - lv_16sc_t dotProduct = lv_cmake(0,0); + lv_16sc_t dotProduct = lv_cmake(0, 0); const unsigned int neon_iters = num_points / 4; int n_vec; @@ -589,25 +594,26 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_vma(lv_16sc_t* res if (neon_iters > 0) { - __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) + lv_16sc_t dotProductVector[4]; int16x4x2_t a_val, b_val, tmp; int16x4x2_t* accumulator = (int16x4x2_t*)volk_gnsssdr_malloc(num_a_vectors * sizeof(int16x4x2_t), volk_gnsssdr_get_alignment()); - for(n_vec = 0; n_vec < num_a_vectors; n_vec++) + for (n_vec = 0; n_vec < num_a_vectors; 
n_vec++) { accumulator[n_vec].val[0] = vdup_n_s16(0); accumulator[n_vec].val[1] = vdup_n_s16(0); } - for(index = 0; index < neon_iters; index++) + for (index = 0; index < neon_iters; index++) { - b_val = vld2_s16((int16_t*)_in_common); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg + b_val = vld2_s16((int16_t*)_in_common); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg __VOLK_GNSSSDR_PREFETCH(_in_common + 8); for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - a_val = vld2_s16((int16_t*)&(_in_a[n_vec][index*4])); + a_val = vld2_s16((int16_t*)&(_in_a[n_vec][index * 4])); tmp.val[0] = vmul_s16(a_val.val[0], b_val.val[0]); tmp.val[1] = vmul_s16(a_val.val[1], b_val.val[0]); @@ -624,12 +630,12 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_vma(lv_16sc_t* res for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - vst2_s16((int16_t*)dotProductVector, accumulator[n_vec]); // Store the results back into the dot product vector - dotProduct = lv_cmake(0,0); + vst2_s16((int16_t*)dotProductVector, accumulator[n_vec]); // Store the results back into the dot product vector + dotProduct = lv_cmake(0, 0); for (index = 0; index < 4; ++index) { dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[index])), - sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[index]))); + sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[index]))); } _out[n_vec] = dotProduct; } @@ -638,12 +644,12 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_vma(lv_16sc_t* res for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - for(index = neon_iters * 4; index < num_points; index++) + for (index = neon_iters * 4; index < num_points; index++) { lv_16sc_t tmp = in_common[index] * in_a[n_vec][index]; _out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), - sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); + sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); } } } @@ -653,9 +659,9 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_vma(lv_16sc_t* res #ifdef LV_HAVE_NEON #include -static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_optvma(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points) +static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_optvma(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points) { - lv_16sc_t dotProduct = lv_cmake(0,0); + lv_16sc_t dotProduct = lv_cmake(0, 0); const unsigned int neon_iters = num_points / 4; int n_vec; @@ -666,14 +672,15 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_optvma(lv_16sc_t* if (neon_iters > 0) { - __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) + lv_16sc_t dotProductVector[4]; int16x4x2_t a_val, b_val; int16x4x2_t* accumulator1 = (int16x4x2_t*)volk_gnsssdr_malloc(num_a_vectors * sizeof(int16x4x2_t), volk_gnsssdr_get_alignment()); int16x4x2_t* accumulator2 = (int16x4x2_t*)volk_gnsssdr_malloc(num_a_vectors * sizeof(int16x4x2_t), volk_gnsssdr_get_alignment()); - for(n_vec = 0; n_vec < num_a_vectors; n_vec++) + for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { accumulator1[n_vec].val[0] = vdup_n_s16(0); accumulator1[n_vec].val[1] = vdup_n_s16(0); @@ -681,13 +688,13 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_optvma(lv_16sc_t* accumulator2[n_vec].val[1] = vdup_n_s16(0); } - for(index = 0; index < neon_iters; index++) + for (index = 0; index < neon_iters; 
index++) { - b_val = vld2_s16((int16_t*)_in_common); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg + b_val = vld2_s16((int16_t*)_in_common); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg __VOLK_GNSSSDR_PREFETCH(_in_common + 8); for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - a_val = vld2_s16((int16_t*)&(_in_a[n_vec][index*4])); + a_val = vld2_s16((int16_t*)&(_in_a[n_vec][index * 4])); accumulator1[n_vec].val[0] = vmla_s16(accumulator1[n_vec].val[0], a_val.val[0], b_val.val[0]); accumulator1[n_vec].val[1] = vmla_s16(accumulator1[n_vec].val[1], a_val.val[0], b_val.val[1]); @@ -705,12 +712,12 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_optvma(lv_16sc_t* for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - vst2_s16((int16_t*)dotProductVector, accumulator1[n_vec]); // Store the results back into the dot product vector - dotProduct = lv_cmake(0,0); + vst2_s16((int16_t*)dotProductVector, accumulator1[n_vec]); // Store the results back into the dot product vector + dotProduct = lv_cmake(0, 0); for (index = 0; index < 4; ++index) { dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[index])), - sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[index]))); + sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[index]))); } _out[n_vec] = dotProduct; } @@ -720,12 +727,12 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_optvma(lv_16sc_t* for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - for(index = neon_iters * 4; index < num_points; index++) + for (index = neon_iters * 4; index < num_points; index++) { lv_16sc_t tmp = in_common[index] * in_a[n_vec][index]; _out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), - sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); + sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); } } } diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic.h index 549fff25d..ad2ec4a77 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic.h @@ -47,22 +47,22 @@ static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_generic(lv_16sc_t* int num_a_vectors = 3; lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); unsigned int n; - for(n = 0; n < num_a_vectors; n++) - { - in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); - memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points); - } + for (n = 0; n < num_a_vectors; n++) + { + in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); + memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points); + } - volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_generic(result, local_code, (const lv_16sc_t**) in_a, num_a_vectors, num_points); + volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_generic(result, local_code, (const lv_16sc_t**)in_a, num_a_vectors, num_points); - for(n = 0; n < num_a_vectors; n++) - { - volk_gnsssdr_free(in_a[n]); - } + for (n = 0; n < num_a_vectors; n++) + { + volk_gnsssdr_free(in_a[n]); + } volk_gnsssdr_free(in_a); } -#endif /* 
Generic */ +#endif /* Generic */ #ifdef LV_HAVE_GENERIC @@ -71,22 +71,22 @@ static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_generic_sat(lv_16sc int num_a_vectors = 3; lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); unsigned int n; - for(n = 0; n < num_a_vectors; n++) - { - in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); - memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points); - } + for (n = 0; n < num_a_vectors; n++) + { + in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); + memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points); + } - volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_generic_sat(result, local_code, (const lv_16sc_t**) in_a, num_a_vectors, num_points); + volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_generic_sat(result, local_code, (const lv_16sc_t**)in_a, num_a_vectors, num_points); - for(n = 0; n < num_a_vectors; n++) - { - volk_gnsssdr_free(in_a[n]); - } + for (n = 0; n < num_a_vectors; n++) + { + volk_gnsssdr_free(in_a[n]); + } volk_gnsssdr_free(in_a); } -#endif /* Generic */ +#endif /* Generic */ #ifdef LV_HAVE_SSE2 @@ -95,18 +95,18 @@ static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_a_sse2(lv_16sc_t* r int num_a_vectors = 3; lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); unsigned int n; - for(n = 0; n < num_a_vectors; n++) - { - in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); - memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points); - } + for (n = 0; n < num_a_vectors; n++) + { + in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); + memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points); + } - volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_sse2(result, local_code, (const lv_16sc_t**) in_a, num_a_vectors, num_points); + volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_sse2(result, local_code, (const lv_16sc_t**)in_a, num_a_vectors, num_points); - for(n = 0; n < num_a_vectors; n++) - { - volk_gnsssdr_free(in_a[n]); - } + for (n = 0; n < num_a_vectors; n++) + { + volk_gnsssdr_free(in_a[n]); + } volk_gnsssdr_free(in_a); } @@ -120,18 +120,18 @@ static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_u_sse2(lv_16sc_t* r int num_a_vectors = 3; lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); unsigned int n; - for(n = 0; n < num_a_vectors; n++) - { - in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t)*num_points, volk_gnsssdr_get_alignment()); - memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t)*num_points); - } + for (n = 0; n < num_a_vectors; n++) + { + in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); + memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points); + } - volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_sse2(result, local_code, (const lv_16sc_t**) in_a, num_a_vectors, num_points); + volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_sse2(result, local_code, (const lv_16sc_t**)in_a, num_a_vectors, num_points); - for(n = 0; n < num_a_vectors; n++) - { - volk_gnsssdr_free(in_a[n]); - } + for (n = 0; n < num_a_vectors; n++) + { + volk_gnsssdr_free(in_a[n]); + } volk_gnsssdr_free(in_a); } @@ 
-145,18 +145,18 @@ static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_a_avx2(lv_16sc_t* r int num_a_vectors = 3; lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); unsigned int n; - for(n = 0; n < num_a_vectors; n++) - { - in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t)*num_points, volk_gnsssdr_get_alignment()); - memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t)*num_points); - } + for (n = 0; n < num_a_vectors; n++) + { + in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); + memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points); + } - volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_avx2(result, local_code, (const lv_16sc_t**) in_a, num_a_vectors, num_points); + volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_avx2(result, local_code, (const lv_16sc_t**)in_a, num_a_vectors, num_points); - for(n = 0; n < num_a_vectors; n++) - { - volk_gnsssdr_free(in_a[n]); - } + for (n = 0; n < num_a_vectors; n++) + { + volk_gnsssdr_free(in_a[n]); + } volk_gnsssdr_free(in_a); } @@ -170,18 +170,18 @@ static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_u_avx2(lv_16sc_t* r int num_a_vectors = 3; lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); unsigned int n; - for(n = 0; n < num_a_vectors; n++) - { - in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t)*num_points, volk_gnsssdr_get_alignment()); - memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t)*num_points); - } + for (n = 0; n < num_a_vectors; n++) + { + in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); + memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points); + } - volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_avx2(result, local_code, (const lv_16sc_t**) in_a, num_a_vectors, num_points); + volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_avx2(result, local_code, (const lv_16sc_t**)in_a, num_a_vectors, num_points); - for(n = 0; n < num_a_vectors; n++) - { - volk_gnsssdr_free(in_a[n]); - } + for (n = 0; n < num_a_vectors; n++) + { + volk_gnsssdr_free(in_a[n]); + } volk_gnsssdr_free(in_a); } @@ -195,22 +195,22 @@ static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_neon(lv_16sc_t* res int num_a_vectors = 3; lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); unsigned int n; - for(n = 0; n < num_a_vectors; n++) - { - in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t)*num_points, volk_gnsssdr_get_alignment()); - memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t)*num_points); - } + for (n = 0; n < num_a_vectors; n++) + { + in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); + memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points); + } - volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon(result, local_code, (const lv_16sc_t**) in_a, num_a_vectors, num_points); + volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon(result, local_code, (const lv_16sc_t**)in_a, num_a_vectors, num_points); - for(n = 0; n < num_a_vectors; n++) - { - volk_gnsssdr_free(in_a[n]); - } + for (n = 0; n < num_a_vectors; n++) + { + volk_gnsssdr_free(in_a[n]); + } volk_gnsssdr_free(in_a); } -#endif // NEON +#endif // NEON #ifdef LV_HAVE_NEON @@ -220,22 +220,22 @@ static inline void 
volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_neon_vma(lv_16sc_t* int num_a_vectors = 3; lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); unsigned int n; - for(n = 0; n < num_a_vectors; n++) - { - in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t)*num_points, volk_gnsssdr_get_alignment()); - memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t)*num_points); - } + for (n = 0; n < num_a_vectors; n++) + { + in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); + memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points); + } - volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_vma(result, local_code, (const lv_16sc_t**) in_a, num_a_vectors, num_points); + volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_vma(result, local_code, (const lv_16sc_t**)in_a, num_a_vectors, num_points); - for(n = 0; n < num_a_vectors; n++) - { - volk_gnsssdr_free(in_a[n]); - } + for (n = 0; n < num_a_vectors; n++) + { + volk_gnsssdr_free(in_a[n]); + } volk_gnsssdr_free(in_a); } -#endif // NEON +#endif // NEON #ifdef LV_HAVE_NEON @@ -244,23 +244,21 @@ static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_neon_optvma(lv_16sc int num_a_vectors = 3; lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); unsigned int n; - for(n = 0; n < num_a_vectors; n++) - { - in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t)*num_points, volk_gnsssdr_get_alignment()); - memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t)*num_points); - } + for (n = 0; n < num_a_vectors; n++) + { + in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); + memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points); + } - volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_optvma(result, local_code, (const lv_16sc_t**) in_a, num_a_vectors, num_points); + volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_optvma(result, local_code, (const lv_16sc_t**)in_a, num_a_vectors, num_points); - for(n = 0; n < num_a_vectors; n++) - { - volk_gnsssdr_free(in_a[n]); - } + for (n = 0; n < num_a_vectors; n++) + { + volk_gnsssdr_free(in_a[n]); + } volk_gnsssdr_free(in_a); } -#endif // NEON +#endif // NEON #endif // INCLUDED_volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_H - - diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_multiply_16ic.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_multiply_16ic.h index 2f1036953..596c13bf5 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_multiply_16ic.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_multiply_16ic.h @@ -91,29 +91,29 @@ static inline void volk_gnsssdr_16ic_x2_multiply_16ic_a_sse2(lv_16sc_t* out, con const lv_16sc_t* _in_a = in_a; const lv_16sc_t* _in_b = in_b; lv_16sc_t* _out = out; - for(number = 0; number < sse_iters; number++) + for (number = 0; number < sse_iters; number++) { //std::complex memory structure: real part -> reinterpret_cast(a)[2*i] //imaginery part -> reinterpret_cast(a)[2*i + 1] // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r] - a = _mm_load_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg + a = _mm_load_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits 
reg b = _mm_load_si128((__m128i*)_in_b); - c = _mm_mullo_epi16 (a, b); // a3.i*b3.i, a3.r*b3.r, .... + c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... - c_sr = _mm_srli_si128 (c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. - real = _mm_subs_epi16 (c, c_sr); - real = _mm_and_si128 (real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i + c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. + real = _mm_subs_epi16(c, c_sr); + real = _mm_and_si128(real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i - b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... - a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... + b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... + a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... - imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... - imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... + imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... + imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... imag = _mm_adds_epi16(imag1, imag2); - imag = _mm_and_si128 (imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ... + imag = _mm_and_si128(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ... - result = _mm_or_si128 (real, imag); + result = _mm_or_si128(real, imag); _mm_store_si128((__m128i*)_out, result); @@ -137,7 +137,7 @@ static inline void volk_gnsssdr_16ic_x2_multiply_16ic_u_sse2(lv_16sc_t* out, con { const unsigned int sse_iters = num_points / 4; unsigned int number; - __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1,imag2, b_sl, a_sl, result; + __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, result; mask_imag = _mm_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); mask_real = _mm_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); @@ -145,29 +145,29 @@ static inline void volk_gnsssdr_16ic_x2_multiply_16ic_u_sse2(lv_16sc_t* out, con const lv_16sc_t* _in_a = in_a; const lv_16sc_t* _in_b = in_b; lv_16sc_t* _out = out; - for(number = 0; number < sse_iters; number++) + for (number = 0; number < sse_iters; number++) { //std::complex memory structure: real part -> reinterpret_cast(a)[2*i] //imaginery part -> reinterpret_cast(a)[2*i + 1] // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r] - a = _mm_loadu_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg + a = _mm_loadu_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg b = _mm_loadu_si128((__m128i*)_in_b); - c = _mm_mullo_epi16 (a, b); // a3.i*b3.i, a3.r*b3.r, .... + c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... - c_sr = _mm_srli_si128 (c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. - real = _mm_subs_epi16 (c, c_sr); - real = _mm_and_si128 (real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i + c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. + real = _mm_subs_epi16(c, c_sr); + real = _mm_and_si128(real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i - b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... - a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... + b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... + a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... - imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... - imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... 
+ imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... + imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... imag = _mm_adds_epi16(imag1, imag2); - imag = _mm_and_si128 (imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ... + imag = _mm_and_si128(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ... - result = _mm_or_si128 (real, imag); + result = _mm_or_si128(real, imag); _mm_storeu_si128((__m128i*)_out, result); @@ -196,29 +196,29 @@ static inline void volk_gnsssdr_16ic_x2_multiply_16ic_u_avx2(lv_16sc_t* out, con const lv_16sc_t* _in_b = in_b; lv_16sc_t* _out = out; - __m256i a, b, c, c_sr, real, imag, imag1, imag2, b_sl, a_sl, result; + __m256i a, b, c, c_sr, real, imag, imag1, imag2, b_sl, a_sl, result; const __m256i mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); const __m256i mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); - for(;number < avx2_points; number++) + for (; number < avx2_points; number++) { - a = _mm256_loadu_si256((__m256i*)_in_a); // Load the ar + ai, br + bi as ar,ai,br,bi - b = _mm256_loadu_si256((__m256i*)_in_b); // Load the cr + ci, dr + di as cr,ci,dr,di + a = _mm256_loadu_si256((__m256i*)_in_a); // Load the ar + ai, br + bi as ar,ai,br,bi + b = _mm256_loadu_si256((__m256i*)_in_b); // Load the cr + ci, dr + di as cr,ci,dr,di c = _mm256_mullo_epi16(a, b); - c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. + c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. real = _mm256_subs_epi16(c, c_sr); - real = _mm256_and_si256(real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i + real = _mm256_and_si256(real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i - b_sl = _mm256_slli_si256(b, 2); // b3.r, b2.i .... - a_sl = _mm256_slli_si256(a, 2); // a3.r, a2.i .... + b_sl = _mm256_slli_si256(b, 2); // b3.r, b2.i .... + a_sl = _mm256_slli_si256(a, 2); // a3.r, a2.i .... - imag1 = _mm256_mullo_epi16(a, b_sl); // a3.i*b3.r, .... - imag2 = _mm256_mullo_epi16(b, a_sl); // b3.i*a3.r, .... + imag1 = _mm256_mullo_epi16(a, b_sl); // a3.i*b3.r, .... + imag2 = _mm256_mullo_epi16(b, a_sl); // b3.i*a3.r, .... imag = _mm256_adds_epi16(imag1, imag2); - imag = _mm256_and_si256(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ... + imag = _mm256_and_si256(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ... 
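The mask pair encodes the interleaved lv_16sc_t memory layout: within each packed 32-bit complex the low 16 bits hold the real part and the high 16 bits the imaginary part, so mask_real keeps the slot where the valid difference a.r*b.r - a.i*b.i landed, mask_imag keeps the slot with the valid cross sum, and the OR interleaves the two results back into one register. The same merge on a single complex, scalar (sketch, little-endian layout assumed):

    #include <stdint.h>

    /* Scalar picture of the mask_real / mask_imag / OR merge above. */
    static uint32_t merge_real_imag(uint32_t real_lanes, uint32_t imag_lanes)
    {
        const uint32_t mask_real = 0x0000FFFFu; /* keep low half: re  */
        const uint32_t mask_imag = 0xFFFF0000u; /* keep high half: im */
        return (real_lanes & mask_real) | (imag_lanes & mask_imag);
    }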
result = _mm256_or_si256(real, imag); @@ -230,7 +230,7 @@ static inline void volk_gnsssdr_16ic_x2_multiply_16ic_u_avx2(lv_16sc_t* out, con } _mm256_zeroupper(); number = avx2_points * 8; - for(;number < num_points; number++) + for (; number < num_points; number++) { *_out++ = (*_in_a++) * (*_in_b++); } @@ -250,29 +250,29 @@ static inline void volk_gnsssdr_16ic_x2_multiply_16ic_a_avx2(lv_16sc_t* out, con const lv_16sc_t* _in_b = in_b; lv_16sc_t* _out = out; - __m256i a, b, c, c_sr, real, imag, imag1, imag2, b_sl, a_sl, result; + __m256i a, b, c, c_sr, real, imag, imag1, imag2, b_sl, a_sl, result; const __m256i mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); const __m256i mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); - for(;number < avx2_points; number++) + for (; number < avx2_points; number++) { - a = _mm256_load_si256((__m256i*)_in_a); // Load the ar + ai, br + bi as ar,ai,br,bi - b = _mm256_load_si256((__m256i*)_in_b); // Load the cr + ci, dr + di as cr,ci,dr,di + a = _mm256_load_si256((__m256i*)_in_a); // Load the ar + ai, br + bi as ar,ai,br,bi + b = _mm256_load_si256((__m256i*)_in_b); // Load the cr + ci, dr + di as cr,ci,dr,di c = _mm256_mullo_epi16(a, b); - c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. + c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. real = _mm256_subs_epi16(c, c_sr); - real = _mm256_and_si256(real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i + real = _mm256_and_si256(real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i - b_sl = _mm256_slli_si256(b, 2); // b3.r, b2.i .... - a_sl = _mm256_slli_si256(a, 2); // a3.r, a2.i .... + b_sl = _mm256_slli_si256(b, 2); // b3.r, b2.i .... + a_sl = _mm256_slli_si256(a, 2); // a3.r, a2.i .... - imag1 = _mm256_mullo_epi16(a, b_sl); // a3.i*b3.r, .... - imag2 = _mm256_mullo_epi16(b, a_sl); // b3.i*a3.r, .... + imag1 = _mm256_mullo_epi16(a, b_sl); // a3.i*b3.r, .... + imag2 = _mm256_mullo_epi16(b, a_sl); // b3.i*a3.r, .... imag = _mm256_adds_epi16(imag1, imag2); - imag = _mm256_and_si256(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ... + imag = _mm256_and_si256(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ... 
result = _mm256_or_si256(real, imag); @@ -284,7 +284,7 @@ static inline void volk_gnsssdr_16ic_x2_multiply_16ic_a_avx2(lv_16sc_t* out, con } _mm256_zeroupper(); number = avx2_points * 8; - for(;number < num_points; number++) + for (; number < num_points; number++) { *_out++ = (*_in_a++) * (*_in_b++); } @@ -292,23 +292,22 @@ static inline void volk_gnsssdr_16ic_x2_multiply_16ic_a_avx2(lv_16sc_t* out, con #endif /* LV_HAVE_AVX2 */ - #ifdef LV_HAVE_NEON #include static inline void volk_gnsssdr_16ic_x2_multiply_16ic_neon(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points) { - lv_16sc_t *a_ptr = (lv_16sc_t*) in_a; - lv_16sc_t *b_ptr = (lv_16sc_t*) in_b; + lv_16sc_t* a_ptr = (lv_16sc_t*)in_a; + lv_16sc_t* b_ptr = (lv_16sc_t*)in_b; unsigned int quarter_points = num_points / 4; int16x4x2_t a_val, b_val, c_val; int16x4x2_t tmp_real, tmp_imag; unsigned int number = 0; - for(number = 0; number < quarter_points; ++number) + for (number = 0; number < quarter_points; ++number) { - a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i - b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i + a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i + b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i __VOLK_GNSSSDR_PREFETCH(a_ptr + 4); __VOLK_GNSSSDR_PREFETCH(b_ptr + 4); @@ -334,7 +333,7 @@ static inline void volk_gnsssdr_16ic_x2_multiply_16ic_neon(lv_16sc_t* out, const out += 4; } - for(number = quarter_points * 4; number < num_points; number++) + for (number = quarter_points * 4; number < num_points; number++) { *out++ = (*a_ptr++) * (*b_ptr++); } diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn.h index 0cfc9df61..60b5b7b38 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn.h @@ -85,11 +85,11 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_generic(lv_16sc unsigned int n; for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - result[n_vec] = lv_cmake(0,0); + result[n_vec] = lv_cmake(0, 0); } for (n = 0; n < num_points; n++) { - tmp16 = *in_common++; //if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); + tmp16 = *in_common++; //if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase); tmp16 = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32))); @@ -130,14 +130,14 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_generic_reload( const unsigned int ROTATOR_RELOAD = 256; for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - result[n_vec] = lv_cmake(0,0); + result[n_vec] = lv_cmake(0, 0); } for (n = 0; n < num_points / ROTATOR_RELOAD; n++) { for (j = 0; j < ROTATOR_RELOAD; j++) { - tmp16 = *in_common++; //if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); + tmp16 = *in_common++; //if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); tmp32 = lv_cmake((float)lv_creal(tmp16), 
(float)lv_cimag(tmp16)) * (*phase); tmp16 = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32))); (*phase) *= phase_inc; @@ -148,7 +148,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_generic_reload( result[n_vec] = lv_cmake(sat_adds16i(lv_creal(result[n_vec]), lv_creal(tmp)), sat_adds16i(lv_cimag(result[n_vec]), lv_cimag(tmp))); } } - /* Regenerate phase */ + /* Regenerate phase */ #ifdef __cplusplus (*phase) /= std::abs((*phase)); #else @@ -159,13 +159,13 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_generic_reload( for (j = 0; j < num_points % ROTATOR_RELOAD; j++) { - tmp16 = *in_common++; //if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); + tmp16 = *in_common++; //if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase); tmp16 = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32))); (*phase) *= phase_inc; for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - lv_16sc_t tmp = tmp16 * in_a[n_vec][ (num_points / ROTATOR_RELOAD) * ROTATOR_RELOAD + j ]; + lv_16sc_t tmp = tmp16 * in_a[n_vec][(num_points / ROTATOR_RELOAD) * ROTATOR_RELOAD + j]; //lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n])))); result[n_vec] = lv_cmake(sat_adds16i(lv_creal(result[n_vec]), lv_creal(tmp)), sat_adds16i(lv_cimag(result[n_vec]), lv_cimag(tmp))); } @@ -178,9 +178,9 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_generic_reload( #ifdef LV_HAVE_SSE3 #include -static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points) +static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points) { - lv_16sc_t dotProduct = lv_cmake(0,0); + lv_16sc_t dotProduct = lv_cmake(0, 0); const unsigned int sse_iters = num_points / 4; int n_vec; @@ -191,7 +191,8 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc_ const lv_16sc_t* _in_common = in_common; lv_16sc_t* _out = result; - __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) + lv_16sc_t dotProductVector[4]; __m128i* realcacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment()); __m128i* imagcacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment()); @@ -210,11 +211,13 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc_ // phase rotation registers __m128 pa, pb, two_phase_acc_reg, two_phase_inc_reg; __m128i pc1, pc2; - __VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_inc[2]; + __VOLK_ATTR_ALIGNED(16) + lv_32fc_t two_phase_inc[2]; two_phase_inc[0] = phase_inc * phase_inc; two_phase_inc[1] = phase_inc * phase_inc; - two_phase_inc_reg = _mm_load_ps((float*) two_phase_inc); - __VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_acc[2]; + two_phase_inc_reg = _mm_load_ps((float*)two_phase_inc); + 
__VOLK_ATTR_ALIGNED(16) + lv_32fc_t two_phase_acc[2]; two_phase_acc[0] = (*phase); two_phase_acc[1] = (*phase) * phase_inc; two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc); @@ -222,69 +225,69 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc_ lv_16sc_t tmp16; lv_32fc_t tmp32; - for(number = 0; number < sse_iters; number++) + for (number = 0; number < sse_iters; number++) { // Phase rotation on operand in_common starts here: //printf("generic phase %i: %f,%f\n", n*4,lv_creal(*phase),lv_cimag(*phase)); - pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg //complex 32fc multiplication b=a*two_phase_acc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - pc1 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + pc1 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di //next two samples _in_common += 2; - pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg 
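The two_phase_inc/two_phase_acc pair set up above is the rotator's vectorization trick: one 128-bit register holds the phases for two consecutive samples, {phase, phase*phase_inc}, and both lanes are advanced by phase_inc squared per step, so a single complex multiply rotates two samples. The same bookkeeping in scalar form (an illustrative sketch, assuming C99 complex arithmetic):

    #include <complex.h>

    /* Rotate two consecutive samples per call; *phase plays the role of
     * two_phase_acc[0] and phase_inc*phase_inc that of two_phase_inc_reg. */
    static void rotate_pair(const float complex in[2], float complex out[2],
                            float complex* phase, float complex phase_inc)
    {
        out[0] = in[0] * (*phase);
        out[1] = in[1] * (*phase) * phase_inc;
        *phase *= phase_inc * phase_inc; /* jump two samples at once */
    }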
__VOLK_GNSSSDR_PREFETCH(_in_common + 8); //complex 32fc multiplication b=a*two_phase_acc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - pc2 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + pc2 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di // store four rotated in_common samples in the register b - b = _mm_packs_epi32(pc1, pc2);// convert from 32ic to 16ic + b = _mm_packs_epi32(pc1, pc2); // convert from 32ic to 16ic //next two samples _in_common += 2; for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - a = _mm_load_si128((__m128i*)&(_in_a[n_vec][number*4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg + a = _mm_load_si128((__m128i*)&(_in_a[n_vec][number * 4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg - c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... + c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... - c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. + c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. real = _mm_subs_epi16(c, c_sr); - b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... - a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... + b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... + a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... - imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... - imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... + imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... + imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... 
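The moveldup/movehdup/shuffle(0xB1)/addsub stanza repeated throughout these hunks is the standard SSE3 multiply of two interleaved complex-float pairs. Isolated as a self-contained helper (a sketch, not the library's API):

    #include <pmmintrin.h> /* SSE3 */

    /* x = {ar, ai, br, bi}, y = {cr, ci, dr, di}; returns x*y pair-wise:
     * {ar*cr - ai*ci, ai*cr + ar*ci, br*dr - bi*di, bi*dr + br*di}. */
    static __m128 cmul_ps_sse3(__m128 x, __m128 y)
    {
        __m128 yl = _mm_moveldup_ps(y);         /* cr, cr, dr, dr */
        __m128 yh = _mm_movehdup_ps(y);         /* ci, ci, di, di */
        __m128 t1 = _mm_mul_ps(x, yl);          /* ar*cr, ai*cr, br*dr, bi*dr */
        __m128 xs = _mm_shuffle_ps(x, x, 0xB1); /* ai, ar, bi, br */
        __m128 t2 = _mm_mul_ps(xs, yh);         /* ai*ci, ar*ci, bi*di, br*di */
        return _mm_addsub_ps(t1, t2);           /* subtract in even lanes, add in odd */
    }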
imag = _mm_adds_epi16(imag1, imag2); @@ -309,12 +312,12 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc_ a = _mm_or_si128(realcacc[n_vec], imagcacc[n_vec]); - _mm_store_si128((__m128i*)dotProductVector, a); // Store the results back into the dot product vector - dotProduct = lv_cmake(0,0); + _mm_store_si128((__m128i*)dotProductVector, a); // Store the results back into the dot product vector + dotProduct = lv_cmake(0, 0); for (i = 0; i < 4; ++i) { dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[i])), - sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i]))); + sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i]))); } _out[n_vec] = dotProduct; } @@ -331,7 +334,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc_ //(*phase) = lv_cmake((float*)two_phase_acc[0], (float*)two_phase_acc[1]); (*phase) = two_phase_acc[0]; - for(n = sse_iters * 4; n < num_points; n++) + for (n = sse_iters * 4; n < num_points; n++) { tmp16 = in_common[n]; //printf("a_sse phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase); @@ -343,7 +346,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc_ lv_16sc_t tmp = tmp16 * in_a[n_vec][n]; //lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n])))); _out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), - sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); + sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); } } } @@ -353,9 +356,9 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc_ #ifdef LV_HAVE_SSE3 #include -static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points) +static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points) { - lv_16sc_t dotProduct = lv_cmake(0,0); + lv_16sc_t dotProduct = lv_cmake(0, 0); const unsigned int sse_iters = num_points / 4; const unsigned int ROTATOR_RELOAD = 128; @@ -369,7 +372,8 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(l const lv_16sc_t* _in_common = in_common; lv_16sc_t* _out = result; - __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) + lv_16sc_t dotProductVector[4]; __m128i* realcacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment()); __m128i* imagcacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment()); @@ -388,11 +392,13 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(l // phase rotation registers __m128 pa, pb, two_phase_acc_reg, two_phase_inc_reg; __m128i pc1, pc2; - __VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_inc[2]; + __VOLK_ATTR_ALIGNED(16) + lv_32fc_t two_phase_inc[2]; two_phase_inc[0] = phase_inc * phase_inc; two_phase_inc[1] = phase_inc * phase_inc; - two_phase_inc_reg = 
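After the accumulation loop, each kernel stores its packed accumulator to dotProductVector and folds the four partial sums with saturating adds, so the horizontal reduction clamps exactly like the vector body did. A self-contained sketch of that fold; sat_adds16i itself lives in this module's saturation_arithmetic.h, and a local stand-in keeps the example compilable:

    #include <stdint.h>

    static int16_t sat_adds16i_sketch(int16_t x, int16_t y)
    {
        const int32_t sum = (int32_t)x + (int32_t)y;
        if (sum > INT16_MAX) return INT16_MAX;
        if (sum < INT16_MIN) return INT16_MIN;
        return (int16_t)sum;
    }

    typedef struct { int16_t r, i; } sc16_sketch;

    /* Fold four packed partial sums into one complex result. */
    static sc16_sketch fold_dot_product(const sc16_sketch v[4])
    {
        sc16_sketch acc = {0, 0};
        int k;
        for (k = 0; k < 4; k++)
            {
                acc.r = sat_adds16i_sketch(acc.r, v[k].r);
                acc.i = sat_adds16i_sketch(acc.i, v[k].i);
            }
        return acc;
    }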
_mm_load_ps((float*) two_phase_inc); - __VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_acc[2]; + two_phase_inc_reg = _mm_load_ps((float*)two_phase_inc); + __VOLK_ATTR_ALIGNED(16) + lv_32fc_t two_phase_acc[2]; two_phase_acc[0] = (*phase); two_phase_acc[1] = (*phase) * phase_inc; two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc); @@ -400,71 +406,71 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(l lv_16sc_t tmp16; lv_32fc_t tmp32; - for (number = 0; number < sse_iters / ROTATOR_RELOAD; ++number) + for (number = 0; number < sse_iters / ROTATOR_RELOAD; ++number) { for (j = 0; j < ROTATOR_RELOAD; j++) { // Phase rotation on operand in_common starts here: //printf("generic phase %i: %f,%f\n", n*4,lv_creal(*phase),lv_cimag(*phase)); - pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg //complex 32fc multiplication b=a*two_phase_acc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - pc1 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + pc1 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di //next two samples _in_common += 2; - pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + pa = 
_mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg __VOLK_GNSSSDR_PREFETCH(_in_common + 8); //complex 32fc multiplication b=a*two_phase_acc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - pc2 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + pc2 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di // store four rotated in_common samples in the register b - b = _mm_packs_epi32(pc1, pc2);// convert from 32ic to 16ic + b = _mm_packs_epi32(pc1, pc2); // convert from 32ic to 16ic //next two samples _in_common += 2; for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - a = _mm_load_si128((__m128i*)&(_in_a[n_vec][(number * ROTATOR_RELOAD + j) * 4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg + a = _mm_load_si128((__m128i*)&(_in_a[n_vec][(number * ROTATOR_RELOAD + j) * 4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg - c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... + c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... - c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. + c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. real = _mm_subs_epi16(c, c_sr); - b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... - a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... + b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... + a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... - imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... 
- imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... + imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... + imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... imag = _mm_adds_epi16(imag1, imag2); @@ -482,65 +488,65 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(l for (j = 0; j < sse_iters % ROTATOR_RELOAD; j++) { - pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg //complex 32fc multiplication b=a*two_phase_acc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - pc1 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + pc1 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di //next two samples _in_common += 2; - pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg __VOLK_GNSSSDR_PREFETCH(_in_common + 8); //complex 32fc multiplication b=a*two_phase_acc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh 
with ci,ci,di,di - tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - pc2 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + pc2 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di // store four rotated in_common samples in the register b - b = _mm_packs_epi32(pc1, pc2);// convert from 32ic to 16ic + b = _mm_packs_epi32(pc1, pc2); // convert from 32ic to 16ic //next two samples _in_common += 2; for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - a = _mm_load_si128((__m128i*)&(_in_a[n_vec][((sse_iters / ROTATOR_RELOAD) * ROTATOR_RELOAD + j) * 4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg + a = _mm_load_si128((__m128i*)&(_in_a[n_vec][((sse_iters / ROTATOR_RELOAD) * ROTATOR_RELOAD + j) * 4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg - c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... + c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... - c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. + c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. real = _mm_subs_epi16(c, c_sr); - b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... - a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... + b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... + a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... - imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... - imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... + imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... + imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... 
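The _reload variants exist because the rotator is advanced by repeated finite-precision multiplies, which slowly perturb its magnitude; every ROTATOR_RELOAD iterations the phase is renormalized to unit modulus ((*phase) /= |*phase|, the "Regenerate phase" step) so amplitude drift cannot accumulate over long integrations. The same pattern in scalar form (a sketch, assuming C99 complex):

    #include <complex.h>

    /* Rotate n samples, renormalizing the rotator once per 'reload' block
     * so |*phase| stays at 1.0 despite rounding. */
    static void rotate_with_reload(const float complex* in, float complex* out,
                                   unsigned int n, float complex* phase,
                                   float complex phase_inc, unsigned int reload)
    {
        unsigned int k;
        for (k = 0; k < n; k++)
            {
                out[k] = in[k] * (*phase);
                (*phase) *= phase_inc;
                if ((k + 1) % reload == 0)
                    {
                        (*phase) /= cabsf(*phase); /* back to unit magnitude */
                    }
            }
    }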
imag = _mm_adds_epi16(imag1, imag2); @@ -556,12 +562,12 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(l a = _mm_or_si128(realcacc[n_vec], imagcacc[n_vec]); - _mm_store_si128((__m128i*)dotProductVector, a); // Store the results back into the dot product vector - dotProduct = lv_cmake(0,0); + _mm_store_si128((__m128i*)dotProductVector, a); // Store the results back into the dot product vector + dotProduct = lv_cmake(0, 0); for (i = 0; i < 4; ++i) { dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[i])), - sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i]))); + sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i]))); } _out[n_vec] = dotProduct; } @@ -579,7 +585,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(l //(*phase) = lv_cmake((float*)two_phase_acc[0], (float*)two_phase_acc[1]); (*phase) = two_phase_acc[0]; - for(n = sse_iters * 4; n < num_points; n++) + for (n = sse_iters * 4; n < num_points; n++) { tmp16 = in_common[n]; //printf("a_sse phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase); @@ -591,10 +597,9 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(l lv_16sc_t tmp = tmp16 * in_a[n_vec][n]; //lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n])))); _out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), - sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); + sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); } } - } #endif /* LV_HAVE_SSE3 */ @@ -602,9 +607,9 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(l #ifdef LV_HAVE_SSE3 #include -static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points) +static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points) { - lv_16sc_t dotProduct = lv_cmake(0,0); + lv_16sc_t dotProduct = lv_cmake(0, 0); const unsigned int sse_iters = num_points / 4; int n_vec; @@ -615,7 +620,8 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc_ const lv_16sc_t* _in_common = in_common; lv_16sc_t* _out = result; - __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) + lv_16sc_t dotProductVector[4]; __m128i* realcacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment()); __m128i* imagcacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment()); @@ -634,11 +640,13 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc_ // phase rotation registers __m128 pa, pb, two_phase_acc_reg, two_phase_inc_reg; __m128i pc1, pc2; - __VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_inc[2]; + __VOLK_ATTR_ALIGNED(16) + lv_32fc_t two_phase_inc[2]; two_phase_inc[0] = phase_inc * phase_inc; two_phase_inc[1] = phase_inc * phase_inc; - two_phase_inc_reg = _mm_loadu_ps((float*) 
two_phase_inc); - __VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_acc[2]; + two_phase_inc_reg = _mm_loadu_ps((float*)two_phase_inc); + __VOLK_ATTR_ALIGNED(16) + lv_32fc_t two_phase_acc[2]; two_phase_acc[0] = (*phase); two_phase_acc[1] = (*phase) * phase_inc; two_phase_acc_reg = _mm_loadu_ps((float*)two_phase_acc); @@ -646,69 +654,69 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc_ lv_16sc_t tmp16; lv_32fc_t tmp32; - for(number = 0; number < sse_iters; number++) + for (number = 0; number < sse_iters; number++) { // Phase rotation on operand in_common starts here: - pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg __VOLK_GNSSSDR_PREFETCH(_in_common + 8); //complex 32fc multiplication b=a*two_phase_acc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - pc1 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + pc1 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di //next two samples _in_common += 2; - pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), 
(float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg __VOLK_GNSSSDR_PREFETCH(_in_common + 8); //complex 32fc multiplication b=a*two_phase_acc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - pc2 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + pc2 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di // store four rotated in_common samples in the register b - b = _mm_packs_epi32(pc1, pc2);// convert from 32ic to 16ic + b = _mm_packs_epi32(pc1, pc2); // convert from 32ic to 16ic //next two samples _in_common += 2; for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - a = _mm_loadu_si128((__m128i*)&(_in_a[n_vec][number*4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg + a = _mm_loadu_si128((__m128i*)&(_in_a[n_vec][number * 4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg - c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... + c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... - c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. + c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. real = _mm_subs_epi16(c, c_sr); - b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... - a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... + b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... + a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... - imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... - imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... + imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... + imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... 
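Every SIMD variant finishes the same way, visible just below: the packed phase accumulator is written back so the caller-visible *phase matches what the generic kernel would have produced, and the num_points % 4 leftover samples run through plain C. A condensed sketch of that epilogue, simplified here to float complex (the kernels do the per-vector accumulation in saturating 16-bit arithmetic, as shown):

    #include <complex.h>

    /* Finish the remainder after the vector loop consumed 4*iters samples. */
    static void scalar_tail(const float complex* in, float complex* acc,
                            unsigned int iters, unsigned int num_points,
                            float complex* phase, float complex phase_inc)
    {
        unsigned int n;
        for (n = iters * 4; n < num_points; n++)
            {
                *acc += in[n] * (*phase); /* same math as the SIMD body */
                (*phase) *= phase_inc;    /* keep the rotator in step */
            }
    }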
imag = _mm_adds_epi16(imag1, imag2); @@ -733,12 +741,12 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc_ a = _mm_or_si128(realcacc[n_vec], imagcacc[n_vec]); - _mm_store_si128((__m128i*)dotProductVector, a); // Store the results back into the dot product vector - dotProduct = lv_cmake(0,0); + _mm_store_si128((__m128i*)dotProductVector, a); // Store the results back into the dot product vector + dotProduct = lv_cmake(0, 0); for (j = 0; j < 4; ++j) { dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[j])), - sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[j]))); + sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[j]))); } _out[n_vec] = dotProduct; } @@ -748,7 +756,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc_ _mm_store_ps((float*)two_phase_acc, two_phase_acc_reg); (*phase) = two_phase_acc[0]; - for(n = sse_iters * 4; n < num_points; n++) + for (n = sse_iters * 4; n < num_points; n++) { tmp16 = in_common[n]; tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase); @@ -758,7 +766,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc_ { lv_16sc_t tmp = tmp16 * in_a[n_vec][n]; _out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), - sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); + sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); } } } @@ -768,7 +776,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc_ #ifdef LV_HAVE_AVX2 #include -static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points) +static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points) { const unsigned int avx2_iters = num_points / 8; const lv_16sc_t** _in_a = in_a; @@ -781,8 +789,9 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc_ lv_16sc_t tmp16; lv_32fc_t tmp32; - __VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8]; - lv_16sc_t dotProduct = lv_cmake(0,0); + __VOLK_ATTR_ALIGNED(32) + lv_16sc_t dotProductVector[8]; + lv_16sc_t dotProduct = lv_cmake(0, 0); __m256i* realcacc = (__m256i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256i), volk_gnsssdr_get_alignment()); __m256i* imagcacc = (__m256i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256i), volk_gnsssdr_get_alignment()); @@ -798,104 +807,106 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc_ __m128 a, b, two_phase_acc_reg, two_phase_inc_reg; __m128i c1, c2, result1, result2; - __VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_inc[2]; + __VOLK_ATTR_ALIGNED(16) + lv_32fc_t two_phase_inc[2]; two_phase_inc[0] = phase_inc * phase_inc; two_phase_inc[1] = phase_inc * phase_inc; - two_phase_inc_reg = _mm_load_ps((float*) two_phase_inc); - __VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_acc[2]; + two_phase_inc_reg = _mm_load_ps((float*)two_phase_inc); + __VOLK_ATTR_ALIGNED(16) + lv_32fc_t two_phase_acc[2]; two_phase_acc[0] = (*phase); two_phase_acc[1] = (*phase) * phase_inc; - two_phase_acc_reg = _mm_load_ps((float*) two_phase_acc); + two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc); __m256i a2, b2, c, c_sr, real, imag; __m128 yl, yh, tmp1, tmp2, 
tmp3; - for(number = 0; number < avx2_iters; number++) + for (number = 0; number < avx2_iters; number++) { - a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg //complex 32fc multiplication b=a*two_phase_acc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di //next two samples _in_common += 2; - a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg //complex 32fc multiplication b=a*two_phase_acc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - c2 = 
_mm_cvtps_epi32(b); // convert from 32fc to 32ic + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di // store four output samples - result1 = _mm_packs_epi32(c1, c2);// convert from 32ic to 16ic + result1 = _mm_packs_epi32(c1, c2); // convert from 32ic to 16ic _in_common += 2; - a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg //complex 32fc multiplication b=a*two_phase_acc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp3 = _mm_shuffle_ps(two_phase_inc_reg, 
two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di //next two samples _in_common += 2; - a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg __VOLK_GNSSSDR_PREFETCH(_in_common + 16); //complex 32fc multiplication b=a*two_phase_acc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di // store four output samples - result2 = _mm_packs_epi32(c1, c2);// convert from 32ic to 16ic + result2 = _mm_packs_epi32(c1, c2); // convert from 32ic to 16ic _in_common += 2; b2 = _mm256_insertf128_si256(_mm256_castsi128_si256(result1), (result2), 1); for 
(n_vec = 0; n_vec < num_a_vectors; n_vec++) @@ -904,7 +915,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc_ c = _mm256_mullo_epi16(a2, b2); - c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. + c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. real = _mm256_subs_epi16(c, c_sr); c_sr = _mm256_slli_si256(b2, 2); @@ -936,12 +947,12 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc_ a2 = _mm256_or_si256(realcacc[n_vec], imagcacc[n_vec]); - _mm256_store_si256((__m256i*)dotProductVector, a2); // Store the results back into the dot product vector - dotProduct = lv_cmake(0,0); + _mm256_store_si256((__m256i*)dotProductVector, a2); // Store the results back into the dot product vector + dotProduct = lv_cmake(0, 0); for (number = 0; number < 8; ++number) { dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])), - sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number]))); + sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number]))); } _out[n_vec] = dotProduct; } @@ -953,7 +964,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc_ _mm_store_ps((float*)two_phase_acc, two_phase_acc_reg); (*phase) = two_phase_acc[0]; - for(n = avx2_iters * 8; n < num_points; n++) + for (n = avx2_iters * 8; n < num_points; n++) { tmp16 = in_common[n]; tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase); @@ -963,10 +974,9 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc_ { lv_16sc_t tmp = tmp16 * in_a[n_vec][n]; _out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), - sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); + sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); } } - } #endif /* LV_HAVE_AVX2 */ @@ -974,7 +984,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc_ #ifdef LV_HAVE_AVX2 #include -static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points) +static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points) { const unsigned int avx2_iters = num_points / 8; const unsigned int ROTATOR_RELOAD = 128; @@ -989,8 +999,9 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l lv_16sc_t tmp16; lv_32fc_t tmp32; - __VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8]; - lv_16sc_t dotProduct = lv_cmake(0,0); + __VOLK_ATTR_ALIGNED(32) + lv_16sc_t dotProductVector[8]; + lv_16sc_t dotProduct = lv_cmake(0, 0); __m256i* realcacc = (__m256i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256i), volk_gnsssdr_get_alignment()); __m256i* imagcacc = (__m256i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256i), volk_gnsssdr_get_alignment()); @@ -1006,106 +1017,108 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l __m128 a, b, two_phase_acc_reg, two_phase_inc_reg; __m128i c1, c2, result1, result2; - __VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_inc[2]; + __VOLK_ATTR_ALIGNED(16) + lv_32fc_t two_phase_inc[2]; two_phase_inc[0] 
= phase_inc * phase_inc; two_phase_inc[1] = phase_inc * phase_inc; - two_phase_inc_reg = _mm_load_ps((float*) two_phase_inc); - __VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_acc[2]; + two_phase_inc_reg = _mm_load_ps((float*)two_phase_inc); + __VOLK_ATTR_ALIGNED(16) + lv_32fc_t two_phase_acc[2]; two_phase_acc[0] = (*phase); two_phase_acc[1] = (*phase) * phase_inc; - two_phase_acc_reg = _mm_load_ps((float*) two_phase_acc); + two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc); __m256i a2, b2, c, c_sr, real, imag; __m128 yl, yh, tmp1, tmp2, tmp3; - for (number = 0; number < avx2_iters / ROTATOR_RELOAD; ++number) + for (number = 0; number < avx2_iters / ROTATOR_RELOAD; ++number) { for (j = 0; j < ROTATOR_RELOAD; j++) { - a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg //complex 32fc multiplication b=a*two_phase_acc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di //next two samples _in_common += 2; - a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + a = _mm_set_ps((float)(lv_cimag(_in_common[1])), 
(float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg //complex 32fc multiplication b=a*two_phase_acc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di // store four output samples - result1 = _mm_packs_epi32(c1, c2);// convert from 32ic to 16ic + result1 = _mm_packs_epi32(c1, c2); // convert from 32ic to 16ic _in_common += 2; - a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg //complex 32fc multiplication b=a*two_phase_acc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(a, yl); // tmp1 = 
ar*cr,ai*cr,br*dr,bi*dr + a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di //next two samples _in_common += 2; - a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg __VOLK_GNSSSDR_PREFETCH(_in_common + 16); //complex 32fc multiplication b=a*two_phase_acc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = 
_mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di // store four output samples - result2 = _mm_packs_epi32(c1, c2);// convert from 32ic to 16ic + result2 = _mm_packs_epi32(c1, c2); // convert from 32ic to 16ic _in_common += 2; b2 = _mm256_insertf128_si256(_mm256_castsi128_si256(result1), (result2), 1); for (n_vec = 0; n_vec < num_a_vectors; n_vec++) @@ -1114,7 +1127,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l c = _mm256_mullo_epi16(a2, b2); - c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. + c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. real = _mm256_subs_epi16(c, c_sr); c_sr = _mm256_slli_si256(b2, 2); @@ -1139,98 +1152,98 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l for (j = 0; j < avx2_iters % ROTATOR_RELOAD; j++) { - a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg //complex 32fc multiplication b=a*two_phase_acc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = 
_mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di //next two samples _in_common += 2; - a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg //complex 32fc multiplication b=a*two_phase_acc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di // store four output samples - result1 = _mm_packs_epi32(c1, c2);// convert from 32ic to 16ic + result1 = _mm_packs_epi32(c1, c2); // convert from 32ic to 16ic _in_common += 2; - a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg //complex 32fc multiplication b=a*two_phase_acc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = 
_mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di //next two samples _in_common += 2; - a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg __VOLK_GNSSSDR_PREFETCH(_in_common + 16); //complex 32fc multiplication b=a*two_phase_acc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - yl = 
_mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di // store four output samples - result2 = _mm_packs_epi32(c1, c2);// convert from 32ic to 16ic + result2 = _mm_packs_epi32(c1, c2); // convert from 32ic to 16ic _in_common += 2; b2 = _mm256_insertf128_si256(_mm256_castsi128_si256(result1), (result2), 1); for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - a2 = _mm256_load_si256((__m256i*)&(_in_a[n_vec][((avx2_iters / ROTATOR_RELOAD) * ROTATOR_RELOAD + j) * 8])); + a2 = _mm256_load_si256((__m256i*)&(_in_a[n_vec][((avx2_iters / ROTATOR_RELOAD) * ROTATOR_RELOAD + j) * 8])); c = _mm256_mullo_epi16(a2, b2); - c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. + c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. real = _mm256_subs_epi16(c, c_sr); c_sr = _mm256_slli_si256(b2, 2); @@ -1253,12 +1266,12 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l a2 = _mm256_or_si256(realcacc[n_vec], imagcacc[n_vec]); - _mm256_store_si256((__m256i*)dotProductVector, a2); // Store the results back into the dot product vector - dotProduct = lv_cmake(0,0); + _mm256_store_si256((__m256i*)dotProductVector, a2); // Store the results back into the dot product vector + dotProduct = lv_cmake(0, 0); for (j = 0; j < 8; ++j) { dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[j])), - sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[j]))); + sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[j]))); } _out[n_vec] = dotProduct; } @@ -1269,7 +1282,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l _mm_store_ps((float*)two_phase_acc, two_phase_acc_reg); (*phase) = two_phase_acc[0]; - for(n = avx2_iters * 8; n < num_points; n++) + for (n = avx2_iters * 8; n < num_points; n++) { tmp16 = in_common[n]; tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase); @@ -1279,7 +1292,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l { lv_16sc_t tmp = tmp16 * in_a[n_vec][n]; _out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), - sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); + sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); } } } @@ -1290,7 +1303,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l #ifdef LV_HAVE_NEON #include -static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, 
lv_32fc_t* phase, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points) +static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points) { const unsigned int neon_iters = num_points / 4; @@ -1306,14 +1319,16 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon(lv_16sc_t* if (neon_iters > 0) { - lv_16sc_t dotProduct = lv_cmake(0,0); + lv_16sc_t dotProduct = lv_cmake(0, 0); float arg_phase0 = cargf(*phase); float arg_phase_inc = cargf(phase_inc); float phase_est; lv_32fc_t ___phase4 = phase_inc * phase_inc * phase_inc * phase_inc; - __VOLK_ATTR_ALIGNED(16) float32_t __phase4_real[4] = { lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4) }; - __VOLK_ATTR_ALIGNED(16) float32_t __phase4_imag[4] = { lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4) }; + __VOLK_ATTR_ALIGNED(16) + float32_t __phase4_real[4] = {lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4)}; + __VOLK_ATTR_ALIGNED(16) + float32_t __phase4_imag[4] = {lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4)}; float32x4_t _phase4_real = vld1q_f32(__phase4_real); float32x4_t _phase4_imag = vld1q_f32(__phase4_imag); @@ -1322,14 +1337,17 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon(lv_16sc_t* lv_32fc_t phase3 = phase2 * phase_inc; lv_32fc_t phase4 = phase3 * phase_inc; - __VOLK_ATTR_ALIGNED(16) float32_t __phase_real[4] = { lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) }; - __VOLK_ATTR_ALIGNED(16) float32_t __phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; + __VOLK_ATTR_ALIGNED(16) + float32_t __phase_real[4] = {lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4)}; + __VOLK_ATTR_ALIGNED(16) + float32_t __phase_imag[4] = {lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4)}; float32x4_t _phase_real = vld1q_f32(__phase_real); float32x4_t _phase_imag = vld1q_f32(__phase_imag); int16x4x2_t a_val, b_val, c_val; - __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) + lv_16sc_t dotProductVector[4]; float32x4_t half = vdupq_n_f32(0.5f); int16x4x2_t tmp16; int32x4x2_t tmp32i; @@ -1339,13 +1357,13 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon(lv_16sc_t* int16x4x2_t* accumulator = (int16x4x2_t*)volk_gnsssdr_malloc(num_a_vectors * sizeof(int16x4x2_t), volk_gnsssdr_get_alignment()); - for(n_vec = 0; n_vec < num_a_vectors; n_vec++) + for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { accumulator[n_vec].val[0] = vdup_n_s16(0); accumulator[n_vec].val[1] = vdup_n_s16(0); } - for(number = 0; number < neon_iters; number++) + for (number = 0; number < neon_iters; number++) { /* load 4 complex numbers (int 16 bits each component) */ tmp16 = vld2_s16((int16_t*)_in_common); @@ -1396,7 +1414,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon(lv_16sc_t* for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - a_val = vld2_s16((int16_t*)&(_in_a[n_vec][number*4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg + a_val = vld2_s16((int16_t*)&(_in_a[n_vec][number * 4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg //__VOLK_GNSSSDR_PREFETCH(&_in_a[n_vec][number*4] + 8); // multiply the real*real and imag*imag to 
get real result @@ -1426,8 +1444,10 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon(lv_16sc_t* phase3 = phase2 * phase_inc; phase4 = phase3 * phase_inc; - __VOLK_ATTR_ALIGNED(16) float32_t ____phase_real[4] = { lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) }; - __VOLK_ATTR_ALIGNED(16) float32_t ____phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; + __VOLK_ATTR_ALIGNED(16) + float32_t ____phase_real[4] = {lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4)}; + __VOLK_ATTR_ALIGNED(16) + float32_t ____phase_imag[4] = {lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4)}; _phase_real = vld1q_f32(____phase_real); _phase_imag = vld1q_f32(____phase_imag); @@ -1436,12 +1456,12 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon(lv_16sc_t* for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - vst2_s16((int16_t*)dotProductVector, accumulator[n_vec]); // Store the results back into the dot product vector - dotProduct = lv_cmake(0,0); + vst2_s16((int16_t*)dotProductVector, accumulator[n_vec]); // Store the results back into the dot product vector + dotProduct = lv_cmake(0, 0); for (i = 0; i < 4; ++i) { dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[i])), - sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i]))); + sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i]))); } _out[n_vec] = dotProduct; } @@ -1473,7 +1493,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon(lv_16sc_t* #include #include -static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_vma(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points) +static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_vma(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points) { const unsigned int neon_iters = num_points / 4; @@ -1489,14 +1509,16 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_vma(lv_16s if (neon_iters > 0) { - lv_16sc_t dotProduct = lv_cmake(0,0); + lv_16sc_t dotProduct = lv_cmake(0, 0); float arg_phase0 = cargf(*phase); float arg_phase_inc = cargf(phase_inc); float phase_est; //printf("arg phase0: %f", arg_phase0); lv_32fc_t ___phase4 = phase_inc * phase_inc * phase_inc * phase_inc; - __VOLK_ATTR_ALIGNED(16) float32_t __phase4_real[4] = { lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4) }; - __VOLK_ATTR_ALIGNED(16) float32_t __phase4_imag[4] = { lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4) }; + __VOLK_ATTR_ALIGNED(16) + float32_t __phase4_real[4] = {lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4)}; + __VOLK_ATTR_ALIGNED(16) + float32_t __phase4_imag[4] = {lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4)}; float32x4_t _phase4_real = vld1q_f32(__phase4_real); float32x4_t _phase4_imag = vld1q_f32(__phase4_imag); @@ -1505,14 +1527,17 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_vma(lv_16s lv_32fc_t phase3 = phase2 * phase_inc; lv_32fc_t phase4 = phase3 * phase_inc; - __VOLK_ATTR_ALIGNED(16) float32_t __phase_real[4] = { lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), 
lv_creal(phase4) }; - __VOLK_ATTR_ALIGNED(16) float32_t __phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; + __VOLK_ATTR_ALIGNED(16) + float32_t __phase_real[4] = {lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4)}; + __VOLK_ATTR_ALIGNED(16) + float32_t __phase_imag[4] = {lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4)}; float32x4_t _phase_real = vld1q_f32(__phase_real); float32x4_t _phase_imag = vld1q_f32(__phase_imag); int16x4x2_t a_val, b_val; - __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) + lv_16sc_t dotProductVector[4]; float32x4_t half = vdupq_n_f32(0.5f); int16x4x2_t tmp16; int32x4x2_t tmp32i; @@ -1522,13 +1547,13 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_vma(lv_16s int16x4x2_t* accumulator = (int16x4x2_t*)volk_gnsssdr_malloc(num_a_vectors * sizeof(int16x4x2_t), volk_gnsssdr_get_alignment()); - for(n_vec = 0; n_vec < num_a_vectors; n_vec++) + for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { accumulator[n_vec].val[0] = vdup_n_s16(0); accumulator[n_vec].val[1] = vdup_n_s16(0); } - for(number = 0; number < neon_iters; number++) + for (number = 0; number < neon_iters; number++) { /* load 4 complex numbers (int 16 bits each component) */ tmp16 = vld2_s16((int16_t*)_in_common); @@ -1589,8 +1614,10 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_vma(lv_16s phase3 = phase2 * phase_inc; phase4 = phase3 * phase_inc; - __VOLK_ATTR_ALIGNED(16) float32_t ____phase_real[4] = { lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) }; - __VOLK_ATTR_ALIGNED(16) float32_t ____phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; + __VOLK_ATTR_ALIGNED(16) + float32_t ____phase_real[4] = {lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4)}; + __VOLK_ATTR_ALIGNED(16) + float32_t ____phase_imag[4] = {lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4)}; _phase_real = vld1q_f32(____phase_real); _phase_imag = vld1q_f32(____phase_imag); @@ -1598,19 +1625,18 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_vma(lv_16s // Round = vmulq_f32(_phase_real, _phase_real); // Round = vmlaq_f32(Round, _phase_imag, _phase_imag); // Round = vsqrtq_f32(Round);//printf("sqrt: %f \n", Round[0]); - //Round = vrsqrteq_f32(Round);printf("1/sqtr: %f \n",Round[0]); + //Round = vrsqrteq_f32(Round);printf("1/sqtr: %f \n",Round[0]); //Round = vrecpeq_f32((Round); // _phase_real = vdivq_f32(_phase_real, Round); // _phase_imag = vdivq_f32(_phase_imag, Round); //_phase_real = vmulq_f32(_phase_real, Round); //_phase_imag = vmulq_f32(_phase_imag, Round); //printf("After %i: %f,%f, %f\n\n", number, _phase_real[0], _phase_imag[0], sqrt(_phase_real[0]*_phase_real[0]+_phase_imag[0]*_phase_imag[0])); - } for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - a_val = vld2_s16((int16_t*)&(_in_a[n_vec][number*4])); + a_val = vld2_s16((int16_t*)&(_in_a[n_vec][number * 4])); b_val.val[0] = vmul_s16(a_val.val[0], tmp16.val[0]); b_val.val[1] = vmul_s16(a_val.val[1], tmp16.val[0]); @@ -1626,12 +1652,12 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_vma(lv_16s for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - vst2_s16((int16_t*)dotProductVector, accumulator[n_vec]); // Store the results back into the dot product vector - dotProduct = lv_cmake(0,0); + vst2_s16((int16_t*)dotProductVector, accumulator[n_vec]); // Store 
the results back into the dot product vector + dotProduct = lv_cmake(0, 0); for (i = 0; i < 4; ++i) { dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[i])), - sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i]))); + sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i]))); } _out[n_vec] = dotProduct; } @@ -1664,7 +1690,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_vma(lv_16s #include #include -static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_optvma(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points) +static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_optvma(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points) { const unsigned int neon_iters = num_points / 4; @@ -1680,14 +1706,16 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_optvma(lv_ if (neon_iters > 0) { - lv_16sc_t dotProduct = lv_cmake(0,0); + lv_16sc_t dotProduct = lv_cmake(0, 0); float arg_phase0 = cargf(*phase); float arg_phase_inc = cargf(phase_inc); float phase_est; lv_32fc_t ___phase4 = phase_inc * phase_inc * phase_inc * phase_inc; - __VOLK_ATTR_ALIGNED(16) float32_t __phase4_real[4] = { lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4) }; - __VOLK_ATTR_ALIGNED(16) float32_t __phase4_imag[4] = { lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4) }; + __VOLK_ATTR_ALIGNED(16) + float32_t __phase4_real[4] = {lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4)}; + __VOLK_ATTR_ALIGNED(16) + float32_t __phase4_imag[4] = {lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4)}; float32x4_t _phase4_real = vld1q_f32(__phase4_real); float32x4_t _phase4_imag = vld1q_f32(__phase4_imag); @@ -1696,14 +1724,17 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_optvma(lv_ lv_32fc_t phase3 = phase2 * phase_inc; lv_32fc_t phase4 = phase3 * phase_inc; - __VOLK_ATTR_ALIGNED(16) float32_t __phase_real[4] = { lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) }; - __VOLK_ATTR_ALIGNED(16) float32_t __phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; + __VOLK_ATTR_ALIGNED(16) + float32_t __phase_real[4] = {lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4)}; + __VOLK_ATTR_ALIGNED(16) + float32_t __phase_imag[4] = {lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4)}; float32x4_t _phase_real = vld1q_f32(__phase_real); float32x4_t _phase_imag = vld1q_f32(__phase_imag); int16x4x2_t a_val, b_val; - __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) + lv_16sc_t dotProductVector[4]; float32x4_t half = vdupq_n_f32(0.5f); int32x4x2_t tmp32i; @@ -1713,7 +1744,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_optvma(lv_ int16x4x2_t* accumulator1 = (int16x4x2_t*)volk_gnsssdr_malloc(num_a_vectors * sizeof(int16x4x2_t), volk_gnsssdr_get_alignment()); int16x4x2_t* accumulator2 = (int16x4x2_t*)volk_gnsssdr_malloc(num_a_vectors * sizeof(int16x4x2_t), volk_gnsssdr_get_alignment()); - for(n_vec = 0; n_vec < num_a_vectors; n_vec++) + for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { accumulator1[n_vec].val[0] 
= vdup_n_s16(0); accumulator1[n_vec].val[1] = vdup_n_s16(0); @@ -1721,7 +1752,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_optvma(lv_ accumulator2[n_vec].val[1] = vdup_n_s16(0); } - for(number = 0; number < neon_iters; number++) + for (number = 0; number < neon_iters; number++) { /* load 4 complex numbers (int 16 bits each component) */ b_val = vld2_s16((int16_t*)_in_common); @@ -1782,8 +1813,10 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_optvma(lv_ phase3 = phase2 * phase_inc; phase4 = phase3 * phase_inc; - __VOLK_ATTR_ALIGNED(16) float32_t ____phase_real[4] = { lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) }; - __VOLK_ATTR_ALIGNED(16) float32_t ____phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; + __VOLK_ATTR_ALIGNED(16) + float32_t ____phase_real[4] = {lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4)}; + __VOLK_ATTR_ALIGNED(16) + float32_t ____phase_imag[4] = {lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4)}; _phase_real = vld1q_f32(____phase_real); _phase_imag = vld1q_f32(____phase_imag); @@ -1791,7 +1824,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_optvma(lv_ for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - a_val = vld2_s16((int16_t*)&(_in_a[n_vec][number*4])); + a_val = vld2_s16((int16_t*)&(_in_a[n_vec][number * 4])); // use 2 accumulators to remove inter-instruction data dependencies accumulator1[n_vec].val[0] = vmla_s16(accumulator1[n_vec].val[0], a_val.val[0], b_val.val[0]); @@ -1807,12 +1840,12 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_optvma(lv_ } for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - vst2_s16((int16_t*)dotProductVector, accumulator1[n_vec]); // Store the results back into the dot product vector - dotProduct = lv_cmake(0,0); + vst2_s16((int16_t*)dotProductVector, accumulator1[n_vec]); // Store the results back into the dot product vector + dotProduct = lv_cmake(0, 0); for (i = 0; i < 4; ++i) { dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[i])), - sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i]))); + sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i]))); } _out[n_vec] = dotProduct; } @@ -1842,4 +1875,3 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_optvma(lv_ #endif /* LV_HAVE_NEON */ #endif /*INCLUDED_volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_H*/ - diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic.h index cf002bf6c..9b30bdbbd 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic.h @@ -41,7 +41,7 @@ #include #ifdef LV_HAVE_GENERIC -static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_generic(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points) +static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_generic(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points) { // phases must be normalized. 
Phase rotator expects a complex exponential input! float rem_carrier_phase_in_rad = 0.345; @@ -53,14 +53,14 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_generic(lv_ unsigned int n; int num_a_vectors = 3; lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points); } - volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_generic(result, local_code, phase_inc[0], phase,(const lv_16sc_t**) in_a, num_a_vectors, num_points); + volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_generic(result, local_code, phase_inc[0], phase, (const lv_16sc_t**)in_a, num_a_vectors, num_points); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { volk_gnsssdr_free(in_a[n]); } @@ -71,7 +71,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_generic(lv_ #ifdef LV_HAVE_GENERIC -static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_generic_reload(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points) +static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_generic_reload(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points) { // phases must be normalized. Phase rotator expects a complex exponential input! float rem_carrier_phase_in_rad = 0.345; @@ -83,14 +83,14 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_generic_rel unsigned int n; int num_a_vectors = 3; lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points); } - volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_generic_reload(result, local_code, phase_inc[0], phase,(const lv_16sc_t**) in_a, num_a_vectors, num_points); + volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_generic_reload(result, local_code, phase_inc[0], phase, (const lv_16sc_t**)in_a, num_a_vectors, num_points); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { volk_gnsssdr_free(in_a[n]); } @@ -113,22 +113,22 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_a_sse3(lv_1 unsigned int n; int num_a_vectors = 3; lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points); } - volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3(result, local_code, phase_inc[0], phase, (const lv_16sc_t**) in_a, num_a_vectors, num_points); + volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3(result, local_code, phase_inc[0], phase, (const lv_16sc_t**)in_a, num_a_vectors, num_points); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { volk_gnsssdr_free(in_a[n]); } volk_gnsssdr_free(in_a); } -#endif // SSE3 +#endif // SSE3 
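/*
 * [Editor's sketch -- not part of the patch.] Every puppet in this file feeds
 * the same rotator dot-product contract that the SIMD bodies earlier in this
 * diff implement with intrinsics. A minimal scalar reading of that contract
 * in plain C99 follows. The typedefs and the saturating helper are
 * illustrative stand-ins for lv_16sc_t, lv_32fc_t and sat_adds16i(); rounding
 * of the rotated sample may differ slightly from the _mm_cvtps_epi32()-based
 * paths, and the sketch assumes the rotated sample fits in 16 bits.
 */
#include <complex.h>
#include <limits.h>

typedef float complex fc32;          /* stand-in for lv_32fc_t */
typedef struct { short r, i; } sc16; /* stand-in for lv_16sc_t */

static short sat_add_s16(short a, short b) /* mirrors sat_adds16i() */
{
    const int s = (int)a + (int)b;
    if (s > SHRT_MAX) return SHRT_MAX;
    if (s < SHRT_MIN) return SHRT_MIN;
    return (short)s;
}

/* result[v] = sum_n (in_common[n] rotated by the running phasor) * in_a[v][n];
 * the phasor advances by phase_inc each sample and is written back through
 * *phase so calls can be chained, exactly as the puppets above rely on. */
static void rotator_dot_prod_ref(sc16* result, const sc16* in_common,
    fc32 phase_inc, fc32* phase, const sc16** in_a,
    int num_a_vectors, unsigned int num_points)
{
    int v;
    unsigned int n;
    for (v = 0; v < num_a_vectors; v++)
        {
            result[v].r = 0;
            result[v].i = 0;
        }
    for (n = 0; n < num_points; n++)
        {
            const fc32 x = (float)in_common[n].r + (float)in_common[n].i * I;
            const fc32 rot = x * (*phase); /* rotate the common sample */
            const short rr = (short)crealf(rot);
            const short ri = (short)cimagf(rot);
            (*phase) *= phase_inc; /* advance the phasor */
            for (v = 0; v < num_a_vectors; v++)
                {
                    /* 16-bit complex multiply (wrapping, as in the kernels),
                       then saturated accumulation into the dot product */
                    const short pr = (short)(rr * in_a[v][n].r - ri * in_a[v][n].i);
                    const short pi = (short)(rr * in_a[v][n].i + ri * in_a[v][n].r);
                    result[v].r = sat_add_s16(result[v].r, pr);
                    result[v].i = sat_add_s16(result[v].i, pi);
                }
        }
}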
#ifdef LV_HAVE_SSE3 @@ -144,22 +144,22 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_a_sse3_relo unsigned int n; int num_a_vectors = 3; lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points); } - volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(result, local_code, phase_inc[0], phase, (const lv_16sc_t**) in_a, num_a_vectors, num_points); + volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(result, local_code, phase_inc[0], phase, (const lv_16sc_t**)in_a, num_a_vectors, num_points); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { volk_gnsssdr_free(in_a[n]); } volk_gnsssdr_free(in_a); } -#endif // SSE3 +#endif // SSE3 #ifdef LV_HAVE_SSE3 @@ -175,22 +175,22 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_u_sse3(lv_1 unsigned int n; int num_a_vectors = 3; lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points); } - volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_u_sse3(result, local_code, phase_inc[0], phase, (const lv_16sc_t**) in_a, num_a_vectors, num_points); + volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_u_sse3(result, local_code, phase_inc[0], phase, (const lv_16sc_t**)in_a, num_a_vectors, num_points); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { volk_gnsssdr_free(in_a[n]); } volk_gnsssdr_free(in_a); } -#endif // SSE3 +#endif // SSE3 #ifdef LV_HAVE_AVX2 @@ -206,22 +206,22 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_a_avx2(lv_1 unsigned int n; int num_a_vectors = 3; lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points); } - volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(result, local_code, phase_inc[0], phase, (const lv_16sc_t**) in_a, num_a_vectors, num_points); + volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(result, local_code, phase_inc[0], phase, (const lv_16sc_t**)in_a, num_a_vectors, num_points); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { volk_gnsssdr_free(in_a[n]); } volk_gnsssdr_free(in_a); } -#endif // AVX2 +#endif // AVX2 #ifdef LV_HAVE_AVX2 @@ -237,22 +237,22 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_a_avx2_relo unsigned int n; int num_a_vectors = 3; lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, 
sizeof(lv_16sc_t) * num_points); } - volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(result, local_code, phase_inc[0], phase, (const lv_16sc_t**) in_a, num_a_vectors, num_points); + volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(result, local_code, phase_inc[0], phase, (const lv_16sc_t**)in_a, num_a_vectors, num_points); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { volk_gnsssdr_free(in_a[n]); } volk_gnsssdr_free(in_a); } -#endif // AVX2 +#endif // AVX2 #ifdef LV_HAVE_AVX2 @@ -268,22 +268,22 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_u_avx2(lv_1 unsigned int n; int num_a_vectors = 3; lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points); } - volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(result, local_code, phase_inc[0], phase, (const lv_16sc_t**) in_a, num_a_vectors, num_points); + volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(result, local_code, phase_inc[0], phase, (const lv_16sc_t**)in_a, num_a_vectors, num_points); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { volk_gnsssdr_free(in_a[n]); } volk_gnsssdr_free(in_a); } -#endif // AVX2 +#endif // AVX2 #ifdef LV_HAVE_AVX2 @@ -299,22 +299,22 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_u_avx2_relo unsigned int n; int num_a_vectors = 3; lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points); } - volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(result, local_code, phase_inc[0], phase, (const lv_16sc_t**) in_a, num_a_vectors, num_points); + volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(result, local_code, phase_inc[0], phase, (const lv_16sc_t**)in_a, num_a_vectors, num_points); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { volk_gnsssdr_free(in_a[n]); } volk_gnsssdr_free(in_a); } -#endif // AVX2 +#endif // AVX2 #ifdef LV_HAVE_NEON @@ -330,22 +330,22 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_neon(lv_16s unsigned int n; int num_a_vectors = 3; lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points); } - volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon(result, local_code, phase_inc[0], phase, (const lv_16sc_t**) in_a, num_a_vectors, num_points); + volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon(result, local_code, phase_inc[0], phase, (const lv_16sc_t**)in_a, num_a_vectors, num_points); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { volk_gnsssdr_free(in_a[n]); } volk_gnsssdr_free(in_a); } -#endif // NEON +#endif // NEON #ifdef 
LV_HAVE_NEON @@ -361,23 +361,21 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_neon_vma(lv unsigned int n; int num_a_vectors = 3; lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points); } - volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_vma(result, local_code, phase_inc[0], phase, (const lv_16sc_t**) in_a, num_a_vectors, num_points); + volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_vma(result, local_code, phase_inc[0], phase, (const lv_16sc_t**)in_a, num_a_vectors, num_points); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { volk_gnsssdr_free(in_a[n]); } volk_gnsssdr_free(in_a); } -#endif // NEON +#endif // NEON #endif // INCLUDED_volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_H - - diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_xn_resampler_16ic_xn.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_xn_resampler_16ic_xn.h index 843fa8ed2..661f4ace9 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_xn_resampler_16ic_xn.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_xn_resampler_16ic_xn.h @@ -106,7 +106,8 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_sse4_1(lv_16sc_t** r const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips); const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips); - __VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; + __VOLK_ATTR_ALIGNED(16) + int local_code_chip_index[4]; int local_code_chip_index_; const __m128i zeros = _mm_setzero_si128(); @@ -120,7 +121,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_sse4_1(lv_16sc_t** r shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]); aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); __m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f); - for(n = 0; n < quarterPoints; n++) + for (n = 0; n < quarterPoints; n++) { aux = _mm_mul_ps(code_phase_step_chips_reg, indexn); aux = _mm_add_ps(aux, aux2); @@ -138,13 +139,13 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_sse4_1(lv_16sc_t** r aux_i = _mm_and_si128(code_length_chips_reg_i, negatives); local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i); _mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg); - for(k = 0; k < 4; ++k) + for (k = 0; k < 4; ++k) { _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]]; } indexn = _mm_add_ps(indexn, fours); } - for(n = quarterPoints * 4; n < num_points; n++) + for (n = quarterPoints * 4; n < num_points; n++) { // resample code for current tap local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); @@ -156,7 +157,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_sse4_1(lv_16sc_t** r } } -#endif +#endif #ifdef LV_HAVE_SSE4_1 @@ -172,7 +173,8 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_sse4_1(lv_16sc_t** r const __m128 rem_code_phase_chips_reg = 
_mm_set_ps1(rem_code_phase_chips); const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips); - __VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; + __VOLK_ATTR_ALIGNED(16) + int local_code_chip_index[4]; int local_code_chip_index_; const __m128i zeros = _mm_setzero_si128(); @@ -186,7 +188,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_sse4_1(lv_16sc_t** r shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]); aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); __m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f); - for(n = 0; n < quarterPoints; n++) + for (n = 0; n < quarterPoints; n++) { aux = _mm_mul_ps(code_phase_step_chips_reg, indexn); aux = _mm_add_ps(aux, aux2); @@ -204,13 +206,13 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_sse4_1(lv_16sc_t** r aux_i = _mm_and_si128(code_length_chips_reg_i, negatives); local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i); _mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg); - for(k = 0; k < 4; ++k) + for (k = 0; k < 4; ++k) { _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]]; } indexn = _mm_add_ps(indexn, fours); } - for(n = quarterPoints * 4; n < num_points; n++) + for (n = quarterPoints * 4; n < num_points; n++) { // resample code for current tap local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); @@ -239,7 +241,8 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_sse3(lv_16sc_t** res const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips); const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips); - __VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; + __VOLK_ATTR_ALIGNED(16) + int local_code_chip_index[4]; int local_code_chip_index_; const __m128i zeros = _mm_setzero_si128(); @@ -253,7 +256,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_sse3(lv_16sc_t** res shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]); aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); __m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f); - for(n = 0; n < quarterPoints; n++) + for (n = 0; n < quarterPoints; n++) { aux = _mm_mul_ps(code_phase_step_chips_reg, indexn); aux = _mm_add_ps(aux, aux2); @@ -274,13 +277,13 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_sse3(lv_16sc_t** res aux_i = _mm_and_si128(code_length_chips_reg_i, negatives); local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i); _mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg); - for(k = 0; k < 4; ++k) + for (k = 0; k < 4; ++k) { _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]]; } indexn = _mm_add_ps(indexn, fours); } - for(n = quarterPoints * 4; n < num_points; n++) + for (n = quarterPoints * 4; n < num_points; n++) { // resample code for current tap local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); @@ -309,7 +312,8 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_sse3(lv_16sc_t** res const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips); const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips); - __VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; + __VOLK_ATTR_ALIGNED(16) + int local_code_chip_index[4]; int 
local_code_chip_index_; const __m128i zeros = _mm_setzero_si128(); @@ -323,7 +327,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_sse3(lv_16sc_t** res shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]); aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); __m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f); - for(n = 0; n < quarterPoints; n++) + for (n = 0; n < quarterPoints; n++) { aux = _mm_mul_ps(code_phase_step_chips_reg, indexn); aux = _mm_add_ps(aux, aux2); @@ -344,13 +348,13 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_sse3(lv_16sc_t** res aux_i = _mm_and_si128(code_length_chips_reg_i, negatives); local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i); _mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg); - for(k = 0; k < 4; ++k) + for (k = 0; k < 4; ++k) { _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]]; } indexn = _mm_add_ps(indexn, fours); } - for(n = quarterPoints * 4; n < num_points; n++) + for (n = quarterPoints * 4; n < num_points; n++) { // resample code for current tap local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); @@ -378,7 +382,8 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_avx(lv_16sc_t** resu const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips); const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips); - __VOLK_ATTR_ALIGNED(32) int local_code_chip_index[8]; + __VOLK_ATTR_ALIGNED(32) + int local_code_chip_index[8]; int local_code_chip_index_; const __m256 zeros = _mm256_setzero_ps(); @@ -393,7 +398,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_avx(lv_16sc_t** resu shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]); aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); indexn = n0; - for(n = 0; n < avx_iters; n++) + for (n = 0; n < avx_iters; n++) { __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0); __VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3); @@ -411,13 +416,13 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_avx(lv_16sc_t** resu // no negatives c = _mm256_cvtepi32_ps(local_code_chip_index_reg); - negatives = _mm256_cmp_ps(c, zeros, 0x01 ); + negatives = _mm256_cmp_ps(c, zeros, 0x01); aux3 = _mm256_and_ps(code_length_chips_reg_f, negatives); aux = _mm256_add_ps(c, aux3); local_code_chip_index_reg = _mm256_cvttps_epi32(aux); _mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg); - for(k = 0; k < 8; ++k) + for (k = 0; k < 8; ++k) { _result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]]; } @@ -427,7 +432,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_avx(lv_16sc_t** resu _mm256_zeroupper(); for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++) { - for(n = avx_iters * 8; n < num_points; n++) + for (n = avx_iters * 8; n < num_points; n++) { // resample code for current tap local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); @@ -455,7 +460,8 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_avx(lv_16sc_t** resu const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips); const __m256 code_phase_step_chips_reg = 
_mm256_set1_ps(code_phase_step_chips); - __VOLK_ATTR_ALIGNED(32) int local_code_chip_index[8]; + __VOLK_ATTR_ALIGNED(32) + int local_code_chip_index[8]; int local_code_chip_index_; const __m256 zeros = _mm256_setzero_ps(); @@ -470,7 +476,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_avx(lv_16sc_t** resu shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]); aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); indexn = n0; - for(n = 0; n < avx_iters; n++) + for (n = 0; n < avx_iters; n++) { __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0); __VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3); @@ -488,13 +494,13 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_avx(lv_16sc_t** resu // no negatives c = _mm256_cvtepi32_ps(local_code_chip_index_reg); - negatives = _mm256_cmp_ps(c, zeros, 0x01 ); + negatives = _mm256_cmp_ps(c, zeros, 0x01); aux3 = _mm256_and_ps(code_length_chips_reg_f, negatives); aux = _mm256_add_ps(c, aux3); local_code_chip_index_reg = _mm256_cvttps_epi32(aux); _mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg); - for(k = 0; k < 8; ++k) + for (k = 0; k < 8; ++k) { _result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]]; } @@ -504,7 +510,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_avx(lv_16sc_t** resu _mm256_zeroupper(); for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++) { - for(n = avx_iters * 8; n < num_points; n++) + for (n = avx_iters * 8; n < num_points; n++) { // resample code for current tap local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); @@ -530,7 +536,8 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_neon(lv_16sc_t** resul const float32x4_t rem_code_phase_chips_reg = vdupq_n_f32(rem_code_phase_chips); const float32x4_t code_phase_step_chips_reg = vdupq_n_f32(code_phase_step_chips); - __VOLK_ATTR_ALIGNED(16) int32_t local_code_chip_index[4]; + __VOLK_ATTR_ALIGNED(16) + int32_t local_code_chip_index[4]; int32_t local_code_chip_index_; const int32x4_t zeros = vdupq_n_s32(0); @@ -538,11 +545,12 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_neon(lv_16sc_t** resul const int32x4_t code_length_chips_reg_i = vdupq_n_s32((int32_t)code_length_chips); int32x4_t local_code_chip_index_reg, aux_i, negatives, i; float32x4_t aux, aux2, shifts_chips_reg, fi, c, j, cTrunc, base, indexn, reciprocal; - __VOLK_ATTR_ALIGNED(16) const float vec[4] = { 0.0f, 1.0f, 2.0f, 3.0f }; + __VOLK_ATTR_ALIGNED(16) + const float vec[4] = {0.0f, 1.0f, 2.0f, 3.0f}; uint32x4_t igx; reciprocal = vrecpeq_f32(code_length_chips_reg_f); reciprocal = vmulq_f32(vrecpsq_f32(code_length_chips_reg_f, reciprocal), reciprocal); - reciprocal = vmulq_f32(vrecpsq_f32(code_length_chips_reg_f, reciprocal), reciprocal); // this refinement is required! + reciprocal = vmulq_f32(vrecpsq_f32(code_length_chips_reg_f, reciprocal), reciprocal); // this refinement is required! 
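/*
 * Context for the "this refinement is required!" remark above: vrecpeq_f32() returns only a
 * rough (~8-bit) reciprocal estimate, so the kernel applies two Newton-Raphson steps before
 * using the reciprocal to emulate fmod() — vrecpsq_f32(x, r) computes (2 - x*r), and
 * multiplying by r gives one refinement step. A minimal scalar sketch of the same idea;
 * nr_step and fmod_via_recip are illustrative names, not part of this patch:
 */
#include <math.h>

static inline float nr_step(float x, float r)
{
    /* One Newton-Raphson refinement, r' = r * (2 - x*r),
       mirroring vmulq_f32(vrecpsq_f32(x, r), r) per lane. */
    return r * (2.0f - x * r);
}

static inline float fmod_via_recip(float x, float len, float rough_recip)
{
    float r = nr_step(len, rough_recip); /* first refinement */
    r = nr_step(len, r);                 /* second refinement: with only one, the
                                            truncated quotient below can be off by one */
    float q = truncf(x * r);             /* the kernel truncates via vcvtq_s32_f32 */
    return x - q * len;                  /* x mod len, as in aux = vsubq_f32(aux, base) */
}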
float32x4_t n0 = vld1q_f32((float*)vec); int current_correlator_tap; unsigned int n; @@ -552,7 +560,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_neon(lv_16sc_t** resul shifts_chips_reg = vdupq_n_f32((float)shifts_chips[current_correlator_tap]); aux2 = vsubq_f32(shifts_chips_reg, rem_code_phase_chips_reg); indexn = n0; - for(n = 0; n < neon_iters; n++) + for (n = 0; n < neon_iters; n++) { __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][4 * n + 3], 1, 0); __VOLK_GNSSSDR_PREFETCH(&local_code_chip_index[4]); @@ -568,7 +576,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_neon(lv_16sc_t** resul // fmod c = vmulq_f32(aux, reciprocal); - i = vcvtq_s32_f32(c); + i = vcvtq_s32_f32(c); cTrunc = vcvtq_f32_s32(i); base = vmulq_f32(cTrunc, code_length_chips_reg_f); aux = vsubq_f32(aux, base); @@ -580,13 +588,13 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_neon(lv_16sc_t** resul vst1q_s32((int32_t*)local_code_chip_index, local_code_chip_index_reg); - for(k = 0; k < 4; ++k) + for (k = 0; k < 4; ++k) { _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]]; } indexn = vaddq_f32(indexn, fours); } - for(n = neon_iters * 4; n < num_points; n++) + for (n = neon_iters * 4; n < num_points; n++) { __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][n], 1, 0); // resample code for current tap @@ -604,4 +612,3 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_neon(lv_16sc_t** resul #endif /*INCLUDED_volk_gnsssdr_16ic_xn_resampler_16ic_xn_H*/ - diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn.h index a31cba3a5..d583595a4 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn.h @@ -95,69 +95,74 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_generic(lv_16sc_t #ifdef LV_HAVE_SSE2 #include <emmintrin.h> -static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_a_sse2(lv_16sc_t** result, const lv_16sc_t* local_code, float* rem_code_phase_chips ,float code_phase_step_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_output_samples) +static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_a_sse2(lv_16sc_t** result, const lv_16sc_t* local_code, float* rem_code_phase_chips, float code_phase_step_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_output_samples) { - _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);//_MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO + _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); //_MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO unsigned int number; const unsigned int quarterPoints = num_output_samples / 4; lv_16sc_t** _result = result; - __VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; + __VOLK_ATTR_ALIGNED(16) + int local_code_chip_index[4]; float tmp_rem_code_phase_chips; - __m128 _rem_code_phase,_code_phase_step_chips; - __m128i _code_length_chips,_code_length_chips_minus1; - __m128 _code_phase_out,_code_phase_out_with_offset; + __m128 _rem_code_phase, _code_phase_step_chips; + __m128i _code_length_chips, _code_length_chips_minus1; + __m128 _code_phase_out,
_code_phase_out_with_offset; - _code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips); //load float to all four float values in m128 register - __VOLK_ATTR_ALIGNED(16) int four_times_code_length_chips_minus1[4]; + _code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips); //load float to all four float values in m128 register + __VOLK_ATTR_ALIGNED(16) + int four_times_code_length_chips_minus1[4]; four_times_code_length_chips_minus1[0] = code_length_chips - 1; four_times_code_length_chips_minus1[1] = code_length_chips - 1; four_times_code_length_chips_minus1[2] = code_length_chips - 1; four_times_code_length_chips_minus1[3] = code_length_chips - 1; - __VOLK_ATTR_ALIGNED(16) int four_times_code_length_chips[4]; + __VOLK_ATTR_ALIGNED(16) + int four_times_code_length_chips[4]; four_times_code_length_chips[0] = code_length_chips; four_times_code_length_chips[1] = code_length_chips; four_times_code_length_chips[2] = code_length_chips; four_times_code_length_chips[3] = code_length_chips; - _code_length_chips = _mm_load_si128((__m128i*)&four_times_code_length_chips); //load float to all four float values in m128 register - _code_length_chips_minus1 = _mm_load_si128((__m128i*)&four_times_code_length_chips_minus1); //load float to all four float values in m128 register + _code_length_chips = _mm_load_si128((__m128i*)&four_times_code_length_chips); //load float to all four float values in m128 register + _code_length_chips_minus1 = _mm_load_si128((__m128i*)&four_times_code_length_chips_minus1); //load float to all four float values in m128 register - __m128i negative_indexes, overflow_indexes,_code_phase_out_int, _code_phase_out_int_neg,_code_phase_out_int_over; + __m128i negative_indexes, overflow_indexes, _code_phase_out_int, _code_phase_out_int_neg, _code_phase_out_int_over; __m128i zero = _mm_setzero_si128(); - __VOLK_ATTR_ALIGNED(16) float init_idx_float[4] = { 0.0f, 1.0f, 2.0f, 3.0f }; + __VOLK_ATTR_ALIGNED(16) + float init_idx_float[4] = {0.0f, 1.0f, 2.0f, 3.0f}; __m128 _4output_index = _mm_load_ps(init_idx_float); - __VOLK_ATTR_ALIGNED(16) float init_4constant_float[4] = { 4.0f, 4.0f, 4.0f, 4.0f }; + __VOLK_ATTR_ALIGNED(16) + float init_4constant_float[4] = {4.0f, 4.0f, 4.0f, 4.0f}; __m128 _4constant_float = _mm_load_ps(init_4constant_float); int current_vector = 0; int sample_idx = 0; - for(number = 0; number < quarterPoints; number++) + for (number = 0; number < quarterPoints; number++) { //common to all outputs - _code_phase_out = _mm_mul_ps(_code_phase_step_chips, _4output_index); //compute the code phase point with the phase step + _code_phase_out = _mm_mul_ps(_code_phase_step_chips, _4output_index); //compute the code phase point with the phase step //output vector dependant (different code phase offset) - for(current_vector = 0; current_vector < num_out_vectors; current_vector++) + for (current_vector = 0; current_vector < num_out_vectors; current_vector++) { - tmp_rem_code_phase_chips = rem_code_phase_chips[current_vector] - 0.5f; // adjust offset to perform correct rounding (chip transition at 0) - _rem_code_phase = _mm_load1_ps(&tmp_rem_code_phase_chips); //load float to all four float values in m128 register + tmp_rem_code_phase_chips = rem_code_phase_chips[current_vector] - 0.5f; // adjust offset to perform correct rounding (chip transition at 0) + _rem_code_phase = _mm_load1_ps(&tmp_rem_code_phase_chips); //load float to all four float values in m128 register - _code_phase_out_with_offset = _mm_add_ps(_code_phase_out, _rem_code_phase); //add the phase offset - 
_code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset); //convert to integer + _code_phase_out_with_offset = _mm_add_ps(_code_phase_out, _rem_code_phase); //add the phase offset + _code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset); //convert to integer - negative_indexes = _mm_cmplt_epi32(_code_phase_out_int, zero); //test for negative values - _code_phase_out_int_neg = _mm_add_epi32(_code_phase_out_int, _code_length_chips); //the negative values branch - _code_phase_out_int_neg = _mm_xor_si128(_code_phase_out_int, _mm_and_si128( negative_indexes, _mm_xor_si128( _code_phase_out_int_neg, _code_phase_out_int ))); + negative_indexes = _mm_cmplt_epi32(_code_phase_out_int, zero); //test for negative values + _code_phase_out_int_neg = _mm_add_epi32(_code_phase_out_int, _code_length_chips); //the negative values branch + _code_phase_out_int_neg = _mm_xor_si128(_code_phase_out_int, _mm_and_si128(negative_indexes, _mm_xor_si128(_code_phase_out_int_neg, _code_phase_out_int))); - overflow_indexes = _mm_cmpgt_epi32(_code_phase_out_int_neg, _code_length_chips_minus1); //test for overflow values - _code_phase_out_int_over = _mm_sub_epi32(_code_phase_out_int_neg, _code_length_chips); //the negative values branch - _code_phase_out_int_over = _mm_xor_si128(_code_phase_out_int_neg, _mm_and_si128( overflow_indexes, _mm_xor_si128( _code_phase_out_int_over, _code_phase_out_int_neg ))); + overflow_indexes = _mm_cmpgt_epi32(_code_phase_out_int_neg, _code_length_chips_minus1); //test for overflow values + _code_phase_out_int_over = _mm_sub_epi32(_code_phase_out_int_neg, _code_length_chips); //the negative values branch + _code_phase_out_int_over = _mm_xor_si128(_code_phase_out_int_neg, _mm_and_si128(overflow_indexes, _mm_xor_si128(_code_phase_out_int_over, _code_phase_out_int_neg))); - _mm_store_si128((__m128i*)local_code_chip_index, _code_phase_out_int_over); // Store the results back + _mm_store_si128((__m128i*)local_code_chip_index, _code_phase_out_int_over); // Store the results back //todo: optimize the local code lookup table with intrinsics, if possible _result[current_vector][sample_idx] = local_code[local_code_chip_index[0]]; @@ -169,9 +174,9 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_a_sse2(lv_16sc_t* sample_idx += 4; } - for(number = quarterPoints * 4; number < num_output_samples; number++) + for (number = quarterPoints * 4; number < num_output_samples; number++) { - for(current_vector = 0; current_vector < num_out_vectors; current_vector++) + for (current_vector = 0; current_vector < num_out_vectors; current_vector++) { local_code_chip_index[0] = (int)(code_phase_step_chips * (float)(number) + rem_code_phase_chips[current_vector]); if (local_code_chip_index[0] < 0.0) local_code_chip_index[0] += code_length_chips - 1; @@ -186,69 +191,74 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_a_sse2(lv_16sc_t* #ifdef LV_HAVE_SSE2 #include <emmintrin.h> -static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_u_sse2(lv_16sc_t** result, const lv_16sc_t* local_code, float* rem_code_phase_chips ,float code_phase_step_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_output_samples) +static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_u_sse2(lv_16sc_t** result, const lv_16sc_t* local_code, float* rem_code_phase_chips, float code_phase_step_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_output_samples) { - _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);//_MM_ROUND_NEAREST, _MM_ROUND_DOWN,
_MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO + _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); //_MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO unsigned int number; const unsigned int quarterPoints = num_output_samples / 4; lv_16sc_t** _result = result; - __VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; + __VOLK_ATTR_ALIGNED(16) + int local_code_chip_index[4]; float tmp_rem_code_phase_chips; - __m128 _rem_code_phase,_code_phase_step_chips; - __m128i _code_length_chips,_code_length_chips_minus1; - __m128 _code_phase_out,_code_phase_out_with_offset; + __m128 _rem_code_phase, _code_phase_step_chips; + __m128i _code_length_chips, _code_length_chips_minus1; + __m128 _code_phase_out, _code_phase_out_with_offset; - _code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips); //load float to all four float values in m128 register - __VOLK_ATTR_ALIGNED(16) int four_times_code_length_chips_minus1[4]; + _code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips); //load float to all four float values in m128 register + __VOLK_ATTR_ALIGNED(16) + int four_times_code_length_chips_minus1[4]; four_times_code_length_chips_minus1[0] = code_length_chips - 1; four_times_code_length_chips_minus1[1] = code_length_chips - 1; four_times_code_length_chips_minus1[2] = code_length_chips - 1; four_times_code_length_chips_minus1[3] = code_length_chips - 1; - __VOLK_ATTR_ALIGNED(16) int four_times_code_length_chips[4]; + __VOLK_ATTR_ALIGNED(16) + int four_times_code_length_chips[4]; four_times_code_length_chips[0] = code_length_chips; four_times_code_length_chips[1] = code_length_chips; four_times_code_length_chips[2] = code_length_chips; four_times_code_length_chips[3] = code_length_chips; - _code_length_chips = _mm_loadu_si128((__m128i*)&four_times_code_length_chips); //load float to all four float values in m128 register - _code_length_chips_minus1 = _mm_loadu_si128((__m128i*)&four_times_code_length_chips_minus1); //load float to all four float values in m128 register + _code_length_chips = _mm_loadu_si128((__m128i*)&four_times_code_length_chips); //load float to all four float values in m128 register + _code_length_chips_minus1 = _mm_loadu_si128((__m128i*)&four_times_code_length_chips_minus1); //load float to all four float values in m128 register - __m128i negative_indexes, overflow_indexes,_code_phase_out_int, _code_phase_out_int_neg,_code_phase_out_int_over; + __m128i negative_indexes, overflow_indexes, _code_phase_out_int, _code_phase_out_int_neg, _code_phase_out_int_over; __m128i zero = _mm_setzero_si128(); - __VOLK_ATTR_ALIGNED(16) float init_idx_float[4] = { 0.0f, 1.0f, 2.0f, 3.0f }; + __VOLK_ATTR_ALIGNED(16) + float init_idx_float[4] = {0.0f, 1.0f, 2.0f, 3.0f}; __m128 _4output_index = _mm_loadu_ps(init_idx_float); - __VOLK_ATTR_ALIGNED(16) float init_4constant_float[4] = { 4.0f, 4.0f, 4.0f, 4.0f }; + __VOLK_ATTR_ALIGNED(16) + float init_4constant_float[4] = {4.0f, 4.0f, 4.0f, 4.0f}; __m128 _4constant_float = _mm_loadu_ps(init_4constant_float); int current_vector = 0; int sample_idx = 0; - for(number = 0; number < quarterPoints; number++) + for (number = 0; number < quarterPoints; number++) { //common to all outputs - _code_phase_out = _mm_mul_ps(_code_phase_step_chips, _4output_index); //compute the code phase point with the phase step + _code_phase_out = _mm_mul_ps(_code_phase_step_chips, _4output_index); //compute the code phase point with the phase step //output vector dependant (different code phase offset) - for(current_vector = 0; current_vector < num_out_vectors; current_vector++) + 
for (current_vector = 0; current_vector < num_out_vectors; current_vector++) { - tmp_rem_code_phase_chips = rem_code_phase_chips[current_vector] - 0.5f; // adjust offset to perform correct rounding (chip transition at 0) - _rem_code_phase = _mm_load1_ps(&tmp_rem_code_phase_chips); //load float to all four float values in m128 register + tmp_rem_code_phase_chips = rem_code_phase_chips[current_vector] - 0.5f; // adjust offset to perform correct rounding (chip transition at 0) + _rem_code_phase = _mm_load1_ps(&tmp_rem_code_phase_chips); //load float to all four float values in m128 register - _code_phase_out_with_offset = _mm_add_ps(_code_phase_out, _rem_code_phase); //add the phase offset - _code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset); //convert to integer + _code_phase_out_with_offset = _mm_add_ps(_code_phase_out, _rem_code_phase); //add the phase offset + _code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset); //convert to integer - negative_indexes = _mm_cmplt_epi32(_code_phase_out_int, zero); //test for negative values - _code_phase_out_int_neg = _mm_add_epi32(_code_phase_out_int, _code_length_chips); //the negative values branch - _code_phase_out_int_neg = _mm_xor_si128(_code_phase_out_int, _mm_and_si128( negative_indexes, _mm_xor_si128( _code_phase_out_int_neg, _code_phase_out_int ))); + negative_indexes = _mm_cmplt_epi32(_code_phase_out_int, zero); //test for negative values + _code_phase_out_int_neg = _mm_add_epi32(_code_phase_out_int, _code_length_chips); //the negative values branch + _code_phase_out_int_neg = _mm_xor_si128(_code_phase_out_int, _mm_and_si128(negative_indexes, _mm_xor_si128(_code_phase_out_int_neg, _code_phase_out_int))); - overflow_indexes = _mm_cmpgt_epi32(_code_phase_out_int_neg, _code_length_chips_minus1); //test for overflow values - _code_phase_out_int_over = _mm_sub_epi32(_code_phase_out_int_neg, _code_length_chips); //the negative values branch - _code_phase_out_int_over = _mm_xor_si128(_code_phase_out_int_neg, _mm_and_si128( overflow_indexes, _mm_xor_si128( _code_phase_out_int_over, _code_phase_out_int_neg ))); + overflow_indexes = _mm_cmpgt_epi32(_code_phase_out_int_neg, _code_length_chips_minus1); //test for overflow values + _code_phase_out_int_over = _mm_sub_epi32(_code_phase_out_int_neg, _code_length_chips); //the negative values branch + _code_phase_out_int_over = _mm_xor_si128(_code_phase_out_int_neg, _mm_and_si128(overflow_indexes, _mm_xor_si128(_code_phase_out_int_over, _code_phase_out_int_neg))); - _mm_storeu_si128((__m128i*)local_code_chip_index, _code_phase_out_int_over); // Store the results back + _mm_storeu_si128((__m128i*)local_code_chip_index, _code_phase_out_int_over); // Store the results back //todo: optimize the local code lookup table with intrinsics, if possible _result[current_vector][sample_idx] = local_code[local_code_chip_index[0]]; @@ -260,9 +270,9 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_u_sse2(lv_16sc_t* sample_idx += 4; } - for(number = quarterPoints * 4; number < num_output_samples; number++) + for (number = quarterPoints * 4; number < num_output_samples; number++) { - for(current_vector = 0; current_vector < num_out_vectors; current_vector++) + for (current_vector = 0; current_vector < num_out_vectors; current_vector++) { local_code_chip_index[0] = (int)(code_phase_step_chips * (float)(number) + rem_code_phase_chips[current_vector]); if (local_code_chip_index[0] < 0.0) local_code_chip_index[0] += code_length_chips - 1; @@ -278,74 +288,79 @@ static inline void 
volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_u_sse2(lv_16sc_t* #ifdef LV_HAVE_NEON #include <arm_neon.h> -static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_neon(lv_16sc_t** result, const lv_16sc_t* local_code, float* rem_code_phase_chips ,float code_phase_step_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_output_samples) +static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_neon(lv_16sc_t** result, const lv_16sc_t* local_code, float* rem_code_phase_chips, float code_phase_step_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_output_samples) { unsigned int number; const unsigned int quarterPoints = num_output_samples / 4; float32x4_t half = vdupq_n_f32(0.5f); lv_16sc_t** _result = result; - __VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; + __VOLK_ATTR_ALIGNED(16) + int local_code_chip_index[4]; float tmp_rem_code_phase_chips; float32x4_t _rem_code_phase, _code_phase_step_chips; int32x4_t _code_length_chips, _code_length_chips_minus1; float32x4_t _code_phase_out, _code_phase_out_with_offset; float32x4_t sign, PlusHalf, Round; - _code_phase_step_chips = vld1q_dup_f32(&code_phase_step_chips); //load float to all four float values in float32x4_t register - __VOLK_ATTR_ALIGNED(16) int four_times_code_length_chips_minus1[4]; + _code_phase_step_chips = vld1q_dup_f32(&code_phase_step_chips); //load float to all four float values in float32x4_t register + __VOLK_ATTR_ALIGNED(16) + int four_times_code_length_chips_minus1[4]; four_times_code_length_chips_minus1[0] = code_length_chips - 1; four_times_code_length_chips_minus1[1] = code_length_chips - 1; four_times_code_length_chips_minus1[2] = code_length_chips - 1; four_times_code_length_chips_minus1[3] = code_length_chips - 1; - __VOLK_ATTR_ALIGNED(16) int four_times_code_length_chips[4]; + __VOLK_ATTR_ALIGNED(16) + int four_times_code_length_chips[4]; four_times_code_length_chips[0] = code_length_chips; four_times_code_length_chips[1] = code_length_chips; four_times_code_length_chips[2] = code_length_chips; four_times_code_length_chips[3] = code_length_chips; - _code_length_chips = vld1q_s32((int32_t*)&four_times_code_length_chips); //load float to all four float values in float32x4_t register - _code_length_chips_minus1 = vld1q_s32((int32_t*)&four_times_code_length_chips_minus1); //load float to all four float values in float32x4_t register + _code_length_chips = vld1q_s32((int32_t*)&four_times_code_length_chips); //load float to all four float values in float32x4_t register + _code_length_chips_minus1 = vld1q_s32((int32_t*)&four_times_code_length_chips_minus1); //load float to all four float values in float32x4_t register - int32x4_t _code_phase_out_int, _code_phase_out_int_neg, _code_phase_out_int_over; + int32x4_t _code_phase_out_int, _code_phase_out_int_neg, _code_phase_out_int_over; uint32x4_t negative_indexes, overflow_indexes; int32x4_t zero = vmovq_n_s32(0); - __VOLK_ATTR_ALIGNED(16) float init_idx_float[4] = { 0.0f, 1.0f, 2.0f, 3.0f }; + __VOLK_ATTR_ALIGNED(16) + float init_idx_float[4] = {0.0f, 1.0f, 2.0f, 3.0f}; float32x4_t _4output_index = vld1q_f32(init_idx_float); - __VOLK_ATTR_ALIGNED(16) float init_4constant_float[4] = { 4.0f, 4.0f, 4.0f, 4.0f }; + __VOLK_ATTR_ALIGNED(16) + float init_4constant_float[4] = {4.0f, 4.0f, 4.0f, 4.0f}; float32x4_t _4constant_float = vld1q_f32(init_4constant_float); int current_vector = 0; int sample_idx = 0; - for(number = 0; number < quarterPoints; number++) + for (number = 0; number < quarterPoints; number++) { //common to all
outputs - _code_phase_out = vmulq_f32(_code_phase_step_chips, _4output_index); //compute the code phase point with the phase step + _code_phase_out = vmulq_f32(_code_phase_step_chips, _4output_index); //compute the code phase point with the phase step //output vector dependant (different code phase offset) - for(current_vector = 0; current_vector < num_out_vectors; current_vector++) + for (current_vector = 0; current_vector < num_out_vectors; current_vector++) { - tmp_rem_code_phase_chips = rem_code_phase_chips[current_vector] - 0.5f; // adjust offset to perform correct rounding (chip transition at 0) - _rem_code_phase = vld1q_dup_f32(&tmp_rem_code_phase_chips); //load float to all four float values in float32x4_t register + tmp_rem_code_phase_chips = rem_code_phase_chips[current_vector] - 0.5f; // adjust offset to perform correct rounding (chip transition at 0) + _rem_code_phase = vld1q_dup_f32(&tmp_rem_code_phase_chips); //load float to all four float values in float32x4_t register - _code_phase_out_with_offset = vaddq_f32(_code_phase_out, _rem_code_phase); //add the phase offset + _code_phase_out_with_offset = vaddq_f32(_code_phase_out, _rem_code_phase); //add the phase offset //_code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset); //convert to integer sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(_code_phase_out_with_offset), 31))); PlusHalf = vaddq_f32(_code_phase_out_with_offset, half); Round = vsubq_f32(PlusHalf, sign); _code_phase_out_int = vcvtq_s32_f32(Round); - negative_indexes = vcltq_s32(_code_phase_out_int, zero); //test for negative values - _code_phase_out_int_neg = vaddq_s32(_code_phase_out_int, _code_length_chips); //the negative values branch - _code_phase_out_int_neg = veorq_s32(_code_phase_out_int, vandq_s32( (int32x4_t)negative_indexes, veorq_s32( _code_phase_out_int_neg, _code_phase_out_int ))); + negative_indexes = vcltq_s32(_code_phase_out_int, zero); //test for negative values + _code_phase_out_int_neg = vaddq_s32(_code_phase_out_int, _code_length_chips); //the negative values branch + _code_phase_out_int_neg = veorq_s32(_code_phase_out_int, vandq_s32((int32x4_t)negative_indexes, veorq_s32(_code_phase_out_int_neg, _code_phase_out_int))); - overflow_indexes = vcgtq_s32(_code_phase_out_int_neg, _code_length_chips_minus1); //test for overflow values - _code_phase_out_int_over = vsubq_s32(_code_phase_out_int_neg, _code_length_chips); //the negative values branch - _code_phase_out_int_over = veorq_s32(_code_phase_out_int_neg, vandq_s32( (int32x4_t)overflow_indexes, veorq_s32( _code_phase_out_int_over, _code_phase_out_int_neg ))); + overflow_indexes = vcgtq_s32(_code_phase_out_int_neg, _code_length_chips_minus1); //test for overflow values + _code_phase_out_int_over = vsubq_s32(_code_phase_out_int_neg, _code_length_chips); //the negative values branch + _code_phase_out_int_over = veorq_s32(_code_phase_out_int_neg, vandq_s32((int32x4_t)overflow_indexes, veorq_s32(_code_phase_out_int_over, _code_phase_out_int_neg))); - vst1q_s32((int32_t*)local_code_chip_index, _code_phase_out_int_over); // Store the results back + vst1q_s32((int32_t*)local_code_chip_index, _code_phase_out_int_over); // Store the results back //todo: optimize the local code lookup table with intrinsics, if possible _result[current_vector][sample_idx] = local_code[local_code_chip_index[0]]; @@ -357,9 +372,9 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_neon(lv_16sc_t** sample_idx += 4; } - for(number = quarterPoints * 4; number < num_output_samples; number++) + 
for (number = quarterPoints * 4; number < num_output_samples; number++) { - for(current_vector = 0; current_vector < num_out_vectors; current_vector++) + for (current_vector = 0; current_vector < num_out_vectors; current_vector++) { local_code_chip_index[0] = (int)(code_phase_step_chips * (float)(number) + rem_code_phase_chips[current_vector]); if (local_code_chip_index[0] < 0.0) local_code_chip_index[0] += code_length_chips - 1; diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_index_max_32u.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_index_max_32u.h index af5e609cb..ace8271ea 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_index_max_32u.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_index_max_32u.h @@ -29,7 +29,6 @@ */ - /*! * \page volk_gnsssdr_32f_index_max_32u.h * @@ -63,7 +62,7 @@ static inline void volk_gnsssdr_32f_index_max_32u_a_avx(uint32_t* target, const float* src0, uint32_t num_points) { - if(num_points > 0) + if (num_points > 0) { uint32_t number = 0; const uint32_t quarterPoints = num_points / 8; @@ -71,7 +70,7 @@ static inline void volk_gnsssdr_32f_index_max_32u_a_avx(uint32_t* target, const float* inputPtr = (float*)src0; __m256 indexIncrementValues = _mm256_set1_ps(8); - __m256 currentIndexes = _mm256_set_ps(-1,-2,-3,-4,-5,-6,-7,-8); + __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8); float max = src0[0]; float index = 0; @@ -80,25 +79,28 @@ static inline void volk_gnsssdr_32f_index_max_32u_a_avx(uint32_t* target, const __m256 compareResults; __m256 currentValues; - __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8]; - __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8]; + __VOLK_ATTR_ALIGNED(32) + float maxValuesBuffer[8]; + __VOLK_ATTR_ALIGNED(32) + float maxIndexesBuffer[8]; - for(;number < quarterPoints; number++) + for (; number < quarterPoints; number++) { - currentValues = _mm256_load_ps(inputPtr); inputPtr += 8; + currentValues = _mm256_load_ps(inputPtr); + inputPtr += 8; currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues); compareResults = _mm256_cmp_ps(maxValues, currentValues, 0x1e); maxValuesIndex = _mm256_blendv_ps(currentIndexes, maxValuesIndex, compareResults); - maxValues = _mm256_blendv_ps(currentValues, maxValues, compareResults); + maxValues = _mm256_blendv_ps(currentValues, maxValues, compareResults); } // Calculate the largest value from the remaining 8 points _mm256_store_ps(maxValuesBuffer, maxValues); _mm256_store_ps(maxIndexesBuffer, maxValuesIndex); - for(number = 0; number < 8; number++) + for (number = 0; number < 8; number++) { - if(maxValuesBuffer[number] > max) + if (maxValuesBuffer[number] > max) { index = maxIndexesBuffer[number]; max = maxValuesBuffer[number]; @@ -106,9 +108,9 @@ static inline void volk_gnsssdr_32f_index_max_32u_a_avx(uint32_t* target, const } number = quarterPoints * 8; - for(;number < num_points; number++) + for (; number < num_points; number++) { - if(src0[number] > max) + if (src0[number] > max) { index = number; max = src0[number]; @@ -126,7 +128,7 @@ static inline void volk_gnsssdr_32f_index_max_32u_a_avx(uint32_t* target, const static inline void volk_gnsssdr_32f_index_max_32u_u_avx(uint32_t* target, const float* src0, uint32_t num_points) { - if(num_points > 0) + if (num_points > 0) { uint32_t number = 0; const uint32_t quarterPoints = num_points / 8; @@ -134,7 +136,7 @@ 
static inline void volk_gnsssdr_32f_index_max_32u_u_avx(uint32_t* target, const float* inputPtr = (float*)src0; __m256 indexIncrementValues = _mm256_set1_ps(8); - __m256 currentIndexes = _mm256_set_ps(-1,-2,-3,-4,-5,-6,-7,-8); + __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8); float max = src0[0]; float index = 0; @@ -143,25 +145,28 @@ static inline void volk_gnsssdr_32f_index_max_32u_u_avx(uint32_t* target, const __m256 compareResults; __m256 currentValues; - __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8]; - __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8]; + __VOLK_ATTR_ALIGNED(32) + float maxValuesBuffer[8]; + __VOLK_ATTR_ALIGNED(32) + float maxIndexesBuffer[8]; - for(;number < quarterPoints; number++) + for (; number < quarterPoints; number++) { - currentValues = _mm256_loadu_ps(inputPtr); inputPtr += 8; + currentValues = _mm256_loadu_ps(inputPtr); + inputPtr += 8; currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues); compareResults = _mm256_cmp_ps(maxValues, currentValues, 0x1e); maxValuesIndex = _mm256_blendv_ps(currentIndexes, maxValuesIndex, compareResults); - maxValues = _mm256_blendv_ps(currentValues, maxValues, compareResults); + maxValues = _mm256_blendv_ps(currentValues, maxValues, compareResults); } // Calculate the largest value from the remaining 8 points _mm256_store_ps(maxValuesBuffer, maxValues); _mm256_store_ps(maxIndexesBuffer, maxValuesIndex); - for(number = 0; number < 8; number++) + for (number = 0; number < 8; number++) { - if(maxValuesBuffer[number] > max) + if (maxValuesBuffer[number] > max) { index = maxIndexesBuffer[number]; max = maxValuesBuffer[number]; @@ -169,9 +174,9 @@ static inline void volk_gnsssdr_32f_index_max_32u_u_avx(uint32_t* target, const } number = quarterPoints * 8; - for(;number < num_points; number++) + for (; number < num_points; number++) { - if(src0[number] > max) + if (src0[number] > max) { index = number; max = src0[number]; @@ -185,11 +190,11 @@ static inline void volk_gnsssdr_32f_index_max_32u_u_avx(uint32_t* target, const #ifdef LV_HAVE_SSE4_1 -#include<smmintrin.h> +#include <smmintrin.h> static inline void volk_gnsssdr_32f_index_max_32u_a_sse4_1(uint32_t* target, const float* src0, uint32_t num_points) { - if(num_points > 0) + if (num_points > 0) { uint32_t number = 0; const uint32_t quarterPoints = num_points / 4; @@ -197,7 +202,7 @@ static inline void volk_gnsssdr_32f_index_max_32u_a_sse4_1(uint32_t* target, con float* inputPtr = (float*)src0; __m128 indexIncrementValues = _mm_set1_ps(4); - __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4); + __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4); float max = src0[0]; float index = 0; @@ -206,25 +211,28 @@ static inline void volk_gnsssdr_32f_index_max_32u_a_sse4_1(uint32_t* target, con __m128 compareResults; __m128 currentValues; - __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; - __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; + __VOLK_ATTR_ALIGNED(16) + float maxValuesBuffer[4]; + __VOLK_ATTR_ALIGNED(16) + float maxIndexesBuffer[4]; - for(;number < quarterPoints; number++) + for (; number < quarterPoints; number++) { - currentValues = _mm_load_ps(inputPtr); inputPtr += 4; + currentValues = _mm_load_ps(inputPtr); + inputPtr += 4; currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); compareResults = _mm_cmpgt_ps(maxValues, currentValues); maxValuesIndex = _mm_blendv_ps(currentIndexes, maxValuesIndex, compareResults); - maxValues = _mm_blendv_ps(currentValues, maxValues,
compareResults); } // Calculate the largest value from the remaining 4 points _mm_store_ps(maxValuesBuffer, maxValues); _mm_store_ps(maxIndexesBuffer, maxValuesIndex); - for(number = 0; number < 4; number++) + for (number = 0; number < 4; number++) { - if(maxValuesBuffer[number] > max) + if (maxValuesBuffer[number] > max) { index = maxIndexesBuffer[number]; max = maxValuesBuffer[number]; @@ -232,9 +240,9 @@ static inline void volk_gnsssdr_32f_index_max_32u_a_sse4_1(uint32_t* target, con } number = quarterPoints * 4; - for(;number < num_points; number++) + for (; number < num_points; number++) { - if(src0[number] > max) + if (src0[number] > max) { index = number; max = src0[number]; @@ -248,11 +256,11 @@ static inline void volk_gnsssdr_32f_index_max_32u_a_sse4_1(uint32_t* target, con #ifdef LV_HAVE_SSE4_1 -#include<smmintrin.h> +#include <smmintrin.h> static inline void volk_gnsssdr_32f_index_max_32u_u_sse4_1(uint32_t* target, const float* src0, uint32_t num_points) { - if(num_points > 0) + if (num_points > 0) { uint32_t number = 0; const uint32_t quarterPoints = num_points / 4; @@ -260,7 +268,7 @@ static inline void volk_gnsssdr_32f_index_max_32u_u_sse4_1(uint32_t* target, con float* inputPtr = (float*)src0; __m128 indexIncrementValues = _mm_set1_ps(4); - __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4); + __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4); float max = src0[0]; float index = 0; @@ -269,25 +277,28 @@ static inline void volk_gnsssdr_32f_index_max_32u_u_sse4_1(uint32_t* target, con __m128 compareResults; __m128 currentValues; - __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; - __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; + __VOLK_ATTR_ALIGNED(16) + float maxValuesBuffer[4]; + __VOLK_ATTR_ALIGNED(16) + float maxIndexesBuffer[4]; - for(;number < quarterPoints; number++) + for (; number < quarterPoints; number++) { - currentValues = _mm_loadu_ps(inputPtr); inputPtr += 4; + currentValues = _mm_loadu_ps(inputPtr); + inputPtr += 4; currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); compareResults = _mm_cmpgt_ps(maxValues, currentValues); maxValuesIndex = _mm_blendv_ps(currentIndexes, maxValuesIndex, compareResults); - maxValues = _mm_blendv_ps(currentValues, maxValues, compareResults); + maxValues = _mm_blendv_ps(currentValues, maxValues, compareResults); } // Calculate the largest value from the remaining 4 points _mm_store_ps(maxValuesBuffer, maxValues); _mm_store_ps(maxIndexesBuffer, maxValuesIndex); - for(number = 0; number < 4; number++) + for (number = 0; number < 4; number++) { - if(maxValuesBuffer[number] > max) + if (maxValuesBuffer[number] > max) { index = maxIndexesBuffer[number]; max = maxValuesBuffer[number]; @@ -295,9 +306,9 @@ static inline void volk_gnsssdr_32f_index_max_32u_u_sse4_1(uint32_t* target, con } number = quarterPoints * 4; - for(;number < num_points; number++) + for (; number < num_points; number++) { - if(src0[number] > max) + if (src0[number] > max) { index = number; max = src0[number]; @@ -312,11 +323,11 @@ static inline void volk_gnsssdr_32f_index_max_32u_u_sse4_1(uint32_t* target, con #ifdef LV_HAVE_SSE -#include<xmmintrin.h> +#include <xmmintrin.h> static inline void volk_gnsssdr_32f_index_max_32u_a_sse(uint32_t* target, const float* src0, uint32_t num_points) { - if(num_points > 0) + if (num_points > 0) { uint32_t number = 0; const uint32_t quarterPoints = num_points / 4; @@ -324,7 +335,7 @@ static inline void volk_gnsssdr_32f_index_max_32u_a_sse(uint32_t* target, const float* inputPtr = (float*)src0; __m128 indexIncrementValues = _mm_set1_ps(4); - __m128 currentIndexes =
_mm_set_ps(-1,-2,-3,-4); + __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4); float max = src0[0]; float index = 0; @@ -333,25 +344,28 @@ static inline void volk_gnsssdr_32f_index_max_32u_a_sse(uint32_t* target, const __m128 compareResults; __m128 currentValues; - __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; - __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; + __VOLK_ATTR_ALIGNED(16) + float maxValuesBuffer[4]; + __VOLK_ATTR_ALIGNED(16) + float maxIndexesBuffer[4]; - for(;number < quarterPoints; number++) + for (; number < quarterPoints; number++) { - currentValues = _mm_load_ps(inputPtr); inputPtr += 4; + currentValues = _mm_load_ps(inputPtr); + inputPtr += 4; currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); compareResults = _mm_cmpgt_ps(maxValues, currentValues); - maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, maxValuesIndex) , _mm_andnot_ps(compareResults, currentIndexes)); - maxValues = _mm_or_ps(_mm_and_ps(compareResults, maxValues) , _mm_andnot_ps(compareResults, currentValues)); + maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, maxValuesIndex), _mm_andnot_ps(compareResults, currentIndexes)); + maxValues = _mm_or_ps(_mm_and_ps(compareResults, maxValues), _mm_andnot_ps(compareResults, currentValues)); } // Calculate the largest value from the remaining 4 points _mm_store_ps(maxValuesBuffer, maxValues); _mm_store_ps(maxIndexesBuffer, maxValuesIndex); - for(number = 0; number < 4; number++) + for (number = 0; number < 4; number++) { - if(maxValuesBuffer[number] > max) + if (maxValuesBuffer[number] > max) { index = maxIndexesBuffer[number]; max = maxValuesBuffer[number]; @@ -359,9 +373,9 @@ static inline void volk_gnsssdr_32f_index_max_32u_a_sse(uint32_t* target, const } number = quarterPoints * 4; - for(;number < num_points; number++) + for (; number < num_points; number++) { - if(src0[number] > max) + if (src0[number] > max) { index = number; max = src0[number]; @@ -376,11 +390,11 @@ static inline void volk_gnsssdr_32f_index_max_32u_a_sse(uint32_t* target, const #ifdef LV_HAVE_SSE -#include<xmmintrin.h> +#include <xmmintrin.h> static inline void volk_gnsssdr_32f_index_max_32u_u_sse(uint32_t* target, const float* src0, uint32_t num_points) { - if(num_points > 0) + if (num_points > 0) { uint32_t number = 0; const uint32_t quarterPoints = num_points / 4; @@ -388,7 +402,7 @@ static inline void volk_gnsssdr_32f_index_max_32u_u_sse(uint32_t* target, const float* inputPtr = (float*)src0; __m128 indexIncrementValues = _mm_set1_ps(4); - __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4); + __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4); float max = src0[0]; float index = 0; @@ -397,25 +411,28 @@ static inline void volk_gnsssdr_32f_index_max_32u_u_sse(uint32_t* target, const __m128 compareResults; __m128 currentValues; - __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; - __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; + __VOLK_ATTR_ALIGNED(16) + float maxValuesBuffer[4]; + __VOLK_ATTR_ALIGNED(16) + float maxIndexesBuffer[4]; - for(;number < quarterPoints; number++) + for (; number < quarterPoints; number++) { - currentValues = _mm_loadu_ps(inputPtr); inputPtr += 4; + currentValues = _mm_loadu_ps(inputPtr); + inputPtr += 4; currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); compareResults = _mm_cmpgt_ps(maxValues, currentValues); - maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, maxValuesIndex) , _mm_andnot_ps(compareResults, currentIndexes)); - maxValues = _mm_or_ps(_mm_and_ps(compareResults, maxValues) , _mm_andnot_ps(compareResults, currentValues)); +
maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, maxValuesIndex), _mm_andnot_ps(compareResults, currentIndexes)); + maxValues = _mm_or_ps(_mm_and_ps(compareResults, maxValues), _mm_andnot_ps(compareResults, currentValues)); } // Calculate the largest value from the remaining 4 points _mm_store_ps(maxValuesBuffer, maxValues); _mm_store_ps(maxIndexesBuffer, maxValuesIndex); - for(number = 0; number < 4; number++) + for (number = 0; number < 4; number++) { - if(maxValuesBuffer[number] > max) + if (maxValuesBuffer[number] > max) { index = maxIndexesBuffer[number]; max = maxValuesBuffer[number]; @@ -423,9 +440,9 @@ static inline void volk_gnsssdr_32f_index_max_32u_u_sse(uint32_t* target, const } number = quarterPoints * 4; - for(;number < num_points; number++) + for (; number < num_points; number++) { - if(src0[number] > max) + if (src0[number] > max) { index = number; max = src0[number]; @@ -442,16 +459,16 @@ static inline void volk_gnsssdr_32f_index_max_32u_u_sse(uint32_t* target, const static inline void volk_gnsssdr_32f_index_max_32u_generic(uint32_t* target, const float* src0, uint32_t num_points) { - if(num_points > 0) + if (num_points > 0) { float max = src0[0]; uint32_t index = 0; uint32_t i = 1; - for(; i < num_points; ++i) + for (; i < num_points; ++i) { - if(src0[i] > max) + if (src0[i] > max) { index = i; max = src0[i]; @@ -469,14 +486,15 @@ static inline void volk_gnsssdr_32f_index_max_32u_generic(uint32_t* target, cons static inline void volk_gnsssdr_32f_index_max_32u_neon(uint32_t* target, const float* src0, uint32_t num_points) { - if(num_points > 0) + if (num_points > 0) { uint32_t number = 0; const uint32_t quarterPoints = num_points / 4; float* inputPtr = (float*)src0; float32x4_t indexIncrementValues = vdupq_n_f32(4); - __VOLK_ATTR_ALIGNED(16) float currentIndexes_float[4] = { -4.0f, -3.0f, -2.0f, -1.0f }; + __VOLK_ATTR_ALIGNED(16) + float currentIndexes_float[4] = {-4.0f, -3.0f, -2.0f, -1.0f}; float32x4_t currentIndexes = vld1q_f32(currentIndexes_float); float max = src0[0]; @@ -487,25 +505,28 @@ static inline void volk_gnsssdr_32f_index_max_32u_neon(uint32_t* target, const f uint32x4_t currentIndexes_u; float32x4_t currentValues; - __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; - __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; + __VOLK_ATTR_ALIGNED(16) + float maxValuesBuffer[4]; + __VOLK_ATTR_ALIGNED(16) + float maxIndexesBuffer[4]; - for(;number < quarterPoints; number++) + for (; number < quarterPoints; number++) { - currentValues = vld1q_f32(inputPtr); inputPtr += 4; - currentIndexes = vaddq_f32(currentIndexes, indexIncrementValues); + currentValues = vld1q_f32(inputPtr); + inputPtr += 4; + currentIndexes = vaddq_f32(currentIndexes, indexIncrementValues); currentIndexes_u = vcvtq_u32_f32(currentIndexes); - compareResults = vcgtq_f32( maxValues, currentValues); - maxValuesIndex = vorrq_u32( vandq_u32( compareResults, maxValuesIndex ), vbicq_u32(currentIndexes_u, compareResults) ); - maxValues = vmaxq_f32(currentValues, maxValues); + compareResults = vcgtq_f32(maxValues, currentValues); + maxValuesIndex = vorrq_u32(vandq_u32(compareResults, maxValuesIndex), vbicq_u32(currentIndexes_u, compareResults)); + maxValues = vmaxq_f32(currentValues, maxValues); } // Calculate the largest value from the remaining 4 points vst1q_f32(maxValuesBuffer, maxValues); vst1q_f32(maxIndexesBuffer, vcvtq_f32_u32(maxValuesIndex)); - for(number = 0; number < 4; number++) + for (number = 0; number < 4; number++) { - if(maxValuesBuffer[number] > max) + if (maxValuesBuffer[number] > 
max) { index = maxIndexesBuffer[number]; max = maxValuesBuffer[number]; @@ -513,9 +534,9 @@ static inline void volk_gnsssdr_32f_index_max_32u_neon(uint32_t* target, const f } number = quarterPoints * 4; - for(;number < num_points; number++) + for (; number < num_points; number++) { - if(src0[number] > max) + if (src0[number] > max) { index = number; max = src0[number]; @@ -528,4 +549,3 @@ static inline void volk_gnsssdr_32f_index_max_32u_neon(uint32_t* target, const f #endif /*LV_HAVE_NEON*/ #endif /*INCLUDED_volk_gnsssdr_32f_index_max_32u_H*/ - diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_resamplerxnpuppet_32f.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_resamplerxnpuppet_32f.h index cf2a80f52..b425ecb9b 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_resamplerxnpuppet_32f.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_resamplerxnpuppet_32f.h @@ -42,31 +42,30 @@ #include <string.h> - #ifdef LV_HAVE_GENERIC static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_generic(float* result, const float* local_code, unsigned int num_points) { int code_length_chips = 2046; - float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; - float shifts_chips[3] = { -0.1, 0.0, 0.1 }; + float shifts_chips[3] = {-0.1, 0.0, 0.1}; - float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_out_vectors; n++) - { - result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment()); - } + float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment()); + for (n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment()); + } volk_gnsssdr_32f_xn_resampler_32f_xn_generic(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); memcpy((float*)result, (float*)result_aux[0], sizeof(float) * num_points); - for(n = 0; n < num_out_vectors; n++) - { - volk_gnsssdr_free(result_aux[n]); - } + for (n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } volk_gnsssdr_free(result_aux); } @@ -77,26 +76,26 @@ static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_generic(float* result, static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_a_sse3(float* result, const float* local_code, unsigned int num_points) { int code_length_chips = 2046; - float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; - float shifts_chips[3] = { -0.1, 0.0, 0.1 }; + float shifts_chips[3] = {-0.1, 0.0, 0.1}; - float** result_aux =
(float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment()); + for (n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment()); + } volk_gnsssdr_32f_xn_resampler_32f_xn_a_sse3(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); memcpy((float*)result, (float*)result_aux[0], sizeof(float) * num_points); - for(n = 0; n < num_out_vectors; n++) - { - volk_gnsssdr_free(result_aux[n]); - } + for (n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } volk_gnsssdr_free(result_aux); } @@ -106,26 +105,26 @@ static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_a_sse3(float* result, static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_u_sse3(float* result, const float* local_code, unsigned int num_points) { int code_length_chips = 2046; - float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; - float shifts_chips[3] = { -0.1, 0.0, 0.1 }; + float shifts_chips[3] = {-0.1, 0.0, 0.1}; - float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_out_vectors; n++) - { - result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment()); - } + float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment()); + for (n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment()); + } volk_gnsssdr_32f_xn_resampler_32f_xn_u_sse3(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); memcpy((float*)result, (float*)result_aux[0], sizeof(float) * num_points); - for(n = 0; n < num_out_vectors; n++) - { - volk_gnsssdr_free(result_aux[n]); - } + for (n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } volk_gnsssdr_free(result_aux); } @@ -136,26 +135,26 @@ static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_u_sse3(float* result, static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_u_sse4_1(float* result, const float* local_code, unsigned int num_points) { int code_length_chips = 2046; - float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; - float shifts_chips[3] = { -0.1, 0.0, 0.1 }; + float shifts_chips[3] = {-0.1, 0.0, 0.1}; - float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_out_vectors; n++) - { - result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment()); - } + float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment()); + for (n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment()); + } volk_gnsssdr_32f_xn_resampler_32f_xn_u_sse4_1(result_aux, local_code, rem_code_phase_chips, 
code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); memcpy((float*)result, (float*)result_aux[0], sizeof(float) * num_points); - for(n = 0; n < num_out_vectors; n++) - { - volk_gnsssdr_free(result_aux[n]); - } + for (n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } volk_gnsssdr_free(result_aux); } @@ -165,26 +164,26 @@ static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_u_sse4_1(float* result static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_a_sse4_1(float* result, const float* local_code, unsigned int num_points) { int code_length_chips = 2046; - float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; - float shifts_chips[3] = { -0.1, 0.0, 0.1 }; + float shifts_chips[3] = {-0.1, 0.0, 0.1}; - float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_out_vectors; n++) - { - result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment()); - } + float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment()); + for (n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment()); + } volk_gnsssdr_32f_xn_resampler_32f_xn_a_sse4_1(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); memcpy((float*)result, (float*)result_aux[0], sizeof(float) * num_points); - for(n = 0; n < num_out_vectors; n++) - { - volk_gnsssdr_free(result_aux[n]); - } + for (n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } volk_gnsssdr_free(result_aux); } @@ -194,26 +193,26 @@ static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_a_sse4_1(float* result static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_a_avx(float* result, const float* local_code, unsigned int num_points) { int code_length_chips = 2046; - float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; - float shifts_chips[3] = { -0.1, 0.0, 0.1 }; + float shifts_chips[3] = {-0.1, 0.0, 0.1}; - float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_out_vectors; n++) - { - result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment()); - } + float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment()); + for (n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment()); + } volk_gnsssdr_32f_xn_resampler_32f_xn_a_avx(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); memcpy((float*)result, (float*)result_aux[0], sizeof(float) * num_points); - for(n = 0; n < num_out_vectors; n++) - { - volk_gnsssdr_free(result_aux[n]); - } + for (n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } 
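/*
 * All of the *_resamplerxnpuppet_* wrappers in this file repeat the pattern visible above:
 * they exist so the single-input/single-output QA harness can drive the multi-output _xn
 * resampler kernels with fixed test parameters. A condensed sketch of that pattern (the
 * kernel call is elided; puppet_sketch is an illustrative name, not part of this patch):
 */
#include <volk_gnsssdr/volk_gnsssdr.h>        /* volk_gnsssdr_get_alignment() */
#include <volk_gnsssdr/volk_gnsssdr_malloc.h> /* volk_gnsssdr_malloc() / volk_gnsssdr_free() */
#include <string.h>                           /* memcpy() */

static inline void puppet_sketch(float* result, unsigned int num_points)
{
    int num_out_vectors = 3; /* one output vector per correlator tap */
    unsigned int n;
    /* Allocate one aligned buffer per tap with the library allocator. */
    float** aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment());
    for (n = 0; n < (unsigned int)num_out_vectors; n++)
        {
            aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment());
        }
    /* ... run the _xn kernel under test on aux here ... */
    memcpy(result, aux[0], sizeof(float) * num_points); /* expose only the first tap to the harness */
    for (n = 0; n < (unsigned int)num_out_vectors; n++)
        {
            volk_gnsssdr_free(aux[n]);
        }
    volk_gnsssdr_free(aux);
}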
volk_gnsssdr_free(result_aux); } #endif @@ -223,26 +222,26 @@ static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_a_avx(float* result, c static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_u_avx(float* result, const float* local_code, unsigned int num_points) { int code_length_chips = 2046; - float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; - float shifts_chips[3] = { -0.1, 0.0, 0.1 }; + float shifts_chips[3] = {-0.1, 0.0, 0.1}; - float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_out_vectors; n++) - { - result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment()); - } + float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment()); + for (n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment()); + } volk_gnsssdr_32f_xn_resampler_32f_xn_u_avx(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); memcpy((float*)result, (float*)result_aux[0], sizeof(float) * num_points); - for(n = 0; n < num_out_vectors; n++) - { - volk_gnsssdr_free(result_aux[n]); - } + for (n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } volk_gnsssdr_free(result_aux); } #endif @@ -251,29 +250,28 @@ static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_u_avx(float* result, c static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_neon(float* result, const float* local_code, unsigned int num_points) { int code_length_chips = 2046; - float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; - float shifts_chips[3] = { -0.1, 0.0, 0.1 }; + float shifts_chips[3] = {-0.1, 0.0, 0.1}; - float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_out_vectors; n++) - { - result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment()); - } + float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment()); + for (n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment()); + } volk_gnsssdr_32f_xn_resampler_32f_xn_neon(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); memcpy((float*)result, (float*)result_aux[0], sizeof(float) * num_points); - for(n = 0; n < num_out_vectors; n++) - { - volk_gnsssdr_free(result_aux[n]); - } + for (n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } volk_gnsssdr_free(result_aux); } #endif -#endif // INCLUDED_volk_gnsssdr_32f_resamplerpuppet_32f_H - +#endif // INCLUDED_volk_gnsssdr_32f_resamplerpuppet_32f_H diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_sincos_32fc.h 
b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_sincos_32fc.h index 5568976e8..b067c5f3d 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_sincos_32fc.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_sincos_32fc.h @@ -97,7 +97,7 @@ static inline void volk_gnsssdr_32f_sincos_32fc_u_sse4_1(lv_32fc_t* out, const f cp4 = _mm_set1_ps(0.49603e-4); cp5 = _mm_set1_ps(0.551e-6); - for(;number < quarterPoints; number++) + for (; number < quarterPoints; number++) { aVal = _mm_loadu_ps(aPtr); __VOLK_GNSSSDR_PREFETCH(aPtr + 8); @@ -108,12 +108,12 @@ static inline void volk_gnsssdr_32f_sincos_32fc_u_sse4_1(lv_32fc_t* out, const f s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A)); s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B)); - s = _mm_div_ps(s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction + s = _mm_div_ps(s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction s = _mm_mul_ps(s, s); // Evaluate Taylor series s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s); - for(i = 0; i < 3; i++) + for (i = 0; i < 3; i++) { s = _mm_mul_ps(s, _mm_sub_ps(ffours, s)); } @@ -145,7 +145,7 @@ static inline void volk_gnsssdr_32f_sincos_32fc_u_sse4_1(lv_32fc_t* out, const f } number = quarterPoints * 4; - for(;number < num_points; number++) + for (; number < num_points; number++) { float _in = *aPtr++; *bPtr++ = lv_cmake(cosf(_in), sinf(_in)); @@ -191,7 +191,7 @@ static inline void volk_gnsssdr_32f_sincos_32fc_a_sse4_1(lv_32fc_t* out, const f cp4 = _mm_set1_ps(0.49603e-4); cp5 = _mm_set1_ps(0.551e-6); - for(;number < quarterPoints; number++) + for (; number < quarterPoints; number++) { aVal = _mm_load_ps(aPtr); __VOLK_GNSSSDR_PREFETCH(aPtr + 8); @@ -202,12 +202,12 @@ static inline void volk_gnsssdr_32f_sincos_32fc_a_sse4_1(lv_32fc_t* out, const f s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A)); s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B)); - s = _mm_div_ps(s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction + s = _mm_div_ps(s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction s = _mm_mul_ps(s, s); // Evaluate Taylor series s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s); - for(i = 0; i < 3; i++) + for (i = 0; i < 3; i++) { s = _mm_mul_ps(s, _mm_sub_ps(ffours, s)); } @@ -239,7 +239,7 @@ static inline void volk_gnsssdr_32f_sincos_32fc_a_sse4_1(lv_32fc_t* out, const f } number = quarterPoints * 4; - for(;number < num_points; number++) + for (; number < num_points; number++) { float _in = *aPtr++; *bPtr++ = lv_cmake(cosf(_in), sinf(_in)); @@ -265,31 +265,49 @@ static inline void volk_gnsssdr_32f_sincos_32fc_a_sse2(lv_32fc_t* out, const flo __m128 sine, cosine, aux, x; __m128 xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y; - __m128i emm0, emm2, emm4; + __m128i emm0, emm2, emm4; /* declare some SSE constants */ - __VOLK_ATTR_ALIGNED(16) static const int _ps_inv_sign_mask[4] = { ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000 }; - __VOLK_ATTR_ALIGNED(16) static const int _ps_sign_mask[4] = { (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000 }; + __VOLK_ATTR_ALIGNED(16) + static const int _ps_inv_sign_mask[4] = {~0x80000000, 
~0x80000000, ~0x80000000, ~0x80000000}; + __VOLK_ATTR_ALIGNED(16) + static const int _ps_sign_mask[4] = {(int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000}; - __VOLK_ATTR_ALIGNED(16) static const float _ps_cephes_FOPI[4] = { 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516 }; - __VOLK_ATTR_ALIGNED(16) static const int _pi32_1[4] = { 1, 1, 1, 1 }; - __VOLK_ATTR_ALIGNED(16) static const int _pi32_inv1[4] = { ~1, ~1, ~1, ~1 }; - __VOLK_ATTR_ALIGNED(16) static const int _pi32_2[4] = { 2, 2, 2, 2}; - __VOLK_ATTR_ALIGNED(16) static const int _pi32_4[4] = { 4, 4, 4, 4}; + __VOLK_ATTR_ALIGNED(16) + static const float _ps_cephes_FOPI[4] = {1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516}; + __VOLK_ATTR_ALIGNED(16) + static const int _pi32_1[4] = {1, 1, 1, 1}; + __VOLK_ATTR_ALIGNED(16) + static const int _pi32_inv1[4] = {~1, ~1, ~1, ~1}; + __VOLK_ATTR_ALIGNED(16) + static const int _pi32_2[4] = {2, 2, 2, 2}; + __VOLK_ATTR_ALIGNED(16) + static const int _pi32_4[4] = {4, 4, 4, 4}; - __VOLK_ATTR_ALIGNED(16) static const float _ps_minus_cephes_DP1[4] = { -0.78515625, -0.78515625, -0.78515625, -0.78515625 }; - __VOLK_ATTR_ALIGNED(16) static const float _ps_minus_cephes_DP2[4] = { -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4 }; - __VOLK_ATTR_ALIGNED(16) static const float _ps_minus_cephes_DP3[4] = { -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8 }; - __VOLK_ATTR_ALIGNED(16) static const float _ps_coscof_p0[4] = { 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005 }; - __VOLK_ATTR_ALIGNED(16) static const float _ps_coscof_p1[4] = { -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003 }; - __VOLK_ATTR_ALIGNED(16) static const float _ps_coscof_p2[4] = { 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002 }; - __VOLK_ATTR_ALIGNED(16) static const float _ps_sincof_p0[4] = { -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4 }; - __VOLK_ATTR_ALIGNED(16) static const float _ps_sincof_p1[4] = { 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3 }; - __VOLK_ATTR_ALIGNED(16) static const float _ps_sincof_p2[4] = { -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1 }; - __VOLK_ATTR_ALIGNED(16) static const float _ps_0p5[4] = { 0.5f, 0.5f, 0.5f, 0.5f }; - __VOLK_ATTR_ALIGNED(16) static const float _ps_1[4] = { 1.0f, 1.0f, 1.0f, 1.0f }; + __VOLK_ATTR_ALIGNED(16) + static const float _ps_minus_cephes_DP1[4] = {-0.78515625, -0.78515625, -0.78515625, -0.78515625}; + __VOLK_ATTR_ALIGNED(16) + static const float _ps_minus_cephes_DP2[4] = {-2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4}; + __VOLK_ATTR_ALIGNED(16) + static const float _ps_minus_cephes_DP3[4] = {-3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8}; + __VOLK_ATTR_ALIGNED(16) + static const float _ps_coscof_p0[4] = {2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005}; + __VOLK_ATTR_ALIGNED(16) + static const float _ps_coscof_p1[4] = {-1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003}; + __VOLK_ATTR_ALIGNED(16) + static const float _ps_coscof_p2[4] = {4.166664568298827E-002, 4.166664568298827E-002, 
4.166664568298827E-002, 4.166664568298827E-002}; + __VOLK_ATTR_ALIGNED(16) + static const float _ps_sincof_p0[4] = {-1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4}; + __VOLK_ATTR_ALIGNED(16) + static const float _ps_sincof_p1[4] = {8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3}; + __VOLK_ATTR_ALIGNED(16) + static const float _ps_sincof_p2[4] = {-1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1}; + __VOLK_ATTR_ALIGNED(16) + static const float _ps_0p5[4] = {0.5f, 0.5f, 0.5f, 0.5f}; + __VOLK_ATTR_ALIGNED(16) + static const float _ps_1[4] = {1.0f, 1.0f, 1.0f, 1.0f}; - for(;number < sse_iters; number++) + for (; number < sse_iters; number++) { x = _mm_load_ps(aPtr); __VOLK_GNSSSDR_PREFETCH(aPtr + 8); @@ -307,19 +325,19 @@ static inline void volk_gnsssdr_32f_sincos_32fc_a_sse2(lv_32fc_t* out, const flo emm2 = _mm_cvttps_epi32(y); /* j=(j+1) & (~1) (see the cephes sources) */ - emm2 = _mm_add_epi32(emm2, *(__m128i *)_pi32_1); - emm2 = _mm_and_si128(emm2, *(__m128i *)_pi32_inv1); + emm2 = _mm_add_epi32(emm2, *(__m128i*)_pi32_1); + emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_inv1); y = _mm_cvtepi32_ps(emm2); emm4 = emm2; /* get the swap sign flag for the sine */ - emm0 = _mm_and_si128(emm2, *(__m128i *)_pi32_4); + emm0 = _mm_and_si128(emm2, *(__m128i*)_pi32_4); emm0 = _mm_slli_epi32(emm0, 29); __m128 swap_sign_bit_sin = _mm_castsi128_ps(emm0); /* get the polynom selection mask for the sine*/ - emm2 = _mm_and_si128(emm2, *(__m128i *)_pi32_2); + emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_2); emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128()); __m128 poly_mask = _mm_castsi128_ps(emm2); @@ -335,15 +353,15 @@ static inline void volk_gnsssdr_32f_sincos_32fc_a_sse2(lv_32fc_t* out, const flo x = _mm_add_ps(x, xmm2); x = _mm_add_ps(x, xmm3); - emm4 = _mm_sub_epi32(emm4, *(__m128i *)_pi32_2); - emm4 = _mm_andnot_si128(emm4, *(__m128i *)_pi32_4); + emm4 = _mm_sub_epi32(emm4, *(__m128i*)_pi32_2); + emm4 = _mm_andnot_si128(emm4, *(__m128i*)_pi32_4); emm4 = _mm_slli_epi32(emm4, 29); __m128 sign_bit_cos = _mm_castsi128_ps(emm4); sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin); /* Evaluate the first polynom (0 <= x <= Pi/4) */ - __m128 z = _mm_mul_ps(x,x); + __m128 z = _mm_mul_ps(x, x); y = *(__m128*)_ps_coscof_p0; y = _mm_mul_ps(y, z); @@ -371,11 +389,11 @@ static inline void volk_gnsssdr_32f_sincos_32fc_a_sse2(lv_32fc_t* out, const flo xmm3 = poly_mask; __m128 ysin2 = _mm_and_ps(xmm3, y2); __m128 ysin1 = _mm_andnot_ps(xmm3, y); - y2 = _mm_sub_ps(y2,ysin2); + y2 = _mm_sub_ps(y2, ysin2); y = _mm_sub_ps(y, ysin1); - xmm1 = _mm_add_ps(ysin1,ysin2); - xmm2 = _mm_add_ps(y,y2); + xmm1 = _mm_add_ps(ysin1, ysin2); + xmm2 = _mm_add_ps(y, y2); /* update the sign */ sine = _mm_xor_ps(xmm1, sign_bit_sin); @@ -392,12 +410,11 @@ static inline void volk_gnsssdr_32f_sincos_32fc_a_sse2(lv_32fc_t* out, const flo aPtr += 4; } - for(number = sse_iters * 4; number < num_points; number++) + for (number = sse_iters * 4; number < num_points; number++) { _in = *aPtr++; - *bPtr++ = lv_cmake((float)cosf(_in), (float)sinf(_in) ); + *bPtr++ = lv_cmake((float)cosf(_in), (float)sinf(_in)); } - } #endif /* LV_HAVE_SSE2 */ @@ -418,31 +435,49 @@ static inline void volk_gnsssdr_32f_sincos_32fc_u_sse2(lv_32fc_t* out, const flo __m128 sine, cosine, aux, x; __m128 xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y; - __m128i emm0, emm2, emm4; + __m128i emm0, emm2, emm4; /* declare some SSE constants */ - __VOLK_ATTR_ALIGNED(16) static const int _ps_inv_sign_mask[4] = { 
~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000 }; - __VOLK_ATTR_ALIGNED(16) static const int _ps_sign_mask[4] = { (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000 }; + __VOLK_ATTR_ALIGNED(16) + static const int _ps_inv_sign_mask[4] = {~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000}; + __VOLK_ATTR_ALIGNED(16) + static const int _ps_sign_mask[4] = {(int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000}; - __VOLK_ATTR_ALIGNED(16) static const float _ps_cephes_FOPI[4] = { 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516 }; - __VOLK_ATTR_ALIGNED(16) static const int _pi32_1[4] = { 1, 1, 1, 1 }; - __VOLK_ATTR_ALIGNED(16) static const int _pi32_inv1[4] = { ~1, ~1, ~1, ~1 }; - __VOLK_ATTR_ALIGNED(16) static const int _pi32_2[4] = { 2, 2, 2, 2}; - __VOLK_ATTR_ALIGNED(16) static const int _pi32_4[4] = { 4, 4, 4, 4}; + __VOLK_ATTR_ALIGNED(16) + static const float _ps_cephes_FOPI[4] = {1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516}; + __VOLK_ATTR_ALIGNED(16) + static const int _pi32_1[4] = {1, 1, 1, 1}; + __VOLK_ATTR_ALIGNED(16) + static const int _pi32_inv1[4] = {~1, ~1, ~1, ~1}; + __VOLK_ATTR_ALIGNED(16) + static const int _pi32_2[4] = {2, 2, 2, 2}; + __VOLK_ATTR_ALIGNED(16) + static const int _pi32_4[4] = {4, 4, 4, 4}; - __VOLK_ATTR_ALIGNED(16) static const float _ps_minus_cephes_DP1[4] = { -0.78515625, -0.78515625, -0.78515625, -0.78515625 }; - __VOLK_ATTR_ALIGNED(16) static const float _ps_minus_cephes_DP2[4] = { -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4 }; - __VOLK_ATTR_ALIGNED(16) static const float _ps_minus_cephes_DP3[4] = { -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8 }; - __VOLK_ATTR_ALIGNED(16) static const float _ps_coscof_p0[4] = { 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005 }; - __VOLK_ATTR_ALIGNED(16) static const float _ps_coscof_p1[4] = { -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003 }; - __VOLK_ATTR_ALIGNED(16) static const float _ps_coscof_p2[4] = { 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002 }; - __VOLK_ATTR_ALIGNED(16) static const float _ps_sincof_p0[4] = { -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4 }; - __VOLK_ATTR_ALIGNED(16) static const float _ps_sincof_p1[4] = { 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3 }; - __VOLK_ATTR_ALIGNED(16) static const float _ps_sincof_p2[4] = { -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1 }; - __VOLK_ATTR_ALIGNED(16) static const float _ps_0p5[4] = { 0.5f, 0.5f, 0.5f, 0.5f }; - __VOLK_ATTR_ALIGNED(16) static const float _ps_1[4] = { 1.0f, 1.0f, 1.0f, 1.0f }; + __VOLK_ATTR_ALIGNED(16) + static const float _ps_minus_cephes_DP1[4] = {-0.78515625, -0.78515625, -0.78515625, -0.78515625}; + __VOLK_ATTR_ALIGNED(16) + static const float _ps_minus_cephes_DP2[4] = {-2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4}; + __VOLK_ATTR_ALIGNED(16) + static const float _ps_minus_cephes_DP3[4] = {-3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8}; + __VOLK_ATTR_ALIGNED(16) + static const float _ps_coscof_p0[4] = {2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005}; + 
__VOLK_ATTR_ALIGNED(16) + static const float _ps_coscof_p1[4] = {-1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003}; + __VOLK_ATTR_ALIGNED(16) + static const float _ps_coscof_p2[4] = {4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002}; + __VOLK_ATTR_ALIGNED(16) + static const float _ps_sincof_p0[4] = {-1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4}; + __VOLK_ATTR_ALIGNED(16) + static const float _ps_sincof_p1[4] = {8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3}; + __VOLK_ATTR_ALIGNED(16) + static const float _ps_sincof_p2[4] = {-1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1}; + __VOLK_ATTR_ALIGNED(16) + static const float _ps_0p5[4] = {0.5f, 0.5f, 0.5f, 0.5f}; + __VOLK_ATTR_ALIGNED(16) + static const float _ps_1[4] = {1.0f, 1.0f, 1.0f, 1.0f}; - for(;number < sse_iters; number++) + for (; number < sse_iters; number++) { x = _mm_loadu_ps(aPtr); __VOLK_GNSSSDR_PREFETCH(aPtr + 8); @@ -460,19 +495,19 @@ static inline void volk_gnsssdr_32f_sincos_32fc_u_sse2(lv_32fc_t* out, const flo emm2 = _mm_cvttps_epi32(y); /* j=(j+1) & (~1) (see the cephes sources) */ - emm2 = _mm_add_epi32(emm2, *(__m128i *)_pi32_1); - emm2 = _mm_and_si128(emm2, *(__m128i *)_pi32_inv1); + emm2 = _mm_add_epi32(emm2, *(__m128i*)_pi32_1); + emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_inv1); y = _mm_cvtepi32_ps(emm2); emm4 = emm2; /* get the swap sign flag for the sine */ - emm0 = _mm_and_si128(emm2, *(__m128i *)_pi32_4); + emm0 = _mm_and_si128(emm2, *(__m128i*)_pi32_4); emm0 = _mm_slli_epi32(emm0, 29); __m128 swap_sign_bit_sin = _mm_castsi128_ps(emm0); /* get the polynom selection mask for the sine*/ - emm2 = _mm_and_si128(emm2, *(__m128i *)_pi32_2); + emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_2); emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128()); __m128 poly_mask = _mm_castsi128_ps(emm2); @@ -488,15 +523,15 @@ static inline void volk_gnsssdr_32f_sincos_32fc_u_sse2(lv_32fc_t* out, const flo x = _mm_add_ps(x, xmm2); x = _mm_add_ps(x, xmm3); - emm4 = _mm_sub_epi32(emm4, *(__m128i *)_pi32_2); - emm4 = _mm_andnot_si128(emm4, *(__m128i *)_pi32_4); + emm4 = _mm_sub_epi32(emm4, *(__m128i*)_pi32_2); + emm4 = _mm_andnot_si128(emm4, *(__m128i*)_pi32_4); emm4 = _mm_slli_epi32(emm4, 29); __m128 sign_bit_cos = _mm_castsi128_ps(emm4); sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin); /* Evaluate the first polynom (0 <= x <= Pi/4) */ - __m128 z = _mm_mul_ps(x,x); + __m128 z = _mm_mul_ps(x, x); y = *(__m128*)_ps_coscof_p0; y = _mm_mul_ps(y, z); @@ -524,11 +559,11 @@ static inline void volk_gnsssdr_32f_sincos_32fc_u_sse2(lv_32fc_t* out, const flo xmm3 = poly_mask; __m128 ysin2 = _mm_and_ps(xmm3, y2); __m128 ysin1 = _mm_andnot_ps(xmm3, y); - y2 = _mm_sub_ps(y2,ysin2); + y2 = _mm_sub_ps(y2, ysin2); y = _mm_sub_ps(y, ysin1); - xmm1 = _mm_add_ps(ysin1,ysin2); - xmm2 = _mm_add_ps(y,y2); + xmm1 = _mm_add_ps(ysin1, ysin2); + xmm2 = _mm_add_ps(y, y2); /* update the sign */ sine = _mm_xor_ps(xmm1, sign_bit_sin); @@ -545,12 +580,11 @@ static inline void volk_gnsssdr_32f_sincos_32fc_u_sse2(lv_32fc_t* out, const flo aPtr += 4; } - for(number = sse_iters * 4; number < num_points; number++) + for (number = sse_iters * 4; number < num_points; number++) { _in = *aPtr++; - *bPtr++ = lv_cmake((float)cosf(_in), (float)sinf(_in) ); + *bPtr++ = lv_cmake((float)cosf(_in), (float)sinf(_in)); } - } #endif /* LV_HAVE_SSE2 */ @@ -561,10 +595,10 @@ static inline void 
volk_gnsssdr_32f_sincos_32fc_generic(lv_32fc_t* out, const fl { float _in; unsigned int i; - for(i = 0; i < num_points; i++) + for (i = 0; i < num_points; i++) { _in = *in++; - *out++ = lv_cmake((float)cosf(_in), (float)sinf(_in) ); + *out++ = lv_cmake((float)cosf(_in), (float)sinf(_in)); } } @@ -586,12 +620,12 @@ static inline void volk_gnsssdr_32f_sincos_32fc_generic_fxpt(lv_32fc_t* out, con const int32_t diffbits = bitlength - Nbits; uint32_t ux; unsigned int i; - for(i = 0; i < num_points; i++) + for (i = 0; i < num_points; i++) { _in = *in++; d = (int32_t)floor(_in / TWO_PI + 0.5); _in -= d * TWO_PI; - x = (int32_t) ((float)_in * TWO_TO_THE_31_DIV_PI); + x = (int32_t)((float)_in * TWO_TO_THE_31_DIV_PI); ux = x; sin_index = ux >> diffbits; @@ -601,7 +635,7 @@ static inline void volk_gnsssdr_32f_sincos_32fc_generic_fxpt(lv_32fc_t* out, con cos_index = ux >> diffbits; c = sine_table_10bits[cos_index][0] * (ux >> 1) + sine_table_10bits[cos_index][1]; - *out++ = lv_cmake((float)c, (float)s ); + *out++ = lv_cmake((float)c, (float)s); } } @@ -637,7 +671,7 @@ static inline void volk_gnsssdr_32f_sincos_32fc_neon(lv_32fc_t* out, const float uint32x4_t emm2, poly_mask, sign_mask_sin, sign_mask_cos; - for(;number < neon_iters; number++) + for (; number < neon_iters; number++) { x = vld1q_f32(aPtr); __VOLK_GNSSSDR_PREFETCH(aPtr + 8); @@ -677,7 +711,7 @@ static inline void volk_gnsssdr_32f_sincos_32fc_neon(lv_32fc_t* out, const float /* Evaluate the first polynom (0 <= x <= Pi/4) in y1, and the second polynom (Pi/4 <= x <= 0) in y2 */ - z = vmulq_f32(x,x); + z = vmulq_f32(x, x); y1 = vmulq_n_f32(z, c_coscof_p0); y2 = vmulq_n_f32(z, c_sincof_p0); @@ -706,10 +740,10 @@ static inline void volk_gnsssdr_32f_sincos_32fc_neon(lv_32fc_t* out, const float aPtr += 4; } - for(number = neon_iters * 4; number < num_points; number++) + for (number = neon_iters * 4; number < num_points; number++) { _in = *aPtr++; - *bPtr++ = lv_cmake((float)cosf(_in), (float)sinf(_in) ); + *bPtr++ = lv_cmake((float)cosf(_in), (float)sinf(_in)); } } diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_xn_resampler_32f_xn.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_xn_resampler_32f_xn.h index 1fa95e0e6..f130032ea 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_xn_resampler_32f_xn.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_xn_resampler_32f_xn.h @@ -110,7 +110,8 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_a_sse3(float** result, c const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips); const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips); - __VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; + __VOLK_ATTR_ALIGNED(16) + int local_code_chip_index[4]; int local_code_chip_index_; const __m128i zeros = _mm_setzero_si128(); @@ -124,7 +125,7 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_a_sse3(float** result, c shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]); aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); __m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f); - for(n = 0; n < quarterPoints; n++) + for (n = 0; n < quarterPoints; n++) { aux = _mm_mul_ps(code_phase_step_chips_reg, indexn); aux = _mm_add_ps(aux, aux2); @@ -145,25 +146,25 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_a_sse3(float** result, c 
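/*
 * How the generic_fxpt sincos kernel above works: the float phase is mapped
 * onto a signed 32-bit fixed-point angle (scaled so that pi == 2^31), and the
 * top Nbits bits index a table of per-segment slope/offset pairs. A scalar
 * sketch of the sine half, assuming a 32-bit accumulator, Nbits == 10 to
 * match the table name, and the table layout slope/offset used by the kernel:
 */
#include <math.h>
#include <stdint.h>
extern const float sine_table_10bits[1 << 10][2]; /* assumed layout: {slope_i, offset_i} */
static inline float fxpt_sine_sketch(float phase_rad)
{
    const float TWO_PI = 6.28318530717958647692f;
    const float TWO_TO_THE_31_DIV_PI = 2147483648.0f / 3.14159265358979323846f;
    const int32_t Nbits = 10;            /* table index width */
    const int32_t diffbits = 32 - Nbits; /* shift that keeps only the top bits */
    int32_t d = (int32_t)floor(phase_rad / TWO_PI + 0.5); /* wrap into one period */
    phase_rad -= d * TWO_PI;
    uint32_t ux = (uint32_t)(int32_t)(phase_rad * TWO_TO_THE_31_DIV_PI);
    uint32_t sin_index = ux >> diffbits;
    /* piecewise-linear evaluation: slope * fixed-point phase + offset */
    return sine_table_10bits[sin_index][0] * (ux >> 1) + sine_table_10bits[sin_index][1];
}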
aux_i = _mm_and_si128(code_length_chips_reg_i, negatives); local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i); _mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg); - for(k = 0; k < 4; ++k) + for (k = 0; k < 4; ++k) { _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]]; } indexn = _mm_add_ps(indexn, fours); } - for(n = quarterPoints * 4; n < num_points; n++) + for (n = quarterPoints * 4; n < num_points; n++) { // resample code for current tap local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); //Take into account that in multitap correlators, the shifts can be negative! - if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ; + if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); local_code_chip_index_ = local_code_chip_index_ % code_length_chips; _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; } } } -#endif +#endif #ifdef LV_HAVE_SSE3 @@ -180,7 +181,8 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_u_sse3(float** result, c const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips); const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips); - __VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; + __VOLK_ATTR_ALIGNED(16) + int local_code_chip_index[4]; int local_code_chip_index_; const __m128i zeros = _mm_setzero_si128(); @@ -194,7 +196,7 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_u_sse3(float** result, c shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]); aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); __m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f); - for(n = 0; n < quarterPoints; n++) + for (n = 0; n < quarterPoints; n++) { aux = _mm_mul_ps(code_phase_step_chips_reg, indexn); aux = _mm_add_ps(aux, aux2); @@ -215,18 +217,18 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_u_sse3(float** result, c aux_i = _mm_and_si128(code_length_chips_reg_i, negatives); local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i); _mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg); - for(k = 0; k < 4; ++k) + for (k = 0; k < 4; ++k) { _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]]; } indexn = _mm_add_ps(indexn, fours); } - for(n = quarterPoints * 4; n < num_points; n++) + for (n = quarterPoints * 4; n < num_points; n++) { // resample code for current tap local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); //Take into account that in multitap correlators, the shifts can be negative! 
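/*
 * The scalar tails here all wrap a possibly-negative chip index into
 * [0, code_length_chips) before taking the modulo, because in C the sign of %
 * follows the dividend. Adding code_length * (abs(index)/code_length + 1)
 * makes the index non-negative in a single step, however negative the
 * multitap shift drove it. A minimal sketch of that wrap:
 */
#include <stdlib.h>
static inline int wrap_code_chip_index_sketch(int index, int code_length_chips)
{
    if (index < 0) index += code_length_chips * (abs(index) / code_length_chips + 1); /* now index >= 0 */
    return index % code_length_chips; /* safely in [0, code_length_chips) */
}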
- if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ; + if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); local_code_chip_index_ = local_code_chip_index_ % code_length_chips; _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; } @@ -248,7 +250,8 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_a_sse4_1(float** result, const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips); const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips); - __VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; + __VOLK_ATTR_ALIGNED(16) + int local_code_chip_index[4]; int local_code_chip_index_; const __m128i zeros = _mm_setzero_si128(); @@ -262,7 +265,7 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_a_sse4_1(float** result, shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]); aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); __m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f); - for(n = 0; n < quarterPoints; n++) + for (n = 0; n < quarterPoints; n++) { aux = _mm_mul_ps(code_phase_step_chips_reg, indexn); aux = _mm_add_ps(aux, aux2); @@ -280,25 +283,25 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_a_sse4_1(float** result, aux_i = _mm_and_si128(code_length_chips_reg_i, negatives); local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i); _mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg); - for(k = 0; k < 4; ++k) + for (k = 0; k < 4; ++k) { _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]]; } indexn = _mm_add_ps(indexn, fours); } - for(n = quarterPoints * 4; n < num_points; n++) + for (n = quarterPoints * 4; n < num_points; n++) { // resample code for current tap local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); //Take into account that in multitap correlators, the shifts can be negative! 
- if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ; + if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); local_code_chip_index_ = local_code_chip_index_ % code_length_chips; _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; } } } -#endif +#endif #ifdef LV_HAVE_SSE4_1 @@ -314,7 +317,8 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_u_sse4_1(float** result, const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips); const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips); - __VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; + __VOLK_ATTR_ALIGNED(16) + int local_code_chip_index[4]; int local_code_chip_index_; const __m128i zeros = _mm_setzero_si128(); @@ -328,7 +332,7 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_u_sse4_1(float** result, shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]); aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); __m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f); - for(n = 0; n < quarterPoints; n++) + for (n = 0; n < quarterPoints; n++) { aux = _mm_mul_ps(code_phase_step_chips_reg, indexn); aux = _mm_add_ps(aux, aux2); @@ -346,18 +350,18 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_u_sse4_1(float** result, aux_i = _mm_and_si128(code_length_chips_reg_i, negatives); local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i); _mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg); - for(k = 0; k < 4; ++k) + for (k = 0; k < 4; ++k) { _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]]; } indexn = _mm_add_ps(indexn, fours); } - for(n = quarterPoints * 4; n < num_points; n++) + for (n = quarterPoints * 4; n < num_points; n++) { // resample code for current tap local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); //Take into account that in multitap correlators, the shifts can be negative! 
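/*
 * Also visible throughout this file: every SIMD kernel comes in an _a
 * (aligned) and a _u (unaligned) flavor, differing in the hot loop only by
 * the load/store intrinsics used. A tiny sketch of the distinction, with
 * hypothetical in/out buffers:
 */
#include <xmmintrin.h>
static inline void aligned_vs_unaligned_sketch(const float* in, float* out)
{
    __m128 va = _mm_load_ps(in);  /* _a kernels: `in` must be 16-byte aligned */
    __m128 vu = _mm_loadu_ps(in); /* _u kernels: any address is accepted */
    _mm_storeu_ps(out, _mm_add_ps(va, vu));
}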
- if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ; + if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); local_code_chip_index_ = local_code_chip_index_ % code_length_chips; _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; } @@ -380,7 +384,8 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_a_avx(float** result, co const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips); const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips); - __VOLK_ATTR_ALIGNED(32) int local_code_chip_index[8]; + __VOLK_ATTR_ALIGNED(32) + int local_code_chip_index[8]; int local_code_chip_index_; const __m256 zeros = _mm256_setzero_ps(); @@ -395,7 +400,7 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_a_avx(float** result, co shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]); aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); indexn = n0; - for(n = 0; n < avx_iters; n++) + for (n = 0; n < avx_iters; n++) { __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0); __VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3); @@ -413,13 +418,13 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_a_avx(float** result, co // no negatives c = _mm256_cvtepi32_ps(local_code_chip_index_reg); - negatives = _mm256_cmp_ps(c, zeros, 0x01 ); + negatives = _mm256_cmp_ps(c, zeros, 0x01); aux3 = _mm256_and_ps(code_length_chips_reg_f, negatives); aux = _mm256_add_ps(c, aux3); local_code_chip_index_reg = _mm256_cvttps_epi32(aux); _mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg); - for(k = 0; k < 8; ++k) + for (k = 0; k < 8; ++k) { _result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]]; } @@ -429,12 +434,12 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_a_avx(float** result, co _mm256_zeroupper(); for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++) { - for(n = avx_iters * 8; n < num_points; n++) + for (n = avx_iters * 8; n < num_points; n++) { // resample code for current tap local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); //Take into account that in multitap correlators, the shifts can be negative! 
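/*
 * The AVX paths fix up negative indices without branches: predicate 0x01 in
 * _mm256_cmp_ps is _CMP_LT_OS (ordered less-than), producing an all-ones mask
 * in each lane whose index went negative; AND-ing that mask with the
 * broadcast code length yields a per-lane correction of code_length or 0.
 * A sketch:
 */
#include <immintrin.h>
static inline __m256 wrap_negative_lanes_sketch(__m256 index, __m256 code_length)
{
    __m256 negatives = _mm256_cmp_ps(index, _mm256_setzero_ps(), 0x01); /* lane < 0 ? all-ones : 0 */
    __m256 fixup = _mm256_and_ps(code_length, negatives);               /* code_length where negative */
    return _mm256_add_ps(index, fixup);                                 /* index, or index + code_length */
}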
- if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ; + if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); local_code_chip_index_ = local_code_chip_index_ % code_length_chips; _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; } @@ -457,7 +462,8 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_u_avx(float** result, co const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips); const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips); - __VOLK_ATTR_ALIGNED(32) int local_code_chip_index[8]; + __VOLK_ATTR_ALIGNED(32) + int local_code_chip_index[8]; int local_code_chip_index_; const __m256 zeros = _mm256_setzero_ps(); @@ -472,7 +478,7 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_u_avx(float** result, co shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]); aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); indexn = n0; - for(n = 0; n < avx_iters; n++) + for (n = 0; n < avx_iters; n++) { __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0); __VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3); @@ -490,13 +496,13 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_u_avx(float** result, co // no negatives c = _mm256_cvtepi32_ps(local_code_chip_index_reg); - negatives = _mm256_cmp_ps(c, zeros, 0x01 ); + negatives = _mm256_cmp_ps(c, zeros, 0x01); aux3 = _mm256_and_ps(code_length_chips_reg_f, negatives); aux = _mm256_add_ps(c, aux3); local_code_chip_index_reg = _mm256_cvttps_epi32(aux); _mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg); - for(k = 0; k < 8; ++k) + for (k = 0; k < 8; ++k) { _result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]]; } @@ -506,12 +512,12 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_u_avx(float** result, co _mm256_zeroupper(); for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++) { - for(n = avx_iters * 8; n < num_points; n++) + for (n = avx_iters * 8; n < num_points; n++) { // resample code for current tap local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); //Take into account that in multitap correlators, the shifts can be negative! 
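/*
 * Why the kernels store the index register to an aligned scratch array and
 * then loop: AVX (unlike AVX2) has no gather instruction, so the lookups
 * local_code[index[k]] must be done scalarly after spilling the lanes to
 * memory. A sketch of that spill-and-gather step, using the codebase's
 * __VOLK_ATTR_ALIGNED macro:
 */
#include <immintrin.h>
static inline void spill_and_gather_sketch(float* dst, const float* local_code, __m256i index_reg)
{
    __VOLK_ATTR_ALIGNED(32)
    int local_code_chip_index[8];
    int k;
    _mm256_store_si256((__m256i*)local_code_chip_index, index_reg); /* spill the eight lane indices */
    for (k = 0; k < 8; ++k)
        {
            dst[k] = local_code[local_code_chip_index[k]]; /* scalar table lookup per lane */
        }
}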
- if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ; + if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); local_code_chip_index_ = local_code_chip_index_ % code_length_chips; _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; } @@ -536,19 +542,21 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_neon(float** result, con const float32x4_t rem_code_phase_chips_reg = vdupq_n_f32(rem_code_phase_chips); const float32x4_t code_phase_step_chips_reg = vdupq_n_f32(code_phase_step_chips); - __VOLK_ATTR_ALIGNED(16) int32_t local_code_chip_index[4]; + __VOLK_ATTR_ALIGNED(16) + int32_t local_code_chip_index[4]; int32_t local_code_chip_index_; const int32x4_t zeros = vdupq_n_s32(0); const float32x4_t code_length_chips_reg_f = vdupq_n_f32((float)code_length_chips); const int32x4_t code_length_chips_reg_i = vdupq_n_s32((int32_t)code_length_chips); - int32x4_t local_code_chip_index_reg, aux_i, negatives, i; + int32x4_t local_code_chip_index_reg, aux_i, negatives, i; float32x4_t aux, aux2, shifts_chips_reg, fi, c, j, cTrunc, base, indexn, reciprocal; - __VOLK_ATTR_ALIGNED(16) const float vec[4] = { 0.0f, 1.0f, 2.0f, 3.0f }; + __VOLK_ATTR_ALIGNED(16) + const float vec[4] = {0.0f, 1.0f, 2.0f, 3.0f}; uint32x4_t igx; reciprocal = vrecpeq_f32(code_length_chips_reg_f); reciprocal = vmulq_f32(vrecpsq_f32(code_length_chips_reg_f, reciprocal), reciprocal); - reciprocal = vmulq_f32(vrecpsq_f32(code_length_chips_reg_f, reciprocal), reciprocal); // this refinement is required! + reciprocal = vmulq_f32(vrecpsq_f32(code_length_chips_reg_f, reciprocal), reciprocal); // this refinement is required! 
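/*
 * About the "this refinement is required!" comment above: NEON has no float
 * division, and vrecpeq_f32 gives only a rough (~8-bit) reciprocal estimate.
 * vrecpsq_f32(d, r) returns (2 - d * r), so r = r * (2 - d * r) is one
 * Newton-Raphson step that roughly doubles the number of accurate bits; two
 * steps are needed before the reciprocal is good enough for the fmod
 * computation that follows. A sketch:
 */
#include <arm_neon.h>
static inline float32x4_t newton_reciprocal_sketch(float32x4_t d)
{
    float32x4_t r = vrecpeq_f32(d);      /* coarse initial estimate of 1/d */
    r = vmulq_f32(vrecpsq_f32(d, r), r); /* first Newton-Raphson refinement */
    r = vmulq_f32(vrecpsq_f32(d, r), r); /* second refinement, required here */
    return r;
}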
float32x4_t n0 = vld1q_f32((float*)vec); for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++) @@ -556,7 +564,7 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_neon(float** result, con shifts_chips_reg = vdupq_n_f32((float)shifts_chips[current_correlator_tap]); aux2 = vsubq_f32(shifts_chips_reg, rem_code_phase_chips_reg); indexn = n0; - for(n = 0; n < neon_iters; n++) + for (n = 0; n < neon_iters; n++) { __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][4 * n + 3], 1, 0); __VOLK_GNSSSDR_PREFETCH(&local_code_chip_index[4]); @@ -572,7 +580,7 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_neon(float** result, con // fmod c = vmulq_f32(aux, reciprocal); - i = vcvtq_s32_f32(c); + i = vcvtq_s32_f32(c); cTrunc = vcvtq_f32_s32(i); base = vmulq_f32(cTrunc, code_length_chips_reg_f); aux = vsubq_f32(aux, base); @@ -584,13 +592,13 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_neon(float** result, con vst1q_s32((int32_t*)local_code_chip_index, local_code_chip_index_reg); - for(k = 0; k < 4; ++k) + for (k = 0; k < 4; ++k) { _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]]; } indexn = vaddq_f32(indexn, fours); } - for(n = neon_iters * 4; n < num_points; n++) + for (n = neon_iters * 4; n < num_points; n++) { __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][n], 1, 0); // resample code for current tap @@ -606,5 +614,3 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_neon(float** result, con #endif #endif /*INCLUDED_volk_gnsssdr_32f_xn_resampler_32f_xn_H*/ - - diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn.h index e8831a97f..211d979cf 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn.h @@ -85,11 +85,11 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic(lv_32f unsigned int n; for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - result[n_vec] = lv_cmake(0,0); + result[n_vec] = lv_cmake(0, 0); } for (n = 0; n < num_points; n++) { - tmp32_1 = *in_common++ * (*phase);//if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); + tmp32_1 = *in_common++ * (*phase); //if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); // Regenerate phase if (n % 256 == 0) @@ -126,7 +126,7 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic_reload unsigned int j; for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - result[n_vec] = lv_cmake(0,0); + result[n_vec] = lv_cmake(0, 0); } for (n = 0; n < num_points / ROTATOR_RELOAD; n++) @@ -141,7 +141,7 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic_reload result[n_vec] += tmp32_2; } } - /* Regenerate phase */ + /* Regenerate phase */ #ifdef __cplusplus (*phase) /= std::abs((*phase)); #else @@ -175,8 +175,8 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_ const unsigned int sixteenthPoints = num_points / 16; const float* aPtr = (float*)in_common; - const float* bPtr[ num_a_vectors]; - for( vec_ind = 0; vec_ind < 
num_a_vectors; ++vec_ind ) + const float* bPtr[num_a_vectors]; + for (vec_ind = 0; vec_ind < num_a_vectors; ++vec_ind) { bPtr[vec_ind] = in_a[vec_ind]; } @@ -194,7 +194,7 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_ __m256 dotProdVal2[num_a_vectors]; __m256 dotProdVal3[num_a_vectors]; - for( vec_ind = 0; vec_ind < num_a_vectors; vec_ind++ ) + for (vec_ind = 0; vec_ind < num_a_vectors; vec_ind++) { dotProdVal0[vec_ind] = _mm256_setzero_ps(); dotProdVal1[vec_ind] = _mm256_setzero_ps(); @@ -204,57 +204,62 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_ // Set up the complex rotator __m256 z0, z1, z2, z3; - __VOLK_ATTR_ALIGNED(32) lv_32fc_t phase_vec[16]; - for( vec_ind = 0; vec_ind < 16; ++vec_ind ) + __VOLK_ATTR_ALIGNED(32) + lv_32fc_t phase_vec[16]; + for (vec_ind = 0; vec_ind < 16; ++vec_ind) { phase_vec[vec_ind] = _phase; _phase *= phase_inc; } - z0 = _mm256_load_ps( (float *)phase_vec ); - z1 = _mm256_load_ps( (float *)(phase_vec + 4) ); - z2 = _mm256_load_ps( (float *)(phase_vec + 8) ); - z3 = _mm256_load_ps( (float *)(phase_vec + 12) ); + z0 = _mm256_load_ps((float*)phase_vec); + z1 = _mm256_load_ps((float*)(phase_vec + 4)); + z2 = _mm256_load_ps((float*)(phase_vec + 8)); + z3 = _mm256_load_ps((float*)(phase_vec + 12)); - lv_32fc_t dz = phase_inc; dz *= dz; dz *= dz; dz *= dz; dz *= dz; // dz = phase_inc^16; + lv_32fc_t dz = phase_inc; + dz *= dz; + dz *= dz; + dz *= dz; + dz *= dz; // dz = phase_inc^16; - for( vec_ind = 0; vec_ind < 4; ++vec_ind ) + for (vec_ind = 0; vec_ind < 4; ++vec_ind) { phase_vec[vec_ind] = dz; } - __m256 dz_reg = _mm256_load_ps( (float *)phase_vec ); - dz_reg = _mm256_complexnormalise_ps( dz_reg ); + __m256 dz_reg = _mm256_load_ps((float*)phase_vec); + dz_reg = _mm256_complexnormalise_ps(dz_reg); - for(;number < sixteenthPoints; number++) + for (; number < sixteenthPoints; number++) { a0Val = _mm256_loadu_ps(aPtr); - a1Val = _mm256_loadu_ps(aPtr+8); - a2Val = _mm256_loadu_ps(aPtr+16); - a3Val = _mm256_loadu_ps(aPtr+24); + a1Val = _mm256_loadu_ps(aPtr + 8); + a2Val = _mm256_loadu_ps(aPtr + 16); + a3Val = _mm256_loadu_ps(aPtr + 24); - a0Val = _mm256_complexmul_ps( a0Val, z0 ); - a1Val = _mm256_complexmul_ps( a1Val, z1 ); - a2Val = _mm256_complexmul_ps( a2Val, z2 ); - a3Val = _mm256_complexmul_ps( a3Val, z3 ); + a0Val = _mm256_complexmul_ps(a0Val, z0); + a1Val = _mm256_complexmul_ps(a1Val, z1); + a2Val = _mm256_complexmul_ps(a2Val, z2); + a3Val = _mm256_complexmul_ps(a3Val, z3); - z0 = _mm256_complexmul_ps( z0, dz_reg ); - z1 = _mm256_complexmul_ps( z1, dz_reg ); - z2 = _mm256_complexmul_ps( z2, dz_reg ); - z3 = _mm256_complexmul_ps( z3, dz_reg ); + z0 = _mm256_complexmul_ps(z0, dz_reg); + z1 = _mm256_complexmul_ps(z1, dz_reg); + z2 = _mm256_complexmul_ps(z2, dz_reg); + z3 = _mm256_complexmul_ps(z3, dz_reg); - for( vec_ind = 0; vec_ind < num_a_vectors; ++vec_ind ) + for (vec_ind = 0; vec_ind < num_a_vectors; ++vec_ind) { - x0Val[vec_ind] = _mm256_loadu_ps(bPtr[vec_ind]); // t0|t1|t2|t3|t4|t5|t6|t7 - x1Val[vec_ind] = _mm256_loadu_ps(bPtr[vec_ind]+8); - x0loVal[vec_ind] = _mm256_unpacklo_ps(x0Val[vec_ind], x0Val[vec_ind]); // t0|t0|t1|t1|t4|t4|t5|t5 - x0hiVal[vec_ind] = _mm256_unpackhi_ps(x0Val[vec_ind], x0Val[vec_ind]); // t2|t2|t3|t3|t6|t6|t7|t7 + x0Val[vec_ind] = _mm256_loadu_ps(bPtr[vec_ind]); // t0|t1|t2|t3|t4|t5|t6|t7 + x1Val[vec_ind] = _mm256_loadu_ps(bPtr[vec_ind] + 8); + x0loVal[vec_ind] = _mm256_unpacklo_ps(x0Val[vec_ind], x0Val[vec_ind]); // t0|t0|t1|t1|t4|t4|t5|t5 + 
x0hiVal[vec_ind] = _mm256_unpackhi_ps(x0Val[vec_ind], x0Val[vec_ind]); // t2|t2|t3|t3|t6|t6|t7|t7 x1loVal[vec_ind] = _mm256_unpacklo_ps(x1Val[vec_ind], x1Val[vec_ind]); x1hiVal[vec_ind] = _mm256_unpackhi_ps(x1Val[vec_ind], x1Val[vec_ind]); // TODO: it may be possible to rearrange swizzling to better pipeline data - b0Val[vec_ind] = _mm256_permute2f128_ps(x0loVal[vec_ind], x0hiVal[vec_ind], 0x20); // t0|t0|t1|t1|t2|t2|t3|t3 - b1Val[vec_ind] = _mm256_permute2f128_ps(x0loVal[vec_ind], x0hiVal[vec_ind], 0x31); // t4|t4|t5|t5|t6|t6|t7|t7 + b0Val[vec_ind] = _mm256_permute2f128_ps(x0loVal[vec_ind], x0hiVal[vec_ind], 0x20); // t0|t0|t1|t1|t2|t2|t3|t3 + b1Val[vec_ind] = _mm256_permute2f128_ps(x0loVal[vec_ind], x0hiVal[vec_ind], 0x31); // t4|t4|t5|t5|t6|t6|t7|t7 b2Val[vec_ind] = _mm256_permute2f128_ps(x1loVal[vec_ind], x1hiVal[vec_ind], 0x20); b3Val[vec_ind] = _mm256_permute2f128_ps(x1loVal[vec_ind], x1hiVal[vec_ind], 0x31); @@ -274,43 +279,44 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_ // Force the rotators back onto the unit circle if ((number % 64) == 0) { - z0 = _mm256_complexnormalise_ps( z0 ); - z1 = _mm256_complexnormalise_ps( z1 ); - z2 = _mm256_complexnormalise_ps( z2 ); - z3 = _mm256_complexnormalise_ps( z3 ); + z0 = _mm256_complexnormalise_ps(z0); + z1 = _mm256_complexnormalise_ps(z1); + z2 = _mm256_complexnormalise_ps(z2); + z3 = _mm256_complexnormalise_ps(z3); } aPtr += 32; } - __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4]; + __VOLK_ATTR_ALIGNED(32) + lv_32fc_t dotProductVector[4]; - for( vec_ind = 0; vec_ind < num_a_vectors; ++vec_ind ) + for (vec_ind = 0; vec_ind < num_a_vectors; ++vec_ind) { dotProdVal0[vec_ind] = _mm256_add_ps(dotProdVal0[vec_ind], dotProdVal1[vec_ind]); dotProdVal0[vec_ind] = _mm256_add_ps(dotProdVal0[vec_ind], dotProdVal2[vec_ind]); dotProdVal0[vec_ind] = _mm256_add_ps(dotProdVal0[vec_ind], dotProdVal3[vec_ind]); - _mm256_store_ps((float *)dotProductVector, dotProdVal0[vec_ind]); // Store the results back into the dot product vector + _mm256_store_ps((float*)dotProductVector, dotProdVal0[vec_ind]); // Store the results back into the dot product vector - result[ vec_ind ] = lv_cmake( 0, 0 ); - for( i = 0; i < 4; ++i ) + result[vec_ind] = lv_cmake(0, 0); + for (i = 0; i < 4; ++i) { result[vec_ind] += dotProductVector[i]; } } - z0 = _mm256_complexnormalise_ps( z0 ); + z0 = _mm256_complexnormalise_ps(z0); _mm256_store_ps((float*)phase_vec, z0); _phase = phase_vec[0]; _mm256_zeroupper(); - number = sixteenthPoints*16; - for(;number < num_points; number++) + number = sixteenthPoints * 16; + for (; number < num_points; number++) { - wo = (*aPtr++)*_phase; + wo = (*aPtr++) * _phase; _phase *= phase_inc; - for( vec_ind = 0; vec_ind < num_a_vectors; ++vec_ind ) + for (vec_ind = 0; vec_ind < num_a_vectors; ++vec_ind) { result[vec_ind] += wo * in_a[vec_ind][number]; } @@ -333,8 +339,8 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_ const unsigned int sixteenthPoints = num_points / 16; const float* aPtr = (float*)in_common; - const float* bPtr[ num_a_vectors]; - for( vec_ind = 0; vec_ind < num_a_vectors; ++vec_ind ) + const float* bPtr[num_a_vectors]; + for (vec_ind = 0; vec_ind < num_a_vectors; ++vec_ind) { bPtr[vec_ind] = in_a[vec_ind]; } @@ -352,7 +358,7 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_ __m256 dotProdVal2[num_a_vectors]; __m256 dotProdVal3[num_a_vectors]; - for( vec_ind = 0; vec_ind < num_a_vectors; vec_ind++ ) + for (vec_ind = 0; vec_ind < 
num_a_vectors; vec_ind++) { dotProdVal0[vec_ind] = _mm256_setzero_ps(); dotProdVal1[vec_ind] = _mm256_setzero_ps(); @@ -362,58 +368,62 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_ // Set up the complex rotator __m256 z0, z1, z2, z3; - __VOLK_ATTR_ALIGNED(32) lv_32fc_t phase_vec[16]; - for( vec_ind = 0; vec_ind < 16; ++vec_ind ) + __VOLK_ATTR_ALIGNED(32) + lv_32fc_t phase_vec[16]; + for (vec_ind = 0; vec_ind < 16; ++vec_ind) { phase_vec[vec_ind] = _phase; _phase *= phase_inc; } - z0 = _mm256_load_ps( (float *)phase_vec ); - z1 = _mm256_load_ps( (float *)(phase_vec + 4) ); - z2 = _mm256_load_ps( (float *)(phase_vec + 8) ); - z3 = _mm256_load_ps( (float *)(phase_vec + 12) ); + z0 = _mm256_load_ps((float*)phase_vec); + z1 = _mm256_load_ps((float*)(phase_vec + 4)); + z2 = _mm256_load_ps((float*)(phase_vec + 8)); + z3 = _mm256_load_ps((float*)(phase_vec + 12)); - lv_32fc_t dz = phase_inc; dz *= dz; dz *= dz; dz *= dz; dz *= dz; // dz = phase_inc^16; + lv_32fc_t dz = phase_inc; + dz *= dz; + dz *= dz; + dz *= dz; + dz *= dz; // dz = phase_inc^16; - for( vec_ind = 0; vec_ind < 4; ++vec_ind ) + for (vec_ind = 0; vec_ind < 4; ++vec_ind) { phase_vec[vec_ind] = dz; } - __m256 dz_reg = _mm256_load_ps( (float *)phase_vec ); - dz_reg = _mm256_complexnormalise_ps( dz_reg ); + __m256 dz_reg = _mm256_load_ps((float*)phase_vec); + dz_reg = _mm256_complexnormalise_ps(dz_reg); - for(;number < sixteenthPoints; number++) + for (; number < sixteenthPoints; number++) { - a0Val = _mm256_load_ps(aPtr); - a1Val = _mm256_load_ps(aPtr+8); - a2Val = _mm256_load_ps(aPtr+16); - a3Val = _mm256_load_ps(aPtr+24); + a0Val = _mm256_load_ps(aPtr); + a1Val = _mm256_load_ps(aPtr + 8); + a2Val = _mm256_load_ps(aPtr + 16); + a3Val = _mm256_load_ps(aPtr + 24); - a0Val = _mm256_complexmul_ps( a0Val, z0 ); - a1Val = _mm256_complexmul_ps( a1Val, z1 ); - a2Val = _mm256_complexmul_ps( a2Val, z2 ); - a3Val = _mm256_complexmul_ps( a3Val, z3 ); + a0Val = _mm256_complexmul_ps(a0Val, z0); + a1Val = _mm256_complexmul_ps(a1Val, z1); + a2Val = _mm256_complexmul_ps(a2Val, z2); + a3Val = _mm256_complexmul_ps(a3Val, z3); - z0 = _mm256_complexmul_ps( z0, dz_reg ); - z1 = _mm256_complexmul_ps( z1, dz_reg ); - z2 = _mm256_complexmul_ps( z2, dz_reg ); - z3 = _mm256_complexmul_ps( z3, dz_reg ); + z0 = _mm256_complexmul_ps(z0, dz_reg); + z1 = _mm256_complexmul_ps(z1, dz_reg); + z2 = _mm256_complexmul_ps(z2, dz_reg); + z3 = _mm256_complexmul_ps(z3, dz_reg); - for( vec_ind = 0; vec_ind < num_a_vectors; ++vec_ind ) + for (vec_ind = 0; vec_ind < num_a_vectors; ++vec_ind) { - x0Val[vec_ind] = _mm256_loadu_ps(bPtr[vec_ind]); // t0|t1|t2|t3|t4|t5|t6|t7 - x1Val[vec_ind] = _mm256_loadu_ps(bPtr[vec_ind]+8); - x0loVal[vec_ind] = _mm256_unpacklo_ps(x0Val[vec_ind], x0Val[vec_ind]); // t0|t0|t1|t1|t4|t4|t5|t5 - x0hiVal[vec_ind] = _mm256_unpackhi_ps(x0Val[vec_ind], x0Val[vec_ind]); // t2|t2|t3|t3|t6|t6|t7|t7 + x0Val[vec_ind] = _mm256_loadu_ps(bPtr[vec_ind]); // t0|t1|t2|t3|t4|t5|t6|t7 + x1Val[vec_ind] = _mm256_loadu_ps(bPtr[vec_ind] + 8); + x0loVal[vec_ind] = _mm256_unpacklo_ps(x0Val[vec_ind], x0Val[vec_ind]); // t0|t0|t1|t1|t4|t4|t5|t5 + x0hiVal[vec_ind] = _mm256_unpackhi_ps(x0Val[vec_ind], x0Val[vec_ind]); // t2|t2|t3|t3|t6|t6|t7|t7 x1loVal[vec_ind] = _mm256_unpacklo_ps(x1Val[vec_ind], x1Val[vec_ind]); x1hiVal[vec_ind] = _mm256_unpackhi_ps(x1Val[vec_ind], x1Val[vec_ind]); // TODO: it may be possible to rearrange swizzling to better pipeline data - b0Val[vec_ind] = _mm256_permute2f128_ps(x0loVal[vec_ind], x0hiVal[vec_ind], 0x20); // t0|t0|t1|t1|t2|t2|t3|t3 -
b1Val[vec_ind] = _mm256_permute2f128_ps(x0loVal[vec_ind], x0hiVal[vec_ind], 0x31); // t4|t4|t5|t5|t6|t6|t7|t7 + b0Val[vec_ind] = _mm256_permute2f128_ps(x0loVal[vec_ind], x0hiVal[vec_ind], 0x20); // t0|t0|t1|t1|t2|t2|t3|t3 + b1Val[vec_ind] = _mm256_permute2f128_ps(x0loVal[vec_ind], x0hiVal[vec_ind], 0x31); // t4|t4|t5|t5|t6|t6|t7|t7 b2Val[vec_ind] = _mm256_permute2f128_ps(x1loVal[vec_ind], x1hiVal[vec_ind], 0x20); b3Val[vec_ind] = _mm256_permute2f128_ps(x1loVal[vec_ind], x1hiVal[vec_ind], 0x31); @@ -433,43 +443,44 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_ // Force the rotators back onto the unit circle if ((number % 64) == 0) { - z0 = _mm256_complexnormalise_ps( z0 ); - z1 = _mm256_complexnormalise_ps( z1 ); - z2 = _mm256_complexnormalise_ps( z2 ); - z3 = _mm256_complexnormalise_ps( z3 ); + z0 = _mm256_complexnormalise_ps(z0); + z1 = _mm256_complexnormalise_ps(z1); + z2 = _mm256_complexnormalise_ps(z2); + z3 = _mm256_complexnormalise_ps(z3); } aPtr += 32; } - __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4]; + __VOLK_ATTR_ALIGNED(32) + lv_32fc_t dotProductVector[4]; - for( vec_ind = 0; vec_ind < num_a_vectors; ++vec_ind ) + for (vec_ind = 0; vec_ind < num_a_vectors; ++vec_ind) { dotProdVal0[vec_ind] = _mm256_add_ps(dotProdVal0[vec_ind], dotProdVal1[vec_ind]); dotProdVal0[vec_ind] = _mm256_add_ps(dotProdVal0[vec_ind], dotProdVal2[vec_ind]); dotProdVal0[vec_ind] = _mm256_add_ps(dotProdVal0[vec_ind], dotProdVal3[vec_ind]); - _mm256_store_ps((float *)dotProductVector, dotProdVal0[vec_ind]); // Store the results back into the dot product vector + _mm256_store_ps((float*)dotProductVector, dotProdVal0[vec_ind]); // Store the results back into the dot product vector - result[ vec_ind ] = lv_cmake( 0, 0 ); - for( i = 0; i < 4; ++i ) + result[vec_ind] = lv_cmake(0, 0); + for (i = 0; i < 4; ++i) { result[vec_ind] += dotProductVector[i]; } } - z0 = _mm256_complexnormalise_ps( z0 ); + z0 = _mm256_complexnormalise_ps(z0); _mm256_store_ps((float*)phase_vec, z0); _phase = phase_vec[0]; _mm256_zeroupper(); - number = sixteenthPoints*16; - for(;number < num_points; number++) + number = sixteenthPoints * 16; + for (; number < num_points; number++) { - wo = (*aPtr++)*_phase; + wo = (*aPtr++) * _phase; _phase *= phase_inc; - for( vec_ind = 0; vec_ind < num_a_vectors; ++vec_ind ) + for (vec_ind = 0; vec_ind < num_a_vectors; ++vec_ind) { result[vec_ind] += wo * in_a[vec_ind][number]; } @@ -482,5 +493,3 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_ #endif /* LV_HAVE_AVX */ #endif /* INCLUDED_volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_H */ - - diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc.h index ca684e30b..0804dd651 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc.h @@ -42,7 +42,7 @@ #ifdef LV_HAVE_GENERIC -static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_generic(lv_32fc_t* result, const lv_32fc_t* local_code, const float* in, unsigned int num_points) +static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_generic(lv_32fc_t* result, const lv_32fc_t* 
local_code, const float* in, unsigned int num_points) { // phases must be normalized. Phase rotator expects a complex exponential input! float rem_carrier_phase_in_rad = 0.25; @@ -53,15 +53,15 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_generic(lv phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad)); unsigned int n; int num_a_vectors = 3; - float ** in_a = (float **)volk_gnsssdr_malloc(sizeof(float *) * num_a_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_a_vectors; n++) + float** in_a = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_a_vectors, volk_gnsssdr_get_alignment()); + for (n = 0; n < num_a_vectors; n++) { - in_a[n] = (float *)volk_gnsssdr_malloc(sizeof(float ) * num_points, volk_gnsssdr_get_alignment()); + in_a[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment()); memcpy((float*)in_a[n], (float*)in, sizeof(float) * num_points); } - volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic(result, local_code, phase_inc[0], phase, (const float**) in_a, num_a_vectors, num_points); + volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic(result, local_code, phase_inc[0], phase, (const float**)in_a, num_a_vectors, num_points); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { volk_gnsssdr_free(in_a[n]); } @@ -71,7 +71,7 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_generic(lv #ifdef LV_HAVE_GENERIC -static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_generic_reload(lv_32fc_t* result, const lv_32fc_t* local_code, const float* in, unsigned int num_points) +static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_generic_reload(lv_32fc_t* result, const lv_32fc_t* local_code, const float* in, unsigned int num_points) { // phases must be normalized. Phase rotator expects a complex exponential input! 
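/*
 * Context for the AVX rotator kernels above: they advance sixteen phase
 * samples per iteration, so the per-iteration rotator step is phase_inc^16,
 * built with four squarings (dz *= dz, four times) rather than fifteen
 * multiplies. Since squaring also compounds the rounding error, the kernels
 * pull z0..z3 back onto the unit circle every 64 iterations. A scalar sketch
 * of the step computation, using C99 complex:
 */
#include <complex.h>
static inline float complex phase_inc_pow16_sketch(float complex phase_inc)
{
    float complex dz = phase_inc;
    dz *= dz;              /* phase_inc^2 */
    dz *= dz;              /* phase_inc^4 */
    dz *= dz;              /* phase_inc^8 */
    dz *= dz;              /* phase_inc^16 */
    return dz / cabsf(dz); /* renormalise: |dz| drifts away from 1 */
}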
float rem_carrier_phase_in_rad = 0.25; @@ -82,15 +82,15 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_generic_re phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad)); unsigned int n; int num_a_vectors = 3; - float ** in_a = (float **)volk_gnsssdr_malloc(sizeof(float *) * num_a_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_a_vectors; n++) + float** in_a = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_a_vectors, volk_gnsssdr_get_alignment()); + for (n = 0; n < num_a_vectors; n++) { - in_a[n] = (float *)volk_gnsssdr_malloc(sizeof(float ) * num_points, volk_gnsssdr_get_alignment()); + in_a[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment()); memcpy((float*)in_a[n], (float*)in, sizeof(float) * num_points); } - volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic_reload(result, local_code, phase_inc[0], phase, (const float**) in_a, num_a_vectors, num_points); + volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic_reload(result, local_code, phase_inc[0], phase, (const float**)in_a, num_a_vectors, num_points); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { volk_gnsssdr_free(in_a[n]); } @@ -100,7 +100,7 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_generic_re #endif // Generic #ifdef LV_HAVE_AVX -static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_u_avx(lv_32fc_t* result, const lv_32fc_t* local_code, const float* in, unsigned int num_points) +static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_u_avx(lv_32fc_t* result, const lv_32fc_t* local_code, const float* in, unsigned int num_points) { // phases must be normalized. Phase rotator expects a complex exponential input! float rem_carrier_phase_in_rad = 0.25; @@ -111,15 +111,15 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_u_avx(lv_3 phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad)); unsigned int n; int num_a_vectors = 3; - float ** in_a = (float **)volk_gnsssdr_malloc(sizeof(float *) * num_a_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_a_vectors; n++) + float** in_a = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_a_vectors, volk_gnsssdr_get_alignment()); + for (n = 0; n < num_a_vectors; n++) { - in_a[n] = (float *)volk_gnsssdr_malloc(sizeof(float ) * num_points, volk_gnsssdr_get_alignment()); + in_a[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment()); memcpy((float*)in_a[n], (float*)in, sizeof(float) * num_points); } - volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_u_avx(result, local_code, phase_inc[0], phase, (const float**) in_a, num_a_vectors, num_points); + volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_u_avx(result, local_code, phase_inc[0], phase, (const float**)in_a, num_a_vectors, num_points); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { volk_gnsssdr_free(in_a[n]); } @@ -130,7 +130,7 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_u_avx(lv_3 #ifdef LV_HAVE_AVX -static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_a_avx(lv_32fc_t* result, const lv_32fc_t* local_code, const float* in, unsigned int num_points) +static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_a_avx(lv_32fc_t* result, const lv_32fc_t* local_code, const float* in, unsigned int num_points) { // phases must be normalized. Phase rotator expects a complex exponential input! 
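/*
 * The operation every puppet wrapper in this file ultimately exercises: a dot
 * product of one common complex input, rotated by a running phasor, against
 * several real tap vectors. A plain C99 sketch (a hedged reconstruction of
 * the generic kernel, including its periodic phase renormalisation every 256
 * samples):
 */
#include <complex.h>
static inline void rotator_dot_prod_sketch(float complex* result, const float complex* in_common,
    float complex phase_inc, float complex* phase,
    const float** in_a, int num_a_vectors, unsigned int num_points)
{
    int n_vec;
    unsigned int n;
    for (n_vec = 0; n_vec < num_a_vectors; n_vec++) result[n_vec] = 0;
    for (n = 0; n < num_points; n++)
        {
            float complex tmp = in_common[n] * (*phase);   /* rotate the common input */
            if (n % 256 == 0) (*phase) /= cabsf(*phase);   /* keep the rotator on the unit circle */
            (*phase) *= phase_inc;
            for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
                {
                    result[n_vec] += tmp * in_a[n_vec][n]; /* one accumulator per tap */
                }
        }
}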
float rem_carrier_phase_in_rad = 0.25; @@ -141,15 +141,15 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_a_avx(lv_3 phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad)); unsigned int n; int num_a_vectors = 3; - float ** in_a = (float **)volk_gnsssdr_malloc(sizeof(float *) * num_a_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_a_vectors; n++) + float** in_a = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_a_vectors, volk_gnsssdr_get_alignment()); + for (n = 0; n < num_a_vectors; n++) { - in_a[n] = (float *)volk_gnsssdr_malloc(sizeof(float ) * num_points, volk_gnsssdr_get_alignment()); + in_a[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment()); memcpy((float*)in_a[n], (float*)in, sizeof(float) * num_points); } - volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_a_avx(result, local_code, phase_inc[0], phase, (const float**) in_a, num_a_vectors, num_points); + volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_a_avx(result, local_code, phase_inc[0], phase, (const float**)in_a, num_a_vectors, num_points); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { volk_gnsssdr_free(in_a[n]); } @@ -159,4 +159,3 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_a_avx(lv_3 #endif // AVX #endif // INCLUDED_volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_H - diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_16ic.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_16ic.h index b04a93c4b..892a7c0e8 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_16ic.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_16ic.h @@ -80,10 +80,12 @@ static inline void volk_gnsssdr_32fc_convert_16ic_u_sse2(lv_16sc_t* outputVector const __m128 vmin_val = _mm_set_ps1(min_val); const __m128 vmax_val = _mm_set_ps1(max_val); - for(i = 0; i < sse_iters; i++) + for (i = 0; i < sse_iters; i++) { - inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; - inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; + inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); + inputVectorPtr += 4; + inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); + inputVectorPtr += 4; __VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 8); // Clip @@ -99,12 +101,12 @@ static inline void volk_gnsssdr_32fc_convert_16ic_u_sse2(lv_16sc_t* outputVector outputVectorPtr += 8; } - for(i = sse_iters * 8; i < num_points * 2; i++) + for (i = sse_iters * 8; i < num_points * 2; i++) { aux = *inputVectorPtr++; - if(aux > max_val) + if (aux > max_val) aux = max_val; - else if(aux < min_val) + else if (aux < min_val) aux = min_val; *outputVectorPtr++ = (int16_t)rintf(aux); } @@ -128,15 +130,17 @@ static inline void volk_gnsssdr_32fc_convert_16ic_u_sse(lv_16sc_t* outputVector, const float max_val = (float)SHRT_MAX; __m128 inputVal1, inputVal2; - __m128i intInputVal1, intInputVal2; // is __m128i defined in xmmintrin.h? + __m128i intInputVal1, intInputVal2; // is __m128i defined in xmmintrin.h? 
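Every volk_gnsssdr_32fc_convert_16ic_* variant reformatted in this hunk implements the contract that its scalar tail loop spells out: treat the complex input as 2 * num_points consecutive floats, clip each to [SHRT_MIN, SHRT_MAX], and round to nearest with rintf. A standalone scalar reference of that contract, assuming only standard headers:

#include <limits.h>
#include <math.h>
#include <stdint.h>

/* Scalar reference for the 32fc -> 16ic conversion: a complex sample is
 * two consecutive floats, hence the num_points * 2 bound in every tail
 * loop of this file.  The SIMD paths clip the same way with the
 * vmin_val/vmax_val registers before converting to integers. */
static void convert_32f_to_16i_scalar(int16_t* out, const float* in,
    unsigned int num_points)
{
    const float min_val = (float)SHRT_MIN;
    const float max_val = (float)SHRT_MAX;
    unsigned int i;
    for (i = 0; i < num_points * 2; i++)
        {
            float aux = in[i];
            if (aux > max_val)
                aux = max_val;
            else if (aux < min_val)
                aux = min_val;
            out[i] = (int16_t)rintf(aux);
        }
}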
__m128 ret1, ret2; const __m128 vmin_val = _mm_set_ps1(min_val); const __m128 vmax_val = _mm_set_ps1(max_val); - for(i = 0;i < sse_iters; i++) + for (i = 0; i < sse_iters; i++) { - inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; - inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; + inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); + inputVectorPtr += 4; + inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); + inputVectorPtr += 4; __VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 8); // Clip @@ -152,12 +156,12 @@ static inline void volk_gnsssdr_32fc_convert_16ic_u_sse(lv_16sc_t* outputVector, outputVectorPtr += 8; } - for(i = sse_iters * 8; i < num_points*2; i++) + for (i = sse_iters * 8; i < num_points * 2; i++) { aux = *inputVectorPtr++; - if(aux > max_val) + if (aux > max_val) aux = max_val; - else if(aux < min_val) + else if (aux < min_val) aux = min_val; *outputVectorPtr++ = (int16_t)rintf(aux); } @@ -175,7 +179,7 @@ static inline void volk_gnsssdr_32fc_convert_16ic_u_avx2(lv_16sc_t* outputVector int16_t* outputVectorPtr = (int16_t*)outputVector; float aux; unsigned int i; - const float min_val = (float)SHRT_MIN; ///todo Something off here, compiler does not perform right cast + const float min_val = (float)SHRT_MIN; ///todo Something off here, compiler does not perform right cast const float max_val = (float)SHRT_MAX; __m256 inputVal1, inputVal2; @@ -184,10 +188,12 @@ static inline void volk_gnsssdr_32fc_convert_16ic_u_avx2(lv_16sc_t* outputVector const __m256 vmin_val = _mm256_set1_ps(min_val); const __m256 vmax_val = _mm256_set1_ps(max_val); - for(i = 0; i < avx2_iters; i++) + for (i = 0; i < avx2_iters; i++) { - inputVal1 = _mm256_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 8; - inputVal2 = _mm256_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 8; + inputVal1 = _mm256_loadu_ps((float*)inputVectorPtr); + inputVectorPtr += 8; + inputVal2 = _mm256_loadu_ps((float*)inputVectorPtr); + inputVectorPtr += 8; __VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 16); // Clip @@ -204,12 +210,12 @@ static inline void volk_gnsssdr_32fc_convert_16ic_u_avx2(lv_16sc_t* outputVector outputVectorPtr += 16; } - for(i = avx2_iters * 16; i < num_points * 2; i++) + for (i = avx2_iters * 16; i < num_points * 2; i++) { aux = *inputVectorPtr++; - if(aux > max_val) + if (aux > max_val) aux = max_val; - else if(aux < min_val) + else if (aux < min_val) aux = min_val; *outputVectorPtr++ = (int16_t)rintf(aux); } @@ -238,10 +244,12 @@ static inline void volk_gnsssdr_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector const __m128 vmin_val = _mm_set_ps1(min_val); const __m128 vmax_val = _mm_set_ps1(max_val); - for(i = 0; i < sse_iters; i++) + for (i = 0; i < sse_iters; i++) { - inputVal1 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; - inputVal2 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; + inputVal1 = _mm_load_ps((float*)inputVectorPtr); + inputVectorPtr += 4; + inputVal2 = _mm_load_ps((float*)inputVectorPtr); + inputVectorPtr += 4; __VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 8); // Clip @@ -257,12 +265,12 @@ static inline void volk_gnsssdr_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector outputVectorPtr += 8; } - for(i = sse_iters * 8; i < num_points * 2; i++) + for (i = sse_iters * 8; i < num_points * 2; i++) { aux = *inputVectorPtr++; - if(aux > max_val) + if (aux > max_val) aux = max_val; - else if(aux < min_val) + else if (aux < min_val) aux = min_val; *outputVectorPtr++ = (int16_t)rintf(aux); } @@ -289,10 +297,12 @@ static inline void 
volk_gnsssdr_32fc_convert_16ic_a_sse(lv_16sc_t* outputVector, const __m128 vmin_val = _mm_set_ps1(min_val); const __m128 vmax_val = _mm_set_ps1(max_val); - for(i = 0; i < sse_iters; i++) + for (i = 0; i < sse_iters; i++) { - inputVal1 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; - inputVal2 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; + inputVal1 = _mm_load_ps((float*)inputVectorPtr); + inputVectorPtr += 4; + inputVal2 = _mm_load_ps((float*)inputVectorPtr); + inputVectorPtr += 4; __VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 8); // Clip @@ -308,12 +318,12 @@ static inline void volk_gnsssdr_32fc_convert_16ic_a_sse(lv_16sc_t* outputVector, outputVectorPtr += 8; } - for(i = sse_iters * 8; i < num_points * 2; i++) + for (i = sse_iters * 8; i < num_points * 2; i++) { aux = *inputVectorPtr++; - if(aux > max_val) + if (aux > max_val) aux = max_val; - else if(aux < min_val) + else if (aux < min_val) aux = min_val; *outputVectorPtr++ = (int16_t)rintf(aux); } @@ -332,7 +342,7 @@ static inline void volk_gnsssdr_32fc_convert_16ic_a_avx2(lv_16sc_t* outputVector int16_t* outputVectorPtr = (int16_t*)outputVector; float aux; unsigned int i; - const float min_val = (float)SHRT_MIN; ///todo Something off here, compiler does not perform right cast + const float min_val = (float)SHRT_MIN; ///todo Something off here, compiler does not perform right cast const float max_val = (float)SHRT_MAX; __m256 inputVal1, inputVal2; @@ -341,10 +351,12 @@ static inline void volk_gnsssdr_32fc_convert_16ic_a_avx2(lv_16sc_t* outputVector const __m256 vmin_val = _mm256_set1_ps(min_val); const __m256 vmax_val = _mm256_set1_ps(max_val); - for(i = 0; i < avx2_iters; i++) + for (i = 0; i < avx2_iters; i++) { - inputVal1 = _mm256_load_ps((float*)inputVectorPtr); inputVectorPtr += 8; - inputVal2 = _mm256_load_ps((float*)inputVectorPtr); inputVectorPtr += 8; + inputVal1 = _mm256_load_ps((float*)inputVectorPtr); + inputVectorPtr += 8; + inputVal2 = _mm256_load_ps((float*)inputVectorPtr); + inputVectorPtr += 8; __VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 16); // Clip @@ -361,12 +373,12 @@ static inline void volk_gnsssdr_32fc_convert_16ic_a_avx2(lv_16sc_t* outputVector outputVectorPtr += 16; } - for(i = avx2_iters * 16; i < num_points * 2; i++) + for (i = avx2_iters * 16; i < num_points * 2; i++) { aux = *inputVectorPtr++; - if(aux > max_val) + if (aux > max_val) aux = max_val; - else if(aux < min_val) + else if (aux < min_val) aux = min_val; *outputVectorPtr++ = (int16_t)rintf(aux); } @@ -397,10 +409,12 @@ static inline void volk_gnsssdr_32fc_convert_16ic_neon(lv_16sc_t* outputVector, int16x4_t intInputVal1, intInputVal2; int16x8_t res; - for(i = 0; i < neon_iters; i++) + for (i = 0; i < neon_iters; i++) { - a = vld1q_f32((const float32_t*)(inputVectorPtr)); inputVectorPtr += 4; - b = vld1q_f32((const float32_t*)(inputVectorPtr)); inputVectorPtr += 4; + a = vld1q_f32((const float32_t*)(inputVectorPtr)); + inputVectorPtr += 4; + b = vld1q_f32((const float32_t*)(inputVectorPtr)); + inputVectorPtr += 4; __VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 8); ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val); @@ -425,12 +439,12 @@ static inline void volk_gnsssdr_32fc_convert_16ic_neon(lv_16sc_t* outputVector, outputVectorPtr += 8; } - for(i = neon_iters * 8; i < num_points * 2; i++) + for (i = neon_iters * 8; i < num_points * 2; i++) { aux = *inputVectorPtr++; - if(aux > max_val_f) + if (aux > max_val_f) aux = max_val_f; - else if(aux < min_val_f) + else if (aux < min_val_f) aux = min_val_f; *outputVectorPtr++ = 
(int16_t)rintf(aux); } @@ -449,14 +463,14 @@ static inline void volk_gnsssdr_32fc_convert_16ic_generic(lv_16sc_t* outputVecto const float max_val = (float)SHRT_MAX; float aux; unsigned int i; - for(i = 0; i < num_points * 2; i++) + for (i = 0; i < num_points * 2; i++) { aux = *inputVectorPtr++; - if(aux > max_val) + if (aux > max_val) aux = max_val; - else if(aux < min_val) + else if (aux < min_val) aux = min_val; - *outputVectorPtr++ = (int16_t)rintf(aux); + *outputVectorPtr++ = (int16_t)rintf(aux); } } #endif /* LV_HAVE_GENERIC */ diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_8ic.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_8ic.h old mode 100755 new mode 100644 index ca5f13f22..ab8d32e32 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_8ic.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_8ic.h @@ -72,12 +72,12 @@ static inline void volk_gnsssdr_32fc_convert_8ic_generic(lv_8sc_t* outputVector, const float max_val = (float)SCHAR_MAX; float aux; unsigned int i; - for(i = 0; i < num_points * 2; i++) + for (i = 0; i < num_points * 2; i++) { aux = *inputVectorPtr++ * max_val; - if(aux > max_val) + if (aux > max_val) aux = max_val; - else if(aux < min_val) + else if (aux < min_val) aux = min_val; *outputVectorPtr++ = (int8_t)rintf(aux); } @@ -107,12 +107,16 @@ static inline void volk_gnsssdr_32fc_convert_8ic_u_avx2(lv_8sc_t* outputVector, const __m256 vmin_val = _mm256_set1_ps(min_val); const __m256 vmax_val = _mm256_set1_ps(max_val); - for(i = 0; i < avx2_iters; i++) + for (i = 0; i < avx2_iters; i++) { - inputVal1 = _mm256_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 8; - inputVal2 = _mm256_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 8; - inputVal3 = _mm256_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 8; - inputVal4 = _mm256_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 8; + inputVal1 = _mm256_loadu_ps((float*)inputVectorPtr); + inputVectorPtr += 8; + inputVal2 = _mm256_loadu_ps((float*)inputVectorPtr); + inputVectorPtr += 8; + inputVal3 = _mm256_loadu_ps((float*)inputVectorPtr); + inputVectorPtr += 8; + inputVal4 = _mm256_loadu_ps((float*)inputVectorPtr); + inputVectorPtr += 8; __VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 32); inputVal1 = _mm256_mul_ps(inputVal1, vmax_val); @@ -142,12 +146,12 @@ static inline void volk_gnsssdr_32fc_convert_8ic_u_avx2(lv_8sc_t* outputVector, outputVectorPtr += 32; } - for(i = avx2_iters * 32; i < num_points * 2; i++) + for (i = avx2_iters * 32; i < num_points * 2; i++) { aux = *inputVectorPtr++ * max_val; - if(aux > max_val) + if (aux > max_val) aux = max_val; - else if(aux < min_val) + else if (aux < min_val) aux = min_val; *outputVectorPtr++ = (int8_t)rintf(aux); } @@ -177,12 +181,16 @@ static inline void volk_gnsssdr_32fc_convert_8ic_a_avx2(lv_8sc_t* outputVector, const __m256 vmin_val = _mm256_set1_ps(min_val); const __m256 vmax_val = _mm256_set1_ps(max_val); - for(i = 0; i < avx2_iters; i++) + for (i = 0; i < avx2_iters; i++) { - inputVal1 = _mm256_load_ps((float*)inputVectorPtr); inputVectorPtr += 8; - inputVal2 = _mm256_load_ps((float*)inputVectorPtr); inputVectorPtr += 8; - inputVal3 = _mm256_load_ps((float*)inputVectorPtr); inputVectorPtr += 8; - inputVal4 = _mm256_load_ps((float*)inputVectorPtr); inputVectorPtr += 8; + inputVal1 = _mm256_load_ps((float*)inputVectorPtr); + inputVectorPtr 
+= 8; + inputVal2 = _mm256_load_ps((float*)inputVectorPtr); + inputVectorPtr += 8; + inputVal3 = _mm256_load_ps((float*)inputVectorPtr); + inputVectorPtr += 8; + inputVal4 = _mm256_load_ps((float*)inputVectorPtr); + inputVectorPtr += 8; __VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 32); inputVal1 = _mm256_mul_ps(inputVal1, vmax_val); @@ -212,12 +220,12 @@ static inline void volk_gnsssdr_32fc_convert_8ic_a_avx2(lv_8sc_t* outputVector, outputVectorPtr += 32; } - for(i = avx2_iters * 32; i < num_points * 2; i++) + for (i = avx2_iters * 32; i < num_points * 2; i++) { aux = *inputVectorPtr++ * max_val; - if(aux > max_val) + if (aux > max_val) aux = max_val; - else if(aux < min_val) + else if (aux < min_val) aux = min_val; *outputVectorPtr++ = (int8_t)rintf(aux); } @@ -247,12 +255,16 @@ static inline void volk_gnsssdr_32fc_convert_8ic_u_sse2(lv_8sc_t* outputVector, const __m128 vmin_val = _mm_set_ps1(min_val); const __m128 vmax_val = _mm_set_ps1(max_val); - for(i = 0; i < sse_iters; i++) + for (i = 0; i < sse_iters; i++) { - inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; - inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; - inputVal3 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; - inputVal4 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; + inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); + inputVectorPtr += 4; + inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); + inputVectorPtr += 4; + inputVal3 = _mm_loadu_ps((float*)inputVectorPtr); + inputVectorPtr += 4; + inputVal4 = _mm_loadu_ps((float*)inputVectorPtr); + inputVectorPtr += 4; inputVal1 = _mm_mul_ps(inputVal1, vmax_val); inputVal2 = _mm_mul_ps(inputVal2, vmax_val); @@ -278,12 +290,12 @@ static inline void volk_gnsssdr_32fc_convert_8ic_u_sse2(lv_8sc_t* outputVector, outputVectorPtr += 16; } - for(i = sse_iters * 16; i < num_points * 2; i++) + for (i = sse_iters * 16; i < num_points * 2; i++) { aux = *inputVectorPtr++ * max_val; - if(aux > max_val) + if (aux > max_val) aux = max_val; - else if(aux < min_val) + else if (aux < min_val) aux = min_val; *outputVectorPtr++ = (int8_t)rintf(aux); } @@ -313,12 +325,16 @@ static inline void volk_gnsssdr_32fc_convert_8ic_a_sse2(lv_8sc_t* outputVector, const __m128 vmin_val = _mm_set_ps1(min_val); const __m128 vmax_val = _mm_set_ps1(max_val); - for(i = 0; i < sse_iters; i++) + for (i = 0; i < sse_iters; i++) { - inputVal1 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; - inputVal2 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; - inputVal3 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; - inputVal4 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; + inputVal1 = _mm_load_ps((float*)inputVectorPtr); + inputVectorPtr += 4; + inputVal2 = _mm_load_ps((float*)inputVectorPtr); + inputVectorPtr += 4; + inputVal3 = _mm_load_ps((float*)inputVectorPtr); + inputVectorPtr += 4; + inputVal4 = _mm_load_ps((float*)inputVectorPtr); + inputVectorPtr += 4; inputVal1 = _mm_mul_ps(inputVal1, vmax_val); inputVal2 = _mm_mul_ps(inputVal2, vmax_val); @@ -344,12 +360,12 @@ static inline void volk_gnsssdr_32fc_convert_8ic_a_sse2(lv_8sc_t* outputVector, outputVectorPtr += 16; } - for(i = sse_iters * 16; i < num_points * 2; i++) + for (i = sse_iters * 16; i < num_points * 2; i++) { aux = *inputVectorPtr++ * max_val; - if(aux > max_val) + if (aux > max_val) aux = max_val; - else if(aux < min_val) + else if (aux < min_val) aux = min_val; *outputVectorPtr++ = (int8_t)rintf(aux); } @@ -383,9 +399,10 @@ static inline void 
volk_gnsssdr_32fc_convert_8ic_neon(lv_8sc_t* outputVector, co int8x8_t res8_1, res8_2; int8x16_t outputVal; - for(i = 0; i < neon_iters; i++) + for (i = 0; i < neon_iters; i++) { - a = vld1q_f32((const float32_t*)inputVectorPtr); inputVectorPtr += 4; + a = vld1q_f32((const float32_t*)inputVectorPtr); + inputVectorPtr += 4; a = vmulq_f32(a, max_val); ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val); sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(ret1), 31))); @@ -394,7 +411,8 @@ static inline void volk_gnsssdr_32fc_convert_8ic_neon(lv_8sc_t* outputVector, co toint_a = vcvtq_s32_f32(Round); intInputVal1 = vqmovn_s32(toint_a); - a = vld1q_f32((const float32_t*)inputVectorPtr); inputVectorPtr += 4; + a = vld1q_f32((const float32_t*)inputVectorPtr); + inputVectorPtr += 4; a = vmulq_f32(a, max_val); ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val); sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(ret1), 31))); @@ -406,7 +424,8 @@ static inline void volk_gnsssdr_32fc_convert_8ic_neon(lv_8sc_t* outputVector, co pack16_8_1 = vcombine_s16(intInputVal1, intInputVal2); res8_1 = vqmovn_s16(pack16_8_1); - a = vld1q_f32((const float32_t*)inputVectorPtr); inputVectorPtr += 4; + a = vld1q_f32((const float32_t*)inputVectorPtr); + inputVectorPtr += 4; a = vmulq_f32(a, max_val); ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val); sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(ret1), 31))); @@ -415,7 +434,8 @@ static inline void volk_gnsssdr_32fc_convert_8ic_neon(lv_8sc_t* outputVector, co toint_a = vcvtq_s32_f32(Round); intInputVal1 = vqmovn_s32(toint_a); - a = vld1q_f32((const float32_t*)inputVectorPtr); inputVectorPtr += 4; + a = vld1q_f32((const float32_t*)inputVectorPtr); + inputVectorPtr += 4; a = vmulq_f32(a, max_val); ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val); sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(ret1), 31))); @@ -433,12 +453,12 @@ static inline void volk_gnsssdr_32fc_convert_8ic_neon(lv_8sc_t* outputVector, co outputVectorPtr += 16; } - for(i = neon_iters * 16; i < num_points * 2; i++) + for (i = neon_iters * 16; i < num_points * 2; i++) { aux = *inputVectorPtr++ * max_val_f; - if(aux > max_val_f) + if (aux > max_val_f) aux = max_val_f; - else if(aux < min_val_f) + else if (aux < min_val_f) aux = min_val_f; *outputVectorPtr++ = (int8_t)rintf(aux); } diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_resamplerxnpuppet_32fc.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_resamplerxnpuppet_32fc.h index 9348c09fc..1655b5ccd 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_resamplerxnpuppet_32fc.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_resamplerxnpuppet_32fc.h @@ -42,31 +42,30 @@ #include - #ifdef LV_HAVE_GENERIC static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_generic(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points) { int code_length_chips = 2046; - float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; - float shifts_chips[3] = { -0.1, 0.0, 0.1 }; + float shifts_chips[3] = {-0.1, 0.0, 0.1}; - lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, 
volk_gnsssdr_get_alignment()); - for(n = 0; n < num_out_vectors; n++) - { - result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment()); - } + lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); + for (n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment()); + } volk_gnsssdr_32fc_xn_resampler_32fc_xn_generic(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points); - for(n = 0; n < num_out_vectors; n++) - { - volk_gnsssdr_free(result_aux[n]); - } + for (n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } volk_gnsssdr_free(result_aux); } @@ -78,26 +77,26 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_generic(lv_32fc_t* r static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_a_sse3(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points) { int code_length_chips = 2046; - float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; - float shifts_chips[3] = { -0.1, 0.0, 0.1 }; + float shifts_chips[3] = {-0.1, 0.0, 0.1}; - lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_out_vectors; n++) - { - result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment()); - } + lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); + for (n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment()); + } volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_sse3(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points); - for(n = 0; n < num_out_vectors; n++) - { - volk_gnsssdr_free(result_aux[n]); - } + for (n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } volk_gnsssdr_free(result_aux); } @@ -107,26 +106,26 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_a_sse3(lv_32fc_t* re static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_u_sse3(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points) { int code_length_chips = 2046; - float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; - float shifts_chips[3] = { -0.1, 0.0, 0.1 }; + float shifts_chips[3] = {-0.1, 0.0, 0.1}; - lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_out_vectors; n++) - { - result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment()); - } + lv_32fc_t** result_aux = 
(lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); + for (n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment()); + } volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_sse3(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points); - for(n = 0; n < num_out_vectors; n++) - { - volk_gnsssdr_free(result_aux[n]); - } + for (n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } volk_gnsssdr_free(result_aux); } @@ -137,26 +136,26 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_u_sse3(lv_32fc_t* re static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_u_sse4_1(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points) { int code_length_chips = 2046; - float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; - float shifts_chips[3] = { -0.1, 0.0, 0.1 }; + float shifts_chips[3] = {-0.1, 0.0, 0.1}; - lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_out_vectors; n++) - { - result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment()); - } + lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); + for (n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment()); + } volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_sse4_1(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points); - for(n = 0; n < num_out_vectors; n++) - { - volk_gnsssdr_free(result_aux[n]); - } + for (n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } volk_gnsssdr_free(result_aux); } @@ -166,26 +165,26 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_u_sse4_1(lv_32fc_t* static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_a_sse4_1(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points) { int code_length_chips = 2046; - float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; - float shifts_chips[3] = { -0.1, 0.0, 0.1 }; + float shifts_chips[3] = {-0.1, 0.0, 0.1}; - lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_out_vectors; n++) - { - result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment()); - } + lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); + for (n = 0; n < num_out_vectors; n++) + { + result_aux[n] = 
(lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment()); + } volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_sse4_1(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points); - for(n = 0; n < num_out_vectors; n++) - { - volk_gnsssdr_free(result_aux[n]); - } + for (n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } volk_gnsssdr_free(result_aux); } @@ -195,26 +194,26 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_a_sse4_1(lv_32fc_t* static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_a_avx(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points) { int code_length_chips = 2046; - float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; - float shifts_chips[3] = { -0.1, 0.0, 0.1 }; + float shifts_chips[3] = {-0.1, 0.0, 0.1}; - lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_out_vectors; n++) - { - result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment()); - } + lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); + for (n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment()); + } volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points); - for(n = 0; n < num_out_vectors; n++) - { - volk_gnsssdr_free(result_aux[n]); - } + for (n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } volk_gnsssdr_free(result_aux); } #endif @@ -224,26 +223,26 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_a_avx(lv_32fc_t* res static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_u_avx(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points) { int code_length_chips = 2046; - float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; - float shifts_chips[3] = { -0.1, 0.0, 0.1 }; + float shifts_chips[3] = {-0.1, 0.0, 0.1}; - lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_out_vectors; n++) - { - result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment()); - } + lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); + for (n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment()); + } volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx(result_aux, local_code, rem_code_phase_chips, 
code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points); - for(n = 0; n < num_out_vectors; n++) - { - volk_gnsssdr_free(result_aux[n]); - } + for (n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } volk_gnsssdr_free(result_aux); } #endif @@ -253,26 +252,26 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_u_avx(lv_32fc_t* res static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_a_avx2(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points) { int code_length_chips = 2046; - float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; - float shifts_chips[3] = { -0.1, 0.0, 0.1 }; + float shifts_chips[3] = {-0.1, 0.0, 0.1}; - lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_out_vectors; n++) - { - result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment()); - } + lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); + for (n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment()); + } volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx2(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points); - for(n = 0; n < num_out_vectors; n++) - { - volk_gnsssdr_free(result_aux[n]); - } + for (n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } volk_gnsssdr_free(result_aux); } #endif @@ -282,26 +281,26 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_a_avx2(lv_32fc_t* re static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_u_avx2(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points) { int code_length_chips = 2046; - float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; - float shifts_chips[3] = { -0.1, 0.0, 0.1 }; + float shifts_chips[3] = {-0.1, 0.0, 0.1}; - lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_out_vectors; n++) - { - result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment()); - } + lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); + for (n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment()); + } volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx2(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points); - for(n = 0; n < 
num_out_vectors; n++) - { - volk_gnsssdr_free(result_aux[n]); - } + for (n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } volk_gnsssdr_free(result_aux); } #endif @@ -311,28 +310,28 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_u_avx2(lv_32fc_t* re static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_neon(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points) { int code_length_chips = 2046; - float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; - float shifts_chips[3] = { -0.1, 0.0, 0.1 }; + float shifts_chips[3] = {-0.1, 0.0, 0.1}; - lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_out_vectors; n++) - { - result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment()); - } + lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); + for (n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment()); + } volk_gnsssdr_32fc_xn_resampler_32fc_xn_neon(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points); - for(n = 0; n < num_out_vectors; n++) - { - volk_gnsssdr_free(result_aux[n]); - } + for (n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } volk_gnsssdr_free(result_aux); } #endif -#endif // INCLUDED_volk_gnsssdr_32fc_resamplerpuppet_32fc_H +#endif // INCLUDED_volk_gnsssdr_32fc_resamplerpuppet_32fc_H diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn.h index a25715749..c3c77233a 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn.h @@ -85,11 +85,11 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_generic(lv_32fc unsigned int n; for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - result[n_vec] = lv_cmake(0,0); + result[n_vec] = lv_cmake(0, 0); } for (n = 0; n < num_points; n++) { - tmp32_1 = *in_common++ * (*phase);//if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); + tmp32_1 = *in_common++ * (*phase); //if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); // Regenerate phase if (n % 256 == 0) @@ -126,7 +126,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_generic_reload( unsigned int j; for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - result[n_vec] = lv_cmake(0,0); + result[n_vec] = lv_cmake(0, 0); } for (n = 0; n < num_points / ROTATOR_RELOAD; n++) @@ -141,7 +141,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_generic_reload( result[n_vec] += tmp32_2; } } - /* Regenerate phase */ + 
/* Regenerate phase */ #ifdef __cplusplus (*phase) /= std::abs((*phase)); #else @@ -169,7 +169,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_generic_reload( #include static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_32fc_t** in_a, int num_a_vectors, unsigned int num_points) { - lv_32fc_t dotProduct = lv_cmake(0,0); + lv_32fc_t dotProduct = lv_cmake(0, 0); lv_32fc_t tmp32_1, tmp32_2; const unsigned int sse_iters = num_points / 2; int n_vec; @@ -179,7 +179,8 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(lv_32fc_ const lv_32fc_t** _in_a = in_a; const lv_32fc_t* _in_common = in_common; - __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2]; + __VOLK_ATTR_ALIGNED(16) + lv_32fc_t dotProductVector[2]; __m128* acc = (__m128*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128), volk_gnsssdr_get_alignment()); @@ -191,11 +192,13 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(lv_32fc_ // phase rotation registers __m128 a, two_phase_acc_reg, two_phase_inc_reg, yl, yh, tmp1, tmp1p, tmp2, tmp2p, z1; - __VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_inc[2]; + __VOLK_ATTR_ALIGNED(16) + lv_32fc_t two_phase_inc[2]; two_phase_inc[0] = phase_inc * phase_inc; two_phase_inc[1] = phase_inc * phase_inc; - two_phase_inc_reg = _mm_load_ps((float*) two_phase_inc); - __VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_acc[2]; + two_phase_inc_reg = _mm_load_ps((float*)two_phase_inc); + __VOLK_ATTR_ALIGNED(16) + lv_32fc_t two_phase_acc[2]; two_phase_acc[0] = (*phase); two_phase_acc[1] = (*phase) * phase_inc; two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc); @@ -203,12 +206,12 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(lv_32fc_ const __m128 ylp = _mm_moveldup_ps(two_phase_inc_reg); const __m128 yhp = _mm_movehdup_ps(two_phase_inc_reg); - for(number = 0; number < sse_iters; number++) + for (number = 0; number < sse_iters; number++) { // Phase rotation on operand in_common starts here: a = _mm_loadu_ps((float*)_in_common); - // __VOLK_GNSSSDR_PREFETCH(_in_common + 4); - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + // __VOLK_GNSSSDR_PREFETCH(_in_common + 4); + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(two_phase_acc_reg); tmp1 = _mm_mul_ps(a, yl); tmp1p = _mm_mul_ps(two_phase_acc_reg, ylp); @@ -219,7 +222,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(lv_32fc_ z1 = _mm_addsub_ps(tmp1, tmp2); two_phase_acc_reg = _mm_addsub_ps(tmp1p, tmp2p); - yl = _mm_moveldup_ps(z1); // Load yl with cr,cr,dr,dr + yl = _mm_moveldup_ps(z1); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(z1); //next two samples @@ -227,7 +230,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(lv_32fc_ for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - a = _mm_loadu_ps((float*)&(_in_a[n_vec][number*2])); + a = _mm_loadu_ps((float*)&(_in_a[n_vec][number * 2])); tmp1 = _mm_mul_ps(a, yl); a = _mm_shuffle_ps(a, a, 0xB1); tmp2 = _mm_mul_ps(a, yh); @@ -247,8 +250,8 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(lv_32fc_ for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - _mm_store_ps((float*)dotProductVector, acc[n_vec]); // Store the results back into the dot product vector - dotProduct = lv_cmake(0,0); + _mm_store_ps((float*)dotProductVector, acc[n_vec]); // Store the results back 
into the dot product vector + dotProduct = lv_cmake(0, 0); for (i = 0; i < 2; ++i) { dotProduct = dotProduct + dotProductVector[i]; @@ -260,7 +263,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(lv_32fc_ _mm_store_ps((float*)two_phase_acc, two_phase_acc_reg); (*phase) = two_phase_acc[0]; - for(n = sse_iters * 2; n < num_points; n++) + for (n = sse_iters * 2; n < num_points; n++) { tmp32_1 = in_common[n] * (*phase); (*phase) *= phase_inc; @@ -278,7 +281,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(lv_32fc_ #include static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_32fc_t** in_a, int num_a_vectors, unsigned int num_points) { - lv_32fc_t dotProduct = lv_cmake(0,0); + lv_32fc_t dotProduct = lv_cmake(0, 0); lv_32fc_t tmp32_1, tmp32_2; const unsigned int sse_iters = num_points / 2; int n_vec; @@ -288,7 +291,8 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(lv_32fc_ const lv_32fc_t** _in_a = in_a; const lv_32fc_t* _in_common = in_common; - __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2]; + __VOLK_ATTR_ALIGNED(16) + lv_32fc_t dotProductVector[2]; __m128* acc = (__m128*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128), volk_gnsssdr_get_alignment()); @@ -300,11 +304,13 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(lv_32fc_ // phase rotation registers __m128 a, two_phase_acc_reg, two_phase_inc_reg, yl, yh, tmp1, tmp1p, tmp2, tmp2p, z1; - __VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_inc[2]; + __VOLK_ATTR_ALIGNED(16) + lv_32fc_t two_phase_inc[2]; two_phase_inc[0] = phase_inc * phase_inc; two_phase_inc[1] = phase_inc * phase_inc; - two_phase_inc_reg = _mm_load_ps((float*) two_phase_inc); - __VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_acc[2]; + two_phase_inc_reg = _mm_load_ps((float*)two_phase_inc); + __VOLK_ATTR_ALIGNED(16) + lv_32fc_t two_phase_acc[2]; two_phase_acc[0] = (*phase); two_phase_acc[1] = (*phase) * phase_inc; two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc); @@ -312,12 +318,12 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(lv_32fc_ const __m128 ylp = _mm_moveldup_ps(two_phase_inc_reg); const __m128 yhp = _mm_movehdup_ps(two_phase_inc_reg); - for(number = 0; number < sse_iters; number++) + for (number = 0; number < sse_iters; number++) { // Phase rotation on operand in_common starts here: a = _mm_load_ps((float*)_in_common); - // __VOLK_GNSSSDR_PREFETCH(_in_common + 4); - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + // __VOLK_GNSSSDR_PREFETCH(_in_common + 4); + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(two_phase_acc_reg); tmp1 = _mm_mul_ps(a, yl); tmp1p = _mm_mul_ps(two_phase_acc_reg, ylp); @@ -328,7 +334,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(lv_32fc_ z1 = _mm_addsub_ps(tmp1, tmp2); two_phase_acc_reg = _mm_addsub_ps(tmp1p, tmp2p); - yl = _mm_moveldup_ps(z1); // Load yl with cr,cr,dr,dr + yl = _mm_moveldup_ps(z1); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(z1); //next two samples @@ -336,7 +342,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(lv_32fc_ for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - a = _mm_load_ps((float*)&(_in_a[n_vec][number*2])); + a = _mm_load_ps((float*)&(_in_a[n_vec][number * 2])); tmp1 = _mm_mul_ps(a, yl); a = _mm_shuffle_ps(a, a, 0xB1); tmp2 = 
_mm_mul_ps(a, yh); @@ -356,8 +362,8 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(lv_32fc_ for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - _mm_store_ps((float*)dotProductVector, acc[n_vec]); // Store the results back into the dot product vector - dotProduct = lv_cmake(0,0); + _mm_store_ps((float*)dotProductVector, acc[n_vec]); // Store the results back into the dot product vector + dotProduct = lv_cmake(0, 0); for (i = 0; i < 2; ++i) { dotProduct = dotProduct + dotProductVector[i]; @@ -369,7 +375,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(lv_32fc_ _mm_store_ps((float*)two_phase_acc, two_phase_acc_reg); (*phase) = two_phase_acc[0]; - for(n = sse_iters * 2; n < num_points; n++) + for (n = sse_iters * 2; n < num_points; n++) { tmp32_1 = in_common[n] * (*phase); (*phase) *= phase_inc; @@ -387,7 +393,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(lv_32fc_ #include static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_32fc_t** in_a, int num_a_vectors, unsigned int num_points) { - lv_32fc_t dotProduct = lv_cmake(0,0); + lv_32fc_t dotProduct = lv_cmake(0, 0); lv_32fc_t tmp32_1, tmp32_2; const unsigned int avx_iters = num_points / 4; int n_vec; @@ -398,7 +404,8 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t const lv_32fc_t* _in_common = in_common; lv_32fc_t _phase = (*phase); - __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4]; + __VOLK_ATTR_ALIGNED(32) + lv_32fc_t dotProductVector[4]; __m256* acc = (__m256*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256), volk_gnsssdr_get_alignment()); @@ -431,12 +438,12 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t const __m256 ylp = _mm256_moveldup_ps(four_phase_inc_reg); const __m256 yhp = _mm256_movehdup_ps(four_phase_inc_reg); - for(number = 0; number < avx_iters; number++) + for (number = 0; number < avx_iters; number++) { // Phase rotation on operand in_common starts here: a = _mm256_loadu_ps((float*)_in_common); __VOLK_GNSSSDR_PREFETCH(_in_common + 16); - yl = _mm256_moveldup_ps(four_phase_acc_reg); // Load yl with cr,cr,dr,dr + yl = _mm256_moveldup_ps(four_phase_acc_reg); // Load yl with cr,cr,dr,dr yh = _mm256_movehdup_ps(four_phase_acc_reg); tmp1 = _mm256_mul_ps(a, yl); tmp1p = _mm256_mul_ps(four_phase_acc_reg, ylp); @@ -447,7 +454,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t z = _mm256_addsub_ps(tmp1, tmp2); four_phase_acc_reg = _mm256_addsub_ps(tmp1p, tmp2p); - yl = _mm256_moveldup_ps(z); // Load yl with cr,cr,dr,dr + yl = _mm256_moveldup_ps(z); // Load yl with cr,cr,dr,dr yh = _mm256_movehdup_ps(z); //next two samples @@ -475,8 +482,8 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - _mm256_store_ps((float*)dotProductVector, acc[n_vec]); // Store the results back into the dot product vector - dotProduct = lv_cmake(0,0); + _mm256_store_ps((float*)dotProductVector, acc[n_vec]); // Store the results back into the dot product vector + dotProduct = lv_cmake(0, 0); for (i = 0; i < 4; ++i) { dotProduct = dotProduct + dotProductVector[i]; @@ -492,10 +499,10 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t four_phase_acc_reg = _mm256_div_ps(four_phase_acc_reg, tmp2); _mm256_store_ps((float*)four_phase_acc, 
four_phase_acc_reg); - _phase = four_phase_acc[0]; + _phase = four_phase_acc[0]; _mm256_zeroupper(); - for(n = avx_iters * 4; n < num_points; n++) + for (n = avx_iters * 4; n < num_points; n++) { tmp32_1 = *_in_common++ * _phase; _phase *= phase_inc; @@ -514,7 +521,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t #include static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_32fc_t** in_a, int num_a_vectors, unsigned int num_points) { - lv_32fc_t dotProduct = lv_cmake(0,0); + lv_32fc_t dotProduct = lv_cmake(0, 0); lv_32fc_t tmp32_1, tmp32_2; const unsigned int avx_iters = num_points / 4; int n_vec; @@ -525,7 +532,8 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t const lv_32fc_t* _in_common = in_common; lv_32fc_t _phase = (*phase); - __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4]; + __VOLK_ATTR_ALIGNED(32) + lv_32fc_t dotProductVector[4]; __m256* acc = (__m256*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256), volk_gnsssdr_get_alignment()); @@ -538,7 +546,8 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t // phase rotation registers __m256 a, four_phase_acc_reg, yl, yh, tmp1, tmp1p, tmp2, tmp2p, z; - __VOLK_ATTR_ALIGNED(32) lv_32fc_t four_phase_inc[4]; + __VOLK_ATTR_ALIGNED(32) + lv_32fc_t four_phase_inc[4]; const lv_32fc_t phase_inc2 = phase_inc * phase_inc; const lv_32fc_t phase_inc3 = phase_inc2 * phase_inc; const lv_32fc_t phase_inc4 = phase_inc3 * phase_inc; @@ -548,7 +557,8 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t four_phase_inc[3] = phase_inc4; const __m256 four_phase_inc_reg = _mm256_load_ps((float*)four_phase_inc); - __VOLK_ATTR_ALIGNED(32) lv_32fc_t four_phase_acc[4]; + __VOLK_ATTR_ALIGNED(32) + lv_32fc_t four_phase_acc[4]; four_phase_acc[0] = _phase; four_phase_acc[1] = _phase * phase_inc; four_phase_acc[2] = _phase * phase_inc2; @@ -558,12 +568,12 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t const __m256 ylp = _mm256_moveldup_ps(four_phase_inc_reg); const __m256 yhp = _mm256_movehdup_ps(four_phase_inc_reg); - for(number = 0; number < avx_iters; number++) + for (number = 0; number < avx_iters; number++) { // Phase rotation on operand in_common starts here: a = _mm256_load_ps((float*)_in_common); __VOLK_GNSSSDR_PREFETCH(_in_common + 16); - yl = _mm256_moveldup_ps(four_phase_acc_reg); // Load yl with cr,cr,dr,dr + yl = _mm256_moveldup_ps(four_phase_acc_reg); // Load yl with cr,cr,dr,dr yh = _mm256_movehdup_ps(four_phase_acc_reg); tmp1 = _mm256_mul_ps(a, yl); tmp1p = _mm256_mul_ps(four_phase_acc_reg, ylp); @@ -574,7 +584,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t z = _mm256_addsub_ps(tmp1, tmp2); four_phase_acc_reg = _mm256_addsub_ps(tmp1p, tmp2p); - yl = _mm256_moveldup_ps(z); // Load yl with cr,cr,dr,dr + yl = _mm256_moveldup_ps(z); // Load yl with cr,cr,dr,dr yh = _mm256_movehdup_ps(z); //next two samples @@ -602,8 +612,8 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - _mm256_store_ps((float*)dotProductVector, acc[n_vec]); // Store the results back into the dot product vector - dotProduct = lv_cmake(0,0); + _mm256_store_ps((float*)dotProductVector, acc[n_vec]); // Store the results back into the dot product vector + dotProduct = lv_cmake(0, 
0); for (i = 0; i < 4; ++i) { dotProduct = dotProduct + dotProductVector[i]; @@ -619,10 +629,10 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t four_phase_acc_reg = _mm256_div_ps(four_phase_acc_reg, tmp2); _mm256_store_ps((float*)four_phase_acc, four_phase_acc_reg); - _phase = four_phase_acc[0]; + _phase = four_phase_acc[0]; _mm256_zeroupper(); - for(n = avx_iters * 4; n < num_points; n++) + for (n = avx_iters * 4; n < num_points; n++) { tmp32_1 = *_in_common++ * _phase; _phase *= phase_inc; @@ -646,7 +656,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_neon(lv_32fc_t* int n_vec; int i; unsigned int number; - unsigned int n ; + unsigned int n; const lv_32fc_t** _in_a = in_a; const lv_32fc_t* _in_common = in_common; lv_32fc_t* _out = result; @@ -656,36 +666,41 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_neon(lv_32fc_t* if (neon_iters > 0) { - lv_32fc_t dotProduct = lv_cmake(0,0); + lv_32fc_t dotProduct = lv_cmake(0, 0); float32_t arg_phase0 = cargf(_phase); float32_t arg_phase_inc = cargf(phase_inc); float32_t phase_est; lv_32fc_t ___phase4 = phase_inc * phase_inc * phase_inc * phase_inc; - __VOLK_ATTR_ALIGNED(16) float32_t __phase4_real[4] = { lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4) }; - __VOLK_ATTR_ALIGNED(16) float32_t __phase4_imag[4] = { lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4) }; + __VOLK_ATTR_ALIGNED(16) + float32_t __phase4_real[4] = {lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4)}; + __VOLK_ATTR_ALIGNED(16) + float32_t __phase4_imag[4] = {lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4)}; float32x4_t _phase4_real = vld1q_f32(__phase4_real); float32x4_t _phase4_imag = vld1q_f32(__phase4_imag); - lv_32fc_t phase2 = (lv_32fc_t)(_phase) * phase_inc; + lv_32fc_t phase2 = (lv_32fc_t)(_phase)*phase_inc; lv_32fc_t phase3 = phase2 * phase_inc; lv_32fc_t phase4 = phase3 * phase_inc; - __VOLK_ATTR_ALIGNED(16) float32_t __phase_real[4] = { lv_creal((_phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) }; - __VOLK_ATTR_ALIGNED(16) float32_t __phase_imag[4] = { lv_cimag((_phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; + __VOLK_ATTR_ALIGNED(16) + float32_t __phase_real[4] = {lv_creal((_phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4)}; + __VOLK_ATTR_ALIGNED(16) + float32_t __phase_imag[4] = {lv_cimag((_phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4)}; float32x4_t _phase_real = vld1q_f32(__phase_real); float32x4_t _phase_imag = vld1q_f32(__phase_imag); - __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4]; + __VOLK_ATTR_ALIGNED(32) + lv_32fc_t dotProductVector[4]; float32x4x2_t a_val, b_val, tmp32_real, tmp32_imag; float32x4x2_t* accumulator1 = (float32x4x2_t*)volk_gnsssdr_malloc(num_a_vectors * sizeof(float32x4x2_t), volk_gnsssdr_get_alignment()); float32x4x2_t* accumulator2 = (float32x4x2_t*)volk_gnsssdr_malloc(num_a_vectors * sizeof(float32x4x2_t), volk_gnsssdr_get_alignment()); - for(n_vec = 0; n_vec < num_a_vectors; n_vec++) + for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { accumulator1[n_vec].val[0] = vdupq_n_f32(0.0f); accumulator1[n_vec].val[1] = vdupq_n_f32(0.0f); @@ -693,7 +708,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_neon(lv_32fc_t* accumulator2[n_vec].val[1] = vdupq_n_f32(0.0f); } - for(number = 0; number < neon_iters; number++) + for (number = 0; number < 
neon_iters; number++) { /* load 4 complex numbers (float 32 bits each component) */ b_val = vld2q_f32((float32_t*)_in_common); @@ -728,8 +743,10 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_neon(lv_32fc_t* phase3 = phase2 * phase_inc; phase4 = phase3 * phase_inc; - __VOLK_ATTR_ALIGNED(16) float32_t ____phase_real[4] = { lv_creal((_phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) }; - __VOLK_ATTR_ALIGNED(16) float32_t ____phase_imag[4] = { lv_cimag((_phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; + __VOLK_ATTR_ALIGNED(16) + float32_t ____phase_real[4] = {lv_creal((_phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4)}; + __VOLK_ATTR_ALIGNED(16) + float32_t ____phase_imag[4] = {lv_cimag((_phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4)}; _phase_real = vld1q_f32(____phase_real); _phase_imag = vld1q_f32(____phase_imag); @@ -753,8 +770,8 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_neon(lv_32fc_t* } for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - vst2q_f32((float32_t*)dotProductVector, accumulator1[n_vec]); // Store the results back into the dot product vector - dotProduct = lv_cmake(0,0); + vst2q_f32((float32_t*)dotProductVector, accumulator1[n_vec]); // Store the results back into the dot product vector + dotProduct = lv_cmake(0, 0); for (i = 0; i < 4; ++i) { dotProduct = dotProduct + dotProductVector[i]; @@ -770,7 +787,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_neon(lv_32fc_t* _phase = lv_cmake((float32_t)__phase_real[0], (float32_t)__phase_imag[0]); } - for(n = neon_iters * 4; n < num_points; n++) + for (n = neon_iters * 4; n < num_points; n++) { tmp32_1 = in_common[n] * _phase; _phase *= phase_inc; @@ -786,4 +803,3 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_neon(lv_32fc_t* #endif /* LV_HAVE_NEON */ #endif /* INCLUDED_volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_H */ - diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc.h index 3072542cf..846539fc9 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc.h @@ -41,7 +41,7 @@ #include #ifdef LV_HAVE_GENERIC -static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_generic(lv_32fc_t* result, const lv_32fc_t* local_code, const lv_32fc_t* in, unsigned int num_points) +static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_generic(lv_32fc_t* result, const lv_32fc_t* local_code, const lv_32fc_t* in, unsigned int num_points) { // phases must be normalized. Phase rotator expects a complex exponential input! 
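A recurring detail in the rotator kernels reformatted above is the "Regenerate phase" step: after a few hundred recursive complex multiplications the phasor drifts off the unit circle, so every ROTATOR_RELOAD samples (or every 256 in the generic kernel's n % 256 == 0 branch) it is divided by its own magnitude, as the (*phase) /= std::abs((*phase)) and _mm256_div_ps lines show. A minimal scalar sketch of the idea, assuming C99 complex arithmetic as used by lv_32fc_t:

#include <complex.h>
#include <math.h>

/* Sketch: rotate a block of samples, renormalizing the phasor every
 * reload_len samples so accumulated rounding error rotates but never
 * scales the signal.  Analogous to the generic kernel's reload branch;
 * reload_len is an illustrative parameter, not a library constant. */
static void rotate_block(float complex* out, const float complex* in,
    float complex* phase, float complex phase_inc,
    unsigned int num_points, unsigned int reload_len)
{
    unsigned int n;
    for (n = 0; n < num_points; n++)
        {
            out[n] = in[n] * (*phase);
            (*phase) *= phase_inc;
            if ((n + 1) % reload_len == 0)
                {
                    (*phase) /= cabsf(*phase); /* back onto the unit circle */
                }
        }
}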
float rem_carrier_phase_in_rad = 0.25; @@ -53,14 +53,14 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_generic(lv_ unsigned int n; int num_a_vectors = 3; lv_32fc_t** in_a = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { in_a[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment()); memcpy((lv_32fc_t*)in_a[n], (lv_32fc_t*)in, sizeof(lv_32fc_t) * num_points); } - volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_generic_reload(result, local_code, phase_inc[0], phase, (const lv_32fc_t**) in_a, num_a_vectors, num_points); + volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_generic_reload(result, local_code, phase_inc[0], phase, (const lv_32fc_t**)in_a, num_a_vectors, num_points); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { volk_gnsssdr_free(in_a[n]); } @@ -71,7 +71,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_generic(lv_ #ifdef LV_HAVE_GENERIC -static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_generic_reload(lv_32fc_t* result, const lv_32fc_t* local_code, const lv_32fc_t* in, unsigned int num_points) +static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_generic_reload(lv_32fc_t* result, const lv_32fc_t* local_code, const lv_32fc_t* in, unsigned int num_points) { // phases must be normalized. Phase rotator expects a complex exponential input! float rem_carrier_phase_in_rad = 0.25; @@ -83,14 +83,14 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_generic_rel unsigned int n; int num_a_vectors = 3; lv_32fc_t** in_a = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { in_a[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment()); memcpy((lv_32fc_t*)in_a[n], (lv_32fc_t*)in, sizeof(lv_32fc_t) * num_points); } - volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_generic_reload(result, local_code, phase_inc[0], phase, (const lv_32fc_t**) in_a, num_a_vectors, num_points); + volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_generic_reload(result, local_code, phase_inc[0], phase, (const lv_32fc_t**)in_a, num_a_vectors, num_points); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { volk_gnsssdr_free(in_a[n]); } @@ -101,7 +101,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_generic_rel #ifdef LV_HAVE_SSE3 -static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_u_sse3(lv_32fc_t* result, const lv_32fc_t* local_code, const lv_32fc_t* in, unsigned int num_points) +static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_u_sse3(lv_32fc_t* result, const lv_32fc_t* local_code, const lv_32fc_t* in, unsigned int num_points) { // phases must be normalized. Phase rotator expects a complex exponential input! 
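/*
 * These puppet wrappers exist so the QA machinery can exercise the multi-vector
 * kernel through the standard single-input test signature: each one replicates
 * the input into num_a_vectors aligned rows, runs the kernel once, and frees
 * the rows. A condensed sketch of that lifecycle, assuming only the
 * volk_gnsssdr_malloc/volk_gnsssdr_free calls already used in this file:
 *
 *     size_t al = volk_gnsssdr_get_alignment();
 *     lv_32fc_t** in_a = (lv_32fc_t**)volk_gnsssdr_malloc(num_a_vectors * sizeof(lv_32fc_t*), al);
 *     for (n = 0; n < num_a_vectors; n++)
 *         {
 *             in_a[n] = (lv_32fc_t*)volk_gnsssdr_malloc(num_points * sizeof(lv_32fc_t), al);
 *             memcpy(in_a[n], in, num_points * sizeof(lv_32fc_t));
 *         }
 *     // ... invoke the kernel under test ...
 *     for (n = 0; n < num_a_vectors; n++) volk_gnsssdr_free(in_a[n]);
 *     volk_gnsssdr_free(in_a);
 */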
float rem_carrier_phase_in_rad = 0.25; @@ -113,14 +113,14 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_u_sse3(lv_3 unsigned int n; int num_a_vectors = 3; lv_32fc_t** in_a = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { in_a[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment()); memcpy((lv_32fc_t*)in_a[n], (lv_32fc_t*)in, sizeof(lv_32fc_t) * num_points); } - volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(result, local_code, phase_inc[0], phase, (const lv_32fc_t**) in_a, num_a_vectors, num_points); + volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(result, local_code, phase_inc[0], phase, (const lv_32fc_t**)in_a, num_a_vectors, num_points); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { volk_gnsssdr_free(in_a[n]); } @@ -131,7 +131,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_u_sse3(lv_3 #ifdef LV_HAVE_SSE3 -static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_a_sse3(lv_32fc_t* result, const lv_32fc_t* local_code, const lv_32fc_t* in, unsigned int num_points) +static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_a_sse3(lv_32fc_t* result, const lv_32fc_t* local_code, const lv_32fc_t* in, unsigned int num_points) { // phases must be normalized. Phase rotator expects a complex exponential input! float rem_carrier_phase_in_rad = 0.25; @@ -143,14 +143,14 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_a_sse3(lv_3 unsigned int n; int num_a_vectors = 3; lv_32fc_t** in_a = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { in_a[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment()); memcpy((lv_32fc_t*)in_a[n], (lv_32fc_t*)in, sizeof(lv_32fc_t) * num_points); } - volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(result, local_code, phase_inc[0], phase, (const lv_32fc_t**) in_a, num_a_vectors, num_points); + volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(result, local_code, phase_inc[0], phase, (const lv_32fc_t**)in_a, num_a_vectors, num_points); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { volk_gnsssdr_free(in_a[n]); } @@ -161,7 +161,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_a_sse3(lv_3 #ifdef LV_HAVE_AVX -static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_u_avx(lv_32fc_t* result, const lv_32fc_t* local_code, const lv_32fc_t* in, unsigned int num_points) +static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_u_avx(lv_32fc_t* result, const lv_32fc_t* local_code, const lv_32fc_t* in, unsigned int num_points) { // phases must be normalized. Phase rotator expects a complex exponential input! 
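/*
 * Naming note for the variants in this file: _u_ kernels accept pointers of any
 * alignment and use unaligned loads (loadu/lddqu), while _a_ kernels require
 * buffers aligned to volk_gnsssdr_get_alignment() and use the aligned forms. A
 * two-line sketch of the difference for a float* p (illustrative only):
 *
 *     __m128 x = _mm_loadu_ps(p);  // valid for any p, possibly slower
 *     __m128 y = _mm_load_ps(p);   // p must be 16-byte aligned or this faults
 */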
float rem_carrier_phase_in_rad = 0.25; @@ -173,14 +173,14 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_u_avx(lv_32 unsigned int n; int num_a_vectors = 3; lv_32fc_t** in_a = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { in_a[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment()); memcpy((lv_32fc_t*)in_a[n], (lv_32fc_t*)in, sizeof(lv_32fc_t) * num_points); } - volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(result, local_code, phase_inc[0], phase, (const lv_32fc_t**) in_a, num_a_vectors, num_points); + volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(result, local_code, phase_inc[0], phase, (const lv_32fc_t**)in_a, num_a_vectors, num_points); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { volk_gnsssdr_free(in_a[n]); } @@ -191,7 +191,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_u_avx(lv_32 #ifdef LV_HAVE_AVX -static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_a_avx(lv_32fc_t* result, const lv_32fc_t* local_code, const lv_32fc_t* in, unsigned int num_points) +static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_a_avx(lv_32fc_t* result, const lv_32fc_t* local_code, const lv_32fc_t* in, unsigned int num_points) { // phases must be normalized. Phase rotator expects a complex exponential input! float rem_carrier_phase_in_rad = 0.25; @@ -203,14 +203,14 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_a_avx(lv_32 unsigned int n; int num_a_vectors = 3; lv_32fc_t** in_a = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { in_a[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment()); memcpy((lv_32fc_t*)in_a[n], (lv_32fc_t*)in, sizeof(lv_32fc_t) * num_points); } - volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(result, local_code, phase_inc[0], phase, (const lv_32fc_t**) in_a, num_a_vectors, num_points); + volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(result, local_code, phase_inc[0], phase, (const lv_32fc_t**)in_a, num_a_vectors, num_points); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { volk_gnsssdr_free(in_a[n]); } @@ -221,7 +221,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_a_avx(lv_32 #ifdef LV_HAVE_NEON -static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_neon(lv_32fc_t* result, const lv_32fc_t* local_code, const lv_32fc_t* in, unsigned int num_points) +static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_neon(lv_32fc_t* result, const lv_32fc_t* local_code, const lv_32fc_t* in, unsigned int num_points) { // phases must be normalized. Phase rotator expects a complex exponential input! 
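/*
 * The (const lv_32fc_t**) casts in these calls are needed because C only
 * converts T* to const T* implicitly, not T** to const T**; passing the
 * mutable row array straight to the kernel's const parameter is a constraint
 * violation. A minimal sketch of the rule, with hypothetical names:
 *
 *     void consume(const float** rows);
 *     float** mine;
 *     consume(mine);                  // rejected by a conforming C compiler
 *     consume((const float**)mine);   // accepted; same object representation
 */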
float rem_carrier_phase_in_rad = 0.25; @@ -233,14 +233,14 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_neon(lv_32f unsigned int n; int num_a_vectors = 3; lv_32fc_t** in_a = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { in_a[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment()); memcpy((lv_32fc_t*)in_a[n], (lv_32fc_t*)in, sizeof(lv_32fc_t) * num_points); } - volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_neon(result, local_code, phase_inc[0], phase, (const lv_32fc_t**) in_a, num_a_vectors, num_points); + volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_neon(result, local_code, phase_inc[0], phase, (const lv_32fc_t**)in_a, num_a_vectors, num_points); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { volk_gnsssdr_free(in_a[n]); } diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_xn_resampler_32fc_xn.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_xn_resampler_32fc_xn.h index f8db65944..3e6227a17 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_xn_resampler_32fc_xn.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_xn_resampler_32fc_xn.h @@ -107,7 +107,8 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_sse3(lv_32fc_t** res const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips); const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips); - __VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; + __VOLK_ATTR_ALIGNED(16) + int local_code_chip_index[4]; int local_code_chip_index_; const __m128i zeros = _mm_setzero_si128(); @@ -121,7 +122,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_sse3(lv_32fc_t** res shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]); aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); __m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f); - for(n = 0; n < quarterPoints; n++) + for (n = 0; n < quarterPoints; n++) { aux = _mm_mul_ps(code_phase_step_chips_reg, indexn); aux = _mm_add_ps(aux, aux2); @@ -142,18 +143,18 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_sse3(lv_32fc_t** res aux_i = _mm_and_si128(code_length_chips_reg_i, negatives); local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i); _mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg); - for(k = 0; k < 4; ++k) + for (k = 0; k < 4; ++k) { _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]]; } indexn = _mm_add_ps(indexn, fours); } - for(n = quarterPoints * 4; n < num_points; n++) + for (n = quarterPoints * 4; n < num_points; n++) { // resample code for current tap local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); //Take into account that in multitap correlators, the shifts can be negative! 
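/*
 * The scalar remainder loops below must fold a possibly negative chip index
 * back into [0, code_length_chips), and C's % operator keeps the sign of the
 * dividend, so the code first adds a large enough multiple of the code length.
 * Worked example with code_length_chips = 1023: an index of -2 becomes
 * -2 + 1023 * (abs(-2) / 1023 + 1) = 1021, and 1021 % 1023 = 1021. As a
 * stand-alone sketch (needs <stdlib.h> for abs; the helper name is made up):
 *
 *     static int wrap_chip_index(int idx, int len)
 *     {
 *         if (idx < 0) idx += len * (abs(idx) / len + 1);  // now non-negative
 *         return idx % len;                                // modulo is safe here
 *     }
 */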
- if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ; + if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); local_code_chip_index_ = local_code_chip_index_ % code_length_chips; _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; } @@ -177,7 +178,8 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_sse3(lv_32fc_t** res const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips); const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips); - __VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; + __VOLK_ATTR_ALIGNED(16) + int local_code_chip_index[4]; int local_code_chip_index_; const __m128i zeros = _mm_setzero_si128(); @@ -191,7 +193,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_sse3(lv_32fc_t** res shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]); aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); __m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f); - for(n = 0; n < quarterPoints; n++) + for (n = 0; n < quarterPoints; n++) { aux = _mm_mul_ps(code_phase_step_chips_reg, indexn); aux = _mm_add_ps(aux, aux2); @@ -212,18 +214,18 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_sse3(lv_32fc_t** res aux_i = _mm_and_si128(code_length_chips_reg_i, negatives); local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i); _mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg); - for(k = 0; k < 4; ++k) + for (k = 0; k < 4; ++k) { _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]]; } indexn = _mm_add_ps(indexn, fours); } - for(n = quarterPoints * 4; n < num_points; n++) + for (n = quarterPoints * 4; n < num_points; n++) { // resample code for current tap local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); //Take into account that in multitap correlators, the shifts can be negative! 
- if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ; + if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); local_code_chip_index_ = local_code_chip_index_ % code_length_chips; _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; } @@ -245,7 +247,8 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_sse4_1(lv_32fc_t** r const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips); const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips); - __VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; + __VOLK_ATTR_ALIGNED(16) + int local_code_chip_index[4]; int local_code_chip_index_; const __m128i zeros = _mm_setzero_si128(); @@ -259,7 +262,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_sse4_1(lv_32fc_t** r shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]); aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); __m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f); - for(n = 0; n < quarterPoints; n++) + for (n = 0; n < quarterPoints; n++) { aux = _mm_mul_ps(code_phase_step_chips_reg, indexn); aux = _mm_add_ps(aux, aux2); @@ -277,18 +280,18 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_sse4_1(lv_32fc_t** r aux_i = _mm_and_si128(code_length_chips_reg_i, negatives); local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i); _mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg); - for(k = 0; k < 4; ++k) + for (k = 0; k < 4; ++k) { _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]]; } indexn = _mm_add_ps(indexn, fours); } - for(n = quarterPoints * 4; n < num_points; n++) + for (n = quarterPoints * 4; n < num_points; n++) { // resample code for current tap local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); //Take into account that in multitap correlators, the shifts can be negative! 
- if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ; + if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); local_code_chip_index_ = local_code_chip_index_ % code_length_chips; _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; } @@ -311,7 +314,8 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_sse4_1(lv_32fc_t** r const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips); const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips); - __VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; + __VOLK_ATTR_ALIGNED(16) + int local_code_chip_index[4]; int local_code_chip_index_; const __m128i zeros = _mm_setzero_si128(); @@ -325,7 +329,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_sse4_1(lv_32fc_t** r shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]); aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); __m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f); - for(n = 0; n < quarterPoints; n++) + for (n = 0; n < quarterPoints; n++) { aux = _mm_mul_ps(code_phase_step_chips_reg, indexn); aux = _mm_add_ps(aux, aux2); @@ -343,18 +347,18 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_sse4_1(lv_32fc_t** r aux_i = _mm_and_si128(code_length_chips_reg_i, negatives); local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i); _mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg); - for(k = 0; k < 4; ++k) + for (k = 0; k < 4; ++k) { _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]]; } indexn = _mm_add_ps(indexn, fours); } - for(n = quarterPoints * 4; n < num_points; n++) + for (n = quarterPoints * 4; n < num_points; n++) { // resample code for current tap local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); //Take into account that in multitap correlators, the shifts can be negative! 
- if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ; + if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); local_code_chip_index_ = local_code_chip_index_ % code_length_chips; _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; } @@ -377,7 +381,8 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx(lv_32fc_t** resu const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips); const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips); - __VOLK_ATTR_ALIGNED(32) int local_code_chip_index[8]; + __VOLK_ATTR_ALIGNED(32) + int local_code_chip_index[8]; int local_code_chip_index_; const __m256 zeros = _mm256_setzero_ps(); @@ -392,7 +397,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx(lv_32fc_t** resu shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]); aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); indexn = n0; - for(n = 0; n < avx_iters; n++) + for (n = 0; n < avx_iters; n++) { __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0); __VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3); @@ -410,13 +415,13 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx(lv_32fc_t** resu // no negatives c = _mm256_cvtepi32_ps(local_code_chip_index_reg); - negatives = _mm256_cmp_ps(c, zeros, 0x01 ); + negatives = _mm256_cmp_ps(c, zeros, 0x01); aux3 = _mm256_and_ps(code_length_chips_reg_f, negatives); aux = _mm256_add_ps(c, aux3); local_code_chip_index_reg = _mm256_cvttps_epi32(aux); _mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg); - for(k = 0; k < 8; ++k) + for (k = 0; k < 8; ++k) { _result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]]; } @@ -426,12 +431,12 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx(lv_32fc_t** resu _mm256_zeroupper(); for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++) { - for(n = avx_iters * 8; n < num_points; n++) + for (n = avx_iters * 8; n < num_points; n++) { // resample code for current tap local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); //Take into account that in multitap correlators, the shifts can be negative! 
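/*
 * The AVX paths above perform the same wrap without a branch: the constant
 * 0x01 passed to _mm256_cmp_ps is the _CMP_LT_OS predicate, so every lane
 * holding a negative index compares true against zero and yields an all-ones
 * bit mask. _mm256_and_ps(code_length, mask) is then the code length in
 * exactly those lanes and 0.0f elsewhere, and a single add corrects all eight
 * indices at once. The per-lane scalar equivalent is simply:
 *
 *     fixed = c + ((c < 0.0f) ? (float)code_length_chips : 0.0f);
 */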
- if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ; + if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); local_code_chip_index_ = local_code_chip_index_ % code_length_chips; _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; } @@ -454,7 +459,8 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx(lv_32fc_t** resu const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips); const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips); - __VOLK_ATTR_ALIGNED(32) int local_code_chip_index[8]; + __VOLK_ATTR_ALIGNED(32) + int local_code_chip_index[8]; int local_code_chip_index_; const __m256 zeros = _mm256_setzero_ps(); @@ -469,7 +475,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx(lv_32fc_t** resu shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]); aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); indexn = n0; - for(n = 0; n < avx_iters; n++) + for (n = 0; n < avx_iters; n++) { __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0); __VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3); @@ -487,13 +493,13 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx(lv_32fc_t** resu // no negatives c = _mm256_cvtepi32_ps(local_code_chip_index_reg); - negatives = _mm256_cmp_ps(c, zeros, 0x01 ); + negatives = _mm256_cmp_ps(c, zeros, 0x01); aux3 = _mm256_and_ps(code_length_chips_reg_f, negatives); aux = _mm256_add_ps(c, aux3); local_code_chip_index_reg = _mm256_cvttps_epi32(aux); _mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg); - for(k = 0; k < 8; ++k) + for (k = 0; k < 8; ++k) { _result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]]; } @@ -503,12 +509,12 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx(lv_32fc_t** resu _mm256_zeroupper(); for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++) { - for(n = avx_iters * 8; n < num_points; n++) + for (n = avx_iters * 8; n < num_points; n++) { // resample code for current tap local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); //Take into account that in multitap correlators, the shifts can be negative! 
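/*
 * On GCC and Clang, the __VOLK_GNSSSDR_PREFETCH_LOCALITY(addr, rw, locality)
 * macro used in these loops is a thin wrapper over the compiler's prefetch
 * builtin (an assumption about volk_gnsssdr_common.h, which this patch does
 * not show):
 *
 *     __builtin_prefetch(addr, rw, locality);
 *     // rw:       0 = prefetch for read, 1 = prefetch for write
 *     // locality: 0 = no temporal reuse expected ... 3 = keep in all caches
 *
 * Prefetching the next result slot with locality 0 hints that the stored data
 * should not displace hotter cache lines.
 */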
- if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ; + if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); local_code_chip_index_ = local_code_chip_index_ % code_length_chips; _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; } @@ -531,7 +537,8 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx2(lv_32fc_t** res const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips); const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips); - __VOLK_ATTR_ALIGNED(32) int local_code_chip_index[8]; + __VOLK_ATTR_ALIGNED(32) + int local_code_chip_index[8]; int local_code_chip_index_; const __m256 zeros = _mm256_setzero_ps(); @@ -546,7 +553,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx2(lv_32fc_t** res shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]); aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); indexn = n0; - for(n = 0; n < avx_iters; n++) + for (n = 0; n < avx_iters; n++) { __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0); __VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3); @@ -565,13 +572,13 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx2(lv_32fc_t** res // no negatives c = _mm256_cvtepi32_ps(local_code_chip_index_reg); - negatives = _mm256_cmp_ps(c, zeros, 0x01 ); + negatives = _mm256_cmp_ps(c, zeros, 0x01); aux3 = _mm256_and_ps(code_length_chips_reg_f, negatives); aux = _mm256_add_ps(c, aux3); local_code_chip_index_reg = _mm256_cvttps_epi32(aux); _mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg); - for(k = 0; k < 8; ++k) + for (k = 0; k < 8; ++k) { _result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]]; } @@ -581,12 +588,12 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx2(lv_32fc_t** res _mm256_zeroupper(); for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++) { - for(n = avx_iters * 8; n < num_points; n++) + for (n = avx_iters * 8; n < num_points; n++) { // resample code for current tap local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); //Take into account that in multitap correlators, the shifts can be negative! 
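/*
 * Conversion detail in the loops above: _mm256_cvttps_epi32 (note the double
 * 't') truncates toward zero instead of honoring the current rounding mode.
 * That is only safe because the wrap has already made every lane non-negative,
 * where truncation and floor agree. Scalar sketch of the distinction:
 *
 *     (int)truncf(-1.5f);  // -1 : what the cvtt conversion computes
 *     (int)floorf(-1.5f);  // -2 : what negative index math would need
 */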
- if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ; + if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); local_code_chip_index_ = local_code_chip_index_ % code_length_chips; _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; } @@ -609,7 +616,8 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx2(lv_32fc_t** res const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips); const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips); - __VOLK_ATTR_ALIGNED(32) int local_code_chip_index[8]; + __VOLK_ATTR_ALIGNED(32) + int local_code_chip_index[8]; int local_code_chip_index_; const __m256 zeros = _mm256_setzero_ps(); @@ -624,7 +632,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx2(lv_32fc_t** res shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]); aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); indexn = n0; - for(n = 0; n < avx_iters; n++) + for (n = 0; n < avx_iters; n++) { __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0); __VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3); @@ -643,13 +651,13 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx2(lv_32fc_t** res // no negatives c = _mm256_cvtepi32_ps(local_code_chip_index_reg); - negatives = _mm256_cmp_ps(c, zeros, 0x01 ); + negatives = _mm256_cmp_ps(c, zeros, 0x01); aux3 = _mm256_and_ps(code_length_chips_reg_f, negatives); aux = _mm256_add_ps(c, aux3); local_code_chip_index_reg = _mm256_cvttps_epi32(aux); _mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg); - for(k = 0; k < 8; ++k) + for (k = 0; k < 8; ++k) { _result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]]; } @@ -659,12 +667,12 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx2(lv_32fc_t** res _mm256_zeroupper(); for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++) { - for(n = avx_iters * 8; n < num_points; n++) + for (n = avx_iters * 8; n < num_points; n++) { // resample code for current tap local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); //Take into account that in multitap correlators, the shifts can be negative! 
- if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ; + if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); local_code_chip_index_ = local_code_chip_index_ % code_length_chips; _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; } @@ -689,19 +697,21 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_neon(lv_32fc_t** resul const float32x4_t rem_code_phase_chips_reg = vdupq_n_f32(rem_code_phase_chips); const float32x4_t code_phase_step_chips_reg = vdupq_n_f32(code_phase_step_chips); - __VOLK_ATTR_ALIGNED(16) int32_t local_code_chip_index[4]; + __VOLK_ATTR_ALIGNED(16) + int32_t local_code_chip_index[4]; int32_t local_code_chip_index_; const int32x4_t zeros = vdupq_n_s32(0); const float32x4_t code_length_chips_reg_f = vdupq_n_f32((float)code_length_chips); const int32x4_t code_length_chips_reg_i = vdupq_n_s32((int32_t)code_length_chips); - int32x4_t local_code_chip_index_reg, aux_i, negatives, i; + int32x4_t local_code_chip_index_reg, aux_i, negatives, i; float32x4_t aux, aux2, shifts_chips_reg, fi, c, j, cTrunc, base, indexn, reciprocal; - __VOLK_ATTR_ALIGNED(16) const float vec[4] = { 0.0f, 1.0f, 2.0f, 3.0f }; + __VOLK_ATTR_ALIGNED(16) + const float vec[4] = {0.0f, 1.0f, 2.0f, 3.0f}; uint32x4_t igx; reciprocal = vrecpeq_f32(code_length_chips_reg_f); reciprocal = vmulq_f32(vrecpsq_f32(code_length_chips_reg_f, reciprocal), reciprocal); - reciprocal = vmulq_f32(vrecpsq_f32(code_length_chips_reg_f, reciprocal), reciprocal); // this refinement is required! + reciprocal = vmulq_f32(vrecpsq_f32(code_length_chips_reg_f, reciprocal), reciprocal); // this refinement is required! 
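/*
 * Why the refinement above is required: vrecpeq_f32 returns only an ~8-bit
 * estimate of 1/code_length_chips, and each vrecpsq_f32 step is one
 * Newton-Raphson iteration, r' = r * (2 - d * r), roughly doubling the number
 * of correct bits. Two iterations bring the reciprocal to full single
 * precision before it feeds the truncation-based fmod computed further down
 * (aux - trunc(aux * (1/L)) * L). Scalar sketch of the refinement:
 *
 *     float r = estimate;       // ~8 good bits, as from vrecpeq_f32
 *     r = r * (2.0f - d * r);   // ~16 bits
 *     r = r * (2.0f - d * r);   // ~23 bits: full float mantissa
 */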
float32x4_t n0 = vld1q_f32((float*)vec); for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++) @@ -709,7 +719,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_neon(lv_32fc_t** resul shifts_chips_reg = vdupq_n_f32((float)shifts_chips[current_correlator_tap]); aux2 = vsubq_f32(shifts_chips_reg, rem_code_phase_chips_reg); indexn = n0; - for(n = 0; n < neon_iters; n++) + for (n = 0; n < neon_iters; n++) { __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][4 * n + 3], 1, 0); __VOLK_GNSSSDR_PREFETCH(&local_code_chip_index[4]); @@ -725,7 +735,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_neon(lv_32fc_t** resul // fmod c = vmulq_f32(aux, reciprocal); - i = vcvtq_s32_f32(c); + i = vcvtq_s32_f32(c); cTrunc = vcvtq_f32_s32(i); base = vmulq_f32(cTrunc, code_length_chips_reg_f); aux = vsubq_f32(aux, base); @@ -737,13 +747,13 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_neon(lv_32fc_t** resul vst1q_s32((int32_t*)local_code_chip_index, local_code_chip_index_reg); - for(k = 0; k < 4; ++k) + for (k = 0; k < 4; ++k) { _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]]; } indexn = vaddq_f32(indexn, fours); } - for(n = neon_iters * 4; n < num_points; n++) + for (n = neon_iters * 4; n < num_points; n++) { __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][n], 1, 0); // resample code for current tap diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_64f_accumulator_64f.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_64f_accumulator_64f.h index e1d577c1e..b686b6c5d 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_64f_accumulator_64f.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_64f_accumulator_64f.h @@ -69,11 +69,12 @@ static inline void volk_gnsssdr_64f_accumulator_64f_u_avx(double* result, const unsigned int i; const double* aPtr = inputBuffer; - __VOLK_ATTR_ALIGNED(32) double tempBuffer[4]; + __VOLK_ATTR_ALIGNED(32) + double tempBuffer[4]; __m256d accumulator = _mm256_setzero_pd(); __m256d aVal = _mm256_setzero_pd(); - for(number = 0; number < sse_iters; number++) + for (number = 0; number < sse_iters; number++) { aVal = _mm256_loadu_pd(aPtr); accumulator = _mm256_add_pd(accumulator, aVal); @@ -82,12 +83,12 @@ static inline void volk_gnsssdr_64f_accumulator_64f_u_avx(double* result, const _mm256_storeu_pd((double*)tempBuffer, accumulator); - for(i = 0; i < 4; ++i) + for (i = 0; i < 4; ++i) { returnValue += tempBuffer[i]; } - for(i = 0; i < (num_points % 4); ++i) + for (i = 0; i < (num_points % 4); ++i) { returnValue += (*aPtr++); } @@ -100,7 +101,7 @@ static inline void volk_gnsssdr_64f_accumulator_64f_u_avx(double* result, const #ifdef LV_HAVE_SSE3 #include -static inline void volk_gnsssdr_64f_accumulator_64f_u_sse3(double* result,const double* inputBuffer, unsigned int num_points) +static inline void volk_gnsssdr_64f_accumulator_64f_u_sse3(double* result, const double* inputBuffer, unsigned int num_points) { double returnValue = 0; const unsigned int sse_iters = num_points / 2; @@ -108,11 +109,12 @@ static inline void volk_gnsssdr_64f_accumulator_64f_u_sse3(double* result,const unsigned int i; const double* aPtr = inputBuffer; - __VOLK_ATTR_ALIGNED(16) double tempBuffer[2]; + __VOLK_ATTR_ALIGNED(16) + double tempBuffer[2]; __m128d accumulator = _mm_setzero_pd(); 
__m128d aVal = _mm_setzero_pd(); - for(number = 0; number < sse_iters; number++) + for (number = 0; number < sse_iters; number++) { aVal = _mm_loadu_pd(aPtr); accumulator = _mm_add_pd(accumulator, aVal); @@ -121,12 +123,12 @@ static inline void volk_gnsssdr_64f_accumulator_64f_u_sse3(double* result,const _mm_storeu_pd((double*)tempBuffer, accumulator); - for(i = 0; i < 2; ++i) + for (i = 0; i < 2; ++i) { returnValue += tempBuffer[i]; } - for(i = 0; i < (num_points % 2); ++i) + for (i = 0; i < (num_points % 2); ++i) { returnValue += (*aPtr++); } @@ -138,13 +140,13 @@ static inline void volk_gnsssdr_64f_accumulator_64f_u_sse3(double* result,const #ifdef LV_HAVE_GENERIC -static inline void volk_gnsssdr_64f_accumulator_64f_generic(double* result,const double* inputBuffer, unsigned int num_points) +static inline void volk_gnsssdr_64f_accumulator_64f_generic(double* result, const double* inputBuffer, unsigned int num_points) { const double* aPtr = inputBuffer; double returnValue = 0; unsigned int number; - for(number = 0; number < num_points; number++) + for (number = 0; number < num_points; number++) { returnValue += (*aPtr++); } @@ -156,7 +158,7 @@ static inline void volk_gnsssdr_64f_accumulator_64f_generic(double* result,const #ifdef LV_HAVE_AVX #include -static inline void volk_gnsssdr_64f_accumulator_64f_a_avx(double* result,const double* inputBuffer, unsigned int num_points) +static inline void volk_gnsssdr_64f_accumulator_64f_a_avx(double* result, const double* inputBuffer, unsigned int num_points) { double returnValue = 0; const unsigned int sse_iters = num_points / 4; @@ -164,11 +166,12 @@ static inline void volk_gnsssdr_64f_accumulator_64f_a_avx(double* result,const d unsigned int i; const double* aPtr = inputBuffer; - __VOLK_ATTR_ALIGNED(32) double tempBuffer[4]; + __VOLK_ATTR_ALIGNED(32) + double tempBuffer[4]; __m256d accumulator = _mm256_setzero_pd(); __m256d aVal = _mm256_setzero_pd(); - for(number = 0; number < sse_iters; number++) + for (number = 0; number < sse_iters; number++) { aVal = _mm256_load_pd(aPtr); accumulator = _mm256_add_pd(accumulator, aVal); @@ -177,12 +180,12 @@ static inline void volk_gnsssdr_64f_accumulator_64f_a_avx(double* result,const d _mm256_store_pd((double*)tempBuffer, accumulator); - for(i = 0; i < 4; ++i) + for (i = 0; i < 4; ++i) { returnValue += tempBuffer[i]; } - for(i = 0; i < (num_points % 4); ++i) + for (i = 0; i < (num_points % 4); ++i) { returnValue += (*aPtr++); } @@ -195,7 +198,7 @@ static inline void volk_gnsssdr_64f_accumulator_64f_a_avx(double* result,const d #ifdef LV_HAVE_SSE3 #include -static inline void volk_gnsssdr_64f_accumulator_64f_a_sse3(double* result,const double* inputBuffer, unsigned int num_points) +static inline void volk_gnsssdr_64f_accumulator_64f_a_sse3(double* result, const double* inputBuffer, unsigned int num_points) { double returnValue = 0; const unsigned int sse_iters = num_points / 2; @@ -203,11 +206,12 @@ static inline void volk_gnsssdr_64f_accumulator_64f_a_sse3(double* result,const unsigned int i; const double* aPtr = inputBuffer; - __VOLK_ATTR_ALIGNED(16) double tempBuffer[2]; + __VOLK_ATTR_ALIGNED(16) + double tempBuffer[2]; __m128d accumulator = _mm_setzero_pd(); __m128d aVal = _mm_setzero_pd(); - for(number = 0; number < sse_iters; number++) + for (number = 0; number < sse_iters; number++) { aVal = _mm_load_pd(aPtr); accumulator = _mm_add_pd(accumulator, aVal); @@ -216,12 +220,12 @@ static inline void volk_gnsssdr_64f_accumulator_64f_a_sse3(double* result,const _mm_store_pd((double*)tempBuffer, accumulator); 
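/*
 * All accumulator variants in this file share one shape: a per-lane vector sum
 * in the main loop, one spill to an aligned temporary, a scalar reduction of
 * the lanes, then the num_points % width tail. Condensed sketch of the pattern
 * for the 2-wide double case above, reusing the surrounding identifiers:
 *
 *     __m128d acc = _mm_setzero_pd();
 *     for (number = 0; number < num_points / 2; number++)
 *         {
 *             acc = _mm_add_pd(acc, _mm_load_pd(aPtr));
 *             aPtr += 2;
 *         }
 *     __VOLK_ATTR_ALIGNED(16) double tmp[2];
 *     _mm_store_pd(tmp, acc);
 *     returnValue = tmp[0] + tmp[1];
 *     for (i = 0; i < (num_points % 2); ++i) returnValue += (*aPtr++);
 */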
- for(i = 0; i < 2; ++i) + for (i = 0; i < 2; ++i) { returnValue += tempBuffer[i]; } - for(i = 0; i < (num_points % 2); ++i) + for (i = 0; i < (num_points % 2); ++i) { returnValue += (*aPtr++); } diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_accumulator_s8i.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_accumulator_s8i.h index 8c2830cdc..9e141c6c4 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_accumulator_s8i.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_accumulator_s8i.h @@ -70,11 +70,12 @@ static inline void volk_gnsssdr_8i_accumulator_s8i_u_sse3(char* result, const ch unsigned int i; const char* aPtr = inputBuffer; - __VOLK_ATTR_ALIGNED(16) char tempBuffer[16]; + __VOLK_ATTR_ALIGNED(16) + char tempBuffer[16]; __m128i accumulator = _mm_setzero_si128(); __m128i aVal = _mm_setzero_si128(); - for(number = 0; number < sse_iters; number++) + for (number = 0; number < sse_iters; number++) { aVal = _mm_lddqu_si128((__m128i*)aPtr); accumulator = _mm_add_epi8(accumulator, aVal); @@ -82,12 +83,12 @@ static inline void volk_gnsssdr_8i_accumulator_s8i_u_sse3(char* result, const ch } _mm_storeu_si128((__m128i*)tempBuffer, accumulator); - for(i = 0; i < 16; ++i) + for (i = 0; i < 16; ++i) { returnValue += tempBuffer[i]; } - for(i = 0; i < (num_points % 16); ++i) + for (i = 0; i < (num_points % 16); ++i) { returnValue += (*aPtr++); } @@ -104,7 +105,7 @@ static inline void volk_gnsssdr_8i_accumulator_s8i_generic(char* result, const c const char* aPtr = inputBuffer; char returnValue = 0; unsigned int number; - for(number = 0;number < num_points; number++) + for (number = 0; number < num_points; number++) { returnValue += (*aPtr++); } @@ -125,24 +126,25 @@ static inline void volk_gnsssdr_8i_accumulator_s8i_a_sse3(char* result, const ch const char* aPtr = inputBuffer; - __VOLK_ATTR_ALIGNED(16) char tempBuffer[16]; + __VOLK_ATTR_ALIGNED(16) + char tempBuffer[16]; __m128i accumulator = _mm_setzero_si128(); __m128i aVal = _mm_setzero_si128(); - for(number = 0; number < sse_iters; number++) + for (number = 0; number < sse_iters; number++) { aVal = _mm_load_si128((__m128i*)aPtr); accumulator = _mm_add_epi8(accumulator, aVal); aPtr += 16; } - _mm_store_si128((__m128i*)tempBuffer,accumulator); + _mm_store_si128((__m128i*)tempBuffer, accumulator); - for(i = 0; i < 16; ++i) + for (i = 0; i < 16; ++i) { returnValue += tempBuffer[i]; } - for(i = 0; i < (num_points % 16); ++i) + for (i = 0; i < (num_points % 16); ++i) { returnValue += (*aPtr++); } @@ -164,24 +166,25 @@ static inline void volk_gnsssdr_8i_accumulator_s8i_a_avx2(char* result, const ch const char* aPtr = inputBuffer; - __VOLK_ATTR_ALIGNED(32) char tempBuffer[32]; + __VOLK_ATTR_ALIGNED(32) + char tempBuffer[32]; __m256i accumulator = _mm256_setzero_si256(); __m256i aVal = _mm256_setzero_si256(); - for(number = 0; number < sse_iters; number++) + for (number = 0; number < sse_iters; number++) { aVal = _mm256_load_si256((__m256i*)aPtr); accumulator = _mm256_add_epi8(accumulator, aVal); aPtr += 32; } - _mm256_store_si256((__m256i*)tempBuffer,accumulator); + _mm256_store_si256((__m256i*)tempBuffer, accumulator); - for(i = 0; i < 32; ++i) + for (i = 0; i < 32; ++i) { returnValue += tempBuffer[i]; } - for(i = 0; i < (num_points % 32); ++i) + for (i = 0; i < (num_points % 32); ++i) { returnValue += (*aPtr++); } @@ -202,11 +205,12 @@ static inline void 
volk_gnsssdr_8i_accumulator_s8i_u_avx2(char* result, const ch unsigned int i; const char* aPtr = inputBuffer; - __VOLK_ATTR_ALIGNED(32) char tempBuffer[32]; + __VOLK_ATTR_ALIGNED(32) + char tempBuffer[32]; __m256i accumulator = _mm256_setzero_si256(); __m256i aVal = _mm256_setzero_si256(); - for(number = 0; number < sse_iters; number++) + for (number = 0; number < sse_iters; number++) { aVal = _mm256_lddqu_si256((__m256i*)aPtr); accumulator = _mm256_add_epi8(accumulator, aVal); @@ -214,12 +218,12 @@ static inline void volk_gnsssdr_8i_accumulator_s8i_u_avx2(char* result, const ch } _mm256_storeu_si256((__m256i*)tempBuffer, accumulator); - for(i = 0; i < 32; ++i) + for (i = 0; i < 32; ++i) { returnValue += tempBuffer[i]; } - for(i = 0; i < (num_points % 32); ++i) + for (i = 0; i < (num_points % 32); ++i) { returnValue += (*aPtr++); } diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_index_max_16u.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_index_max_16u.h index 1f053f239..2af8c55d9 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_index_max_16u.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_index_max_16u.h @@ -60,11 +60,11 @@ #ifdef LV_HAVE_AVX2 -#include +#include static inline void volk_gnsssdr_8i_index_max_16u_u_avx2(unsigned int* target, const char* src0, unsigned int num_points) { - if(num_points > 0) + if (num_points > 0) { const unsigned int avx2_iters = num_points / 32; unsigned int number; @@ -74,14 +74,15 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_avx2(unsigned int* target, co char max = src0[0]; unsigned int index = 0; unsigned int mask; - __VOLK_ATTR_ALIGNED(32) char currentValuesBuffer[32]; + __VOLK_ATTR_ALIGNED(32) + char currentValuesBuffer[32]; __m256i maxValues, compareResults, currentValues; maxValues = _mm256_set1_epi8(max); - for(number = 0; number < avx2_iters; number++) + for (number = 0; number < avx2_iters; number++) { - currentValues = _mm256_loadu_si256((__m256i*)inputPtr); + currentValues = _mm256_loadu_si256((__m256i*)inputPtr); compareResults = _mm256_cmpgt_epi8(maxValues, currentValues); mask = _mm256_movemask_epi8(compareResults); @@ -94,7 +95,7 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_avx2(unsigned int* target, co { if ((mask & 1) == 1) { - if(currentValuesBuffer[i] > max) + if (currentValuesBuffer[i] > max) { index = inputPtr - basePtr + i; max = currentValuesBuffer[i]; @@ -108,9 +109,9 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_avx2(unsigned int* target, co inputPtr += 32; } - for(i = 0; i<(num_points % 32); ++i) + for (i = 0; i < (num_points % 32); ++i) { - if(src0[i] > max) + if (src0[i] > max) { index = i; max = src0[i]; @@ -128,7 +129,7 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_avx2(unsigned int* target, co static inline void volk_gnsssdr_8i_index_max_16u_u_avx(unsigned int* target, const char* src0, unsigned int num_points) { - if(num_points > 0) + if (num_points > 0) { const unsigned int sse_iters = num_points / 32; unsigned int number; @@ -137,33 +138,34 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_avx(unsigned int* target, con char* inputPtr = (char*)src0; char max = src0[0]; unsigned int index = 0; - __VOLK_ATTR_ALIGNED(32) char currentValuesBuffer[32]; + __VOLK_ATTR_ALIGNED(32) + char currentValuesBuffer[32]; __m256i ones, compareResults, currentValues; __m128i compareResultslo, 
compareResultshi, maxValues, lo, hi; ones = _mm256_set1_epi8(0xFF); maxValues = _mm_set1_epi8(max); - for(number = 0; number < sse_iters; number++) + for (number = 0; number < sse_iters; number++) { - currentValues = _mm256_lddqu_si256((__m256i*)inputPtr); + currentValues = _mm256_lddqu_si256((__m256i*)inputPtr); lo = _mm256_castsi256_si128(currentValues); - hi = _mm256_extractf128_si256(currentValues,1); + hi = _mm256_extractf128_si256(currentValues, 1); compareResultslo = _mm_cmpgt_epi8(maxValues, lo); compareResultshi = _mm_cmpgt_epi8(maxValues, hi); //compareResults = _mm256_set_m128i(compareResultshi , compareResultslo); //not defined in some versions of immintrin.h - compareResults = _mm256_insertf128_si256(_mm256_castsi128_si256(compareResultslo),(compareResultshi),1); + compareResults = _mm256_insertf128_si256(_mm256_castsi128_si256(compareResultslo), (compareResultshi), 1); if (!_mm256_testc_si256(compareResults, ones)) { _mm256_storeu_si256((__m256i*)&currentValuesBuffer, currentValues); - for(i = 0; i < 32; i++) + for (i = 0; i < 32; i++) { - if(currentValuesBuffer[i] > max) + if (currentValuesBuffer[i] > max) { index = inputPtr - basePtr + i; max = currentValuesBuffer[i]; @@ -175,9 +177,9 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_avx(unsigned int* target, con inputPtr += 32; } - for(i = 0; i<(num_points % 32); ++i) + for (i = 0; i < (num_points % 32); ++i) { - if(src0[i] > max) + if (src0[i] > max) { index = i; max = src0[i]; @@ -195,7 +197,7 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_avx(unsigned int* target, con static inline void volk_gnsssdr_8i_index_max_16u_u_sse4_1(unsigned int* target, const char* src0, unsigned int num_points) { - if(num_points > 0) + if (num_points > 0) { const unsigned int sse_iters = num_points / 16; unsigned int number; @@ -204,14 +206,15 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_sse4_1(unsigned int* target, char* inputPtr = (char*)src0; char max = src0[0]; unsigned int index = 0; - __VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16]; + __VOLK_ATTR_ALIGNED(16) + char currentValuesBuffer[16]; __m128i maxValues, compareResults, currentValues; maxValues = _mm_set1_epi8(max); - for(number = 0; number < sse_iters; number++) + for (number = 0; number < sse_iters; number++) { - currentValues = _mm_lddqu_si128((__m128i*)inputPtr); + currentValues = _mm_lddqu_si128((__m128i*)inputPtr); compareResults = _mm_cmpgt_epi8(maxValues, currentValues); @@ -219,9 +222,9 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_sse4_1(unsigned int* target, { _mm_storeu_si128((__m128i*)&currentValuesBuffer, currentValues); - for(i = 0; i < 16; i++) + for (i = 0; i < 16; i++) { - if(currentValuesBuffer[i] > max) + if (currentValuesBuffer[i] > max) { index = inputPtr - basePtr + i; max = currentValuesBuffer[i]; @@ -233,9 +236,9 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_sse4_1(unsigned int* target, inputPtr += 16; } - for(i = 0; i<(num_points % 16); ++i) + for (i = 0; i < (num_points % 16); ++i) { - if(src0[i] > max) + if (src0[i] > max) { index = i; max = src0[i]; @@ -249,11 +252,11 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_sse4_1(unsigned int* target, #ifdef LV_HAVE_SSE2 -#include +#include static inline void volk_gnsssdr_8i_index_max_16u_u_sse2(unsigned int* target, const char* src0, unsigned int num_points) { - if(num_points > 0) + if (num_points > 0) { const unsigned int sse_iters = num_points / 16; unsigned int number; @@ -263,14 +266,15 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_sse2(unsigned int* target, co
char max = src0[0]; unsigned int index = 0; unsigned short mask; - __VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16]; + __VOLK_ATTR_ALIGNED(16) + char currentValuesBuffer[16]; __m128i maxValues, compareResults, currentValues; maxValues = _mm_set1_epi8(max); - for(number = 0; number < sse_iters; number++) + for (number = 0; number < sse_iters; number++) { - currentValues = _mm_loadu_si128((__m128i*)inputPtr); + currentValues = _mm_loadu_si128((__m128i*)inputPtr); compareResults = _mm_cmpgt_epi8(maxValues, currentValues); mask = _mm_movemask_epi8(compareResults); @@ -283,7 +287,7 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_sse2(unsigned int* target, co { if ((mask & 1) == 1) { - if(currentValuesBuffer[i] > max) + if (currentValuesBuffer[i] > max) { index = inputPtr - basePtr + i; max = currentValuesBuffer[i]; @@ -297,9 +301,9 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_sse2(unsigned int* target, co inputPtr += 16; } - for(i = 0; i<(num_points % 16); ++i) + for (i = 0; i < (num_points % 16); ++i) { - if(src0[i] > max) + if (src0[i] > max) { index = i; max = src0[i]; @@ -316,14 +320,14 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_sse2(unsigned int* target, co static inline void volk_gnsssdr_8i_index_max_16u_generic(unsigned int* target, const char* src0, unsigned int num_points) { - if(num_points > 0) + if (num_points > 0) { char max = src0[0]; unsigned int index = 0; unsigned int i; - for(i = 1; i < num_points; ++i) + for (i = 1; i < num_points; ++i) { - if(src0[i] > max) + if (src0[i] > max) { index = i; max = src0[i]; @@ -337,11 +341,11 @@ static inline void volk_gnsssdr_8i_index_max_16u_generic(unsigned int* target, c #ifdef LV_HAVE_AVX2 -#include +#include static inline void volk_gnsssdr_8i_index_max_16u_a_avx2(unsigned int* target, const char* src0, unsigned int num_points) { - if(num_points > 0) + if (num_points > 0) { const unsigned int avx2_iters = num_points / 32; unsigned int number; @@ -351,14 +355,15 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_avx2(unsigned int* target, co char max = src0[0]; unsigned int index = 0; unsigned int mask; - __VOLK_ATTR_ALIGNED(32) char currentValuesBuffer[32]; + __VOLK_ATTR_ALIGNED(32) + char currentValuesBuffer[32]; __m256i maxValues, compareResults, currentValues; maxValues = _mm256_set1_epi8(max); - for(number = 0; number < avx2_iters; number++) + for (number = 0; number < avx2_iters; number++) { - currentValues = _mm256_load_si256((__m256i*)inputPtr); + currentValues = _mm256_load_si256((__m256i*)inputPtr); compareResults = _mm256_cmpgt_epi8(maxValues, currentValues); mask = _mm256_movemask_epi8(compareResults); @@ -371,7 +376,7 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_avx2(unsigned int* target, co { if ((mask & 1) == 1) { - if(currentValuesBuffer[i] > max) + if (currentValuesBuffer[i] > max) { index = inputPtr - basePtr + i; max = currentValuesBuffer[i]; @@ -385,9 +390,9 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_avx2(unsigned int* target, co inputPtr += 32; } - for(i = 0; i<(num_points % 32); ++i) + for (i = 0; i < (num_points % 32); ++i) { - if(src0[i] > max) + if (src0[i] > max) { index = i; max = src0[i]; @@ -405,7 +410,7 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_avx2(unsigned int* target, co static inline void volk_gnsssdr_8i_index_max_16u_a_avx(unsigned int* target, const char* src0, unsigned int num_points) { - if(num_points > 0) + if (num_points > 0) { const unsigned int sse_iters = num_points / 32; unsigned int number; @@ -414,19 +419,20 @@ static inline void 
volk_gnsssdr_8i_index_max_16u_a_avx(unsigned int* target, con char* inputPtr = (char*)src0; char max = src0[0]; unsigned int index = 0; - __VOLK_ATTR_ALIGNED(32) char currentValuesBuffer[32]; + __VOLK_ATTR_ALIGNED(32) + char currentValuesBuffer[32]; __m256i ones, compareResults, currentValues; __m128i compareResultslo, compareResultshi, maxValues, lo, hi; ones = _mm256_set1_epi8(0xFF); maxValues = _mm_set1_epi8(max); - for(number = 0; number < sse_iters; number++) + for (number = 0; number < sse_iters; number++) { - currentValues = _mm256_load_si256((__m256i*)inputPtr); + currentValues = _mm256_load_si256((__m256i*)inputPtr); lo = _mm256_castsi256_si128(currentValues); - hi = _mm256_extractf128_si256(currentValues,1); + hi = _mm256_extractf128_si256(currentValues, 1); compareResultslo = _mm_cmpgt_epi8(maxValues, lo); compareResultshi = _mm_cmpgt_epi8(maxValues, hi); @@ -438,9 +444,9 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_avx(unsigned int* target, con { _mm256_store_si256((__m256i*)&currentValuesBuffer, currentValues); - for(i = 0; i < 32; i++) + for (i = 0; i < 32; i++) { - if(currentValuesBuffer[i] > max) + if (currentValuesBuffer[i] > max) { index = inputPtr - basePtr + i; max = currentValuesBuffer[i]; @@ -452,9 +458,9 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_avx(unsigned int* target, con inputPtr += 32; } - for(i = 0; i<(num_points % 32); ++i) + for (i = 0; i < (num_points % 32); ++i) { - if(src0[i] > max) + if (src0[i] > max) { index = i; max = src0[i]; @@ -472,7 +478,7 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_avx(unsigned int* target, con static inline void volk_gnsssdr_8i_index_max_16u_a_sse4_1(unsigned int* target, const char* src0, unsigned int num_points) { - if(num_points > 0) + if (num_points > 0) { const unsigned int sse_iters = num_points / 16; unsigned int number; @@ -481,14 +487,15 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_sse4_1(unsigned int* target, char* inputPtr = (char*)src0; char max = src0[0]; unsigned int index = 0; - __VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16]; + __VOLK_ATTR_ALIGNED(16) + char currentValuesBuffer[16]; __m128i maxValues, compareResults, currentValues; maxValues = _mm_set1_epi8(max); - for(number = 0; number < sse_iters; number++) + for (number = 0; number < sse_iters; number++) { - currentValues = _mm_load_si128((__m128i*)inputPtr); + currentValues = _mm_load_si128((__m128i*)inputPtr); compareResults = _mm_cmpgt_epi8(maxValues, currentValues); @@ -496,9 +503,9 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_sse4_1(unsigned int* target, { _mm_store_si128((__m128i*)&currentValuesBuffer, currentValues); - for(i = 0; i < 16; i++) + for (i = 0; i < 16; i++) { - if(currentValuesBuffer[i] > max) + if (currentValuesBuffer[i] > max) { index = inputPtr - basePtr + i; max = currentValuesBuffer[i]; @@ -510,9 +517,9 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_sse4_1(unsigned int* target, inputPtr += 16; } - for(i = 0; i<(num_points % 16); ++i) + for (i = 0; i < (num_points % 16); ++i) { - if(src0[i] > max) + if (src0[i] > max) { index = i; max = src0[i]; @@ -530,7 +537,7 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_sse2(unsigned int* target, const char* src0, unsigned int num_points) { - if(num_points > 0) + if (num_points > 0) { const unsigned int sse_iters = num_points / 16; unsigned int number; @@ -540,14 +547,15 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_sse2(unsigned int* target, co char max =
src0[0]; unsigned int index = 0; unsigned short mask; - __VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16]; + __VOLK_ATTR_ALIGNED(16) + char currentValuesBuffer[16]; __m128i maxValues, compareResults, currentValues; maxValues = _mm_set1_epi8(max); - for(number = 0; number < sse_iters; number++) + for (number = 0; number < sse_iters; number++) { - currentValues = _mm_load_si128((__m128i*)inputPtr); + currentValues = _mm_load_si128((__m128i*)inputPtr); compareResults = _mm_cmpgt_epi8(maxValues, currentValues); mask = _mm_movemask_epi8(compareResults); @@ -560,7 +568,7 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_sse2(unsigned int* target, co { if ((mask & 1) == 1) { - if(currentValuesBuffer[i] > max) + if (currentValuesBuffer[i] > max) { index = inputPtr - basePtr + i; max = currentValuesBuffer[i]; @@ -574,9 +582,9 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_sse2(unsigned int* target, co inputPtr += 16; } - for(i = 0; i<(num_points % 16); ++i) + for (i = 0; i < (num_points % 16); ++i) { - if(src0[i] > max) + if (src0[i] > max) { index = i; max = src0[i]; diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_max_s8i.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_max_s8i.h index 109c4f779..d748281c3 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_max_s8i.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_max_s8i.h @@ -63,21 +63,22 @@ static inline void volk_gnsssdr_8i_max_s8i_u_avx2(char* target, const char* src0, unsigned int num_points) { - if(num_points > 0) + if (num_points > 0) { const unsigned int avx_iters = num_points / 32; unsigned int number; unsigned int i; char* inputPtr = (char*)src0; char max = src0[0]; - __VOLK_ATTR_ALIGNED(32) char maxValuesBuffer[32]; + __VOLK_ATTR_ALIGNED(32) + char maxValuesBuffer[32]; __m256i maxValues, compareResults, currentValues; maxValues = _mm256_set1_epi8(max); - for(number = 0; number < avx_iters; number++) + for (number = 0; number < avx_iters; number++) { - currentValues = _mm256_loadu_si256((__m256i*)inputPtr); + currentValues = _mm256_loadu_si256((__m256i*)inputPtr); compareResults = _mm256_max_epi8(maxValues, currentValues); maxValues = compareResults; inputPtr += 32; @@ -85,17 +86,17 @@ static inline void volk_gnsssdr_8i_max_s8i_u_avx2(char* target, const char* src0 _mm256_storeu_si256((__m256i*)maxValuesBuffer, maxValues); - for(i = 0; i < 32; ++i) + for (i = 0; i < 32; ++i) { - if(maxValuesBuffer[i] > max) + if (maxValuesBuffer[i] > max) { max = maxValuesBuffer[i]; } } - for(i = avx_iters * 32; i < num_points; ++i) + for (i = avx_iters * 32; i < num_points; ++i) { - if(src0[i] > max) + if (src0[i] > max) { max = src0[i]; } @@ -112,21 +113,22 @@ static inline void volk_gnsssdr_8i_max_s8i_u_avx2(char* target, const char* src0 static inline void volk_gnsssdr_8i_max_s8i_u_sse4_1(char* target, const char* src0, unsigned int num_points) { - if(num_points > 0) + if (num_points > 0) { const unsigned int sse_iters = num_points / 16; unsigned int number; unsigned int i; char* inputPtr = (char*)src0; char max = src0[0]; - __VOLK_ATTR_ALIGNED(16) char maxValuesBuffer[16]; + __VOLK_ATTR_ALIGNED(16) + char maxValuesBuffer[16]; __m128i maxValues, compareResults, currentValues; maxValues = _mm_set1_epi8(max); - for(number = 0; number < sse_iters; number++) + for (number = 0; number < sse_iters; number++) { - currentValues = 
_mm_loadu_si128((__m128i*)inputPtr); + currentValues = _mm_loadu_si128((__m128i*)inputPtr); compareResults = _mm_cmpgt_epi8(maxValues, currentValues); maxValues = _mm_blendv_epi8(currentValues, maxValues, compareResults); inputPtr += 16; @@ -134,17 +136,17 @@ static inline void volk_gnsssdr_8i_max_s8i_u_sse4_1(char* target, const char* sr _mm_storeu_si128((__m128i*)maxValuesBuffer, maxValues); - for(i = 0; i < 16; ++i) + for (i = 0; i < 16; ++i) { - if(maxValuesBuffer[i] > max) + if (maxValuesBuffer[i] > max) { max = maxValuesBuffer[i]; } } - for(i = sse_iters * 16; i < num_points; ++i) + for (i = sse_iters * 16; i < num_points; ++i) { - if(src0[i] > max) + if (src0[i] > max) { max = src0[i]; } @@ -157,11 +159,11 @@ static inline void volk_gnsssdr_8i_max_s8i_u_sse4_1(char* target, const char* sr #ifdef LV_HAVE_SSE2 -#include +#include static inline void volk_gnsssdr_8i_max_s8i_u_sse2(char* target, const char* src0, unsigned int num_points) { - if(num_points > 0) + if (num_points > 0) { const unsigned int sse_iters = num_points / 16; unsigned int number; @@ -169,14 +171,15 @@ static inline void volk_gnsssdr_8i_max_s8i_u_sse2(char* target, const char* src0 char* inputPtr = (char*)src0; char max = src0[0]; unsigned short mask; - __VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16]; + __VOLK_ATTR_ALIGNED(16) + char currentValuesBuffer[16]; __m128i maxValues, compareResults, currentValues; maxValues = _mm_set1_epi8(max); - for(number = 0; number < sse_iters; number++) + for (number = 0; number < sse_iters; number++) { - currentValues = _mm_loadu_si128((__m128i*)inputPtr); + currentValues = _mm_loadu_si128((__m128i*)inputPtr); compareResults = _mm_cmpgt_epi8(maxValues, currentValues); mask = _mm_movemask_epi8(compareResults); @@ -189,7 +192,7 @@ static inline void volk_gnsssdr_8i_max_s8i_u_sse2(char* target, const char* src0 { if ((mask & 1) == 1) { - if(currentValuesBuffer[i] > max) + if (currentValuesBuffer[i] > max) { max = currentValuesBuffer[i]; } @@ -202,9 +205,9 @@ static inline void volk_gnsssdr_8i_max_s8i_u_sse2(char* target, const char* src0 inputPtr += 16; } - for(i = sse_iters * 16; i < num_points; ++i) + for (i = sse_iters * 16; i < num_points; ++i) { - if(src0[i] > max) + if (src0[i] > max) { max = src0[i]; } @@ -220,13 +223,13 @@ static inline void volk_gnsssdr_8i_max_s8i_u_sse2(char* target, const char* src0 static inline void volk_gnsssdr_8i_max_s8i_generic(char* target, const char* src0, unsigned int num_points) { - if(num_points > 0) + if (num_points > 0) { char max = src0[0]; unsigned int i; - for(i = 1; i < num_points; ++i) + for (i = 1; i < num_points; ++i) { - if(src0[i] > max) + if (src0[i] > max) { max = src0[i]; } @@ -243,21 +246,22 @@ static inline void volk_gnsssdr_8i_max_s8i_generic(char* target, const char* src static inline void volk_gnsssdr_8i_max_s8i_a_sse4_1(char* target, const char* src0, unsigned int num_points) { - if(num_points > 0) + if (num_points > 0) { const unsigned int sse_iters = num_points / 16; unsigned int number; unsigned int i; char* inputPtr = (char*)src0; char max = src0[0]; - __VOLK_ATTR_ALIGNED(16) char maxValuesBuffer[16]; + __VOLK_ATTR_ALIGNED(16) + char maxValuesBuffer[16]; __m128i maxValues, compareResults, currentValues; maxValues = _mm_set1_epi8(max); - for(number = 0; number < sse_iters; number++) + for (number = 0; number < sse_iters; number++) { - currentValues = _mm_load_si128((__m128i*)inputPtr); + currentValues = _mm_load_si128((__m128i*)inputPtr); compareResults = _mm_cmpgt_epi8(maxValues, currentValues); maxValues = 
_mm_blendv_epi8(currentValues, maxValues, compareResults); inputPtr += 16; @@ -265,17 +269,17 @@ static inline void volk_gnsssdr_8i_max_s8i_a_sse4_1(char* target, const char* sr _mm_store_si128((__m128i*)maxValuesBuffer, maxValues); - for(i = 0; i < 16; ++i) + for (i = 0; i < 16; ++i) { - if(maxValuesBuffer[i] > max) + if (maxValuesBuffer[i] > max) { max = maxValuesBuffer[i]; } } - for(i = sse_iters * 16; i < num_points; ++i) + for (i = sse_iters * 16; i < num_points; ++i) { - if(src0[i] > max) + if (src0[i] > max) { max = src0[i]; } @@ -292,39 +296,40 @@ static inline void volk_gnsssdr_8i_max_s8i_a_sse4_1(char* target, const char* sr static inline void volk_gnsssdr_8i_max_s8i_a_avx2(char* target, const char* src0, unsigned int num_points) { - if(num_points > 0) + if (num_points > 0) { const unsigned int avx_iters = num_points / 32; unsigned int number; unsigned int i; char* inputPtr = (char*)src0; char max = src0[0]; - __VOLK_ATTR_ALIGNED(32) char maxValuesBuffer[32]; + __VOLK_ATTR_ALIGNED(32) + char maxValuesBuffer[32]; __m256i maxValues, compareResults, currentValues; maxValues = _mm256_set1_epi8(max); - for(number = 0; number < avx_iters; number++) + for (number = 0; number < avx_iters; number++) { - currentValues = _mm256_load_si256((__m256i*)inputPtr); + currentValues = _mm256_load_si256((__m256i*)inputPtr); compareResults = _mm256_max_epi8(maxValues, currentValues); - maxValues = compareResults; //_mm256_blendv_epi8(currentValues, maxValues, compareResults); + maxValues = compareResults; //_mm256_blendv_epi8(currentValues, maxValues, compareResults); inputPtr += 32; } _mm256_store_si256((__m256i*)maxValuesBuffer, maxValues); - for(i = 0; i < 32; ++i) + for (i = 0; i < 32; ++i) { - if(maxValuesBuffer[i] > max) + if (maxValuesBuffer[i] > max) { max = maxValuesBuffer[i]; } } - for(i = avx_iters * 32; i < num_points; ++i) + for (i = avx_iters * 32; i < num_points; ++i) { - if(src0[i] > max) + if (src0[i] > max) { max = src0[i]; } @@ -341,7 +346,7 @@ static inline void volk_gnsssdr_8i_max_s8i_a_avx2(char* target, const char* src0 static inline void volk_gnsssdr_8i_max_s8i_a_sse2(char* target, const char* src0, unsigned int num_points) { - if(num_points > 0) + if (num_points > 0) { const unsigned int sse_iters = num_points / 16; unsigned int number; @@ -349,14 +354,15 @@ static inline void volk_gnsssdr_8i_max_s8i_a_sse2(char* target, const char* src0 char* inputPtr = (char*)src0; char max = src0[0]; unsigned short mask; - __VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16]; + __VOLK_ATTR_ALIGNED(16) + char currentValuesBuffer[16]; __m128i maxValues, compareResults, currentValues; maxValues = _mm_set1_epi8(max); - for(number = 0; number < sse_iters; number++) + for (number = 0; number < sse_iters; number++) { - currentValues = _mm_load_si128((__m128i*)inputPtr); + currentValues = _mm_load_si128((__m128i*)inputPtr); compareResults = _mm_cmpgt_epi8(maxValues, currentValues); mask = _mm_movemask_epi8(compareResults); @@ -369,7 +375,7 @@ static inline void volk_gnsssdr_8i_max_s8i_a_sse2(char* target, const char* src0 { if ((mask & 1) == 1) { - if(currentValuesBuffer[i] > max) + if (currentValuesBuffer[i] > max) { max = currentValuesBuffer[i]; } @@ -382,9 +388,9 @@ static inline void volk_gnsssdr_8i_max_s8i_a_sse2(char* target, const char* src0 inputPtr += 16; } - for(i = sse_iters * 16; i < num_points; ++i) + for (i = sse_iters * 16; i < num_points; ++i) { - if(src0[i] > max) + if (src0[i] > max) { max = src0[i]; } diff --git 
a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_x2_add_8i.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_x2_add_8i.h index 3854319fd..4d25cf923 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_x2_add_8i.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_x2_add_8i.h @@ -72,21 +72,21 @@ static inline void volk_gnsssdr_8i_x2_add_8i_u_sse2(char* cVector, const char* a __m128i aVal, bVal, cVal; - for(number = 0; number < sse_iters; number++) + for (number = 0; number < sse_iters; number++) { aVal = _mm_loadu_si128((__m128i*)aPtr); bVal = _mm_loadu_si128((__m128i*)bPtr); cVal = _mm_add_epi8(aVal, bVal); - _mm_storeu_si128((__m128i*)cPtr, cVal); // Store the results back into the C container + _mm_storeu_si128((__m128i*)cPtr, cVal); // Store the results back into the C container aPtr += 16; bPtr += 16; cPtr += 16; } - for(i = sse_iters * 16; i < num_points; ++i) + for (i = sse_iters * 16; i < num_points; ++i) { *cPtr++ = (*aPtr++) + (*bPtr++); } @@ -108,21 +108,21 @@ static inline void volk_gnsssdr_8i_x2_add_8i_u_avx2(char* cVector, const char* a __m256i aVal, bVal, cVal; - for(number = 0; number < avx_iters; number++) + for (number = 0; number < avx_iters; number++) { aVal = _mm256_loadu_si256((__m256i*)aPtr); bVal = _mm256_loadu_si256((__m256i*)bPtr); cVal = _mm256_add_epi8(aVal, bVal); - _mm256_storeu_si256((__m256i*)cPtr, cVal); // Store the results back into the C container + _mm256_storeu_si256((__m256i*)cPtr, cVal); // Store the results back into the C container aPtr += 32; bPtr += 32; cPtr += 32; } - for(i = avx_iters * 32; i < num_points; ++i) + for (i = avx_iters * 32; i < num_points; ++i) { *cPtr++ = (*aPtr++) + (*bPtr++); } @@ -139,7 +139,7 @@ static inline void volk_gnsssdr_8i_x2_add_8i_generic(char* cVector, const char* const char* bPtr = bVector; unsigned int number; - for(number = 0; number < num_points; number++) + for (number = 0; number < num_points; number++) { *cPtr++ = (*aPtr++) + (*bPtr++); } @@ -161,21 +161,21 @@ static inline void volk_gnsssdr_8i_x2_add_8i_a_sse2(char* cVector, const char* a __m128i aVal, bVal, cVal; - for(number = 0; number < sse_iters; number++) + for (number = 0; number < sse_iters; number++) { aVal = _mm_load_si128((__m128i*)aPtr); bVal = _mm_load_si128((__m128i*)bPtr); cVal = _mm_add_epi8(aVal, bVal); - _mm_store_si128((__m128i*)cPtr, cVal); // Store the results back into the C container + _mm_store_si128((__m128i*)cPtr, cVal); // Store the results back into the C container aPtr += 16; bPtr += 16; cPtr += 16; } - for(i = sse_iters * 16; i < num_points; ++i) + for (i = sse_iters * 16; i < num_points; ++i) { *cPtr++ = (*aPtr++) + (*bPtr++); } @@ -197,21 +197,21 @@ static inline void volk_gnsssdr_8i_x2_add_8i_a_avx2(char* cVector, const char* a __m256i aVal, bVal, cVal; - for(number = 0; number < avx_iters; number++) + for (number = 0; number < avx_iters; number++) { aVal = _mm256_load_si256((__m256i*)aPtr); bVal = _mm256_load_si256((__m256i*)bPtr); cVal = _mm256_add_epi8(aVal, bVal); - _mm256_store_si256((__m256i*)cPtr, cVal); // Store the results back into the C container + _mm256_store_si256((__m256i*)cPtr, cVal); // Store the results back into the C container aPtr += 32; bPtr += 32; cPtr += 32; } - for(i = avx_iters * 32; i < num_points; ++i) + for (i = avx_iters * 32; i < num_points; ++i) { *cPtr++ = (*aPtr++) + (*bPtr++); } diff --git 
a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_conjugate_8ic.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_conjugate_8ic.h index 830128a83..177b1114d 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_conjugate_8ic.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_conjugate_8ic.h @@ -111,10 +111,10 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_u_avx(lv_8sc_t* cVector, const tmp = _mm256_xor_ps(tmp, conjugator1); tmp128lo = _mm256_castsi256_si128(_mm256_castps_si256(tmp)); tmp128lo = _mm_add_epi8(tmp128lo, conjugator2); - tmp128hi = _mm256_extractf128_si256(_mm256_castps_si256(tmp),1); + tmp128hi = _mm256_extractf128_si256(_mm256_castps_si256(tmp), 1); tmp128hi = _mm_add_epi8(tmp128hi, conjugator2); //tmp = _mm256_set_m128i(tmp128hi , tmp128lo); //not defined in some versions of immintrin.h - tmp = _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(tmp128lo),(tmp128hi),1)); + tmp = _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(tmp128lo), (tmp128hi), 1)); _mm256_storeu_ps((float*)c, tmp); a += 16; @@ -155,7 +155,6 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_u_ssse3(lv_8sc_t* cVector, con { *c++ = lv_conj(*a++); } - } #endif /* LV_HAVE_SSSE3 */ @@ -188,7 +187,6 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_u_sse3(lv_8sc_t* cVector, cons { *c++ = lv_conj(*a++); } - } #endif /* LV_HAVE_SSE3 */ @@ -201,7 +199,7 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_generic(lv_8sc_t* cVector, con const lv_8sc_t* aPtr = aVector; unsigned int number; - for(number = 0; number < num_points; number++) + for (number = 0; number < num_points; number++) { *cPtr++ = lv_conj(*aPtr++); } @@ -230,10 +228,10 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_a_avx(lv_8sc_t* cVector, const tmp = _mm256_xor_ps(tmp, conjugator1); tmp128lo = _mm256_castsi256_si128(_mm256_castps_si256(tmp)); tmp128lo = _mm_add_epi8(tmp128lo, conjugator2); - tmp128hi = _mm256_extractf128_si256(_mm256_castps_si256(tmp),1); + tmp128hi = _mm256_extractf128_si256(_mm256_castps_si256(tmp), 1); tmp128hi = _mm_add_epi8(tmp128hi, conjugator2); //tmp = _mm256_set_m128i(tmp128hi , tmp128lo); //not defined in some versions of immintrin.h - tmp = _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(tmp128lo),(tmp128hi),1)); + tmp = _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(tmp128lo), (tmp128hi), 1)); _mm256_store_ps((float*)c, tmp); a += 16; @@ -336,7 +334,6 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_a_sse3(lv_8sc_t* cVector, cons { *c++ = lv_conj(*a++); } - } #endif /* LV_HAVE_SSE3 */ diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_magnitude_squared_8i.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_magnitude_squared_8i.h index 7152b0f29..d9dd67716 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_magnitude_squared_8i.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_magnitude_squared_8i.h @@ -78,23 +78,23 @@ static inline void volk_gnsssdr_8ic_magnitude_squared_8i_u_sse3(char* magnitudeV maska = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); maskb = _mm_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); - for(number = 0; number < sse_iters; number++) + for (number = 0; number < sse_iters; number++) { avector = _mm_lddqu_si128((__m128i*)complexVectorPtr); - avectorlo = _mm_unpacklo_epi8 (avector, zero); - avectorhi = _mm_unpackhi_epi8 (avector, zero); - avectorlomult = _mm_mullo_epi16 (avectorlo, avectorlo); - avectorhimult = _mm_mullo_epi16 (avectorhi, avectorhi); - aadded = _mm_hadd_epi16 (avectorlomult, avectorhimult); + avectorlo = _mm_unpacklo_epi8(avector, zero); + avectorhi = _mm_unpackhi_epi8(avector, zero); + avectorlomult = _mm_mullo_epi16(avectorlo, avectorlo); + avectorhimult = _mm_mullo_epi16(avectorhi, avectorhi); + aadded = _mm_hadd_epi16(avectorlomult, avectorhimult); complexVectorPtr += 16; bvector = _mm_lddqu_si128((__m128i*)complexVectorPtr); - bvectorlo = _mm_unpacklo_epi8 (bvector, zero); - bvectorhi = _mm_unpackhi_epi8 (bvector, zero); - bvectorlomult = _mm_mullo_epi16 (bvectorlo, bvectorlo); - bvectorhimult = _mm_mullo_epi16 (bvectorhi, bvectorhi); - badded = _mm_hadd_epi16 (bvectorlomult, bvectorhimult); + bvectorlo = _mm_unpacklo_epi8(bvector, zero); + bvectorhi = _mm_unpackhi_epi8(bvector, zero); + bvectorlomult = _mm_mullo_epi16(bvectorlo, bvectorlo); + bvectorhimult = _mm_mullo_epi16(bvectorhi, bvectorhi); + badded = _mm_hadd_epi16(bvectorlomult, bvectorhimult); complexVectorPtr += 16; @@ -162,11 +162,11 @@ static inline void volk_gnsssdr_8ic_magnitude_squared_8i_generic(char* magnitude const char* complexVectorPtr = (char*)complexVector; char* magnitudeVectorPtr = magnitudeVector; unsigned int number; - for(number = 0; number < num_points; number++) + for (number = 0; number < num_points; number++) { const char real = *complexVectorPtr++; const char imag = *complexVectorPtr++; - *magnitudeVectorPtr++ = (real*real) + (imag*imag); + *magnitudeVectorPtr++ = (real * real) + (imag * imag); } } #endif /* LV_HAVE_GENERIC */ @@ -192,23 +192,23 @@ static inline void volk_gnsssdr_8ic_magnitude_squared_8i_a_sse3(char* magnitudeV maska = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); maskb = _mm_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); - for(number = 0; number < sse_iters; number++) + for (number = 0; number < sse_iters; number++) { avector = _mm_load_si128((__m128i*)complexVectorPtr); - avectorlo = _mm_unpacklo_epi8 (avector, zero); - avectorhi = _mm_unpackhi_epi8 (avector, zero); - avectorlomult = _mm_mullo_epi16 (avectorlo, avectorlo); - avectorhimult = _mm_mullo_epi16 (avectorhi, avectorhi); - aadded = _mm_hadd_epi16 (avectorlomult, avectorhimult); + avectorlo = _mm_unpacklo_epi8(avector, zero); + avectorhi = _mm_unpackhi_epi8(avector, zero); + avectorlomult = _mm_mullo_epi16(avectorlo, avectorlo); + avectorhimult = _mm_mullo_epi16(avectorhi, avectorhi); + aadded = _mm_hadd_epi16(avectorlomult, avectorhimult); complexVectorPtr += 16; bvector = _mm_load_si128((__m128i*)complexVectorPtr); - bvectorlo = _mm_unpacklo_epi8 (bvector, zero); - bvectorhi = _mm_unpackhi_epi8 (bvector, zero); - bvectorlomult = _mm_mullo_epi16 (bvectorlo, bvectorlo); - bvectorhimult = _mm_mullo_epi16 (bvectorhi, bvectorhi); - badded = _mm_hadd_epi16 (bvectorlomult, bvectorhimult); + bvectorlo = _mm_unpacklo_epi8(bvector, zero); + bvectorhi = _mm_unpackhi_epi8(bvector, zero); + bvectorlomult = _mm_mullo_epi16(bvectorlo, bvectorlo); + bvectorhimult = _mm_mullo_epi16(bvectorhi, bvectorhi); + badded = _mm_hadd_epi16(bvectorlomult, bvectorhimult); complexVectorPtr += 16; 
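[Editor's note: illustrative sketch, not part of the patch.] Every hunk in this section applies the same mechanical clang-format rules: a space after the for/if keyword, spaces around binary operators, two spaces before trailing comments, and __VOLK_ATTR_ALIGNED(n) moved onto its own line above the declaration it qualifies. The self-contained C sketch below shows the resulting layout on a hypothetical scalar kernel; the kernel name and the fallback macro definition are assumptions for illustration only (in the library the macro is provided by volk_gnsssdr/volk_gnsssdr_common.h):

/* Fallback definition so this sketch compiles outside the library tree;
   assumes a GCC-compatible compiler. */
#ifndef __VOLK_ATTR_ALIGNED
#define __VOLK_ATTR_ALIGNED(x) __attribute__((aligned(x)))
#endif

/* Hypothetical kernel, laid out the way this patch formats the real ones. */
static inline void example_8i_max_formatted(char* target, const char* src0, unsigned int num_points)
{
    if (num_points > 0)  /* space between keyword and '(' */
        {
            __VOLK_ATTR_ALIGNED(16)
            char buffer[16];  /* attribute split onto its own line */
            unsigned int i;
            char max = src0[0];
            for (i = 1; i < num_points; ++i)  /* spaces around binary operators */
                {
                    if (src0[i] > max)
                        {
                            max = src0[i];
                        }
                }
            buffer[0] = max;
            *target = buffer[0];  // Store the result back into the output container
        }
}
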
diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_s8ic_multiply_8ic.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_s8ic_multiply_8ic.h index 21b1abb1b..3c949b3db 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_s8ic_multiply_8ic.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_s8ic_multiply_8ic.h @@ -80,7 +80,7 @@ static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_u_sse3(lv_8sc_t* cVector, imagy = _mm_and_si128(imagy, mult1); realy = _mm_and_si128(y, mult1); - for(; number < sse_iters; number++) + for (; number < sse_iters; number++) { x = _mm_lddqu_si128((__m128i*)a); @@ -111,7 +111,6 @@ static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_u_sse3(lv_8sc_t* cVector, { *c++ = (*a++) * scalar; } - } #endif /* LV_HAVE_SSE3 */ @@ -173,7 +172,7 @@ static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_a_sse3(lv_8sc_t* cVector, imagy = _mm_and_si128(imagy, mult1); realy = _mm_and_si128(y, mult1); - for(; number < sse_iters; number++) + for (; number < sse_iters; number++) { x = _mm_load_si128((__m128i*)a); @@ -204,7 +203,6 @@ static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_a_sse3(lv_8sc_t* cVector, { *c++ = (*a++) * scalar; } - } #endif /* LV_HAVE_SSE3 */ diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_dot_prod_8ic.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_dot_prod_8ic.h index e9633d682..88a689f8b 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_dot_prod_8ic.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_dot_prod_8ic.h @@ -75,17 +75,17 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_generic(lv_8sc_t* result, co *cPtr += (*aPtr++) * (*bPtr++); }*/ - char * res = (char*) result; - char * in = (char*) in_a; - char * tp = (char*) in_b; - unsigned int n_2_ccomplex_blocks = num_points/2; + char* res = (char*)result; + char* in = (char*)in_a; + char* tp = (char*)in_b; + unsigned int n_2_ccomplex_blocks = num_points / 2; unsigned int isodd = num_points & 1; - char sum0[2] = {0,0}; - char sum1[2] = {0,0}; + char sum0[2] = {0, 0}; + char sum1[2] = {0, 0}; unsigned int i = 0; - for(i = 0; i < n_2_ccomplex_blocks; ++i) + for (i = 0; i < n_2_ccomplex_blocks; ++i) { sum0[0] += in[0] * tp[0] - in[1] * tp[1]; sum0[1] += in[0] * tp[1] + in[1] * tp[0]; @@ -100,7 +100,7 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_generic(lv_8sc_t* result, co res[1] = sum0[1] + sum1[1]; // Cleanup if we had an odd number of points - for(i = 0; i < isodd; ++i) + for (i = 0; i < isodd; ++i) { *result += in_a[num_points - 1] * in_b[num_points - 1]; } @@ -115,13 +115,13 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_generic(lv_8sc_t* result, co static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse2(lv_8sc_t* result, const lv_8sc_t* in_a, const lv_8sc_t* in_b, unsigned int num_points) { lv_8sc_t dotProduct; - memset(&dotProduct, 0x0, 2*sizeof(char)); + memset(&dotProduct, 0x0, 2 * sizeof(char)); unsigned int number; unsigned int i; const lv_8sc_t* a = in_a; const lv_8sc_t* b = in_b; - const unsigned int sse_iters = num_points/8; + const unsigned int sse_iters = num_points / 8; if (sse_iters > 0) { @@ -131,7 +131,7 @@ static inline void 
volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse2(lv_8sc_t* result, con realcacc = _mm_setzero_si128(); imagcacc = _mm_setzero_si128(); - for(number = 0; number < sse_iters; number++) + for (number = 0; number < sse_iters; number++) { x = _mm_loadu_si128((__m128i*)a); y = _mm_loadu_si128((__m128i*)b); @@ -165,9 +165,10 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse2(lv_8sc_t* result, con totalc = _mm_or_si128(realcacc, imagcacc); - __VOLK_ATTR_ALIGNED(16) lv_8sc_t dotProductVector[8]; + __VOLK_ATTR_ALIGNED(16) + lv_8sc_t dotProductVector[8]; - _mm_storeu_si128((__m128i*)dotProductVector, totalc); // Store the results back into the dot product vector + _mm_storeu_si128((__m128i*)dotProductVector, totalc); // Store the results back into the dot product vector for (i = 0; i < 8; ++i) { @@ -192,13 +193,13 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse2(lv_8sc_t* result, con static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse4_1(lv_8sc_t* result, const lv_8sc_t* in_a, const lv_8sc_t* in_b, unsigned int num_points) { lv_8sc_t dotProduct; - memset(&dotProduct, 0x0, 2*sizeof(char)); + memset(&dotProduct, 0x0, 2 * sizeof(char)); unsigned int number; unsigned int i; const lv_8sc_t* a = in_a; const lv_8sc_t* b = in_b; - const unsigned int sse_iters = num_points/8; + const unsigned int sse_iters = num_points / 8; if (sse_iters > 0) { @@ -208,7 +209,7 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse4_1(lv_8sc_t* result, c realcacc = _mm_setzero_si128(); imagcacc = _mm_setzero_si128(); - for(number = 0; number < sse_iters; number++) + for (number = 0; number < sse_iters; number++) { x = _mm_lddqu_si128((__m128i*)a); y = _mm_lddqu_si128((__m128i*)b); @@ -236,13 +237,14 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse4_1(lv_8sc_t* result, c b += 8; } - imagcacc = _mm_slli_si128 (imagcacc, 1); + imagcacc = _mm_slli_si128(imagcacc, 1); - totalc = _mm_blendv_epi8 (imagcacc, realcacc, mult1); + totalc = _mm_blendv_epi8(imagcacc, realcacc, mult1); - __VOLK_ATTR_ALIGNED(16) lv_8sc_t dotProductVector[8]; + __VOLK_ATTR_ALIGNED(16) + lv_8sc_t dotProductVector[8]; - _mm_storeu_si128((__m128i*)dotProductVector, totalc); // Store the results back into the dot product vector + _mm_storeu_si128((__m128i*)dotProductVector, totalc); // Store the results back into the dot product vector for (i = 0; i < 8; ++i) { @@ -267,13 +269,13 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse4_1(lv_8sc_t* result, c static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_sse2(lv_8sc_t* result, const lv_8sc_t* in_a, const lv_8sc_t* in_b, unsigned int num_points) { lv_8sc_t dotProduct; - memset(&dotProduct, 0x0, 2*sizeof(char)); + memset(&dotProduct, 0x0, 2 * sizeof(char)); unsigned int number; unsigned int i; const lv_8sc_t* a = in_a; const lv_8sc_t* b = in_b; - const unsigned int sse_iters = num_points/8; + const unsigned int sse_iters = num_points / 8; if (sse_iters > 0) { @@ -283,7 +285,7 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_sse2(lv_8sc_t* result, con realcacc = _mm_setzero_si128(); imagcacc = _mm_setzero_si128(); - for(number = 0; number < sse_iters; number++) + for (number = 0; number < sse_iters; number++) { x = _mm_load_si128((__m128i*)a); y = _mm_load_si128((__m128i*)b); @@ -317,9 +319,10 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_sse2(lv_8sc_t* result, con totalc = _mm_or_si128(realcacc, imagcacc); - __VOLK_ATTR_ALIGNED(16) lv_8sc_t dotProductVector[8]; + __VOLK_ATTR_ALIGNED(16) + lv_8sc_t dotProductVector[8]; - 
_mm_store_si128((__m128i*)dotProductVector, totalc); // Store the results back into the dot product vector + _mm_store_si128((__m128i*)dotProductVector, totalc); // Store the results back into the dot product vector for (i = 0; i < 8; ++i) { @@ -343,7 +346,7 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_sse2(lv_8sc_t* result, con static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_sse4_1(lv_8sc_t* result, const lv_8sc_t* in_a, const lv_8sc_t* in_b, unsigned int num_points) { lv_8sc_t dotProduct; - memset(&dotProduct, 0x0, 2*sizeof(char)); + memset(&dotProduct, 0x0, 2 * sizeof(char)); unsigned int number; unsigned int i; const lv_8sc_t* a = in_a; @@ -359,7 +362,7 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_sse4_1(lv_8sc_t* result, c realcacc = _mm_setzero_si128(); imagcacc = _mm_setzero_si128(); - for(number = 0; number < sse_iters; number++) + for (number = 0; number < sse_iters; number++) { x = _mm_load_si128((__m128i*)a); y = _mm_load_si128((__m128i*)b); @@ -387,13 +390,14 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_sse4_1(lv_8sc_t* result, c b += 8; } - imagcacc = _mm_slli_si128 (imagcacc, 1); + imagcacc = _mm_slli_si128(imagcacc, 1); - totalc = _mm_blendv_epi8 (imagcacc, realcacc, mult1); + totalc = _mm_blendv_epi8(imagcacc, realcacc, mult1); - __VOLK_ATTR_ALIGNED(16) lv_8sc_t dotProductVector[8]; + __VOLK_ATTR_ALIGNED(16) + lv_8sc_t dotProductVector[8]; - _mm_store_si128((__m128i*)dotProductVector, totalc); // Store the results back into the dot product vector + _mm_store_si128((__m128i*)dotProductVector, totalc); // Store the results back into the dot product vector for (i = 0; i < 8; ++i) { @@ -438,22 +442,23 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_orc(lv_8sc_t* result, cons static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_neon(lv_8sc_t* result, const lv_8sc_t* in_a, const lv_8sc_t* in_b, unsigned int num_points) { lv_8sc_t dotProduct; - dotProduct = lv_cmake(0,0); - *result = lv_cmake(0,0); + dotProduct = lv_cmake(0, 0); + *result = lv_cmake(0, 0); const lv_8sc_t* a = in_a; const lv_8sc_t* b = in_b; // for 2-lane vectors, 1st lane holds the real part, // 2nd lane holds the imaginary part int8x8x2_t a_val, b_val, c_val, accumulator, tmp_real, tmp_imag; - __VOLK_ATTR_ALIGNED(16) lv_8sc_t accum_result[8] = { lv_cmake(0,0) }; + __VOLK_ATTR_ALIGNED(16) + lv_8sc_t accum_result[8] = {lv_cmake(0, 0)}; accumulator.val[0] = vdup_n_s8(0); accumulator.val[1] = vdup_n_s8(0); unsigned int number; const unsigned int neon_iters = num_points / 8; - for(number = 0; number < neon_iters; ++number) + for (number = 0; number < neon_iters; ++number) { a_val = vld2_s8((const int8_t*)a); b_val = vld2_s8((const int8_t*)b); @@ -478,7 +483,7 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_neon(lv_8sc_t* result, const b += 8; } vst2_s8((int8_t*)accum_result, accumulator); - for(number = 0; number < 8; ++number) + for (number = 0; number < 8; ++number) { *result += accum_result[number]; } @@ -490,6 +495,6 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_neon(lv_8sc_t* result, const *result += dotProduct; } -#endif /* LV_HAVE_NEON */ +#endif /* LV_HAVE_NEON */ #endif /*INCLUDED_volk_gnsssdr_8ic_x2_dot_prod_8ic_H*/ diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_multiply_8ic.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_multiply_8ic.h index 1b3fd5532..0d8c1d6b3 100644 --- 
a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_multiply_8ic.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_multiply_8ic.h @@ -75,7 +75,7 @@ static inline void volk_gnsssdr_8ic_x2_multiply_8ic_u_sse2(lv_8sc_t* cVector, co mult1 = _mm_set_epi8(0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF); - for(number = 0; number < sse_iters; number++) + for (number = 0; number < sse_iters; number++) { x = _mm_loadu_si128((__m128i*)a); y = _mm_loadu_si128((__m128i*)b); @@ -133,7 +133,7 @@ static inline void volk_gnsssdr_8ic_x2_multiply_8ic_u_sse4_1(lv_8sc_t* cVector, _mm_setzero_si128(); mult1 = _mm_set_epi8(0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF); - for(number = 0; number < sse_iters; number++) + for (number = 0; number < sse_iters; number++) { x = _mm_lddqu_si128((__m128i*)a); y = _mm_lddqu_si128((__m128i*)b); @@ -181,7 +181,7 @@ static inline void volk_gnsssdr_8ic_x2_multiply_8ic_generic(lv_8sc_t* cVector, c const lv_8sc_t* bPtr = bVector; unsigned int number; - for(number = 0; number < num_points; number++) + for (number = 0; number < num_points; number++) { *cPtr++ = (*aPtr++) * (*bPtr++); } @@ -204,7 +204,7 @@ static inline void volk_gnsssdr_8ic_x2_multiply_8ic_a_sse2(lv_8sc_t* cVector, co mult1 = _mm_set_epi8(0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF); - for(number = 0; number < sse_iters; number++) + for (number = 0; number < sse_iters; number++) { x = _mm_load_si128((__m128i*)a); y = _mm_load_si128((__m128i*)b); @@ -228,7 +228,7 @@ static inline void volk_gnsssdr_8ic_x2_multiply_8ic_a_sse2(lv_8sc_t* cVector, co imagc = _mm_and_si128(imagc, mult1); imagc = _mm_slli_si128(imagc, 1); - totalc = _mm_or_si128 (realc, imagc); + totalc = _mm_or_si128(realc, imagc); _mm_store_si128((__m128i*)c, totalc); @@ -262,7 +262,7 @@ static inline void volk_gnsssdr_8ic_x2_multiply_8ic_a_sse4_1(lv_8sc_t* cVector, _mm_setzero_si128(); mult1 = _mm_set_epi8(0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF); - for(number = 0; number < sse_iters; number++) + for (number = 0; number < sse_iters; number++) { x = _mm_load_si128((__m128i*)a); y = _mm_load_si128((__m128i*)b); diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8u_x2_multiply_8u.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8u_x2_multiply_8u.h index 8457b7f14..e953954f0 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8u_x2_multiply_8u.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8u_x2_multiply_8u.h @@ -72,7 +72,7 @@ static inline void volk_gnsssdr_8u_x2_multiply_8u_u_avx2(unsigned char* cChar, c const unsigned char* a = aChar; const unsigned char* b = bChar; - for(number = 0; number < avx2_iters; number++) + for (number = 0; number < avx2_iters; number++) { x = _mm256_loadu_si256((__m256i*)a); y = _mm256_loadu_si256((__m256i*)b); @@ -101,7 +101,7 @@ static inline void volk_gnsssdr_8u_x2_multiply_8u_u_avx2(unsigned char* cChar, c c += 32; } - for (i = avx2_iters * 32; i < num_points ; ++i) + for (i = avx2_iters * 32; i < num_points; ++i) { *c++ = (*a++) * (*b++); } @@ -123,7 +123,7 @@ static inline void volk_gnsssdr_8u_x2_multiply_8u_u_sse3(unsigned char* cChar, c const unsigned char* a = aChar; const unsigned char* b = bChar; - for(number = 0; number < 
sse_iters; number++) + for (number = 0; number < sse_iters; number++) { x = _mm_lddqu_si128((__m128i*)a); y = _mm_lddqu_si128((__m128i*)b); @@ -152,7 +152,7 @@ static inline void volk_gnsssdr_8u_x2_multiply_8u_u_sse3(unsigned char* cChar, c c += 16; } - for (i = sse_iters * 16; i < num_points ; ++i) + for (i = sse_iters * 16; i < num_points; ++i) { *c++ = (*a++) * (*b++); } @@ -168,7 +168,7 @@ static inline void volk_gnsssdr_8u_x2_multiply_8u_generic(unsigned char* cChar, const unsigned char* bPtr = bChar; unsigned int number; - for(number = 0; number < num_points; number++) + for (number = 0; number < num_points; number++) { *cPtr++ = (*aPtr++) * (*bPtr++); } @@ -189,7 +189,7 @@ static inline void volk_gnsssdr_8u_x2_multiply_8u_a_sse3(unsigned char* cChar, c const unsigned char* a = aChar; const unsigned char* b = bChar; - for(number = 0; number < sse_iters; number++) + for (number = 0; number < sse_iters; number++) { x = _mm_load_si128((__m128i*)a); y = _mm_load_si128((__m128i*)b); @@ -240,7 +240,7 @@ static inline void volk_gnsssdr_8u_x2_multiply_8u_a_avx2(unsigned char* cChar, c const unsigned char* a = aChar; const unsigned char* b = bChar; - for(number = 0; number < avx2_iters; number++) + for (number = 0; number < avx2_iters; number++) { x = _mm256_load_si256((__m256i*)a); y = _mm256_load_si256((__m256i*)b); @@ -269,7 +269,7 @@ static inline void volk_gnsssdr_8u_x2_multiply_8u_a_avx2(unsigned char* cChar, c c += 32; } - for (i = avx2_iters * 32; i < num_points ; ++i) + for (i = avx2_iters * 32; i < num_points; ++i) { *c++ = (*a++) * (*b++); } diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_s32f_sincos_32fc.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_s32f_sincos_32fc.h index e7e1153e3..d6d58e4d0 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_s32f_sincos_32fc.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_s32f_sincos_32fc.h @@ -71,9 +71,9 @@ #include /* Adapted from http://gruntthepeon.free.fr/ssemath/sse_mathfun.h, original code from Julien Pommier */ /* Based on algorithms from the cephes library http://www.netlib.org/cephes/ */ -static inline void volk_gnsssdr_s32f_sincos_32fc_a_sse2(lv_32fc_t* out, const float phase_inc, float* phase, unsigned int num_points) +static inline void volk_gnsssdr_s32f_sincos_32fc_a_sse2(lv_32fc_t *out, const float phase_inc, float *phase, unsigned int num_points) { - lv_32fc_t* bPtr = out; + lv_32fc_t *bPtr = out; const unsigned int sse_iters = num_points / 4; unsigned int number = 0; @@ -84,44 +84,44 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_a_sse2(lv_32fc_t* out, const fl __m128i emm0, emm2, emm4; /* declare some SSE constants */ - static const int _ps_inv_sign_mask[4] = { ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000 }; - static const int _ps_sign_mask[4] = { (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000 }; + static const int _ps_inv_sign_mask[4] = {~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000}; + static const int _ps_sign_mask[4] = {(int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000}; - static const float _ps_cephes_FOPI[4] = { 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516 }; - static const int _pi32_1[4] = { 1, 1, 1, 1 }; - static const int _pi32_inv1[4] = { ~1, ~1, ~1, ~1 }; - static const int _pi32_2[4] = { 2, 2, 2, 2}; - static const int _pi32_4[4] = { 4, 4, 4, 4}; + 
static const float _ps_cephes_FOPI[4] = {1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516}; + static const int _pi32_1[4] = {1, 1, 1, 1}; + static const int _pi32_inv1[4] = {~1, ~1, ~1, ~1}; + static const int _pi32_2[4] = {2, 2, 2, 2}; + static const int _pi32_4[4] = {4, 4, 4, 4}; - static const float _ps_minus_cephes_DP1[4] = { -0.78515625, -0.78515625, -0.78515625, -0.78515625 }; - static const float _ps_minus_cephes_DP2[4] = { -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4 }; - static const float _ps_minus_cephes_DP3[4] = { -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8 }; - static const float _ps_coscof_p0[4] = { 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005 }; - static const float _ps_coscof_p1[4] = { -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003 }; - static const float _ps_coscof_p2[4] = { 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002 }; - static const float _ps_sincof_p0[4] = { -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4 }; - static const float _ps_sincof_p1[4] = { 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3 }; - static const float _ps_sincof_p2[4] = { -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1 }; - static const float _ps_0p5[4] = { 0.5f, 0.5f, 0.5f, 0.5f }; - static const float _ps_1[4] = { 1.0f, 1.0f, 1.0f, 1.0f }; + static const float _ps_minus_cephes_DP1[4] = {-0.78515625, -0.78515625, -0.78515625, -0.78515625}; + static const float _ps_minus_cephes_DP2[4] = {-2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4}; + static const float _ps_minus_cephes_DP3[4] = {-3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8}; + static const float _ps_coscof_p0[4] = {2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005}; + static const float _ps_coscof_p1[4] = {-1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003}; + static const float _ps_coscof_p2[4] = {4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002}; + static const float _ps_sincof_p0[4] = {-1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4}; + static const float _ps_sincof_p1[4] = {8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3}; + static const float _ps_sincof_p2[4] = {-1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1}; + static const float _ps_0p5[4] = {0.5f, 0.5f, 0.5f, 0.5f}; + static const float _ps_1[4] = {1.0f, 1.0f, 1.0f, 1.0f}; - float four_phases[4] = { _phase, _phase + phase_inc, _phase + 2 * phase_inc, _phase + 3 * phase_inc }; - float four_phases_inc[4] = { 4 * phase_inc, 4 * phase_inc, 4 * phase_inc, 4 * phase_inc }; + float four_phases[4] = {_phase, _phase + phase_inc, _phase + 2 * phase_inc, _phase + 3 * phase_inc}; + float four_phases_inc[4] = {4 * phase_inc, 4 * phase_inc, 4 * phase_inc, 4 * phase_inc}; four_phases_reg = _mm_load_ps(four_phases); const __m128 four_phases_inc_reg = _mm_load_ps(four_phases_inc); - for(;number < sse_iters; number++) + for (; number < sse_iters; number++) { x = four_phases_reg; sign_bit_sin = x; /* take the absolute value */ - x = 
_mm_and_ps(x, *(__m128*)_ps_inv_sign_mask); + x = _mm_and_ps(x, *(__m128 *)_ps_inv_sign_mask); /* extract the sign bit (upper one) */ - sign_bit_sin = _mm_and_ps(sign_bit_sin, *(__m128*)_ps_sign_mask); + sign_bit_sin = _mm_and_ps(sign_bit_sin, *(__m128 *)_ps_sign_mask); /* scale by 4/Pi */ - y = _mm_mul_ps(x, *(__m128*)_ps_cephes_FOPI); + y = _mm_mul_ps(x, *(__m128 *)_ps_cephes_FOPI); /* store the integer part of y in emm2 */ emm2 = _mm_cvttps_epi32(y); @@ -145,9 +145,9 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_a_sse2(lv_32fc_t* out, const fl /* The magic pass: "Extended precision modular arithmetic” x = ((x - y * DP1) - y * DP2) - y * DP3; */ - xmm1 = *(__m128*)_ps_minus_cephes_DP1; - xmm2 = *(__m128*)_ps_minus_cephes_DP2; - xmm3 = *(__m128*)_ps_minus_cephes_DP3; + xmm1 = *(__m128 *)_ps_minus_cephes_DP1; + xmm2 = *(__m128 *)_ps_minus_cephes_DP2; + xmm3 = *(__m128 *)_ps_minus_cephes_DP3; xmm1 = _mm_mul_ps(y, xmm1); xmm2 = _mm_mul_ps(y, xmm2); xmm3 = _mm_mul_ps(y, xmm3); @@ -163,25 +163,25 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_a_sse2(lv_32fc_t* out, const fl sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin); /* Evaluate the first polynom (0 <= x <= Pi/4) */ - __m128 z = _mm_mul_ps(x,x); - y = *(__m128*)_ps_coscof_p0; + __m128 z = _mm_mul_ps(x, x); + y = *(__m128 *)_ps_coscof_p0; y = _mm_mul_ps(y, z); - y = _mm_add_ps(y, *(__m128*)_ps_coscof_p1); + y = _mm_add_ps(y, *(__m128 *)_ps_coscof_p1); y = _mm_mul_ps(y, z); - y = _mm_add_ps(y, *(__m128*)_ps_coscof_p2); + y = _mm_add_ps(y, *(__m128 *)_ps_coscof_p2); y = _mm_mul_ps(y, z); y = _mm_mul_ps(y, z); - __m128 tmp = _mm_mul_ps(z, *(__m128*)_ps_0p5); + __m128 tmp = _mm_mul_ps(z, *(__m128 *)_ps_0p5); y = _mm_sub_ps(y, tmp); - y = _mm_add_ps(y, *(__m128*)_ps_1); + y = _mm_add_ps(y, *(__m128 *)_ps_1); /* Evaluate the second polynom (Pi/4 <= x <= 0) */ - __m128 y2 = *(__m128*)_ps_sincof_p0; + __m128 y2 = *(__m128 *)_ps_sincof_p0; y2 = _mm_mul_ps(y2, z); - y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p1); + y2 = _mm_add_ps(y2, *(__m128 *)_ps_sincof_p1); y2 = _mm_mul_ps(y2, z); - y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p2); + y2 = _mm_add_ps(y2, *(__m128 *)_ps_sincof_p2); y2 = _mm_mul_ps(y2, z); y2 = _mm_mul_ps(y2, x); y2 = _mm_add_ps(y2, x); @@ -190,11 +190,11 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_a_sse2(lv_32fc_t* out, const fl xmm3 = poly_mask; __m128 ysin2 = _mm_and_ps(xmm3, y2); __m128 ysin1 = _mm_andnot_ps(xmm3, y); - y2 = _mm_sub_ps(y2,ysin2); + y2 = _mm_sub_ps(y2, ysin2); y = _mm_sub_ps(y, ysin1); - xmm1 = _mm_add_ps(ysin1,ysin2); - xmm2 = _mm_add_ps(y,y2); + xmm1 = _mm_add_ps(ysin1, ysin2); + xmm2 = _mm_add_ps(y, y2); /* update the sign */ sine = _mm_xor_ps(xmm1, sign_bit_sin); @@ -202,19 +202,19 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_a_sse2(lv_32fc_t* out, const fl /* write the output */ aux = _mm_unpacklo_ps(cosine, sine); - _mm_store_ps((float*)bPtr, aux); + _mm_store_ps((float *)bPtr, aux); bPtr += 2; aux = _mm_unpackhi_ps(cosine, sine); - _mm_store_ps((float*)bPtr, aux); + _mm_store_ps((float *)bPtr, aux); bPtr += 2; four_phases_reg = _mm_add_ps(four_phases_reg, four_phases_inc_reg); } _phase = _phase + phase_inc * (sse_iters * 4); - for(number = sse_iters * 4; number < num_points; number++) + for (number = sse_iters * 4; number < num_points; number++) { - *bPtr++ = lv_cmake((float)cosf((_phase)), (float)sinf((_phase)) ); + *bPtr++ = lv_cmake((float)cosf((_phase)), (float)sinf((_phase))); _phase += phase_inc; } (*phase) = _phase; @@ -227,9 +227,9 @@ static inline void 
volk_gnsssdr_s32f_sincos_32fc_a_sse2(lv_32fc_t* out, const fl #include /* Adapted from http://gruntthepeon.free.fr/ssemath/sse_mathfun.h, original code from Julien Pommier */ /* Based on algorithms from the cephes library http://www.netlib.org/cephes/ */ -static inline void volk_gnsssdr_s32f_sincos_32fc_u_sse2(lv_32fc_t* out, const float phase_inc, float* phase, unsigned int num_points) +static inline void volk_gnsssdr_s32f_sincos_32fc_u_sse2(lv_32fc_t *out, const float phase_inc, float *phase, unsigned int num_points) { - lv_32fc_t* bPtr = out; + lv_32fc_t *bPtr = out; const unsigned int sse_iters = num_points / 4; unsigned int number = 0; @@ -241,44 +241,64 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_u_sse2(lv_32fc_t* out, const fl __m128i emm0, emm2, emm4; /* declare some SSE constants */ - __VOLK_ATTR_ALIGNED(16) static const int _ps_inv_sign_mask[4] = { ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000 }; - __VOLK_ATTR_ALIGNED(16) static const int _ps_sign_mask[4] = { (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000 }; + __VOLK_ATTR_ALIGNED(16) + static const int _ps_inv_sign_mask[4] = {~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000}; + __VOLK_ATTR_ALIGNED(16) + static const int _ps_sign_mask[4] = {(int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000}; - __VOLK_ATTR_ALIGNED(16) static const float _ps_cephes_FOPI[4] = { 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516 }; - __VOLK_ATTR_ALIGNED(16) static const int _pi32_1[4] = { 1, 1, 1, 1 }; - __VOLK_ATTR_ALIGNED(16) static const int _pi32_inv1[4] = { ~1, ~1, ~1, ~1 }; - __VOLK_ATTR_ALIGNED(16) static const int _pi32_2[4] = { 2, 2, 2, 2}; - __VOLK_ATTR_ALIGNED(16) static const int _pi32_4[4] = { 4, 4, 4, 4}; + __VOLK_ATTR_ALIGNED(16) + static const float _ps_cephes_FOPI[4] = {1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516}; + __VOLK_ATTR_ALIGNED(16) + static const int _pi32_1[4] = {1, 1, 1, 1}; + __VOLK_ATTR_ALIGNED(16) + static const int _pi32_inv1[4] = {~1, ~1, ~1, ~1}; + __VOLK_ATTR_ALIGNED(16) + static const int _pi32_2[4] = {2, 2, 2, 2}; + __VOLK_ATTR_ALIGNED(16) + static const int _pi32_4[4] = {4, 4, 4, 4}; - __VOLK_ATTR_ALIGNED(16) static const float _ps_minus_cephes_DP1[4] = { -0.78515625, -0.78515625, -0.78515625, -0.78515625 }; - __VOLK_ATTR_ALIGNED(16) static const float _ps_minus_cephes_DP2[4] = { -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4 }; - __VOLK_ATTR_ALIGNED(16) static const float _ps_minus_cephes_DP3[4] = { -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8 }; - __VOLK_ATTR_ALIGNED(16) static const float _ps_coscof_p0[4] = { 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005 }; - __VOLK_ATTR_ALIGNED(16) static const float _ps_coscof_p1[4] = { -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003 }; - __VOLK_ATTR_ALIGNED(16) static const float _ps_coscof_p2[4] = { 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002 }; - __VOLK_ATTR_ALIGNED(16) static const float _ps_sincof_p0[4] = { -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4 }; - __VOLK_ATTR_ALIGNED(16) static const float _ps_sincof_p1[4] = { 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3 }; - __VOLK_ATTR_ALIGNED(16) static const float _ps_sincof_p2[4] = { -1.6666654611E-1, -1.6666654611E-1, 
-1.6666654611E-1, -1.6666654611E-1 }; - __VOLK_ATTR_ALIGNED(16) static const float _ps_0p5[4] = { 0.5f, 0.5f, 0.5f, 0.5f }; - __VOLK_ATTR_ALIGNED(16) static const float _ps_1[4] = { 1.0f, 1.0f, 1.0f, 1.0f }; + __VOLK_ATTR_ALIGNED(16) + static const float _ps_minus_cephes_DP1[4] = {-0.78515625, -0.78515625, -0.78515625, -0.78515625}; + __VOLK_ATTR_ALIGNED(16) + static const float _ps_minus_cephes_DP2[4] = {-2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4}; + __VOLK_ATTR_ALIGNED(16) + static const float _ps_minus_cephes_DP3[4] = {-3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8}; + __VOLK_ATTR_ALIGNED(16) + static const float _ps_coscof_p0[4] = {2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005}; + __VOLK_ATTR_ALIGNED(16) + static const float _ps_coscof_p1[4] = {-1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003}; + __VOLK_ATTR_ALIGNED(16) + static const float _ps_coscof_p2[4] = {4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002}; + __VOLK_ATTR_ALIGNED(16) + static const float _ps_sincof_p0[4] = {-1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4}; + __VOLK_ATTR_ALIGNED(16) + static const float _ps_sincof_p1[4] = {8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3}; + __VOLK_ATTR_ALIGNED(16) + static const float _ps_sincof_p2[4] = {-1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1}; + __VOLK_ATTR_ALIGNED(16) + static const float _ps_0p5[4] = {0.5f, 0.5f, 0.5f, 0.5f}; + __VOLK_ATTR_ALIGNED(16) + static const float _ps_1[4] = {1.0f, 1.0f, 1.0f, 1.0f}; - __VOLK_ATTR_ALIGNED(16) float four_phases[4] = { _phase, _phase + phase_inc, _phase + 2 * phase_inc, _phase + 3 * phase_inc }; - __VOLK_ATTR_ALIGNED(16) float four_phases_inc[4] = { 4 * phase_inc, 4 * phase_inc, 4 * phase_inc, 4 * phase_inc }; + __VOLK_ATTR_ALIGNED(16) + float four_phases[4] = {_phase, _phase + phase_inc, _phase + 2 * phase_inc, _phase + 3 * phase_inc}; + __VOLK_ATTR_ALIGNED(16) + float four_phases_inc[4] = {4 * phase_inc, 4 * phase_inc, 4 * phase_inc, 4 * phase_inc}; four_phases_reg = _mm_load_ps(four_phases); const __m128 four_phases_inc_reg = _mm_load_ps(four_phases_inc); - for(;number < sse_iters; number++) + for (; number < sse_iters; number++) { x = four_phases_reg; sign_bit_sin = x; /* take the absolute value */ - x = _mm_and_ps(x, *(__m128*)_ps_inv_sign_mask); + x = _mm_and_ps(x, *(__m128 *)_ps_inv_sign_mask); /* extract the sign bit (upper one) */ - sign_bit_sin = _mm_and_ps(sign_bit_sin, *(__m128*)_ps_sign_mask); + sign_bit_sin = _mm_and_ps(sign_bit_sin, *(__m128 *)_ps_sign_mask); /* scale by 4/Pi */ - y = _mm_mul_ps(x, *(__m128*)_ps_cephes_FOPI); + y = _mm_mul_ps(x, *(__m128 *)_ps_cephes_FOPI); /* store the integer part of y in emm2 */ emm2 = _mm_cvttps_epi32(y); @@ -302,9 +322,9 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_u_sse2(lv_32fc_t* out, const fl /* The magic pass: "Extended precision modular arithmetic” x = ((x - y * DP1) - y * DP2) - y * DP3; */ - xmm1 = *(__m128*)_ps_minus_cephes_DP1; - xmm2 = *(__m128*)_ps_minus_cephes_DP2; - xmm3 = *(__m128*)_ps_minus_cephes_DP3; + xmm1 = *(__m128 *)_ps_minus_cephes_DP1; + xmm2 = *(__m128 *)_ps_minus_cephes_DP2; + xmm3 = *(__m128 *)_ps_minus_cephes_DP3; xmm1 = _mm_mul_ps(y, xmm1); xmm2 = _mm_mul_ps(y, xmm2); xmm3 = _mm_mul_ps(y, xmm3); @@ -320,25 +340,25 @@ static 
inline void volk_gnsssdr_s32f_sincos_32fc_u_sse2(lv_32fc_t* out, const fl sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin); /* Evaluate the first polynom (0 <= x <= Pi/4) */ - __m128 z = _mm_mul_ps(x,x); - y = *(__m128*)_ps_coscof_p0; + __m128 z = _mm_mul_ps(x, x); + y = *(__m128 *)_ps_coscof_p0; y = _mm_mul_ps(y, z); - y = _mm_add_ps(y, *(__m128*)_ps_coscof_p1); + y = _mm_add_ps(y, *(__m128 *)_ps_coscof_p1); y = _mm_mul_ps(y, z); - y = _mm_add_ps(y, *(__m128*)_ps_coscof_p2); + y = _mm_add_ps(y, *(__m128 *)_ps_coscof_p2); y = _mm_mul_ps(y, z); y = _mm_mul_ps(y, z); - __m128 tmp = _mm_mul_ps(z, *(__m128*)_ps_0p5); + __m128 tmp = _mm_mul_ps(z, *(__m128 *)_ps_0p5); y = _mm_sub_ps(y, tmp); - y = _mm_add_ps(y, *(__m128*)_ps_1); + y = _mm_add_ps(y, *(__m128 *)_ps_1); /* Evaluate the second polynom (Pi/4 <= x <= 0) */ - __m128 y2 = *(__m128*)_ps_sincof_p0; + __m128 y2 = *(__m128 *)_ps_sincof_p0; y2 = _mm_mul_ps(y2, z); - y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p1); + y2 = _mm_add_ps(y2, *(__m128 *)_ps_sincof_p1); y2 = _mm_mul_ps(y2, z); - y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p2); + y2 = _mm_add_ps(y2, *(__m128 *)_ps_sincof_p2); y2 = _mm_mul_ps(y2, z); y2 = _mm_mul_ps(y2, x); y2 = _mm_add_ps(y2, x); @@ -347,11 +367,11 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_u_sse2(lv_32fc_t* out, const fl xmm3 = poly_mask; __m128 ysin2 = _mm_and_ps(xmm3, y2); __m128 ysin1 = _mm_andnot_ps(xmm3, y); - y2 = _mm_sub_ps(y2,ysin2); + y2 = _mm_sub_ps(y2, ysin2); y = _mm_sub_ps(y, ysin1); - xmm1 = _mm_add_ps(ysin1,ysin2); - xmm2 = _mm_add_ps(y,y2); + xmm1 = _mm_add_ps(ysin1, ysin2); + xmm2 = _mm_add_ps(y, y2); /* update the sign */ sine = _mm_xor_ps(xmm1, sign_bit_sin); @@ -359,19 +379,19 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_u_sse2(lv_32fc_t* out, const fl /* write the output */ aux = _mm_unpacklo_ps(cosine, sine); - _mm_storeu_ps((float*)bPtr, aux); + _mm_storeu_ps((float *)bPtr, aux); bPtr += 2; aux = _mm_unpackhi_ps(cosine, sine); - _mm_storeu_ps((float*)bPtr, aux); + _mm_storeu_ps((float *)bPtr, aux); bPtr += 2; four_phases_reg = _mm_add_ps(four_phases_reg, four_phases_inc_reg); } _phase = _phase + phase_inc * (sse_iters * 4); - for(number = sse_iters * 4; number < num_points; number++) + for (number = sse_iters * 4; number < num_points; number++) { - *bPtr++ = lv_cmake((float)cosf(_phase), (float)sinf(_phase) ); + *bPtr++ = lv_cmake((float)cosf(_phase), (float)sinf(_phase)); _phase += phase_inc; } (*phase) = _phase; @@ -382,13 +402,13 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_u_sse2(lv_32fc_t* out, const fl #ifdef LV_HAVE_GENERIC -static inline void volk_gnsssdr_s32f_sincos_32fc_generic(lv_32fc_t* out, const float phase_inc, float* phase, unsigned int num_points) +static inline void volk_gnsssdr_s32f_sincos_32fc_generic(lv_32fc_t *out, const float phase_inc, float *phase, unsigned int num_points) { float _phase = (*phase); unsigned int i; - for(i = 0; i < num_points; i++) + for (i = 0; i < num_points; i++) { - *out++ = lv_cmake((float)cosf(_phase), (float)sinf(_phase) ); + *out++ = lv_cmake((float)cosf(_phase), (float)sinf(_phase)); _phase += phase_inc; } (*phase) = _phase; @@ -400,7 +420,7 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_generic(lv_32fc_t* out, const f #ifdef LV_HAVE_GENERIC #include #include -static inline void volk_gnsssdr_s32f_sincos_32fc_generic_fxpt(lv_32fc_t* out, const float phase_inc, float* phase, unsigned int num_points) +static inline void volk_gnsssdr_s32f_sincos_32fc_generic_fxpt(lv_32fc_t *out, const float phase_inc, float *phase, 
unsigned int num_points) { float _in, s, c; unsigned int i; @@ -413,12 +433,12 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_generic_fxpt(lv_32fc_t* out, co const int32_t diffbits = bitlength - Nbits; uint32_t ux; float _phase = (*phase); - for(i = 0; i < num_points; i++) + for (i = 0; i < num_points; i++) { _in = _phase; d = (int32_t)floor(_in / TWO_PI + 0.5); _in -= d * TWO_PI; - x = (int32_t) ((float)_in * TWO_TO_THE_31_DIV_PI); + x = (int32_t)((float)_in * TWO_TO_THE_31_DIV_PI); ux = x; sin_index = ux >> diffbits; @@ -428,7 +448,7 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_generic_fxpt(lv_32fc_t* out, co cos_index = ux >> diffbits; c = sine_table_10bits[cos_index][0] * (ux >> 1) + sine_table_10bits[cos_index][1]; - *out++ = lv_cmake((float)c, (float)s ); + *out++ = lv_cmake((float)c, (float)s); _phase += phase_inc; } (*phase) = _phase; @@ -441,9 +461,9 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_generic_fxpt(lv_32fc_t* out, co #include /* Based on algorithms from the cephes library http://www.netlib.org/cephes/ * Adapted to AVX2 by Carles Fernandez, based on original SSE2 code by Julien Pommier*/ -static inline void volk_gnsssdr_s32f_sincos_32fc_a_avx2(lv_32fc_t* out, const float phase_inc, float* phase, unsigned int num_points) +static inline void volk_gnsssdr_s32f_sincos_32fc_a_avx2(lv_32fc_t *out, const float phase_inc, float *phase, unsigned int num_points) { - lv_32fc_t* bPtr = out; + lv_32fc_t *bPtr = out; const unsigned int avx_iters = num_points / 8; unsigned int number = 0; @@ -456,44 +476,64 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_a_avx2(lv_32fc_t* out, const fl __m128 aux, c1, s1; /* declare some AXX2 constants */ - __VOLK_ATTR_ALIGNED(32) static const int _ps_inv_sign_mask[8] = { ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000 }; - __VOLK_ATTR_ALIGNED(32) static const int _ps_sign_mask[8] = { (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000 }; + __VOLK_ATTR_ALIGNED(32) + static const int _ps_inv_sign_mask[8] = {~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000}; + __VOLK_ATTR_ALIGNED(32) + static const int _ps_sign_mask[8] = {(int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000}; - __VOLK_ATTR_ALIGNED(32) static const float _ps_cephes_FOPI[8] = { 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516 }; - __VOLK_ATTR_ALIGNED(32) static const int _pi32_1[8] = { 1, 1, 1, 1, 1, 1, 1, 1 }; - __VOLK_ATTR_ALIGNED(32) static const int _pi32_inv1[8] = { ~1, ~1, ~1, ~1, ~1, ~1, ~1, ~1 }; - __VOLK_ATTR_ALIGNED(32) static const int _pi32_2[8] = { 2, 2, 2, 2, 2, 2, 2, 2 }; - __VOLK_ATTR_ALIGNED(32) static const int _pi32_4[8] = { 4, 4, 4, 4, 4, 4, 4, 4 }; + __VOLK_ATTR_ALIGNED(32) + static const float _ps_cephes_FOPI[8] = {1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516}; + __VOLK_ATTR_ALIGNED(32) + static const int _pi32_1[8] = {1, 1, 1, 1, 1, 1, 1, 1}; + __VOLK_ATTR_ALIGNED(32) + static const int _pi32_inv1[8] = {~1, ~1, ~1, ~1, ~1, ~1, ~1, ~1}; + __VOLK_ATTR_ALIGNED(32) + static const int _pi32_2[8] = {2, 2, 2, 2, 2, 2, 2, 2}; + __VOLK_ATTR_ALIGNED(32) + static const int _pi32_4[8] = {4, 4, 4, 4, 
4, 4, 4, 4}; - __VOLK_ATTR_ALIGNED(32) static const float _ps_minus_cephes_DP1[8] = { -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625 }; - __VOLK_ATTR_ALIGNED(32) static const float _ps_minus_cephes_DP2[8] = { -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4 }; - __VOLK_ATTR_ALIGNED(32) static const float _ps_minus_cephes_DP3[8] = { -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8 }; - __VOLK_ATTR_ALIGNED(32) static const float _ps_coscof_p0[8] = { 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005 }; - __VOLK_ATTR_ALIGNED(32) static const float _ps_coscof_p1[8] = { -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003 }; - __VOLK_ATTR_ALIGNED(32) static const float _ps_coscof_p2[8] = { 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002 }; - __VOLK_ATTR_ALIGNED(32) static const float _ps_sincof_p0[8] = { -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4 }; - __VOLK_ATTR_ALIGNED(32) static const float _ps_sincof_p1[8] = { 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3 }; - __VOLK_ATTR_ALIGNED(32) static const float _ps_sincof_p2[8] = { -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1 }; - __VOLK_ATTR_ALIGNED(32) static const float _ps_0p5[8] = { 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f }; - __VOLK_ATTR_ALIGNED(32) static const float _ps_1[8] = { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f }; + __VOLK_ATTR_ALIGNED(32) + static const float _ps_minus_cephes_DP1[8] = {-0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625}; + __VOLK_ATTR_ALIGNED(32) + static const float _ps_minus_cephes_DP2[8] = {-2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4}; + __VOLK_ATTR_ALIGNED(32) + static const float _ps_minus_cephes_DP3[8] = {-3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8}; + __VOLK_ATTR_ALIGNED(32) + static const float _ps_coscof_p0[8] = {2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005}; + __VOLK_ATTR_ALIGNED(32) + static const float _ps_coscof_p1[8] = {-1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, 
-1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003}; + __VOLK_ATTR_ALIGNED(32) + static const float _ps_coscof_p2[8] = {4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002}; + __VOLK_ATTR_ALIGNED(32) + static const float _ps_sincof_p0[8] = {-1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4}; + __VOLK_ATTR_ALIGNED(32) + static const float _ps_sincof_p1[8] = {8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3}; + __VOLK_ATTR_ALIGNED(32) + static const float _ps_sincof_p2[8] = {-1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1}; + __VOLK_ATTR_ALIGNED(32) + static const float _ps_0p5[8] = {0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f}; + __VOLK_ATTR_ALIGNED(32) + static const float _ps_1[8] = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}; - __VOLK_ATTR_ALIGNED(32) float eight_phases[8] = { _phase, _phase + phase_inc, _phase + 2 * phase_inc, _phase + 3 * phase_inc, _phase + 4 * phase_inc, _phase + 5 * phase_inc, _phase + 6 * phase_inc, _phase + 7 * phase_inc }; - __VOLK_ATTR_ALIGNED(32) float eight_phases_inc[8] = { 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc }; + __VOLK_ATTR_ALIGNED(32) + float eight_phases[8] = {_phase, _phase + phase_inc, _phase + 2 * phase_inc, _phase + 3 * phase_inc, _phase + 4 * phase_inc, _phase + 5 * phase_inc, _phase + 6 * phase_inc, _phase + 7 * phase_inc}; + __VOLK_ATTR_ALIGNED(32) + float eight_phases_inc[8] = {8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc}; eight_phases_reg = _mm256_load_ps(eight_phases); const __m256 eight_phases_inc_reg = _mm256_load_ps(eight_phases_inc); - for(;number < avx_iters; number++) + for (; number < avx_iters; number++) { x = eight_phases_reg; sign_bit_sin = x; /* take the absolute value */ - x = _mm256_and_ps(x, *(__m256*)_ps_inv_sign_mask); + x = _mm256_and_ps(x, *(__m256 *)_ps_inv_sign_mask); /* extract the sign bit (upper one) */ - sign_bit_sin = _mm256_and_ps(sign_bit_sin, *(__m256*)_ps_sign_mask); + sign_bit_sin = _mm256_and_ps(sign_bit_sin, *(__m256 *)_ps_sign_mask); /* scale by 4/Pi */ - y = _mm256_mul_ps(x, *(__m256*)_ps_cephes_FOPI); + y = _mm256_mul_ps(x, *(__m256 *)_ps_cephes_FOPI); /* store the integer part of y in emm2 */ emm2 = _mm256_cvttps_epi32(y); @@ -517,9 +557,9 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_a_avx2(lv_32fc_t* out, const fl /* The magic pass: "Extended precision modular arithmetic” x = ((x - y * DP1) - y * DP2) - y * DP3; */ - xmm1 = *(__m256*)_ps_minus_cephes_DP1; - xmm2 = *(__m256*)_ps_minus_cephes_DP2; - xmm3 = *(__m256*)_ps_minus_cephes_DP3; + xmm1 = *(__m256 *)_ps_minus_cephes_DP1; + xmm2 = *(__m256 *)_ps_minus_cephes_DP2; + xmm3 = *(__m256 *)_ps_minus_cephes_DP3; xmm1 = _mm256_mul_ps(y, xmm1); xmm2 = _mm256_mul_ps(y, xmm2); xmm3 = _mm256_mul_ps(y, xmm3); @@ -536,24 +576,24 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_a_avx2(lv_32fc_t* out, const fl /* Evaluate the first polynom (0 <= x <= Pi/4) */ __m256 z = _mm256_mul_ps(x, x); - y = *(__m256*)_ps_coscof_p0; 
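For reference, the "magic pass" above is a Cody-Waite range reduction: Pi/4 is split into the three constants DP1 + DP2 + DP3 so that subtracting the partial products one at a time loses far less precision than a single x - y*(Pi/4) would. A scalar sketch of what each vector lane computes (illustrative only; the kernel stores the constants negated and uses mul/add, which is mathematically the same):

static inline float cody_waite_reduce(float x, float y /* octant count from x * 4/Pi above */)
{
    const float DP1 = 0.78515625f;               /* leading bits of Pi/4 */
    const float DP2 = 2.4187564849853515625e-4f; /* middle bits          */
    const float DP3 = 3.77489497744594108e-8f;   /* low-order remainder  */
    x -= y * DP1; /* each partial product carries few significant bits,  */
    x -= y * DP2; /* so the running subtraction stays nearly exact       */
    x -= y * DP3;
    return x; /* reduced argument with |x| <= Pi/4, ready for the polynomials */
}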
+ y = *(__m256 *)_ps_coscof_p0; y = _mm256_mul_ps(y, z); - y = _mm256_add_ps(y, *(__m256*)_ps_coscof_p1); + y = _mm256_add_ps(y, *(__m256 *)_ps_coscof_p1); y = _mm256_mul_ps(y, z); - y = _mm256_add_ps(y, *(__m256*)_ps_coscof_p2); + y = _mm256_add_ps(y, *(__m256 *)_ps_coscof_p2); y = _mm256_mul_ps(y, z); y = _mm256_mul_ps(y, z); - __m256 tmp = _mm256_mul_ps(z, *(__m256*)_ps_0p5); + __m256 tmp = _mm256_mul_ps(z, *(__m256 *)_ps_0p5); y = _mm256_sub_ps(y, tmp); - y = _mm256_add_ps(y, *(__m256*)_ps_1); + y = _mm256_add_ps(y, *(__m256 *)_ps_1); /* Evaluate the second polynom (Pi/4 <= x <= 0) */ - __m256 y2 = *(__m256*)_ps_sincof_p0; + __m256 y2 = *(__m256 *)_ps_sincof_p0; y2 = _mm256_mul_ps(y2, z); - y2 = _mm256_add_ps(y2, *(__m256*)_ps_sincof_p1); + y2 = _mm256_add_ps(y2, *(__m256 *)_ps_sincof_p1); y2 = _mm256_mul_ps(y2, z); - y2 = _mm256_add_ps(y2, *(__m256*)_ps_sincof_p2); + y2 = _mm256_add_ps(y2, *(__m256 *)_ps_sincof_p2); y2 = _mm256_mul_ps(y2, z); y2 = _mm256_mul_ps(y2, x); y2 = _mm256_add_ps(y2, x); @@ -576,27 +616,27 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_a_avx2(lv_32fc_t* out, const fl s1 = _mm256_extractf128_ps(sine, 0); c1 = _mm256_extractf128_ps(cosine, 0); aux = _mm_unpacklo_ps(c1, s1); - _mm_store_ps((float*)bPtr, aux); + _mm_store_ps((float *)bPtr, aux); bPtr += 2; aux = _mm_unpackhi_ps(c1, s1); - _mm_store_ps((float*)bPtr, aux); + _mm_store_ps((float *)bPtr, aux); bPtr += 2; s1 = _mm256_extractf128_ps(sine, 1); c1 = _mm256_extractf128_ps(cosine, 1); aux = _mm_unpacklo_ps(c1, s1); - _mm_store_ps((float*)bPtr, aux); + _mm_store_ps((float *)bPtr, aux); bPtr += 2; aux = _mm_unpackhi_ps(c1, s1); - _mm_store_ps((float*)bPtr, aux); + _mm_store_ps((float *)bPtr, aux); bPtr += 2; eight_phases_reg = _mm256_add_ps(eight_phases_reg, eight_phases_inc_reg); } _mm256_zeroupper(); _phase = _phase + phase_inc * (avx_iters * 8); - for(number = avx_iters * 8; number < num_points; number++) + for (number = avx_iters * 8; number < num_points; number++) { - out[number] = lv_cmake((float)cosf(_phase), (float)sinf(_phase) ); + out[number] = lv_cmake((float)cosf(_phase), (float)sinf(_phase)); _phase += phase_inc; } (*phase) = _phase; @@ -609,9 +649,9 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_a_avx2(lv_32fc_t* out, const fl #include /* Based on algorithms from the cephes library http://www.netlib.org/cephes/ * Adapted to AVX2 by Carles Fernandez, based on original SSE2 code by Julien Pommier*/ -static inline void volk_gnsssdr_s32f_sincos_32fc_u_avx2(lv_32fc_t* out, const float phase_inc, float* phase, unsigned int num_points) +static inline void volk_gnsssdr_s32f_sincos_32fc_u_avx2(lv_32fc_t *out, const float phase_inc, float *phase, unsigned int num_points) { - lv_32fc_t* bPtr = out; + lv_32fc_t *bPtr = out; const unsigned int avx_iters = num_points / 8; unsigned int number = 0; @@ -624,44 +664,64 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_u_avx2(lv_32fc_t* out, const fl __m128 aux, c1, s1; /* declare some AXX2 constants */ - __VOLK_ATTR_ALIGNED(32) static const int _ps_inv_sign_mask[8] = { ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000 }; - __VOLK_ATTR_ALIGNED(32) static const int _ps_sign_mask[8] = { (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000 }; + __VOLK_ATTR_ALIGNED(32) + static const int _ps_inv_sign_mask[8] = {~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000}; + 
__VOLK_ATTR_ALIGNED(32) + static const int _ps_sign_mask[8] = {(int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000}; - __VOLK_ATTR_ALIGNED(32) static const float _ps_cephes_FOPI[8] = { 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516 }; - __VOLK_ATTR_ALIGNED(32) static const int _pi32_1[8] = { 1, 1, 1, 1, 1, 1, 1, 1 }; - __VOLK_ATTR_ALIGNED(32) static const int _pi32_inv1[8] = { ~1, ~1, ~1, ~1, ~1, ~1, ~1, ~1 }; - __VOLK_ATTR_ALIGNED(32) static const int _pi32_2[8] = { 2, 2, 2, 2, 2, 2, 2, 2 }; - __VOLK_ATTR_ALIGNED(32) static const int _pi32_4[8] = { 4, 4, 4, 4, 4, 4, 4, 4 }; + __VOLK_ATTR_ALIGNED(32) + static const float _ps_cephes_FOPI[8] = {1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516}; + __VOLK_ATTR_ALIGNED(32) + static const int _pi32_1[8] = {1, 1, 1, 1, 1, 1, 1, 1}; + __VOLK_ATTR_ALIGNED(32) + static const int _pi32_inv1[8] = {~1, ~1, ~1, ~1, ~1, ~1, ~1, ~1}; + __VOLK_ATTR_ALIGNED(32) + static const int _pi32_2[8] = {2, 2, 2, 2, 2, 2, 2, 2}; + __VOLK_ATTR_ALIGNED(32) + static const int _pi32_4[8] = {4, 4, 4, 4, 4, 4, 4, 4}; - __VOLK_ATTR_ALIGNED(32) static const float _ps_minus_cephes_DP1[8] = { -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625 }; - __VOLK_ATTR_ALIGNED(32) static const float _ps_minus_cephes_DP2[8] = { -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4 }; - __VOLK_ATTR_ALIGNED(32) static const float _ps_minus_cephes_DP3[8] = { -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8 }; - __VOLK_ATTR_ALIGNED(32) static const float _ps_coscof_p0[8] = { 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005 }; - __VOLK_ATTR_ALIGNED(32) static const float _ps_coscof_p1[8] = { -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003 }; - __VOLK_ATTR_ALIGNED(32) static const float _ps_coscof_p2[8] = { 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002 }; - __VOLK_ATTR_ALIGNED(32) static const float _ps_sincof_p0[8] = { -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4 }; - __VOLK_ATTR_ALIGNED(32) static const float _ps_sincof_p1[8] = { 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3 }; - __VOLK_ATTR_ALIGNED(32) static const float _ps_sincof_p2[8] = { -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1 }; - __VOLK_ATTR_ALIGNED(32) static const float _ps_0p5[8] = { 0.5f, 0.5f, 
0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f }; - __VOLK_ATTR_ALIGNED(32) static const float _ps_1[8] = { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f }; + __VOLK_ATTR_ALIGNED(32) + static const float _ps_minus_cephes_DP1[8] = {-0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625}; + __VOLK_ATTR_ALIGNED(32) + static const float _ps_minus_cephes_DP2[8] = {-2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4}; + __VOLK_ATTR_ALIGNED(32) + static const float _ps_minus_cephes_DP3[8] = {-3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8}; + __VOLK_ATTR_ALIGNED(32) + static const float _ps_coscof_p0[8] = {2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005}; + __VOLK_ATTR_ALIGNED(32) + static const float _ps_coscof_p1[8] = {-1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003}; + __VOLK_ATTR_ALIGNED(32) + static const float _ps_coscof_p2[8] = {4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002}; + __VOLK_ATTR_ALIGNED(32) + static const float _ps_sincof_p0[8] = {-1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4}; + __VOLK_ATTR_ALIGNED(32) + static const float _ps_sincof_p1[8] = {8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3}; + __VOLK_ATTR_ALIGNED(32) + static const float _ps_sincof_p2[8] = {-1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1}; + __VOLK_ATTR_ALIGNED(32) + static const float _ps_0p5[8] = {0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f}; + __VOLK_ATTR_ALIGNED(32) + static const float _ps_1[8] = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}; - __VOLK_ATTR_ALIGNED(32) float eight_phases[8] = { _phase, _phase + phase_inc, _phase + 2 * phase_inc, _phase + 3 * phase_inc, _phase + 4 * phase_inc, _phase + 5 * phase_inc, _phase + 6 * phase_inc, _phase + 7 * phase_inc }; - __VOLK_ATTR_ALIGNED(32) float eight_phases_inc[8] = { 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc }; + __VOLK_ATTR_ALIGNED(32) + float eight_phases[8] = {_phase, _phase + phase_inc, _phase + 2 * phase_inc, _phase + 3 * phase_inc, _phase + 4 * phase_inc, _phase + 5 * phase_inc, _phase + 6 * phase_inc, _phase + 7 * phase_inc}; + __VOLK_ATTR_ALIGNED(32) + float eight_phases_inc[8] = {8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc}; eight_phases_reg = _mm256_load_ps(eight_phases); const __m256 eight_phases_inc_reg = _mm256_load_ps(eight_phases_inc); - for(;number < avx_iters; number++) + for (; number < avx_iters; number++) { x = eight_phases_reg; 
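Once the argument is reduced, each lane evaluates the two cephes minimax polynomials, and the surrounding mask logic selects per lane which result is the sine and which is the cosine of the original angle. A scalar sketch of that core, assuming |x| <= Pi/4 (octant selection and sign restoration happen in the vector code around it):

static inline void cephes_sincos_core(float x, float *s, float *c)
{
    const float z = x * x;
    /* cosine polynomial: ((p0*z + p1)*z + p2)*z*z - z/2 + 1 */
    float yc = 2.443315711809948e-5f;
    yc = yc * z - 1.388731625493765e-3f;
    yc = yc * z + 4.166664568298827e-2f;
    yc = yc * z * z - 0.5f * z + 1.0f;
    /* sine polynomial: (((s0*z + s1)*z + s2)*z)*x + x */
    float ys = -1.9515295891e-4f;
    ys = ys * z + 8.3321608736e-3f;
    ys = ys * z - 1.6666654611e-1f;
    ys = ys * z * x + x;
    *s = ys;
    *c = yc;
}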
sign_bit_sin = x; /* take the absolute value */ - x = _mm256_and_ps(x, *(__m256*)_ps_inv_sign_mask); + x = _mm256_and_ps(x, *(__m256 *)_ps_inv_sign_mask); /* extract the sign bit (upper one) */ - sign_bit_sin = _mm256_and_ps(sign_bit_sin, *(__m256*)_ps_sign_mask); + sign_bit_sin = _mm256_and_ps(sign_bit_sin, *(__m256 *)_ps_sign_mask); /* scale by 4/Pi */ - y = _mm256_mul_ps(x, *(__m256*)_ps_cephes_FOPI); + y = _mm256_mul_ps(x, *(__m256 *)_ps_cephes_FOPI); /* store the integer part of y in emm2 */ emm2 = _mm256_cvttps_epi32(y); @@ -685,9 +745,9 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_u_avx2(lv_32fc_t* out, const fl /* The magic pass: "Extended precision modular arithmetic” x = ((x - y * DP1) - y * DP2) - y * DP3; */ - xmm1 = *(__m256*)_ps_minus_cephes_DP1; - xmm2 = *(__m256*)_ps_minus_cephes_DP2; - xmm3 = *(__m256*)_ps_minus_cephes_DP3; + xmm1 = *(__m256 *)_ps_minus_cephes_DP1; + xmm2 = *(__m256 *)_ps_minus_cephes_DP2; + xmm3 = *(__m256 *)_ps_minus_cephes_DP3; xmm1 = _mm256_mul_ps(y, xmm1); xmm2 = _mm256_mul_ps(y, xmm2); xmm3 = _mm256_mul_ps(y, xmm3); @@ -704,24 +764,24 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_u_avx2(lv_32fc_t* out, const fl /* Evaluate the first polynom (0 <= x <= Pi/4) */ __m256 z = _mm256_mul_ps(x, x); - y = *(__m256*)_ps_coscof_p0; + y = *(__m256 *)_ps_coscof_p0; y = _mm256_mul_ps(y, z); - y = _mm256_add_ps(y, *(__m256*)_ps_coscof_p1); + y = _mm256_add_ps(y, *(__m256 *)_ps_coscof_p1); y = _mm256_mul_ps(y, z); - y = _mm256_add_ps(y, *(__m256*)_ps_coscof_p2); + y = _mm256_add_ps(y, *(__m256 *)_ps_coscof_p2); y = _mm256_mul_ps(y, z); y = _mm256_mul_ps(y, z); - __m256 tmp = _mm256_mul_ps(z, *(__m256*)_ps_0p5); + __m256 tmp = _mm256_mul_ps(z, *(__m256 *)_ps_0p5); y = _mm256_sub_ps(y, tmp); - y = _mm256_add_ps(y, *(__m256*)_ps_1); + y = _mm256_add_ps(y, *(__m256 *)_ps_1); /* Evaluate the second polynom (Pi/4 <= x <= 0) */ - __m256 y2 = *(__m256*)_ps_sincof_p0; + __m256 y2 = *(__m256 *)_ps_sincof_p0; y2 = _mm256_mul_ps(y2, z); - y2 = _mm256_add_ps(y2, *(__m256*)_ps_sincof_p1); + y2 = _mm256_add_ps(y2, *(__m256 *)_ps_sincof_p1); y2 = _mm256_mul_ps(y2, z); - y2 = _mm256_add_ps(y2, *(__m256*)_ps_sincof_p2); + y2 = _mm256_add_ps(y2, *(__m256 *)_ps_sincof_p2); y2 = _mm256_mul_ps(y2, z); y2 = _mm256_mul_ps(y2, x); y2 = _mm256_add_ps(y2, x); @@ -744,27 +804,27 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_u_avx2(lv_32fc_t* out, const fl s1 = _mm256_extractf128_ps(sine, 0); c1 = _mm256_extractf128_ps(cosine, 0); aux = _mm_unpacklo_ps(c1, s1); - _mm_storeu_ps((float*)bPtr, aux); + _mm_storeu_ps((float *)bPtr, aux); bPtr += 2; aux = _mm_unpackhi_ps(c1, s1); - _mm_storeu_ps((float*)bPtr, aux); + _mm_storeu_ps((float *)bPtr, aux); bPtr += 2; s1 = _mm256_extractf128_ps(sine, 1); c1 = _mm256_extractf128_ps(cosine, 1); aux = _mm_unpacklo_ps(c1, s1); - _mm_storeu_ps((float*)bPtr, aux); + _mm_storeu_ps((float *)bPtr, aux); bPtr += 2; aux = _mm_unpackhi_ps(c1, s1); - _mm_storeu_ps((float*)bPtr, aux); + _mm_storeu_ps((float *)bPtr, aux); bPtr += 2; eight_phases_reg = _mm256_add_ps(eight_phases_reg, eight_phases_inc_reg); } _mm256_zeroupper(); _phase = _phase + phase_inc * (avx_iters * 8); - for(number = avx_iters * 8; number < num_points; number++) + for (number = avx_iters * 8; number < num_points; number++) { - out[number] = lv_cmake((float)cosf(_phase), (float)sinf(_phase) ); + out[number] = lv_cmake((float)cosf(_phase), (float)sinf(_phase)); _phase += phase_inc; } (*phase) = _phase; @@ -777,15 +837,17 @@ static inline void 
volk_gnsssdr_s32f_sincos_32fc_u_avx2(lv_32fc_t* out, const fl #include /* Adapted from http://gruntthepeon.free.fr/ssemath/neon_mathfun.h, original code from Julien Pommier */ /* Based on algorithms from the cephes library http://www.netlib.org/cephes/ */ -static inline void volk_gnsssdr_s32f_sincos_32fc_neon(lv_32fc_t* out, const float phase_inc, float* phase, unsigned int num_points) +static inline void volk_gnsssdr_s32f_sincos_32fc_neon(lv_32fc_t *out, const float phase_inc, float *phase, unsigned int num_points) { - lv_32fc_t* bPtr = out; + lv_32fc_t *bPtr = out; const unsigned int neon_iters = num_points / 4; float _phase = (*phase); - __VOLK_ATTR_ALIGNED(16) float32_t four_phases[4] = { _phase, _phase + phase_inc, _phase + 2 * phase_inc, _phase + 3 * phase_inc }; + __VOLK_ATTR_ALIGNED(16) + float32_t four_phases[4] = {_phase, _phase + phase_inc, _phase + 2 * phase_inc, _phase + 3 * phase_inc}; float four_inc = 4 * phase_inc; - __VOLK_ATTR_ALIGNED(16) float32_t four_phases_inc[4] = { four_inc, four_inc, four_inc, four_inc }; + __VOLK_ATTR_ALIGNED(16) + float32_t four_phases_inc[4] = {four_inc, four_inc, four_inc, four_inc}; float32x4_t four_phases_reg = vld1q_f32(four_phases); float32x4_t four_phases_inc_reg = vld1q_f32(four_phases_inc); @@ -808,7 +870,7 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_neon(lv_32fc_t* out, const floa uint32x4_t emm2, poly_mask, sign_mask_sin, sign_mask_cos; - for(;number < neon_iters; number++) + for (; number < neon_iters; number++) { x = four_phases_reg; @@ -847,7 +909,7 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_neon(lv_32fc_t* out, const floa /* Evaluate the first polynom (0 <= x <= Pi/4) in y1, and the second polynom (Pi/4 <= x <= 0) in y2 */ - z = vmulq_f32(x,x); + z = vmulq_f32(x, x); y1 = vmulq_n_f32(z, c_coscof_p0); y2 = vmulq_n_f32(z, c_sincof_p0); @@ -871,16 +933,16 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_neon(lv_32fc_t* out, const floa result.val[1] = vbslq_f32(sign_mask_sin, vnegq_f32(ys), ys); result.val[0] = vbslq_f32(sign_mask_cos, yc, vnegq_f32(yc)); - vst2q_f32((float32_t*)bPtr, result); + vst2q_f32((float32_t *)bPtr, result); bPtr += 4; four_phases_reg = vaddq_f32(four_phases_reg, four_phases_inc_reg); } _phase = _phase + phase_inc * (neon_iters * 4); - for(number = neon_iters * 4; number < num_points; number++) + for (number = neon_iters * 4; number < num_points; number++) { - *bPtr++ = lv_cmake((float)cosf(_phase), (float)sinf(_phase) ); + *bPtr++ = lv_cmake((float)cosf(_phase), (float)sinf(_phase)); _phase += phase_inc; } (*phase) = _phase; diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_s32f_sincospuppet_32fc.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_s32f_sincospuppet_32fc.h index 07d3bf5d2..e4f7c942f 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_s32f_sincospuppet_32fc.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_s32f_sincospuppet_32fc.h @@ -49,7 +49,7 @@ static inline void volk_gnsssdr_s32f_sincospuppet_32fc_generic(lv_32fc_t* out, c volk_gnsssdr_s32f_sincos_32fc_generic(out, phase_inc, phase, num_points); } -#endif /* LV_HAVE_GENERIC */ +#endif /* LV_HAVE_GENERIC */ #ifdef LV_HAVE_GENERIC @@ -60,7 +60,7 @@ static inline void volk_gnsssdr_s32f_sincospuppet_32fc_generic_fxpt(lv_32fc_t* o volk_gnsssdr_s32f_sincos_32fc_generic_fxpt(out, phase_inc, phase, num_points); } -#endif /* LV_HAVE_GENERIC */ +#endif 
/* LV_HAVE_GENERIC */ #ifdef LV_HAVE_SSE2 @@ -70,7 +70,7 @@ static inline void volk_gnsssdr_s32f_sincospuppet_32fc_a_sse2(lv_32fc_t* out, co phase[0] = 3; volk_gnsssdr_s32f_sincos_32fc_a_sse2(out, phase_inc, phase, num_points); } -#endif /* LV_HAVE_SSE2 */ +#endif /* LV_HAVE_SSE2 */ #ifdef LV_HAVE_SSE2 @@ -80,7 +80,7 @@ static inline void volk_gnsssdr_s32f_sincospuppet_32fc_u_sse2(lv_32fc_t* out, co phase[0] = 3; volk_gnsssdr_s32f_sincos_32fc_u_sse2(out, phase_inc, phase, num_points); } -#endif /* LV_HAVE_SSE2 */ +#endif /* LV_HAVE_SSE2 */ #ifdef LV_HAVE_AVX2 @@ -90,7 +90,7 @@ static inline void volk_gnsssdr_s32f_sincospuppet_32fc_a_avx2(lv_32fc_t* out, co phase[0] = 3; volk_gnsssdr_s32f_sincos_32fc_a_avx2(out, phase_inc, phase, num_points); } -#endif /* LV_HAVE_AVX2 */ +#endif /* LV_HAVE_AVX2 */ #ifdef LV_HAVE_AVX2 @@ -100,7 +100,7 @@ static inline void volk_gnsssdr_s32f_sincospuppet_32fc_u_avx2(lv_32fc_t* out, co phase[0] = 3; volk_gnsssdr_s32f_sincos_32fc_u_avx2(out, phase_inc, phase, num_points); } -#endif /* LV_HAVE_AVX2 */ +#endif /* LV_HAVE_AVX2 */ #ifdef LV_HAVE_NEON @@ -110,6 +110,6 @@ static inline void volk_gnsssdr_s32f_sincospuppet_32fc_neon(lv_32fc_t* out, cons phase[0] = 3; volk_gnsssdr_s32f_sincos_32fc_neon(out, phase_inc, phase, num_points); } -#endif /* LV_HAVE_NEON */ +#endif /* LV_HAVE_NEON */ -#endif /* INCLUDED_volk_gnsssdr_s32f_sincospuppet_32fc_H */ +#endif /* INCLUDED_volk_gnsssdr_s32f_sincospuppet_32fc_H */ diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/kernel_tests.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/kernel_tests.h index 5861d052f..733ca74bb 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/kernel_tests.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/kernel_tests.h @@ -38,32 +38,31 @@ // for puppets we need to get all the func_variants for the puppet and just // keep track of the actual function name to write to results -#define VOLK_INIT_PUPP(func, puppet_master_func, test_params)\ - volk_gnsssdr_test_case_t(func##_get_func_desc(), (void(*)())func##_manual, std::string(#func),\ - std::string(#puppet_master_func), test_params) +#define VOLK_INIT_PUPP(func, puppet_master_func, test_params) \ + volk_gnsssdr_test_case_t(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), \ + std::string(#puppet_master_func), test_params) -#define VOLK_INIT_TEST(func, test_params)\ - volk_gnsssdr_test_case_t(func##_get_func_desc(), (void(*)())func##_manual, std::string(#func),\ - test_params) +#define VOLK_INIT_TEST(func, test_params) \ + volk_gnsssdr_test_case_t(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), \ + test_params) #define QA(test) test_cases.push_back(test); std::vector init_test_list(volk_gnsssdr_test_params_t test_params) { - // Some kernels need a lower tolerance volk_gnsssdr_test_params_t test_params_inacc = volk_gnsssdr_test_params_t(1e-3, test_params.scalar(), - test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex()); + test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex()); volk_gnsssdr_test_params_t test_params_int1 = volk_gnsssdr_test_params_t(1, test_params.scalar(), - test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex()); + test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex()); // some others need more iterations ***** ADDED BY GNSS-SDR volk_gnsssdr_test_params_t 
test_params_more_iters = volk_gnsssdr_test_params_t(test_params.tol(), test_params.scalar(), - test_params.vlen(), 100000, test_params.benchmark_mode(), test_params.kernel_regex()); + test_params.vlen(), 100000, test_params.benchmark_mode(), test_params.kernel_regex()); // ... or more tolerance ***** ADDED BY GNSS-SDR volk_gnsssdr_test_params_t test_params_int16 = volk_gnsssdr_test_params_t(16, test_params.scalar(), - test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex()); + test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex()); volk_gnsssdr_test_params_t test_params_inacc2 = volk_gnsssdr_test_params_t(2e-1, test_params.scalar(), - test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex()); + test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex()); std::vector test_cases; @@ -98,8 +97,7 @@ std::vector init_test_list(volk_gnsssdr_test_params_t QA(VOLK_INIT_PUPP(volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic, volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn, test_params_int16)) QA(VOLK_INIT_PUPP(volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic, volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn, test_params_int16)) QA(VOLK_INIT_PUPP(volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc, volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn, test_params_int1)) - QA(VOLK_INIT_PUPP(volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc, volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn, test_params_int1)) - ; + QA(VOLK_INIT_PUPP(volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc, volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn, test_params_int1)); return test_cases; } diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/qa_utils.cc b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/qa_utils.cc index 35e60b2f4..18a4919e2 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/qa_utils.cc +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/qa_utils.cc @@ -17,38 +17,39 @@ */ #include "qa_utils.h" -#include "volk_gnsssdr/volk_gnsssdr.h" // for volk_gnsssdr_func_desc_t -#include "volk_gnsssdr/volk_gnsssdr_malloc.h" // for volk_gnsssdr_free, volk_gnsssdr_malloc -#include // for auto_any_base -#include // for lexical_cast -#include // for char_separator -#include // for token_iterator -#include // for tokenizer -#include // for assert -#include // for system_clock, duration,... -#include // for sqrt, fabs, abs -#include // for uint16_t, uint64_t,int16_t, int32_t -#include // for memcpy, memset -#include // for operator<< -#include // for cout, cerr -#include // for numeric_limits -#include // for map -#include // for random_device, default_random_engine, uniform_real_distribution -#include // for vector +#include "volk_gnsssdr/volk_gnsssdr.h" // for volk_gnsssdr_func_desc_t +#include "volk_gnsssdr/volk_gnsssdr_malloc.h" // for volk_gnsssdr_free, volk_gnsssdr_malloc +#include // for auto_any_base +#include // for lexical_cast +#include // for char_separator +#include // for token_iterator +#include // for tokenizer +#include // for assert +#include // for system_clock, duration,... 
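To make the registration pattern above concrete, one QA(VOLK_INIT_PUPP(...)) entry from this list expands, after both macros are applied, to the push_back below: the puppet kernel's descriptor and manual dispatcher are what actually run, but the result is filed under the puppet master's name, so profiling reports point at the real kernel rather than its test harness:

test_cases.push_back(volk_gnsssdr_test_case_t(
    volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_get_func_desc(),
    (void (*)())volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_manual,
    std::string("volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc"),
    std::string("volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn"),
    test_params_int1));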
+#include // for sqrt, fabs, abs +#include // for uint16_t, uint64_t,int16_t, int32_t +#include // for memcpy, memset +#include // for operator<< +#include // for cout, cerr +#include // for numeric_limits +#include // for map +#include // for random_device, default_random_engine, uniform_real_distribution +#include // for vector -float uniform() { +float uniform() +{ std::random_device r; std::default_random_engine e1(r()); std::uniform_real_distribution uniform_dist(-1, 1); - return uniform_dist(e1); // uniformly (-1, 1) + return uniform_dist(e1); // uniformly (-1, 1) } template -void random_floats (t *buf, unsigned n) +void random_floats(t *buf, unsigned n) { - for (unsigned i = 0; i < n; i++) - buf[i] = uniform (); + for (unsigned i = 0; i < n; i++) + buf[i] = uniform(); } void load_random_data(void *data, volk_gnsssdr_type_t type, unsigned int n) @@ -56,60 +57,73 @@ void load_random_data(void *data, volk_gnsssdr_type_t type, unsigned int n) std::random_device r; std::default_random_engine e2(r()); - if(type.is_complex) n *= 2; + if (type.is_complex) n *= 2; - if(type.is_float) + if (type.is_float) { - if(type.size == 8) random_floats((double *)data, n); - else random_floats((float *)data, n); + if (type.size == 8) + random_floats((double *)data, n); + else + random_floats((float *)data, n); } else { - float int_max = float(uint64_t(2) << (type.size*8)); - if(type.is_signed) int_max /= 2.0; + float int_max = float(uint64_t(2) << (type.size * 8)); + if (type.is_signed) int_max /= 2.0; std::uniform_real_distribution uniform_dist(-int_max, int_max); - for(unsigned int i = 0; i < n; i++) + for (unsigned int i = 0; i < n; i++) { float scaled_rand = uniform_dist(e2); - switch(type.size) - { - case 8: - if(type.is_signed) ((int64_t *)data)[i] = (int64_t) scaled_rand; - else ((uint64_t *)data)[i] = (uint64_t) scaled_rand; - break; - case 4: - if(type.is_signed) ((int32_t *)data)[i] = (int32_t) scaled_rand; - else ((uint32_t *)data)[i] = (uint32_t) scaled_rand; - break; - case 2: - // 16 bit multiplication saturates very fast - // we produce here only 3 bits input range - if(type.is_signed) ((int16_t *)data)[i] = (int16_t)((int16_t) scaled_rand % 8); - else ((uint16_t *)data)[i] = (uint16_t) (int16_t)((int16_t) scaled_rand % 8); - break; - case 1: - if(type.is_signed) ((int8_t *)data)[i] = (int8_t) scaled_rand; - else ((uint8_t *)data)[i] = (uint8_t) scaled_rand; - break; - default: - throw "load_random_data: no support for data size > 8 or < 1"; //no shenanigans here - } + switch (type.size) + { + case 8: + if (type.is_signed) + ((int64_t *)data)[i] = (int64_t)scaled_rand; + else + ((uint64_t *)data)[i] = (uint64_t)scaled_rand; + break; + case 4: + if (type.is_signed) + ((int32_t *)data)[i] = (int32_t)scaled_rand; + else + ((uint32_t *)data)[i] = (uint32_t)scaled_rand; + break; + case 2: + // 16 bit multiplication saturates very fast + // we produce here only 3 bits input range + if (type.is_signed) + ((int16_t *)data)[i] = (int16_t)((int16_t)scaled_rand % 8); + else + ((uint16_t *)data)[i] = (uint16_t)(int16_t)((int16_t)scaled_rand % 8); + break; + case 1: + if (type.is_signed) + ((int8_t *)data)[i] = (int8_t)scaled_rand; + else + ((uint8_t *)data)[i] = (uint8_t)scaled_rand; + break; + default: + throw "load_random_data: no support for data size > 8 or < 1"; //no shenanigans here + } } } } -static std::vector get_arch_list(volk_gnsssdr_func_desc_t desc) { +static std::vector get_arch_list(volk_gnsssdr_func_desc_t desc) +{ std::vector archlist; - for(size_t i = 0; i < desc.n_impls; i++) { - 
archlist.push_back(std::string(desc.impl_names[i])); - } + for (size_t i = 0; i < desc.n_impls; i++) + { + archlist.push_back(std::string(desc.impl_names[i])); + } return archlist; } -volk_gnsssdr_type_t volk_gnsssdr_type_from_string(std::string name) { +volk_gnsssdr_type_t volk_gnsssdr_type_from_string(std::string name) +{ volk_gnsssdr_type_t type; type.is_float = false; type.is_scalar = false; @@ -118,52 +132,58 @@ volk_gnsssdr_type_t volk_gnsssdr_type_from_string(std::string name) { type.size = 0; type.str = name; - if(name.size() < 2) { - throw std::string("name too short to be a datatype"); - } + if (name.size() < 2) + { + throw std::string("name too short to be a datatype"); + } //is it a scalar? - if(name[0] == 's') { - type.is_scalar = true; - name = name.substr(1, name.size()-1); - } + if (name[0] == 's') + { + type.is_scalar = true; + name = name.substr(1, name.size() - 1); + } //get the data size size_t last_size_pos = name.find_last_of("0123456789"); - if(last_size_pos == std::string::npos) { - throw std::string("no size spec in type ").append(name); - } + if (last_size_pos == std::string::npos) + { + throw std::string("no size spec in type ").append(name); + } //will throw if malformed - int size = boost::lexical_cast(name.substr(0, last_size_pos+1)); + int size = boost::lexical_cast(name.substr(0, last_size_pos + 1)); assert(((size % 8) == 0) && (size <= 64) && (size != 0)); - type.size = size/8; //in bytes + type.size = size / 8; //in bytes - for(size_t i=last_size_pos+1; i < name.size(); i++) { - switch (name[i]) { - case 'f': - type.is_float = true; - break; - case 'i': - type.is_signed = true; - break; - case 'c': - type.is_complex = true; - break; - case 'u': - type.is_signed = false; - break; - default: - throw; + for (size_t i = last_size_pos + 1; i < name.size(); i++) + { + switch (name[i]) + { + case 'f': + type.is_float = true; + break; + case 'i': + type.is_signed = true; + break; + case 'c': + type.is_complex = true; + break; + case 'u': + type.is_signed = false; + break; + default: + throw; + } } - } return type; } static void get_signatures_from_name(std::vector &inputsig, - std::vector &outputsig, - std::string name) { + std::vector &outputsig, + std::string name) +{ boost::char_separator sep("_"); boost::tokenizer > tok(name, sep); std::vector toked; @@ -176,233 +196,282 @@ static void get_signatures_from_name(std::vector &inputsig, //ok. we're assuming a string in the form //(sig)_(multiplier-opt)_..._(name)_(sig)_(multiplier-opt)_..._(alignment) - enum { SIDE_INPUT, SIDE_NAME, SIDE_OUTPUT } side = SIDE_INPUT; + enum + { + SIDE_INPUT, + SIDE_NAME, + SIDE_OUTPUT + } side = SIDE_INPUT; std::string fn_name; volk_gnsssdr_type_t type; - BOOST_FOREACH(std::string token, toked) { - try { - type = volk_gnsssdr_type_from_string(token); - if(side == SIDE_NAME) side = SIDE_OUTPUT; //if this is the first one after the name... + BOOST_FOREACH (std::string token, toked) + { + try + { + type = volk_gnsssdr_type_from_string(token); + if (side == SIDE_NAME) side = SIDE_OUTPUT; //if this is the first one after the name... 
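Two worked examples of the type-string parser above; the field values follow directly from its rules (variable names are illustrative):

volk_gnsssdr_type_t t = volk_gnsssdr_type_from_string("s32fc");
/* t.is_scalar == true, t.size == 4 (bytes), t.is_float == true,
   t.is_complex == true, t.str == "s32fc" */
volk_gnsssdr_type_t u = volk_gnsssdr_type_from_string("8u");
/* u.size == 1, u.is_signed == false, u.is_float == false, u.is_complex == false */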
- if(side == SIDE_INPUT) inputsig.push_back(type); - else outputsig.push_back(type); - } catch (...){ - if(token[0] == 'x' && (token.size() > 1) && (token[1] > '0' || token[1] < '9')) { - if(side == SIDE_INPUT) assert(inputsig.size() > 0); - else assert(outputsig.size() > 0); - int multiplier = boost::lexical_cast(token.substr(1, token.size()-1)); //will throw if invalid /////////// - for(int i=1; i 1) && (token[1] > '0' || token[1] < '9')) + { + if (side == SIDE_INPUT) + assert(inputsig.size() > 0); + else + assert(outputsig.size() > 0); + int multiplier = boost::lexical_cast(token.substr(1, token.size() - 1)); //will throw if invalid /////////// + for (int i = 1; i < multiplier; i++) + { + if (side == SIDE_INPUT) + inputsig.push_back(inputsig.back()); + else + outputsig.push_back(outputsig.back()); + } + } - else if(side == SIDE_INPUT) { //it's the function name, at least it better be - side = SIDE_NAME; - fn_name.append("_"); - fn_name.append(token); - } - else if(side == SIDE_OUTPUT) { - if(token != toked.back()) throw; //the last token in the name is the alignment - } + else if (side == SIDE_INPUT) + { //it's the function name, at least it better be + side = SIDE_NAME; + fn_name.append("_"); + fn_name.append(token); + } + else if (side == SIDE_OUTPUT) + { + if (token != toked.back()) throw; //the last token in the name is the alignment + } + } } - } //we don't need an output signature (some fn's operate on the input data, "in place"), but we do need at least one input! assert(inputsig.size() != 0); - } -inline void run_cast_test1(volk_gnsssdr_fn_1arg func, std::vector &buffs, unsigned int vlen, unsigned int iter, std::string arch) { - while(iter--) func(buffs[0], vlen, arch.c_str()); +inline void run_cast_test1(volk_gnsssdr_fn_1arg func, std::vector &buffs, unsigned int vlen, unsigned int iter, std::string arch) +{ + while (iter--) func(buffs[0], vlen, arch.c_str()); } -inline void run_cast_test2(volk_gnsssdr_fn_2arg func, std::vector &buffs, unsigned int vlen, unsigned int iter, std::string arch) { - while(iter--) func(buffs[0], buffs[1], vlen, arch.c_str()); +inline void run_cast_test2(volk_gnsssdr_fn_2arg func, std::vector &buffs, unsigned int vlen, unsigned int iter, std::string arch) +{ + while (iter--) func(buffs[0], buffs[1], vlen, arch.c_str()); } -inline void run_cast_test3(volk_gnsssdr_fn_3arg func, std::vector &buffs, unsigned int vlen, unsigned int iter, std::string arch) { - while(iter--) func(buffs[0], buffs[1], buffs[2], vlen, arch.c_str()); +inline void run_cast_test3(volk_gnsssdr_fn_3arg func, std::vector &buffs, unsigned int vlen, unsigned int iter, std::string arch) +{ + while (iter--) func(buffs[0], buffs[1], buffs[2], vlen, arch.c_str()); } -inline void run_cast_test4(volk_gnsssdr_fn_4arg func, std::vector &buffs, unsigned int vlen, unsigned int iter, std::string arch) { - while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], vlen, arch.c_str()); +inline void run_cast_test4(volk_gnsssdr_fn_4arg func, std::vector &buffs, unsigned int vlen, unsigned int iter, std::string arch) +{ + while (iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], vlen, arch.c_str()); } -inline void run_cast_test1_s32f(volk_gnsssdr_fn_1arg_s32f func, std::vector &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) { - while(iter--) func(buffs[0], scalar, vlen, arch.c_str()); +inline void run_cast_test1_s32f(volk_gnsssdr_fn_1arg_s32f func, std::vector &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) +{ + while (iter--) 
func(buffs[0], scalar, vlen, arch.c_str()); } -inline void run_cast_test2_s32f(volk_gnsssdr_fn_2arg_s32f func, std::vector &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) { - while(iter--) func(buffs[0], buffs[1], scalar, vlen, arch.c_str()); +inline void run_cast_test2_s32f(volk_gnsssdr_fn_2arg_s32f func, std::vector &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) +{ + while (iter--) func(buffs[0], buffs[1], scalar, vlen, arch.c_str()); } -inline void run_cast_test3_s32f(volk_gnsssdr_fn_3arg_s32f func, std::vector &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) { - while(iter--) func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str()); +inline void run_cast_test3_s32f(volk_gnsssdr_fn_3arg_s32f func, std::vector &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) +{ + while (iter--) func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str()); } -inline void run_cast_test1_s32fc(volk_gnsssdr_fn_1arg_s32fc func, std::vector &buffs, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { - while(iter--) func(buffs[0], scalar, vlen, arch.c_str()); +inline void run_cast_test1_s32fc(volk_gnsssdr_fn_1arg_s32fc func, std::vector &buffs, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) +{ + while (iter--) func(buffs[0], scalar, vlen, arch.c_str()); } -inline void run_cast_test2_s32fc(volk_gnsssdr_fn_2arg_s32fc func, std::vector &buffs, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { - while(iter--) func(buffs[0], buffs[1], scalar, vlen, arch.c_str()); +inline void run_cast_test2_s32fc(volk_gnsssdr_fn_2arg_s32fc func, std::vector &buffs, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) +{ + while (iter--) func(buffs[0], buffs[1], scalar, vlen, arch.c_str()); } -inline void run_cast_test3_s32fc(volk_gnsssdr_fn_3arg_s32fc func, std::vector &buffs, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { - while(iter--) func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str()); +inline void run_cast_test3_s32fc(volk_gnsssdr_fn_3arg_s32fc func, std::vector &buffs, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) +{ + while (iter--) func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str()); } // *************** ADDED BY GNSS-SDR. 
START inline void run_cast_test1_s8i(volk_gnsssdr_fn_1arg_s8i func, std::vector &buffs, char scalar, unsigned int vlen, unsigned int iter, std::string arch) { - while(iter--) func(buffs[0], scalar, vlen, arch.c_str()); + while (iter--) func(buffs[0], scalar, vlen, arch.c_str()); } inline void run_cast_test2_s8i(volk_gnsssdr_fn_2arg_s8i func, std::vector &buffs, char scalar, unsigned int vlen, unsigned int iter, std::string arch) { - while(iter--) func(buffs[0], buffs[1], scalar, vlen, arch.c_str()); + while (iter--) func(buffs[0], buffs[1], scalar, vlen, arch.c_str()); } inline void run_cast_test3_s8i(volk_gnsssdr_fn_3arg_s8i func, std::vector &buffs, char scalar, unsigned int vlen, unsigned int iter, std::string arch) { - while(iter--) func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str()); + while (iter--) func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str()); } inline void run_cast_test1_s8ic(volk_gnsssdr_fn_1arg_s8ic func, std::vector &buffs, lv_8sc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { - while(iter--) func(buffs[0], scalar, vlen, arch.c_str()); + while (iter--) func(buffs[0], scalar, vlen, arch.c_str()); } inline void run_cast_test2_s8ic(volk_gnsssdr_fn_2arg_s8ic func, std::vector &buffs, lv_8sc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { - while(iter--) func(buffs[0], buffs[1], scalar, vlen, arch.c_str()); + while (iter--) func(buffs[0], buffs[1], scalar, vlen, arch.c_str()); } inline void run_cast_test3_s8ic(volk_gnsssdr_fn_3arg_s8ic func, std::vector &buffs, lv_8sc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { - while(iter--) func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str()); + while (iter--) func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str()); } inline void run_cast_test1_s16ic(volk_gnsssdr_fn_1arg_s16ic func, std::vector &buffs, lv_16sc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { - while(iter--) func(buffs[0], scalar, vlen, arch.c_str()); + while (iter--) func(buffs[0], scalar, vlen, arch.c_str()); } inline void run_cast_test2_s16ic(volk_gnsssdr_fn_2arg_s16ic func, std::vector &buffs, lv_16sc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { - while(iter--) func(buffs[0], buffs[1], scalar, vlen, arch.c_str()); + while (iter--) func(buffs[0], buffs[1], scalar, vlen, arch.c_str()); } inline void run_cast_test3_s16ic(volk_gnsssdr_fn_3arg_s16ic func, std::vector &buffs, lv_16sc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { - while(iter--) func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str()); + while (iter--) func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str()); } // *************** ADDED BY GNSS-SDR. END template -bool fcompare(t *in1, t *in2, unsigned int vlen, float tol) { +bool fcompare(t *in1, t *in2, unsigned int vlen, float tol) +{ bool fail = false; int print_max_errs = 10; - for(unsigned int i=0; i tol ) - { - fail=true; - if(print_max_errs-- > 0) { - std::cout << "offset " << i << " in1: " << t(((t *)(in1))[i]) << " in2: " << t(((t *)(in2))[i]); - std::cout << " tolerance was: " << tol << std::endl; + for (unsigned int i = 0; i < vlen; i++) + { + // for very small numbers we'll see round off errors due to limited + // precision. So a special test case... 
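fcompare below switches between two comparison regimes; a small, self-contained illustration, with all values chosen only for the example:

#include <cmath>

static bool example_fcompare_regimes()
{
    const float tol = 1e-3f;
    /* near zero (|reference| < 1e-30) a relative error is meaningless,
       so the candidate is tested against the absolute tolerance instead */
    float ref_tiny = 1.0e-31f, got_tiny = 5.0e-4f;
    bool tiny_fail = (std::fabs(ref_tiny) < 1e-30) && (std::fabs(got_tiny) > tol); /* false */
    /* everywhere else: relative difference measured against the reference */
    float ref = 100.0f, got = 100.2f;
    bool rel_fail = std::fabs(ref - got) / std::fabs(ref) > tol; /* true: 0.2% > 0.1% */
    return tiny_fail || rel_fail;
}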
+ if (fabs(((t *)(in1))[i]) < 1e-30) + { + if (fabs(((t *)(in2))[i]) > tol) + { + fail = true; + if (print_max_errs-- > 0) + { + std::cout << "offset " << i << " in1: " << t(((t *)(in1))[i]) << " in2: " << t(((t *)(in2))[i]); + std::cout << " tolerance was: " << tol << std::endl; + } + } + } + // the primary test is the percent different greater than given tol + else if (fabs(((t *)(in1))[i] - ((t *)(in2))[i]) / fabs(((t *)in1)[i]) > tol) + { + fail = true; + if (print_max_errs-- > 0) + { + std::cout << "offset " << i << " in1: " << t(((t *)(in1))[i]) << " in2: " << t(((t *)(in2))[i]); + std::cout << " tolerance was: " << tol << std::endl; + } } - } } - // the primary test is the percent different greater than given tol - else if(fabs(((t *)(in1))[i] - ((t *)(in2))[i])/fabs(((t *)in1)[i]) > tol) { - fail=true; - if(print_max_errs-- > 0) { - std::cout << "offset " << i << " in1: " << t(((t *)(in1))[i]) << " in2: " << t(((t *)(in2))[i]); - std::cout << " tolerance was: " << tol << std::endl; - } - } - } return fail; } template -bool ccompare(t *in1, t *in2, unsigned int vlen, float tol) { +bool ccompare(t *in1, t *in2, unsigned int vlen, float tol) +{ bool fail = false; int print_max_errs = 10; - for(unsigned int i=0; i<2*vlen; i+=2) { - t diff[2] = { in1[i] - in2[i], in1[i+1] - in2[i+1] }; - t err = std::sqrt(diff[0] * diff[0] + diff[1] * diff[1]); - t norm = std::sqrt(in1[i] * in1[i] + in1[i+1] * in1[i+1]); + for (unsigned int i = 0; i < 2 * vlen; i += 2) + { + t diff[2] = {in1[i] - in2[i], in1[i + 1] - in2[i + 1]}; + t err = std::sqrt(diff[0] * diff[0] + diff[1] * diff[1]); + t norm = std::sqrt(in1[i] * in1[i] + in1[i + 1] * in1[i + 1]); - // for very small numbers we'll see round off errors due to limited - // precision. So a special test case... - if (norm < 1e-30) { - if (err > tol) - { - fail=true; - if(print_max_errs-- > 0) { - std::cout << "offset " << i/2 << " in1: " << in1[i] << " + " << in1[i+1] << "j in2: " << in2[i] << " + " << in2[i+1] << "j"; - std::cout << " tolerance was: " << tol << std::endl; + // for very small numbers we'll see round off errors due to limited + // precision. So a special test case... 
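ccompare applies the same two-regime idea to complex samples, but both the error and the normalization are Euclidean, so real and imaginary parts are judged jointly rather than independently. A numeric sketch (values illustrative):

#include <cmath>

static bool example_ccompare_sample()
{
    /* reference 3+4j (magnitude 5), candidate 3+4.1j */
    float re1 = 3.0f, im1 = 4.0f, re2 = 3.0f, im2 = 4.1f;
    float err = std::sqrt((re1 - re2) * (re1 - re2) + (im1 - im2) * (im1 - im2)); /* 0.1 */
    float norm = std::sqrt(re1 * re1 + im1 * im1);                                /* 5.0 */
    return (err / norm) > 1e-3f; /* 0.02 > 0.001 -> this sample fails the tolerance */
}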
+ if (norm < 1e-30) + { + if (err > tol) + { + fail = true; + if (print_max_errs-- > 0) + { + std::cout << "offset " << i / 2 << " in1: " << in1[i] << " + " << in1[i + 1] << "j in2: " << in2[i] << " + " << in2[i + 1] << "j"; + std::cout << " tolerance was: " << tol << std::endl; + } + } + } + // the primary test is the percent different greater than given tol + else if ((err / norm) > tol) + { + fail = true; + if (print_max_errs-- > 0) + { + std::cout << "offset " << i / 2 << " in1: " << in1[i] << " + " << in1[i + 1] << "j in2: " << in2[i] << " + " << in2[i + 1] << "j"; + std::cout << " tolerance was: " << tol << std::endl; + } } - } } - // the primary test is the percent different greater than given tol - else if((err / norm) > tol) { - fail=true; - if(print_max_errs-- > 0) { - std::cout << "offset " << i/2 << " in1: " << in1[i] << " + " << in1[i+1] << "j in2: " << in2[i] << " + " << in2[i+1] << "j"; - std::cout << " tolerance was: " << tol << std::endl; - } - } - } return fail; } template -bool icompare(t *in1, t *in2, unsigned int vlen, unsigned int tol) { +bool icompare(t *in1, t *in2, unsigned int vlen, unsigned int tol) +{ bool fail = false; int print_max_errs = 10; - for(unsigned int i=0; i tol) { - fail=true; - if(print_max_errs-- > 0) { - std::cout << "offset " << i << " in1: " << static_cast(t(((t *)(in1))[i])) << " in2: " << static_cast(t(((t *)(in2))[i])); - std::cout << " tolerance was: " << tol << std::endl; - } + for (unsigned int i = 0; i < vlen; i++) + { + if (((unsigned int)abs(int(((t *)(in1))[i]) - int(((t *)(in2))[i]))) > tol) + { + fail = true; + if (print_max_errs-- > 0) + { + std::cout << "offset " << i << " in1: " << static_cast(t(((t *)(in1))[i])) << " in2: " << static_cast(t(((t *)(in2))[i])); + std::cout << " tolerance was: " << tol << std::endl; + } + } } - } return fail; } -class volk_gnsssdr_qa_aligned_mem_pool{ +class volk_gnsssdr_qa_aligned_mem_pool +{ public: - void *get_new(size_t size){ + void *get_new(size_t size) + { size_t alignment = volk_gnsssdr_get_alignment(); - void* ptr = volk_gnsssdr_malloc(size, alignment); + void *ptr = volk_gnsssdr_malloc(size, alignment); memset(ptr, 0x00, size); _mems.push_back(ptr); return ptr; } - ~volk_gnsssdr_qa_aligned_mem_pool() { - for(unsigned int ii = 0; ii < _mems.size(); ++ii) { - volk_gnsssdr_free(_mems[ii]); - } + ~volk_gnsssdr_qa_aligned_mem_pool() + { + for (unsigned int ii = 0; ii < _mems.size(); ++ii) + { + volk_gnsssdr_free(_mems[ii]); + } } -private: std::vector _mems; + +private: + std::vector _mems; }; bool run_volk_gnsssdr_tests(volk_gnsssdr_func_desc_t desc, - void (*manual_func)(), - std::string name, - volk_gnsssdr_test_params_t test_params, - std::vector *results, - std::string puppet_master_name -) + void (*manual_func)(), + std::string name, + volk_gnsssdr_test_params_t test_params, + std::vector *results, + std::string puppet_master_name) { return run_volk_gnsssdr_tests(desc, manual_func, name, test_params.tol(), test_params.scalar(), test_params.vlen(), test_params.iter(), results, puppet_master_name, @@ -410,15 +479,15 @@ bool run_volk_gnsssdr_tests(volk_gnsssdr_func_desc_t desc, } bool run_volk_gnsssdr_tests(volk_gnsssdr_func_desc_t desc, - void (*manual_func)(), - std::string name, - float tol, - lv_32fc_t scalar, - unsigned int vlen, - unsigned int iter, - std::vector *results, - std::string puppet_master_name, - bool benchmark_mode) + void (*manual_func)(), + std::string name, + float tol, + lv_32fc_t scalar, + unsigned int vlen, + unsigned int iter, + std::vector *results, + std::string 
puppet_master_name, + bool benchmark_mode) { // Initialize this entry in results vector results->push_back(volk_gnsssdr_test_results_t()); @@ -439,57 +508,67 @@ bool run_volk_gnsssdr_tests(volk_gnsssdr_func_desc_t desc, //first let's get a list of available architectures for the test std::vector arch_list = get_arch_list(desc); - if((!benchmark_mode) && (arch_list.size() < 2)) { - std::cout << "no architectures to test" << std::endl; - return false; - } + if ((!benchmark_mode) && (arch_list.size() < 2)) + { + std::cout << "no architectures to test" << std::endl; + return false; + } //something that can hang onto memory and cleanup when this function exits volk_gnsssdr_qa_aligned_mem_pool mem_pool; //now we have to get a function signature by parsing the name std::vector inputsig, outputsig; - try { - get_signatures_from_name(inputsig, outputsig, name); - } - catch (boost::bad_lexical_cast& error) { - std::cerr << "Error: unable to get function signature from kernel name" << std::endl; - std::cerr << " - " << name << std::endl; - return false; - } + try + { + get_signatures_from_name(inputsig, outputsig, name); + } + catch (boost::bad_lexical_cast &error) + { + std::cerr << "Error: unable to get function signature from kernel name" << std::endl; + std::cerr << " - " << name << std::endl; + return false; + } //pull the input scalars into their own vector std::vector inputsc; - for(size_t i=0; i inbuffs; - BOOST_FOREACH(volk_gnsssdr_type_t sig, inputsig) { - if(!sig.is_scalar) //we don't make buffers for scalars - inbuffs.push_back(mem_pool.get_new(vlen*sig.size*(sig.is_complex ? 2 : 1))); - } - for(size_t i=0; i > test_data; - for(size_t i=0; i arch_buffs; - for(size_t j=0; j arch_buffs; + for (size_t j = 0; j < outputsig.size(); j++) + { + arch_buffs.push_back(mem_pool.get_new(vlen * outputsig[j].size * (outputsig[j].is_complex ? 2 : 1))); + } + for (size_t j = 0; j < inputsig.size(); j++) + { + void *arch_inbuff = mem_pool.get_new(vlen * inputsig[j].size * (inputsig[j].is_complex ? 2 : 1)); + memcpy(arch_inbuff, inbuffs[j], vlen * inputsig[j].size * (inputsig[j].is_complex ? 2 : 1)); + arch_buffs.push_back(arch_inbuff); + } + test_data.push_back(arch_buffs); } - for(size_t j=0; j both_sigs; both_sigs.insert(both_sigs.end(), outputsig.begin(), outputsig.end()); @@ -499,270 +578,276 @@ bool run_volk_gnsssdr_tests(volk_gnsssdr_func_desc_t desc, vlen = vlen - vlen_twiddle; std::chrono::time_point start, end; std::vector profile_times; - for(size_t i = 0; i < arch_list.size(); i++) { - start = std::chrono::system_clock::now(); - - switch(both_sigs.size()) + for (size_t i = 0; i < arch_list.size(); i++) { - case 1: - if(inputsc.size() == 0) + start = std::chrono::system_clock::now(); + + switch (both_sigs.size()) { - run_cast_test1((volk_gnsssdr_fn_1arg)(manual_func), test_data[i], vlen, iter, arch_list[i]); - } - else if(inputsc.size() == 1 && inputsc[0].is_float) - { - if(inputsc[0].is_complex) + case 1: + if (inputsc.size() == 0) { - run_cast_test1_s32fc((volk_gnsssdr_fn_1arg_s32fc)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); + run_cast_test1((volk_gnsssdr_fn_1arg)(manual_func), test_data[i], vlen, iter, arch_list[i]); } - else + else if (inputsc.size() == 1 && inputsc[0].is_float) { - run_cast_test1_s32f((volk_gnsssdr_fn_1arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); - } - } - //ADDED BY GNSS-SDR. 
START - else if(inputsc.size() == 1 && !inputsc[0].is_float) - { - if(inputsc[0].is_complex) - { - if(inputsc[0].size == 2) + if (inputsc[0].is_complex) { - run_cast_test1_s16ic((volk_gnsssdr_fn_1arg_s16ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); + run_cast_test1_s32fc((volk_gnsssdr_fn_1arg_s32fc)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); } else { - run_cast_test1_s8ic((volk_gnsssdr_fn_1arg_s8ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); + run_cast_test1_s32f((volk_gnsssdr_fn_1arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); } } - else + //ADDED BY GNSS-SDR. START + else if (inputsc.size() == 1 && !inputsc[0].is_float) { - run_cast_test1_s8i((volk_gnsssdr_fn_1arg_s8i)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); + if (inputsc[0].is_complex) + { + if (inputsc[0].size == 2) + { + run_cast_test1_s16ic((volk_gnsssdr_fn_1arg_s16ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); + } + else + { + run_cast_test1_s8ic((volk_gnsssdr_fn_1arg_s8ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); + } + } + else + { + run_cast_test1_s8i((volk_gnsssdr_fn_1arg_s8i)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); + } } - } - //ADDED BY GNSS-SDR. END - else throw "unsupported 1 arg function >1 scalars"; - break; - case 2: - if(inputsc.size() == 0) - { + //ADDED BY GNSS-SDR. END + else + throw "unsupported 1 arg function >1 scalars"; + break; + case 2: + if (inputsc.size() == 0) + { run_cast_test2((volk_gnsssdr_fn_2arg)(manual_func), test_data[i], vlen, iter, arch_list[i]); - } - else if(inputsc.size() == 1 && inputsc[0].is_float) - { - if(inputsc[0].is_complex) - { - run_cast_test2_s32fc((volk_gnsssdr_fn_2arg_s32fc)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); } - else + else if (inputsc.size() == 1 && inputsc[0].is_float) { - run_cast_test2_s32f((volk_gnsssdr_fn_2arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); - } - } - //ADDED BY GNSS-SDR. START - else if(inputsc.size() == 1 && !inputsc[0].is_float) - { - if(inputsc[0].is_complex) - { - if(inputsc[0].size == 2) + if (inputsc[0].is_complex) { - run_cast_test2_s16ic((volk_gnsssdr_fn_2arg_s16ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); + run_cast_test2_s32fc((volk_gnsssdr_fn_2arg_s32fc)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); } else { - run_cast_test2_s8ic((volk_gnsssdr_fn_2arg_s8ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); + run_cast_test2_s32f((volk_gnsssdr_fn_2arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); } } + //ADDED BY GNSS-SDR. START + else if (inputsc.size() == 1 && !inputsc[0].is_float) + { + if (inputsc[0].is_complex) + { + if (inputsc[0].size == 2) + { + run_cast_test2_s16ic((volk_gnsssdr_fn_2arg_s16ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); + } + else + { + run_cast_test2_s8ic((volk_gnsssdr_fn_2arg_s8ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); + } + } + else + { + run_cast_test2_s8i((volk_gnsssdr_fn_2arg_s8i)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); + } + } + //ADDED BY GNSS-SDR. 
END else + throw "unsupported 2 arg function >1 scalars"; + break; + case 3: + if (inputsc.size() == 0) { - run_cast_test2_s8i((volk_gnsssdr_fn_2arg_s8i)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); + run_cast_test3((volk_gnsssdr_fn_3arg)(manual_func), test_data[i], vlen, iter, arch_list[i]); } - } - //ADDED BY GNSS-SDR. END - else throw "unsupported 2 arg function >1 scalars"; - break; - case 3: - if(inputsc.size() == 0) - { - run_cast_test3((volk_gnsssdr_fn_3arg)(manual_func), test_data[i], vlen, iter, arch_list[i]); - } - else if(inputsc.size() == 1 && inputsc[0].is_float) - { - if(inputsc[0].is_complex) + else if (inputsc.size() == 1 && inputsc[0].is_float) { - run_cast_test3_s32fc((volk_gnsssdr_fn_3arg_s32fc)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); + if (inputsc[0].is_complex) + { + run_cast_test3_s32fc((volk_gnsssdr_fn_3arg_s32fc)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); + } + else + { + run_cast_test3_s32f((volk_gnsssdr_fn_3arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); + } } + //ADDED BY GNSS-SDR. START + else if (inputsc.size() == 1 && !inputsc[0].is_float) + { + if (inputsc[0].is_complex) + { + { + if (inputsc[0].size == 4) + { + run_cast_test3_s16ic((volk_gnsssdr_fn_3arg_s16ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); + } + else + { + run_cast_test3_s8ic((volk_gnsssdr_fn_3arg_s8ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); + } + } + } + else + { + run_cast_test3_s8i((volk_gnsssdr_fn_3arg_s8i)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); + } + } + //ADDED BY GNSS-SDR. END else - { - run_cast_test3_s32f((volk_gnsssdr_fn_3arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); - } + throw "unsupported 3 arg function >1 scalars"; + break; + default: + throw "no function handler for this signature"; + break; } - //ADDED BY GNSS-SDR. START - else if(inputsc.size() == 1 && !inputsc[0].is_float) - { - if(inputsc[0].is_complex) - { - { - if(inputsc[0].size == 4) - { - run_cast_test3_s16ic((volk_gnsssdr_fn_3arg_s16ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); - } - else - { - run_cast_test3_s8ic((volk_gnsssdr_fn_3arg_s8ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); - } - } - } - else - { - run_cast_test3_s8i((volk_gnsssdr_fn_3arg_s8i)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); - } - } - //ADDED BY GNSS-SDR. 
END - else throw "unsupported 3 arg function >1 scalars"; - break; - default: - throw "no function handler for this signature"; - break; + + end = std::chrono::system_clock::now(); + std::chrono::duration elapsed_seconds = end - start; + double arch_time = 1000.0 * elapsed_seconds.count(); + std::cout << arch_list[i] << " completed in " << arch_time << " ms" << std::endl; + volk_gnsssdr_test_time_t result; + result.name = arch_list[i]; + result.time = arch_time; + result.units = "ms"; + result.pass = true; + results->back().results[result.name] = result; + + profile_times.push_back(arch_time); } - end = std::chrono::system_clock::now(); - std::chrono::duration elapsed_seconds = end - start; - double arch_time = 1000.0 * elapsed_seconds.count(); - std::cout << arch_list[i] << " completed in " << arch_time << " ms" << std::endl; - volk_gnsssdr_test_time_t result; - result.name = arch_list[i]; - result.time = arch_time; - result.units = "ms"; - result.pass = true; - results->back().results[result.name] = result; - - profile_times.push_back(arch_time); - } - //and now compare each output to the generic output //first we have to know which output is the generic one, they aren't in order... - size_t generic_offset=0; - for(size_t i=0; i arch_results; - for(size_t i = 0; i < arch_list.size(); i++) + for (size_t i = 0; i < arch_list.size(); i++) { fail = false; - if(i != generic_offset) + if (i != generic_offset) { - for(size_t j=0; jback().results[arch_list[i]]; result->pass = !fail; @@ -778,14 +863,14 @@ bool run_volk_gnsssdr_tests(volk_gnsssdr_func_desc_t desc, double best_time_u = std::numeric_limits::max(); std::string best_arch_a = "generic"; std::string best_arch_u = "generic"; - for(size_t i=0; i < arch_list.size(); i++) + for (size_t i = 0; i < arch_list.size(); i++) { - if((profile_times[i] < best_time_u) && arch_results[i] && desc.impl_alignment[i] == 0) + if ((profile_times[i] < best_time_u) && arch_results[i] && desc.impl_alignment[i] == 0) { best_time_u = profile_times[i]; best_arch_u = arch_list[i]; } - if((profile_times[i] < best_time_a) && arch_results[i]) + if ((profile_times[i] < best_time_a) && arch_results[i]) { best_time_a = profile_times[i]; best_arch_a = arch_list[i]; @@ -795,11 +880,14 @@ bool run_volk_gnsssdr_tests(volk_gnsssdr_func_desc_t desc, std::cout << "Best aligned arch: " << best_arch_a << std::endl; std::cout << "Best unaligned arch: " << best_arch_u << std::endl; - if(puppet_master_name == "NULL") { - results->back().config_name = name; - } else { - results->back().config_name = puppet_master_name; - } + if (puppet_master_name == "NULL") + { + results->back().config_name = name; + } + else + { + results->back().config_name = puppet_master_name; + } results->back().best_arch_a = best_arch_a; results->back().best_arch_u = best_arch_u; diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/qa_utils.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/qa_utils.h index ad4d7e6b9..b2a66fb58 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/qa_utils.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/qa_utils.h @@ -25,17 +25,18 @@ #include "volk_gnsssdr/volk_gnsssdr_complex.h" // for lv_32fc_t #include "volk_gnsssdr/volk_gnsssdr.h" // for volk_gnsssdr_func_desc_t -#include // for bool, false -#include // for NULL -#include // for map -#include // for string, basic_string -#include // for vector +#include // for bool, false +#include // for NULL +#include // for map +#include // for string, basic_string +#include 
<vector>    // for vector


/************************************************
 * VOLK QA type definitions                     *
 ************************************************/
-struct volk_gnsssdr_type_t {
+struct volk_gnsssdr_type_t
+{
     bool is_float;
     bool is_scalar;
     bool is_signed;
@@ -44,80 +45,78 @@ struct volk_gnsssdr_type_t {
     std::string str;
 };

-class volk_gnsssdr_test_time_t {
-    public:
-        std::string name;
-        double time;
-        std::string units;
-        bool pass;
+class volk_gnsssdr_test_time_t
+{
+public:
+    std::string name;
+    double time;
+    std::string units;
+    bool pass;
 };

-class volk_gnsssdr_test_results_t {
-    public:
-        std::string name;
-        std::string config_name;
-        unsigned int vlen;
-        unsigned int iter;
-        std::map<std::string, volk_gnsssdr_test_time_t> results;
-        std::string best_arch_a;
-        std::string best_arch_u;
+class volk_gnsssdr_test_results_t
+{
+public:
+    std::string name;
+    std::string config_name;
+    unsigned int vlen;
+    unsigned int iter;
+    std::map<std::string, volk_gnsssdr_test_time_t> results;
+    std::string best_arch_a;
+    std::string best_arch_u;
 };

-class volk_gnsssdr_test_params_t {
-    private:
-        float _tol;
-        lv_32fc_t _scalar;
-        unsigned int _vlen;
-        unsigned int _iter;
-        bool _benchmark_mode;
-        std::string _kernel_regex;
-    public:
-        // ctor
-        volk_gnsssdr_test_params_t(float tol, lv_32fc_t scalar, unsigned int vlen, unsigned int iter,
-            bool benchmark_mode, std::string kernel_regex) :
-            _tol(tol), _scalar(scalar), _vlen(vlen), _iter(iter),
-            _benchmark_mode(benchmark_mode), _kernel_regex(kernel_regex) {};
-        // setters
-        void set_tol(float tol) {_tol=tol;};
-        void set_scalar(lv_32fc_t scalar) {_scalar=scalar;};
-        void set_vlen(unsigned int vlen) {_vlen=vlen;};
-        void set_iter(unsigned int iter) {_iter=iter;};
-        void set_benchmark(bool benchmark) {_benchmark_mode=benchmark;};
-        void set_regex(std::string regex) {_kernel_regex=regex;};
-        // getters
-        float tol() {return _tol;};
-        lv_32fc_t scalar() {return _scalar;};
-        unsigned int vlen() {return _vlen;};
-        unsigned int iter() {return _iter;};
-        bool benchmark_mode() {return _benchmark_mode;};
-        std::string kernel_regex() {return _kernel_regex;};
+class volk_gnsssdr_test_params_t
+{
+private:
+    float _tol;
+    lv_32fc_t _scalar;
+    unsigned int _vlen;
+    unsigned int _iter;
+    bool _benchmark_mode;
+    std::string _kernel_regex;
+
+public:
+    // ctor
+    volk_gnsssdr_test_params_t(float tol, lv_32fc_t scalar, unsigned int vlen, unsigned int iter,
+        bool benchmark_mode, std::string kernel_regex) : _tol(tol), _scalar(scalar), _vlen(vlen), _iter(iter), _benchmark_mode(benchmark_mode), _kernel_regex(kernel_regex){};
+    // setters
+    void set_tol(float tol) { _tol = tol; };
+    void set_scalar(lv_32fc_t scalar) { _scalar = scalar; };
+    void set_vlen(unsigned int vlen) { _vlen = vlen; };
+    void set_iter(unsigned int iter) { _iter = iter; };
+    void set_benchmark(bool benchmark) { _benchmark_mode = benchmark; };
+    void set_regex(std::string regex) { _kernel_regex = regex; };
+    // getters
+    float tol() { return _tol; };
+    lv_32fc_t scalar() { return _scalar; };
+    unsigned int vlen() { return _vlen; };
+    unsigned int iter() { return _iter; };
+    bool benchmark_mode() { return _benchmark_mode; };
+    std::string kernel_regex() { return _kernel_regex; };
 };
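Read together, the classes above and the volk_gnsssdr_test_case_t class that follows form the whole QA vocabulary: a params object carries tolerance, scalar, vector length, iteration count, benchmark flag, and kernel-name regex, and a test case binds those parameters to one kernel. A minimal construction sketch (every literal value here is illustrative, not taken from this patch):

    volk_gnsssdr_test_params_t test_params(1e-6f,  // absolute tolerance
        lv_32fc_t(327.0f, 0.0f),                   // scalar handed to _s32f/_s32fc kernels
        131071,                                    // vector length
        1987,                                      // iterations per implementation
        false,                                     // benchmark_mode
        std::string("."));                         // kernel_regex: match every kernel
    volk_gnsssdr_test_case_t test_case(volk_gnsssdr_8i_x2_add_8i_get_func_desc(),
        (void (*)())volk_gnsssdr_8i_x2_add_8i_manual,
        std::string("volk_gnsssdr_8i_x2_add_8i"), test_params);

This mirrors what init_test_list() in kernel_tests.h does for every kernel in the library.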
-class volk_gnsssdr_test_case_t {
-    private:
-        volk_gnsssdr_func_desc_t _desc;
-        void(*_kernel_ptr)();
-        std::string _name;
-        volk_gnsssdr_test_params_t _test_parameters;
-        std::string _puppet_master_name;
-    public:
-        volk_gnsssdr_func_desc_t desc() {return _desc;};
-        void (*kernel_ptr()) () {return _kernel_ptr;};
-        std::string name() {return _name;};
-        std::string puppet_master_name() {return _puppet_master_name;};
-        volk_gnsssdr_test_params_t test_parameters() {return _test_parameters;};
-        // normal ctor
-        volk_gnsssdr_test_case_t(volk_gnsssdr_func_desc_t desc, void(*kernel_ptr)(), std::string name,
-            volk_gnsssdr_test_params_t test_parameters) :
-            _desc(desc), _kernel_ptr(kernel_ptr), _name(name), _test_parameters(test_parameters),
-            _puppet_master_name("NULL")
-        {};
-        // ctor for puppets
-        volk_gnsssdr_test_case_t(volk_gnsssdr_func_desc_t desc, void(*kernel_ptr)(), std::string name,
-            std::string puppet_master_name, volk_gnsssdr_test_params_t test_parameters) :
-            _desc(desc), _kernel_ptr(kernel_ptr), _name(name), _test_parameters(test_parameters),
-            _puppet_master_name(puppet_master_name)
-        {};
+class volk_gnsssdr_test_case_t
+{
+private:
+    volk_gnsssdr_func_desc_t _desc;
+    void (*_kernel_ptr)();
+    std::string _name;
+    volk_gnsssdr_test_params_t _test_parameters;
+    std::string _puppet_master_name;
+
+public:
+    volk_gnsssdr_func_desc_t desc() { return _desc; };
+    void (*kernel_ptr())() { return _kernel_ptr; };
+    std::string name() { return _name; };
+    std::string puppet_master_name() { return _puppet_master_name; };
+    volk_gnsssdr_test_params_t test_parameters() { return _test_parameters; };
+    // normal ctor
+    volk_gnsssdr_test_case_t(volk_gnsssdr_func_desc_t desc, void (*kernel_ptr)(), std::string name,
+        volk_gnsssdr_test_params_t test_parameters) : _desc(desc), _kernel_ptr(kernel_ptr), _name(name), _test_parameters(test_parameters), _puppet_master_name("NULL"){};
+    // ctor for puppets
+    volk_gnsssdr_test_case_t(volk_gnsssdr_func_desc_t desc, void (*kernel_ptr)(), std::string name,
+        std::string puppet_master_name, volk_gnsssdr_test_params_t test_parameters) : _desc(desc), _kernel_ptr(kernel_ptr), _name(name), _test_parameters(test_parameters), _puppet_master_name(puppet_master_name){};
 };

 /************************************************
@@ -130,58 +129,57 @@ void random_floats(float *buf, unsigned n);

 bool run_volk_gnsssdr_tests(
     volk_gnsssdr_func_desc_t,
-    void(*)(),
+    void (*)(),
     std::string,
     volk_gnsssdr_test_params_t,
     std::vector<volk_gnsssdr_test_results_t> *results = NULL,
-    std::string puppet_master_name = "NULL"
-    );
+    std::string puppet_master_name = "NULL");

 bool run_volk_gnsssdr_tests(
-    volk_gnsssdr_func_desc_t,
-    void(*)(),
-    std::string,
-    float,
-    lv_32fc_t,
-    unsigned int,
-    unsigned int,
-    std::vector<volk_gnsssdr_test_results_t> *results = NULL,
-    std::string puppet_master_name = "NULL",
-    bool benchmark_mode = false
-);
+    volk_gnsssdr_func_desc_t,
+    void (*)(),
+    std::string,
+    float,
+    lv_32fc_t,
+    unsigned int,
+    unsigned int,
+    std::vector<volk_gnsssdr_test_results_t> *results = NULL,
+    std::string puppet_master_name = "NULL",
+    bool benchmark_mode = false);

-#define VOLK_RUN_TESTS(func, tol, scalar, len, iter) \
-    BOOST_AUTO_TEST_CASE(func##_test) { \
-        BOOST_CHECK_EQUAL(run_volk_gnsssdr_tests( \
-            func##_get_func_desc(), (void (*)())func##_manual, \
-            std::string(#func), tol, scalar, len, iter, 0, "NULL"), \
-            0); \
+#define VOLK_RUN_TESTS(func, tol, scalar, len, iter)                      \
+    BOOST_AUTO_TEST_CASE(func##_test)                                     \
+    {                                                                     \
+        BOOST_CHECK_EQUAL(run_volk_gnsssdr_tests(                         \
+                              func##_get_func_desc(), (void (*)())func##_manual, \
+                              std::string(#func), tol, scalar, len, iter, 0, "NULL"), \
+            0);                                                           \
     }
 #define VOLK_PROFILE(func, test_params, results) run_volk_gnsssdr_tests(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), test_params, results, "NULL")
 #define VOLK_PUPPET_PROFILE(func, puppet_master_func, test_params, results) run_volk_gnsssdr_tests(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), test_params,
results, std::string(#puppet_master_func)) -typedef void (*volk_gnsssdr_fn_1arg)(void *, unsigned int, const char*); //one input, operate in place -typedef void (*volk_gnsssdr_fn_2arg)(void *, void *, unsigned int, const char*); -typedef void (*volk_gnsssdr_fn_3arg)(void *, void *, void *, unsigned int, const char*); -typedef void (*volk_gnsssdr_fn_4arg)(void *, void *, void *, void *, unsigned int, const char*); -typedef void (*volk_gnsssdr_fn_1arg_s32f)(void *, float, unsigned int, const char*); //one input vector, one scalar float input -typedef void (*volk_gnsssdr_fn_2arg_s32f)(void *, void *, float, unsigned int, const char*); -typedef void (*volk_gnsssdr_fn_3arg_s32f)(void *, void *, void *, float, unsigned int, const char*); -typedef void (*volk_gnsssdr_fn_1arg_s32fc)(void *, lv_32fc_t, unsigned int, const char*); //one input vector, one scalar float input -typedef void (*volk_gnsssdr_fn_2arg_s32fc)(void *, void *, lv_32fc_t, unsigned int, const char*); -typedef void (*volk_gnsssdr_fn_3arg_s32fc)(void *, void *, void *, lv_32fc_t, unsigned int, const char*); +typedef void (*volk_gnsssdr_fn_1arg)(void *, unsigned int, const char *); //one input, operate in place +typedef void (*volk_gnsssdr_fn_2arg)(void *, void *, unsigned int, const char *); +typedef void (*volk_gnsssdr_fn_3arg)(void *, void *, void *, unsigned int, const char *); +typedef void (*volk_gnsssdr_fn_4arg)(void *, void *, void *, void *, unsigned int, const char *); +typedef void (*volk_gnsssdr_fn_1arg_s32f)(void *, float, unsigned int, const char *); //one input vector, one scalar float input +typedef void (*volk_gnsssdr_fn_2arg_s32f)(void *, void *, float, unsigned int, const char *); +typedef void (*volk_gnsssdr_fn_3arg_s32f)(void *, void *, void *, float, unsigned int, const char *); +typedef void (*volk_gnsssdr_fn_1arg_s32fc)(void *, lv_32fc_t, unsigned int, const char *); //one input vector, one scalar float input +typedef void (*volk_gnsssdr_fn_2arg_s32fc)(void *, void *, lv_32fc_t, unsigned int, const char *); +typedef void (*volk_gnsssdr_fn_3arg_s32fc)(void *, void *, void *, lv_32fc_t, unsigned int, const char *); //ADDED BY GNSS-SDR. 
START
-typedef void (*volk_gnsssdr_fn_1arg_s8i)(void *, char, unsigned int, const char*); //one input vector, one scalar char input
-typedef void (*volk_gnsssdr_fn_2arg_s8i)(void *, void *, char, unsigned int, const char*);
-typedef void (*volk_gnsssdr_fn_3arg_s8i)(void *, void *, void *, char, unsigned int, const char*);
-typedef void (*volk_gnsssdr_fn_1arg_s8ic)(void *, lv_8sc_t, unsigned int, const char*); //one input vector, one scalar lv_8sc_t vector input
-typedef void (*volk_gnsssdr_fn_2arg_s8ic)(void *, void *, lv_8sc_t, unsigned int, const char*);
-typedef void (*volk_gnsssdr_fn_3arg_s8ic)(void *, void *, void *, lv_8sc_t, unsigned int, const char*);
-typedef void (*volk_gnsssdr_fn_1arg_s16ic)(void *, lv_16sc_t, unsigned int, const char*); //one input vector, one scalar lv_16sc_t vector input
-typedef void (*volk_gnsssdr_fn_2arg_s16ic)(void *, void *, lv_16sc_t, unsigned int, const char*);
-typedef void (*volk_gnsssdr_fn_3arg_s16ic)(void *, void *, void *, lv_16sc_t, unsigned int, const char*);
+typedef void (*volk_gnsssdr_fn_1arg_s8i)(void *, char, unsigned int, const char *);  //one input vector, one scalar char input
+typedef void (*volk_gnsssdr_fn_2arg_s8i)(void *, void *, char, unsigned int, const char *);
+typedef void (*volk_gnsssdr_fn_3arg_s8i)(void *, void *, void *, char, unsigned int, const char *);
+typedef void (*volk_gnsssdr_fn_1arg_s8ic)(void *, lv_8sc_t, unsigned int, const char *);  //one input vector, one scalar lv_8sc_t vector input
+typedef void (*volk_gnsssdr_fn_2arg_s8ic)(void *, void *, lv_8sc_t, unsigned int, const char *);
+typedef void (*volk_gnsssdr_fn_3arg_s8ic)(void *, void *, void *, lv_8sc_t, unsigned int, const char *);
+typedef void (*volk_gnsssdr_fn_1arg_s16ic)(void *, lv_16sc_t, unsigned int, const char *);  //one input vector, one scalar lv_16sc_t vector input
+typedef void (*volk_gnsssdr_fn_2arg_s16ic)(void *, void *, lv_16sc_t, unsigned int, const char *);
+typedef void (*volk_gnsssdr_fn_3arg_s16ic)(void *, void *, void *, lv_16sc_t, unsigned int, const char *);
 //ADDED BY GNSS-SDR. END

-#endif // GNSS_SDR_VOLK_QA_UTILS_H
+#endif  // GNSS_SDR_VOLK_QA_UTILS_H
diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/testqa.cc b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/testqa.cc
index 6e1f0fb61..7e22442da 100644
--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/testqa.cc
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/testqa.cc
@@ -18,16 +18,16 @@
  */


-#include "kernel_tests.h"                       // for init_test_list
-#include "qa_utils.h"                           // for volk_gnsssdr_test_case_t, volk_gnsssdr_test_results_t
+#include "kernel_tests.h"  // for init_test_list
+#include "qa_utils.h"      // for volk_gnsssdr_test_case_t, volk_gnsssdr_test_results_t
 #include "volk_gnsssdr/volk_gnsssdr_complex.h"  // for lv_32fc_t
-#include <cstdbool>                             // for bool, false, true
-#include <iostream>                             // for operator<<, basic_ostream, endl, char...
-#include <fstream>                              // IWYU pragma: keep
-#include <map>                                  // for map, map<>::iterator, _Rb_tree_iterator
-#include <string>                               // for string, operator<<
-#include <utility>                              // for pair
-#include <vector>                               // for vector
+#include <cstdbool>  // for bool, false, true
+#include <iostream>  // for operator<<, basic_ostream, endl, char...
+#include <fstream>   // IWYU pragma: keep
+#include <map>       // for map, map<>::iterator, _Rb_tree_iterator
+#include <string>    // for string, operator<<
+#include <utility>   // for pair
+#include <vector>    // for vector


 void print_qa_xml(std::vector<volk_gnsssdr_test_results_t> results, unsigned int nfails);

@@ -49,38 +49,44 @@ int main()
     std::vector<std::string> qa_failures;
     std::vector<volk_gnsssdr_test_results_t> results;
     // Test every kernel reporting failures when they occur
-    for(unsigned int ii = 0; ii < test_cases.size(); ++ii) {
-        bool qa_result = false;
-        volk_gnsssdr_test_case_t test_case = test_cases[ii];
-        try {
-            qa_result = run_volk_gnsssdr_tests(test_case.desc(), test_case.kernel_ptr(), test_case.name(),
-                test_case.test_parameters(), &results, test_case.puppet_master_name());
-        }
-        catch(...) {
-            // TODO: what exceptions might we need to catch and how do we handle them?
-            std::cerr << "Exception found on kernel: " << test_case.name() << std::endl;
-            qa_result = false;
-        }
+    for (unsigned int ii = 0; ii < test_cases.size(); ++ii)
+        {
+            bool qa_result = false;
+            volk_gnsssdr_test_case_t test_case = test_cases[ii];
+            try
+                {
+                    qa_result = run_volk_gnsssdr_tests(test_case.desc(), test_case.kernel_ptr(), test_case.name(),
+                        test_case.test_parameters(), &results, test_case.puppet_master_name());
+                }
+            catch (...)
+                {
+                    // TODO: what exceptions might we need to catch and how do we handle them?
+                    std::cerr << "Exception found on kernel: " << test_case.name() << std::endl;
+                    qa_result = false;
+                }

-        if(qa_result) {
-            std::cerr << "Failure on " << test_case.name() << std::endl;
-            qa_failures.push_back(test_case.name());
+            if (qa_result)
+                {
+                    std::cerr << "Failure on " << test_case.name() << std::endl;
+                    qa_failures.push_back(test_case.name());
+                }
         }
-    }

     // Generate XML results
     print_qa_xml(results, qa_failures.size());

     // Summarize QA results
     std::cerr << "Kernel QA finished: " << qa_failures.size() << " failures out of "
-        << test_cases.size() << " tests." << std::endl;
-    if(qa_failures.size() > 0) {
-        std::cerr << "The following kernels failed QA:" << std::endl;
-        for(unsigned int ii = 0; ii < qa_failures.size(); ++ii) {
-            std::cerr << "    " << qa_failures[ii] << std::endl;
+              << test_cases.size() << " tests." << std::endl;
+    if (qa_failures.size() > 0)
+        {
+            std::cerr << "The following kernels failed QA:" << std::endl;
+            for (unsigned int ii = 0; ii < qa_failures.size(); ++ii)
+                {
+                    std::cerr << "    " << qa_failures[ii] << std::endl;
+                }
+            qa_ret_val = 1;
         }
-        qa_ret_val = 1;
-    }

     return qa_ret_val;
 }
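The writer that follows emits a JUnit-style report, which is what most CI dashboards ingest: one testsuite per kernel, one testcase per tested implementation, and a failure child element when an arch disagreed with the generic result. A representative fragment of the emitted file (kernel, arch, and timing values invented for illustration):

    <?xml version="1.0" encoding="UTF-8"?>
    <testsuites name="kernels" tests="2" failures="1" id="1">
      <testsuite name="volk_gnsssdr_8i_x2_add_8i">
        <testcase name="generic" classname="volk_gnsssdr_8i_x2_add_8i" time="12.7">
        </testcase>
        <testcase name="a_sse2" classname="volk_gnsssdr_8i_x2_add_8i" time="3.4">
          <failure message="fail on arch a_sse2"></failure>
        </testcase>
      </testsuite>
    </testsuites>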
@@ -95,34 +101,34 @@ void print_qa_xml(std::vector<volk_gnsssdr_test_results_t> results, unsigned int
     qa_file.open(".unittest/kernels.xml");

     qa_file << "<?xml version=\"1.0\" encoding=\"UTF-8\"?>" << std::endl;
-    qa_file << "<testsuites name=\"kernels\" tests=\"" << results.size() << "\" failures=\"" << nfails << "\" id=\"1\">" << std::endl;
+    qa_file << "<testsuites name=\"kernels\" tests=\"" << results.size() << "\" failures=\"" << nfails << "\" id=\"1\">" << std::endl;

     // Results are in a vector by kernel. Each element has a result
     // map containing time and arch name with test result
-    for(unsigned int ii=0; ii < results.size(); ++ii) {
-        volk_gnsssdr_test_results_t result = results[ii];
-        qa_file << "  <testsuite name=\"" << result.name << "\">" << std::endl;
+    for (unsigned int ii = 0; ii < results.size(); ++ii)
+        {
+            volk_gnsssdr_test_results_t result = results[ii];
+            qa_file << "  <testsuite name=\"" << result.name << "\">" << std::endl;

-        std::map<std::string, volk_gnsssdr_test_time_t>::iterator kernel_time_pair;
-        for(kernel_time_pair = result.results.begin(); kernel_time_pair != result.results.end(); ++kernel_time_pair) {
-            volk_gnsssdr_test_time_t test_time = kernel_time_pair->second;
-            qa_file << "    <testcase name=\"" << test_time.name << "\" classname=\"" << result.config_name << "\" time=\"" << test_time.time << "\">" << std::endl;
-            if(!test_time.pass)
-                qa_file << "      <failure message=\"fail on arch " << test_time.name << "\">" <<
-                    "</failure>" << std::endl;
-            qa_file << "    </testcase>" << std::endl;
+            std::map<std::string, volk_gnsssdr_test_time_t>::iterator kernel_time_pair;
+            for (kernel_time_pair = result.results.begin(); kernel_time_pair != result.results.end(); ++kernel_time_pair)
+                {
+                    volk_gnsssdr_test_time_t test_time = kernel_time_pair->second;
+                    qa_file << "    <testcase name=\"" << test_time.name << "\" classname=\"" << result.config_name << "\" time=\"" << test_time.time << "\">" << std::endl;
+                    if (!test_time.pass)
+                        qa_file << "      <failure message=\"fail on arch " << test_time.name << "\">"
+                                << "</failure>" << std::endl;
+                    qa_file << "    </testcase>" << std::endl;
+                }
+            qa_file << "  </testsuite>" << std::endl;
         }
-        qa_file << "  </testsuite>" << std::endl;
-    }
     qa_file << "</testsuites>" << std::endl;

     qa_file.close();
-}
-
+}
diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/volk_gnsssdr_malloc.c b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/volk_gnsssdr_malloc.c
index 3f1bcdd44..d92325f48 100644
--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/volk_gnsssdr_malloc.c
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/volk_gnsssdr_malloc.c
@@ -43,15 +43,16 @@ void *volk_gnsssdr_malloc(size_t size, size_t alignment)
         return malloc(size);

     int err = posix_memalign(&ptr, alignment, size);
-    if(err == 0)
+    if (err == 0)
         {
             return ptr;
         }
     else
         {
             fprintf(stderr,
-                    "VOLK_GNSSSDR: Error allocating memory "
-                    "(posix_memalign: error %d: %s)\n", err, strerror(err));
+                "VOLK_GNSSSDR: Error allocating memory "
+                "(posix_memalign: error %d: %s)\n",
+                err, strerror(err));
             return NULL;
         }
 }
@@ -68,7 +69,7 @@ void volk_gnsssdr_free(void *ptr)
 void *volk_gnsssdr_malloc(size_t size, size_t alignment)
 {
     void *ptr = _aligned_malloc(size, alignment);
-    if(ptr == NULL)
+    if (ptr == NULL)
         {
             fprintf(stderr, "VOLK_GNSSSDR: Error allocating memory (_aligned_malloc)\n");
         }
@@ -81,7 +82,7 @@ void volk_gnsssdr_free(void *ptr)
 }

 // No standard handlers; we'll do it ourselves.
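The fallback guarded by the #else below implements aligned allocation with no OS support: over-allocate, round the user pointer up to the next multiple of the alignment, and keep enough bookkeeping just below the returned address for the matching free to recover what malloc() actually returned (the patch stores a struct block_info header and pads by 2 * alignment - 1 bytes). The same trick in a self-contained form, with a plain void* stash instead of the struct; the names and the stash layout here are ours, for illustration only:

    #include <cstdint>
    #include <cstdlib>

    // Round the raw pointer up to 'align' (a power of two), keeping the
    // original pointer in the slot just below the address we hand out.
    static void *aligned_malloc_sketch(std::size_t size, std::size_t align)
    {
        void *raw = std::malloc(size + align - 1 + sizeof(void *));
        if (raw == NULL) return NULL;
        std::uintptr_t user = ((std::uintptr_t)raw + sizeof(void *) + align - 1) & ~(std::uintptr_t)(align - 1);
        ((void **)user)[-1] = raw;  // stash for the matching free
        return (void *)user;
    }

    static void aligned_free_sketch(void *ptr)
    {
        if (ptr) std::free(((void **)ptr)[-1]);  // release what malloc() returned
    }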
-#else // _POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600 || HAVE_POSIX_MEMALIGN +#else // _POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600 || HAVE_POSIX_MEMALIGN struct block_info { @@ -102,7 +103,7 @@ volk_gnsssdr_malloc(size_t size, size_t alignment) real = malloc(size + (2 * alignment - 1)); /* Get pointer to the various zones */ - user = (void *)((((uintptr_t) real) + sizeof(struct block_info) + alignment - 1) & ~(alignment - 1)); + user = (void *)((((uintptr_t)real) + sizeof(struct block_info) + alignment - 1) & ~(alignment - 1)); info = (struct block_info *)(((uintptr_t)user) - sizeof(struct block_info)); /* Store the info for the free */ @@ -112,8 +113,7 @@ volk_gnsssdr_malloc(size_t size, size_t alignment) return user; } -void -volk_gnsssdr_free(void *ptr) +void volk_gnsssdr_free(void *ptr) { struct block_info *info; @@ -124,6 +124,6 @@ volk_gnsssdr_free(void *ptr) free(info->real); } -#endif // _POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600 || HAVE_POSIX_MEMALIGN +#endif // _POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600 || HAVE_POSIX_MEMALIGN //#endif // _ISOC11_SOURCE diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/volk_gnsssdr_prefs.c b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/volk_gnsssdr_prefs.c index b77aed467..b9a55a284 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/volk_gnsssdr_prefs.c +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/volk_gnsssdr_prefs.c @@ -26,16 +26,17 @@ void volk_gnsssdr_get_config_path(char *path) { if (!path) return; const char *suffix = "/.volk_gnsssdr/volk_gnsssdr_config"; - const char *suffix2 = "/volk_gnsssdr/volk_gnsssdr_config"; //non-hidden + const char *suffix2 = "/volk_gnsssdr/volk_gnsssdr_config"; // non-hidden char *home = NULL; //allows config redirection via env variable home = getenv("VOLK_CONFIGPATH"); - if(home!=NULL){ - strncpy(path,home,512); - strcat(path,suffix2); - return; - } + if (home != NULL) + { + strncpy(path, home, 512); + strcat(path, suffix2); + return; + } if (home == NULL) home = getenv("HOME"); if (home == NULL) home = getenv("APPDATA"); @@ -57,16 +58,16 @@ size_t volk_gnsssdr_load_preferences(volk_gnsssdr_arch_pref_t **prefs_res) //get the config path volk_gnsssdr_get_config_path(path); - if (!path[0]) return n_arch_prefs; //no prefs found + if (!path[0]) return n_arch_prefs; //no prefs found config_file = fopen(path, "r"); - if(!config_file) return n_arch_prefs; //no prefs found + if (!config_file) return n_arch_prefs; //no prefs found //reset the file pointer and write the prefs into volk_gnsssdr_arch_prefs - while(fgets(line, sizeof(line), config_file) != NULL) + while (fgets(line, sizeof(line), config_file) != NULL) { - prefs = (volk_gnsssdr_arch_pref_t *) realloc(prefs, (n_arch_prefs+1) * sizeof(*prefs)); + prefs = (volk_gnsssdr_arch_pref_t *)realloc(prefs, (n_arch_prefs + 1) * sizeof(*prefs)); volk_gnsssdr_arch_pref_t *p = prefs + n_arch_prefs; - if(sscanf(line, "%s %s %s", p->name, p->impl_a, p->impl_u) == 3 && !strncmp(p->name, "volk_gnsssdr_", 5)) + if (sscanf(line, "%s %s %s", p->name, p->impl_a, p->impl_u) == 3 && !strncmp(p->name, "volk_gnsssdr_", 5)) { n_arch_prefs++; } diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/volk_gnsssdr_rank_archs.c b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/volk_gnsssdr_rank_archs.c index d1871426d..96fa4e77e 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/volk_gnsssdr_rank_archs.c +++ 
b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/volk_gnsssdr_rank_archs.c @@ -29,7 +29,7 @@ inline unsigned __popcnt(unsigned num) { unsigned pop = 0; - while(num) + while (num) { if (num & 0x1) pop++; num >>= 1; @@ -39,15 +39,15 @@ inline unsigned __popcnt(unsigned num) #endif int volk_gnsssdr_get_index( - const char *impl_names[], //list of implementations by name - const size_t n_impls, //number of implementations available - const char *impl_name //the implementation name to find - ) + const char *impl_names[], //list of implementations by name + const size_t n_impls, //number of implementations available + const char *impl_name //the implementation name to find +) { unsigned int i; for (i = 0; i < n_impls; i++) { - if(!strncmp(impl_names[i], impl_name, 20)) + if (!strncmp(impl_names[i], impl_name, 20)) { return i; } @@ -55,24 +55,24 @@ int volk_gnsssdr_get_index( //TODO return -1; //something terrible should happen here fprintf(stderr, "VOLK_GNSSSDR warning: no arch found, returning generic impl\n"); - return volk_gnsssdr_get_index(impl_names, n_impls, "generic"); //but we'll fake it for now + return volk_gnsssdr_get_index(impl_names, n_impls, "generic"); //but we'll fake it for now } int volk_gnsssdr_rank_archs( - const char *kern_name, //name of the kernel to rank - const char *impl_names[], //list of implementations by name - const int* impl_deps, //requirement mask per implementation - const bool* alignment, //alignment status of each implementation - size_t n_impls, //number of implementations available - const bool align //if false, filter aligned implementations + const char *kern_name, //name of the kernel to rank + const char *impl_names[], //list of implementations by name + const int *impl_deps, //requirement mask per implementation + const bool *alignment, //alignment status of each implementation + size_t n_impls, //number of implementations available + const bool align //if false, filter aligned implementations ) { size_t i; static volk_gnsssdr_arch_pref_t *volk_gnsssdr_arch_prefs; static size_t n_arch_prefs = 0; static int prefs_loaded = 0; - if(!prefs_loaded) + if (!prefs_loaded) { n_arch_prefs = volk_gnsssdr_load_preferences(&volk_gnsssdr_arch_prefs); prefs_loaded = 1; @@ -81,17 +81,17 @@ int volk_gnsssdr_rank_archs( // If we've defined VOLK_GENERIC to be anything, always return the // 'generic' kernel. Used in GR's QA code. char *gen_env = getenv("VOLK_GENERIC"); - if(gen_env) + if (gen_env) { return volk_gnsssdr_get_index(impl_names, n_impls, "generic"); } //now look for the function name in the prefs list - for(i = 0; i < n_arch_prefs; i++) + for (i = 0; i < n_arch_prefs; i++) { - if(!strncmp(kern_name, volk_gnsssdr_arch_prefs[i].name, sizeof(volk_gnsssdr_arch_prefs[i].name))) //found it + if (!strncmp(kern_name, volk_gnsssdr_arch_prefs[i].name, sizeof(volk_gnsssdr_arch_prefs[i].name))) //found it { - const char *impl_name = align? volk_gnsssdr_arch_prefs[i].impl_a : volk_gnsssdr_arch_prefs[i].impl_u; + const char *impl_name = align ? 
volk_gnsssdr_arch_prefs[i].impl_a : volk_gnsssdr_arch_prefs[i].impl_u; return volk_gnsssdr_get_index(impl_names, n_impls, impl_name); } } @@ -101,7 +101,7 @@ int volk_gnsssdr_rank_archs( size_t best_index_u = 0; int best_value_a = -1; int best_value_u = -1; - for(i = 0; i < n_impls; i++) + for (i = 0; i < n_impls; i++) { const signed val = __popcnt(impl_deps[i]); if (alignment[i] && val > best_value_a) diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/volk_gnsssdr_rank_archs.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/volk_gnsssdr_rank_archs.h index 312fb9f47..ba0638a54 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/volk_gnsssdr_rank_archs.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/volk_gnsssdr_rank_archs.h @@ -23,23 +23,24 @@ #include #ifdef __cplusplus -extern "C" { +extern "C" +{ #endif -int volk_gnsssdr_get_index( - const char *impl_names[], //list of implementations by name - const size_t n_impls, //number of implementations available - const char *impl_name //the implementation name to find -); + int volk_gnsssdr_get_index( + const char *impl_names[], //list of implementations by name + const size_t n_impls, //number of implementations available + const char *impl_name //the implementation name to find + ); -int volk_gnsssdr_rank_archs( - const char *kern_name, //name of the kernel to rank - const char *impl_names[], //list of implementations by name - const int* impl_deps, //requirement mask per implementation - const bool* alignment, //alignment status of each implementation - size_t n_impls, //number of implementations available - const bool align //if false, filter aligned implementations -); + int volk_gnsssdr_rank_archs( + const char *kern_name, //name of the kernel to rank + const char *impl_names[], //list of implementations by name + const int *impl_deps, //requirement mask per implementation + const bool *alignment, //alignment status of each implementation + size_t n_impls, //number of implementations available + const bool align //if false, filter aligned implementations + ); #ifdef __cplusplus } diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr.tmpl.c b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr.tmpl.c index 95f5f057c..482f0e461 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr.tmpl.c +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr.tmpl.c @@ -31,80 +31,90 @@ static intptr_t __alignment_mask = 0; struct volk_gnsssdr_machine *get_machine(void) { - extern struct volk_gnsssdr_machine *volk_gnsssdr_machines[]; - extern unsigned int n_volk_gnsssdr_machines; - static struct volk_gnsssdr_machine *machine = NULL; + extern struct volk_gnsssdr_machine *volk_gnsssdr_machines[]; + extern unsigned int n_volk_gnsssdr_machines; + static struct volk_gnsssdr_machine *machine = NULL; - if(machine != NULL) - return machine; - else { - unsigned int max_score = 0; - unsigned int i; - struct volk_gnsssdr_machine *max_machine = NULL; - for(i=0; icaps & (~volk_gnsssdr_get_lvarch()))) { - if(volk_gnsssdr_machines[i]->caps > max_score) { - max_score = volk_gnsssdr_machines[i]->caps; - max_machine = volk_gnsssdr_machines[i]; + if (machine != NULL) + return machine; + else + { + unsigned int max_score = 0; + unsigned int i; + struct volk_gnsssdr_machine *max_machine = NULL; + for (i = 0; i < n_volk_gnsssdr_machines; i++) + { + if (!(volk_gnsssdr_machines[i]->caps & (~volk_gnsssdr_get_lvarch()))) 
+ { + if (volk_gnsssdr_machines[i]->caps > max_score) + { + max_score = volk_gnsssdr_machines[i]->caps; + max_machine = volk_gnsssdr_machines[i]; + } + } + } + machine = max_machine; + //printf("Using Volk machine: %s\n", machine->name); + __alignment = machine->alignment; + __alignment_mask = (intptr_t)(__alignment - 1); + return machine; } - } - } - machine = max_machine; - //printf("Using Volk machine: %s\n", machine->name); - __alignment = machine->alignment; - __alignment_mask = (intptr_t)(__alignment-1); - return machine; - } } void volk_gnsssdr_list_machines(void) { - extern struct volk_gnsssdr_machine *volk_gnsssdr_machines[]; - extern unsigned int n_volk_gnsssdr_machines; + extern struct volk_gnsssdr_machine *volk_gnsssdr_machines[]; + extern unsigned int n_volk_gnsssdr_machines; - unsigned int i; - for(i=0; icaps & (~volk_gnsssdr_get_lvarch()))) { - printf("%s;", volk_gnsssdr_machines[i]->name); - } - } - printf("\n"); + unsigned int i; + for (i = 0; i < n_volk_gnsssdr_machines; i++) + { + if (!(volk_gnsssdr_machines[i]->caps & (~volk_gnsssdr_get_lvarch()))) + { + printf("%s;", volk_gnsssdr_machines[i]->name); + } + } + printf("\n"); } -const char* volk_gnsssdr_get_machine(void) +const char *volk_gnsssdr_get_machine(void) { - extern struct volk_gnsssdr_machine *volk_gnsssdr_machines[]; - extern unsigned int n_volk_gnsssdr_machines; - static struct volk_gnsssdr_machine *machine = NULL; + extern struct volk_gnsssdr_machine *volk_gnsssdr_machines[]; + extern unsigned int n_volk_gnsssdr_machines; + static struct volk_gnsssdr_machine *machine = NULL; - if(machine != NULL) - return machine->name; - else { - unsigned int max_score = 0; - unsigned int i; - struct volk_gnsssdr_machine *max_machine = NULL; - for(i=0; icaps & (~volk_gnsssdr_get_lvarch()))) { - if(volk_gnsssdr_machines[i]->caps > max_score) { - max_score = volk_gnsssdr_machines[i]->caps; - max_machine = volk_gnsssdr_machines[i]; + if (machine != NULL) + return machine->name; + else + { + unsigned int max_score = 0; + unsigned int i; + struct volk_gnsssdr_machine *max_machine = NULL; + for (i = 0; i < n_volk_gnsssdr_machines; i++) + { + if (!(volk_gnsssdr_machines[i]->caps & (~volk_gnsssdr_get_lvarch()))) + { + if (volk_gnsssdr_machines[i]->caps > max_score) + { + max_score = volk_gnsssdr_machines[i]->caps; + max_machine = volk_gnsssdr_machines[i]; + } + } + } + machine = max_machine; + return machine->name; } - } - } - machine = max_machine; - return machine->name; - } } size_t volk_gnsssdr_get_alignment(void) { - get_machine(); //ensures alignment is set + get_machine(); //ensures alignment is set return __alignment; } bool volk_gnsssdr_is_aligned(const void *ptr) { - return ((intptr_t)(ptr) & __alignment_mask) == 0; + return ((intptr_t)(ptr)&__alignment_mask) == 0; } #define LV_HAVE_GENERIC @@ -113,13 +123,12 @@ bool volk_gnsssdr_is_aligned(const void *ptr) %for kern in kernels: %if kern.has_dispatcher: -#include //pulls in the dispatcher +#include //pulls in the dispatcher %endif static inline void __${kern.name}_d(${kern.arglist_full}) { - %if kern.has_dispatcher: - ${kern.name}_dispatcher(${kern.arglist_names}); + % if kern.has_dispatcher : ${kern.name} _dispatcher(${kern.arglist_names}); return; %endif @@ -131,41 +140,41 @@ static inline void __${kern.name}_d(${kern.arglist_full}) %endfor 0<% end_open_parens = ')'*num_open_parens %>${end_open_parens} )){ - ${kern.name}_a(${kern.arglist_names}); + ${kern.name} _a(${kern.arglist_names}); } else{ - ${kern.name}_u(${kern.arglist_names}); + ${kern.name} 
_u(${kern.arglist_names}); } } static inline void __init_${kern.name}(void) { - const char *name = get_machine()->${kern.name}_name; - const char **impl_names = get_machine()->${kern.name}_impl_names; - const int *impl_deps = get_machine()->${kern.name}_impl_deps; - const bool *alignment = get_machine()->${kern.name}_impl_alignment; - const size_t n_impls = get_machine()->${kern.name}_n_impls; - const size_t index_a = volk_gnsssdr_rank_archs(name, impl_names, impl_deps, alignment, n_impls, true/*aligned*/); - const size_t index_u = volk_gnsssdr_rank_archs(name, impl_names, impl_deps, alignment, n_impls, false/*unaligned*/); - ${kern.name}_a = get_machine()->${kern.name}_impls[index_a]; - ${kern.name}_u = get_machine()->${kern.name}_impls[index_u]; + const char *name = get_machine()->${kern.name} _name; + const char **impl_names = get_machine()->${kern.name} _impl_names; + const int *impl_deps = get_machine()->${kern.name} _impl_deps; + const bool *alignment = get_machine()->${kern.name} _impl_alignment; + const size_t n_impls = get_machine()->${kern.name} _n_impls; + const size_t index_a = volk_gnsssdr_rank_archs(name, impl_names, impl_deps, alignment, n_impls, true /*aligned*/); + const size_t index_u = volk_gnsssdr_rank_archs(name, impl_names, impl_deps, alignment, n_impls, false /*unaligned*/); + ${kern.name} _a = get_machine()->${kern.name} _impls[index_a]; + ${kern.name} _u = get_machine()->${kern.name} _impls[index_u]; - assert(${kern.name}_a); - assert(${kern.name}_u); + assert(${kern.name} _a); + assert(${kern.name} _u); - ${kern.name} = &__${kern.name}_d; + ${kern.name} = &__${kern.name} _d; } -static inline void __${kern.name}_a(${kern.arglist_full}) +static inline void __${kern.name} _a(${kern.arglist_full}) { __init_${kern.name}(); - ${kern.name}_a(${kern.arglist_names}); + ${kern.name} _a(${kern.arglist_names}); } -static inline void __${kern.name}_u(${kern.arglist_full}) +static inline void __${kern.name} _u(${kern.arglist_full}) { __init_${kern.name}(); - ${kern.name}_u(${kern.arglist_names}); + ${kern.name} _u(${kern.arglist_names}); } static inline void __${kern.name}(${kern.arglist_full}) @@ -174,34 +183,32 @@ static inline void __${kern.name}(${kern.arglist_full}) ${kern.name}(${kern.arglist_names}); } -${kern.pname} ${kern.name}_a = &__${kern.name}_a; -${kern.pname} ${kern.name}_u = &__${kern.name}_u; -${kern.pname} ${kern.name} = &__${kern.name}; +${kern.pname} ${kern.name} _a = &__${kern.name} _a; +${kern.pname} ${kern.name} _u = &__${kern.name} _u; +${kern.pname} ${kern.name} = &__${kern.name}; -void ${kern.name}_manual(${kern.arglist_full}, const char* impl_name) +void ${kern.name} _manual(${kern.arglist_full}, const char *impl_name) { const int index = volk_gnsssdr_get_index( - get_machine()->${kern.name}_impl_names, - get_machine()->${kern.name}_n_impls, - impl_name - ); - get_machine()->${kern.name}_impls[index]( - ${kern.arglist_names} - ); + get_machine()->${kern.name} _impl_names, + get_machine()->${kern.name} _n_impls, + impl_name); + get_machine()->${kern.name} _impls[index]( + ${kern.arglist_names}); } -volk_gnsssdr_func_desc_t ${kern.name}_get_func_desc(void) { - const char **impl_names = get_machine()->${kern.name}_impl_names; - const int *impl_deps = get_machine()->${kern.name}_impl_deps; - const bool *alignment = get_machine()->${kern.name}_impl_alignment; - const size_t n_impls = get_machine()->${kern.name}_n_impls; +volk_gnsssdr_func_desc_t ${kern.name} _get_func_desc(void) +{ + const char **impl_names = get_machine()->${kern.name} _impl_names; + 
const int *impl_deps = get_machine()->${kern.name} _impl_deps; + const bool *alignment = get_machine()->${kern.name} _impl_alignment; + const size_t n_impls = get_machine()->${kern.name} _n_impls; volk_gnsssdr_func_desc_t desc = { impl_names, impl_deps, alignment, - n_impls - }; + n_impls}; return desc; } -%endfor +% endfor diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr.tmpl.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr.tmpl.h index 556d67f8e..133eef3c3 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr.tmpl.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr.tmpl.h @@ -42,7 +42,7 @@ typedef struct volk_gnsssdr_func_desc VOLK_API void volk_gnsssdr_list_machines(void); //! Returns the name of the machine this instance will use -VOLK_API const char* volk_gnsssdr_get_machine(void); +VOLK_API const char *volk_gnsssdr_get_machine(void); //! Get the machine alignment in bytes VOLK_API size_t volk_gnsssdr_get_alignment(void); @@ -74,19 +74,19 @@ VOLK_API bool volk_gnsssdr_is_aligned(const void *ptr); extern VOLK_API ${kern.pname} ${kern.name}; //! A function pointer to the fastest aligned implementation -extern VOLK_API ${kern.pname} ${kern.name}_a; +extern VOLK_API ${kern.pname} ${kern.name} _a; //! A function pointer to the fastest unaligned implementation -extern VOLK_API ${kern.pname} ${kern.name}_u; +extern VOLK_API ${kern.pname} ${kern.name} _u; //! Call into a specific implementation given by name -extern VOLK_API void ${kern.name}_manual(${kern.arglist_full}, const char* impl_name); +extern VOLK_API void ${kern.name} _manual(${kern.arglist_full}, const char *impl_name); //! Get description parameters for this kernel -extern VOLK_API volk_gnsssdr_func_desc_t ${kern.name}_get_func_desc(void); -%endfor +extern VOLK_API volk_gnsssdr_func_desc_t ${kern.name} _get_func_desc(void); +% endfor -__VOLK_DECL_END + __VOLK_DECL_END #endif /*INCLUDED_VOLK_GNSSSDR_RUNTIME*/ diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr_config_fixed.tmpl.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr_config_fixed.tmpl.h index ed55d0b58..c941407b9 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr_config_fixed.tmpl.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr_config_fixed.tmpl.h @@ -21,7 +21,8 @@ %for i, arch in enumerate(archs): //#ifndef LV_${arch.name.upper()} -#define LV_${arch.name.upper()} ${i} +#define LV_$ \ + {arch.name.upper()} $ { i } //#endif %endfor diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr_cpu.tmpl.c b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr_cpu.tmpl.c index 1d094a87a..b93781a70 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr_cpu.tmpl.c +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr_cpu.tmpl.c @@ -24,50 +24,54 @@ struct VOLK_CPU volk_gnsssdr_cpu; #if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64) - #define VOLK_CPU_x86 +#define VOLK_CPU_x86 #endif #if defined(VOLK_CPU_x86) //implement get cpuid for gcc compilers using a system or local copy of cpuid.h #if defined(__GNUC__) - #include - #define cpuid_x86(op, r) __get_cpuid(op, (unsigned int *)r+0, (unsigned int *)r+1, (unsigned int *)r+2, (unsigned int *)r+3) - #define cpuid_x86_count(op, count, regs) __cpuid_count(op, count, 
*((unsigned int*)regs), *((unsigned int*)regs+1), *((unsigned int*)regs+2), *((unsigned int*)regs+3)) +#include +#define cpuid_x86(op, r) __get_cpuid(op, (unsigned int *)r + 0, (unsigned int *)r + 1, (unsigned int *)r + 2, (unsigned int *)r + 3) +#define cpuid_x86_count(op, count, regs) __cpuid_count(op, count, *((unsigned int *)regs), *((unsigned int *)regs + 1), *((unsigned int *)regs + 2), *((unsigned int *)regs + 3)) - /* Return Intel AVX extended CPU capabilities register. +/* Return Intel AVX extended CPU capabilities register. * This function will bomb on non-AVX-capable machines, so * check for AVX capability before executing. */ - #if ((__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 2) || (__clang_major__ >= 3)) && defined(HAVE_XGETBV) - static inline unsigned long long _xgetbv(unsigned int index){ - unsigned int eax, edx; - __VOLK_ASM __VOLK_VOLATILE ("xgetbv" : "=a"(eax), "=d"(edx) : "c"(index)); - return ((unsigned long long)edx << 32) | eax; - } - #define __xgetbv() _xgetbv(0) - #else - #define __xgetbv() 0 - #endif +#if ((__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 2) || (__clang_major__ >= 3)) && defined(HAVE_XGETBV) +static inline unsigned long long _xgetbv(unsigned int index) +{ + unsigned int eax, edx; + __VOLK_ASM __VOLK_VOLATILE("xgetbv" + : "=a"(eax), "=d"(edx) + : "c"(index)); + return ((unsigned long long)edx << 32) | eax; +} +#define __xgetbv() _xgetbv(0) +#else +#define __xgetbv() 0 +#endif //implement get cpuid for MSVC compilers using __cpuid intrinsic #elif defined(_MSC_VER) && defined(HAVE_INTRIN_H) - #include - #define cpuid_x86(op, r) __cpuid(((int*)r), op) +#include +#define cpuid_x86(op, r) __cpuid(((int *)r), op) - #if defined(_XCR_XFEATURE_ENABLED_MASK) - #define __xgetbv() _xgetbv(_XCR_XFEATURE_ENABLED_MASK) - #else - #define __xgetbv() 0 - #endif +#if defined(_XCR_XFEATURE_ENABLED_MASK) +#define __xgetbv() _xgetbv(_XCR_XFEATURE_ENABLED_MASK) +#else +#define __xgetbv() 0 +#endif #else - #error "A get cpuid for volk_gnsssdr is not available on this compiler..." -#endif //defined(__GNUC__) +#error "A get cpuid for volk_gnsssdr is not available on this compiler..." 
+#endif //defined(__GNUC__) -#endif //defined(VOLK_CPU_x86) +#endif //defined(VOLK_CPU_x86) -static inline unsigned int cpuid_count_x86_bit(unsigned int level, unsigned int count, unsigned int reg, unsigned int bit) { +static inline unsigned int cpuid_count_x86_bit(unsigned int level, unsigned int count, unsigned int reg, unsigned int bit) +{ #if defined(VOLK_CPU_x86) unsigned int regs[4] = {0}; cpuid_x86_count(level, count, regs); @@ -77,10 +81,11 @@ static inline unsigned int cpuid_count_x86_bit(unsigned int level, unsigned int #endif } -static inline unsigned int cpuid_x86_bit(unsigned int reg, unsigned int op, unsigned int bit) { +static inline unsigned int cpuid_x86_bit(unsigned int reg, unsigned int op, unsigned int bit) +{ #if defined(VOLK_CPU_x86) unsigned int regs[4]; - memset(regs, 0, sizeof(unsigned int)*4); + memset(regs, 0, sizeof(unsigned int) * 4); cpuid_x86(op, regs); return regs[reg] >> bit & 0x01; #else @@ -88,10 +93,11 @@ static inline unsigned int cpuid_x86_bit(unsigned int reg, unsigned int op, unsi #endif } -static inline unsigned int check_extended_cpuid(unsigned int val) { +static inline unsigned int check_extended_cpuid(unsigned int val) +{ #if defined(VOLK_CPU_x86) unsigned int regs[4]; - memset(regs, 0, sizeof(unsigned int)*4); + memset(regs, 0, sizeof(unsigned int) * 4); cpuid_x86(0x80000000, regs); return regs[0] >= val; #else @@ -99,7 +105,8 @@ static inline unsigned int check_extended_cpuid(unsigned int val) { #endif } -static inline unsigned int get_avx_enabled(void) { +static inline unsigned int get_avx_enabled(void) +{ #if defined(VOLK_CPU_x86) return __xgetbv() & 0x6; #else @@ -107,7 +114,8 @@ static inline unsigned int get_avx_enabled(void) { #endif } -static inline unsigned int get_avx2_enabled(void) { +static inline unsigned int get_avx2_enabled(void) +{ #if defined(VOLK_CPU_x86) return __xgetbv() & 0x6; #else @@ -117,28 +125,30 @@ static inline unsigned int get_avx2_enabled(void) { //neon detection is linux specific #if defined(__arm__) && defined(__linux__) - #include - #include - #include - #define VOLK_CPU_ARM +#include +#include +#include +#define VOLK_CPU_ARM #endif -static int has_neon(void){ +static int has_neon(void) +{ #if defined(VOLK_CPU_ARM) FILE *auxvec_f; unsigned long auxvec[2]; unsigned int found_neon = 0; auxvec_f = fopen("/proc/self/auxv", "rb"); - if(!auxvec_f) return 0; + if (!auxvec_f) return 0; size_t r = 1; //so auxv is basically 32b of ID and 32b of value //so it goes like this - while(!found_neon && r) { - r = fread(auxvec, sizeof(unsigned long), 2, auxvec_f); - if((auxvec[0] == AT_HWCAP) && (auxvec[1] & HWCAP_NEON)) - found_neon = 1; - } + while (!found_neon && r) + { + r = fread(auxvec, sizeof(unsigned long), 2, auxvec_f); + if ((auxvec[0] == AT_HWCAP) && (auxvec[1] & HWCAP_NEON)) + found_neon = 1; + } fclose(auxvec_f); return found_neon; @@ -148,50 +158,59 @@ static int has_neon(void){ } %for arch in archs: -static int i_can_has_${arch.name} (void) { +static int i_can_has_${arch.name} (void) +{ %for check, params in arch.checks: if (${check}(<% joined_params = ', '.join(params)%>${joined_params}) == 0) return 0; - %endfor - return 1; + % endfor return 1; } -%endfor +% endfor #if defined(HAVE_FENV_H) - #if defined(FE_TONEAREST) - #include - static inline void set_float_rounding(void){ - fesetround(FE_TONEAREST); - } - #else - static inline void set_float_rounding(void){ - //do nothing - } - #endif -#elif defined(_MSC_VER) - #include - static inline void set_float_rounding(void){ - unsigned int cwrd; - _controlfp_s(&cwrd, 0, 
0); - _controlfp_s(&cwrd, _RC_NEAR, _MCW_RC); - } +#if defined(FE_TONEAREST) +#include + static inline void + set_float_rounding(void) +{ + fesetround(FE_TONEAREST); +} #else - static inline void set_float_rounding(void){ - //do nothing - } + static inline void + set_float_rounding(void) +{ + //do nothing +} +#endif +#elif defined(_MSC_VER) +#include + static inline void + set_float_rounding(void) +{ + unsigned int cwrd; + _controlfp_s(&cwrd, 0, 0); + _controlfp_s(&cwrd, _RC_NEAR, _MCW_RC); +} +#else + static inline void + set_float_rounding(void) +{ + //do nothing +} #endif -void volk_gnsssdr_cpu_init() { +void volk_gnsssdr_cpu_init() +{ %for arch in archs: volk_gnsssdr_cpu.has_${arch.name} = &i_can_has_${arch.name}; - %endfor - set_float_rounding(); + % endfor + set_float_rounding(); } -unsigned int volk_gnsssdr_get_lvarch() { +unsigned int volk_gnsssdr_get_lvarch() +{ unsigned int retval = 0; volk_gnsssdr_cpu_init(); %for arch in archs: retval += volk_gnsssdr_cpu.has_${arch.name}() << LV_${arch.name.upper()}; - %endfor - return retval; + % endfor return retval; } diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr_cpu.tmpl.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr_cpu.tmpl.h index 20dbac2cc..160274eba 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr_cpu.tmpl.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr_cpu.tmpl.h @@ -23,16 +23,17 @@ __VOLK_DECL_BEGIN -struct VOLK_CPU { +struct VOLK_CPU +{ %for arch in archs: int (*has_${arch.name}) (); - %endfor + % endfor }; extern struct VOLK_CPU volk_gnsssdr_cpu; -void volk_gnsssdr_cpu_init (); -unsigned int volk_gnsssdr_get_lvarch (); +void volk_gnsssdr_cpu_init(); +unsigned int volk_gnsssdr_get_lvarch(); __VOLK_DECL_END diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr_machine_xxx.tmpl.c b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr_machine_xxx.tmpl.c index c6182cb50..8e0e7ebd3 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr_machine_xxx.tmpl.c +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr_machine_xxx.tmpl.c @@ -20,7 +20,11 @@ <% arch_names = this_machine.arch_names %> %for arch in this_machine.archs: -#define LV_HAVE_${arch.name.upper()} 1 +#define LV_HAVE_$ \ + { \ + arch.name.upper() \ + } \ + 1 %endfor #include @@ -35,7 +39,9 @@ #include %endfor -struct volk_gnsssdr_machine volk_gnsssdr_machine_${this_machine.name} = { +struct volk_gnsssdr_machine volk_gnsssdr_machine_$ +{ + this_machine.name} = { <% make_arch_have_list = (' | '.join(['(1 << LV_%s)'%a.name.upper() for a in this_machine.archs])) %> ${make_arch_have_list}, <% this_machine_name = "\""+this_machine.name+"\"" %> ${this_machine_name}, ${this_machine.alignment}, diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr_machines.tmpl.c b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr_machines.tmpl.c index 1485a34e0..3e78b65e3 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr_machines.tmpl.c +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr_machines.tmpl.c @@ -22,10 +22,10 @@ struct volk_gnsssdr_machine *volk_gnsssdr_machines[] = { %for machine in machines: -#ifdef LV_MACHINE_${machine.name.upper()} +#ifdef LV_MACHINE_${machine.name.upper() } &volk_gnsssdr_machine_${machine.name}, #endif %endfor }; -unsigned int 
n_volk_gnsssdr_machines = sizeof(volk_gnsssdr_machines)/sizeof(*volk_gnsssdr_machines); +unsigned int n_volk_gnsssdr_machines = sizeof(volk_gnsssdr_machines) / sizeof(*volk_gnsssdr_machines); diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr_machines.tmpl.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr_machines.tmpl.h index 10e955e25..3e2cf8d2b 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr_machines.tmpl.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr_machines.tmpl.h @@ -27,26 +27,30 @@ __VOLK_DECL_BEGIN -struct volk_gnsssdr_machine { - const unsigned int caps; //capabilities (i.e., archs compiled into this machine, in the volk_gnsssdr_get_lvarch format) +struct volk_gnsssdr_machine +{ + const unsigned int caps; //capabilities (i.e., archs compiled into this machine, in the volk_gnsssdr_get_lvarch format) const char *name; - const size_t alignment; //the maximum byte alignment required for functions in this library + const size_t alignment; //the maximum byte alignment required for functions in this library %for kern in kernels: const char *${kern.name}_name; - const char *${kern.name}_impl_names[<%len_archs=len(archs)%>${len_archs}]; - const int ${kern.name}_impl_deps[${len_archs}]; - const bool ${kern.name}_impl_alignment[${len_archs}]; - const ${kern.pname} ${kern.name}_impls[${len_archs}]; - const size_t ${kern.name}_n_impls; - %endfor + const char *${kern.name} _impl_names[<% len_archs = len(archs) %> ${len_archs}]; + const int ${kern.name} _impl_deps[${len_archs}]; + const bool ${kern.name} _impl_alignment[${len_archs}]; + const ${kern.pname} ${kern.name} _impls[${len_archs}]; + const size_t ${kern.name} _n_impls; + % endfor }; %for machine in machines: -#ifdef LV_MACHINE_${machine.name.upper()} -extern struct volk_gnsssdr_machine volk_gnsssdr_machine_${machine.name}; +#ifdef LV_MACHINE_${machine.name.upper() } +extern struct volk_gnsssdr_machine volk_gnsssdr_machine_$ +{ + machine.name +}; #endif -%endfor +% endfor -__VOLK_DECL_END + __VOLK_DECL_END -#endif //INCLUDED_LIBVOLK_GNSSSDR_MACHINES_H +#endif //INCLUDED_LIBVOLK_GNSSSDR_MACHINES_H diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr_typedefs.tmpl.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr_typedefs.tmpl.h index def7e24c3..e28aa5392 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr_typedefs.tmpl.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr_typedefs.tmpl.h @@ -24,6 +24,6 @@ %for kern in kernels: typedef void (*${kern.pname})(${kern.arglist_types}); -%endfor +% endfor #endif /*INCLUDED_VOLK_GNSSSDR_TYPEDEFS*/
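End to end, the generated machinery above means a caller never names an implementation: the first call through a kernel's function pointer ranks the compiled implementations for the running CPU (honoring any volk_gnsssdr_config preferences) and rewrites the pointer, and volk_gnsssdr_malloc returns buffers aligned for the aligned variants. A minimal caller using one kernel from this module, volk_gnsssdr_32f_index_max_32u; treat the exact signature as an assumption to be checked against that kernel's header:

    #include "volk_gnsssdr/volk_gnsssdr.h"
    #include <stdint.h>
    #include <stdio.h>

    int main()
    {
        const unsigned int n = 1024;
        float *in = (float *)volk_gnsssdr_malloc(sizeof(float) * n, volk_gnsssdr_get_alignment());
        uint32_t *max_index = (uint32_t *)volk_gnsssdr_malloc(sizeof(uint32_t), volk_gnsssdr_get_alignment());
        for (unsigned int i = 0; i < n; i++) in[i] = (float)i;

        // First call ranks implementations and caches the winner; later
        // calls jump straight to the selected aligned/unaligned impl.
        volk_gnsssdr_32f_index_max_32u(max_index, in, n);
        printf("max at %u on machine %s\n", *max_index, volk_gnsssdr_get_machine());

        volk_gnsssdr_free(max_index);
        volk_gnsssdr_free(in);
        return 0;
    }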