From f924005733f732c2750a5d2c651356cf3893513c Mon Sep 17 00:00:00 2001 From: Carles Fernandez Date: Sat, 3 Mar 2018 11:52:27 +0100 Subject: [PATCH 1/2] Add mention to http://gnss-sdr.org/coding-style/#use-tools-for-automated-code-formatting --- CONTRIBUTING.md | 3 +++ docs/PULL_REQUEST_TEMPLATE.md | 5 +++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 12cc1328f..3268e8657 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -128,6 +128,9 @@ $ git pull --rebase upstream next ### How to submit a pull request +Before submitting your code, please be sure to apply clang-format +(see http://gnss-sdr.org/coding-style/#use-tools-for-automated-code-formatting). + When the contribution is ready, you can [submit a pull request](https://github.com/gnss-sdr/gnss-sdr/compare/). Head to your GitHub repository, switch to your `my_feature` branch, and click the diff --git a/docs/PULL_REQUEST_TEMPLATE.md b/docs/PULL_REQUEST_TEMPLATE.md index b4e04e2c1..bb69db911 100644 --- a/docs/PULL_REQUEST_TEMPLATE.md +++ b/docs/PULL_REQUEST_TEMPLATE.md @@ -5,7 +5,8 @@ Before submitting your pull request, please make sure the following is done: 2. If you are a first-time contributor, after your pull request you will be asked to sign an Individual Contributor License Agreement ([CLA](https://en.wikipedia.org/wiki/Contributor_License_Agreement)) before your code gets accepted into `master`. This license is for your protection as a Contributor as well as for the protection of [CTTC](http://www.cttc.es/); it does not change your rights to use your own contributions for any other purpose. Except for the license granted therein to CTTC and recipients of software distributed by CTTC, you reserve all right, title, and interest in and to your contributions. The information you provide in that CLA will be maintained in accordance with [CTTC's privacy policy](http://www.cttc.es/privacy/). 3. You have read the [Contributing Guidelines](https://github.com/gnss-sdr/gnss-sdr/blob/master/CONTRIBUTING.md). 4. You have read the [coding style guide](http://gnss-sdr.org/coding-style/). - 5. You have forked the [gnss-sdr upstream repository](https://github.com/gnss-sdr/gnss-sdr) and have created your branch from `next` (or any other currently living branch in the upstream repository). - 6. Please include a description of your changes here. + 5. Specifically, you have read [about clang-format](http://gnss-sdr.org/coding-style/#use-tools-for-automated-code-formatting) and you have applied it (see the usage sketch below). + 6. You have forked the [gnss-sdr upstream repository](https://github.com/gnss-sdr/gnss-sdr) and have created your branch from `next` (or any other currently living branch in the upstream repository). + 7. Please include a description of your changes here. 
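Item 5 above refers to the automated formatting step. As a minimal sketch of how that step can be run (assuming clang-format is installed and that the `.clang-format` style file described at http://gnss-sdr.org/coding-style/#use-tools-for-automated-code-formatting is present in the source tree; the file path below is just an example taken from this patch set):

```
$ clang-format -style=file -i src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/apps/volk_gnsssdr_profile.cc
$ find src -iname '*.h' -o -iname '*.cc' | xargs clang-format -style=file -i
```

The first command reformats a single file in place; the second sweeps a whole subtree. With `-style=file`, clang-format searches upward from each input file for the nearest `.clang-format` and applies that configuration, which is how a tree-wide reformatting commit such as PATCH 2/2 below can be produced.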
**Please feel free to delete this line and the above text once you have read it and in case you want to go on with your pull request.** \ No newline at end of file From 891478cf2c2097010571e55ad9b3c588fd68caff Mon Sep 17 00:00:00 2001 From: Carles Fernandez Date: Sat, 3 Mar 2018 12:09:45 +0100 Subject: [PATCH 2/2] Apply automated code formatting to volk-gnsssdr See http://gnss-sdr.org/coding-style/#use-tools-for-automated-code-formatting --- .../apps/volk_gnsssdr-config-info.cc | 52 +- .../apps/volk_gnsssdr_option_helpers.cc | 269 ++- .../apps/volk_gnsssdr_option_helpers.h | 9 +- .../volk_gnsssdr/apps/volk_gnsssdr_profile.cc | 378 +-- .../volk_gnsssdr/apps/volk_gnsssdr_profile.h | 8 +- .../volk_gnsssdr/saturation_arithmetic.h | 4 +- .../volk_gnsssdr_avx_intrinsics.h | 51 +- .../volk_gnsssdr/volk_gnsssdr_common.h | 140 +- .../volk_gnsssdr/volk_gnsssdr_complex.h | 36 +- .../volk_gnsssdr_neon_intrinsics.h | 28 +- .../include/volk_gnsssdr/volk_gnsssdr_prefs.h | 6 +- .../volk_gnsssdr/volk_gnsssdr_sine_table.h | 2048 ++++++++--------- .../volk_gnsssdr_sse3_intrinsics.h | 34 +- .../volk_gnsssdr_sse_intrinsics.h | 24 +- .../volk_gnsssdr_16i_resamplerxnpuppet_16i.h | 181 +- .../volk_gnsssdr_16i_xn_resampler_16i_xn.h | 77 +- ...nsssdr_16ic_16i_rotator_dot_prod_16ic_xn.h | 1610 ++++++------- ...dr_16ic_16i_rotator_dotprodxnpuppet_16ic.h | 261 ++- .../volk_gnsssdr_16ic_conjugate_16ic.h | 3 +- .../volk_gnsssdr_16ic_convert_32fc.h | 24 +- .../volk_gnsssdr_16ic_resampler_fast_16ic.h | 169 +- ...lk_gnsssdr_16ic_resamplerfastpuppet_16ic.h | 10 +- ..._gnsssdr_16ic_resamplerfastxnpuppet_16ic.h | 88 +- ...volk_gnsssdr_16ic_resamplerxnpuppet_16ic.h | 180 +- .../volk_gnsssdr_16ic_s32fc_x2_rotator_16ic.h | 562 ++--- .../volk_gnsssdr_16ic_x2_dot_prod_16ic.h | 115 +- .../volk_gnsssdr_16ic_x2_dot_prod_16ic_xn.h | 215 +- ...olk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic.h | 192 +- .../volk_gnsssdr_16ic_x2_multiply_16ic.h | 111 +- ...gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn.h | 914 ++++---- ...sdr_16ic_x2_rotator_dotprodxnpuppet_16ic.h | 90 +- .../volk_gnsssdr_16ic_xn_resampler_16ic_xn.h | 77 +- ...k_gnsssdr_16ic_xn_resampler_fast_16ic_xn.h | 185 +- .../volk_gnsssdr_32f_index_max_32u.h | 202 +- .../volk_gnsssdr_32f_resamplerxnpuppet_32f.h | 180 +- .../volk_gnsssdr_32f_sincos_32fc.h | 200 +- .../volk_gnsssdr_32f_xn_resampler_32f_xn.h | 94 +- ...nsssdr_32fc_32f_rotator_dot_prod_32fc_xn.h | 207 +- ...dr_32fc_32f_rotator_dotprodxnpuppet_32fc.h | 49 +- .../volk_gnsssdr_32fc_convert_16ic.h | 112 +- .../volk_gnsssdr_32fc_convert_8ic.h | 106 +- ...volk_gnsssdr_32fc_resamplerxnpuppet_32fc.h | 223 +- ...gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn.h | 148 +- ...sdr_32fc_x2_rotator_dotprodxnpuppet_32fc.h | 56 +- .../volk_gnsssdr_32fc_xn_resampler_32fc_xn.h | 114 +- .../volk_gnsssdr_64f_accumulator_64f.h | 46 +- .../volk_gnsssdr_8i_accumulator_s8i.h | 42 +- .../volk_gnsssdr_8i_index_max_16u.h | 146 +- .../volk_gnsssdr/volk_gnsssdr_8i_max_s8i.h | 108 +- .../volk_gnsssdr/volk_gnsssdr_8i_x2_add_8i.h | 26 +- .../volk_gnsssdr_8ic_conjugate_8ic.h | 13 +- .../volk_gnsssdr_8ic_magnitude_squared_8i.h | 48 +- .../volk_gnsssdr_8ic_s8ic_multiply_8ic.h | 6 +- .../volk_gnsssdr_8ic_x2_dot_prod_8ic.h | 79 +- .../volk_gnsssdr_8ic_x2_multiply_8ic.h | 12 +- .../volk_gnsssdr_8u_x2_multiply_8u.h | 16 +- .../volk_gnsssdr_s32f_sincos_32fc.h | 446 ++-- .../volk_gnsssdr_s32f_sincospuppet_32fc.h | 16 +- .../volk_gnsssdr/lib/kernel_tests.h | 26 +- .../volk_gnsssdr/lib/qa_utils.cc | 1012 ++++---- .../volk_gnsssdr/lib/qa_utils.h | 226 +- 
.../volk_gnsssdr/lib/testqa.cc | 112 +- .../volk_gnsssdr/lib/volk_gnsssdr_malloc.c | 18 +- .../volk_gnsssdr/lib/volk_gnsssdr_prefs.c | 23 +- .../lib/volk_gnsssdr_rank_archs.c | 38 +- .../lib/volk_gnsssdr_rank_archs.h | 29 +- .../volk_gnsssdr/tmpl/volk_gnsssdr.tmpl.c | 193 +- .../volk_gnsssdr/tmpl/volk_gnsssdr.tmpl.h | 14 +- .../tmpl/volk_gnsssdr_config_fixed.tmpl.h | 3 +- .../volk_gnsssdr/tmpl/volk_gnsssdr_cpu.tmpl.c | 165 +- .../volk_gnsssdr/tmpl/volk_gnsssdr_cpu.tmpl.h | 9 +- .../tmpl/volk_gnsssdr_machine_xxx.tmpl.c | 10 +- .../tmpl/volk_gnsssdr_machines.tmpl.c | 4 +- .../tmpl/volk_gnsssdr_machines.tmpl.h | 32 +- .../tmpl/volk_gnsssdr_typedefs.tmpl.h | 2 +- 75 files changed, 6642 insertions(+), 6120 deletions(-) mode change 100755 => 100644 src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_8ic.h diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/apps/volk_gnsssdr-config-info.cc b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/apps/volk_gnsssdr-config-info.cc index 3a2c7c39f..60c421be3 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/apps/volk_gnsssdr-config-info.cc +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/apps/volk_gnsssdr-config-info.cc @@ -20,30 +20,30 @@ #include <config.h> #endif -#include "volk_gnsssdr/volk_gnsssdr.h" // for volk_gnsssdr_get_alignment, volk_gnsssdr_get_machine -#include "volk_gnsssdr_option_helpers.h" // for option_list, option_t -#include <volk_gnsssdr/constants.h> // for volk_gnsssdr_available_machines, volk_gnsssdr_c_compiler ... -#include <iostream> // for operator<<, endl, cout, ostream -#include <string> // for string +#include "volk_gnsssdr/volk_gnsssdr.h" // for volk_gnsssdr_get_alignment, volk_gnsssdr_get_machine +#include "volk_gnsssdr_option_helpers.h" // for option_list, option_t +#include <volk_gnsssdr/constants.h> // for volk_gnsssdr_available_machines, volk_gnsssdr_c_compiler ... +#include <iostream> // for operator<<, endl, cout, ostream +#include <string> // for string void print_alignment() { - std::cout << "Alignment in bytes: " << volk_gnsssdr_get_alignment() << std::endl; + std::cout << "Alignment in bytes: " << volk_gnsssdr_get_alignment() << std::endl; } void print_malloc() { - // You don't want to change the volk_malloc code, so just copy the if/else - // structure from there and give an explanation for the implementations - std::cout << "Used malloc implementation: "; - #if _POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600 || HAVE_POSIX_MEMALIGN - std::cout << "posix_memalign" << std::endl; - #elif _MSC_VER >= 1400 - std::cout << "aligned_malloc" << std::endl; - #else - std::cout << "No standard handler available, using own implementation." << std::endl; - #endif + // You don't want to change the volk_malloc code, so just copy the if/else + // structure from there and give an explanation for the implementations + std::cout << "Used malloc implementation: "; +#if _POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600 || HAVE_POSIX_MEMALIGN + std::cout << "posix_memalign" << std::endl; +#elif _MSC_VER >= 1400 + std::cout << "aligned_malloc" << std::endl; +#else + std::cout << "No standard handler available, using own implementation." 
<< std::endl; +#endif } @@ -54,22 +54,24 @@ int main(int argc, char **argv) our_options.add(option_t("cc", "", "print the VOLK_GNSSDR C compiler version", volk_gnsssdr_c_compiler())); our_options.add(option_t("cflags", "", "print the VOLK_GNSSSDR CFLAGS", volk_gnsssdr_compiler_flags())); our_options.add(option_t("all-machines", "", "print VOLK_GNSSSDR machines built", volk_gnsssdr_available_machines())); - our_options.add(option_t("avail-machines", "", "print VOLK_GNSSSDR machines on the current " "platform", volk_gnsssdr_list_machines)); + our_options.add(option_t("avail-machines", "", + "print VOLK_GNSSSDR machines on the current " + "platform", + volk_gnsssdr_list_machines)); our_options.add(option_t("machine", "", "print the current VOLK_GNSSSDR machine that will be used", - volk_gnsssdr_get_machine())); + volk_gnsssdr_get_machine())); our_options.add(option_t("alignment", "", "print the memory alignment", print_alignment)); our_options.add(option_t("malloc", "", "print the malloc implementation used in volk_gnsssdr_malloc", - print_malloc)); + print_malloc)); our_options.add(option_t("version", "v", "print the VOLK_GNSSSDR version", volk_gnsssdr_version())); try - { + { our_options.parse(argc, argv); - } - catch(...) - { + } + catch (...) + { return 1; - } + } return 0; } diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/apps/volk_gnsssdr_option_helpers.cc b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/apps/volk_gnsssdr_option_helpers.cc index 61e085423..a6a263a20 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/apps/volk_gnsssdr_option_helpers.cc +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/apps/volk_gnsssdr_option_helpers.cc @@ -17,157 +17,182 @@ */ #include "volk_gnsssdr_option_helpers.h" -#include // IWYU pragma: keep -#include // IWYU pragma: keep -#include // IWYU pragma: keep -#include <exception> // for exception -#include <iostream> // for operator<<, endl, basic_ostream, cout, ostream -#include <utility> // for pair - +#include // IWYU pragma: keep +#include // IWYU pragma: keep +#include // IWYU pragma: keep +#include <exception> // for exception +#include <iostream> // for operator<<, endl, basic_ostream, cout, ostream +#include <utility> // for pair /* * Option type */ option_t::option_t(std::string longform, std::string shortform, std::string msg, void (*callback)()) - : longform("--" + longform), - shortform("-" + shortform), - msg(msg), - callback(callback) { option_type = VOID_CALLBACK; } + : longform("--" + longform), + shortform("-" + shortform), + msg(msg), + callback(callback) { option_type = VOID_CALLBACK; } option_t::option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(int)) - : longform("--" + longform), - shortform("-" + shortform), - msg(msg), - callback((void (*)()) callback) { option_type = INT_CALLBACK; } + : longform("--" + longform), + shortform("-" + shortform), + msg(msg), + callback((void (*)())callback) { option_type = INT_CALLBACK; } option_t::option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(float)) - : longform("--" + longform), - shortform("-" + shortform), - msg(msg), - callback((void (*)()) callback) { option_type = FLOAT_CALLBACK; } + : longform("--" + longform), + shortform("-" + shortform), + msg(msg), + callback((void (*)())callback) { option_type = FLOAT_CALLBACK; } option_t::option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(bool)) - : longform("--" + longform), - shortform("-" + shortform), - msg(msg), - callback((void (*)()) callback) { 
option_type = BOOL_CALLBACK; } + : longform("--" + longform), + shortform("-" + shortform), + msg(msg), + callback((void (*)())callback) { option_type = BOOL_CALLBACK; } option_t::option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(std::string)) - : longform("--" + longform), - shortform("-" + shortform), - msg(msg), - callback((void (*)()) callback) { option_type = STRING_CALLBACK; } + : longform("--" + longform), + shortform("-" + shortform), + msg(msg), + callback((void (*)())callback) { option_type = STRING_CALLBACK; } option_t::option_t(std::string longform, std::string shortform, std::string msg, std::string printval) - : longform("--" + longform), - shortform("-" + shortform), - msg(msg), - printval(printval) { option_type = STRING; } + : longform("--" + longform), + shortform("-" + shortform), + msg(msg), + printval(printval) { option_type = STRING; } /* * Option List */ -option_list::option_list(std::string program_name) : - program_name(program_name) { - { internal_list = std::vector<option_t>(); } -} - -void option_list::add(const option_t & opt) { internal_list.push_back(opt); } - -void option_list::parse(int argc, char **argv) { - for (int arg_number = 0; arg_number < argc; ++arg_number) { - for (std::vector<option_t>::iterator this_option = internal_list.begin(); - this_option != internal_list.end(); - this_option++) { - if (this_option->longform == std::string(argv[arg_number]) || - this_option->shortform == std::string(argv[arg_number])) { - switch (this_option->option_type) { - case VOID_CALLBACK: - this_option->callback(); - break; - case INT_CALLBACK: - try { - int int_val = std::stoi(argv[++arg_number]); - ((void (*)(int)) this_option->callback)(int_val); - } catch (std::exception &exc) { - std::cout << "An int option can only receive a number" << std::endl; - throw std::exception(); - }; - break; - case FLOAT_CALLBACK: - try { - int int_val = std::stof(argv[++arg_number]); - ((void (*)(float)) this_option->callback)(int_val); - } catch (std::exception &exc) { - std::cout << "A float option can only receive a number" << std::endl; - throw std::exception(); - }; - break; - case BOOL_CALLBACK: - try { - bool int_val = (bool) std::stoi(argv[++arg_number]); - ((void (*)(bool)) this_option->callback)(int_val); - } catch (std::exception &exc) { - std::cout << "A bool option can only receive 0 or 1" << std::endl; - throw std::exception(); - }; - break; - case STRING_CALLBACK: - try { - ((void (*)(std::string)) this_option->callback)(argv[++arg_number]); - } catch (std::exception &exc) { - throw std::exception(); - }; - break; - case STRING: - std::cout << this_option->printval << std::endl; - break; - default: - this_option->callback(); - break; - } - } - } - if (std::string("--help") == std::string(argv[arg_number]) || - std::string("-h") == std::string(argv[arg_number])) { - help(); - } +option_list::option_list(std::string program_name) : program_name(program_name) +{ + { + internal_list = std::vector<option_t>(); } } -void option_list::help() { +void option_list::add(const option_t &opt) { internal_list.push_back(opt); } + +void option_list::parse(int argc, char **argv) +{ + for (int arg_number = 0; arg_number < argc; ++arg_number) + { + for (std::vector<option_t>::iterator this_option = internal_list.begin(); + this_option != internal_list.end(); + this_option++) + { + if (this_option->longform == std::string(argv[arg_number]) || + this_option->shortform == std::string(argv[arg_number])) + { + switch (this_option->option_type) + { + case VOID_CALLBACK: + 
this_option->callback(); + break; + case INT_CALLBACK: + try + { + int int_val = std::stoi(argv[++arg_number]); + ((void (*)(int))this_option->callback)(int_val); + } + catch (std::exception &exc) + { + std::cout << "An int option can only receive a number" << std::endl; + throw std::exception(); + }; + break; + case FLOAT_CALLBACK: + try + { + int int_val = std::stof(argv[++arg_number]); + ((void (*)(float))this_option->callback)(int_val); + } + catch (std::exception &exc) + { + std::cout << "A float option can only receive a number" << std::endl; + throw std::exception(); + }; + break; + case BOOL_CALLBACK: + try + { + bool int_val = (bool)std::stoi(argv[++arg_number]); + ((void (*)(bool))this_option->callback)(int_val); + } + catch (std::exception &exc) + { + std::cout << "A bool option can only receive 0 or 1" << std::endl; + throw std::exception(); + }; + break; + case STRING_CALLBACK: + try + { + ((void (*)(std::string))this_option->callback)(argv[++arg_number]); + } + catch (std::exception &exc) + { + throw std::exception(); + }; + break; + case STRING: + std::cout << this_option->printval << std::endl; + break; + default: + this_option->callback(); + break; + } + } + } + if (std::string("--help") == std::string(argv[arg_number]) || + std::string("-h") == std::string(argv[arg_number])) + { + help(); + } + } +} + +void option_list::help() +{ std::cout << program_name << std::endl; std::cout << " -h [ --help ] \t\tDisplay this help message" << std::endl; for (std::vector<option_t>::iterator this_option = internal_list.begin(); this_option != internal_list.end(); - this_option++) { - std::string help_line(" "); - if (this_option->shortform == "-") { - help_line += this_option->longform + " "; - } else { - help_line += this_option->shortform + " [ " + this_option->longform + " ]"; - } + this_option++) + { + std::string help_line(" "); + if (this_option->shortform == "-") + { + help_line += this_option->longform + " "; + } + else + { + help_line += this_option->shortform + " [ " + this_option->longform + " ]"; + } - switch (help_line.size() / 8) { - case 0: - help_line += "\t\t\t\t"; - break; - case 1: - help_line += "\t\t\t"; - break; - case 2: - help_line += "\t\t"; - break; - case 3: - help_line += "\t"; - break; - default: - break; + switch (help_line.size() / 8) + { + case 0: + help_line += "\t\t\t\t"; + break; + case 1: + help_line += "\t\t\t"; + break; + case 2: + help_line += "\t\t"; + break; + case 3: + help_line += "\t"; + break; + default: + break; + } + help_line += this_option->msg; + std::cout << help_line << std::endl; } - help_line += this_option->msg; - std::cout << help_line << std::endl; - } } diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/apps/volk_gnsssdr_option_helpers.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/apps/volk_gnsssdr_option_helpers.h index 30cb98210..da1e12821 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/apps/volk_gnsssdr_option_helpers.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/apps/volk_gnsssdr_option_helpers.h @@ -36,7 +36,8 @@ typedef enum STRING, } VOLK_OPTYPE; -class option_t { +class option_t +{ public: option_t(std::string longform, std::string shortform, std::string msg, void (*callback)()); option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(int)); @@ -51,7 +52,6 @@ public: VOLK_OPTYPE option_type; std::string printval; void (*callback)(); - }; class option_list { public: option_list(std::string program_name); - 
void add(const option_t & opt); + void add(const option_t &opt); void parse(int argc, char **argv); void help(); + private: std::string program_name; std::vector<option_t> internal_list; }; -#endif //VOLK_VOLK_OPTION_HELPERS_H +#endif //VOLK_VOLK_OPTION_HELPERS_H diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/apps/volk_gnsssdr_profile.cc b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/apps/volk_gnsssdr_profile.cc index 5b9a1a653..f59c0cb60 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/apps/volk_gnsssdr_profile.cc +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/apps/volk_gnsssdr_profile.cc @@ -16,23 +16,22 @@ * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>. */ -#include "kernel_tests.h" // for init_test_list -#include "qa_utils.h" // for volk_gnsssdr_test_results_t -#include "volk_gnsssdr/volk_gnsssdr_complex.h" // for lv_32fc_t -#include "volk_gnsssdr_option_helpers.h" // for option_list, option_t #include "volk_gnsssdr_profile.h" -#include "volk_gnsssdr/volk_gnsssdr_prefs.h" // for volk_gnsssdr_get_config_path -#include <boost/filesystem/operations.hpp> // for create_directories, exists -#include <boost/filesystem/path.hpp> // for path, operator<< -#include // for filesystem -#include <sys/stat.h> // for stat -#include <cstddef> // for size_t -#include <iostream> // for operator<<, basic_ostream -#include // IWYU pragma: keep -#include <map> // for map, map<>::iterator -#include <utility> // for pair -#include <vector> // for vector, vector<>::const_.. - +#include "kernel_tests.h" // for init_test_list +#include "qa_utils.h" // for volk_gnsssdr_test_results_t +#include "volk_gnsssdr/volk_gnsssdr_complex.h" // for lv_32fc_t +#include "volk_gnsssdr_option_helpers.h" // for option_list, option_t #include "volk_gnsssdr_profile.h" +#include "volk_gnsssdr/volk_gnsssdr_prefs.h" // for volk_gnsssdr_get_config_path +#include <boost/filesystem/operations.hpp> // for create_directories, exists +#include <boost/filesystem/path.hpp> // for path, operator<< +#include // for filesystem +#include <sys/stat.h> // for stat +#include <cstddef> // for size_t +#include <iostream> // for operator<<, basic_ostream +#include // IWYU pragma: keep +#include <map> // for map, map<>::iterator +#include <utility> // for pair +#include <vector> // for vector, vector<>::const_.. namespace fs = boost::filesystem; @@ -67,92 +66,112 @@ int main(int argc, char *argv[]) profile_options.add((option_t("path", "p", "Specify the volk_config path", set_volk_config))); try - { profile_options.parse(argc, argv); - } - catch(...) - { - return 1; - } + { + profile_options.parse(argc, argv); + } + catch (...) 
+ { + return 1; + } - for (int arg_number = 0; arg_number < argc; ++arg_number) { + for (int arg_number = 0; arg_number < argc; ++arg_number) + { if (std::string("--help") == std::string(argv[arg_number]) || - std::string("-h") == std::string(argv[arg_number])) { + std::string("-h") == std::string(argv[arg_number])) + { return 0; - } - } + } + } // Adding program options std::ofstream json_file; std::string config_file; - if ( json_filename != "" ) { - json_file.open( json_filename.c_str() ); - } + if (json_filename != "") + { + json_file.open(json_filename.c_str()); + } - if ( volk_config_path != "" ) { - config_file = volk_config_path + "/volk_config"; - } + if (volk_config_path != "") + { + config_file = volk_config_path + "/volk_config"; + } // Run tests std::vector results; - if(update_mode) { - if( config_file != "" ) read_results(&results, config_file); - else read_results(&results); - } + if (update_mode) + { + if (config_file != "") + read_results(&results, config_file); + else + read_results(&results); + } // Initialize the list of tests std::vector test_cases = init_test_list(test_params); // Iterate through list of tests running each one std::string substr_to_match(test_params.kernel_regex()); - for(unsigned int ii = 0; ii < test_cases.size(); ++ii) { - bool regex_match = true; + for (unsigned int ii = 0; ii < test_cases.size(); ++ii) + { + bool regex_match = true; - volk_gnsssdr_test_case_t test_case = test_cases[ii]; - // if the kernel name matches regex then do the test - std::string test_case_name = test_case.name(); - if(test_case_name.find(substr_to_match) == std::string::npos) { - regex_match = false; - } - - // if we are in update mode check if we've already got results - // if we have any, then no need to test that kernel - bool update = true; - if(update_mode) { - for(unsigned int jj=0; jj < results.size(); ++jj) { - if(results[jj].name == test_case.name() || - results[jj].name == test_case.puppet_master_name()) { - update = false; - break; + volk_gnsssdr_test_case_t test_case = test_cases[ii]; + // if the kernel name matches regex then do the test + std::string test_case_name = test_case.name(); + if (test_case_name.find(substr_to_match) == std::string::npos) + { + regex_match = false; } - } - } - if( regex_match && update ) { - try { - run_volk_gnsssdr_tests(test_case.desc(), test_case.kernel_ptr(), test_case.name(), - test_case.test_parameters(), &results, test_case.puppet_master_name()); - } - catch (std::string &error) { - std::cerr << "Caught Exception in 'run_volk_gnssdr_tests': " << error << std::endl; - } + // if we are in update mode check if we've already got results + // if we have any, then no need to test that kernel + bool update = true; + if (update_mode) + { + for (unsigned int jj = 0; jj < results.size(); ++jj) + { + if (results[jj].name == test_case.name() || + results[jj].name == test_case.puppet_master_name()) + { + update = false; + break; + } + } + } + + if (regex_match && update) + { + try + { + run_volk_gnsssdr_tests(test_case.desc(), test_case.kernel_ptr(), test_case.name(), + test_case.test_parameters(), &results, test_case.puppet_master_name()); + } + catch (std::string &error) + { + std::cerr << "Caught Exception in 'run_volk_gnssdr_tests': " << error << std::endl; + } + } } - } // Output results according to provided options - if(json_filename != "") { - write_json(json_file, results); - json_file.close(); - } + if (json_filename != "") + { + write_json(json_file, results); + json_file.close(); + } - if(!dry_run) { - if(config_file != "") 
write_results(&results, false, config_file); - else write_results(&results, false); - } - else { - std::cout << "Warning: this was a dry-run. Config not generated" << std::endl; - } + if (!dry_run) + { + if (config_file != "") + write_results(&results, false, config_file); + else + write_results(&results, false); + } + else + { + std::cout << "Warning: this was a dry-run. Config not generated" << std::endl; + } } @@ -167,51 +186,55 @@ void read_results(std::vector *results) void read_results(std::vector *results, std::string path) { struct stat buffer; - bool config_status = (stat (path.c_str(), &buffer) == 0); + bool config_status = (stat(path.c_str(), &buffer) == 0); - if( config_status ) { - // a config exists and we are reading results from it - std::ifstream config(path.c_str()); - char config_line[256]; - while(config.getline(config_line, 255)) { - // tokenize the input line by kernel_name unaligned aligned - // then push back in the results vector with fields filled in + if (config_status) + { + // a config exists and we are reading results from it + std::ifstream config(path.c_str()); + char config_line[256]; + while (config.getline(config_line, 255)) + { + // tokenize the input line by kernel_name unaligned aligned + // then push back in the results vector with fields filled in - std::vector single_kernel_result; - std::string config_str(config_line); - std::size_t str_size = config_str.size(); - std::size_t found = 1; + std::vector single_kernel_result; + std::string config_str(config_line); + std::size_t str_size = config_str.size(); + std::size_t found = 1; - found = config_str.find(' '); - // Split line by spaces - while(found && found < str_size) { found = config_str.find(' '); - // kernel names MUST be less than 128 chars, which is - // a length restricted by volk/volk_prefs.c - // on the last token in the parsed string we won't find a space - // so make sure we copy at most 128 chars. - if(found > 127) { - found = 127; - } - str_size = config_str.size(); - char buffer[128] = {'\0'}; - config_str.copy(buffer, found + 1, 0); - buffer[found] = '\0'; - single_kernel_result.push_back(std::string(buffer)); - config_str.erase(0, found+1); - } + // Split line by spaces + while (found && found < str_size) + { + found = config_str.find(' '); + // kernel names MUST be less than 128 chars, which is + // a length restricted by volk/volk_prefs.c + // on the last token in the parsed string we won't find a space + // so make sure we copy at most 128 chars. 
+ if (found > 127) + { + found = 127; + } + str_size = config_str.size(); + char buffer[128] = {'\0'}; + config_str.copy(buffer, found + 1, 0); + buffer[found] = '\0'; + single_kernel_result.push_back(std::string(buffer)); + config_str.erase(0, found + 1); + } - if(single_kernel_result.size() == 3) { - volk_gnsssdr_test_results_t kernel_result; - kernel_result.name = std::string(single_kernel_result[0]); - kernel_result.config_name = std::string(single_kernel_result[0]); - kernel_result.best_arch_u = std::string(single_kernel_result[1]); - kernel_result.best_arch_a = std::string(single_kernel_result[2]); - results->push_back(kernel_result); - } + if (single_kernel_result.size() == 3) + { + volk_gnsssdr_test_results_t kernel_result; + kernel_result.name = std::string(single_kernel_result[0]); + kernel_result.config_name = std::string(single_kernel_result[0]); + kernel_result.best_arch_u = std::string(single_kernel_result[1]); + kernel_result.best_arch_a = std::string(single_kernel_result[2]); + results->push_back(kernel_result); + } + } } - } - } void write_results(const std::vector *results, bool update_result) @@ -219,7 +242,7 @@ void write_results(const std::vector *results, bool char path[1024]; volk_gnsssdr_get_config_path(path); - write_results( results, update_result, std::string(path)); + write_results(results, update_result, std::string(path)); } void write_results(const std::vector *results, bool update_result, const std::string path) @@ -227,39 +250,44 @@ void write_results(const std::vector *results, bool const fs::path config_path(path); // Until we can update the config on a kernel by kernel basis // do not overwrite volk_gnsssdr_config when using a regex. - if (! fs::exists(config_path.branch_path())) - { - std::cout << "Creating " << config_path.branch_path() << " ..." << std::endl; - fs::create_directories(config_path.branch_path()); - } + if (!fs::exists(config_path.branch_path())) + { + std::cout << "Creating " << config_path.branch_path() << " ..." << std::endl; + fs::create_directories(config_path.branch_path()); + } std::ofstream config; - if(update_result) { - std::cout << "Updating " << path << " ..." << std::endl; - config.open(path.c_str(), std::ofstream::app); - if (!config.is_open()) { //either we don't have write access or we don't have the dir yet - std::cout << "Error opening file " << path << std::endl; - } - } - else { - std::cout << "Writing " << path << " ..." << std::endl; - config.open(path.c_str()); - if (!config.is_open()) { //either we don't have write access or we don't have the dir yet - std::cout << "Error opening file " << path << std::endl; + if (update_result) + { + std::cout << "Updating " << path << " ..." << std::endl; + config.open(path.c_str(), std::ofstream::app); + if (!config.is_open()) + { //either we don't have write access or we don't have the dir yet + std::cout << "Error opening file " << path << std::endl; + } } + else + { + std::cout << "Writing " << path << " ..." 
<< std::endl; + config.open(path.c_str()); + if (!config.is_open()) + { //either we don't have write access or we don't have the dir yet + std::cout << "Error opening file " << path << std::endl; + } - config << "\ + config << "\ #this file is generated by volk_gnsssdr_profile.\n\ #the function name is followed by the preferred architecture.\n\ "; - } + } std::vector::const_iterator profile_results; - for(profile_results = results->begin(); profile_results != results->end(); ++profile_results) { - config << profile_results->config_name << " " - << profile_results->best_arch_a << " " - << profile_results->best_arch_u << std::endl; - } + for (profile_results = results->begin(); profile_results != results->end(); ++profile_results) + { + config << profile_results->config_name << " " + << profile_results->best_arch_a << " " + << profile_results->best_arch_u << std::endl; + } config.close(); } @@ -270,43 +298,45 @@ void write_json(std::ofstream &json_file, std::vector::iterator result; - for(result = results.begin(); result != results.end(); ++result) { - json_file << " {" << std::endl; - json_file << " \"name\": \"" << result->name << "\"," << std::endl; - json_file << " \"vlen\": " << (int)(result->vlen) << "," << std::endl; - json_file << " \"iter\": " << result->iter << "," << std::endl; - json_file << " \"best_arch_a\": \"" << result->best_arch_a - << "\"," << std::endl; - json_file << " \"best_arch_u\": \"" << result->best_arch_u - << "\"," << std::endl; - json_file << " \"results\": {" << std::endl; - size_t results_len = result->results.size(); - size_t ri = 0; + for (result = results.begin(); result != results.end(); ++result) + { + json_file << " {" << std::endl; + json_file << " \"name\": \"" << result->name << "\"," << std::endl; + json_file << " \"vlen\": " << (int)(result->vlen) << "," << std::endl; + json_file << " \"iter\": " << result->iter << "," << std::endl; + json_file << " \"best_arch_a\": \"" << result->best_arch_a + << "\"," << std::endl; + json_file << " \"best_arch_u\": \"" << result->best_arch_u + << "\"," << std::endl; + json_file << " \"results\": {" << std::endl; + size_t results_len = result->results.size(); + size_t ri = 0; - std::map::iterator kernel_time_pair; - for(kernel_time_pair = result->results.begin(); kernel_time_pair != result->results.end(); ++kernel_time_pair) { - volk_gnsssdr_test_time_t time = kernel_time_pair->second; - json_file << " \"" << time.name << "\": {" << std::endl; - json_file << " \"name\": \"" << time.name << "\"," << std::endl; - json_file << " \"time\": " << time.time << "," << std::endl; - json_file << " \"units\": \"" << time.units << "\"" << std::endl; - json_file << " }" ; - if(ri+1 != results_len) { - json_file << ","; - } + std::map::iterator kernel_time_pair; + for (kernel_time_pair = result->results.begin(); kernel_time_pair != result->results.end(); ++kernel_time_pair) + { + volk_gnsssdr_test_time_t time = kernel_time_pair->second; + json_file << " \"" << time.name << "\": {" << std::endl; + json_file << " \"name\": \"" << time.name << "\"," << std::endl; + json_file << " \"time\": " << time.time << "," << std::endl; + json_file << " \"units\": \"" << time.units << "\"" << std::endl; + json_file << " }"; + if (ri + 1 != results_len) + { + json_file << ","; + } + json_file << std::endl; + ri++; + } + json_file << " }" << std::endl; + json_file << " }"; + if (i + 1 != len) + { + json_file << ","; + } json_file << std::endl; - ri++; + i++; } - json_file << " }" << std::endl; - json_file << " }"; - if(i+1 != len) { - json_file 
<< ","; - } - json_file << std::endl; - i++; - } json_file << " ]" << std::endl; json_file << "}" << std::endl; } - - diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/apps/volk_gnsssdr_profile.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/apps/volk_gnsssdr_profile.h index 26ff1249b..0b1a6a46e 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/apps/volk_gnsssdr_profile.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/apps/volk_gnsssdr_profile.h @@ -27,10 +27,10 @@ * ------------------------------------------------------------------------- */ -#include // for bool -#include // for ofstream -#include // for string -#include // for vector +#include // for bool +#include // for ofstream +#include // for string +#include // for vector class volk_test_results_t; diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/saturation_arithmetic.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/saturation_arithmetic.h index 194bb46e3..77a6cc84d 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/saturation_arithmetic.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/saturation_arithmetic.h @@ -29,7 +29,7 @@ static inline int16_t sat_adds16i(int16_t x, int16_t y) { - int32_t res = (int32_t) x + (int32_t) y; + int32_t res = (int32_t)x + (int32_t)y; if (res < SHRT_MIN) res = SHRT_MIN; if (res > SHRT_MAX) res = SHRT_MAX; @@ -39,7 +39,7 @@ static inline int16_t sat_adds16i(int16_t x, int16_t y) static inline int16_t sat_muls16i(int16_t x, int16_t y) { - int32_t res = (int32_t) x * (int32_t) y; + int32_t res = (int32_t)x * (int32_t)y; if (res < SHRT_MIN) res = SHRT_MIN; if (res > SHRT_MAX) res = SHRT_MAX; diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_avx_intrinsics.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_avx_intrinsics.h index 809aa98f9..dbb67f986 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_avx_intrinsics.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_avx_intrinsics.h @@ -30,38 +30,42 @@ static inline __m256 _mm256_complexmul_ps(__m256 x, __m256 y) { - __m256 yl, yh, tmp1, tmp2; - yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr ... - yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di ... - tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ... - x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br ... - tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - return _mm256_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + __m256 yl, yh, tmp1, tmp2; + yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr ... + yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di ... + tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ... + x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br ... 
+ tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + return _mm256_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di } static inline __m256 -_mm256_conjugate_ps(__m256 x){ - const __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f); - return _mm256_xor_ps(x, conjugator); // conjugate y +_mm256_conjugate_ps(__m256 x) +{ + const __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f); + return _mm256_xor_ps(x, conjugator); // conjugate y } static inline __m256 -_mm256_complexconjugatemul_ps(__m256 x, __m256 y){ - y = _mm256_conjugate_ps(y); - return _mm256_complexmul_ps(x, y); +_mm256_complexconjugatemul_ps(__m256 x, __m256 y) +{ + y = _mm256_conjugate_ps(y); + return _mm256_complexmul_ps(x, y); } static inline __m256 -_mm256_magnitudesquared_ps(__m256 cplxValue1, __m256 cplxValue2){ - __m256 complex1, complex2; - cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values - cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values - complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20); - complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31); - return _mm256_hadd_ps(complex1, complex2); // Add the I2 and Q2 values +_mm256_magnitudesquared_ps(__m256 cplxValue1, __m256 cplxValue2) +{ + __m256 complex1, complex2; + cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values + cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values + complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20); + complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31); + return _mm256_hadd_ps(complex1, complex2); // Add the I2 and Q2 values } -static inline __m256 _mm256_complexnormalise_ps( __m256 z ){ +static inline __m256 _mm256_complexnormalise_ps(__m256 z) +{ __m256 tmp1 = _mm256_mul_ps(z, z); __m256 tmp2 = _mm256_hadd_ps(tmp1, tmp1); tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8); @@ -70,8 +74,9 @@ static inline __m256 _mm256_complexnormalise_ps( __m256 z ){ } static inline __m256 -_mm256_magnitude_ps(__m256 cplxValue1, __m256 cplxValue2){ - return _mm256_sqrt_ps(_mm256_magnitudesquared_ps(cplxValue1, cplxValue2)); +_mm256_magnitude_ps(__m256 cplxValue1, __m256 cplxValue2) +{ + return _mm256_sqrt_ps(_mm256_magnitudesquared_ps(cplxValue1, cplxValue2)); } #endif /* INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_ */ diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_common.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_common.h index 24b6501b8..d97ce89b1 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_common.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_common.h @@ -28,14 +28,14 @@ // Cross-platform attribute macros not included in VOLK //////////////////////////////////////////////////////////////////////// #if defined __GNUC__ -# define __VOLK_GNSSSDR_PREFETCH(addr) __builtin_prefetch(addr) -# define __VOLK_GNSSSDR_PREFETCH_LOCALITY(addr, rw, locality) __builtin_prefetch(addr, rw, locality) +#define __VOLK_GNSSSDR_PREFETCH(addr) __builtin_prefetch(addr) +#define __VOLK_GNSSSDR_PREFETCH_LOCALITY(addr, rw, locality) __builtin_prefetch(addr, rw, locality) #elif _MSC_VER -# define __VOLK_GNSSSDR_PREFETCH(addr) -# define __VOLK_GNSSSDR_PREFETCH_LOCALITY(addr, rw, locality) +#define __VOLK_GNSSSDR_PREFETCH(addr) +#define __VOLK_GNSSSDR_PREFETCH_LOCALITY(addr, rw, locality) 
#else -# define __VOLK_GNSSSDR_PREFETCH(addr) -# define __VOLK_GNSSSDR_PREFETCH_LOCALITY(addr, rw, locality) +#define __VOLK_GNSSSDR_PREFETCH(addr) +#define __VOLK_GNSSSDR_PREFETCH_LOCALITY(addr, rw, locality) #endif #ifndef INCLUDED_LIBVOLK_COMMON_H @@ -45,45 +45,45 @@ // Cross-platform attribute macros //////////////////////////////////////////////////////////////////////// #if defined __GNUC__ -# define __VOLK_ATTR_ALIGNED(x) __attribute__((aligned(x))) -# define __VOLK_ATTR_UNUSED __attribute__((unused)) -# define __VOLK_ATTR_INLINE __attribute__((always_inline)) -# define __VOLK_ATTR_DEPRECATED __attribute__((deprecated)) -# define __VOLK_ASM __asm__ -# define __VOLK_VOLATILE __volatile__ -# if __GNUC__ >= 4 -# define __VOLK_ATTR_EXPORT __attribute__((visibility("default"))) -# define __VOLK_ATTR_IMPORT __attribute__((visibility("default"))) -# else -# define __VOLK_ATTR_EXPORT -# define __VOLK_ATTR_IMPORT -# endif -#elif _MSC_VER -# define __VOLK_ATTR_ALIGNED(x) __declspec(align(x)) -# define __VOLK_ATTR_UNUSED -# define __VOLK_ATTR_INLINE __forceinline -# define __VOLK_ATTR_DEPRECATED __declspec(deprecated) -# define __VOLK_ATTR_EXPORT __declspec(dllexport) -# define __VOLK_ATTR_IMPORT __declspec(dllimport) -# define __VOLK_ASM __asm -# define __VOLK_VOLATILE +#define __VOLK_ATTR_ALIGNED(x) __attribute__((aligned(x))) +#define __VOLK_ATTR_UNUSED __attribute__((unused)) +#define __VOLK_ATTR_INLINE __attribute__((always_inline)) +#define __VOLK_ATTR_DEPRECATED __attribute__((deprecated)) +#define __VOLK_ASM __asm__ +#define __VOLK_VOLATILE __volatile__ +#if __GNUC__ >= 4 +#define __VOLK_ATTR_EXPORT __attribute__((visibility("default"))) +#define __VOLK_ATTR_IMPORT __attribute__((visibility("default"))) #else -# define __VOLK_ATTR_ALIGNED(x) -# define __VOLK_ATTR_UNUSED -# define __VOLK_ATTR_INLINE -# define __VOLK_ATTR_DEPRECATED -# define __VOLK_ATTR_EXPORT -# define __VOLK_ATTR_IMPORT -# define __VOLK_ASM __asm__ -# define __VOLK_VOLATILE __volatile__ +#define __VOLK_ATTR_EXPORT +#define __VOLK_ATTR_IMPORT +#endif +#elif _MSC_VER +#define __VOLK_ATTR_ALIGNED(x) __declspec(align(x)) +#define __VOLK_ATTR_UNUSED +#define __VOLK_ATTR_INLINE __forceinline +#define __VOLK_ATTR_DEPRECATED __declspec(deprecated) +#define __VOLK_ATTR_EXPORT __declspec(dllexport) +#define __VOLK_ATTR_IMPORT __declspec(dllimport) +#define __VOLK_ASM __asm +#define __VOLK_VOLATILE +#else +#define __VOLK_ATTR_ALIGNED(x) +#define __VOLK_ATTR_UNUSED +#define __VOLK_ATTR_INLINE +#define __VOLK_ATTR_DEPRECATED +#define __VOLK_ATTR_EXPORT +#define __VOLK_ATTR_IMPORT +#define __VOLK_ASM __asm__ +#define __VOLK_VOLATILE __volatile__ #endif //////////////////////////////////////////////////////////////////////// // Ignore annoying warnings in MSVC //////////////////////////////////////////////////////////////////////// #if defined(_MSC_VER) -# pragma warning(disable: 4244) //'conversion' conversion from 'type1' to 'type2', possible loss of data -# pragma warning(disable: 4305) //'identifier' : truncation from 'type1' to 'type2' +#pragma warning(disable : 4244) //'conversion' conversion from 'type1' to 'type2', possible loss of data +#pragma warning(disable : 4305) //'identifier' : truncation from 'type1' to 'type2' #endif //////////////////////////////////////////////////////////////////////// @@ -91,11 +91,13 @@ // FIXME: due to the usage of complex.h, require gcc for c-linkage //////////////////////////////////////////////////////////////////////// #if defined(__cplusplus) && (__GNUC__) -# define __VOLK_DECL_BEGIN 
extern "C" { -# define __VOLK_DECL_END } +#define __VOLK_DECL_BEGIN \ + extern "C" \ + { +#define __VOLK_DECL_END } #else -# define __VOLK_DECL_BEGIN -# define __VOLK_DECL_END +#define __VOLK_DECL_BEGIN +#define __VOLK_DECL_END #endif //////////////////////////////////////////////////////////////////////// @@ -103,9 +105,9 @@ // http://gcc.gnu.org/wiki/Visibility //////////////////////////////////////////////////////////////////////// #ifdef volk_gnsssdr_EXPORTS -# define VOLK_API __VOLK_ATTR_EXPORT +#define VOLK_API __VOLK_ATTR_EXPORT #else -# define VOLK_API __VOLK_ATTR_IMPORT +#define VOLK_API __VOLK_ATTR_IMPORT #endif //////////////////////////////////////////////////////////////////////// @@ -121,35 +123,37 @@ #endif #endif -union bit128{ - uint8_t i8[16]; - uint16_t i16[8]; - uint32_t i[4]; - float f[4]; - double d[2]; +union bit128 +{ + uint8_t i8[16]; + uint16_t i16[8]; + uint32_t i[4]; + float f[4]; + double d[2]; - #ifdef LV_HAVE_SSE - __m128 float_vec; - #endif +#ifdef LV_HAVE_SSE + __m128 float_vec; +#endif - #ifdef LV_HAVE_SSE2 - __m128i int_vec; - __m128d double_vec; - #endif +#ifdef LV_HAVE_SSE2 + __m128i int_vec; + __m128d double_vec; +#endif }; -union bit256{ - uint8_t i8[32]; - uint16_t i16[16]; - uint32_t i[8]; - float f[8]; - double d[4]; +union bit256 +{ + uint8_t i8[32]; + uint16_t i16[16]; + uint32_t i[8]; + float f[8]; + double d[4]; - #ifdef LV_HAVE_AVX - __m256 float_vec; - __m256i int_vec; - __m256d double_vec; - #endif +#ifdef LV_HAVE_AVX + __m256 float_vec; + __m256i int_vec; + __m256d double_vec; +#endif }; #define bit128_p(x) ((union bit128 *)(x)) diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_complex.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_complex.h index 237266679..648eb26f9 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_complex.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_complex.h @@ -48,26 +48,34 @@ #include #include -typedef std::complex lv_8sc_t; +typedef std::complex lv_8sc_t; typedef std::complex lv_16sc_t; typedef std::complex lv_32sc_t; typedef std::complex lv_64sc_t; -typedef std::complex lv_32fc_t; -typedef std::complex lv_64fc_t; +typedef std::complex lv_32fc_t; +typedef std::complex lv_64fc_t; -template inline std::complex lv_cmake(const T &r, const T &i){ +template +inline std::complex lv_cmake(const T &r, const T &i) +{ return std::complex(r, i); } -template inline typename T::value_type lv_creal(const T &x){ +template +inline typename T::value_type lv_creal(const T &x) +{ return x.real(); } -template inline typename T::value_type lv_cimag(const T &x){ +template +inline typename T::value_type lv_cimag(const T &x) +{ return x.imag(); } -template inline T lv_conj(const T &x){ +template +inline T lv_conj(const T &x) +{ return std::conj(x); } @@ -80,14 +88,14 @@ template inline T lv_conj(const T &x){ #include -typedef char complex lv_8sc_t; -typedef short complex lv_16sc_t; -typedef long complex lv_32sc_t; -typedef long long complex lv_64sc_t; -typedef float complex lv_32fc_t; -typedef double complex lv_64fc_t; +typedef char complex lv_8sc_t; +typedef short complex lv_16sc_t; +typedef long complex lv_32sc_t; +typedef long long complex lv_64sc_t; +typedef float complex lv_32fc_t; +typedef double complex lv_64fc_t; -#define lv_cmake(r, i) ((r) + _Complex_I*(i)) +#define lv_cmake(r, i) ((r) + _Complex_I * (i)) // When GNUC is available, use the 
complex extensions. // The extensions always return the correct value type. diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_neon_intrinsics.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_neon_intrinsics.h index 49aa561d1..0de07d600 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_neon_intrinsics.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_neon_intrinsics.h @@ -27,30 +27,30 @@ #include -static inline float32x4_t vdivq_f32( float32x4_t num, float32x4_t den ) +static inline float32x4_t vdivq_f32(float32x4_t num, float32x4_t den) { - const float32x4_t q_inv0 = vrecpeq_f32( den ); - const float32x4_t q_step0 = vrecpsq_f32( q_inv0, den ); + const float32x4_t q_inv0 = vrecpeq_f32(den); + const float32x4_t q_step0 = vrecpsq_f32(q_inv0, den); - const float32x4_t q_inv1 = vmulq_f32( q_step0, q_inv0 ); - return vmulq_f32( num, q_inv1 ); + const float32x4_t q_inv1 = vmulq_f32(q_step0, q_inv0); + return vmulq_f32(num, q_inv1); } -static inline float32x4_t vsqrtq_f32( float32x4_t q_x ) +static inline float32x4_t vsqrtq_f32(float32x4_t q_x) { - const float32x4_t q_step_0 = vrsqrteq_f32( q_x ); + const float32x4_t q_step_0 = vrsqrteq_f32(q_x); // step - const float32x4_t q_step_parm0 = vmulq_f32( q_x, q_step_0 ); - const float32x4_t q_step_result0 = vrsqrtsq_f32( q_step_parm0, q_step_0 ); + const float32x4_t q_step_parm0 = vmulq_f32(q_x, q_step_0); + const float32x4_t q_step_result0 = vrsqrtsq_f32(q_step_parm0, q_step_0); // step - const float32x4_t q_step_1 = vmulq_f32( q_step_0, q_step_result0 ); - const float32x4_t q_step_parm1 = vmulq_f32( q_x, q_step_1 ); - const float32x4_t q_step_result1 = vrsqrtsq_f32( q_step_parm1, q_step_1 ); + const float32x4_t q_step_1 = vmulq_f32(q_step_0, q_step_result0); + const float32x4_t q_step_parm1 = vmulq_f32(q_x, q_step_1); + const float32x4_t q_step_result1 = vrsqrtsq_f32(q_step_parm1, q_step_1); // take the res - const float32x4_t q_step_2 = vmulq_f32( q_step_1, q_step_result1 ); + const float32x4_t q_step_2 = vmulq_f32(q_step_1, q_step_result1); // mul by x to get sqrt, not rsqrt - return vmulq_f32( q_x, q_step_2 ); + return vmulq_f32(q_x, q_step_2); } #endif /* INCLUDED_VOLK_GNSSSDR_NEON_INTRINSICS_H_ */ diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_prefs.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_prefs.h index bb03e4407..372079450 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_prefs.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_prefs.h @@ -32,9 +32,9 @@ __VOLK_DECL_BEGIN typedef struct volk_gnsssdr_arch_pref { - char name[128]; //name of the kernel - char impl_a[128]; //best aligned impl - char impl_u[128]; //best unaligned impl + char name[128]; //name of the kernel + char impl_a[128]; //best aligned impl + char impl_u[128]; //best unaligned impl } volk_gnsssdr_arch_pref_t; //////////////////////////////////////////////////////////////////////// diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_sine_table.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_sine_table.h index 90bd78569..4ba0bb631 100644 --- 
a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_sine_table.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_sine_table.h @@ -29,1030 +29,1030 @@ * max_error = 2.353084136763606e-06 */ static const float sine_table_10bits[1 << 10][2] = { -{ 2.925817799165007e-09, 7.219194364267018e-09 }, - { 2.925707643778599e-09, 2.526699001579799e-07 }, - { 2.925487337153070e-09, 1.191140162167675e-06 }, - { 2.925156887582842e-09, 3.284585035595589e-06 }, - { 2.924716307509151e-09, 6.994872605695784e-06 }, - { 2.924165613519592e-09, 1.278374920658798e-05 }, - { 2.923504826347475e-09, 2.111280464718590e-05 }, - { 2.922733970871080e-09, 3.244343744537165e-05 }, - { 2.921853076112655e-09, 4.723682007436170e-05 }, - { 2.920862175237416e-09, 6.595386421935634e-05 }, - { 2.919761305552202e-09, 8.905518605213658e-05 }, - { 2.918550508504146e-09, 1.170010715193098e-04 }, - { 2.917229829679050e-09, 1.502514416517192e-04 }, - { 2.915799318799769e-09, 1.892658178912071e-04 }, - { 2.914259029724184e-09, 2.345032874456615e-04 }, - { 2.912609020443340e-09, 2.864224686607020e-04 }, - { 2.910849353079123e-09, 3.454814764261432e-04 }, - { 2.908980093882049e-09, 4.121378876027343e-04 }, - { 2.907001313228646e-09, 4.868487064877691e-04 }, - { 2.904913085618902e-09, 5.700703303049837e-04 }, - { 2.902715489673383e-09, 6.622585147355725e-04 }, - { 2.900408608130373e-09, 7.638683394782519e-04 }, - { 2.897992527842612e-09, 8.753541738578119e-04 }, - { 2.895467339774186e-09, 9.971696424604937e-04 }, - { 2.892833138996999e-09, 1.129767590823255e-03 }, - { 2.890090024687216e-09, 1.273600051161478e-03 }, - { 2.887238100121550e-09, 1.429118208142094e-03 }, - { 2.884277472673313e-09, 1.596772364709564e-03 }, - { 2.881208253808507e-09, 1.777011907950626e-03 }, - { 2.878030559081432e-09, 1.970285275029487e-03 }, - { 2.874744508130554e-09, 2.177039919152579e-03 }, - { 2.871350224673798e-09, 2.397722275614272e-03 }, - { 2.867847836504030e-09, 2.632777727878843e-03 }, - { 2.864237475484149e-09, 2.882650573737405e-03 }, - { 2.860519277542297e-09, 3.147783991507308e-03 }, - { 2.856693382666432e-09, 3.428620006328931e-03 }, - { 2.852759934899389e-09, 3.725599456482154e-03 }, - { 2.848719082333207e-09, 4.039161959812243e-03 }, - { 2.844570977103752e-09, 4.369745880190706e-03 }, - { 2.840315775384800e-09, 4.717788294077374e-03 }, - { 2.835953637382310e-09, 5.083724957128360e-03 }, - { 2.831484727328322e-09, 5.467990270896617e-03 }, - { 2.826909213474759e-09, 5.871017249604038e-03 }, - { 2.822227268087134e-09, 6.293237486988512e-03 }, - { 2.817439067438018e-09, 6.735081123237729e-03 }, - { 2.812544791800534e-09, 7.196976811989608e-03 }, - { 2.807544625441273e-09, 7.679351687456759e-03 }, - { 2.802438756613836e-09, 8.182631331563162e-03 }, - { 2.797227377551135e-09, 8.707239741274575e-03 }, - { 2.791910684458716e-09, 9.253599295902304e-03 }, - { 2.786488877507140e-09, 9.822130724578715e-03 }, - { 2.780962160824228e-09, 1.041325307382490e-02 }, - { 2.775330742487884e-09, 1.102738367513773e-02 }, - { 2.769594834517682e-09, 1.166493811278924e-02 }, - { 2.763754652867477e-09, 1.232633019159818e-02 }, - { 2.757810417416620e-09, 1.301197190494069e-02 }, - { 2.751762351962413e-09, 1.372227340270610e-02 }, - { 2.745610684210923e-09, 1.445764295952962e-02 }, - { 2.739355645769094e-09, 1.521848694296229e-02 }, - { 2.732997472135539e-09, 1.600520978188769e-02 }, - { 2.726536402691907e-09, 1.681821393496225e-02 }, - { 2.719972680693777e-09, 1.765789985920713e-02 }, - { 
2.713306553261610e-09, 1.852466597868779e-02 }, - { 2.706538271371373e-09, 1.941890865333146e-02 }, - { 2.699668089844909e-09, 2.034102214787814e-02 }, - { 2.692696267340880e-09, 2.129139860085272e-02 }, - { 2.685623066344263e-09, 2.227042799383416e-02 }, - { 2.678448753157212e-09, 2.327849812064098e-02 }, - { 2.671173597888530e-09, 2.431599455681316e-02 }, - { 2.663797874443630e-09, 2.538330062913108e-02 }, - { 2.656321860514457e-09, 2.648079738524795e-02 }, - { 2.648745837568575e-09, 2.760886356354952e-02 }, - { 2.641070090839117e-09, 2.876787556300114e-02 }, - { 2.633294909313421e-09, 2.995820741329835e-02 }, - { 2.625420585722845e-09, 3.118023074495535e-02 }, - { 2.617447416531143e-09, 3.243431475972608e-02 }, - { 2.609375701923643e-09, 3.372082620101990e-02 }, - { 2.601205745795833e-09, 3.504012932452527e-02 }, - { 2.592937855741933e-09, 3.639258586895711e-02 }, - { 2.584572343043400e-09, 3.777855502693250e-02 }, - { 2.576109522656942e-09, 3.919839341605197e-02 }, - { 2.567549713203028e-09, 4.065245505002102e-02 }, - { 2.558893236953688e-09, 4.214109131001403e-02 }, - { 2.550140419820252e-09, 4.366465091617666e-02 }, - { 2.541291591341445e-09, 4.522347989919473e-02 }, - { 2.532347084670572e-09, 4.681792157215026e-02 }, - { 2.523307236563343e-09, 4.844831650239501e-02 }, - { 2.514172387364900e-09, 5.011500248369893e-02 }, - { 2.504942880997064e-09, 5.181831450849345e-02 }, - { 2.495619064945627e-09, 5.355858474024022e-02 }, - { 2.486201290246928e-09, 5.533614248606705e-02 }, - { 2.476689911475047e-09, 5.715131416942842e-02 }, - { 2.467085286727668e-09, 5.900442330315692e-02 }, - { 2.457387777613798e-09, 6.089579046229943e-02 }, - { 2.447597749239101e-09, 6.282573325755320e-02 }, - { 2.437715570192557e-09, 6.479456630859221e-02 }, - { 2.427741612532542e-09, 6.680260121764925e-02 }, - { 2.417676251773166e-09, 6.885014654319160e-02 }, - { 2.407519866869294e-09, 7.093750777401114e-02 }, - { 2.397272840203310e-09, 7.306498730310884e-02 }, - { 2.386935557569868e-09, 7.523288440214027e-02 }, - { 2.376508408161815e-09, 7.744149519577415e-02 }, - { 2.365991784555363e-09, 7.969111263635709e-02 }, - { 2.355386082695641e-09, 8.198202647865405e-02 }, - { 2.344691701881232e-09, 8.431452325495814e-02 }, - { 2.333909044749407e-09, 8.668888625021409e-02 }, - { 2.323038517261246e-09, 8.910539547731611e-02 }, - { 2.312080528685971e-09, 9.156432765274414e-02 }, - { 2.301035491585642e-09, 9.406595617227698e-02 }, - { 2.289903821799651e-09, 9.661055108691619e-02 }, - { 2.278685938428940e-09, 9.919837907903295e-02 }, - { 2.267382263820762e-09, 1.018297034385580e-01 }, - { 2.255993223551837e-09, 1.045047840397028e-01 }, - { 2.244519246413220e-09, 1.072238773174577e-01 }, - { 2.232960764393620e-09, 1.099872362446146e-01 }, - { 2.221318212663309e-09, 1.127951103088245e-01 }, - { 2.209592029557811e-09, 1.156477454898748e-01 }, - { 2.197782656561395e-09, 1.185453842371912e-01 }, - { 2.185890538290176e-09, 1.214882654476019e-01 }, - { 2.173916122475606e-09, 1.244766244431883e-01 }, - { 2.161859859947797e-09, 1.275106929493488e-01 }, - { 2.149722204618256e-09, 1.305906990731841e-01 }, - { 2.137503613462743e-09, 1.337168672820376e-01 }, - { 2.125204546504321e-09, 1.368894183821595e-01 }, - { 2.112825466795944e-09, 1.401085694976751e-01 }, - { 2.100366840402933e-09, 1.433745340497602e-01 }, - { 2.087829136385612e-09, 1.466875217359607e-01 }, - { 2.075212826781308e-09, 1.500477385098620e-01 }, - { 2.062518386587093e-09, 1.534553865607503e-01 }, - { 2.049746293741359e-09, 1.569106642937665e-01 }, - { 
2.036897029106193e-09, 1.604137663100403e-01 }, - { 2.023971076449323e-09, 1.639648833871233e-01 }, - { 2.010968922425217e-09, 1.675642024598467e-01 }, - { 1.997891056557933e-09, 1.712119066008896e-01 }, - { 1.984737971221581e-09, 1.749081750021970e-01 }, - { 1.971510161622434e-09, 1.786531829561379e-01 }, - { 1.958208125780130e-09, 1.824471018371070e-01 }, - { 1.944832364508511e-09, 1.862900990834311e-01 }, - { 1.931383381397782e-09, 1.901823381790926e-01 }, - { 1.917861682794392e-09, 1.941239786363039e-01 }, - { 1.904267777782611e-09, 1.981151759777950e-01 }, - { 1.890602178165317e-09, 2.021560817195309e-01 }, - { 1.876865398444616e-09, 2.062468433536743e-01 }, - { 1.863057955802572e-09, 2.103876043317229e-01 }, - { 1.849180370081465e-09, 2.145785040479915e-01 }, - { 1.835233163764673e-09, 2.188196778231083e-01 }, - { 1.821216861956509e-09, 2.231112568880342e-01 }, - { 1.807131992362945e-09, 2.274533683680190e-01 }, - { 1.792979085271234e-09, 2.318461352671018e-01 }, - { 1.778758673530482e-09, 2.362896764525300e-01 }, - { 1.764471292530943e-09, 2.407841066397789e-01 }, - { 1.750117480184598e-09, 2.453295363773890e-01 }, - { 1.735697776904342e-09, 2.499260720324433e-01 }, - { 1.721212725583874e-09, 2.545738157760434e-01 }, - { 1.706662871577097e-09, 2.592728655691494e-01 }, - { 1.692048762677849e-09, 2.640233151485341e-01 }, - { 1.677370949099090e-09, 2.688252540131204e-01 }, - { 1.662629983452104e-09, 2.736787674105404e-01 }, - { 1.647826420726167e-09, 2.785839363237506e-01 }, - { 1.632960818266680e-09, 2.835408374583758e-01 }, - { 1.618033735755429e-09, 2.885495432295704e-01 }, - { 1.603045735188609e-09, 2.936101217498361e-01 }, - { 1.587997380855918e-09, 2.987226368167127e-01 }, - { 1.572889239319430e-09, 3.038871479007593e-01 }, - { 1.557721879392051e-09, 3.091037101339017e-01 }, - { 1.542495872116447e-09, 3.143723742978435e-01 }, - { 1.527211790743024e-09, 3.196931868130269e-01 }, - { 1.511870210708909e-09, 3.250661897274744e-01 }, - { 1.496471709615926e-09, 3.304914207062036e-01 }, - { 1.481016867208896e-09, 3.359689130207621e-01 }, - { 1.465506265353924e-09, 3.414986955389885e-01 }, - { 1.449940488016384e-09, 3.470807927151147e-01 }, - { 1.434320121238994e-09, 3.527152245800635e-01 }, - { 1.418645753119802e-09, 3.584020067320109e-01 }, - { 1.402917973789838e-09, 3.641411503272979e-01 }, - { 1.387137375391042e-09, 3.699326620714776e-01 }, - { 1.371304552054134e-09, 3.757765442106153e-01 }, - { 1.355420099875958e-09, 3.816727945230153e-01 }, - { 1.339484616897137e-09, 3.876214063110671e-01 }, - { 1.323498703079580e-09, 3.936223683933865e-01 }, - { 1.307462960283922e-09, 3.996756650972121e-01 }, - { 1.291377992246768e-09, 4.057812762511174e-01 }, - { 1.275244404558188e-09, 4.119391771778626e-01 }, - { 1.259062804638585e-09, 4.181493386877248e-01 }, - { 1.242833801715929e-09, 4.244117270719281e-01 }, - { 1.226558006803155e-09, 4.307263040962509e-01 }, - { 1.210236032674760e-09, 4.370930269951803e-01 }, - { 1.193868493843725e-09, 4.435118484661861e-01 }, - { 1.177456006538695e-09, 4.499827166641340e-01 }, - { 1.160999188680582e-09, 4.565055751961679e-01 }, - { 1.144498659859216e-09, 4.630803631168164e-01 }, - { 1.127955041310214e-09, 4.697070149232604e-01 }, - { 1.111368955891417e-09, 4.763854605510119e-01 }, - { 1.094741028059551e-09, 4.831156253697562e-01 }, - { 1.078071883846871e-09, 4.898974301794375e-01 }, - { 1.061362150836978e-09, 4.967307912069362e-01 }, - { 1.044612458142151e-09, 5.036156201023686e-01 }, - { 1.027823436378632e-09, 5.105518239364775e-01 }, - { 
1.010995717643647e-09, 5.175393051975563e-01 }, - { 9.941299354913699e-10, 5.245779617890562e-01 }, - { 9.772267249089968e-10, 5.316676870274011e-01 }, - { 9.602867222926046e-10, 5.388083696401416e-01 }, - { 9.433105654240147e-10, 5.459998937639375e-01 }, - { 9.262988934458084e-10, 5.532421389435711e-01 }, - { 9.092523468378193e-10, 5.605349801305876e-01 }, - { 8.921715673928355e-10, 5.678782876825250e-01 }, - { 8.750571981926701e-10, 5.752719273622372e-01 }, - { 8.579098835836508e-10, 5.827157603377209e-01 }, - { 8.407302691522673e-10, 5.902096431821322e-01 }, - { 8.235190017016133e-10, 5.977534278737073e-01 }, - { 8.062767292259225e-10, 6.053469617967722e-01 }, - { 7.890041008871165e-10, 6.129900877421282e-01 }, - { 7.717017669898175e-10, 6.206826439083659e-01 }, - { 7.543703789572603e-10, 6.284244639030392e-01 }, - { 7.370105893063053e-10, 6.362153767444958e-01 }, - { 7.196230516231919e-10, 6.440552068636356e-01 }, - { 7.022084205389746e-10, 6.519437741060674e-01 }, - { 6.847673517046416e-10, 6.598808937346672e-01 }, - { 6.673005017664976e-10, 6.678663764322770e-01 }, - { 6.498085283416530e-10, 6.759000283046127e-01 }, - { 6.322920899929834e-10, 6.839816508836737e-01 }, - { 6.147518462045659e-10, 6.921110411311926e-01 }, - { 5.971884573565851e-10, 7.002879914425926e-01 }, - { 5.796025847007168e-10, 7.085122896509806e-01 }, - { 5.619948903351406e-10, 7.167837190315758e-01 }, - { 5.443660371796048e-10, 7.251020583063744e-01 }, - { 5.267166889504394e-10, 7.334670816491009e-01 }, - { 5.090475101356742e-10, 7.418785586903696e-01 }, - { 4.913591659698399e-10, 7.503362545232619e-01 }, - { 4.736523224091392e-10, 7.588399297089872e-01 }, - { 4.559276461062478e-10, 7.673893402829834e-01 }, - { 4.381858043851147e-10, 7.759842377612828e-01 }, - { 4.204274652161870e-10, 7.846243691469355e-01 }, - { 4.026532971908398e-10, 7.933094769370790e-01 }, - { 3.848639694963359e-10, 8.020392991300200e-01 }, - { 3.670601518910503e-10, 8.108135692324444e-01 }, - { 3.492425146784233e-10, 8.196320162675177e-01 }, - { 3.314117286825031e-10, 8.284943647824689e-01 }, - { 3.135684652223755e-10, 8.374003348569865e-01 }, - { 2.957133960867535e-10, 8.463496421118015e-01 }, - { 2.778471935089361e-10, 8.553419977173513e-01 }, - { 2.599705301412391e-10, 8.643771084029740e-01 }, - { 2.420840790301135e-10, 8.734546764660205e-01 }, - { 2.241885135902046e-10, 8.825743997817682e-01 }, - { 2.062845075795238e-10, 8.917359718130367e-01 }, - { 1.883727350736140e-10, 9.009390816205823e-01 }, - { 1.704538704408269e-10, 9.101834138731877e-01 }, - { 1.525285883160648e-10, 9.194686488588080e-01 }, - { 1.345975635762696e-10, 9.287944624950824e-01 }, - { 1.166614713141648e-10, 9.381605263410157e-01 }, - { 9.872098681369190e-11, 9.475665076080466e-01 }, - { 8.077678552380464e-11, 9.570120691722380e-01 }, - { 6.282954303364090e-11, 9.664968695860140e-01 }, - { 4.487993504668797e-11, 9.760205630906909e-01 }, - { 2.692863735553042e-11, 9.855827996289697e-01 }, - { 8.976325816439114e-12, 9.951832248577780e-01 }, - { -8.976323676304494e-12, 1.004821480161519e+00 }, - { -2.692863521550168e-11, 1.014497202665280e+00 }, - { -4.487993290681805e-11, 1.024210025248670e+00 }, - { -6.282954089398273e-11, 1.033959576559617e+00 }, - { -8.077678338451706e-11, 1.043745481028715e+00 }, - { -9.872098467477489e-11, 1.053567358883467e+00 }, - { -1.166614691757772e-10, 1.063424826163223e+00 }, - { -1.345975614383584e-10, 1.073317494734013e+00 }, - { -1.525285861788948e-10, 1.083244972303963e+00 }, - { -1.704538683042922e-10, 1.093206862438572e+00 }, - { 
-1.883727329379793e-10, 1.103202764576806e+00 }, - { -2.062845054446831e-10, 1.113232274046796e+00 }, - { -2.241885114563697e-10, 1.123294982082432e+00 }, - { -2.420840768973375e-10, 1.133390475839767e+00 }, - { -2.599705280096278e-10, 1.143518338413855e+00 }, - { -2.778471913784365e-10, 1.153678148855860e+00 }, - { -2.957133939575774e-10, 1.163869482190458e+00 }, - { -3.135684630945758e-10, 1.174091909433296e+00 }, - { -3.314117265561857e-10, 1.184344997608959e+00 }, - { -3.492425125535882e-10, 1.194628309769018e+00 }, - { -3.670601497678034e-10, 1.204941405010466e+00 }, - { -3.848639673748360e-10, 1.215283838494269e+00 }, - { -4.026532950710339e-10, 1.225655161464298e+00 }, - { -4.204274630982869e-10, 1.236054921266445e+00 }, - { -4.381858022691734e-10, 1.246482661367958e+00 }, - { -4.559276439922654e-10, 1.256937921377146e+00 }, - { -4.736523202972214e-10, 1.267420237063216e+00 }, - { -4.913591638600925e-10, 1.277929140376502e+00 }, - { -5.090475080282032e-10, 1.288464159468706e+00 }, - { -5.267166868452449e-10, 1.299024818713528e+00 }, - { -5.443660350768455e-10, 1.309610638727845e+00 }, - { -5.619948882348695e-10, 1.320221136392390e+00 }, - { -5.796025826029868e-10, 1.330855824873457e+00 }, - { -5.971884552615020e-10, 1.341514213644420e+00 }, - { -6.147518441122357e-10, 1.352195808507556e+00 }, - { -6.322920879034590e-10, 1.362900111616144e+00 }, - { -6.498085262549874e-10, 1.373626621496939e+00 }, - { -6.673004996827436e-10, 1.384374833072571e+00 }, - { -6.847673496239581e-10, 1.395144237684605e+00 }, - { -7.022084184613616e-10, 1.405934323116231e+00 }, - { -7.196230495488082e-10, 1.416744573616104e+00 }, - { -7.370105872352039e-10, 1.427574469921397e+00 }, - { -7.543703768894941e-10, 1.438423489281758e+00 }, - { -7.717017649255453e-10, 1.449291105483472e+00 }, - { -7.890040988262324e-10, 1.460176788873383e+00 }, - { -8.062767271686383e-10, 1.471080006383765e+00 }, - { -8.235189996479819e-10, 1.482000221556656e+00 }, - { -8.407302671024475e-10, 1.492936894569018e+00 }, - { -8.579098815375368e-10, 1.503889482257845e+00 }, - { -8.750571961505266e-10, 1.514857438145604e+00 }, - { -8.921715653546624e-10, 1.525840212465756e+00 }, - { -9.092523448036167e-10, 1.536837252188703e+00 }, - { -9.262988914157881e-10, 1.547848001047890e+00 }, - { -9.433105633981766e-10, 1.558871899565883e+00 }, - { -9.602867202711075e-10, 1.569908385081254e+00 }, - { -9.772267228916820e-10, 1.580956891774897e+00 }, - { -9.941299334786078e-10, 1.592016850697478e+00 }, - { -1.010995715635332e-09, 1.603087689796053e+00 }, - { -1.027823434374870e-09, 1.614168833942028e+00 }, - { -1.044612456143047e-09, 1.625259704958335e+00 }, - { -1.061362148842745e-09, 1.636359721647526e+00 }, - { -1.078071881857297e-09, 1.647468299819543e+00 }, - { -1.094741026074900e-09, 1.658584852320419e+00 }, - { -1.111368953911690e-09, 1.669708789060341e+00 }, - { -1.127955039335462e-09, 1.680839517042381e+00 }, - { -1.144498657889600e-09, 1.691976440391624e+00 }, - { -1.160999186716154e-09, 1.703118960383971e+00 }, - { -1.177456004579561e-09, 1.714266475475616e+00 }, - { -1.193868491889832e-09, 1.725418381332405e+00 }, - { -1.210236030726319e-09, 1.736574070859850e+00 }, - { -1.226558004860220e-09, 1.747732934232508e+00 }, - { -1.242833799778447e-09, 1.758894358924547e+00 }, - { -1.259062802706714e-09, 1.770057729740021e+00 }, - { -1.275244402631982e-09, 1.781222428842935e+00 }, - { -1.291377990326492e-09, 1.792387835788660e+00 }, - { -1.307462958369363e-09, 1.803553327553897e+00 }, - { -1.323498701170897e-09, 1.814718278568759e+00 }, - { 
-1.339484614994490e-09, 1.825882060747428e+00 }, - { -1.355420097979292e-09, 1.837044043519582e+00 }, - { -1.371304550163662e-09, 1.848203593862598e+00 }, - { -1.387137373506711e-09, 1.859360076332671e+00 }, - { -1.402917971911754e-09, 1.870512853097495e+00 }, - { -1.418645751248018e-09, 1.881661283967967e+00 }, - { -1.434320119373722e-09, 1.892804726431080e+00 }, - { -1.449940486157623e-09, 1.903942535681972e+00 }, - { -1.465506263501516e-09, 1.915074064656886e+00 }, - { -1.481016865363264e-09, 1.926198664066737e+00 }, - { -1.496471707776859e-09, 1.937315682428795e+00 }, - { -1.511870208876724e-09, 1.948424466101625e+00 }, - { -1.527211788917509e-09, 1.959524359317042e+00 }, - { -1.542495870297867e-09, 1.970614704215133e+00 }, - { -1.557721877580406e-09, 1.981694840876775e+00 }, - { -1.572889237514880e-09, 1.992764107358707e+00 }, - { -1.587997379058514e-09, 2.003821839726753e+00 }, - { -1.603045733398246e-09, 2.014867372090665e+00 }, - { -1.618033733972424e-09, 2.025900036638798e+00 }, - { -1.632960816490822e-09, 2.036919163671778e+00 }, - { -1.647826418957721e-09, 2.047924081638631e+00 }, - { -1.662629981691070e-09, 2.058914117170269e+00 }, - { -1.677370947345626e-09, 2.069888595116115e+00 }, - { -1.692048760931849e-09, 2.080846838577820e+00 }, - { -1.706662869838827e-09, 2.091788168946183e+00 }, - { -1.721212723853279e-09, 2.102711905935372e+00 }, - { -1.735697775181424e-09, 2.113617367619504e+00 }, - { -1.750117478469621e-09, 2.124503870468520e+00 }, - { -1.764471290823748e-09, 2.135370729383332e+00 }, - { -1.778758671831281e-09, 2.146217257733207e+00 }, - { -1.792979083579974e-09, 2.157042767390815e+00 }, - { -1.807131990679890e-09, 2.167846568770014e+00 }, - { -1.821216860281448e-09, 2.178627970860822e+00 }, - { -1.835233162097977e-09, 2.189386281268046e+00 }, - { -1.849180368423027e-09, 2.200120806246095e+00 }, - { -1.863057954152340e-09, 2.210830850737588e+00 }, - { -1.876865396802907e-09, 2.221515718409926e+00 }, - { -1.890602176531920e-09, 2.232174711691990e+00 }, - { -1.904267776157843e-09, 2.242807131812679e+00 }, - { -1.917861681178094e-09, 2.253412278837029e+00 }, - { -1.931383379790273e-09, 2.263989451705295e+00 }, - { -1.944832362909578e-09, 2.274537948269257e+00 }, - { -1.958208124189984e-09, 2.285057065331676e+00 }, - { -1.971510160041235e-09, 2.295546098682665e+00 }, - { -1.984737969649064e-09, 2.306004343138794e+00 }, - { -1.997891054994522e-09, 2.316431092581699e+00 }, - { -2.010968920870647e-09, 2.326825639994779e+00 }, - { -2.023971074903858e-09, 2.337187277503834e+00 }, - { -2.036897027569834e-09, 2.347515296413520e+00 }, - { -2.049746292214264e-09, 2.357808987247877e+00 }, - { -2.062518385069210e-09, 2.368067639787542e+00 }, - { -2.075212825272584e-09, 2.378290543109652e+00 }, - { -2.087829134886364e-09, 2.388476985626922e+00 }, - { -2.100366838912949e-09, 2.398626255125417e+00 }, - { -2.112825465315542e-09, 2.408737638805759e+00 }, - { -2.125204545033289e-09, 2.418810423320288e+00 }, - { -2.137503612001452e-09, 2.428843894814472e+00 }, - { -2.149722203166389e-09, 2.438837338964302e+00 }, - { -2.161859858505829e-09, 2.448790041018174e+00 }, - { -2.173916121043380e-09, 2.458701285834241e+00 }, - { -2.185890536867478e-09, 2.468570357921585e+00 }, - { -2.197782655148702e-09, 2.478396541480230e+00 }, - { -2.209592028154913e-09, 2.488179120439544e+00 }, - { -2.221318211270522e-09, 2.497917378500214e+00 }, - { -2.232960763010574e-09, 2.507610599172123e+00 }, - { -2.244519245040444e-09, 2.517258065817044e+00 }, - { -2.255993222189014e-09, 2.526859061686102e+00 }, - { 
-2.267382262468209e-09, 2.536412869962689e+00 }, - { -2.278685937086658e-09, 2.545918773800664e+00 }, - { -2.289903820467374e-09, 2.555376056366064e+00 }, - { -2.301035490263848e-09, 2.564784000877677e+00 }, - { -2.312080527374447e-09, 2.574141890646339e+00 }, - { -2.323038515960257e-09, 2.583449009117307e+00 }, - { -2.333909043458635e-09, 2.592704639909166e+00 }, - { -2.344691700601153e-09, 2.601908066856634e+00 }, - { -2.355386081425938e-09, 2.611058574048749e+00 }, - { -2.365991783296513e-09, 2.620155445872768e+00 }, - { -2.376508406913500e-09, 2.629197967052127e+00 }, - { -2.386935556332088e-09, 2.638185422689490e+00 }, - { -2.397272838976436e-09, 2.647117098307332e+00 }, - { -2.407519865653114e-09, 2.655992279887846e+00 }, - { -2.417676250567891e-09, 2.664810253915885e+00 }, - { -2.427741611338014e-09, 2.673570307418169e+00 }, - { -2.437715569009093e-09, 2.682271728006635e+00 }, - { -2.447597748066437e-09, 2.690913803917100e+00 }, - { -2.457387776452357e-09, 2.699495824053297e+00 }, - { -2.467085285577292e-09, 2.708017078025636e+00 }, - { -2.476689910335470e-09, 2.716476856194105e+00 }, - { -2.486201289118733e-09, 2.724874449709689e+00 }, - { -2.495619063828443e-09, 2.733209150554255e+00 }, - { -2.504942879891263e-09, 2.741480251583985e+00 }, - { -2.514172386270163e-09, 2.749687046568741e+00 }, - { -2.523307235480146e-09, 2.757828830235740e+00 }, - { -2.532347083598520e-09, 2.765904898308531e+00 }, - { -2.541291590280960e-09, 2.773914547551261e+00 }, - { -2.550140418771202e-09, 2.781857075807392e+00 }, - { -2.558893235915887e-09, 2.789731782043156e+00 }, - { -2.567549712176927e-09, 2.797537966388929e+00 }, - { -2.576109521642196e-09, 2.805274930179221e+00 }, - { -2.584572342040407e-09, 2.812941975996573e+00 }, - { -2.592937854750428e-09, 2.820538407710556e+00 }, - { -2.601205744816134e-09, 2.828063530521908e+00 }, - { -2.609375700955458e-09, 2.835516651001539e+00 }, - { -2.617447415574869e-09, 2.842897077134583e+00 }, - { -2.625420584778350e-09, 2.850204118359573e+00 }, - { -2.633294908380520e-09, 2.857437085611509e+00 }, - { -2.641070089918234e-09, 2.864595291363663e+00 }, - { -2.648745836659391e-09, 2.871678049666939e+00 }, - { -2.656321859617343e-09, 2.878684676194483e+00 }, - { -2.663797873558322e-09, 2.885614488280000e+00 }, - { -2.671173597015318e-09, 2.892466804962122e+00 }, - { -2.678448752295859e-09, 2.899240947023252e+00 }, - { -2.685623065495139e-09, 2.905936237033475e+00 }, - { -2.692696266503800e-09, 2.912551999389617e+00 }, - { -2.699668089019767e-09, 2.919087560358171e+00 }, - { -2.706538270558513e-09, 2.925542248116882e+00 }, - { -2.713306552460767e-09, 2.931915392794031e+00 }, - { -2.719972679905295e-09, 2.938206326512581e+00 }, - { -2.726536401915442e-09, 2.944414383428562e+00 }, - { -2.732997471371516e-09, 2.950538899775061e+00 }, - { -2.739355645017194e-09, 2.956579213900666e+00 }, - { -2.745610683471516e-09, 2.962534666313284e+00 }, - { -2.751762351235315e-09, 2.968404599718795e+00 }, - { -2.757810416701751e-09, 2.974188359063684e+00 }, - { -2.763754652165128e-09, 2.979885291576143e+00 }, - { -2.769594833827588e-09, 2.985494746805227e+00 }, - { -2.775330741810390e-09, 2.991016076664491e+00 }, - { -2.780962160159068e-09, 2.996448635469842e+00 }, - { -2.786488876854607e-09, 3.001791779983262e+00 }, - { -2.791910683818570e-09, 3.007044869450794e+00 }, - { -2.797227376923695e-09, 3.012207265645876e+00 }, - { -2.802438755998943e-09, 3.017278332907412e+00 }, - { -2.807544624838820e-09, 3.022257438182037e+00 }, - { -2.812544791210840e-09, 3.027143951064684e+00 }, - { 
-2.817439066860792e-09, 3.031937243837070e+00 }, - { -2.822227267522746e-09, 3.036636691510884e+00 }, - { -2.826909212922864e-09, 3.041241671864994e+00 }, - { -2.831484726789317e-09, 3.045751565488710e+00 }, - { -2.835953636855826e-09, 3.050165755818853e+00 }, - { -2.840315774871260e-09, 3.054483629182857e+00 }, - { -2.844570976602957e-09, 3.058704574835744e+00 }, - { -2.848719081844986e-09, 3.062827985002047e+00 }, - { -2.852759934424164e-09, 3.066853254915581e+00 }, - { -2.856693382203833e-09, 3.070779782857041e+00 }, - { -2.860519277092708e-09, 3.074606970196721e+00 }, - { -2.864237475047239e-09, 3.078334221430809e+00 }, - { -2.867847836080156e-09, 3.081960944223928e+00 }, - { -2.871350224262603e-09, 3.085486549445314e+00 }, - { -2.874744507732462e-09, 3.088910451211251e+00 }, - { -2.878030558696270e-09, 3.092232066921130e+00 }, - { -2.881208253436038e-09, 3.095450817298478e+00 }, - { -2.884277472313999e-09, 3.098566126429974e+00 }, - { -2.887238099774968e-09, 3.101577421802070e+00 }, - { -2.890090024353816e-09, 3.104484134342861e+00 }, - { -2.892833138676371e-09, 3.107285698457308e+00 }, - { -2.895467339466766e-09, 3.109981552069083e+00 }, - { -2.897992527547963e-09, 3.112571136655481e+00 }, - { -2.900408607848946e-09, 3.115053897289195e+00 }, - { -2.902715489404992e-09, 3.117429282673042e+00 }, - { -2.904913085363323e-09, 3.119696745180238e+00 }, - { -2.907001312986328e-09, 3.121855740892224e+00 }, - { -2.908980093652563e-09, 3.123905729634218e+00 }, - { -2.910849352862924e-09, 3.125846175016163e+00 }, - { -2.912609020239985e-09, 3.127676544466606e+00 }, - { -2.914259029534118e-09, 3.129396309273659e+00 }, - { -2.915799318622574e-09, 3.131004944618667e+00 }, - { -2.917229829515169e-09, 3.132501929616775e+00 }, - { -2.918550508353347e-09, 3.133886747350606e+00 }, - { -2.919761305414294e-09, 3.135158884909254e+00 }, - { -2.920862175112829e-09, 3.136317833424958e+00 }, - { -2.921853076000972e-09, 3.137363088107359e+00 }, - { -2.922733970772719e-09, 3.138294148283254e+00 }, - { -2.923504826262027e-09, 3.139110517429204e+00 }, - { -2.924165613447473e-09, 3.139811703211207e+00 }, - { -2.924716307449950e-09, 3.140397217517018e+00 }, - { -2.925156887536978e-09, 3.140866576495489e+00 }, - { -2.925487337120335e-09, 3.141219300588825e+00 }, - { -2.925707643758784e-09, 3.141454914570261e+00 }, - { -2.925817799158535e-09, 3.141572947579352e+00 }, - { -2.925817799171455e-09, 3.141572933154836e+00 }, - { -2.925707643798390e-09, 3.141454409272987e+00 }, - { -2.925487337185779e-09, 3.141216918378770e+00 }, - { -2.925156887628892e-09, 3.140860007424112e+00 }, - { -2.924716307568119e-09, 3.140383227898687e+00 }, - { -2.924165613591896e-09, 3.139786135867868e+00 }, - { -2.923504826432903e-09, 3.139068292003385e+00 }, - { -2.922733970969412e-09, 3.138229261619561e+00 }, - { -2.921853076224321e-09, 3.137268614707029e+00 }, - { -2.920862175361976e-09, 3.136185925964038e+00 }, - { -2.919761305690083e-09, 3.134980774833275e+00 }, - { -2.918550508654911e-09, 3.133652745531368e+00 }, - { -2.917229829843137e-09, 3.132201427085629e+00 }, - { -2.915799318976726e-09, 3.130626413363146e+00 }, - { -2.914259029914435e-09, 3.128927303107136e+00 }, - { -2.912609020646661e-09, 3.127103699965947e+00 }, - { -2.910849353295315e-09, 3.125155212527586e+00 }, - { -2.908980094111509e-09, 3.123081454351802e+00 }, - { -2.907001313470937e-09, 3.120882043999591e+00 }, - { -2.904913085874448e-09, 3.118556605068443e+00 }, - { -2.902715489941767e-09, 3.116104766219928e+00 }, - { -2.900408608411958e-09, 3.113526161214776e+00 }, - { 
-2.897992528137022e-09, 3.110820428940251e+00 }, - { -2.895467340081818e-09, 3.107987213444579e+00 }, - { -2.892833139317615e-09, 3.105026163964191e+00 }, - { -2.890090025020589e-09, 3.101936934956479e+00 }, - { -2.887238100468092e-09, 3.098719186130021e+00 }, - { -2.884277473032614e-09, 3.095372582472161e+00 }, - { -2.881208254180937e-09, 3.091896794282404e+00 }, - { -2.878030559466594e-09, 3.088291497198199e+00 }, - { -2.874744508528832e-09, 3.084556372228054e+00 }, - { -2.871350225084755e-09, 3.080691105776848e+00 }, - { -2.867847836928063e-09, 3.076695389678615e+00 }, - { -2.864237475921086e-09, 3.072568921221621e+00 }, - { -2.860519277991847e-09, 3.068311403179147e+00 }, - { -2.856693383129018e-09, 3.063922543837792e+00 }, - { -2.852759935374575e-09, 3.059402057023109e+00 }, - { -2.848719082821403e-09, 3.054749662130841e+00 }, - { -2.844570977604520e-09, 3.049965084150782e+00 }, - { -2.840315775898525e-09, 3.045048053697736e+00 }, - { -2.835953637908582e-09, 3.039998307034967e+00 }, - { -2.831484727867511e-09, 3.034815586104635e+00 }, - { -2.826909214026628e-09, 3.029499638550941e+00 }, - { -2.822227268651470e-09, 3.024050217748861e+00 }, - { -2.817439068015245e-09, 3.018467082830179e+00 }, - { -2.812544792390175e-09, 3.012749998707001e+00 }, - { -2.807544626043751e-09, 3.006898736100911e+00 }, - { -2.802438757228650e-09, 3.000913071564665e+00 }, - { -2.797227378178760e-09, 2.994792787510961e+00 }, - { -2.791910685098702e-09, 2.988537672233504e+00 }, - { -2.786488878159805e-09, 2.982147519935565e+00 }, - { -2.780962161489413e-09, 2.975622130750641e+00 }, - { -2.775330743165298e-09, 2.968961310769028e+00 }, - { -2.769594835207775e-09, 2.962164872061613e+00 }, - { -2.763754653569747e-09, 2.955232632701135e+00 }, - { -2.757810418131543e-09, 2.948164416789036e+00 }, - { -2.751762352689432e-09, 2.940960054474719e+00 }, - { -2.745610684950541e-09, 2.933619381982341e+00 }, - { -2.739355646520809e-09, 2.926142241629213e+00 }, - { -2.732997472899722e-09, 2.918528481852205e+00 }, - { -2.726536403468318e-09, 2.910777957226018e+00 }, - { -2.719972681482232e-09, 2.902890528487386e+00 }, - { -2.713306554062453e-09, 2.894866062556452e+00 }, - { -2.706538272184154e-09, 2.886704432555728e+00 }, - { -2.699668090670078e-09, 2.878405517834426e+00 }, - { -2.692696268177908e-09, 2.869969203985464e+00 }, - { -2.685623067193599e-09, 2.861395382869544e+00 }, - { -2.678448754018380e-09, 2.852683952631486e+00 }, - { -2.671173598761847e-09, 2.843834817723832e+00 }, - { -2.663797875328991e-09, 2.834847888922988e+00 }, - { -2.656321861411517e-09, 2.825723083350459e+00 }, - { -2.648745838477759e-09, 2.816460324492298e+00 }, - { -2.641070091759922e-09, 2.807059542215146e+00 }, - { -2.633294910246296e-09, 2.797520672788269e+00 }, - { -2.625420586667340e-09, 2.787843658897949e+00 }, - { -2.617447417487602e-09, 2.778028449668942e+00 }, - { -2.609375702891616e-09, 2.768075000678399e+00 }, - { -2.601205746775692e-09, 2.757983273976943e+00 }, - { -2.592937856733464e-09, 2.747753238101915e+00 }, - { -2.584572344046340e-09, 2.737384868096553e+00 }, - { -2.576109523671634e-09, 2.726878145526201e+00 }, - { -2.567549714229129e-09, 2.716233058492422e+00 }, - { -2.558893237991435e-09, 2.705449601651722e+00 }, - { -2.550140420869302e-09, 2.694527776227857e+00 }, - { -2.541291592402089e-09, 2.683467590030445e+00 }, - { -2.532347085742440e-09, 2.672269057466213e+00 }, - { -2.523307237646751e-09, 2.660932199557362e+00 }, - { -2.514172388459584e-09, 2.649457043952206e+00 }, - { -2.504942882102813e-09, 2.637843624941622e+00 }, - { 
-2.495619066062810e-09, 2.626091983472908e+00 }, - { -2.486201291375123e-09, 2.614202167160335e+00 }, - { -2.476689912614465e-09, 2.602174230302269e+00 }, - { -2.467085287878098e-09, 2.590008233889805e+00 }, - { -2.457387778775451e-09, 2.577704245623143e+00 }, - { -2.447597750411553e-09, 2.565262339920002e+00 }, - { -2.437715571376127e-09, 2.552682597931055e+00 }, - { -2.427741613727123e-09, 2.539965107548168e+00 }, - { -2.417676252978335e-09, 2.527109963417675e+00 }, - { -2.407519868085581e-09, 2.514117266951687e+00 }, - { -2.397272841430131e-09, 2.500987126335739e+00 }, - { -2.386935558807595e-09, 2.487719656543254e+00 }, - { -2.376508409410024e-09, 2.474314979341178e+00 }, - { -2.365991785814531e-09, 2.460773223303822e+00 }, - { -2.355386083965131e-09, 2.447094523817833e+00 }, - { -2.344691703161363e-09, 2.433279023095734e+00 }, - { -2.333909046040126e-09, 2.419326870180582e+00 }, - { -2.323038518562289e-09, 2.405238220956597e+00 }, - { -2.312080529997549e-09, 2.391013238157397e+00 }, - { -2.301035492907384e-09, 2.376652091371587e+00 }, - { -2.289903823131822e-09, 2.362154957053137e+00 }, - { -2.278685939771276e-09, 2.347522018525197e+00 }, - { -2.267382265173420e-09, 2.332753465990296e+00 }, - { -2.255993224914501e-09, 2.317849496533128e+00 }, - { -2.244519247786155e-09, 2.302810314130351e+00 }, - { -2.232960765776561e-09, 2.287636129652823e+00 }, - { -2.221318214056095e-09, 2.272327160873552e+00 }, - { -2.209592030960763e-09, 2.256883632472565e+00 }, - { -2.197782657974034e-09, 2.241305776039511e+00 }, - { -2.185890539712767e-09, 2.225593830081461e+00 }, - { -2.173916123907886e-09, 2.209748040023618e+00 }, - { -2.161859861389976e-09, 2.193768658216360e+00 }, - { -2.149722206070124e-09, 2.177655943935795e+00 }, - { -2.137503614923981e-09, 2.161410163388424e+00 }, - { -2.125204547975352e-09, 2.145031589714984e+00 }, - { -2.112825468276292e-09, 2.128520502989477e+00 }, - { -2.100366841892917e-09, 2.111877190225612e+00 }, - { -2.087829137884807e-09, 2.095101945374541e+00 }, - { -2.075212828290086e-09, 2.078195069329960e+00 }, - { -2.062518388104923e-09, 2.061156869925600e+00 }, - { -2.049746295268559e-09, 2.043987661939897e+00 }, - { -2.036897030642658e-09, 2.026687767092888e+00 }, - { -2.023971077994576e-09, 2.009257514048162e+00 }, - { -2.010968923979840e-09, 1.991697238413571e+00 }, - { -1.997891058121344e-09, 1.974007282737320e+00 }, - { -1.984737972794098e-09, 1.956187996511354e+00 }, - { -1.971510163203686e-09, 1.938239736166060e+00 }, - { -1.958208127370276e-09, 1.920162865072273e+00 }, - { -1.944832366107339e-09, 1.901957753535934e+00 }, - { -1.931383383005451e-09, 1.883624778799427e+00 }, - { -1.917861684410531e-09, 1.865164325035177e+00 }, - { -1.904267779407432e-09, 1.846576783346324e+00 }, - { -1.890602179798714e-09, 1.827862551760622e+00 }, - { -1.876865400086483e-09, 1.809022035228338e+00 }, - { -1.863057957452539e-09, 1.790055645617624e+00 }, - { -1.849180371740008e-09, 1.770963801711725e+00 }, - { -1.835233165431475e-09, 1.751746929201178e+00 }, - { -1.821216863631569e-09, 1.732405460681919e+00 }, - { -1.807131994045840e-09, 1.712939835648088e+00 }, - { -1.792979086962494e-09, 1.693350500488565e+00 }, - { -1.778758675229683e-09, 1.673637908477153e+00 }, - { -1.764471294238191e-09, 1.653802519770021e+00 }, - { -1.750117481899733e-09, 1.633844801396848e+00 }, - { -1.735697778626995e-09, 1.613765227254186e+00 }, - { -1.721212727314574e-09, 1.593564278099856e+00 }, - { -1.706662873315474e-09, 1.573242441540939e+00 }, - { -1.692048764423848e-09, 1.552800212030258e+00 }, - { 
-1.677370950852395e-09, 1.532238090855187e+00 }, - { -1.662629985213192e-09, 1.511556586131055e+00 }, - { -1.647826422494560e-09, 1.490756212788764e+00 }, - { -1.632960820042537e-09, 1.469837492568651e+00 }, - { -1.618033737538645e-09, 1.448800954008929e+00 }, - { -1.603045736978760e-09, 1.427647132435469e+00 }, - { -1.587997382653428e-09, 1.406376569953373e+00 }, - { -1.572889241124034e-09, 1.384989815432507e+00 }, - { -1.557721881203696e-09, 1.363487424499449e+00 }, - { -1.542495873934815e-09, 1.341869959524515e+00 }, - { -1.527211792568486e-09, 1.320137989611176e+00 }, - { -1.511870212541253e-09, 1.298292090581491e+00 }, - { -1.496471711454994e-09, 1.276332844965754e+00 }, - { -1.481016869054634e-09, 1.254260841988828e+00 }, - { -1.465506267206068e-09, 1.232076677556547e+00 }, - { -1.449940489875303e-09, 1.209780954243628e+00 }, - { -1.434320123104372e-09, 1.187374281276747e+00 }, - { -1.418645754991533e-09, 1.164857274523495e+00 }, - { -1.402917975667710e-09, 1.142230556475749e+00 }, - { -1.387137377275425e-09, 1.119494756236361e+00 }, - { -1.371304553944712e-09, 1.096650509501278e+00 }, - { -1.355420101772623e-09, 1.073698458546610e+00 }, - { -1.339484618799891e-09, 1.050639252211352e+00 }, - { -1.323498704988051e-09, 1.027473545880543e+00 }, - { -1.307462962198534e-09, 1.004202001471034e+00 }, - { -1.291377994167204e-09, 9.808252874104182e-01 }, - { -1.275244406484394e-09, 9.573440786237052e-01 }, - { -1.259062806570190e-09, 9.337590565128454e-01 }, - { -1.242833803653464e-09, 9.100709089414796e-01 }, - { -1.226558008746195e-09, 8.862803302125812e-01 }, - { -1.210236034623253e-09, 8.623880210538113e-01 }, - { -1.193868495797618e-09, 8.383946885959868e-01 }, - { -1.177456008497777e-09, 8.143010463544786e-01 }, - { -1.160999190645010e-09, 7.901078142102129e-01 }, - { -1.144498661828833e-09, 7.658157183877095e-01 }, - { -1.127955043284965e-09, 7.414254914366063e-01 }, - { -1.111368957870986e-09, 7.169378722095157e-01 }, - { -1.094741030044308e-09, 6.923536058430697e-01 }, - { -1.078071885836393e-09, 6.676734437331688e-01 }, - { -1.061362152831423e-09, 6.428981435165511e-01 }, - { -1.044612460141255e-09, 6.180284690466404e-01 }, - { -1.027823438382183e-09, 5.930651903718045e-01 }, - { -1.010995719652015e-09, 5.680090837138436e-01 }, - { -9.941299375042378e-10, 5.428609314418970e-01 }, - { -9.772267269262058e-10, 5.176215220520872e-01 }, - { -9.602867243141016e-10, 4.922916501421032e-01 }, - { -9.433105674499058e-10, 4.668721163885412e-01 }, - { -9.262988954758817e-10, 4.413637275202624e-01 }, - { -9.092523488719689e-10, 4.157672962958654e-01 }, - { -8.921715694311144e-10, 3.900836414778084e-01 }, - { -8.750572002347607e-10, 3.643135878065193e-01 }, - { -8.579098856296589e-10, 3.384579659762392e-01 }, - { -8.407302712022458e-10, 3.125176126069478e-01 }, - { -8.235190037551917e-10, 2.864933702193017e-01 }, - { -8.062767312831008e-10, 2.603860872080448e-01 }, - { -7.890041029479477e-10, 2.341966178147619e-01 }, - { -7.717017690542486e-10, 2.079258220999725e-01 }, - { -7.543703810250266e-10, 1.815745659161734e-01 }, - { -7.370105913774597e-10, 1.551437208801425e-01 }, - { -7.196230536974697e-10, 1.286341643433767e-01 }, - { -7.022084226165876e-10, 1.020467793657360e-01 }, - { -6.847673537853251e-10, 7.538245468350446e-02 }, - { -6.673005038502516e-10, 4.864208468284503e-02 }, - { -6.498085304282128e-10, 2.182656936863137e-02 }, - { -6.322920920826137e-10, -5.063185663820913e-03 }, - { -6.147518482969490e-10, -3.202626926150343e-02 }, - { -5.971884594516681e-10, -5.906176474160862e-02 }, - { 
-5.796025867984469e-10, -8.616874992366363e-02 }, - { -5.619948924353588e-10, -1.133462971605448e-01 }, - { -5.443660392823640e-10, -1.405934733692621e-01 }, - { -5.267166910556339e-10, -1.679093400638023e-01 }, - { -5.090475122431451e-10, -1.952929533862739e-01 }, - { -4.913591680795342e-10, -2.227433641394564e-01 }, - { -4.736523245210571e-10, -2.502596178194491e-01 }, - { -4.559276482202303e-10, -2.778407546490776e-01 }, - { -4.381858065011618e-10, -3.054858096104932e-01 }, - { -4.204274673340870e-10, -3.331938124792702e-01 }, - { -4.026532993105397e-10, -3.609637878577768e-01 }, - { -3.848639716178888e-10, -3.887947552098022e-01 }, - { -3.670601540142443e-10, -4.166857288948674e-01 }, - { -3.492425168032583e-10, -4.446357182029681e-01 }, - { -3.314117308088734e-10, -4.726437273896633e-01 }, - { -3.135684673501752e-10, -5.007087557112619e-01 }, - { -2.957133982159296e-10, -5.288297974607742e-01 }, - { -2.778471956393828e-10, -5.570058420037128e-01 }, - { -2.599705322729564e-10, -5.852358738143247e-01 }, - { -2.420840811628366e-10, -6.135188725122560e-01 }, - { -2.241885157240923e-10, -6.418538128986450e-01 }, - { -2.062845097142585e-10, -6.702396649949099e-01 }, - { -1.883727372093546e-10, -6.986753940779493e-01 }, - { -1.704538725773087e-10, -7.271599607197149e-01 }, - { -1.525285904532877e-10, -7.556923208240308e-01 }, - { -1.345975657140748e-10, -7.842714256651911e-01 }, - { -1.166614734526054e-10, -8.128962219265712e-01 }, - { -9.872098895260891e-11, -8.415656517393372e-01 }, - { -8.077678766314517e-11, -8.702786527215916e-01 }, - { -6.282954517324612e-11, -8.990341580176152e-01 }, - { -4.487993718655790e-11, -9.278310963373758e-01 }, - { -2.692863949561210e-11, -9.566683919968972e-01 }, - { -8.976327956520795e-12, -9.855449649582175e-01 }, - { 8.976321536169872e-12, -1.014459730869357e+00 }, - { 2.692863307547294e-11, -1.043411601105914e+00 }, - { 4.487993076694813e-11, -1.072399482811314e+00 }, - { 6.282953875437751e-11, -1.101422278938424e+00 }, - { 8.077678124517653e-11, -1.130478888291020e+00 }, - { 9.872098253591082e-11, -1.159568205565684e+00 }, - { 1.166614670373367e-10, -1.188689121393192e+00 }, - { 1.345975593005002e-10, -1.217840522381901e+00 }, - { 1.525285840416718e-10, -1.247021291159495e+00 }, - { 1.704538661678104e-10, -1.276230306415868e+00 }, - { 1.883727308022916e-10, -1.305466442946703e+00 }, - { 2.062845033098954e-10, -1.334728571696106e+00 }, - { 2.241885093225349e-10, -1.364015559800721e+00 }, - { 2.420840747645085e-10, -1.393326270633325e+00 }, - { 2.599705258779635e-10, -1.422659563847049e+00 }, - { 2.778471892479898e-10, -1.452014295419243e+00 }, - { 2.957133918284542e-10, -1.481389317696831e+00 }, - { 3.135684609667761e-10, -1.510783479440191e+00 }, - { 3.314117244297624e-10, -1.540195625869043e+00 }, - { 3.492425104288060e-10, -1.569624598707558e+00 }, - { 3.670601476445565e-10, -1.599069236228850e+00 }, - { 3.848639652533361e-10, -1.628528373302631e+00 }, - { 4.026532929512281e-10, -1.658000841439269e+00 }, - { 4.204274609803869e-10, -1.687485468837799e+00 }, - { 4.381858001531792e-10, -1.716981080430596e+00 }, - { 4.559276418782829e-10, -1.746486497931567e+00 }, - { 4.736523181853565e-10, -1.776000539882225e+00 }, - { 4.913591617503452e-10, -1.805522021699094e+00 }, - { 5.090475059206794e-10, -1.835049755721194e+00 }, - { 5.267166847401562e-10, -1.864582551257262e+00 }, - { 5.443660329740862e-10, -1.894119214633676e+00 }, - { 5.619948861345454e-10, -1.923658549242818e+00 }, - { 5.796025805053097e-10, -1.953199355591180e+00 }, - { 5.971884531664190e-10, 
-1.982740431347091e+00 }, - { 6.147518420199055e-10, -2.012280571390674e+00 }, - { 6.322920858139346e-10, -2.041818567861395e+00 }, - { 6.498085241682158e-10, -2.071353210208005e+00 }, - { 6.673004975990425e-10, -2.100883285238127e+00 }, - { 6.847673475432746e-10, -2.130407577166309e+00 }, - { 7.022084163838545e-10, -2.159924867664933e+00 }, - { 7.196230474743716e-10, -2.189433935913779e+00 }, - { 7.370105851640495e-10, -2.218933558650552e+00 }, - { 7.543703748217808e-10, -2.248422510220072e+00 }, - { 7.717017628611672e-10, -2.277899562625407e+00 }, - { 7.890040967654542e-10, -2.307363485579104e+00 }, - { 8.062767251113011e-10, -2.336813046552684e+00 }, - { 8.235189975944034e-10, -2.366247010829556e+00 }, - { 8.407302650525749e-10, -2.395664141553858e+00 }, - { 8.579098794915287e-10, -2.425063199784153e+00 }, - { 8.750571941082773e-10, -2.454442944543319e+00 }, - { 8.921715633164894e-10, -2.483802132872044e+00 }, - { 9.092523427695200e-10, -2.513139519878584e+00 }, - { 9.262988893857148e-10, -2.542453858792682e+00 }, - { 9.433105613723914e-10, -2.571743901017465e+00 }, - { 9.602867182493987e-10, -2.601008396180870e+00 }, - { 9.772267208744730e-10, -2.630246092190425e+00 }, - { 9.941299314658458e-10, -2.659455735283526e+00 }, - { 1.010995713627070e-09, -2.688636070081818e+00 }, - { 1.027823432371055e-09, -2.717785839644439e+00 }, - { 1.044612454143997e-09, -2.746903785521352e+00 }, - { 1.061362146848353e-09, -2.775988647805256e+00 }, - { 1.078071879867828e-09, -2.805039165187255e+00 }, - { 1.094741024090249e-09, -2.834054075009077e+00 }, - { 1.111368951931856e-09, -2.863032113318052e+00 }, - { 1.127955037360817e-09, -2.891972014920939e+00 }, - { 1.144498655920037e-09, -2.920872513436805e+00 }, - { 1.160999184751779e-09, -2.949732341353290e+00 }, - { 1.177456002620215e-09, -2.978550230079517e+00 }, - { 1.193868489936097e-09, -3.007324910002949e+00 }, - { 1.210236028777826e-09, -3.036055110540183e+00 }, - { 1.226558002917232e-09, -3.064739560196251e+00 }, - { 1.242833797841123e-09, -3.093376986616735e+00 }, - { 1.259062800774685e-09, -3.121966116643377e+00 }, - { 1.275244400705935e-09, -3.150505676371791e+00 }, - { 1.291377988406056e-09, -3.178994391202159e+00 }, - { 1.307462956454857e-09, -3.207430985899192e+00 }, - { 1.323498699262108e-09, -3.235814184645077e+00 }, - { 1.339484613091842e-09, -3.264142711097884e+00 }, - { 1.355420096082785e-09, -3.292415288443373e+00 }, - { 1.371304548273191e-09, -3.320630639454825e+00 }, - { 1.387137371622433e-09, -3.348787486547389e+00 }, - { 1.402917970033511e-09, -3.376884551834256e+00 }, - { 1.418645749376393e-09, -3.404920557184582e+00 }, - { 1.434320117508396e-09, -3.432894224276359e+00 }, - { 1.449940484298756e-09, -3.460804274656981e+00 }, - { 1.465506261649108e-09, -3.488649429796768e+00 }, - { 1.481016863517580e-09, -3.516428411149154e+00 }, - { 1.496471705937951e-09, -3.544139940202303e+00 }, - { 1.511870207044433e-09, -3.571782738540999e+00 }, - { 1.527211787092206e-09, -3.599355527901174e+00 }, - { 1.542495868479076e-09, -3.626857030226671e+00 }, - { 1.557721875768920e-09, -3.654285967729458e+00 }, - { 1.572889235710329e-09, -3.681641062941412e+00 }, - { 1.587997377261005e-09, -3.708921038776707e+00 }, - { 1.603045731607830e-09, -3.736124618586623e+00 }, - { 1.618033732189314e-09, -3.763250526218862e+00 }, - { 1.632960814715177e-09, -3.790297486071938e+00 }, - { 1.647826417189275e-09, -3.817264223155802e+00 }, - { 1.662629979930247e-09, -3.844149463148589e+00 }, - { 1.677370945591844e-09, -3.870951932452996e+00 }, - { 1.692048759186008e-09, 
-3.897670358257890e+00 }, - { 1.706662868100504e-09, -3.924303468590212e+00 }, - { 1.721212722122685e-09, -3.950849992378278e+00 }, - { 1.735697773458400e-09, -3.977308659506432e+00 }, - { 1.750117476754591e-09, -4.003678200876669e+00 }, - { 1.764471289116712e-09, -4.029957348461003e+00 }, - { 1.778758670132079e-09, -4.056144835364877e+00 }, - { 1.792979081888926e-09, -4.082239395882965e+00 }, - { 1.807131988996465e-09, -4.108239765556996e+00 }, - { 1.821216858606652e-09, -4.134144681236933e+00 }, - { 1.835233160431175e-09, -4.159952881133585e+00 }, - { 1.849180366764537e-09, -4.185663104882633e+00 }, - { 1.863057952502055e-09, -4.211274093599509e+00 }, - { 1.876865395161145e-09, -4.236784589940537e+00 }, - { 1.890602174898734e-09, -4.262193338157148e+00 }, - { 1.904267774533022e-09, -4.287499084158302e+00 }, - { 1.917861679562008e-09, -4.312700575567174e+00 }, - { 1.931383378182392e-09, -4.337796561778708e+00 }, - { 1.944832361310856e-09, -4.362785794021793e+00 }, - { 1.958208122599839e-09, -4.387667025411434e+00 }, - { 1.971510158459931e-09, -4.412439011013396e+00 }, - { 1.984737968076495e-09, -4.437100507898339e+00 }, - { 1.997891053431005e-09, -4.461650275204912e+00 }, - { 2.010968919316289e-09, -4.486087074191693e+00 }, - { 2.023971073358447e-09, -4.510409668301784e+00 }, - { 2.036897026033634e-09, -4.534616823217992e+00 }, - { 2.049746290686799e-09, -4.558707306921882e+00 }, - { 2.062518383551274e-09, -4.582679889754607e+00 }, - { 2.075212823764071e-09, -4.606533344469879e+00 }, - { 2.087829133387063e-09, -4.630266446298172e+00 }, - { 2.100366837422912e-09, -4.653877973001258e+00 }, - { 2.112825463835087e-09, -4.677366704934605e+00 }, - { 2.125204543562522e-09, -4.700731425099899e+00 }, - { 2.137503610540056e-09, -4.723970919208608e+00 }, - { 2.149722201714786e-09, -4.747083975738060e+00 }, - { 2.161859857063438e-09, -4.770069385989595e+00 }, - { 2.173916119610994e-09, -4.792925944149308e+00 }, - { 2.185890535445098e-09, -4.815652447340950e+00 }, - { 2.197782653735957e-09, -4.838247695689436e+00 }, - { 2.209592026751962e-09, -4.860710492376411e+00 }, - { 2.221318209877576e-09, -4.883039643700314e+00 }, - { 2.232960761627846e-09, -4.905233959130168e+00 }, - { 2.244519243667616e-09, -4.927292251368517e+00 }, - { 2.255993220826402e-09, -4.949213336406265e+00 }, - { 2.267382261115285e-09, -4.970996033581527e+00 }, - { 2.278685935744269e-09, -4.992639165639563e+00 }, - { 2.289903819135414e-09, -5.014141558784778e+00 }, - { 2.301035488942000e-09, -5.035502042744443e+00 }, - { 2.312080526062763e-09, -5.056719450823151e+00 }, - { 2.323038514659161e-09, -5.077792619963239e+00 }, - { 2.333909042168180e-09, -5.098720390796817e+00 }, - { 2.344691699320969e-09, -5.119501607709159e+00 }, - { 2.355386080156553e-09, -5.140135118892792e+00 }, - { 2.365991782037187e-09, -5.160619776404897e+00 }, - { 2.376508405665132e-09, -5.180954436227641e+00 }, - { 2.386935555094626e-09, -5.201137958319343e+00 }, - { 2.397272837749508e-09, -5.221169206676762e+00 }, - { 2.407519864436774e-09, -5.241047049389645e+00 }, - { 2.417676249362563e-09, -5.260770358700167e+00 }, - { 2.427741610143750e-09, -5.280338011053974e+00 }, - { 2.437715567825576e-09, -5.299748887163106e+00 }, - { 2.447597746894037e-09, -5.319001872058887e+00 }, - { 2.457387775290440e-09, -5.338095855149190e+00 }, - { 2.467085284426756e-09, -5.357029730277389e+00 }, - { 2.476689909196263e-09, -5.375802395772283e+00 }, - { 2.486201287990485e-09, -5.394412754510426e+00 }, - { 2.495619062711154e-09, -5.412859713968929e+00 }, - { 2.504942878785408e-09, 
-5.431142186284682e+00 }, - { 2.514172385175743e-09, -5.449259088303476e+00 }, - { 2.523307234396791e-09, -5.467209341642627e+00 }, - { 2.532347082526785e-09, -5.484991872743321e+00 }, - { 2.541291589219998e-09, -5.502605612925014e+00 }, - { 2.550140417722072e-09, -5.520049498445633e+00 }, - { 2.558893234878378e-09, -5.537322470548212e+00 }, - { 2.567549711150773e-09, -5.554423475524196e+00 }, - { 2.576109520627371e-09, -5.571351464763084e+00 }, - { 2.584572341037361e-09, -5.588105394812198e+00 }, - { 2.592937853759161e-09, -5.604684227423386e+00 }, - { 2.601205743836355e-09, -5.621086929615246e+00 }, - { 2.609375699987564e-09, -5.637312473723475e+00 }, - { 2.617447414618146e-09, -5.653359837454964e+00 }, - { 2.625420583833750e-09, -5.669228003945694e+00 }, - { 2.633294907447937e-09, -5.684915961806963e+00 }, - { 2.641070088997271e-09, -5.700422705186584e+00 }, - { 2.648745835750128e-09, -5.715747233817712e+00 }, - { 2.656321858720176e-09, -5.730888553077074e+00 }, - { 2.663797872673252e-09, -5.745845674030161e+00 }, - { 2.671173596142054e-09, -5.760617613492118e+00 }, - { 2.678448751434797e-09, -5.775203394076705e+00 }, - { 2.685623064645538e-09, -5.789602044248679e+00 }, - { 2.692696265666640e-09, -5.803812598380606e+00 }, - { 2.699668088194915e-09, -5.817834096797069e+00 }, - { 2.706538269745573e-09, -5.831665585834668e+00 }, - { 2.713306551659817e-09, -5.845306117889361e+00 }, - { 2.719972679116734e-09, -5.858754751472542e+00 }, - { 2.726536401139295e-09, -5.872010551255358e+00 }, - { 2.732997470607439e-09, -5.885072588127400e+00 }, - { 2.739355644265558e-09, -5.897939939244211e+00 }, - { 2.745610682731633e-09, -5.910611688078208e+00 }, - { 2.751762350508137e-09, -5.923086924473290e+00 }, - { 2.757810415987146e-09, -5.935364744687794e+00 }, - { 2.763754651462700e-09, -5.947444251452243e+00 }, - { 2.769594833137415e-09, -5.959324554015538e+00 }, - { 2.775330741132843e-09, -5.971004768198829e+00 }, - { 2.780962159494174e-09, -5.982484016437981e+00 }, - { 2.786488876202047e-09, -5.993761427840588e+00 }, - { 2.791910683178690e-09, -6.004836138231525e+00 }, - { 2.797227376295779e-09, -6.015707290202086e+00 }, - { 2.802438755383971e-09, -6.026374033162623e+00 }, - { 2.807544624236659e-09, -6.036835523383457e+00 }, - { 2.812544790621093e-09, -6.047090924050914e+00 }, - { 2.817439066283459e-09, -6.057139405311101e+00 }, - { 2.822227266958278e-09, -6.066980144322601e+00 }, - { 2.826909212371261e-09, -6.076612325295799e+00 }, - { 2.831484726250221e-09, -6.086035139548830e+00 }, - { 2.835953636329660e-09, -6.095247785550617e+00 }, - { 2.840315774357203e-09, -6.104249468967751e+00 }, - { 2.844570976102082e-09, -6.113039402715685e+00 }, - { 2.848719081357095e-09, -6.121616806996519e+00 }, - { 2.852759933948860e-09, -6.129980909353977e+00 }, - { 2.856693381741114e-09, -6.138130944714082e+00 }, - { 2.860519276643053e-09, -6.146066155436312e+00 }, - { 2.864237474610633e-09, -6.153785791350256e+00 }, - { 2.867847835656203e-09, -6.161289109809551e+00 }, - { 2.871350223851726e-09, -6.168575375732642e+00 }, - { 2.874744507333867e-09, -6.175643861647406e+00 }, - { 2.878030558310989e-09, -6.182493847739853e+00 }, - { 2.881208253063899e-09, -6.189124621889823e+00 }, - { 2.884277471954592e-09, -6.195535479723423e+00 }, - { 2.887238099428306e-09, -6.201725724651554e+00 }, - { 2.890090024020323e-09, -6.207694667918394e+00 }, - { 2.892833138356060e-09, -6.213441628635915e+00 }, - { 2.895467339159240e-09, -6.218965933835304e+00 }, - { 2.897992527253659e-09, -6.224266918505075e+00 }, - { 2.900408607567016e-09, 
-6.229343925633495e+00 }, - { 2.902715489136496e-09, -6.234196306254763e+00 }, - { 2.904913085108075e-09, -6.238823419482017e+00 }, - { 2.907001312743911e-09, -6.243224632557377e+00 }, - { 2.908980093422997e-09, -6.247399320887848e+00 }, - { 2.910849352646620e-09, -6.251346868091392e+00 }, - { 2.912609020036956e-09, -6.255066666028537e+00 }, - { 2.914259029343965e-09, -6.258558114851525e+00 }, - { 2.915799318445710e-09, -6.261820623039620e+00 }, - { 2.917229829350759e-09, -6.264853607438842e+00 }, - { 2.918550508202463e-09, -6.267656493305673e+00 }, - { 2.919761305276718e-09, -6.270228714337005e+00 }, - { 2.920862174988150e-09, -6.272569712717951e+00 }, - { 2.921853075889193e-09, -6.274678939154603e+00 }, - { 2.922733970674264e-09, -6.276555852917634e+00 }, - { 2.923504826176907e-09, -6.278199921870962e+00 }, - { 2.924165613375264e-09, -6.279610622518139e+00 }, - { 2.924716307391075e-09, -6.280787440034993e+00 }, - { 2.925156887490598e-09, -6.281729868306345e+00 }, - { 2.925487337087508e-09, -6.282437409966992e+00 }, - { 2.925707643739298e-09, -6.282909576428774e+00 }, - { 2.925817799151970e-09, -6.283145887925411e+00 }, + {2.925817799165007e-09, 7.219194364267018e-09}, + {2.925707643778599e-09, 2.526699001579799e-07}, + {2.925487337153070e-09, 1.191140162167675e-06}, + {2.925156887582842e-09, 3.284585035595589e-06}, + {2.924716307509151e-09, 6.994872605695784e-06}, + {2.924165613519592e-09, 1.278374920658798e-05}, + {2.923504826347475e-09, 2.111280464718590e-05}, + {2.922733970871080e-09, 3.244343744537165e-05}, + {2.921853076112655e-09, 4.723682007436170e-05}, + {2.920862175237416e-09, 6.595386421935634e-05}, + {2.919761305552202e-09, 8.905518605213658e-05}, + {2.918550508504146e-09, 1.170010715193098e-04}, + {2.917229829679050e-09, 1.502514416517192e-04}, + {2.915799318799769e-09, 1.892658178912071e-04}, + {2.914259029724184e-09, 2.345032874456615e-04}, + {2.912609020443340e-09, 2.864224686607020e-04}, + {2.910849353079123e-09, 3.454814764261432e-04}, + {2.908980093882049e-09, 4.121378876027343e-04}, + {2.907001313228646e-09, 4.868487064877691e-04}, + {2.904913085618902e-09, 5.700703303049837e-04}, + {2.902715489673383e-09, 6.622585147355725e-04}, + {2.900408608130373e-09, 7.638683394782519e-04}, + {2.897992527842612e-09, 8.753541738578119e-04}, + {2.895467339774186e-09, 9.971696424604937e-04}, + {2.892833138996999e-09, 1.129767590823255e-03}, + {2.890090024687216e-09, 1.273600051161478e-03}, + {2.887238100121550e-09, 1.429118208142094e-03}, + {2.884277472673313e-09, 1.596772364709564e-03}, + {2.881208253808507e-09, 1.777011907950626e-03}, + {2.878030559081432e-09, 1.970285275029487e-03}, + {2.874744508130554e-09, 2.177039919152579e-03}, + {2.871350224673798e-09, 2.397722275614272e-03}, + {2.867847836504030e-09, 2.632777727878843e-03}, + {2.864237475484149e-09, 2.882650573737405e-03}, + {2.860519277542297e-09, 3.147783991507308e-03}, + {2.856693382666432e-09, 3.428620006328931e-03}, + {2.852759934899389e-09, 3.725599456482154e-03}, + {2.848719082333207e-09, 4.039161959812243e-03}, + {2.844570977103752e-09, 4.369745880190706e-03}, + {2.840315775384800e-09, 4.717788294077374e-03}, + {2.835953637382310e-09, 5.083724957128360e-03}, + {2.831484727328322e-09, 5.467990270896617e-03}, + {2.826909213474759e-09, 5.871017249604038e-03}, + {2.822227268087134e-09, 6.293237486988512e-03}, + {2.817439067438018e-09, 6.735081123237729e-03}, + {2.812544791800534e-09, 7.196976811989608e-03}, + {2.807544625441273e-09, 7.679351687456759e-03}, + {2.802438756613836e-09, 8.182631331563162e-03}, + 
{2.797227377551135e-09, 8.707239741274575e-03}, + {2.791910684458716e-09, 9.253599295902304e-03}, + {2.786488877507140e-09, 9.822130724578715e-03}, + {2.780962160824228e-09, 1.041325307382490e-02}, + {2.775330742487884e-09, 1.102738367513773e-02}, + {2.769594834517682e-09, 1.166493811278924e-02}, + {2.763754652867477e-09, 1.232633019159818e-02}, + {2.757810417416620e-09, 1.301197190494069e-02}, + {2.751762351962413e-09, 1.372227340270610e-02}, + {2.745610684210923e-09, 1.445764295952962e-02}, + {2.739355645769094e-09, 1.521848694296229e-02}, + {2.732997472135539e-09, 1.600520978188769e-02}, + {2.726536402691907e-09, 1.681821393496225e-02}, + {2.719972680693777e-09, 1.765789985920713e-02}, + {2.713306553261610e-09, 1.852466597868779e-02}, + {2.706538271371373e-09, 1.941890865333146e-02}, + {2.699668089844909e-09, 2.034102214787814e-02}, + {2.692696267340880e-09, 2.129139860085272e-02}, + {2.685623066344263e-09, 2.227042799383416e-02}, + {2.678448753157212e-09, 2.327849812064098e-02}, + {2.671173597888530e-09, 2.431599455681316e-02}, + {2.663797874443630e-09, 2.538330062913108e-02}, + {2.656321860514457e-09, 2.648079738524795e-02}, + {2.648745837568575e-09, 2.760886356354952e-02}, + {2.641070090839117e-09, 2.876787556300114e-02}, + {2.633294909313421e-09, 2.995820741329835e-02}, + {2.625420585722845e-09, 3.118023074495535e-02}, + {2.617447416531143e-09, 3.243431475972608e-02}, + {2.609375701923643e-09, 3.372082620101990e-02}, + {2.601205745795833e-09, 3.504012932452527e-02}, + {2.592937855741933e-09, 3.639258586895711e-02}, + {2.584572343043400e-09, 3.777855502693250e-02}, + {2.576109522656942e-09, 3.919839341605197e-02}, + {2.567549713203028e-09, 4.065245505002102e-02}, + {2.558893236953688e-09, 4.214109131001403e-02}, + {2.550140419820252e-09, 4.366465091617666e-02}, + {2.541291591341445e-09, 4.522347989919473e-02}, + {2.532347084670572e-09, 4.681792157215026e-02}, + {2.523307236563343e-09, 4.844831650239501e-02}, + {2.514172387364900e-09, 5.011500248369893e-02}, + {2.504942880997064e-09, 5.181831450849345e-02}, + {2.495619064945627e-09, 5.355858474024022e-02}, + {2.486201290246928e-09, 5.533614248606705e-02}, + {2.476689911475047e-09, 5.715131416942842e-02}, + {2.467085286727668e-09, 5.900442330315692e-02}, + {2.457387777613798e-09, 6.089579046229943e-02}, + {2.447597749239101e-09, 6.282573325755320e-02}, + {2.437715570192557e-09, 6.479456630859221e-02}, + {2.427741612532542e-09, 6.680260121764925e-02}, + {2.417676251773166e-09, 6.885014654319160e-02}, + {2.407519866869294e-09, 7.093750777401114e-02}, + {2.397272840203310e-09, 7.306498730310884e-02}, + {2.386935557569868e-09, 7.523288440214027e-02}, + {2.376508408161815e-09, 7.744149519577415e-02}, + {2.365991784555363e-09, 7.969111263635709e-02}, + {2.355386082695641e-09, 8.198202647865405e-02}, + {2.344691701881232e-09, 8.431452325495814e-02}, + {2.333909044749407e-09, 8.668888625021409e-02}, + {2.323038517261246e-09, 8.910539547731611e-02}, + {2.312080528685971e-09, 9.156432765274414e-02}, + {2.301035491585642e-09, 9.406595617227698e-02}, + {2.289903821799651e-09, 9.661055108691619e-02}, + {2.278685938428940e-09, 9.919837907903295e-02}, + {2.267382263820762e-09, 1.018297034385580e-01}, + {2.255993223551837e-09, 1.045047840397028e-01}, + {2.244519246413220e-09, 1.072238773174577e-01}, + {2.232960764393620e-09, 1.099872362446146e-01}, + {2.221318212663309e-09, 1.127951103088245e-01}, + {2.209592029557811e-09, 1.156477454898748e-01}, + {2.197782656561395e-09, 1.185453842371912e-01}, + {2.185890538290176e-09, 1.214882654476019e-01}, + 
{2.173916122475606e-09, 1.244766244431883e-01}, + {2.161859859947797e-09, 1.275106929493488e-01}, + {2.149722204618256e-09, 1.305906990731841e-01}, + {2.137503613462743e-09, 1.337168672820376e-01}, + {2.125204546504321e-09, 1.368894183821595e-01}, + {2.112825466795944e-09, 1.401085694976751e-01}, + {2.100366840402933e-09, 1.433745340497602e-01}, + {2.087829136385612e-09, 1.466875217359607e-01}, + {2.075212826781308e-09, 1.500477385098620e-01}, + {2.062518386587093e-09, 1.534553865607503e-01}, + {2.049746293741359e-09, 1.569106642937665e-01}, + {2.036897029106193e-09, 1.604137663100403e-01}, + {2.023971076449323e-09, 1.639648833871233e-01}, + {2.010968922425217e-09, 1.675642024598467e-01}, + {1.997891056557933e-09, 1.712119066008896e-01}, + {1.984737971221581e-09, 1.749081750021970e-01}, + {1.971510161622434e-09, 1.786531829561379e-01}, + {1.958208125780130e-09, 1.824471018371070e-01}, + {1.944832364508511e-09, 1.862900990834311e-01}, + {1.931383381397782e-09, 1.901823381790926e-01}, + {1.917861682794392e-09, 1.941239786363039e-01}, + {1.904267777782611e-09, 1.981151759777950e-01}, + {1.890602178165317e-09, 2.021560817195309e-01}, + {1.876865398444616e-09, 2.062468433536743e-01}, + {1.863057955802572e-09, 2.103876043317229e-01}, + {1.849180370081465e-09, 2.145785040479915e-01}, + {1.835233163764673e-09, 2.188196778231083e-01}, + {1.821216861956509e-09, 2.231112568880342e-01}, + {1.807131992362945e-09, 2.274533683680190e-01}, + {1.792979085271234e-09, 2.318461352671018e-01}, + {1.778758673530482e-09, 2.362896764525300e-01}, + {1.764471292530943e-09, 2.407841066397789e-01}, + {1.750117480184598e-09, 2.453295363773890e-01}, + {1.735697776904342e-09, 2.499260720324433e-01}, + {1.721212725583874e-09, 2.545738157760434e-01}, + {1.706662871577097e-09, 2.592728655691494e-01}, + {1.692048762677849e-09, 2.640233151485341e-01}, + {1.677370949099090e-09, 2.688252540131204e-01}, + {1.662629983452104e-09, 2.736787674105404e-01}, + {1.647826420726167e-09, 2.785839363237506e-01}, + {1.632960818266680e-09, 2.835408374583758e-01}, + {1.618033735755429e-09, 2.885495432295704e-01}, + {1.603045735188609e-09, 2.936101217498361e-01}, + {1.587997380855918e-09, 2.987226368167127e-01}, + {1.572889239319430e-09, 3.038871479007593e-01}, + {1.557721879392051e-09, 3.091037101339017e-01}, + {1.542495872116447e-09, 3.143723742978435e-01}, + {1.527211790743024e-09, 3.196931868130269e-01}, + {1.511870210708909e-09, 3.250661897274744e-01}, + {1.496471709615926e-09, 3.304914207062036e-01}, + {1.481016867208896e-09, 3.359689130207621e-01}, + {1.465506265353924e-09, 3.414986955389885e-01}, + {1.449940488016384e-09, 3.470807927151147e-01}, + {1.434320121238994e-09, 3.527152245800635e-01}, + {1.418645753119802e-09, 3.584020067320109e-01}, + {1.402917973789838e-09, 3.641411503272979e-01}, + {1.387137375391042e-09, 3.699326620714776e-01}, + {1.371304552054134e-09, 3.757765442106153e-01}, + {1.355420099875958e-09, 3.816727945230153e-01}, + {1.339484616897137e-09, 3.876214063110671e-01}, + {1.323498703079580e-09, 3.936223683933865e-01}, + {1.307462960283922e-09, 3.996756650972121e-01}, + {1.291377992246768e-09, 4.057812762511174e-01}, + {1.275244404558188e-09, 4.119391771778626e-01}, + {1.259062804638585e-09, 4.181493386877248e-01}, + {1.242833801715929e-09, 4.244117270719281e-01}, + {1.226558006803155e-09, 4.307263040962509e-01}, + {1.210236032674760e-09, 4.370930269951803e-01}, + {1.193868493843725e-09, 4.435118484661861e-01}, + {1.177456006538695e-09, 4.499827166641340e-01}, + {1.160999188680582e-09, 4.565055751961679e-01}, + 
{1.144498659859216e-09, 4.630803631168164e-01}, + {1.127955041310214e-09, 4.697070149232604e-01}, + {1.111368955891417e-09, 4.763854605510119e-01}, + {1.094741028059551e-09, 4.831156253697562e-01}, + {1.078071883846871e-09, 4.898974301794375e-01}, + {1.061362150836978e-09, 4.967307912069362e-01}, + {1.044612458142151e-09, 5.036156201023686e-01}, + {1.027823436378632e-09, 5.105518239364775e-01}, + {1.010995717643647e-09, 5.175393051975563e-01}, + {9.941299354913699e-10, 5.245779617890562e-01}, + {9.772267249089968e-10, 5.316676870274011e-01}, + {9.602867222926046e-10, 5.388083696401416e-01}, + {9.433105654240147e-10, 5.459998937639375e-01}, + {9.262988934458084e-10, 5.532421389435711e-01}, + {9.092523468378193e-10, 5.605349801305876e-01}, + {8.921715673928355e-10, 5.678782876825250e-01}, + {8.750571981926701e-10, 5.752719273622372e-01}, + {8.579098835836508e-10, 5.827157603377209e-01}, + {8.407302691522673e-10, 5.902096431821322e-01}, + {8.235190017016133e-10, 5.977534278737073e-01}, + {8.062767292259225e-10, 6.053469617967722e-01}, + {7.890041008871165e-10, 6.129900877421282e-01}, + {7.717017669898175e-10, 6.206826439083659e-01}, + {7.543703789572603e-10, 6.284244639030392e-01}, + {7.370105893063053e-10, 6.362153767444958e-01}, + {7.196230516231919e-10, 6.440552068636356e-01}, + {7.022084205389746e-10, 6.519437741060674e-01}, + {6.847673517046416e-10, 6.598808937346672e-01}, + {6.673005017664976e-10, 6.678663764322770e-01}, + {6.498085283416530e-10, 6.759000283046127e-01}, + {6.322920899929834e-10, 6.839816508836737e-01}, + {6.147518462045659e-10, 6.921110411311926e-01}, + {5.971884573565851e-10, 7.002879914425926e-01}, + {5.796025847007168e-10, 7.085122896509806e-01}, + {5.619948903351406e-10, 7.167837190315758e-01}, + {5.443660371796048e-10, 7.251020583063744e-01}, + {5.267166889504394e-10, 7.334670816491009e-01}, + {5.090475101356742e-10, 7.418785586903696e-01}, + {4.913591659698399e-10, 7.503362545232619e-01}, + {4.736523224091392e-10, 7.588399297089872e-01}, + {4.559276461062478e-10, 7.673893402829834e-01}, + {4.381858043851147e-10, 7.759842377612828e-01}, + {4.204274652161870e-10, 7.846243691469355e-01}, + {4.026532971908398e-10, 7.933094769370790e-01}, + {3.848639694963359e-10, 8.020392991300200e-01}, + {3.670601518910503e-10, 8.108135692324444e-01}, + {3.492425146784233e-10, 8.196320162675177e-01}, + {3.314117286825031e-10, 8.284943647824689e-01}, + {3.135684652223755e-10, 8.374003348569865e-01}, + {2.957133960867535e-10, 8.463496421118015e-01}, + {2.778471935089361e-10, 8.553419977173513e-01}, + {2.599705301412391e-10, 8.643771084029740e-01}, + {2.420840790301135e-10, 8.734546764660205e-01}, + {2.241885135902046e-10, 8.825743997817682e-01}, + {2.062845075795238e-10, 8.917359718130367e-01}, + {1.883727350736140e-10, 9.009390816205823e-01}, + {1.704538704408269e-10, 9.101834138731877e-01}, + {1.525285883160648e-10, 9.194686488588080e-01}, + {1.345975635762696e-10, 9.287944624950824e-01}, + {1.166614713141648e-10, 9.381605263410157e-01}, + {9.872098681369190e-11, 9.475665076080466e-01}, + {8.077678552380464e-11, 9.570120691722380e-01}, + {6.282954303364090e-11, 9.664968695860140e-01}, + {4.487993504668797e-11, 9.760205630906909e-01}, + {2.692863735553042e-11, 9.855827996289697e-01}, + {8.976325816439114e-12, 9.951832248577780e-01}, + {-8.976323676304494e-12, 1.004821480161519e+00}, + {-2.692863521550168e-11, 1.014497202665280e+00}, + {-4.487993290681805e-11, 1.024210025248670e+00}, + {-6.282954089398273e-11, 1.033959576559617e+00}, + {-8.077678338451706e-11, 1.043745481028715e+00}, + 
{-9.872098467477489e-11, 1.053567358883467e+00}, + {-1.166614691757772e-10, 1.063424826163223e+00}, + {-1.345975614383584e-10, 1.073317494734013e+00}, + {-1.525285861788948e-10, 1.083244972303963e+00}, + {-1.704538683042922e-10, 1.093206862438572e+00}, + {-1.883727329379793e-10, 1.103202764576806e+00}, + {-2.062845054446831e-10, 1.113232274046796e+00}, + {-2.241885114563697e-10, 1.123294982082432e+00}, + {-2.420840768973375e-10, 1.133390475839767e+00}, + {-2.599705280096278e-10, 1.143518338413855e+00}, + {-2.778471913784365e-10, 1.153678148855860e+00}, + {-2.957133939575774e-10, 1.163869482190458e+00}, + {-3.135684630945758e-10, 1.174091909433296e+00}, + {-3.314117265561857e-10, 1.184344997608959e+00}, + {-3.492425125535882e-10, 1.194628309769018e+00}, + {-3.670601497678034e-10, 1.204941405010466e+00}, + {-3.848639673748360e-10, 1.215283838494269e+00}, + {-4.026532950710339e-10, 1.225655161464298e+00}, + {-4.204274630982869e-10, 1.236054921266445e+00}, + {-4.381858022691734e-10, 1.246482661367958e+00}, + {-4.559276439922654e-10, 1.256937921377146e+00}, + {-4.736523202972214e-10, 1.267420237063216e+00}, + {-4.913591638600925e-10, 1.277929140376502e+00}, + {-5.090475080282032e-10, 1.288464159468706e+00}, + {-5.267166868452449e-10, 1.299024818713528e+00}, + {-5.443660350768455e-10, 1.309610638727845e+00}, + {-5.619948882348695e-10, 1.320221136392390e+00}, + {-5.796025826029868e-10, 1.330855824873457e+00}, + {-5.971884552615020e-10, 1.341514213644420e+00}, + {-6.147518441122357e-10, 1.352195808507556e+00}, + {-6.322920879034590e-10, 1.362900111616144e+00}, + {-6.498085262549874e-10, 1.373626621496939e+00}, + {-6.673004996827436e-10, 1.384374833072571e+00}, + {-6.847673496239581e-10, 1.395144237684605e+00}, + {-7.022084184613616e-10, 1.405934323116231e+00}, + {-7.196230495488082e-10, 1.416744573616104e+00}, + {-7.370105872352039e-10, 1.427574469921397e+00}, + {-7.543703768894941e-10, 1.438423489281758e+00}, + {-7.717017649255453e-10, 1.449291105483472e+00}, + {-7.890040988262324e-10, 1.460176788873383e+00}, + {-8.062767271686383e-10, 1.471080006383765e+00}, + {-8.235189996479819e-10, 1.482000221556656e+00}, + {-8.407302671024475e-10, 1.492936894569018e+00}, + {-8.579098815375368e-10, 1.503889482257845e+00}, + {-8.750571961505266e-10, 1.514857438145604e+00}, + {-8.921715653546624e-10, 1.525840212465756e+00}, + {-9.092523448036167e-10, 1.536837252188703e+00}, + {-9.262988914157881e-10, 1.547848001047890e+00}, + {-9.433105633981766e-10, 1.558871899565883e+00}, + {-9.602867202711075e-10, 1.569908385081254e+00}, + {-9.772267228916820e-10, 1.580956891774897e+00}, + {-9.941299334786078e-10, 1.592016850697478e+00}, + {-1.010995715635332e-09, 1.603087689796053e+00}, + {-1.027823434374870e-09, 1.614168833942028e+00}, + {-1.044612456143047e-09, 1.625259704958335e+00}, + {-1.061362148842745e-09, 1.636359721647526e+00}, + {-1.078071881857297e-09, 1.647468299819543e+00}, + {-1.094741026074900e-09, 1.658584852320419e+00}, + {-1.111368953911690e-09, 1.669708789060341e+00}, + {-1.127955039335462e-09, 1.680839517042381e+00}, + {-1.144498657889600e-09, 1.691976440391624e+00}, + {-1.160999186716154e-09, 1.703118960383971e+00}, + {-1.177456004579561e-09, 1.714266475475616e+00}, + {-1.193868491889832e-09, 1.725418381332405e+00}, + {-1.210236030726319e-09, 1.736574070859850e+00}, + {-1.226558004860220e-09, 1.747732934232508e+00}, + {-1.242833799778447e-09, 1.758894358924547e+00}, + {-1.259062802706714e-09, 1.770057729740021e+00}, + {-1.275244402631982e-09, 1.781222428842935e+00}, + {-1.291377990326492e-09, 
1.792387835788660e+00}, + {-1.307462958369363e-09, 1.803553327553897e+00}, + {-1.323498701170897e-09, 1.814718278568759e+00}, + {-1.339484614994490e-09, 1.825882060747428e+00}, + {-1.355420097979292e-09, 1.837044043519582e+00}, + {-1.371304550163662e-09, 1.848203593862598e+00}, + {-1.387137373506711e-09, 1.859360076332671e+00}, + {-1.402917971911754e-09, 1.870512853097495e+00}, + {-1.418645751248018e-09, 1.881661283967967e+00}, + {-1.434320119373722e-09, 1.892804726431080e+00}, + {-1.449940486157623e-09, 1.903942535681972e+00}, + {-1.465506263501516e-09, 1.915074064656886e+00}, + {-1.481016865363264e-09, 1.926198664066737e+00}, + {-1.496471707776859e-09, 1.937315682428795e+00}, + {-1.511870208876724e-09, 1.948424466101625e+00}, + {-1.527211788917509e-09, 1.959524359317042e+00}, + {-1.542495870297867e-09, 1.970614704215133e+00}, + {-1.557721877580406e-09, 1.981694840876775e+00}, + {-1.572889237514880e-09, 1.992764107358707e+00}, + {-1.587997379058514e-09, 2.003821839726753e+00}, + {-1.603045733398246e-09, 2.014867372090665e+00}, + {-1.618033733972424e-09, 2.025900036638798e+00}, + {-1.632960816490822e-09, 2.036919163671778e+00}, + {-1.647826418957721e-09, 2.047924081638631e+00}, + {-1.662629981691070e-09, 2.058914117170269e+00}, + {-1.677370947345626e-09, 2.069888595116115e+00}, + {-1.692048760931849e-09, 2.080846838577820e+00}, + {-1.706662869838827e-09, 2.091788168946183e+00}, + {-1.721212723853279e-09, 2.102711905935372e+00}, + {-1.735697775181424e-09, 2.113617367619504e+00}, + {-1.750117478469621e-09, 2.124503870468520e+00}, + {-1.764471290823748e-09, 2.135370729383332e+00}, + {-1.778758671831281e-09, 2.146217257733207e+00}, + {-1.792979083579974e-09, 2.157042767390815e+00}, + {-1.807131990679890e-09, 2.167846568770014e+00}, + {-1.821216860281448e-09, 2.178627970860822e+00}, + {-1.835233162097977e-09, 2.189386281268046e+00}, + {-1.849180368423027e-09, 2.200120806246095e+00}, + {-1.863057954152340e-09, 2.210830850737588e+00}, + {-1.876865396802907e-09, 2.221515718409926e+00}, + {-1.890602176531920e-09, 2.232174711691990e+00}, + {-1.904267776157843e-09, 2.242807131812679e+00}, + {-1.917861681178094e-09, 2.253412278837029e+00}, + {-1.931383379790273e-09, 2.263989451705295e+00}, + {-1.944832362909578e-09, 2.274537948269257e+00}, + {-1.958208124189984e-09, 2.285057065331676e+00}, + {-1.971510160041235e-09, 2.295546098682665e+00}, + {-1.984737969649064e-09, 2.306004343138794e+00}, + {-1.997891054994522e-09, 2.316431092581699e+00}, + {-2.010968920870647e-09, 2.326825639994779e+00}, + {-2.023971074903858e-09, 2.337187277503834e+00}, + {-2.036897027569834e-09, 2.347515296413520e+00}, + {-2.049746292214264e-09, 2.357808987247877e+00}, + {-2.062518385069210e-09, 2.368067639787542e+00}, + {-2.075212825272584e-09, 2.378290543109652e+00}, + {-2.087829134886364e-09, 2.388476985626922e+00}, + {-2.100366838912949e-09, 2.398626255125417e+00}, + {-2.112825465315542e-09, 2.408737638805759e+00}, + {-2.125204545033289e-09, 2.418810423320288e+00}, + {-2.137503612001452e-09, 2.428843894814472e+00}, + {-2.149722203166389e-09, 2.438837338964302e+00}, + {-2.161859858505829e-09, 2.448790041018174e+00}, + {-2.173916121043380e-09, 2.458701285834241e+00}, + {-2.185890536867478e-09, 2.468570357921585e+00}, + {-2.197782655148702e-09, 2.478396541480230e+00}, + {-2.209592028154913e-09, 2.488179120439544e+00}, + {-2.221318211270522e-09, 2.497917378500214e+00}, + {-2.232960763010574e-09, 2.507610599172123e+00}, + {-2.244519245040444e-09, 2.517258065817044e+00}, + {-2.255993222189014e-09, 2.526859061686102e+00}, + 
{-2.267382262468209e-09, 2.536412869962689e+00}, + {-2.278685937086658e-09, 2.545918773800664e+00}, + {-2.289903820467374e-09, 2.555376056366064e+00}, + {-2.301035490263848e-09, 2.564784000877677e+00}, + {-2.312080527374447e-09, 2.574141890646339e+00}, + {-2.323038515960257e-09, 2.583449009117307e+00}, + {-2.333909043458635e-09, 2.592704639909166e+00}, + {-2.344691700601153e-09, 2.601908066856634e+00}, + {-2.355386081425938e-09, 2.611058574048749e+00}, + {-2.365991783296513e-09, 2.620155445872768e+00}, + {-2.376508406913500e-09, 2.629197967052127e+00}, + {-2.386935556332088e-09, 2.638185422689490e+00}, + {-2.397272838976436e-09, 2.647117098307332e+00}, + {-2.407519865653114e-09, 2.655992279887846e+00}, + {-2.417676250567891e-09, 2.664810253915885e+00}, + {-2.427741611338014e-09, 2.673570307418169e+00}, + {-2.437715569009093e-09, 2.682271728006635e+00}, + {-2.447597748066437e-09, 2.690913803917100e+00}, + {-2.457387776452357e-09, 2.699495824053297e+00}, + {-2.467085285577292e-09, 2.708017078025636e+00}, + {-2.476689910335470e-09, 2.716476856194105e+00}, + {-2.486201289118733e-09, 2.724874449709689e+00}, + {-2.495619063828443e-09, 2.733209150554255e+00}, + {-2.504942879891263e-09, 2.741480251583985e+00}, + {-2.514172386270163e-09, 2.749687046568741e+00}, + {-2.523307235480146e-09, 2.757828830235740e+00}, + {-2.532347083598520e-09, 2.765904898308531e+00}, + {-2.541291590280960e-09, 2.773914547551261e+00}, + {-2.550140418771202e-09, 2.781857075807392e+00}, + {-2.558893235915887e-09, 2.789731782043156e+00}, + {-2.567549712176927e-09, 2.797537966388929e+00}, + {-2.576109521642196e-09, 2.805274930179221e+00}, + {-2.584572342040407e-09, 2.812941975996573e+00}, + {-2.592937854750428e-09, 2.820538407710556e+00}, + {-2.601205744816134e-09, 2.828063530521908e+00}, + {-2.609375700955458e-09, 2.835516651001539e+00}, + {-2.617447415574869e-09, 2.842897077134583e+00}, + {-2.625420584778350e-09, 2.850204118359573e+00}, + {-2.633294908380520e-09, 2.857437085611509e+00}, + {-2.641070089918234e-09, 2.864595291363663e+00}, + {-2.648745836659391e-09, 2.871678049666939e+00}, + {-2.656321859617343e-09, 2.878684676194483e+00}, + {-2.663797873558322e-09, 2.885614488280000e+00}, + {-2.671173597015318e-09, 2.892466804962122e+00}, + {-2.678448752295859e-09, 2.899240947023252e+00}, + {-2.685623065495139e-09, 2.905936237033475e+00}, + {-2.692696266503800e-09, 2.912551999389617e+00}, + {-2.699668089019767e-09, 2.919087560358171e+00}, + {-2.706538270558513e-09, 2.925542248116882e+00}, + {-2.713306552460767e-09, 2.931915392794031e+00}, + {-2.719972679905295e-09, 2.938206326512581e+00}, + {-2.726536401915442e-09, 2.944414383428562e+00}, + {-2.732997471371516e-09, 2.950538899775061e+00}, + {-2.739355645017194e-09, 2.956579213900666e+00}, + {-2.745610683471516e-09, 2.962534666313284e+00}, + {-2.751762351235315e-09, 2.968404599718795e+00}, + {-2.757810416701751e-09, 2.974188359063684e+00}, + {-2.763754652165128e-09, 2.979885291576143e+00}, + {-2.769594833827588e-09, 2.985494746805227e+00}, + {-2.775330741810390e-09, 2.991016076664491e+00}, + {-2.780962160159068e-09, 2.996448635469842e+00}, + {-2.786488876854607e-09, 3.001791779983262e+00}, + {-2.791910683818570e-09, 3.007044869450794e+00}, + {-2.797227376923695e-09, 3.012207265645876e+00}, + {-2.802438755998943e-09, 3.017278332907412e+00}, + {-2.807544624838820e-09, 3.022257438182037e+00}, + {-2.812544791210840e-09, 3.027143951064684e+00}, + {-2.817439066860792e-09, 3.031937243837070e+00}, + {-2.822227267522746e-09, 3.036636691510884e+00}, + {-2.826909212922864e-09, 
3.041241671864994e+00}, + {-2.831484726789317e-09, 3.045751565488710e+00}, + {-2.835953636855826e-09, 3.050165755818853e+00}, + {-2.840315774871260e-09, 3.054483629182857e+00}, + {-2.844570976602957e-09, 3.058704574835744e+00}, + {-2.848719081844986e-09, 3.062827985002047e+00}, + {-2.852759934424164e-09, 3.066853254915581e+00}, + {-2.856693382203833e-09, 3.070779782857041e+00}, + {-2.860519277092708e-09, 3.074606970196721e+00}, + {-2.864237475047239e-09, 3.078334221430809e+00}, + {-2.867847836080156e-09, 3.081960944223928e+00}, + {-2.871350224262603e-09, 3.085486549445314e+00}, + {-2.874744507732462e-09, 3.088910451211251e+00}, + {-2.878030558696270e-09, 3.092232066921130e+00}, + {-2.881208253436038e-09, 3.095450817298478e+00}, + {-2.884277472313999e-09, 3.098566126429974e+00}, + {-2.887238099774968e-09, 3.101577421802070e+00}, + {-2.890090024353816e-09, 3.104484134342861e+00}, + {-2.892833138676371e-09, 3.107285698457308e+00}, + {-2.895467339466766e-09, 3.109981552069083e+00}, + {-2.897992527547963e-09, 3.112571136655481e+00}, + {-2.900408607848946e-09, 3.115053897289195e+00}, + {-2.902715489404992e-09, 3.117429282673042e+00}, + {-2.904913085363323e-09, 3.119696745180238e+00}, + {-2.907001312986328e-09, 3.121855740892224e+00}, + {-2.908980093652563e-09, 3.123905729634218e+00}, + {-2.910849352862924e-09, 3.125846175016163e+00}, + {-2.912609020239985e-09, 3.127676544466606e+00}, + {-2.914259029534118e-09, 3.129396309273659e+00}, + {-2.915799318622574e-09, 3.131004944618667e+00}, + {-2.917229829515169e-09, 3.132501929616775e+00}, + {-2.918550508353347e-09, 3.133886747350606e+00}, + {-2.919761305414294e-09, 3.135158884909254e+00}, + {-2.920862175112829e-09, 3.136317833424958e+00}, + {-2.921853076000972e-09, 3.137363088107359e+00}, + {-2.922733970772719e-09, 3.138294148283254e+00}, + {-2.923504826262027e-09, 3.139110517429204e+00}, + {-2.924165613447473e-09, 3.139811703211207e+00}, + {-2.924716307449950e-09, 3.140397217517018e+00}, + {-2.925156887536978e-09, 3.140866576495489e+00}, + {-2.925487337120335e-09, 3.141219300588825e+00}, + {-2.925707643758784e-09, 3.141454914570261e+00}, + {-2.925817799158535e-09, 3.141572947579352e+00}, + {-2.925817799171455e-09, 3.141572933154836e+00}, + {-2.925707643798390e-09, 3.141454409272987e+00}, + {-2.925487337185779e-09, 3.141216918378770e+00}, + {-2.925156887628892e-09, 3.140860007424112e+00}, + {-2.924716307568119e-09, 3.140383227898687e+00}, + {-2.924165613591896e-09, 3.139786135867868e+00}, + {-2.923504826432903e-09, 3.139068292003385e+00}, + {-2.922733970969412e-09, 3.138229261619561e+00}, + {-2.921853076224321e-09, 3.137268614707029e+00}, + {-2.920862175361976e-09, 3.136185925964038e+00}, + {-2.919761305690083e-09, 3.134980774833275e+00}, + {-2.918550508654911e-09, 3.133652745531368e+00}, + {-2.917229829843137e-09, 3.132201427085629e+00}, + {-2.915799318976726e-09, 3.130626413363146e+00}, + {-2.914259029914435e-09, 3.128927303107136e+00}, + {-2.912609020646661e-09, 3.127103699965947e+00}, + {-2.910849353295315e-09, 3.125155212527586e+00}, + {-2.908980094111509e-09, 3.123081454351802e+00}, + {-2.907001313470937e-09, 3.120882043999591e+00}, + {-2.904913085874448e-09, 3.118556605068443e+00}, + {-2.902715489941767e-09, 3.116104766219928e+00}, + {-2.900408608411958e-09, 3.113526161214776e+00}, + {-2.897992528137022e-09, 3.110820428940251e+00}, + {-2.895467340081818e-09, 3.107987213444579e+00}, + {-2.892833139317615e-09, 3.105026163964191e+00}, + {-2.890090025020589e-09, 3.101936934956479e+00}, + {-2.887238100468092e-09, 3.098719186130021e+00}, + 
{-2.884277473032614e-09, 3.095372582472161e+00}, + {-2.881208254180937e-09, 3.091896794282404e+00}, + {-2.878030559466594e-09, 3.088291497198199e+00}, + {-2.874744508528832e-09, 3.084556372228054e+00}, + {-2.871350225084755e-09, 3.080691105776848e+00}, + {-2.867847836928063e-09, 3.076695389678615e+00}, + {-2.864237475921086e-09, 3.072568921221621e+00}, + {-2.860519277991847e-09, 3.068311403179147e+00}, + {-2.856693383129018e-09, 3.063922543837792e+00}, + {-2.852759935374575e-09, 3.059402057023109e+00}, + {-2.848719082821403e-09, 3.054749662130841e+00}, + {-2.844570977604520e-09, 3.049965084150782e+00}, + {-2.840315775898525e-09, 3.045048053697736e+00}, + {-2.835953637908582e-09, 3.039998307034967e+00}, + {-2.831484727867511e-09, 3.034815586104635e+00}, + {-2.826909214026628e-09, 3.029499638550941e+00}, + {-2.822227268651470e-09, 3.024050217748861e+00}, + {-2.817439068015245e-09, 3.018467082830179e+00}, + {-2.812544792390175e-09, 3.012749998707001e+00}, + {-2.807544626043751e-09, 3.006898736100911e+00}, + {-2.802438757228650e-09, 3.000913071564665e+00}, + {-2.797227378178760e-09, 2.994792787510961e+00}, + {-2.791910685098702e-09, 2.988537672233504e+00}, + {-2.786488878159805e-09, 2.982147519935565e+00}, + {-2.780962161489413e-09, 2.975622130750641e+00}, + {-2.775330743165298e-09, 2.968961310769028e+00}, + {-2.769594835207775e-09, 2.962164872061613e+00}, + {-2.763754653569747e-09, 2.955232632701135e+00}, + {-2.757810418131543e-09, 2.948164416789036e+00}, + {-2.751762352689432e-09, 2.940960054474719e+00}, + {-2.745610684950541e-09, 2.933619381982341e+00}, + {-2.739355646520809e-09, 2.926142241629213e+00}, + {-2.732997472899722e-09, 2.918528481852205e+00}, + {-2.726536403468318e-09, 2.910777957226018e+00}, + {-2.719972681482232e-09, 2.902890528487386e+00}, + {-2.713306554062453e-09, 2.894866062556452e+00}, + {-2.706538272184154e-09, 2.886704432555728e+00}, + {-2.699668090670078e-09, 2.878405517834426e+00}, + {-2.692696268177908e-09, 2.869969203985464e+00}, + {-2.685623067193599e-09, 2.861395382869544e+00}, + {-2.678448754018380e-09, 2.852683952631486e+00}, + {-2.671173598761847e-09, 2.843834817723832e+00}, + {-2.663797875328991e-09, 2.834847888922988e+00}, + {-2.656321861411517e-09, 2.825723083350459e+00}, + {-2.648745838477759e-09, 2.816460324492298e+00}, + {-2.641070091759922e-09, 2.807059542215146e+00}, + {-2.633294910246296e-09, 2.797520672788269e+00}, + {-2.625420586667340e-09, 2.787843658897949e+00}, + {-2.617447417487602e-09, 2.778028449668942e+00}, + {-2.609375702891616e-09, 2.768075000678399e+00}, + {-2.601205746775692e-09, 2.757983273976943e+00}, + {-2.592937856733464e-09, 2.747753238101915e+00}, + {-2.584572344046340e-09, 2.737384868096553e+00}, + {-2.576109523671634e-09, 2.726878145526201e+00}, + {-2.567549714229129e-09, 2.716233058492422e+00}, + {-2.558893237991435e-09, 2.705449601651722e+00}, + {-2.550140420869302e-09, 2.694527776227857e+00}, + {-2.541291592402089e-09, 2.683467590030445e+00}, + {-2.532347085742440e-09, 2.672269057466213e+00}, + {-2.523307237646751e-09, 2.660932199557362e+00}, + {-2.514172388459584e-09, 2.649457043952206e+00}, + {-2.504942882102813e-09, 2.637843624941622e+00}, + {-2.495619066062810e-09, 2.626091983472908e+00}, + {-2.486201291375123e-09, 2.614202167160335e+00}, + {-2.476689912614465e-09, 2.602174230302269e+00}, + {-2.467085287878098e-09, 2.590008233889805e+00}, + {-2.457387778775451e-09, 2.577704245623143e+00}, + {-2.447597750411553e-09, 2.565262339920002e+00}, + {-2.437715571376127e-09, 2.552682597931055e+00}, + {-2.427741613727123e-09, 
2.539965107548168e+00}, + {-2.417676252978335e-09, 2.527109963417675e+00}, + {-2.407519868085581e-09, 2.514117266951687e+00}, + {-2.397272841430131e-09, 2.500987126335739e+00}, + {-2.386935558807595e-09, 2.487719656543254e+00}, + {-2.376508409410024e-09, 2.474314979341178e+00}, + {-2.365991785814531e-09, 2.460773223303822e+00}, + {-2.355386083965131e-09, 2.447094523817833e+00}, + {-2.344691703161363e-09, 2.433279023095734e+00}, + {-2.333909046040126e-09, 2.419326870180582e+00}, + {-2.323038518562289e-09, 2.405238220956597e+00}, + {-2.312080529997549e-09, 2.391013238157397e+00}, + {-2.301035492907384e-09, 2.376652091371587e+00}, + {-2.289903823131822e-09, 2.362154957053137e+00}, + {-2.278685939771276e-09, 2.347522018525197e+00}, + {-2.267382265173420e-09, 2.332753465990296e+00}, + {-2.255993224914501e-09, 2.317849496533128e+00}, + {-2.244519247786155e-09, 2.302810314130351e+00}, + {-2.232960765776561e-09, 2.287636129652823e+00}, + {-2.221318214056095e-09, 2.272327160873552e+00}, + {-2.209592030960763e-09, 2.256883632472565e+00}, + {-2.197782657974034e-09, 2.241305776039511e+00}, + {-2.185890539712767e-09, 2.225593830081461e+00}, + {-2.173916123907886e-09, 2.209748040023618e+00}, + {-2.161859861389976e-09, 2.193768658216360e+00}, + {-2.149722206070124e-09, 2.177655943935795e+00}, + {-2.137503614923981e-09, 2.161410163388424e+00}, + {-2.125204547975352e-09, 2.145031589714984e+00}, + {-2.112825468276292e-09, 2.128520502989477e+00}, + {-2.100366841892917e-09, 2.111877190225612e+00}, + {-2.087829137884807e-09, 2.095101945374541e+00}, + {-2.075212828290086e-09, 2.078195069329960e+00}, + {-2.062518388104923e-09, 2.061156869925600e+00}, + {-2.049746295268559e-09, 2.043987661939897e+00}, + {-2.036897030642658e-09, 2.026687767092888e+00}, + {-2.023971077994576e-09, 2.009257514048162e+00}, + {-2.010968923979840e-09, 1.991697238413571e+00}, + {-1.997891058121344e-09, 1.974007282737320e+00}, + {-1.984737972794098e-09, 1.956187996511354e+00}, + {-1.971510163203686e-09, 1.938239736166060e+00}, + {-1.958208127370276e-09, 1.920162865072273e+00}, + {-1.944832366107339e-09, 1.901957753535934e+00}, + {-1.931383383005451e-09, 1.883624778799427e+00}, + {-1.917861684410531e-09, 1.865164325035177e+00}, + {-1.904267779407432e-09, 1.846576783346324e+00}, + {-1.890602179798714e-09, 1.827862551760622e+00}, + {-1.876865400086483e-09, 1.809022035228338e+00}, + {-1.863057957452539e-09, 1.790055645617624e+00}, + {-1.849180371740008e-09, 1.770963801711725e+00}, + {-1.835233165431475e-09, 1.751746929201178e+00}, + {-1.821216863631569e-09, 1.732405460681919e+00}, + {-1.807131994045840e-09, 1.712939835648088e+00}, + {-1.792979086962494e-09, 1.693350500488565e+00}, + {-1.778758675229683e-09, 1.673637908477153e+00}, + {-1.764471294238191e-09, 1.653802519770021e+00}, + {-1.750117481899733e-09, 1.633844801396848e+00}, + {-1.735697778626995e-09, 1.613765227254186e+00}, + {-1.721212727314574e-09, 1.593564278099856e+00}, + {-1.706662873315474e-09, 1.573242441540939e+00}, + {-1.692048764423848e-09, 1.552800212030258e+00}, + {-1.677370950852395e-09, 1.532238090855187e+00}, + {-1.662629985213192e-09, 1.511556586131055e+00}, + {-1.647826422494560e-09, 1.490756212788764e+00}, + {-1.632960820042537e-09, 1.469837492568651e+00}, + {-1.618033737538645e-09, 1.448800954008929e+00}, + {-1.603045736978760e-09, 1.427647132435469e+00}, + {-1.587997382653428e-09, 1.406376569953373e+00}, + {-1.572889241124034e-09, 1.384989815432507e+00}, + {-1.557721881203696e-09, 1.363487424499449e+00}, + {-1.542495873934815e-09, 1.341869959524515e+00}, + 
{-1.527211792568486e-09, 1.320137989611176e+00}, + {-1.511870212541253e-09, 1.298292090581491e+00}, + {-1.496471711454994e-09, 1.276332844965754e+00}, + {-1.481016869054634e-09, 1.254260841988828e+00}, + {-1.465506267206068e-09, 1.232076677556547e+00}, + {-1.449940489875303e-09, 1.209780954243628e+00}, + {-1.434320123104372e-09, 1.187374281276747e+00}, + {-1.418645754991533e-09, 1.164857274523495e+00}, + {-1.402917975667710e-09, 1.142230556475749e+00}, + {-1.387137377275425e-09, 1.119494756236361e+00}, + {-1.371304553944712e-09, 1.096650509501278e+00}, + {-1.355420101772623e-09, 1.073698458546610e+00}, + {-1.339484618799891e-09, 1.050639252211352e+00}, + {-1.323498704988051e-09, 1.027473545880543e+00}, + {-1.307462962198534e-09, 1.004202001471034e+00}, + {-1.291377994167204e-09, 9.808252874104182e-01}, + {-1.275244406484394e-09, 9.573440786237052e-01}, + {-1.259062806570190e-09, 9.337590565128454e-01}, + {-1.242833803653464e-09, 9.100709089414796e-01}, + {-1.226558008746195e-09, 8.862803302125812e-01}, + {-1.210236034623253e-09, 8.623880210538113e-01}, + {-1.193868495797618e-09, 8.383946885959868e-01}, + {-1.177456008497777e-09, 8.143010463544786e-01}, + {-1.160999190645010e-09, 7.901078142102129e-01}, + {-1.144498661828833e-09, 7.658157183877095e-01}, + {-1.127955043284965e-09, 7.414254914366063e-01}, + {-1.111368957870986e-09, 7.169378722095157e-01}, + {-1.094741030044308e-09, 6.923536058430697e-01}, + {-1.078071885836393e-09, 6.676734437331688e-01}, + {-1.061362152831423e-09, 6.428981435165511e-01}, + {-1.044612460141255e-09, 6.180284690466404e-01}, + {-1.027823438382183e-09, 5.930651903718045e-01}, + {-1.010995719652015e-09, 5.680090837138436e-01}, + {-9.941299375042378e-10, 5.428609314418970e-01}, + {-9.772267269262058e-10, 5.176215220520872e-01}, + {-9.602867243141016e-10, 4.922916501421032e-01}, + {-9.433105674499058e-10, 4.668721163885412e-01}, + {-9.262988954758817e-10, 4.413637275202624e-01}, + {-9.092523488719689e-10, 4.157672962958654e-01}, + {-8.921715694311144e-10, 3.900836414778084e-01}, + {-8.750572002347607e-10, 3.643135878065193e-01}, + {-8.579098856296589e-10, 3.384579659762392e-01}, + {-8.407302712022458e-10, 3.125176126069478e-01}, + {-8.235190037551917e-10, 2.864933702193017e-01}, + {-8.062767312831008e-10, 2.603860872080448e-01}, + {-7.890041029479477e-10, 2.341966178147619e-01}, + {-7.717017690542486e-10, 2.079258220999725e-01}, + {-7.543703810250266e-10, 1.815745659161734e-01}, + {-7.370105913774597e-10, 1.551437208801425e-01}, + {-7.196230536974697e-10, 1.286341643433767e-01}, + {-7.022084226165876e-10, 1.020467793657360e-01}, + {-6.847673537853251e-10, 7.538245468350446e-02}, + {-6.673005038502516e-10, 4.864208468284503e-02}, + {-6.498085304282128e-10, 2.182656936863137e-02}, + {-6.322920920826137e-10, -5.063185663820913e-03}, + {-6.147518482969490e-10, -3.202626926150343e-02}, + {-5.971884594516681e-10, -5.906176474160862e-02}, + {-5.796025867984469e-10, -8.616874992366363e-02}, + {-5.619948924353588e-10, -1.133462971605448e-01}, + {-5.443660392823640e-10, -1.405934733692621e-01}, + {-5.267166910556339e-10, -1.679093400638023e-01}, + {-5.090475122431451e-10, -1.952929533862739e-01}, + {-4.913591680795342e-10, -2.227433641394564e-01}, + {-4.736523245210571e-10, -2.502596178194491e-01}, + {-4.559276482202303e-10, -2.778407546490776e-01}, + {-4.381858065011618e-10, -3.054858096104932e-01}, + {-4.204274673340870e-10, -3.331938124792702e-01}, + {-4.026532993105397e-10, -3.609637878577768e-01}, + {-3.848639716178888e-10, -3.887947552098022e-01}, + 
{-3.670601540142443e-10, -4.166857288948674e-01}, + {-3.492425168032583e-10, -4.446357182029681e-01}, + {-3.314117308088734e-10, -4.726437273896633e-01}, + {-3.135684673501752e-10, -5.007087557112619e-01}, + {-2.957133982159296e-10, -5.288297974607742e-01}, + {-2.778471956393828e-10, -5.570058420037128e-01}, + {-2.599705322729564e-10, -5.852358738143247e-01}, + {-2.420840811628366e-10, -6.135188725122560e-01}, + {-2.241885157240923e-10, -6.418538128986450e-01}, + {-2.062845097142585e-10, -6.702396649949099e-01}, + {-1.883727372093546e-10, -6.986753940779493e-01}, + {-1.704538725773087e-10, -7.271599607197149e-01}, + {-1.525285904532877e-10, -7.556923208240308e-01}, + {-1.345975657140748e-10, -7.842714256651911e-01}, + {-1.166614734526054e-10, -8.128962219265712e-01}, + {-9.872098895260891e-11, -8.415656517393372e-01}, + {-8.077678766314517e-11, -8.702786527215916e-01}, + {-6.282954517324612e-11, -8.990341580176152e-01}, + {-4.487993718655790e-11, -9.278310963373758e-01}, + {-2.692863949561210e-11, -9.566683919968972e-01}, + {-8.976327956520795e-12, -9.855449649582175e-01}, + {8.976321536169872e-12, -1.014459730869357e+00}, + {2.692863307547294e-11, -1.043411601105914e+00}, + {4.487993076694813e-11, -1.072399482811314e+00}, + {6.282953875437751e-11, -1.101422278938424e+00}, + {8.077678124517653e-11, -1.130478888291020e+00}, + {9.872098253591082e-11, -1.159568205565684e+00}, + {1.166614670373367e-10, -1.188689121393192e+00}, + {1.345975593005002e-10, -1.217840522381901e+00}, + {1.525285840416718e-10, -1.247021291159495e+00}, + {1.704538661678104e-10, -1.276230306415868e+00}, + {1.883727308022916e-10, -1.305466442946703e+00}, + {2.062845033098954e-10, -1.334728571696106e+00}, + {2.241885093225349e-10, -1.364015559800721e+00}, + {2.420840747645085e-10, -1.393326270633325e+00}, + {2.599705258779635e-10, -1.422659563847049e+00}, + {2.778471892479898e-10, -1.452014295419243e+00}, + {2.957133918284542e-10, -1.481389317696831e+00}, + {3.135684609667761e-10, -1.510783479440191e+00}, + {3.314117244297624e-10, -1.540195625869043e+00}, + {3.492425104288060e-10, -1.569624598707558e+00}, + {3.670601476445565e-10, -1.599069236228850e+00}, + {3.848639652533361e-10, -1.628528373302631e+00}, + {4.026532929512281e-10, -1.658000841439269e+00}, + {4.204274609803869e-10, -1.687485468837799e+00}, + {4.381858001531792e-10, -1.716981080430596e+00}, + {4.559276418782829e-10, -1.746486497931567e+00}, + {4.736523181853565e-10, -1.776000539882225e+00}, + {4.913591617503452e-10, -1.805522021699094e+00}, + {5.090475059206794e-10, -1.835049755721194e+00}, + {5.267166847401562e-10, -1.864582551257262e+00}, + {5.443660329740862e-10, -1.894119214633676e+00}, + {5.619948861345454e-10, -1.923658549242818e+00}, + {5.796025805053097e-10, -1.953199355591180e+00}, + {5.971884531664190e-10, -1.982740431347091e+00}, + {6.147518420199055e-10, -2.012280571390674e+00}, + {6.322920858139346e-10, -2.041818567861395e+00}, + {6.498085241682158e-10, -2.071353210208005e+00}, + {6.673004975990425e-10, -2.100883285238127e+00}, + {6.847673475432746e-10, -2.130407577166309e+00}, + {7.022084163838545e-10, -2.159924867664933e+00}, + {7.196230474743716e-10, -2.189433935913779e+00}, + {7.370105851640495e-10, -2.218933558650552e+00}, + {7.543703748217808e-10, -2.248422510220072e+00}, + {7.717017628611672e-10, -2.277899562625407e+00}, + {7.890040967654542e-10, -2.307363485579104e+00}, + {8.062767251113011e-10, -2.336813046552684e+00}, + {8.235189975944034e-10, -2.366247010829556e+00}, + {8.407302650525749e-10, -2.395664141553858e+00}, + 
{8.579098794915287e-10, -2.425063199784153e+00}, + {8.750571941082773e-10, -2.454442944543319e+00}, + {8.921715633164894e-10, -2.483802132872044e+00}, + {9.092523427695200e-10, -2.513139519878584e+00}, + {9.262988893857148e-10, -2.542453858792682e+00}, + {9.433105613723914e-10, -2.571743901017465e+00}, + {9.602867182493987e-10, -2.601008396180870e+00}, + {9.772267208744730e-10, -2.630246092190425e+00}, + {9.941299314658458e-10, -2.659455735283526e+00}, + {1.010995713627070e-09, -2.688636070081818e+00}, + {1.027823432371055e-09, -2.717785839644439e+00}, + {1.044612454143997e-09, -2.746903785521352e+00}, + {1.061362146848353e-09, -2.775988647805256e+00}, + {1.078071879867828e-09, -2.805039165187255e+00}, + {1.094741024090249e-09, -2.834054075009077e+00}, + {1.111368951931856e-09, -2.863032113318052e+00}, + {1.127955037360817e-09, -2.891972014920939e+00}, + {1.144498655920037e-09, -2.920872513436805e+00}, + {1.160999184751779e-09, -2.949732341353290e+00}, + {1.177456002620215e-09, -2.978550230079517e+00}, + {1.193868489936097e-09, -3.007324910002949e+00}, + {1.210236028777826e-09, -3.036055110540183e+00}, + {1.226558002917232e-09, -3.064739560196251e+00}, + {1.242833797841123e-09, -3.093376986616735e+00}, + {1.259062800774685e-09, -3.121966116643377e+00}, + {1.275244400705935e-09, -3.150505676371791e+00}, + {1.291377988406056e-09, -3.178994391202159e+00}, + {1.307462956454857e-09, -3.207430985899192e+00}, + {1.323498699262108e-09, -3.235814184645077e+00}, + {1.339484613091842e-09, -3.264142711097884e+00}, + {1.355420096082785e-09, -3.292415288443373e+00}, + {1.371304548273191e-09, -3.320630639454825e+00}, + {1.387137371622433e-09, -3.348787486547389e+00}, + {1.402917970033511e-09, -3.376884551834256e+00}, + {1.418645749376393e-09, -3.404920557184582e+00}, + {1.434320117508396e-09, -3.432894224276359e+00}, + {1.449940484298756e-09, -3.460804274656981e+00}, + {1.465506261649108e-09, -3.488649429796768e+00}, + {1.481016863517580e-09, -3.516428411149154e+00}, + {1.496471705937951e-09, -3.544139940202303e+00}, + {1.511870207044433e-09, -3.571782738540999e+00}, + {1.527211787092206e-09, -3.599355527901174e+00}, + {1.542495868479076e-09, -3.626857030226671e+00}, + {1.557721875768920e-09, -3.654285967729458e+00}, + {1.572889235710329e-09, -3.681641062941412e+00}, + {1.587997377261005e-09, -3.708921038776707e+00}, + {1.603045731607830e-09, -3.736124618586623e+00}, + {1.618033732189314e-09, -3.763250526218862e+00}, + {1.632960814715177e-09, -3.790297486071938e+00}, + {1.647826417189275e-09, -3.817264223155802e+00}, + {1.662629979930247e-09, -3.844149463148589e+00}, + {1.677370945591844e-09, -3.870951932452996e+00}, + {1.692048759186008e-09, -3.897670358257890e+00}, + {1.706662868100504e-09, -3.924303468590212e+00}, + {1.721212722122685e-09, -3.950849992378278e+00}, + {1.735697773458400e-09, -3.977308659506432e+00}, + {1.750117476754591e-09, -4.003678200876669e+00}, + {1.764471289116712e-09, -4.029957348461003e+00}, + {1.778758670132079e-09, -4.056144835364877e+00}, + {1.792979081888926e-09, -4.082239395882965e+00}, + {1.807131988996465e-09, -4.108239765556996e+00}, + {1.821216858606652e-09, -4.134144681236933e+00}, + {1.835233160431175e-09, -4.159952881133585e+00}, + {1.849180366764537e-09, -4.185663104882633e+00}, + {1.863057952502055e-09, -4.211274093599509e+00}, + {1.876865395161145e-09, -4.236784589940537e+00}, + {1.890602174898734e-09, -4.262193338157148e+00}, + {1.904267774533022e-09, -4.287499084158302e+00}, + {1.917861679562008e-09, -4.312700575567174e+00}, + {1.931383378182392e-09, 
-4.337796561778708e+00}, + {1.944832361310856e-09, -4.362785794021793e+00}, + {1.958208122599839e-09, -4.387667025411434e+00}, + {1.971510158459931e-09, -4.412439011013396e+00}, + {1.984737968076495e-09, -4.437100507898339e+00}, + {1.997891053431005e-09, -4.461650275204912e+00}, + {2.010968919316289e-09, -4.486087074191693e+00}, + {2.023971073358447e-09, -4.510409668301784e+00}, + {2.036897026033634e-09, -4.534616823217992e+00}, + {2.049746290686799e-09, -4.558707306921882e+00}, + {2.062518383551274e-09, -4.582679889754607e+00}, + {2.075212823764071e-09, -4.606533344469879e+00}, + {2.087829133387063e-09, -4.630266446298172e+00}, + {2.100366837422912e-09, -4.653877973001258e+00}, + {2.112825463835087e-09, -4.677366704934605e+00}, + {2.125204543562522e-09, -4.700731425099899e+00}, + {2.137503610540056e-09, -4.723970919208608e+00}, + {2.149722201714786e-09, -4.747083975738060e+00}, + {2.161859857063438e-09, -4.770069385989595e+00}, + {2.173916119610994e-09, -4.792925944149308e+00}, + {2.185890535445098e-09, -4.815652447340950e+00}, + {2.197782653735957e-09, -4.838247695689436e+00}, + {2.209592026751962e-09, -4.860710492376411e+00}, + {2.221318209877576e-09, -4.883039643700314e+00}, + {2.232960761627846e-09, -4.905233959130168e+00}, + {2.244519243667616e-09, -4.927292251368517e+00}, + {2.255993220826402e-09, -4.949213336406265e+00}, + {2.267382261115285e-09, -4.970996033581527e+00}, + {2.278685935744269e-09, -4.992639165639563e+00}, + {2.289903819135414e-09, -5.014141558784778e+00}, + {2.301035488942000e-09, -5.035502042744443e+00}, + {2.312080526062763e-09, -5.056719450823151e+00}, + {2.323038514659161e-09, -5.077792619963239e+00}, + {2.333909042168180e-09, -5.098720390796817e+00}, + {2.344691699320969e-09, -5.119501607709159e+00}, + {2.355386080156553e-09, -5.140135118892792e+00}, + {2.365991782037187e-09, -5.160619776404897e+00}, + {2.376508405665132e-09, -5.180954436227641e+00}, + {2.386935555094626e-09, -5.201137958319343e+00}, + {2.397272837749508e-09, -5.221169206676762e+00}, + {2.407519864436774e-09, -5.241047049389645e+00}, + {2.417676249362563e-09, -5.260770358700167e+00}, + {2.427741610143750e-09, -5.280338011053974e+00}, + {2.437715567825576e-09, -5.299748887163106e+00}, + {2.447597746894037e-09, -5.319001872058887e+00}, + {2.457387775290440e-09, -5.338095855149190e+00}, + {2.467085284426756e-09, -5.357029730277389e+00}, + {2.476689909196263e-09, -5.375802395772283e+00}, + {2.486201287990485e-09, -5.394412754510426e+00}, + {2.495619062711154e-09, -5.412859713968929e+00}, + {2.504942878785408e-09, -5.431142186284682e+00}, + {2.514172385175743e-09, -5.449259088303476e+00}, + {2.523307234396791e-09, -5.467209341642627e+00}, + {2.532347082526785e-09, -5.484991872743321e+00}, + {2.541291589219998e-09, -5.502605612925014e+00}, + {2.550140417722072e-09, -5.520049498445633e+00}, + {2.558893234878378e-09, -5.537322470548212e+00}, + {2.567549711150773e-09, -5.554423475524196e+00}, + {2.576109520627371e-09, -5.571351464763084e+00}, + {2.584572341037361e-09, -5.588105394812198e+00}, + {2.592937853759161e-09, -5.604684227423386e+00}, + {2.601205743836355e-09, -5.621086929615246e+00}, + {2.609375699987564e-09, -5.637312473723475e+00}, + {2.617447414618146e-09, -5.653359837454964e+00}, + {2.625420583833750e-09, -5.669228003945694e+00}, + {2.633294907447937e-09, -5.684915961806963e+00}, + {2.641070088997271e-09, -5.700422705186584e+00}, + {2.648745835750128e-09, -5.715747233817712e+00}, + {2.656321858720176e-09, -5.730888553077074e+00}, + {2.663797872673252e-09, -5.745845674030161e+00}, + 
{2.671173596142054e-09, -5.760617613492118e+00}, + {2.678448751434797e-09, -5.775203394076705e+00}, + {2.685623064645538e-09, -5.789602044248679e+00}, + {2.692696265666640e-09, -5.803812598380606e+00}, + {2.699668088194915e-09, -5.817834096797069e+00}, + {2.706538269745573e-09, -5.831665585834668e+00}, + {2.713306551659817e-09, -5.845306117889361e+00}, + {2.719972679116734e-09, -5.858754751472542e+00}, + {2.726536401139295e-09, -5.872010551255358e+00}, + {2.732997470607439e-09, -5.885072588127400e+00}, + {2.739355644265558e-09, -5.897939939244211e+00}, + {2.745610682731633e-09, -5.910611688078208e+00}, + {2.751762350508137e-09, -5.923086924473290e+00}, + {2.757810415987146e-09, -5.935364744687794e+00}, + {2.763754651462700e-09, -5.947444251452243e+00}, + {2.769594833137415e-09, -5.959324554015538e+00}, + {2.775330741132843e-09, -5.971004768198829e+00}, + {2.780962159494174e-09, -5.982484016437981e+00}, + {2.786488876202047e-09, -5.993761427840588e+00}, + {2.791910683178690e-09, -6.004836138231525e+00}, + {2.797227376295779e-09, -6.015707290202086e+00}, + {2.802438755383971e-09, -6.026374033162623e+00}, + {2.807544624236659e-09, -6.036835523383457e+00}, + {2.812544790621093e-09, -6.047090924050914e+00}, + {2.817439066283459e-09, -6.057139405311101e+00}, + {2.822227266958278e-09, -6.066980144322601e+00}, + {2.826909212371261e-09, -6.076612325295799e+00}, + {2.831484726250221e-09, -6.086035139548830e+00}, + {2.835953636329660e-09, -6.095247785550617e+00}, + {2.840315774357203e-09, -6.104249468967751e+00}, + {2.844570976102082e-09, -6.113039402715685e+00}, + {2.848719081357095e-09, -6.121616806996519e+00}, + {2.852759933948860e-09, -6.129980909353977e+00}, + {2.856693381741114e-09, -6.138130944714082e+00}, + {2.860519276643053e-09, -6.146066155436312e+00}, + {2.864237474610633e-09, -6.153785791350256e+00}, + {2.867847835656203e-09, -6.161289109809551e+00}, + {2.871350223851726e-09, -6.168575375732642e+00}, + {2.874744507333867e-09, -6.175643861647406e+00}, + {2.878030558310989e-09, -6.182493847739853e+00}, + {2.881208253063899e-09, -6.189124621889823e+00}, + {2.884277471954592e-09, -6.195535479723423e+00}, + {2.887238099428306e-09, -6.201725724651554e+00}, + {2.890090024020323e-09, -6.207694667918394e+00}, + {2.892833138356060e-09, -6.213441628635915e+00}, + {2.895467339159240e-09, -6.218965933835304e+00}, + {2.897992527253659e-09, -6.224266918505075e+00}, + {2.900408607567016e-09, -6.229343925633495e+00}, + {2.902715489136496e-09, -6.234196306254763e+00}, + {2.904913085108075e-09, -6.238823419482017e+00}, + {2.907001312743911e-09, -6.243224632557377e+00}, + {2.908980093422997e-09, -6.247399320887848e+00}, + {2.910849352646620e-09, -6.251346868091392e+00}, + {2.912609020036956e-09, -6.255066666028537e+00}, + {2.914259029343965e-09, -6.258558114851525e+00}, + {2.915799318445710e-09, -6.261820623039620e+00}, + {2.917229829350759e-09, -6.264853607438842e+00}, + {2.918550508202463e-09, -6.267656493305673e+00}, + {2.919761305276718e-09, -6.270228714337005e+00}, + {2.920862174988150e-09, -6.272569712717951e+00}, + {2.921853075889193e-09, -6.274678939154603e+00}, + {2.922733970674264e-09, -6.276555852917634e+00}, + {2.923504826176907e-09, -6.278199921870962e+00}, + {2.924165613375264e-09, -6.279610622518139e+00}, + {2.924716307391075e-09, -6.280787440034993e+00}, + {2.925156887490598e-09, -6.281729868306345e+00}, + {2.925487337087508e-09, -6.282437409966992e+00}, + {2.925707643739298e-09, -6.282909576428774e+00}, + {2.925817799151970e-09, -6.283145887925411e+00}, }; #endif diff --git 
a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_sse3_intrinsics.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_sse3_intrinsics.h
index f48e84aa1..6f5b25673 100644
--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_sse3_intrinsics.h
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_sse3_intrinsics.h
@@ -30,33 +30,35 @@
 static inline __m128
 _mm_complexmul_ps(__m128 x, __m128 y)
 {
-  __m128 yl, yh, tmp1, tmp2;
-  yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
-  yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
-  tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
-  x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br
-  tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
-  return _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
+    __m128 yl, yh, tmp1, tmp2;
+    yl = _mm_moveldup_ps(y);           // Load yl with cr,cr,dr,dr
+    yh = _mm_movehdup_ps(y);           // Load yh with ci,ci,di,di
+    tmp1 = _mm_mul_ps(x, yl);          // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+    x = _mm_shuffle_ps(x, x, 0xB1);    // Re-arrange x to be ai,ar,bi,br
+    tmp2 = _mm_mul_ps(x, yh);          // tmp2 = ai*ci,ar*ci,bi*di,br*di
+    return _mm_addsub_ps(tmp1, tmp2);  // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
 }

 static inline __m128
 _mm_complexconjugatemul_ps(__m128 x, __m128 y)
 {
-  const __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f);
-  y = _mm_xor_ps(y, conjugator); // conjugate y
-  return _mm_complexmul_ps(x, y);
+    const __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f);
+    y = _mm_xor_ps(y, conjugator);  // conjugate y
+    return _mm_complexmul_ps(x, y);
 }

 static inline __m128
-_mm_magnitudesquared_ps_sse3(__m128 cplxValue1, __m128 cplxValue2){
-  cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
-  cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
-  return _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
+_mm_magnitudesquared_ps_sse3(__m128 cplxValue1, __m128 cplxValue2)
+{
+    cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1);  // Square the values
+    cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2);  // Square the Values
+    return _mm_hadd_ps(cplxValue1, cplxValue2);       // Add the I2 and Q2 values
 }

 static inline __m128
-_mm_magnitude_ps_sse3(__m128 cplxValue1, __m128 cplxValue2){
-  return _mm_sqrt_ps(_mm_magnitudesquared_ps_sse3(cplxValue1, cplxValue2));
+_mm_magnitude_ps_sse3(__m128 cplxValue1, __m128 cplxValue2)
+{
+    return _mm_sqrt_ps(_mm_magnitudesquared_ps_sse3(cplxValue1, cplxValue2));
 }

 #endif /* INCLUDE_VOLK_VOLK_SSE3_INTRINSICS_H_ */
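For context, _mm_complexmul_ps computes two complex products per __m128 register, with operands stored as interleaved (real, imag) float pairs. A minimal usage sketch, not taken from the patched sources: the helper name complexmul_buffers is hypothetical, and num_points (the number of complex samples) is assumed to be even.

    #include <volk_gnsssdr/volk_gnsssdr_sse3_intrinsics.h>

    /* Multiply two buffers of interleaved complex floats,
       two complex values (four floats) per iteration. */
    static void complexmul_buffers(const float* a, const float* b,
                                   float* result, unsigned int num_points)
    {
        unsigned int i;
        for (i = 0; i < 2 * num_points; i += 4)
            {
                __m128 x = _mm_loadu_ps(a + i);  /* ar, ai, br, bi */
                __m128 y = _mm_loadu_ps(b + i);  /* cr, ci, dr, di */
                _mm_storeu_ps(result + i, _mm_complexmul_ps(x, y));
            }
    }

A production kernel would also handle the scalar tail when num_points is odd and prefer _mm_load_ps/_mm_store_ps on aligned buffers.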
diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_sse_intrinsics.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_sse_intrinsics.h
index 6136efba3..9de170708 100644
--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_sse_intrinsics.h
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/include/volk_gnsssdr/volk_gnsssdr_sse_intrinsics.h
@@ -27,20 +27,22 @@
 #include <xmmintrin.h>

 static inline __m128
-_mm_magnitudesquared_ps(__m128 cplxValue1, __m128 cplxValue2){
-  __m128 iValue, qValue;
-  // Arrange in i1i2i3i4 format
-  iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
-  // Arrange in q1q2q3q4 format
-  qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
-  iValue = _mm_mul_ps(iValue, iValue); // Square the I values
-  qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values
-  return _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values
+_mm_magnitudesquared_ps(__m128 cplxValue1, __m128 cplxValue2)
+{
+    __m128 iValue, qValue;
+    // Arrange in i1i2i3i4 format
+    iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
+    // Arrange in q1q2q3q4 format
+    qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
+    iValue = _mm_mul_ps(iValue, iValue);  // Square the I values
+    qValue = _mm_mul_ps(qValue, qValue);  // Square the Q Values
+    return _mm_add_ps(iValue, qValue);    // Add the I2 and Q2 values
 }

 static inline __m128
-_mm_magnitude_ps(__m128 cplxValue1, __m128 cplxValue2){
-  return _mm_sqrt_ps(_mm_magnitudesquared_ps(cplxValue1, cplxValue2));
+_mm_magnitude_ps(__m128 cplxValue1, __m128 cplxValue2)
+{
+    return _mm_sqrt_ps(_mm_magnitudesquared_ps(cplxValue1, cplxValue2));
 }

 #endif /* INCLUDED_VOLK_VOLK_SSE_INTRINSICS_H_ */
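Likewise, _mm_magnitudesquared_ps consumes four complex floats spread across two registers and returns their four squared magnitudes in one register. A sketch under the same caveats: the helper name is hypothetical, and num_points is assumed to be a multiple of four.

    #include <volk_gnsssdr/volk_gnsssdr_sse_intrinsics.h>

    /* |z|^2 over a buffer of interleaved complex floats,
       four outputs per iteration. */
    static void magnitude_squared_buffer(const float* in, float* out,
                                         unsigned int num_points)
    {
        unsigned int i;
        for (i = 0; i < num_points; i += 4)
            {
                __m128 c1 = _mm_loadu_ps(in + 2 * i);      /* z0, z1 */
                __m128 c2 = _mm_loadu_ps(in + 2 * i + 4);  /* z2, z3 */
                _mm_storeu_ps(out + i, _mm_magnitudesquared_ps(c1, c2));
            }
    }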
diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16i_resamplerxnpuppet_16i.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16i_resamplerxnpuppet_16i.h
index 3c1c0f817..ffce85d32 100644
--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16i_resamplerxnpuppet_16i.h
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16i_resamplerxnpuppet_16i.h
@@ -45,55 +45,55 @@ static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_generic(int16_t* result, const int16_t* local_code, unsigned int num_points) { int code_length_chips = 2046; - float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points); int num_out_vectors = 3; unsigned int n; float rem_code_phase_chips = -0.234; - float shifts_chips[3] = { -0.1, 0.0, 0.1 }; - int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); + float shifts_chips[3] = {-0.1, 0.0, 0.1}; + int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_out_vectors; n++) - { - result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); - } + for (n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); + } volk_gnsssdr_16i_xn_resampler_16i_xn_generic(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); memcpy((int16_t*)result, (int16_t*)result_aux[0], sizeof(int16_t) * num_points); - for(n = 0; n < num_out_vectors; n++) - { - volk_gnsssdr_free(result_aux[n]); - } + for (n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } volk_gnsssdr_free(result_aux); } #endif /* LV_HAVE_GENERIC */ - + #ifdef LV_HAVE_SSE3 static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_a_sse3(int16_t* result, const int16_t* local_code, unsigned int num_points) { int code_length_chips = 2046; - float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; - float shifts_chips[3] = { -0.1, 0.0, 0.1 }; - int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); + float shifts_chips[3] = {-0.1, 0.0, 0.1}; + int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_out_vectors; n++) - { - result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); - } + for (n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); + } volk_gnsssdr_16i_xn_resampler_16i_xn_a_sse3(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); memcpy((int16_t*)result, (int16_t*)result_aux[0], sizeof(int16_t) * num_points); - for(n = 0; n < num_out_vectors; n++) - { - volk_gnsssdr_free(result_aux[n]); - } + for (n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } volk_gnsssdr_free(result_aux); }
@@ -103,26 +103,26 @@ static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_u_sse3(int16_t* result, const int16_t* local_code, unsigned int num_points) { int code_length_chips = 2046; - float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; - float shifts_chips[3] = { -0.1, 0.0, 0.1 }; - int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); + float shifts_chips[3] = {-0.1, 0.0, 0.1}; + int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_out_vectors; n++) - { - result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); - } + for (n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); + } volk_gnsssdr_16i_xn_resampler_16i_xn_u_sse3(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); memcpy((int16_t*)result, (int16_t*)result_aux[0], sizeof(int16_t) * num_points); - for(n = 0; n < num_out_vectors; n++) - { - volk_gnsssdr_free(result_aux[n]); - } + for (n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } volk_gnsssdr_free(result_aux); }
@@ -133,26 +133,26 @@ static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_u_sse4_1(int16_t* result, const int16_t* local_code, unsigned int num_points) { int code_length_chips = 2046; - float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; - float shifts_chips[3] = { -0.1, 0.0, 0.1 }; - int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) *
num_out_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_out_vectors; n++) - { - result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); - } + for (n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); + } volk_gnsssdr_16i_xn_resampler_16i_xn_u_sse4_1(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); memcpy((int16_t*)result, (int16_t*)result_aux[0], sizeof(int16_t) * num_points); - for(n = 0; n < num_out_vectors; n++) - { - volk_gnsssdr_free(result_aux[n]); - } + for (n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } volk_gnsssdr_free(result_aux); } @@ -163,26 +163,26 @@ static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_u_sse4_1(int16_t* resu static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_a_sse4_1(int16_t* result, const int16_t* local_code, unsigned int num_points) { int code_length_chips = 2046; - float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; - float shifts_chips[3] = { -0.1, 0.0, 0.1 }; - int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); + float shifts_chips[3] = {-0.1, 0.0, 0.1}; + int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_out_vectors; n++) - { - result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); - } + for (n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); + } volk_gnsssdr_16i_xn_resampler_16i_xn_a_sse4_1(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); memcpy((int16_t*)result, (int16_t*)result_aux[0], sizeof(int16_t) * num_points); - for(n = 0; n < num_out_vectors; n++) - { - volk_gnsssdr_free(result_aux[n]); - } + for (n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } volk_gnsssdr_free(result_aux); } @@ -193,26 +193,26 @@ static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_a_sse4_1(int16_t* resu static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_u_avx(int16_t* result, const int16_t* local_code, unsigned int num_points) { int code_length_chips = 2046; - float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; - float shifts_chips[3] = { -0.1, 0.0, 0.1 }; - int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); + float shifts_chips[3] = {-0.1, 0.0, 0.1}; + int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_out_vectors; n++) - { - result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); - } + for (n = 0; n < num_out_vectors; n++) + { + result_aux[n] = 
(int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); + } volk_gnsssdr_16i_xn_resampler_16i_xn_u_avx(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); memcpy((int16_t*)result, (int16_t*)result_aux[0], sizeof(int16_t) * num_points); - for(n = 0; n < num_out_vectors; n++) - { - volk_gnsssdr_free(result_aux[n]); - } + for (n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } volk_gnsssdr_free(result_aux); } @@ -223,26 +223,26 @@ static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_u_avx(int16_t* result, static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_a_avx(int16_t* result, const int16_t* local_code, unsigned int num_points) { int code_length_chips = 2046; - float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; - float shifts_chips[3] = { -0.1, 0.0, 0.1 }; - int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); + float shifts_chips[3] = {-0.1, 0.0, 0.1}; + int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_out_vectors; n++) - { - result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); - } + for (n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); + } volk_gnsssdr_16i_xn_resampler_16i_xn_a_avx(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); memcpy((int16_t*)result, (int16_t*)result_aux[0], sizeof(int16_t) * num_points); - for(n = 0; n < num_out_vectors; n++) - { - volk_gnsssdr_free(result_aux[n]); - } + for (n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } volk_gnsssdr_free(result_aux); } @@ -253,30 +253,29 @@ static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_a_avx(int16_t* result, static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_neon(int16_t* result, const int16_t* local_code, unsigned int num_points) { int code_length_chips = 2046; - float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; - float shifts_chips[3] = { -0.1, 0.0, 0.1 }; - int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); + float shifts_chips[3] = {-0.1, 0.0, 0.1}; + int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_out_vectors; n++) - { - result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); - } + for (n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); + } volk_gnsssdr_16i_xn_resampler_16i_xn_neon(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); 
memcpy((int16_t*)result, (int16_t*)result_aux[0], sizeof(int16_t) * num_points); - for(n = 0; n < num_out_vectors; n++) - { - volk_gnsssdr_free(result_aux[n]); - } + for (n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } volk_gnsssdr_free(result_aux); } #endif -#endif // INCLUDED_volk_gnsssdr_16i_resamplerpuppet_16i_H - +#endif // INCLUDED_volk_gnsssdr_16i_resamplerpuppet_16i_H diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16i_xn_resampler_16i_xn.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16i_xn_resampler_16i_xn.h index 0d09df273..3628ccf8c 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16i_xn_resampler_16i_xn.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16i_xn_resampler_16i_xn.h @@ -107,7 +107,8 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_a_sse4_1(int16_t** resul const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips); const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips); - __VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; + __VOLK_ATTR_ALIGNED(16) + int local_code_chip_index[4]; int local_code_chip_index_; const __m128i zeros = _mm_setzero_si128(); @@ -121,7 +122,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_a_sse4_1(int16_t** resul shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]); aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); __m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f); - for(n = 0; n < quarterPoints; n++) + for (n = 0; n < quarterPoints; n++) { aux = _mm_mul_ps(code_phase_step_chips_reg, indexn); aux = _mm_add_ps(aux, aux2); @@ -139,13 +140,13 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_a_sse4_1(int16_t** resul aux_i = _mm_and_si128(code_length_chips_reg_i, negatives); local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i); _mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg); - for(k = 0; k < 4; ++k) + for (k = 0; k < 4; ++k) { _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]]; } indexn = _mm_add_ps(indexn, fours); } - for(n = quarterPoints * 4; n < num_points; n++) + for (n = quarterPoints * 4; n < num_points; n++) { // resample code for current tap local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); @@ -157,7 +158,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_a_sse4_1(int16_t** resul } } -#endif +#endif #ifdef LV_HAVE_SSE4_1 @@ -173,7 +174,8 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_u_sse4_1(int16_t** resul const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips); const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips); - __VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; + __VOLK_ATTR_ALIGNED(16) + int local_code_chip_index[4]; int local_code_chip_index_; const __m128i zeros = _mm_setzero_si128(); @@ -187,7 +189,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_u_sse4_1(int16_t** resul shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]); aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); __m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f); - for(n = 0; n < quarterPoints; n++) + for (n = 0; n < quarterPoints; n++) { aux = 
_mm_mul_ps(code_phase_step_chips_reg, indexn); aux = _mm_add_ps(aux, aux2); @@ -205,13 +207,13 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_u_sse4_1(int16_t** resul aux_i = _mm_and_si128(code_length_chips_reg_i, negatives); local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i); _mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg); - for(k = 0; k < 4; ++k) + for (k = 0; k < 4; ++k) { _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]]; } indexn = _mm_add_ps(indexn, fours); } - for(n = quarterPoints * 4; n < num_points; n++) + for (n = quarterPoints * 4; n < num_points; n++) { // resample code for current tap local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); @@ -240,7 +242,8 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_a_sse3(int16_t** result, const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips); const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips); - __VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; + __VOLK_ATTR_ALIGNED(16) + int local_code_chip_index[4]; int local_code_chip_index_; const __m128i zeros = _mm_setzero_si128(); @@ -254,7 +257,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_a_sse3(int16_t** result, shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]); aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); __m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f); - for(n = 0; n < quarterPoints; n++) + for (n = 0; n < quarterPoints; n++) { aux = _mm_mul_ps(code_phase_step_chips_reg, indexn); aux = _mm_add_ps(aux, aux2); @@ -275,13 +278,13 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_a_sse3(int16_t** result, aux_i = _mm_and_si128(code_length_chips_reg_i, negatives); local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i); _mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg); - for(k = 0; k < 4; ++k) + for (k = 0; k < 4; ++k) { _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]]; } indexn = _mm_add_ps(indexn, fours); } - for(n = quarterPoints * 4; n < num_points; n++) + for (n = quarterPoints * 4; n < num_points; n++) { // resample code for current tap local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); @@ -310,7 +313,8 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_u_sse3(int16_t** result, const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips); const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips); - __VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; + __VOLK_ATTR_ALIGNED(16) + int local_code_chip_index[4]; int local_code_chip_index_; const __m128i zeros = _mm_setzero_si128(); @@ -324,7 +328,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_u_sse3(int16_t** result, shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]); aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); __m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f); - for(n = 0; n < quarterPoints; n++) + for (n = 0; n < quarterPoints; n++) { aux = _mm_mul_ps(code_phase_step_chips_reg, indexn); aux = _mm_add_ps(aux, aux2); @@ -345,13 +349,13 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_u_sse3(int16_t** result, aux_i = _mm_and_si128(code_length_chips_reg_i, negatives); 
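Throughout these resampler kernels, wrapping a possibly negative chip index back into [0, code_length_chips) is done without branches: a less-than-zero compare yields an all-ones mask in the negative lanes, the mask is ANDed with the code length, and the masked value is added to the index. A scalar sketch of the same trick, assuming the index is never more than one code length below zero (which holds after the preceding fmod step):

/* Branch-free wrap of an index into [0, code_length).
 * (index < 0) evaluates to 0 or 1 in C; the SIMD paths get the same effect
 * with a compare mask, e.g.
 *     negatives = _mm_cmplt_epi32(idx, zeros);
 *     idx = _mm_add_epi32(idx, _mm_and_si128(code_length_reg, negatives));
 */
static int wrap_index(int index, int code_length)
{
    return index + code_length * (index < 0);
}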
local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i); _mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg); - for(k = 0; k < 4; ++k) + for (k = 0; k < 4; ++k) { _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]]; } indexn = _mm_add_ps(indexn, fours); } - for(n = quarterPoints * 4; n < num_points; n++) + for (n = quarterPoints * 4; n < num_points; n++) { // resample code for current tap local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); @@ -379,7 +383,8 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_a_avx(int16_t** result, const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips); const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips); - __VOLK_ATTR_ALIGNED(32) int local_code_chip_index[8]; + __VOLK_ATTR_ALIGNED(32) + int local_code_chip_index[8]; int local_code_chip_index_; const __m256 zeros = _mm256_setzero_ps(); @@ -394,7 +399,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_a_avx(int16_t** result, shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]); aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); indexn = n0; - for(n = 0; n < avx_iters; n++) + for (n = 0; n < avx_iters; n++) { __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0); __VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3); @@ -412,13 +417,13 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_a_avx(int16_t** result, // no negatives c = _mm256_cvtepi32_ps(local_code_chip_index_reg); - negatives = _mm256_cmp_ps(c, zeros, 0x01 ); + negatives = _mm256_cmp_ps(c, zeros, 0x01); aux3 = _mm256_and_ps(code_length_chips_reg_f, negatives); aux = _mm256_add_ps(c, aux3); local_code_chip_index_reg = _mm256_cvttps_epi32(aux); _mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg); - for(k = 0; k < 8; ++k) + for (k = 0; k < 8; ++k) { _result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]]; } @@ -428,7 +433,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_a_avx(int16_t** result, _mm256_zeroupper(); for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++) { - for(n = avx_iters * 8; n < num_points; n++) + for (n = avx_iters * 8; n < num_points; n++) { // resample code for current tap local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); @@ -456,7 +461,8 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_u_avx(int16_t** result, const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips); const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips); - __VOLK_ATTR_ALIGNED(32) int local_code_chip_index[8]; + __VOLK_ATTR_ALIGNED(32) + int local_code_chip_index[8]; int local_code_chip_index_; const __m256 zeros = _mm256_setzero_ps(); @@ -471,7 +477,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_u_avx(int16_t** result, shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]); aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); indexn = n0; - for(n = 0; n < avx_iters; n++) + for (n = 0; n < avx_iters; n++) { __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0); __VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3); @@ 
-489,13 +495,13 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_u_avx(int16_t** result, // no negatives c = _mm256_cvtepi32_ps(local_code_chip_index_reg); - negatives = _mm256_cmp_ps(c, zeros, 0x01 ); + negatives = _mm256_cmp_ps(c, zeros, 0x01); aux3 = _mm256_and_ps(code_length_chips_reg_f, negatives); aux = _mm256_add_ps(c, aux3); local_code_chip_index_reg = _mm256_cvttps_epi32(aux); _mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg); - for(k = 0; k < 8; ++k) + for (k = 0; k < 8; ++k) { _result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]]; } @@ -505,7 +511,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_u_avx(int16_t** result, _mm256_zeroupper(); for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++) { - for(n = avx_iters * 8; n < num_points; n++) + for (n = avx_iters * 8; n < num_points; n++) { // resample code for current tap local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); @@ -531,7 +537,8 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_neon(int16_t** result, c const float32x4_t rem_code_phase_chips_reg = vdupq_n_f32(rem_code_phase_chips); const float32x4_t code_phase_step_chips_reg = vdupq_n_f32(code_phase_step_chips); - __VOLK_ATTR_ALIGNED(16) int32_t local_code_chip_index[4]; + __VOLK_ATTR_ALIGNED(16) + int32_t local_code_chip_index[4]; int32_t local_code_chip_index_; const int32x4_t zeros = vdupq_n_s32(0); @@ -539,11 +546,12 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_neon(int16_t** result, c const int32x4_t code_length_chips_reg_i = vdupq_n_s32((int32_t)code_length_chips); int32x4_t local_code_chip_index_reg, aux_i, negatives, i; float32x4_t aux, aux2, shifts_chips_reg, fi, c, j, cTrunc, base, indexn, reciprocal; - __VOLK_ATTR_ALIGNED(16) const float vec[4] = { 0.0f, 1.0f, 2.0f, 3.0f }; + __VOLK_ATTR_ALIGNED(16) + const float vec[4] = {0.0f, 1.0f, 2.0f, 3.0f}; uint32x4_t igx; reciprocal = vrecpeq_f32(code_length_chips_reg_f); reciprocal = vmulq_f32(vrecpsq_f32(code_length_chips_reg_f, reciprocal), reciprocal); - reciprocal = vmulq_f32(vrecpsq_f32(code_length_chips_reg_f, reciprocal), reciprocal); // this refinement is required! + reciprocal = vmulq_f32(vrecpsq_f32(code_length_chips_reg_f, reciprocal), reciprocal); // this refinement is required! 
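NEON has no packed float division, so the fmod in this kernel builds 1/code_length_chips from vrecpeq_f32, whose estimate carries only about 8 bits of precision, followed by two Newton-Raphson steps through vrecpsq_f32; with a single step, the float-to-integer truncation that follows can land on the wrong chip index, which is why the comment insists the second refinement is required. A scalar model of the recurrence (vrecpsq_f32(x, r) computes 2 - x*r):

/* Scalar model of the NEON reciprocal refinement. Each step
 * r = r * (2 - x*r) is one Newton-Raphson iteration and roughly doubles
 * the number of correct bits, taking the ~8-bit vrecpeq_f32 estimate to
 * full single precision after two steps. */
static float refined_reciprocal(float x, float estimate)
{
    float r = estimate;
    r = r * (2.0f - x * r); /* first refinement */
    r = r * (2.0f - x * r); /* second refinement: required before truncation */
    return r;
}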
float32x4_t n0 = vld1q_f32((float*)vec); int current_correlator_tap; unsigned int n; @@ -553,7 +561,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_neon(int16_t** result, c shifts_chips_reg = vdupq_n_f32((float)shifts_chips[current_correlator_tap]); aux2 = vsubq_f32(shifts_chips_reg, rem_code_phase_chips_reg); indexn = n0; - for(n = 0; n < neon_iters; n++) + for (n = 0; n < neon_iters; n++) { __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][4 * n + 3], 1, 0); __VOLK_GNSSSDR_PREFETCH(&local_code_chip_index[4]); @@ -569,7 +577,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_neon(int16_t** result, c // fmod c = vmulq_f32(aux, reciprocal); - i = vcvtq_s32_f32(c); + i = vcvtq_s32_f32(c); cTrunc = vcvtq_f32_s32(i); base = vmulq_f32(cTrunc, code_length_chips_reg_f); aux = vsubq_f32(aux, base); @@ -581,13 +589,13 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_neon(int16_t** result, c vst1q_s32((int32_t*)local_code_chip_index, local_code_chip_index_reg); - for(k = 0; k < 4; ++k) + for (k = 0; k < 4; ++k) { _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]]; } indexn = vaddq_f32(indexn, fours); } - for(n = neon_iters * 4; n < num_points; n++) + for (n = neon_iters * 4; n < num_points; n++) { __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][n], 1, 0); // resample code for current tap @@ -605,4 +613,3 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_neon(int16_t** result, c #endif /*INCLUDED_volk_gnsssdr_16i_xn_resampler_16i_xn_H*/ - diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn.h index 230401ccb..fbf7e31f1 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn.h @@ -86,11 +86,11 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_generic(lv_16s unsigned int n; for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - result[n_vec] = lv_cmake(0,0); + result[n_vec] = lv_cmake(0, 0); } for (n = 0; n < num_points; n++) { - tmp16 = *in_common++; //if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); + tmp16 = *in_common++; //if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase); tmp16 = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32))); @@ -131,14 +131,14 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_generic_reload const unsigned int ROTATOR_RELOAD = 256; for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - result[n_vec] = lv_cmake(0,0); + result[n_vec] = lv_cmake(0, 0); } for (n = 0; n < num_points / ROTATOR_RELOAD; n++) { for (j = 0; j < ROTATOR_RELOAD; j++) { - tmp16 = *in_common++; //if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); + tmp16 = *in_common++; //if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase); tmp16 = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32))); 
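The generic implementation spells out the arithmetic contract every SIMD variant must reproduce: promote the 16-bit complex sample to float, rotate it by the current oscillator phase, round back to 16 bits, advance the phase, and then accumulate the product with each tap's real-valued code sample in saturating 16-bit arithmetic (the 16-bit product itself wraps; only the accumulation saturates). A scalar sketch of one iteration, using C99 complex floats in place of lv_16sc_t/lv_32fc_t and mirroring sat_adds16i from the library's saturation_arithmetic.h:

#include <complex.h>
#include <math.h>
#include <stdint.h>

static int16_t sat_adds16i_sketch(int16_t x, int16_t y)
{
    const int32_t sum = (int32_t)x + (int32_t)y;
    if (sum < INT16_MIN) return INT16_MIN;
    if (sum > INT16_MAX) return INT16_MAX;
    return (int16_t)sum;
}

/* One loop iteration of the generic rotator dot product. acc_re/acc_im map
 * to the real and imaginary parts of result[], code[] to in_a[.][n]. */
static void rotate_and_accumulate(int16_t acc_re[], int16_t acc_im[],
    const int16_t code[], int num_taps,
    float complex* phase, float complex phase_inc,
    int16_t sample_re, int16_t sample_im)
{
    int t;
    const float complex rotated = ((float)sample_re + (float)sample_im * I) * (*phase);
    const int16_t rre = (int16_t)rintf(crealf(rotated));
    const int16_t rim = (int16_t)rintf(cimagf(rotated));
    *phase *= phase_inc; /* advance the local oscillator */
    for (t = 0; t < num_taps; t++)
        {
            /* 16-bit products wrap, as with _mm_mullo_epi16 */
            acc_re[t] = sat_adds16i_sketch(acc_re[t], (int16_t)(rre * code[t]));
            acc_im[t] = sat_adds16i_sketch(acc_im[t], (int16_t)(rim * code[t]));
        }
}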
(*phase) *= phase_inc; @@ -149,7 +149,7 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_generic_reload result[n_vec] = lv_cmake(sat_adds16i(lv_creal(result[n_vec]), lv_creal(tmp)), sat_adds16i(lv_cimag(result[n_vec]), lv_cimag(tmp))); } } - /* Regenerate phase */ + /* Regenerate phase */ #ifdef __cplusplus (*phase) /= std::abs((*phase)); #else @@ -160,13 +160,13 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_generic_reload for (j = 0; j < num_points % ROTATOR_RELOAD; j++) { - tmp16 = *in_common++; //if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); + tmp16 = *in_common++; //if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase); tmp16 = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32))); (*phase) *= phase_inc; for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - lv_16sc_t tmp = tmp16 * in_a[n_vec][ (num_points / ROTATOR_RELOAD) * ROTATOR_RELOAD + j ]; + lv_16sc_t tmp = tmp16 * in_a[n_vec][(num_points / ROTATOR_RELOAD) * ROTATOR_RELOAD + j]; //lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n])))); result[n_vec] = lv_cmake(sat_adds16i(lv_creal(result[n_vec]), lv_creal(tmp)), sat_adds16i(lv_cimag(result[n_vec]), lv_cimag(tmp))); } @@ -179,9 +179,9 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_generic_reload #ifdef LV_HAVE_SSE3 #include -static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const int16_t** in_a, int num_a_vectors, unsigned int num_points) +static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const int16_t** in_a, int num_a_vectors, unsigned int num_points) { - lv_16sc_t dotProduct = lv_cmake(0,0); + lv_16sc_t dotProduct = lv_cmake(0, 0); const unsigned int sse_iters = num_points / 4; int n_vec; @@ -192,7 +192,8 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc const lv_16sc_t* _in_common = in_common; lv_16sc_t* _out = result; - __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) + lv_16sc_t dotProductVector[4]; __m128i* cacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment()); @@ -206,11 +207,13 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc // phase rotation registers __m128 pa, pb, two_phase_acc_reg, two_phase_inc_reg; __m128i pc1, pc2; - __VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_inc[2]; + __VOLK_ATTR_ALIGNED(16) + lv_32fc_t two_phase_inc[2]; two_phase_inc[0] = phase_inc * phase_inc; two_phase_inc[1] = phase_inc * phase_inc; - two_phase_inc_reg = _mm_load_ps((float*) two_phase_inc); - __VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_acc[2]; + two_phase_inc_reg = _mm_load_ps((float*)two_phase_inc); + __VOLK_ATTR_ALIGNED(16) + lv_32fc_t two_phase_acc[2]; two_phase_acc[0] = (*phase); two_phase_acc[1] = (*phase) * phase_inc; two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc); @@ -218,62 +221,62 @@ static inline void 
volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc lv_16sc_t tmp16; lv_32fc_t tmp32; - for(number = 0; number < sse_iters; number++) + for (number = 0; number < sse_iters; number++) { // Phase rotation on operand in_common starts here: //printf("generic phase %i: %f,%f\n", n*4,lv_creal(*phase),lv_cimag(*phase)); - pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg //complex 32fc multiplication b=a*two_phase_acc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - pc1 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + pc1 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di //next two samples _in_common += 2; - pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg __VOLK_GNSSSDR_PREFETCH(_in_common + 8); //complex 32fc multiplication b=a*two_phase_acc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = 
_mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - pc2 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + pc2 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di // store four rotated in_common samples in the register b - b = _mm_packs_epi32(pc1, pc2);// convert from 32ic to 16ic + b = _mm_packs_epi32(pc1, pc2); // convert from 32ic to 16ic //next two samples _in_common += 2; for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - a = _mm_loadl_epi64((__m128i*)&(_in_a[n_vec][number*4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg + a = _mm_loadl_epi64((__m128i*)&(_in_a[n_vec][number * 4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg - a = _mm_unpacklo_epi16( a, a ); + a = _mm_unpacklo_epi16(a, a); - c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... + c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... 
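The six-intrinsic sequence above is the standard SSE3 idiom for multiplying two packed complex floats, used here both to rotate the incoming samples and to advance the phase accumulator: duplicate the real parts, duplicate the imaginary parts, form both cross-product vectors, and let _mm_addsub_ps apply the subtract/add pattern that yields (ar*cr - ai*ci, ai*cr + ar*ci). Isolated as a helper (the name is illustrative; the library ships an equivalent in its SSE3 intrinsics header):

#include <pmmintrin.h> /* SSE3 */

/* Multiply two packed complex floats laid out as (re0, im0, re1, im1). */
static inline __m128 complexmul_ps(__m128 x, __m128 y)
{
    const __m128 yl = _mm_moveldup_ps(y);         /* cr, cr, dr, dr             */
    const __m128 yh = _mm_movehdup_ps(y);         /* ci, ci, di, di             */
    const __m128 t1 = _mm_mul_ps(x, yl);          /* ar*cr, ai*cr, br*dr, bi*dr */
    const __m128 xs = _mm_shuffle_ps(x, x, 0xB1); /* ai, ar, bi, br             */
    const __m128 t2 = _mm_mul_ps(xs, yh);         /* ai*ci, ar*ci, bi*di, br*di */
    return _mm_addsub_ps(t1, t2);                 /* even lanes subtract, odd lanes add */
}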
cacc[n_vec] = _mm_adds_epi16(cacc[n_vec], c); } @@ -290,14 +293,13 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - a = cacc[n_vec]; - _mm_store_si128((__m128i*)dotProductVector, a); // Store the results back into the dot product vector - dotProduct = lv_cmake(0,0); + _mm_store_si128((__m128i*)dotProductVector, a); // Store the results back into the dot product vector + dotProduct = lv_cmake(0, 0); for (i = 0; i < 4; ++i) { dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[i])), - sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i]))); + sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i]))); } _out[n_vec] = dotProduct; } @@ -313,7 +315,7 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc //(*phase) = lv_cmake((float*)two_phase_acc[0], (float*)two_phase_acc[1]); (*phase) = two_phase_acc[0]; - for(n = sse_iters * 4; n < num_points; n++) + for (n = sse_iters * 4; n < num_points; n++) { tmp16 = in_common[n]; //printf("a_sse phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase); @@ -325,7 +327,7 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc lv_16sc_t tmp = tmp16 * in_a[n_vec][n]; //lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n])))); _out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), - sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); + sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); } } } @@ -337,245 +339,245 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc //static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_sse3_reload(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const int16_t** in_a, int num_a_vectors, unsigned int num_points) //{ - //lv_16sc_t dotProduct = lv_cmake(0,0); +//lv_16sc_t dotProduct = lv_cmake(0,0); - //const unsigned int sse_iters = num_points / 4; - //const unsigned int ROTATOR_RELOAD = 128; - //int n_vec; - //int i; - //unsigned int number; - //unsigned int j; - //unsigned int n; +//const unsigned int sse_iters = num_points / 4; +//const unsigned int ROTATOR_RELOAD = 128; +//int n_vec; +//int i; +//unsigned int number; +//unsigned int j; +//unsigned int n; - //const int16_t** _in_a = in_a; - //const lv_16sc_t* _in_common = in_common; - //lv_16sc_t* _out = result; +//const int16_t** _in_a = in_a; +//const lv_16sc_t* _in_common = in_common; +//lv_16sc_t* _out = result; - //__VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; +//__VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; - //__m128i* realcacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment()); - //__m128i* imagcacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment()); +//__m128i* realcacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment()); +//__m128i* imagcacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment()); - //for (n_vec = 0; n_vec < num_a_vectors; n_vec++) - //{ - //realcacc[n_vec] = 
_mm_setzero_si128(); - //imagcacc[n_vec] = _mm_setzero_si128(); - //} +//for (n_vec = 0; n_vec < num_a_vectors; n_vec++) +//{ +//realcacc[n_vec] = _mm_setzero_si128(); +//imagcacc[n_vec] = _mm_setzero_si128(); +//} - //__m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl; +//__m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl; - //mask_imag = _mm_set_epi8(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0); - //mask_real = _mm_set_epi8(0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255); +//mask_imag = _mm_set_epi8(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0); +//mask_real = _mm_set_epi8(0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255); - //// phase rotation registers - //__m128 pa, pb, two_phase_acc_reg, two_phase_inc_reg; - //__m128i pc1, pc2; - //__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_inc[2]; - //two_phase_inc[0] = phase_inc * phase_inc; - //two_phase_inc[1] = phase_inc * phase_inc; - //two_phase_inc_reg = _mm_load_ps((float*) two_phase_inc); - //__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_acc[2]; - //two_phase_acc[0] = (*phase); - //two_phase_acc[1] = (*phase) * phase_inc; - //two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc); - //__m128 yl, yh, tmp1, tmp2, tmp3; - //lv_16sc_t tmp16; - //lv_32fc_t tmp32; +//// phase rotation registers +//__m128 pa, pb, two_phase_acc_reg, two_phase_inc_reg; +//__m128i pc1, pc2; +//__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_inc[2]; +//two_phase_inc[0] = phase_inc * phase_inc; +//two_phase_inc[1] = phase_inc * phase_inc; +//two_phase_inc_reg = _mm_load_ps((float*) two_phase_inc); +//__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_acc[2]; +//two_phase_acc[0] = (*phase); +//two_phase_acc[1] = (*phase) * phase_inc; +//two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc); +//__m128 yl, yh, tmp1, tmp2, tmp3; +//lv_16sc_t tmp16; +//lv_32fc_t tmp32; - //for (number = 0; number < sse_iters / ROTATOR_RELOAD; ++number) - //{ - //for (j = 0; j < ROTATOR_RELOAD; j++) - //{ - //// Phase rotation on operand in_common starts here: - ////printf("generic phase %i: %f,%f\n", n*4,lv_creal(*phase),lv_cimag(*phase)); - //pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg - ////complex 32fc multiplication b=a*two_phase_acc_reg - //yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - //yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - //tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - //pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br - //tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - //pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - //pc1 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic +//for (number = 0; number < sse_iters / ROTATOR_RELOAD; ++number) +//{ +//for (j = 0; j < ROTATOR_RELOAD; j++) +//{ +//// Phase rotation on operand in_common starts here: +////printf("generic phase %i: %f,%f\n", n*4,lv_creal(*phase),lv_cimag(*phase)); +//pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg +////complex 32fc multiplication b=a*two_phase_acc_reg +//yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with 
cr,cr,dr,dr +//yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di +//tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr +//pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br +//tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di +//pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +//pc1 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic - ////complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - //yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - //yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - //tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - //tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br - //tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - //two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +////complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg +//yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr +//yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di +//tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr +//tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br +//tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di +//two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - ////next two samples - //_in_common += 2; - //pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg - //__VOLK_GNSSSDR_PREFETCH(_in_common + 8); - ////complex 32fc multiplication b=a*two_phase_acc_reg - //yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - //yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - //tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - //pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br - //tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - //pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - //pc2 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic +////next two samples +//_in_common += 2; +//pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg +//__VOLK_GNSSSDR_PREFETCH(_in_common + 8); +////complex 32fc multiplication b=a*two_phase_acc_reg +//yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr +//yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di +//tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr +//pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br +//tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di +//pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +//pc2 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic - ////complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - //yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - //yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - 
//tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - //tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br - //tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - //two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +////complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg +//yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr +//yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di +//tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr +//tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br +//tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di +//two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - //// store four rotated in_common samples in the register b - //b = _mm_packs_epi32(pc1, pc2);// convert from 32ic to 16ic +//// store four rotated in_common samples in the register b +//b = _mm_packs_epi32(pc1, pc2);// convert from 32ic to 16ic - ////next two samples - //_in_common += 2; +////next two samples +//_in_common += 2; - //for (n_vec = 0; n_vec < num_a_vectors; n_vec++) - //{ - //a = _mm_load_si128((__m128i*)&(_in_a[n_vec][(number * ROTATOR_RELOAD + j) * 4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg +//for (n_vec = 0; n_vec < num_a_vectors; n_vec++) +//{ +//a = _mm_load_si128((__m128i*)&(_in_a[n_vec][(number * ROTATOR_RELOAD + j) * 4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg - //c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... +//c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... - //c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. - //real = _mm_subs_epi16(c, c_sr); +//c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. +//real = _mm_subs_epi16(c, c_sr); - //b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... - //a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... +//b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... +//a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... - //imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... - //imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... +//imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... +//imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... 
- //imag = _mm_adds_epi16(imag1, imag2); +//imag = _mm_adds_epi16(imag1, imag2); - //realcacc[n_vec] = _mm_adds_epi16(realcacc[n_vec], real); - //imagcacc[n_vec] = _mm_adds_epi16(imagcacc[n_vec], imag); - //} - //} - //// regenerate phase - //tmp1 = _mm_mul_ps(two_phase_acc_reg, two_phase_acc_reg); - //tmp2 = _mm_hadd_ps(tmp1, tmp1); - //tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8); - //tmp2 = _mm_sqrt_ps(tmp1); - //two_phase_acc_reg = _mm_div_ps(two_phase_acc_reg, tmp2); - //} +//realcacc[n_vec] = _mm_adds_epi16(realcacc[n_vec], real); +//imagcacc[n_vec] = _mm_adds_epi16(imagcacc[n_vec], imag); +//} +//} +//// regenerate phase +//tmp1 = _mm_mul_ps(two_phase_acc_reg, two_phase_acc_reg); +//tmp2 = _mm_hadd_ps(tmp1, tmp1); +//tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8); +//tmp2 = _mm_sqrt_ps(tmp1); +//two_phase_acc_reg = _mm_div_ps(two_phase_acc_reg, tmp2); +//} - //for (j = 0; j < sse_iters % ROTATOR_RELOAD; j++) - //{ - //pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg - ////complex 32fc multiplication b=a*two_phase_acc_reg - //yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - //yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - //tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - //pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br - //tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - //pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - //pc1 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic +//for (j = 0; j < sse_iters % ROTATOR_RELOAD; j++) +//{ +//pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg +////complex 32fc multiplication b=a*two_phase_acc_reg +//yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr +//yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di +//tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr +//pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br +//tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di +//pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +//pc1 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic - ////complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - //yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - //yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - //tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - //tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br - //tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - //two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +////complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg +//yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr +//yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di +//tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr +//tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br +//tmp2 = _mm_mul_ps(tmp3, yh); // 
tmp2 = ai*ci,ar*ci,bi*di,br*di +//two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - ////next two samples - //_in_common += 2; - //pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg - //__VOLK_GNSSSDR_PREFETCH(_in_common + 8); - ////complex 32fc multiplication b=a*two_phase_acc_reg - //yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - //yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - //tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - //pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br - //tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - //pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - //pc2 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic +////next two samples +//_in_common += 2; +//pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg +//__VOLK_GNSSSDR_PREFETCH(_in_common + 8); +////complex 32fc multiplication b=a*two_phase_acc_reg +//yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr +//yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di +//tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr +//pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br +//tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di +//pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +//pc2 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic - ////complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - //yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - //yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - //tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - //tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br - //tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - //two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di +////complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg +//yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr +//yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di +//tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr +//tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br +//tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di +//two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - //// store four rotated in_common samples in the register b - //b = _mm_packs_epi32(pc1, pc2);// convert from 32ic to 16ic +//// store four rotated in_common samples in the register b +//b = _mm_packs_epi32(pc1, pc2);// convert from 32ic to 16ic - ////next two samples - //_in_common += 2; +////next two samples +//_in_common += 2; - //for (n_vec = 0; n_vec < num_a_vectors; n_vec++) - //{ - //a = _mm_load_si128((__m128i*)&(_in_a[n_vec][((sse_iters / ROTATOR_RELOAD) * ROTATOR_RELOAD + j) * 4])); //load (2 byte imag, 2 byte real) x 
4 into 128 bits reg +//for (n_vec = 0; n_vec < num_a_vectors; n_vec++) +//{ +//a = _mm_load_si128((__m128i*)&(_in_a[n_vec][((sse_iters / ROTATOR_RELOAD) * ROTATOR_RELOAD + j) * 4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg - //c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... +//c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... - //c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. - //real = _mm_subs_epi16(c, c_sr); +//c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. +//real = _mm_subs_epi16(c, c_sr); - //b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... - //a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... +//b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... +//a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... - //imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... - //imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... +//imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... +//imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... - //imag = _mm_adds_epi16(imag1, imag2); +//imag = _mm_adds_epi16(imag1, imag2); - //realcacc[n_vec] = _mm_adds_epi16(realcacc[n_vec], real); - //imagcacc[n_vec] = _mm_adds_epi16(imagcacc[n_vec], imag); - //} - //} +//realcacc[n_vec] = _mm_adds_epi16(realcacc[n_vec], real); +//imagcacc[n_vec] = _mm_adds_epi16(imagcacc[n_vec], imag); +//} +//} - //for (n_vec = 0; n_vec < num_a_vectors; n_vec++) - //{ - //realcacc[n_vec] = _mm_and_si128(realcacc[n_vec], mask_real); - //imagcacc[n_vec] = _mm_and_si128(imagcacc[n_vec], mask_imag); +//for (n_vec = 0; n_vec < num_a_vectors; n_vec++) +//{ +//realcacc[n_vec] = _mm_and_si128(realcacc[n_vec], mask_real); +//imagcacc[n_vec] = _mm_and_si128(imagcacc[n_vec], mask_imag); - //a = _mm_or_si128(realcacc[n_vec], imagcacc[n_vec]); +//a = _mm_or_si128(realcacc[n_vec], imagcacc[n_vec]); - //_mm_store_si128((__m128i*)dotProductVector, a); // Store the results back into the dot product vector - //dotProduct = lv_cmake(0,0); - //for (i = 0; i < 4; ++i) - //{ - //dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[i])), - //sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i]))); - //} - //_out[n_vec] = dotProduct; - //} +//_mm_store_si128((__m128i*)dotProductVector, a); // Store the results back into the dot product vector +//dotProduct = lv_cmake(0,0); +//for (i = 0; i < 4; ++i) +//{ +//dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[i])), +//sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i]))); +//} +//_out[n_vec] = dotProduct; +//} - //volk_gnsssdr_free(realcacc); - //volk_gnsssdr_free(imagcacc); +//volk_gnsssdr_free(realcacc); +//volk_gnsssdr_free(imagcacc); - //tmp1 = _mm_mul_ps(two_phase_acc_reg, two_phase_acc_reg); - //tmp2 = _mm_hadd_ps(tmp1, tmp1); - //tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8); - //tmp2 = _mm_sqrt_ps(tmp1); - //two_phase_acc_reg = _mm_div_ps(two_phase_acc_reg, tmp2); +//tmp1 = _mm_mul_ps(two_phase_acc_reg, two_phase_acc_reg); +//tmp2 = _mm_hadd_ps(tmp1, tmp1); +//tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8); +//tmp2 = _mm_sqrt_ps(tmp1); +//two_phase_acc_reg = _mm_div_ps(two_phase_acc_reg, tmp2); - //_mm_store_ps((float*)two_phase_acc, two_phase_acc_reg); - ////(*phase) = lv_cmake((float*)two_phase_acc[0], (float*)two_phase_acc[1]); - //(*phase) = two_phase_acc[0]; +//_mm_store_ps((float*)two_phase_acc, two_phase_acc_reg); +////(*phase) = lv_cmake((float*)two_phase_acc[0], 
(float*)two_phase_acc[1]); +//(*phase) = two_phase_acc[0]; - //for(n = sse_iters * 4; n < num_points; n++) - //{ - //tmp16 = in_common[n]; //printf("a_sse phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); - //tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase); - //tmp16 = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32))); - //(*phase) *= phase_inc; +//for(n = sse_iters * 4; n < num_points; n++) +//{ +//tmp16 = in_common[n]; //printf("a_sse phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); +//tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase); +//tmp16 = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32))); +//(*phase) *= phase_inc; - //for (n_vec = 0; n_vec < num_a_vectors; n_vec++) - //{ - //lv_16sc_t tmp = tmp16 * in_a[n_vec][n]; - ////lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n])))); - //_out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), - //sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); - //} - //} +//for (n_vec = 0; n_vec < num_a_vectors; n_vec++) +//{ +//lv_16sc_t tmp = tmp16 * in_a[n_vec][n]; +////lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n])))); +//_out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), +//sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); +//} +//} //} //#endif [> LV_HAVE_SSE3 <] @@ -584,9 +586,9 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc #ifdef LV_HAVE_SSE3 #include -static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const int16_t** in_a, int num_a_vectors, unsigned int num_points) +static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const int16_t** in_a, int num_a_vectors, unsigned int num_points) { - lv_16sc_t dotProduct = lv_cmake(0,0); + lv_16sc_t dotProduct = lv_cmake(0, 0); const unsigned int sse_iters = num_points / 4; int n_vec; @@ -597,7 +599,8 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc const lv_16sc_t* _in_common = in_common; lv_16sc_t* _out = result; - __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) + lv_16sc_t dotProductVector[4]; __m128i* cacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment()); @@ -611,11 +614,13 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc // phase rotation registers __m128 pa, pb, two_phase_acc_reg, two_phase_inc_reg; __m128i pc1, pc2; - __VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_inc[2]; + __VOLK_ATTR_ALIGNED(16) + lv_32fc_t two_phase_inc[2]; two_phase_inc[0] = phase_inc * phase_inc; two_phase_inc[1] = phase_inc * phase_inc; - two_phase_inc_reg = _mm_load_ps((float*) two_phase_inc); - __VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_acc[2]; + two_phase_inc_reg = _mm_load_ps((float*)two_phase_inc); + 
__VOLK_ATTR_ALIGNED(16) + lv_32fc_t two_phase_acc[2]; two_phase_acc[0] = (*phase); two_phase_acc[1] = (*phase) * phase_inc; two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc); @@ -623,62 +628,62 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc lv_16sc_t tmp16; lv_32fc_t tmp32; - for(number = 0; number < sse_iters; number++) + for (number = 0; number < sse_iters; number++) { // Phase rotation on operand in_common starts here: //printf("generic phase %i: %f,%f\n", n*4,lv_creal(*phase),lv_cimag(*phase)); - pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg //complex 32fc multiplication b=a*two_phase_acc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - pc1 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + pc1 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di //next two samples _in_common += 2; - pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg 
__VOLK_GNSSSDR_PREFETCH(_in_common + 8); //complex 32fc multiplication b=a*two_phase_acc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - pc2 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + pc2 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di // store four rotated in_common samples in the register b - b = _mm_packs_epi32(pc1, pc2);// convert from 32ic to 16ic + b = _mm_packs_epi32(pc1, pc2); // convert from 32ic to 16ic //next two samples _in_common += 2; for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - a = _mm_loadl_epi64((__m128i*)&(_in_a[n_vec][number*4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg + a = _mm_loadl_epi64((__m128i*)&(_in_a[n_vec][number * 4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg - a = _mm_unpacklo_epi16( a, a ); + a = _mm_unpacklo_epi16(a, a); - c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... + c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... 
cacc[n_vec] = _mm_adds_epi16(cacc[n_vec], c); } @@ -695,14 +700,13 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - a = cacc[n_vec]; - _mm_store_si128((__m128i*)dotProductVector, a); // Store the results back into the dot product vector - dotProduct = lv_cmake(0,0); + _mm_store_si128((__m128i*)dotProductVector, a); // Store the results back into the dot product vector + dotProduct = lv_cmake(0, 0); for (i = 0; i < 4; ++i) { dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[i])), - sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i]))); + sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i]))); } _out[n_vec] = dotProduct; } @@ -718,7 +722,7 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc //(*phase) = lv_cmake((float*)two_phase_acc[0], (float*)two_phase_acc[1]); (*phase) = two_phase_acc[0]; - for(n = sse_iters * 4; n < num_points; n++) + for (n = sse_iters * 4; n < num_points; n++) { tmp16 = in_common[n]; //printf("a_sse phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase); @@ -730,7 +734,7 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc lv_16sc_t tmp = tmp16 * in_a[n_vec][n]; //lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n])))); _out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), - sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); + sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); } } } @@ -742,7 +746,7 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc #include #include -static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const int16_t** in_a, int num_a_vectors, unsigned int num_points) +static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const int16_t** in_a, int num_a_vectors, unsigned int num_points) { const unsigned int avx2_iters = num_points / 8; const int16_t** _in_a = in_a; @@ -755,8 +759,9 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc lv_16sc_t tmp16; lv_32fc_t tmp32; - __VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8]; - lv_16sc_t dotProduct = lv_cmake(0,0); + __VOLK_ATTR_ALIGNED(32) + lv_16sc_t dotProductVector[8]; + lv_16sc_t dotProduct = lv_cmake(0, 0); __m256i* cacc = (__m256i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256i), volk_gnsssdr_get_alignment()); @@ -771,7 +776,7 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc __m256 four_phase_acc_reg, four_phase_inc_reg; - lv_32fc_t _phase_inc = phase_inc*phase_inc*phase_inc*phase_inc; + lv_32fc_t _phase_inc = phase_inc * phase_inc * phase_inc * phase_inc; // Normalise the 4*phase increment #ifdef __cplusplus @@ -780,55 +785,57 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc _phase_inc /= hypotf(lv_creal(_phase_inc), lv_cimag(_phase_inc)); #endif - __VOLK_ATTR_ALIGNED(32) lv_32fc_t four_phase_inc[4]; - 
__VOLK_ATTR_ALIGNED(32) lv_32fc_t four_phase_acc[4]; - for( n = 0; n < 4; ++n ) - { - four_phase_inc[n] = _phase_inc; - four_phase_acc[n] = *phase; - *phase *= phase_inc; - } - four_phase_acc_reg = _mm256_load_ps((float*) four_phase_acc); - four_phase_inc_reg = _mm256_load_ps((float*) four_phase_inc); + __VOLK_ATTR_ALIGNED(32) + lv_32fc_t four_phase_inc[4]; + __VOLK_ATTR_ALIGNED(32) + lv_32fc_t four_phase_acc[4]; + for (n = 0; n < 4; ++n) + { + four_phase_inc[n] = _phase_inc; + four_phase_acc[n] = *phase; + *phase *= phase_inc; + } + four_phase_acc_reg = _mm256_load_ps((float*)four_phase_acc); + four_phase_inc_reg = _mm256_load_ps((float*)four_phase_inc); __m256i a2, b2, c, c1, c2, perm_idx; - perm_idx = _mm256_set_epi32( 7, 6, 3, 2, 5, 4, 1, 0); + perm_idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); //perm_idx = _mm256_set_epi32( 0, 1, 4, 5, 2, 3, 6, 7); - for(number = 0; number < avx2_iters; number++) + for (number = 0; number < avx2_iters; number++) { - a128 = _mm_load_si128( (__m128i *)_in_common ); - ai = _mm256_cvtepi16_epi32( a128 ); - a = _mm256_cvtepi32_ps( ai ); + a128 = _mm_load_si128((__m128i*)_in_common); + ai = _mm256_cvtepi16_epi32(a128); + a = _mm256_cvtepi32_ps(ai); //complex 32fc multiplication b=a*two_phase_acc_reg - b = _mm256_complexmul_ps( a, four_phase_acc_reg ); - c1 = _mm256_cvtps_epi32(b); // convert from 32fc to 32ic + b = _mm256_complexmul_ps(a, four_phase_acc_reg); + c1 = _mm256_cvtps_epi32(b); // convert from 32fc to 32ic //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - four_phase_acc_reg = _mm256_complexmul_ps( four_phase_inc_reg, four_phase_acc_reg ); + four_phase_acc_reg = _mm256_complexmul_ps(four_phase_inc_reg, four_phase_acc_reg); //next four samples _in_common += 4; - a128 = _mm_load_si128( (__m128i *)_in_common ); - ai = _mm256_cvtepi16_epi32( a128 ); - a = _mm256_cvtepi32_ps( ai ); + a128 = _mm_load_si128((__m128i*)_in_common); + ai = _mm256_cvtepi16_epi32(a128); + a = _mm256_cvtepi32_ps(ai); //complex 32fc multiplication b=a*two_phase_acc_reg - b = _mm256_complexmul_ps( a, four_phase_acc_reg ); - c2 = _mm256_cvtps_epi32(b); // convert from 32fc to 32ic + b = _mm256_complexmul_ps(a, four_phase_acc_reg); + c2 = _mm256_cvtps_epi32(b); // convert from 32fc to 32ic //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - four_phase_acc_reg = _mm256_complexmul_ps( four_phase_inc_reg, four_phase_acc_reg ); + four_phase_acc_reg = _mm256_complexmul_ps(four_phase_inc_reg, four_phase_acc_reg); __VOLK_GNSSSDR_PREFETCH(_in_common + 16); // Store and convert 32ic to 16ic: - b2 = _mm256_packs_epi32( c1, c2 ); + b2 = _mm256_packs_epi32(c1, c2); - b2 = _mm256_permutevar8x32_epi32( b2, perm_idx ); + b2 = _mm256_permutevar8x32_epi32(b2, perm_idx); _in_common += 4; @@ -836,10 +843,10 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc { ain_128 = _mm_load_si128((__m128i*)&(_in_a[n_vec][number * 8])); - ain_128_lo = _mm_unpacklo_epi16( ain_128, ain_128 ); - ain_128_hi = _mm_unpackhi_epi16( ain_128, ain_128 ); + ain_128_lo = _mm_unpacklo_epi16(ain_128, ain_128); + ain_128_hi = _mm_unpackhi_epi16(ain_128, ain_128); - a2 = _mm256_insertf128_si256( _mm256_castsi128_si256(ain_128_lo), ain_128_hi, 1); + a2 = _mm256_insertf128_si256(_mm256_castsi128_si256(ain_128_lo), ain_128_hi, 1); c = _mm256_mullo_epi16(a2, b2); @@ -856,12 +863,12 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc { a2 = cacc[n_vec]; - 
_mm256_store_si256((__m256i*)dotProductVector, a2); // Store the results back into the dot product vector
- dotProduct = lv_cmake(0,0);
+ _mm256_store_si256((__m256i*)dotProductVector, a2); // Store the results back into the dot product vector
+ dotProduct = lv_cmake(0, 0);
for (number = 0; number < 8; ++number)
{
dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])),
- sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number])));
+ sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number])));
}
_out[n_vec] = dotProduct;
}
@@ -872,7 +879,7 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc
_mm256_store_ps((float*)four_phase_acc, four_phase_acc_reg);
(*phase) = four_phase_acc[0];
- for(n = avx2_iters * 8; n < num_points; n++)
+ for (n = avx2_iters * 8; n < num_points; n++)
{
tmp16 = in_common[n];
tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase);
@@ -882,10 +889,9 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc
{
lv_16sc_t tmp = tmp16 * in_a[n_vec][n];
_out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)),
- sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp)));
+ sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp)));
}
}
-
}
#endif /* LV_HAVE_AVX2 */
@@ -894,7 +900,7 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc
#include <immintrin.h>
#include <volk_gnsssdr/volk_gnsssdr_avx_intrinsics.h>
-static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_u_avx2(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const int16_t** in_a, int num_a_vectors, unsigned int num_points)
+static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_u_avx2(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const int16_t** in_a, int num_a_vectors, unsigned int num_points)
{
const unsigned int avx2_iters = num_points / 8;
const int16_t** _in_a = in_a;
@@ -907,8 +913,9 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_u_avx2(lv_16sc
lv_16sc_t tmp16;
lv_32fc_t tmp32;
- __VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8];
- lv_16sc_t dotProduct = lv_cmake(0,0);
+ __VOLK_ATTR_ALIGNED(32)
+ lv_16sc_t dotProductVector[8];
+ lv_16sc_t dotProduct = lv_cmake(0, 0);
__m256i* cacc = (__m256i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256i), volk_gnsssdr_get_alignment());
@@ -923,7 +930,7 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_u_avx2(lv_16sc
__m256 four_phase_acc_reg, four_phase_inc_reg;
- lv_32fc_t _phase_inc = phase_inc*phase_inc*phase_inc*phase_inc;
+ lv_32fc_t _phase_inc = phase_inc * phase_inc * phase_inc * phase_inc;
// Normalise the 4*phase increment
#ifdef __cplusplus
@@ -932,55 +939,57 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_u_avx2(lv_16sc
_phase_inc /= hypotf(lv_creal(_phase_inc), lv_cimag(_phase_inc));
#endif
- __VOLK_ATTR_ALIGNED(32) lv_32fc_t four_phase_inc[4];
- __VOLK_ATTR_ALIGNED(32) lv_32fc_t four_phase_acc[4];
- for( n = 0; n < 4; ++n )
- {
- four_phase_inc[n] = _phase_inc;
- four_phase_acc[n] = *phase;
- *phase *= phase_inc;
- }
- four_phase_acc_reg = _mm256_load_ps((float*) four_phase_acc);
- four_phase_inc_reg = _mm256_load_ps((float*) four_phase_inc);
+ __VOLK_ATTR_ALIGNED(32)
+ lv_32fc_t four_phase_inc[4];
+ __VOLK_ATTR_ALIGNED(32)
+ lv_32fc_t four_phase_acc[4];
+ for (n = 0; n < 4; ++n)
+ {
+ four_phase_inc[n] = _phase_inc;
+ four_phase_acc[n] = *phase;
+ *phase *= phase_inc;
+ }
+
four_phase_acc_reg = _mm256_load_ps((float*)four_phase_acc); + four_phase_inc_reg = _mm256_load_ps((float*)four_phase_inc); __m256i a2, b2, c, c1, c2, perm_idx; - perm_idx = _mm256_set_epi32( 7, 6, 3, 2, 5, 4, 1, 0); + perm_idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0); //perm_idx = _mm256_set_epi32( 0, 1, 4, 5, 2, 3, 6, 7); - for(number = 0; number < avx2_iters; number++) + for (number = 0; number < avx2_iters; number++) { - a128 = _mm_loadu_si128( (__m128i *)_in_common ); - ai = _mm256_cvtepi16_epi32( a128 ); - a = _mm256_cvtepi32_ps( ai ); + a128 = _mm_loadu_si128((__m128i*)_in_common); + ai = _mm256_cvtepi16_epi32(a128); + a = _mm256_cvtepi32_ps(ai); //complex 32fc multiplication b=a*two_phase_acc_reg - b = _mm256_complexmul_ps( a, four_phase_acc_reg ); - c1 = _mm256_cvtps_epi32(b); // convert from 32fc to 32ic + b = _mm256_complexmul_ps(a, four_phase_acc_reg); + c1 = _mm256_cvtps_epi32(b); // convert from 32fc to 32ic //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - four_phase_acc_reg = _mm256_complexmul_ps( four_phase_inc_reg, four_phase_acc_reg ); + four_phase_acc_reg = _mm256_complexmul_ps(four_phase_inc_reg, four_phase_acc_reg); //next four samples _in_common += 4; - a128 = _mm_loadu_si128( (__m128i *)_in_common ); - ai = _mm256_cvtepi16_epi32( a128 ); - a = _mm256_cvtepi32_ps( ai ); + a128 = _mm_loadu_si128((__m128i*)_in_common); + ai = _mm256_cvtepi16_epi32(a128); + a = _mm256_cvtepi32_ps(ai); //complex 32fc multiplication b=a*two_phase_acc_reg - b = _mm256_complexmul_ps( a, four_phase_acc_reg ); - c2 = _mm256_cvtps_epi32(b); // convert from 32fc to 32ic + b = _mm256_complexmul_ps(a, four_phase_acc_reg); + c2 = _mm256_cvtps_epi32(b); // convert from 32fc to 32ic //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - four_phase_acc_reg = _mm256_complexmul_ps( four_phase_inc_reg, four_phase_acc_reg ); + four_phase_acc_reg = _mm256_complexmul_ps(four_phase_inc_reg, four_phase_acc_reg); __VOLK_GNSSSDR_PREFETCH(_in_common + 16); // Store and convert 32ic to 16ic: - b2 = _mm256_packs_epi32( c1, c2 ); + b2 = _mm256_packs_epi32(c1, c2); - b2 = _mm256_permutevar8x32_epi32( b2, perm_idx ); + b2 = _mm256_permutevar8x32_epi32(b2, perm_idx); _in_common += 4; @@ -988,10 +997,10 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_u_avx2(lv_16sc { ain_128 = _mm_loadu_si128((__m128i*)&(_in_a[n_vec][number * 8])); - ain_128_lo = _mm_unpacklo_epi16( ain_128, ain_128 ); - ain_128_hi = _mm_unpackhi_epi16( ain_128, ain_128 ); + ain_128_lo = _mm_unpacklo_epi16(ain_128, ain_128); + ain_128_hi = _mm_unpackhi_epi16(ain_128, ain_128); - a2 = _mm256_insertf128_si256( _mm256_castsi128_si256(ain_128_lo), ain_128_hi, 1); + a2 = _mm256_insertf128_si256(_mm256_castsi128_si256(ain_128_lo), ain_128_hi, 1); c = _mm256_mullo_epi16(a2, b2); @@ -1008,12 +1017,12 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_u_avx2(lv_16sc { a2 = cacc[n_vec]; - _mm256_store_si256((__m256i*)dotProductVector, a2); // Store the results back into the dot product vector - dotProduct = lv_cmake(0,0); + _mm256_store_si256((__m256i*)dotProductVector, a2); // Store the results back into the dot product vector + dotProduct = lv_cmake(0, 0); for (number = 0; number < 8; ++number) { dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])), - sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number]))); + sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number]))); } _out[n_vec] = 
dotProduct; } @@ -1024,7 +1033,7 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_u_avx2(lv_16sc _mm256_store_ps((float*)four_phase_acc, four_phase_acc_reg); (*phase) = four_phase_acc[0]; - for(n = avx2_iters * 8; n < num_points; n++) + for (n = avx2_iters * 8; n < num_points; n++) { tmp16 = in_common[n]; tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase); @@ -1034,10 +1043,9 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_u_avx2(lv_16sc { lv_16sc_t tmp = tmp16 * in_a[n_vec][n]; _out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), - sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); + sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); } } - } #endif /* LV_HAVE_AVX2 */ @@ -1046,178 +1054,178 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_u_avx2(lv_16sc //static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_neon(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const int16_t** in_a, int num_a_vectors, unsigned int num_points) //{ - //const unsigned int neon_iters = num_points / 4; +//const unsigned int neon_iters = num_points / 4; - //const int16_t** _in_a = in_a; - //const lv_16sc_t* _in_common = in_common; - //lv_16sc_t* _out = result; - //int n_vec; - //int i; - //unsigned int number; - //unsigned int n; - //lv_16sc_t tmp16_, tmp; - //lv_32fc_t tmp32_; +//const int16_t** _in_a = in_a; +//const lv_16sc_t* _in_common = in_common; +//lv_16sc_t* _out = result; +//int n_vec; +//int i; +//unsigned int number; +//unsigned int n; +//lv_16sc_t tmp16_, tmp; +//lv_32fc_t tmp32_; - //if (neon_iters > 0) - //{ - //lv_16sc_t dotProduct = lv_cmake(0,0); - //float arg_phase0 = cargf(*phase); - //float arg_phase_inc = cargf(phase_inc); - //float phase_est; +//if (neon_iters > 0) +//{ +//lv_16sc_t dotProduct = lv_cmake(0,0); +//float arg_phase0 = cargf(*phase); +//float arg_phase_inc = cargf(phase_inc); +//float phase_est; - //lv_32fc_t ___phase4 = phase_inc * phase_inc * phase_inc * phase_inc; - //__VOLK_ATTR_ALIGNED(16) float32_t __phase4_real[4] = { lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4) }; - //__VOLK_ATTR_ALIGNED(16) float32_t __phase4_imag[4] = { lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4) }; +//lv_32fc_t ___phase4 = phase_inc * phase_inc * phase_inc * phase_inc; +//__VOLK_ATTR_ALIGNED(16) float32_t __phase4_real[4] = { lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4) }; +//__VOLK_ATTR_ALIGNED(16) float32_t __phase4_imag[4] = { lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4) }; - //float32x4_t _phase4_real = vld1q_f32(__phase4_real); - //float32x4_t _phase4_imag = vld1q_f32(__phase4_imag); +//float32x4_t _phase4_real = vld1q_f32(__phase4_real); +//float32x4_t _phase4_imag = vld1q_f32(__phase4_imag); - //lv_32fc_t phase2 = (lv_32fc_t)(*phase) * phase_inc; - //lv_32fc_t phase3 = phase2 * phase_inc; - //lv_32fc_t phase4 = phase3 * phase_inc; +//lv_32fc_t phase2 = (lv_32fc_t)(*phase) * phase_inc; +//lv_32fc_t phase3 = phase2 * phase_inc; +//lv_32fc_t phase4 = phase3 * phase_inc; - //__VOLK_ATTR_ALIGNED(16) float32_t __phase_real[4] = { lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) }; - //__VOLK_ATTR_ALIGNED(16) float32_t __phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; +//__VOLK_ATTR_ALIGNED(16) float32_t __phase_real[4] 
= { lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) }; +//__VOLK_ATTR_ALIGNED(16) float32_t __phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; - //float32x4_t _phase_real = vld1q_f32(__phase_real); - //float32x4_t _phase_imag = vld1q_f32(__phase_imag); +//float32x4_t _phase_real = vld1q_f32(__phase_real); +//float32x4_t _phase_imag = vld1q_f32(__phase_imag); - //int16x4x2_t a_val, b_val, c_val; - //__VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; - //float32x4_t half = vdupq_n_f32(0.5f); - //int16x4x2_t tmp16; - //int32x4x2_t tmp32i; +//int16x4x2_t a_val, b_val, c_val; +//__VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; +//float32x4_t half = vdupq_n_f32(0.5f); +//int16x4x2_t tmp16; +//int32x4x2_t tmp32i; - //float32x4x2_t tmp32f, tmp32_real, tmp32_imag; - //float32x4_t sign, PlusHalf, Round; +//float32x4x2_t tmp32f, tmp32_real, tmp32_imag; +//float32x4_t sign, PlusHalf, Round; - //int16x4x2_t* accumulator = (int16x4x2_t*)volk_gnsssdr_malloc(num_a_vectors * sizeof(int16x4x2_t), volk_gnsssdr_get_alignment()); +//int16x4x2_t* accumulator = (int16x4x2_t*)volk_gnsssdr_malloc(num_a_vectors * sizeof(int16x4x2_t), volk_gnsssdr_get_alignment()); - //for(n_vec = 0; n_vec < num_a_vectors; n_vec++) - //{ - //accumulator[n_vec].val[0] = vdup_n_s16(0); - //accumulator[n_vec].val[1] = vdup_n_s16(0); - //} +//for(n_vec = 0; n_vec < num_a_vectors; n_vec++) +//{ +//accumulator[n_vec].val[0] = vdup_n_s16(0); +//accumulator[n_vec].val[1] = vdup_n_s16(0); +//} - //for(number = 0; number < neon_iters; number++) - //{ - //[> load 4 complex numbers (int 16 bits each component) <] - //tmp16 = vld2_s16((int16_t*)_in_common); - //__VOLK_GNSSSDR_PREFETCH(_in_common + 8); - //_in_common += 4; +//for(number = 0; number < neon_iters; number++) +//{ +//[> load 4 complex numbers (int 16 bits each component) <] +//tmp16 = vld2_s16((int16_t*)_in_common); +//__VOLK_GNSSSDR_PREFETCH(_in_common + 8); +//_in_common += 4; - //[> promote them to int 32 bits <] - //tmp32i.val[0] = vmovl_s16(tmp16.val[0]); - //tmp32i.val[1] = vmovl_s16(tmp16.val[1]); +//[> promote them to int 32 bits <] +//tmp32i.val[0] = vmovl_s16(tmp16.val[0]); +//tmp32i.val[1] = vmovl_s16(tmp16.val[1]); - //[> promote them to float 32 bits <] - //tmp32f.val[0] = vcvtq_f32_s32(tmp32i.val[0]); - //tmp32f.val[1] = vcvtq_f32_s32(tmp32i.val[1]); +//[> promote them to float 32 bits <] +//tmp32f.val[0] = vcvtq_f32_s32(tmp32i.val[0]); +//tmp32f.val[1] = vcvtq_f32_s32(tmp32i.val[1]); - //[> complex multiplication of four complex samples (float 32 bits each component) <] - //tmp32_real.val[0] = vmulq_f32(tmp32f.val[0], _phase_real); - //tmp32_real.val[1] = vmulq_f32(tmp32f.val[1], _phase_imag); - //tmp32_imag.val[0] = vmulq_f32(tmp32f.val[0], _phase_imag); - //tmp32_imag.val[1] = vmulq_f32(tmp32f.val[1], _phase_real); +//[> complex multiplication of four complex samples (float 32 bits each component) <] +//tmp32_real.val[0] = vmulq_f32(tmp32f.val[0], _phase_real); +//tmp32_real.val[1] = vmulq_f32(tmp32f.val[1], _phase_imag); +//tmp32_imag.val[0] = vmulq_f32(tmp32f.val[0], _phase_imag); +//tmp32_imag.val[1] = vmulq_f32(tmp32f.val[1], _phase_real); - //tmp32f.val[0] = vsubq_f32(tmp32_real.val[0], tmp32_real.val[1]); - //tmp32f.val[1] = vaddq_f32(tmp32_imag.val[0], tmp32_imag.val[1]); +//tmp32f.val[0] = vsubq_f32(tmp32_real.val[0], tmp32_real.val[1]); +//tmp32f.val[1] = vaddq_f32(tmp32_imag.val[0], tmp32_imag.val[1]); - //[> downcast results to int32 <] - //[> in __aarch64__ we can do that with 
vcvtaq_s32_f32(ret1); vcvtaq_s32_f32(ret2); <] - //sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(tmp32f.val[0]), 31))); - //PlusHalf = vaddq_f32(tmp32f.val[0], half); - //Round = vsubq_f32(PlusHalf, sign); - //tmp32i.val[0] = vcvtq_s32_f32(Round); +//[> downcast results to int32 <] +//[> in __aarch64__ we can do that with vcvtaq_s32_f32(ret1); vcvtaq_s32_f32(ret2); <] +//sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(tmp32f.val[0]), 31))); +//PlusHalf = vaddq_f32(tmp32f.val[0], half); +//Round = vsubq_f32(PlusHalf, sign); +//tmp32i.val[0] = vcvtq_s32_f32(Round); - //sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(tmp32f.val[1]), 31))); - //PlusHalf = vaddq_f32(tmp32f.val[1], half); - //Round = vsubq_f32(PlusHalf, sign); - //tmp32i.val[1] = vcvtq_s32_f32(Round); +//sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(tmp32f.val[1]), 31))); +//PlusHalf = vaddq_f32(tmp32f.val[1], half); +//Round = vsubq_f32(PlusHalf, sign); +//tmp32i.val[1] = vcvtq_s32_f32(Round); - //[> downcast results to int16 <] - //tmp16.val[0] = vqmovn_s32(tmp32i.val[0]); - //tmp16.val[1] = vqmovn_s32(tmp32i.val[1]); +//[> downcast results to int16 <] +//tmp16.val[0] = vqmovn_s32(tmp32i.val[0]); +//tmp16.val[1] = vqmovn_s32(tmp32i.val[1]); - //[> compute next four phases <] - //tmp32_real.val[0] = vmulq_f32(_phase_real, _phase4_real); - //tmp32_real.val[1] = vmulq_f32(_phase_imag, _phase4_imag); - //tmp32_imag.val[0] = vmulq_f32(_phase_real, _phase4_imag); - //tmp32_imag.val[1] = vmulq_f32(_phase_imag, _phase4_real); +//[> compute next four phases <] +//tmp32_real.val[0] = vmulq_f32(_phase_real, _phase4_real); +//tmp32_real.val[1] = vmulq_f32(_phase_imag, _phase4_imag); +//tmp32_imag.val[0] = vmulq_f32(_phase_real, _phase4_imag); +//tmp32_imag.val[1] = vmulq_f32(_phase_imag, _phase4_real); - //_phase_real = vsubq_f32(tmp32_real.val[0], tmp32_real.val[1]); - //_phase_imag = vaddq_f32(tmp32_imag.val[0], tmp32_imag.val[1]); +//_phase_real = vsubq_f32(tmp32_real.val[0], tmp32_real.val[1]); +//_phase_imag = vaddq_f32(tmp32_imag.val[0], tmp32_imag.val[1]); - //for (n_vec = 0; n_vec < num_a_vectors; n_vec++) - //{ - //a_val = vld2_s16((int16_t*)&(_in_a[n_vec][number*4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg - ////__VOLK_GNSSSDR_PREFETCH(&_in_a[n_vec][number*4] + 8); +//for (n_vec = 0; n_vec < num_a_vectors; n_vec++) +//{ +//a_val = vld2_s16((int16_t*)&(_in_a[n_vec][number*4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg +////__VOLK_GNSSSDR_PREFETCH(&_in_a[n_vec][number*4] + 8); - //// multiply the real*real and imag*imag to get real result - //// a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r - //b_val.val[0] = vmul_s16(a_val.val[0], tmp16.val[0]); - //// a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i - //b_val.val[1] = vmul_s16(a_val.val[1], tmp16.val[1]); - //c_val.val[0] = vqsub_s16(b_val.val[0], b_val.val[1]); +//// multiply the real*real and imag*imag to get real result +//// a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r +//b_val.val[0] = vmul_s16(a_val.val[0], tmp16.val[0]); +//// a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i +//b_val.val[1] = vmul_s16(a_val.val[1], tmp16.val[1]); +//c_val.val[0] = vqsub_s16(b_val.val[0], b_val.val[1]); - //// Multiply cross terms to get the imaginary result - //// a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i - //b_val.val[0] = vmul_s16(a_val.val[0], tmp16.val[1]); - //// a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r - //b_val.val[1] = vmul_s16(a_val.val[1], tmp16.val[0]); - //c_val.val[1] = vqadd_s16(b_val.val[0], b_val.val[1]); +//// Multiply cross terms to get the imaginary result +//// 
a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i +//b_val.val[0] = vmul_s16(a_val.val[0], tmp16.val[1]); +//// a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r +//b_val.val[1] = vmul_s16(a_val.val[1], tmp16.val[0]); +//c_val.val[1] = vqadd_s16(b_val.val[0], b_val.val[1]); - //accumulator[n_vec].val[0] = vqadd_s16(accumulator[n_vec].val[0], c_val.val[0]); - //accumulator[n_vec].val[1] = vqadd_s16(accumulator[n_vec].val[1], c_val.val[1]); - //} - //// Regenerate phase - //if ((number % 256) == 0) - //{ - //phase_est = arg_phase0 + (number + 1) * 4 * arg_phase_inc; +//accumulator[n_vec].val[0] = vqadd_s16(accumulator[n_vec].val[0], c_val.val[0]); +//accumulator[n_vec].val[1] = vqadd_s16(accumulator[n_vec].val[1], c_val.val[1]); +//} +//// Regenerate phase +//if ((number % 256) == 0) +//{ +//phase_est = arg_phase0 + (number + 1) * 4 * arg_phase_inc; - //*phase = lv_cmake(cos(phase_est), sin(phase_est)); - //phase2 = (lv_32fc_t)(*phase) * phase_inc; - //phase3 = phase2 * phase_inc; - //phase4 = phase3 * phase_inc; +//*phase = lv_cmake(cos(phase_est), sin(phase_est)); +//phase2 = (lv_32fc_t)(*phase) * phase_inc; +//phase3 = phase2 * phase_inc; +//phase4 = phase3 * phase_inc; - //__VOLK_ATTR_ALIGNED(16) float32_t ____phase_real[4] = { lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) }; - //__VOLK_ATTR_ALIGNED(16) float32_t ____phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; +//__VOLK_ATTR_ALIGNED(16) float32_t ____phase_real[4] = { lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) }; +//__VOLK_ATTR_ALIGNED(16) float32_t ____phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; - //_phase_real = vld1q_f32(____phase_real); - //_phase_imag = vld1q_f32(____phase_imag); - //} - //} +//_phase_real = vld1q_f32(____phase_real); +//_phase_imag = vld1q_f32(____phase_imag); +//} +//} - //for (n_vec = 0; n_vec < num_a_vectors; n_vec++) - //{ - //vst2_s16((int16_t*)dotProductVector, accumulator[n_vec]); // Store the results back into the dot product vector - //dotProduct = lv_cmake(0,0); - //for (i = 0; i < 4; ++i) - //{ - //dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[i])), - //sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i]))); - //} - //_out[n_vec] = dotProduct; - //} - //volk_gnsssdr_free(accumulator); - //vst1q_f32((float32_t*)__phase_real, _phase_real); - //vst1q_f32((float32_t*)__phase_imag, _phase_imag); +//for (n_vec = 0; n_vec < num_a_vectors; n_vec++) +//{ +//vst2_s16((int16_t*)dotProductVector, accumulator[n_vec]); // Store the results back into the dot product vector +//dotProduct = lv_cmake(0,0); +//for (i = 0; i < 4; ++i) +//{ +//dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[i])), +//sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i]))); +//} +//_out[n_vec] = dotProduct; +//} +//volk_gnsssdr_free(accumulator); +//vst1q_f32((float32_t*)__phase_real, _phase_real); +//vst1q_f32((float32_t*)__phase_imag, _phase_imag); - //(*phase) = lv_cmake((float32_t)__phase_real[0], (float32_t)__phase_imag[0]); - //} +//(*phase) = lv_cmake((float32_t)__phase_real[0], (float32_t)__phase_imag[0]); +//} - //for (n = neon_iters * 4; n < num_points; n++) - //{ - //tmp16_ = in_common[n]; //printf("neon phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); - //tmp32_ = lv_cmake((float32_t)lv_creal(tmp16_), (float32_t)lv_cimag(tmp16_)) * (*phase); - //tmp16_ = lv_cmake((int16_t)rintf(lv_creal(tmp32_)), 
(int16_t)rintf(lv_cimag(tmp32_))); - //(*phase) *= phase_inc; - //for (n_vec = 0; n_vec < num_a_vectors; n_vec++) - //{ - //tmp = tmp16_ * in_a[n_vec][n]; - //_out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); - //} - //} +//for (n = neon_iters * 4; n < num_points; n++) +//{ +//tmp16_ = in_common[n]; //printf("neon phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); +//tmp32_ = lv_cmake((float32_t)lv_creal(tmp16_), (float32_t)lv_cimag(tmp16_)) * (*phase); +//tmp16_ = lv_cmake((int16_t)rintf(lv_creal(tmp32_)), (int16_t)rintf(lv_cimag(tmp32_))); +//(*phase) *= phase_inc; +//for (n_vec = 0; n_vec < num_a_vectors; n_vec++) +//{ +//tmp = tmp16_ * in_a[n_vec][n]; +//_out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); +//} +//} //} //#endif [> LV_HAVE_NEON <] @@ -1229,186 +1237,186 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_u_avx2(lv_16sc //static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_neon_vma(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const int16_t** in_a, int num_a_vectors, unsigned int num_points) //{ - //const unsigned int neon_iters = num_points / 4; +//const unsigned int neon_iters = num_points / 4; - //const int16_t** _in_a = in_a; - //const lv_16sc_t* _in_common = in_common; - //lv_16sc_t* _out = result; - //int n_vec; - //int i; - //unsigned int number; - //unsigned int n; - //lv_16sc_t tmp16_, tmp; - //lv_32fc_t tmp32_; +//const int16_t** _in_a = in_a; +//const lv_16sc_t* _in_common = in_common; +//lv_16sc_t* _out = result; +//int n_vec; +//int i; +//unsigned int number; +//unsigned int n; +//lv_16sc_t tmp16_, tmp; +//lv_32fc_t tmp32_; - //if (neon_iters > 0) - //{ - //lv_16sc_t dotProduct = lv_cmake(0,0); - //float arg_phase0 = cargf(*phase); - //float arg_phase_inc = cargf(phase_inc); - //float phase_est; - ////printf("arg phase0: %f", arg_phase0); - //lv_32fc_t ___phase4 = phase_inc * phase_inc * phase_inc * phase_inc; - //__VOLK_ATTR_ALIGNED(16) float32_t __phase4_real[4] = { lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4) }; - //__VOLK_ATTR_ALIGNED(16) float32_t __phase4_imag[4] = { lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4) }; +//if (neon_iters > 0) +//{ +//lv_16sc_t dotProduct = lv_cmake(0,0); +//float arg_phase0 = cargf(*phase); +//float arg_phase_inc = cargf(phase_inc); +//float phase_est; +////printf("arg phase0: %f", arg_phase0); +//lv_32fc_t ___phase4 = phase_inc * phase_inc * phase_inc * phase_inc; +//__VOLK_ATTR_ALIGNED(16) float32_t __phase4_real[4] = { lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4) }; +//__VOLK_ATTR_ALIGNED(16) float32_t __phase4_imag[4] = { lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4) }; - //float32x4_t _phase4_real = vld1q_f32(__phase4_real); - //float32x4_t _phase4_imag = vld1q_f32(__phase4_imag); +//float32x4_t _phase4_real = vld1q_f32(__phase4_real); +//float32x4_t _phase4_imag = vld1q_f32(__phase4_imag); - //lv_32fc_t phase2 = (lv_32fc_t)(*phase) * phase_inc; - //lv_32fc_t phase3 = phase2 * phase_inc; - //lv_32fc_t phase4 = phase3 * phase_inc; +//lv_32fc_t phase2 = (lv_32fc_t)(*phase) * phase_inc; +//lv_32fc_t phase3 = phase2 * phase_inc; +//lv_32fc_t phase4 = phase3 * phase_inc; - //__VOLK_ATTR_ALIGNED(16) float32_t __phase_real[4] = { lv_creal((*phase)), 
lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) }; - //__VOLK_ATTR_ALIGNED(16) float32_t __phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; +//__VOLK_ATTR_ALIGNED(16) float32_t __phase_real[4] = { lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) }; +//__VOLK_ATTR_ALIGNED(16) float32_t __phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; - //float32x4_t _phase_real = vld1q_f32(__phase_real); - //float32x4_t _phase_imag = vld1q_f32(__phase_imag); +//float32x4_t _phase_real = vld1q_f32(__phase_real); +//float32x4_t _phase_imag = vld1q_f32(__phase_imag); - //int16x4x2_t a_val, b_val; - //__VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; - //float32x4_t half = vdupq_n_f32(0.5f); - //int16x4x2_t tmp16; - //int32x4x2_t tmp32i; +//int16x4x2_t a_val, b_val; +//__VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; +//float32x4_t half = vdupq_n_f32(0.5f); +//int16x4x2_t tmp16; +//int32x4x2_t tmp32i; - //float32x4x2_t tmp32f, tmp32_real, tmp32_imag; - //float32x4_t sign, PlusHalf, Round; +//float32x4x2_t tmp32f, tmp32_real, tmp32_imag; +//float32x4_t sign, PlusHalf, Round; - //int16x4x2_t* accumulator = (int16x4x2_t*)volk_gnsssdr_malloc(num_a_vectors * sizeof(int16x4x2_t), volk_gnsssdr_get_alignment()); +//int16x4x2_t* accumulator = (int16x4x2_t*)volk_gnsssdr_malloc(num_a_vectors * sizeof(int16x4x2_t), volk_gnsssdr_get_alignment()); - //for(n_vec = 0; n_vec < num_a_vectors; n_vec++) - //{ - //accumulator[n_vec].val[0] = vdup_n_s16(0); - //accumulator[n_vec].val[1] = vdup_n_s16(0); - //} +//for(n_vec = 0; n_vec < num_a_vectors; n_vec++) +//{ +//accumulator[n_vec].val[0] = vdup_n_s16(0); +//accumulator[n_vec].val[1] = vdup_n_s16(0); +//} - //for(number = 0; number < neon_iters; number++) - //{ - //[> load 4 complex numbers (int 16 bits each component) <] - //tmp16 = vld2_s16((int16_t*)_in_common); - //__VOLK_GNSSSDR_PREFETCH(_in_common + 8); - //_in_common += 4; +//for(number = 0; number < neon_iters; number++) +//{ +//[> load 4 complex numbers (int 16 bits each component) <] +//tmp16 = vld2_s16((int16_t*)_in_common); +//__VOLK_GNSSSDR_PREFETCH(_in_common + 8); +//_in_common += 4; - //[> promote them to int 32 bits <] - //tmp32i.val[0] = vmovl_s16(tmp16.val[0]); - //tmp32i.val[1] = vmovl_s16(tmp16.val[1]); +//[> promote them to int 32 bits <] +//tmp32i.val[0] = vmovl_s16(tmp16.val[0]); +//tmp32i.val[1] = vmovl_s16(tmp16.val[1]); - //[> promote them to float 32 bits <] - //tmp32f.val[0] = vcvtq_f32_s32(tmp32i.val[0]); - //tmp32f.val[1] = vcvtq_f32_s32(tmp32i.val[1]); +//[> promote them to float 32 bits <] +//tmp32f.val[0] = vcvtq_f32_s32(tmp32i.val[0]); +//tmp32f.val[1] = vcvtq_f32_s32(tmp32i.val[1]); - //[> complex multiplication of four complex samples (float 32 bits each component) <] - //tmp32_real.val[0] = vmulq_f32(tmp32f.val[0], _phase_real); - //tmp32_real.val[1] = vmulq_f32(tmp32f.val[1], _phase_imag); - //tmp32_imag.val[0] = vmulq_f32(tmp32f.val[0], _phase_imag); - //tmp32_imag.val[1] = vmulq_f32(tmp32f.val[1], _phase_real); +//[> complex multiplication of four complex samples (float 32 bits each component) <] +//tmp32_real.val[0] = vmulq_f32(tmp32f.val[0], _phase_real); +//tmp32_real.val[1] = vmulq_f32(tmp32f.val[1], _phase_imag); +//tmp32_imag.val[0] = vmulq_f32(tmp32f.val[0], _phase_imag); +//tmp32_imag.val[1] = vmulq_f32(tmp32f.val[1], _phase_real); - //tmp32f.val[0] = vsubq_f32(tmp32_real.val[0], tmp32_real.val[1]); - //tmp32f.val[1] = vaddq_f32(tmp32_imag.val[0], 
tmp32_imag.val[1]); +//tmp32f.val[0] = vsubq_f32(tmp32_real.val[0], tmp32_real.val[1]); +//tmp32f.val[1] = vaddq_f32(tmp32_imag.val[0], tmp32_imag.val[1]); - //[> downcast results to int32 <] - //[> in __aarch64__ we can do that with vcvtaq_s32_f32(ret1); vcvtaq_s32_f32(ret2); <] - //sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(tmp32f.val[0]), 31))); - //PlusHalf = vaddq_f32(tmp32f.val[0], half); - //Round = vsubq_f32(PlusHalf, sign); - //tmp32i.val[0] = vcvtq_s32_f32(Round); +//[> downcast results to int32 <] +//[> in __aarch64__ we can do that with vcvtaq_s32_f32(ret1); vcvtaq_s32_f32(ret2); <] +//sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(tmp32f.val[0]), 31))); +//PlusHalf = vaddq_f32(tmp32f.val[0], half); +//Round = vsubq_f32(PlusHalf, sign); +//tmp32i.val[0] = vcvtq_s32_f32(Round); - //sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(tmp32f.val[1]), 31))); - //PlusHalf = vaddq_f32(tmp32f.val[1], half); - //Round = vsubq_f32(PlusHalf, sign); - //tmp32i.val[1] = vcvtq_s32_f32(Round); +//sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(tmp32f.val[1]), 31))); +//PlusHalf = vaddq_f32(tmp32f.val[1], half); +//Round = vsubq_f32(PlusHalf, sign); +//tmp32i.val[1] = vcvtq_s32_f32(Round); - //[> downcast results to int16 <] - //tmp16.val[0] = vqmovn_s32(tmp32i.val[0]); - //tmp16.val[1] = vqmovn_s32(tmp32i.val[1]); +//[> downcast results to int16 <] +//tmp16.val[0] = vqmovn_s32(tmp32i.val[0]); +//tmp16.val[1] = vqmovn_s32(tmp32i.val[1]); - //[> compute next four phases <] - //tmp32_real.val[0] = vmulq_f32(_phase_real, _phase4_real); - //tmp32_real.val[1] = vmulq_f32(_phase_imag, _phase4_imag); - //tmp32_imag.val[0] = vmulq_f32(_phase_real, _phase4_imag); - //tmp32_imag.val[1] = vmulq_f32(_phase_imag, _phase4_real); +//[> compute next four phases <] +//tmp32_real.val[0] = vmulq_f32(_phase_real, _phase4_real); +//tmp32_real.val[1] = vmulq_f32(_phase_imag, _phase4_imag); +//tmp32_imag.val[0] = vmulq_f32(_phase_real, _phase4_imag); +//tmp32_imag.val[1] = vmulq_f32(_phase_imag, _phase4_real); - //_phase_real = vsubq_f32(tmp32_real.val[0], tmp32_real.val[1]); - //_phase_imag = vaddq_f32(tmp32_imag.val[0], tmp32_imag.val[1]); +//_phase_real = vsubq_f32(tmp32_real.val[0], tmp32_real.val[1]); +//_phase_imag = vaddq_f32(tmp32_imag.val[0], tmp32_imag.val[1]); - //// Regenerate phase - //if ((number % 256) == 0) - //{ - ////printf("computed phase: %f\n", cos(cargf(lv_cmake(_phase_real[0],_phase_imag[0])))); - //phase_est = arg_phase0 + (number + 1) * 4 * arg_phase_inc; - ////printf("Estimated phase: %f\n\n", cos(phase_est)); +//// Regenerate phase +//if ((number % 256) == 0) +//{ +////printf("computed phase: %f\n", cos(cargf(lv_cmake(_phase_real[0],_phase_imag[0])))); +//phase_est = arg_phase0 + (number + 1) * 4 * arg_phase_inc; +////printf("Estimated phase: %f\n\n", cos(phase_est)); - //*phase = lv_cmake(cos(phase_est), sin(phase_est)); - //phase2 = (lv_32fc_t)(*phase) * phase_inc; - //phase3 = phase2 * phase_inc; - //phase4 = phase3 * phase_inc; +//*phase = lv_cmake(cos(phase_est), sin(phase_est)); +//phase2 = (lv_32fc_t)(*phase) * phase_inc; +//phase3 = phase2 * phase_inc; +//phase4 = phase3 * phase_inc; - //__VOLK_ATTR_ALIGNED(16) float32_t ____phase_real[4] = { lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) }; - //__VOLK_ATTR_ALIGNED(16) float32_t ____phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; +//__VOLK_ATTR_ALIGNED(16) float32_t ____phase_real[4] = { lv_creal((*phase)), lv_creal(phase2), 
lv_creal(phase3), lv_creal(phase4) }; +//__VOLK_ATTR_ALIGNED(16) float32_t ____phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; - //_phase_real = vld1q_f32(____phase_real); - //_phase_imag = vld1q_f32(____phase_imag); +//_phase_real = vld1q_f32(____phase_real); +//_phase_imag = vld1q_f32(____phase_imag); - //// Round = vmulq_f32(_phase_real, _phase_real); - //// Round = vmlaq_f32(Round, _phase_imag, _phase_imag); - //// Round = vsqrtq_f32(Round);//printf("sqrt: %f \n", Round[0]); - ////Round = vrsqrteq_f32(Round);printf("1/sqtr: %f \n",Round[0]); - ////Round = vrecpeq_f32((Round); - //// _phase_real = vdivq_f32(_phase_real, Round); - //// _phase_imag = vdivq_f32(_phase_imag, Round); - ////_phase_real = vmulq_f32(_phase_real, Round); - ////_phase_imag = vmulq_f32(_phase_imag, Round); - ////printf("After %i: %f,%f, %f\n\n", number, _phase_real[0], _phase_imag[0], sqrt(_phase_real[0]*_phase_real[0]+_phase_imag[0]*_phase_imag[0])); +//// Round = vmulq_f32(_phase_real, _phase_real); +//// Round = vmlaq_f32(Round, _phase_imag, _phase_imag); +//// Round = vsqrtq_f32(Round);//printf("sqrt: %f \n", Round[0]); +////Round = vrsqrteq_f32(Round);printf("1/sqtr: %f \n",Round[0]); +////Round = vrecpeq_f32((Round); +//// _phase_real = vdivq_f32(_phase_real, Round); +//// _phase_imag = vdivq_f32(_phase_imag, Round); +////_phase_real = vmulq_f32(_phase_real, Round); +////_phase_imag = vmulq_f32(_phase_imag, Round); +////printf("After %i: %f,%f, %f\n\n", number, _phase_real[0], _phase_imag[0], sqrt(_phase_real[0]*_phase_real[0]+_phase_imag[0]*_phase_imag[0])); - //} +//} - //for (n_vec = 0; n_vec < num_a_vectors; n_vec++) - //{ - //a_val = vld2_s16((int16_t*)&(_in_a[n_vec][number*4])); +//for (n_vec = 0; n_vec < num_a_vectors; n_vec++) +//{ +//a_val = vld2_s16((int16_t*)&(_in_a[n_vec][number*4])); - //b_val.val[0] = vmul_s16(a_val.val[0], tmp16.val[0]); - //b_val.val[1] = vmul_s16(a_val.val[1], tmp16.val[0]); +//b_val.val[0] = vmul_s16(a_val.val[0], tmp16.val[0]); +//b_val.val[1] = vmul_s16(a_val.val[1], tmp16.val[0]); - //// use multiply accumulate/subtract to get result - //b_val.val[0] = vmls_s16(b_val.val[0], a_val.val[1], tmp16.val[1]); - //b_val.val[1] = vmla_s16(b_val.val[1], a_val.val[0], tmp16.val[1]); +//// use multiply accumulate/subtract to get result +//b_val.val[0] = vmls_s16(b_val.val[0], a_val.val[1], tmp16.val[1]); +//b_val.val[1] = vmla_s16(b_val.val[1], a_val.val[0], tmp16.val[1]); - //accumulator[n_vec].val[0] = vqadd_s16(accumulator[n_vec].val[0], b_val.val[0]); - //accumulator[n_vec].val[1] = vqadd_s16(accumulator[n_vec].val[1], b_val.val[1]); - //} - //} +//accumulator[n_vec].val[0] = vqadd_s16(accumulator[n_vec].val[0], b_val.val[0]); +//accumulator[n_vec].val[1] = vqadd_s16(accumulator[n_vec].val[1], b_val.val[1]); +//} +//} - //for (n_vec = 0; n_vec < num_a_vectors; n_vec++) - //{ - //vst2_s16((int16_t*)dotProductVector, accumulator[n_vec]); // Store the results back into the dot product vector - //dotProduct = lv_cmake(0,0); - //for (i = 0; i < 4; ++i) - //{ - //dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[i])), - //sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i]))); - //} - //_out[n_vec] = dotProduct; - //} - //volk_gnsssdr_free(accumulator); +//for (n_vec = 0; n_vec < num_a_vectors; n_vec++) +//{ +//vst2_s16((int16_t*)dotProductVector, accumulator[n_vec]); // Store the results back into the dot product vector +//dotProduct = lv_cmake(0,0); +//for (i = 0; i < 4; ++i) +//{ +//dotProduct 
= lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[i])), +//sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i]))); +//} +//_out[n_vec] = dotProduct; +//} +//volk_gnsssdr_free(accumulator); - //vst1q_f32((float32_t*)__phase_real, _phase_real); - //vst1q_f32((float32_t*)__phase_imag, _phase_imag); +//vst1q_f32((float32_t*)__phase_real, _phase_real); +//vst1q_f32((float32_t*)__phase_imag, _phase_imag); - //(*phase) = lv_cmake((float32_t)__phase_real[0], (float32_t)__phase_imag[0]); - //} +//(*phase) = lv_cmake((float32_t)__phase_real[0], (float32_t)__phase_imag[0]); +//} - //for (n = neon_iters * 4; n < num_points; n++) - //{ - //tmp16_ = in_common[n]; //printf("neon phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); - //tmp32_ = lv_cmake((float32_t)lv_creal(tmp16_), (float32_t)lv_cimag(tmp16_)) * (*phase); - //tmp16_ = lv_cmake((int16_t)rintf(lv_creal(tmp32_)), (int16_t)rintf(lv_cimag(tmp32_))); - //(*phase) *= phase_inc; - //for (n_vec = 0; n_vec < num_a_vectors; n_vec++) - //{ - //tmp = tmp16_ * in_a[n_vec][n]; - //_out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); - //} - //} +//for (n = neon_iters * 4; n < num_points; n++) +//{ +//tmp16_ = in_common[n]; //printf("neon phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); +//tmp32_ = lv_cmake((float32_t)lv_creal(tmp16_), (float32_t)lv_cimag(tmp16_)) * (*phase); +//tmp16_ = lv_cmake((int16_t)rintf(lv_creal(tmp32_)), (int16_t)rintf(lv_cimag(tmp32_))); +//(*phase) *= phase_inc; +//for (n_vec = 0; n_vec < num_a_vectors; n_vec++) +//{ +//tmp = tmp16_ * in_a[n_vec][n]; +//_out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); +//} +//} //} //#endif [> LV_HAVE_NEON <] @@ -1420,181 +1428,179 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_u_avx2(lv_16sc //static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_neon_optvma(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const int16_t** in_a, int num_a_vectors, unsigned int num_points) //{ - //const unsigned int neon_iters = num_points / 4; +//const unsigned int neon_iters = num_points / 4; - //const int16_t** _in_a = in_a; - //const lv_16sc_t* _in_common = in_common; - //lv_16sc_t* _out = result; - //int n_vec; - //int i; - //unsigned int number; - //unsigned int n; - //lv_16sc_t tmp16_, tmp; - //lv_32fc_t tmp32_; +//const int16_t** _in_a = in_a; +//const lv_16sc_t* _in_common = in_common; +//lv_16sc_t* _out = result; +//int n_vec; +//int i; +//unsigned int number; +//unsigned int n; +//lv_16sc_t tmp16_, tmp; +//lv_32fc_t tmp32_; - //if (neon_iters > 0) - //{ - //lv_16sc_t dotProduct = lv_cmake(0,0); - //float arg_phase0 = cargf(*phase); - //float arg_phase_inc = cargf(phase_inc); - //float phase_est; +//if (neon_iters > 0) +//{ +//lv_16sc_t dotProduct = lv_cmake(0,0); +//float arg_phase0 = cargf(*phase); +//float arg_phase_inc = cargf(phase_inc); +//float phase_est; - //lv_32fc_t ___phase4 = phase_inc * phase_inc * phase_inc * phase_inc; - //__VOLK_ATTR_ALIGNED(16) float32_t __phase4_real[4] = { lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4) }; - //__VOLK_ATTR_ALIGNED(16) float32_t __phase4_imag[4] = { lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4) }; +//lv_32fc_t ___phase4 = phase_inc * phase_inc * phase_inc * phase_inc; +//__VOLK_ATTR_ALIGNED(16) float32_t 
__phase4_real[4] = { lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4) }; +//__VOLK_ATTR_ALIGNED(16) float32_t __phase4_imag[4] = { lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4) }; - //float32x4_t _phase4_real = vld1q_f32(__phase4_real); - //float32x4_t _phase4_imag = vld1q_f32(__phase4_imag); +//float32x4_t _phase4_real = vld1q_f32(__phase4_real); +//float32x4_t _phase4_imag = vld1q_f32(__phase4_imag); - //lv_32fc_t phase2 = (lv_32fc_t)(*phase) * phase_inc; - //lv_32fc_t phase3 = phase2 * phase_inc; - //lv_32fc_t phase4 = phase3 * phase_inc; +//lv_32fc_t phase2 = (lv_32fc_t)(*phase) * phase_inc; +//lv_32fc_t phase3 = phase2 * phase_inc; +//lv_32fc_t phase4 = phase3 * phase_inc; - //__VOLK_ATTR_ALIGNED(16) float32_t __phase_real[4] = { lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) }; - //__VOLK_ATTR_ALIGNED(16) float32_t __phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; +//__VOLK_ATTR_ALIGNED(16) float32_t __phase_real[4] = { lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) }; +//__VOLK_ATTR_ALIGNED(16) float32_t __phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; - //float32x4_t _phase_real = vld1q_f32(__phase_real); - //float32x4_t _phase_imag = vld1q_f32(__phase_imag); +//float32x4_t _phase_real = vld1q_f32(__phase_real); +//float32x4_t _phase_imag = vld1q_f32(__phase_imag); - //int16x4x2_t a_val, b_val; - //__VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; - //float32x4_t half = vdupq_n_f32(0.5f); - //int32x4x2_t tmp32i; +//int16x4x2_t a_val, b_val; +//__VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; +//float32x4_t half = vdupq_n_f32(0.5f); +//int32x4x2_t tmp32i; - //float32x4x2_t tmp32f, tmp32_real, tmp32_imag; - //float32x4_t sign, PlusHalf, Round; +//float32x4x2_t tmp32f, tmp32_real, tmp32_imag; +//float32x4_t sign, PlusHalf, Round; - //int16x4x2_t* accumulator1 = (int16x4x2_t*)volk_gnsssdr_malloc(num_a_vectors * sizeof(int16x4x2_t), volk_gnsssdr_get_alignment()); - //int16x4x2_t* accumulator2 = (int16x4x2_t*)volk_gnsssdr_malloc(num_a_vectors * sizeof(int16x4x2_t), volk_gnsssdr_get_alignment()); +//int16x4x2_t* accumulator1 = (int16x4x2_t*)volk_gnsssdr_malloc(num_a_vectors * sizeof(int16x4x2_t), volk_gnsssdr_get_alignment()); +//int16x4x2_t* accumulator2 = (int16x4x2_t*)volk_gnsssdr_malloc(num_a_vectors * sizeof(int16x4x2_t), volk_gnsssdr_get_alignment()); - //for(n_vec = 0; n_vec < num_a_vectors; n_vec++) - //{ - //accumulator1[n_vec].val[0] = vdup_n_s16(0); - //accumulator1[n_vec].val[1] = vdup_n_s16(0); - //accumulator2[n_vec].val[0] = vdup_n_s16(0); - //accumulator2[n_vec].val[1] = vdup_n_s16(0); - //} +//for(n_vec = 0; n_vec < num_a_vectors; n_vec++) +//{ +//accumulator1[n_vec].val[0] = vdup_n_s16(0); +//accumulator1[n_vec].val[1] = vdup_n_s16(0); +//accumulator2[n_vec].val[0] = vdup_n_s16(0); +//accumulator2[n_vec].val[1] = vdup_n_s16(0); +//} - //for(number = 0; number < neon_iters; number++) - //{ - //[> load 4 complex numbers (int 16 bits each component) <] - //b_val = vld2_s16((int16_t*)_in_common); - //__VOLK_GNSSSDR_PREFETCH(_in_common + 8); - //_in_common += 4; +//for(number = 0; number < neon_iters; number++) +//{ +//[> load 4 complex numbers (int 16 bits each component) <] +//b_val = vld2_s16((int16_t*)_in_common); +//__VOLK_GNSSSDR_PREFETCH(_in_common + 8); +//_in_common += 4; - //[> promote them to int 32 bits <] - //tmp32i.val[0] = 
vmovl_s16(b_val.val[0]); - //tmp32i.val[1] = vmovl_s16(b_val.val[1]); +//[> promote them to int 32 bits <] +//tmp32i.val[0] = vmovl_s16(b_val.val[0]); +//tmp32i.val[1] = vmovl_s16(b_val.val[1]); - //[> promote them to float 32 bits <] - //tmp32f.val[0] = vcvtq_f32_s32(tmp32i.val[0]); - //tmp32f.val[1] = vcvtq_f32_s32(tmp32i.val[1]); +//[> promote them to float 32 bits <] +//tmp32f.val[0] = vcvtq_f32_s32(tmp32i.val[0]); +//tmp32f.val[1] = vcvtq_f32_s32(tmp32i.val[1]); - //[> complex multiplication of four complex samples (float 32 bits each component) <] - //tmp32_real.val[0] = vmulq_f32(tmp32f.val[0], _phase_real); - //tmp32_real.val[1] = vmulq_f32(tmp32f.val[1], _phase_imag); - //tmp32_imag.val[0] = vmulq_f32(tmp32f.val[0], _phase_imag); - //tmp32_imag.val[1] = vmulq_f32(tmp32f.val[1], _phase_real); +//[> complex multiplication of four complex samples (float 32 bits each component) <] +//tmp32_real.val[0] = vmulq_f32(tmp32f.val[0], _phase_real); +//tmp32_real.val[1] = vmulq_f32(tmp32f.val[1], _phase_imag); +//tmp32_imag.val[0] = vmulq_f32(tmp32f.val[0], _phase_imag); +//tmp32_imag.val[1] = vmulq_f32(tmp32f.val[1], _phase_real); - //tmp32f.val[0] = vsubq_f32(tmp32_real.val[0], tmp32_real.val[1]); - //tmp32f.val[1] = vaddq_f32(tmp32_imag.val[0], tmp32_imag.val[1]); +//tmp32f.val[0] = vsubq_f32(tmp32_real.val[0], tmp32_real.val[1]); +//tmp32f.val[1] = vaddq_f32(tmp32_imag.val[0], tmp32_imag.val[1]); - //[> downcast results to int32 <] - //[> in __aarch64__ we can do that with vcvtaq_s32_f32(ret1); vcvtaq_s32_f32(ret2); <] - //sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(tmp32f.val[0]), 31))); - //PlusHalf = vaddq_f32(tmp32f.val[0], half); - //Round = vsubq_f32(PlusHalf, sign); - //tmp32i.val[0] = vcvtq_s32_f32(Round); +//[> downcast results to int32 <] +//[> in __aarch64__ we can do that with vcvtaq_s32_f32(ret1); vcvtaq_s32_f32(ret2); <] +//sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(tmp32f.val[0]), 31))); +//PlusHalf = vaddq_f32(tmp32f.val[0], half); +//Round = vsubq_f32(PlusHalf, sign); +//tmp32i.val[0] = vcvtq_s32_f32(Round); - //sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(tmp32f.val[1]), 31))); - //PlusHalf = vaddq_f32(tmp32f.val[1], half); - //Round = vsubq_f32(PlusHalf, sign); - //tmp32i.val[1] = vcvtq_s32_f32(Round); +//sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(tmp32f.val[1]), 31))); +//PlusHalf = vaddq_f32(tmp32f.val[1], half); +//Round = vsubq_f32(PlusHalf, sign); +//tmp32i.val[1] = vcvtq_s32_f32(Round); - //[> downcast results to int16 <] - //b_val.val[0] = vqmovn_s32(tmp32i.val[0]); - //b_val.val[1] = vqmovn_s32(tmp32i.val[1]); +//[> downcast results to int16 <] +//b_val.val[0] = vqmovn_s32(tmp32i.val[0]); +//b_val.val[1] = vqmovn_s32(tmp32i.val[1]); - //[> compute next four phases <] - //tmp32_real.val[0] = vmulq_f32(_phase_real, _phase4_real); - //tmp32_real.val[1] = vmulq_f32(_phase_imag, _phase4_imag); - //tmp32_imag.val[0] = vmulq_f32(_phase_real, _phase4_imag); - //tmp32_imag.val[1] = vmulq_f32(_phase_imag, _phase4_real); +//[> compute next four phases <] +//tmp32_real.val[0] = vmulq_f32(_phase_real, _phase4_real); +//tmp32_real.val[1] = vmulq_f32(_phase_imag, _phase4_imag); +//tmp32_imag.val[0] = vmulq_f32(_phase_real, _phase4_imag); +//tmp32_imag.val[1] = vmulq_f32(_phase_imag, _phase4_real); - //_phase_real = vsubq_f32(tmp32_real.val[0], tmp32_real.val[1]); - //_phase_imag = vaddq_f32(tmp32_imag.val[0], tmp32_imag.val[1]); +//_phase_real = vsubq_f32(tmp32_real.val[0], tmp32_real.val[1]); +//_phase_imag = 
vaddq_f32(tmp32_imag.val[0], tmp32_imag.val[1]); - //// Regenerate phase - //if ((number % 256) == 0) - //{ - ////printf("computed phase: %f\n", cos(cargf(lv_cmake(_phase_real[0],_phase_imag[0])))); - //phase_est = arg_phase0 + (number + 1) * 4 * arg_phase_inc; - ////printf("Estimated phase: %f\n\n", cos(phase_est)); +//// Regenerate phase +//if ((number % 256) == 0) +//{ +////printf("computed phase: %f\n", cos(cargf(lv_cmake(_phase_real[0],_phase_imag[0])))); +//phase_est = arg_phase0 + (number + 1) * 4 * arg_phase_inc; +////printf("Estimated phase: %f\n\n", cos(phase_est)); - //*phase = lv_cmake(cos(phase_est), sin(phase_est)); - //phase2 = (lv_32fc_t)(*phase) * phase_inc; - //phase3 = phase2 * phase_inc; - //phase4 = phase3 * phase_inc; +//*phase = lv_cmake(cos(phase_est), sin(phase_est)); +//phase2 = (lv_32fc_t)(*phase) * phase_inc; +//phase3 = phase2 * phase_inc; +//phase4 = phase3 * phase_inc; - //__VOLK_ATTR_ALIGNED(16) float32_t ____phase_real[4] = { lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) }; - //__VOLK_ATTR_ALIGNED(16) float32_t ____phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; +//__VOLK_ATTR_ALIGNED(16) float32_t ____phase_real[4] = { lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) }; +//__VOLK_ATTR_ALIGNED(16) float32_t ____phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; - //_phase_real = vld1q_f32(____phase_real); - //_phase_imag = vld1q_f32(____phase_imag); - //} +//_phase_real = vld1q_f32(____phase_real); +//_phase_imag = vld1q_f32(____phase_imag); +//} - //for (n_vec = 0; n_vec < num_a_vectors; n_vec++) - //{ - //a_val = vld2_s16((int16_t*)&(_in_a[n_vec][number*4])); +//for (n_vec = 0; n_vec < num_a_vectors; n_vec++) +//{ +//a_val = vld2_s16((int16_t*)&(_in_a[n_vec][number*4])); - //// use 2 accumulators to remove inter-instruction data dependencies - //accumulator1[n_vec].val[0] = vmla_s16(accumulator1[n_vec].val[0], a_val.val[0], b_val.val[0]); - //accumulator1[n_vec].val[1] = vmla_s16(accumulator1[n_vec].val[1], a_val.val[0], b_val.val[1]); - //accumulator2[n_vec].val[0] = vmls_s16(accumulator2[n_vec].val[0], a_val.val[1], b_val.val[1]); - //accumulator2[n_vec].val[1] = vmla_s16(accumulator2[n_vec].val[1], a_val.val[1], b_val.val[0]); - //} - //} - //for (n_vec = 0; n_vec < num_a_vectors; n_vec++) - //{ - //accumulator1[n_vec].val[0] = vqadd_s16(accumulator1[n_vec].val[0], accumulator2[n_vec].val[0]); - //accumulator1[n_vec].val[1] = vqadd_s16(accumulator1[n_vec].val[1], accumulator2[n_vec].val[1]); - //} - //for (n_vec = 0; n_vec < num_a_vectors; n_vec++) - //{ - //vst2_s16((int16_t*)dotProductVector, accumulator1[n_vec]); // Store the results back into the dot product vector - //dotProduct = lv_cmake(0,0); - //for (i = 0; i < 4; ++i) - //{ - //dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[i])), - //sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i]))); - //} - //_out[n_vec] = dotProduct; - //} - //volk_gnsssdr_free(accumulator1); - //volk_gnsssdr_free(accumulator2); +//// use 2 accumulators to remove inter-instruction data dependencies +//accumulator1[n_vec].val[0] = vmla_s16(accumulator1[n_vec].val[0], a_val.val[0], b_val.val[0]); +//accumulator1[n_vec].val[1] = vmla_s16(accumulator1[n_vec].val[1], a_val.val[0], b_val.val[1]); +//accumulator2[n_vec].val[0] = vmls_s16(accumulator2[n_vec].val[0], a_val.val[1], b_val.val[1]); +//accumulator2[n_vec].val[1] = 
vmla_s16(accumulator2[n_vec].val[1], a_val.val[1], b_val.val[0]); +//} +//} +//for (n_vec = 0; n_vec < num_a_vectors; n_vec++) +//{ +//accumulator1[n_vec].val[0] = vqadd_s16(accumulator1[n_vec].val[0], accumulator2[n_vec].val[0]); +//accumulator1[n_vec].val[1] = vqadd_s16(accumulator1[n_vec].val[1], accumulator2[n_vec].val[1]); +//} +//for (n_vec = 0; n_vec < num_a_vectors; n_vec++) +//{ +//vst2_s16((int16_t*)dotProductVector, accumulator1[n_vec]); // Store the results back into the dot product vector +//dotProduct = lv_cmake(0,0); +//for (i = 0; i < 4; ++i) +//{ +//dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[i])), +//sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i]))); +//} +//_out[n_vec] = dotProduct; +//} +//volk_gnsssdr_free(accumulator1); +//volk_gnsssdr_free(accumulator2); - //vst1q_f32((float32_t*)__phase_real, _phase_real); - //vst1q_f32((float32_t*)__phase_imag, _phase_imag); +//vst1q_f32((float32_t*)__phase_real, _phase_real); +//vst1q_f32((float32_t*)__phase_imag, _phase_imag); - //(*phase) = lv_cmake((float32_t)__phase_real[0], (float32_t)__phase_imag[0]); - //} +//(*phase) = lv_cmake((float32_t)__phase_real[0], (float32_t)__phase_imag[0]); +//} - //for (n = neon_iters * 4; n < num_points; n++) - //{ - //tmp16_ = in_common[n]; //printf("neon phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); - //tmp32_ = lv_cmake((float32_t)lv_creal(tmp16_), (float32_t)lv_cimag(tmp16_)) * (*phase); - //tmp16_ = lv_cmake((int16_t)rintf(lv_creal(tmp32_)), (int16_t)rintf(lv_cimag(tmp32_))); - //(*phase) *= phase_inc; - //for (n_vec = 0; n_vec < num_a_vectors; n_vec++) - //{ - //tmp = tmp16_ * in_a[n_vec][n]; - //_out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); - //} - //} +//for (n = neon_iters * 4; n < num_points; n++) +//{ +//tmp16_ = in_common[n]; //printf("neon phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); +//tmp32_ = lv_cmake((float32_t)lv_creal(tmp16_), (float32_t)lv_cimag(tmp16_)) * (*phase); +//tmp16_ = lv_cmake((int16_t)rintf(lv_creal(tmp32_)), (int16_t)rintf(lv_cimag(tmp32_))); +//(*phase) *= phase_inc; +//for (n_vec = 0; n_vec < num_a_vectors; n_vec++) +//{ +//tmp = tmp16_ * in_a[n_vec][n]; +//_out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); +//} +//} //} //#endif [> LV_HAVE_NEON <] #endif /*INCLUDED_volk_gnsssdr_16ic_16i_dot_prod_16ic_xn_H*/ - - diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic.h index a666c0270..6880b8d11 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic.h @@ -41,7 +41,7 @@ #include #ifdef LV_HAVE_GENERIC -static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_generic(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points) +static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_generic(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points) { // phases must be normalized. 
Phase rotator expects a complex exponential input! float rem_carrier_phase_in_rad = 0.345; @@ -53,14 +53,14 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_generic(lv unsigned int n; int num_a_vectors = 3; int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points); } - volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_generic(result, local_code, phase_inc[0], phase,(const int16_t**) in_a, num_a_vectors, num_points); + volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_generic(result, local_code, phase_inc[0], phase, (const int16_t**)in_a, num_a_vectors, num_points); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { volk_gnsssdr_free(in_a[n]); } @@ -71,7 +71,7 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_generic(lv #ifdef LV_HAVE_GENERIC -static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_generic_reload(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points) +static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_generic_reload(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points) { // phases must be normalized. Phase rotator expects a complex exponential input! float rem_carrier_phase_in_rad = 0.345; @@ -83,14 +83,14 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_generic_re unsigned int n; int num_a_vectors = 3; int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points); } - volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_generic_reload(result, local_code, phase_inc[0], phase,(const int16_t**) in_a, num_a_vectors, num_points); + volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_generic_reload(result, local_code, phase_inc[0], phase, (const int16_t**)in_a, num_a_vectors, num_points); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { volk_gnsssdr_free(in_a[n]); } @@ -113,50 +113,50 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_a_sse3(lv_ unsigned int n; int num_a_vectors = 3; int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points); } - volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_sse3(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points); + volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_sse3(result, local_code, phase_inc[0], phase, (const int16_t**)in_a, num_a_vectors, num_points); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { volk_gnsssdr_free(in_a[n]); } volk_gnsssdr_free(in_a); } -#endif // SSE3 +#endif // SSE3 //#ifdef LV_HAVE_SSE3 //static inline void 
volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_a_sse3_reload(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points) //{ - //// phases must be normalized. Phase rotator expects a complex exponential input! - //float rem_carrier_phase_in_rad = 0.345; - //float phase_step_rad = 0.1; - //lv_32fc_t phase[1]; - //phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad)); - //lv_32fc_t phase_inc[1]; - //phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad)); - //unsigned int n; - //int num_a_vectors = 3; - //int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); - //for(n = 0; n < num_a_vectors; n++) - //{ - //in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); - //memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points); - //} +//// phases must be normalized. Phase rotator expects a complex exponential input! +//float rem_carrier_phase_in_rad = 0.345; +//float phase_step_rad = 0.1; +//lv_32fc_t phase[1]; +//phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad)); +//lv_32fc_t phase_inc[1]; +//phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad)); +//unsigned int n; +//int num_a_vectors = 3; +//int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); +//for(n = 0; n < num_a_vectors; n++) +//{ +//in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); +//memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points); +//} - //volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_sse3_reload(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points); +//volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_sse3_reload(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points); - //for(n = 0; n < num_a_vectors; n++) - //{ - //volk_gnsssdr_free(in_a[n]); - //} - //volk_gnsssdr_free(in_a); +//for(n = 0; n < num_a_vectors; n++) +//{ +//volk_gnsssdr_free(in_a[n]); +//} +//volk_gnsssdr_free(in_a); //} //#endif // SSE3 @@ -175,22 +175,22 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_u_sse3(lv_ unsigned int n; int num_a_vectors = 3; int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points); } - volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_u_sse3(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points); + volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_u_sse3(result, local_code, phase_inc[0], phase, (const int16_t**)in_a, num_a_vectors, num_points); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { volk_gnsssdr_free(in_a[n]); } volk_gnsssdr_free(in_a); } -#endif // SSE3 +#endif // SSE3 #ifdef LV_HAVE_AVX2 @@ -206,50 +206,50 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_a_avx2(lv_ unsigned int n; int num_a_vectors = 3; int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) 
{ in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points); } - volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_avx2(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points); + volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_avx2(result, local_code, phase_inc[0], phase, (const int16_t**)in_a, num_a_vectors, num_points); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { volk_gnsssdr_free(in_a[n]); } volk_gnsssdr_free(in_a); } -#endif // AVX2 +#endif // AVX2 //#ifdef LV_HAVE_AVX2 //static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_a_avx2_reload(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points) //{ - //// phases must be normalized. Phase rotator expects a complex exponential input! - //float rem_carrier_phase_in_rad = 0.345; - //float phase_step_rad = 0.1; - //lv_32fc_t phase[1]; - //phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad)); - //lv_32fc_t phase_inc[1]; - //phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad)); - //unsigned int n; - //int num_a_vectors = 3; - //int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); - //for(n = 0; n < num_a_vectors; n++) - //{ - //in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); - //memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points); - //} +//// phases must be normalized. Phase rotator expects a complex exponential input! +//float rem_carrier_phase_in_rad = 0.345; +//float phase_step_rad = 0.1; +//lv_32fc_t phase[1]; +//phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad)); +//lv_32fc_t phase_inc[1]; +//phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad)); +//unsigned int n; +//int num_a_vectors = 3; +//int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); +//for(n = 0; n < num_a_vectors; n++) +//{ +//in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); +//memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points); +//} - //volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_avx2_reload(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points); +//volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_avx2_reload(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points); - //for(n = 0; n < num_a_vectors; n++) - //{ - //volk_gnsssdr_free(in_a[n]); - //} - //volk_gnsssdr_free(in_a); +//for(n = 0; n < num_a_vectors; n++) +//{ +//volk_gnsssdr_free(in_a[n]); +//} +//volk_gnsssdr_free(in_a); //} //#endif // AVX2 @@ -268,50 +268,50 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_u_avx2(lv_ unsigned int n; int num_a_vectors = 3; int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points); } - volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_u_avx2(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, 
num_a_vectors, num_points); + volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_u_avx2(result, local_code, phase_inc[0], phase, (const int16_t**)in_a, num_a_vectors, num_points); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { volk_gnsssdr_free(in_a[n]); } volk_gnsssdr_free(in_a); } -#endif // AVX2 +#endif // AVX2 //#ifdef LV_HAVE_AVX2 //static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_u_avx2_reload(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points) //{ - //// phases must be normalized. Phase rotator expects a complex exponential input! - //float rem_carrier_phase_in_rad = 0.345; - //float phase_step_rad = 0.1; - //lv_32fc_t phase[1]; - //phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad)); - //lv_32fc_t phase_inc[1]; - //phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad)); - //unsigned int n; - //int num_a_vectors = 3; - //int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); - //for(n = 0; n < num_a_vectors; n++) - //{ - //in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); - //memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points); - //} +//// phases must be normalized. Phase rotator expects a complex exponential input! +//float rem_carrier_phase_in_rad = 0.345; +//float phase_step_rad = 0.1; +//lv_32fc_t phase[1]; +//phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad)); +//lv_32fc_t phase_inc[1]; +//phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad)); +//unsigned int n; +//int num_a_vectors = 3; +//int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); +//for(n = 0; n < num_a_vectors; n++) +//{ +//in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); +//memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points); +//} - //volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_avx2_reload(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points); +//volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_avx2_reload(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points); - //for(n = 0; n < num_a_vectors; n++) - //{ - //volk_gnsssdr_free(in_a[n]); - //} - //volk_gnsssdr_free(in_a); +//for(n = 0; n < num_a_vectors; n++) +//{ +//volk_gnsssdr_free(in_a[n]); +//} +//volk_gnsssdr_free(in_a); //} //#endif // AVX2 @@ -320,29 +320,29 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_u_avx2(lv_ //#ifdef LV_HAVE_NEON //static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_neon(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points) //{ - //// phases must be normalized. Phase rotator expects a complex exponential input! 
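// Aside on the requirement stated in these puppets ("phases must be normalized"):
// both the initial phase and the per-sample increment must be unit-modulus complex
// exponentials, so that repeated multiplication rotates each sample without scaling
// it. A minimal sketch, using only the lv_cmake()/lv_32fc_t helpers and the test
// constants already visible in this file (the values are arbitrary test inputs):
//
//     float rem_carrier_phase_in_rad = 0.345;    // initial carrier phase, in radians
//     float phase_step_rad = 0.1;                // rotation applied per sample, in radians
//     lv_32fc_t phase = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad));
//     lv_32fc_t phase_inc = lv_cmake(cos(phase_step_rad), sin(phase_step_rad));
//     // per sample: sample *= phase; phase *= phase_inc;  (|phase| stays equal to 1)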
- //float rem_carrier_phase_in_rad = 0.345; - //float phase_step_rad = 0.1; - //lv_32fc_t phase[1]; - //phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad)); - //lv_32fc_t phase_inc[1]; - //phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad)); - //unsigned int n; - //int num_a_vectors = 3; - //int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); - //for(n = 0; n < num_a_vectors; n++) - //{ - //in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); - //memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points); - //} +//// phases must be normalized. Phase rotator expects a complex exponential input! +//float rem_carrier_phase_in_rad = 0.345; +//float phase_step_rad = 0.1; +//lv_32fc_t phase[1]; +//phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad)); +//lv_32fc_t phase_inc[1]; +//phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad)); +//unsigned int n; +//int num_a_vectors = 3; +//int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); +//for(n = 0; n < num_a_vectors; n++) +//{ +//in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); +//memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points); +//} - //volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_neon(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points); +//volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_neon(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points); - //for(n = 0; n < num_a_vectors; n++) - //{ - //volk_gnsssdr_free(in_a[n]); - //} - //volk_gnsssdr_free(in_a); +//for(n = 0; n < num_a_vectors; n++) +//{ +//volk_gnsssdr_free(in_a[n]); +//} +//volk_gnsssdr_free(in_a); //} //#endif // NEON @@ -351,34 +351,31 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_u_avx2(lv_ //#ifdef LV_HAVE_NEON //static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_neon_vma(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points) //{ - //// phases must be normalized. Phase rotator expects a complex exponential input! - //float rem_carrier_phase_in_rad = 0.345; - //float phase_step_rad = 0.1; - //lv_32fc_t phase[1]; - //phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad)); - //lv_32fc_t phase_inc[1]; - //phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad)); - //unsigned int n; - //int num_a_vectors = 3; - //int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); - //for(n = 0; n < num_a_vectors; n++) - //{ - //in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); - //memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points); - //} +//// phases must be normalized. Phase rotator expects a complex exponential input! 
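// Every puppet in this file follows the same harness shape: replicate the single
// test input into num_a_vectors aligned buffers, run the multi-vector (xn) kernel
// under test, then release the buffers. A condensed sketch of that shape, where
// kernel_under_test is a placeholder for any of the
// volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_* variants exercised above:
//
//     int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
//     for (n = 0; n < num_a_vectors; n++)
//         {
//             in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
//             memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points);
//         }
//     kernel_under_test(result, local_code, phase_inc[0], phase, (const int16_t**)in_a, num_a_vectors, num_points);
//     for (n = 0; n < num_a_vectors; n++)
//         {
//             volk_gnsssdr_free(in_a[n]);
//         }
//     volk_gnsssdr_free(in_a);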
+//float rem_carrier_phase_in_rad = 0.345; +//float phase_step_rad = 0.1; +//lv_32fc_t phase[1]; +//phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad)); +//lv_32fc_t phase_inc[1]; +//phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad)); +//unsigned int n; +//int num_a_vectors = 3; +//int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); +//for(n = 0; n < num_a_vectors; n++) +//{ +//in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); +//memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points); +//} - //volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_neon_vma(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points); +//volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_neon_vma(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points); - //for(n = 0; n < num_a_vectors; n++) - //{ - //volk_gnsssdr_free(in_a[n]); - //} - //volk_gnsssdr_free(in_a); +//for(n = 0; n < num_a_vectors; n++) +//{ +//volk_gnsssdr_free(in_a[n]); +//} +//volk_gnsssdr_free(in_a); //} //#endif // NEON #endif // INCLUDED_volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_H - - - diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_conjugate_16ic.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_conjugate_16ic.h index 5aae17266..b294d5ca9 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_conjugate_16ic.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_conjugate_16ic.h @@ -68,7 +68,7 @@ static inline void volk_gnsssdr_16ic_conjugate_16ic_generic(lv_16sc_t* cVector, const lv_16sc_t* aPtr = aVector; unsigned int number; - for(number = 0; number < num_points; number++) + for (number = 0; number < num_points; number++) { *cPtr++ = lv_conj(*aPtr++); } @@ -231,4 +231,3 @@ static inline void volk_gnsssdr_16ic_conjugate_16ic_u_avx2(lv_16sc_t* cVector, c //#endif /* LV_HAVE_NEON */ #endif /* INCLUDED_volk_gnsssdr_16ic_conjugate_16ic_H */ - diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_convert_32fc.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_convert_32fc.h index fa9517b76..5d66452e0 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_convert_32fc.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_convert_32fc.h @@ -63,7 +63,7 @@ static inline void volk_gnsssdr_16ic_convert_32fc_generic(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, unsigned int num_points) { unsigned int i; - for(i = 0; i < num_points; i++) + for (i = 0; i < num_points; i++) { outputVector[i] = lv_cmake((float)lv_creal(inputVector[i]), (float)lv_cimag(inputVector[i])); } @@ -82,9 +82,9 @@ static inline void volk_gnsssdr_16ic_convert_32fc_a_sse2(lv_32fc_t* outputVector lv_32fc_t* _out = outputVector; __m128 a; - for(i = 0; i < sse_iters; i++) + for (i = 0; i < sse_iters; i++) { - a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // load (2 byte imag, 2 byte real) x 2 into 128 bits reg + a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), 
(float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // load (2 byte imag, 2 byte real) x 2 into 128 bits reg _mm_store_ps((float*)_out, a); _in += 2; _out += 2; @@ -109,9 +109,9 @@ static inline void volk_gnsssdr_16ic_convert_32fc_u_sse2(lv_32fc_t* outputVector lv_32fc_t* _out = outputVector; __m128 a; - for(i = 0; i < sse_iters; i++) + for (i = 0; i < sse_iters; i++) { - a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg _mm_storeu_ps((float*)_out, a); _in += 2; _out += 2; @@ -136,15 +136,15 @@ static inline void volk_gnsssdr_16ic_convert_32fc_u_axv(lv_32fc_t* outputVector, lv_32fc_t* _out = outputVector; __m256 a; - for(i = 0; i < sse_iters; i++) + for (i = 0; i < sse_iters; i++) { - a = _mm256_set_ps((float)(lv_cimag(_in[3])), (float)(lv_creal(_in[3])), (float)(lv_cimag(_in[2])), (float)(lv_creal(_in[2])), (float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + a = _mm256_set_ps((float)(lv_cimag(_in[3])), (float)(lv_creal(_in[3])), (float)(lv_cimag(_in[2])), (float)(lv_creal(_in[2])), (float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg _mm256_storeu_ps((float*)_out, a); _in += 4; _out += 4; } _mm256_zeroupper(); - for(i = 0; i < (num_points % 4); ++i) + for (i = 0; i < (num_points % 4); ++i) { *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in)); _in++; @@ -163,15 +163,15 @@ static inline void volk_gnsssdr_16ic_convert_32fc_a_axv(lv_32fc_t* outputVector, lv_32fc_t* _out = outputVector; __m256 a; - for(i = 0; i < sse_iters; i++) + for (i = 0; i < sse_iters; i++) { - a = _mm256_set_ps((float)(lv_cimag(_in[3])), (float)(lv_creal(_in[3])), (float)(lv_cimag(_in[2])), (float)(lv_creal(_in[2])), (float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + a = _mm256_set_ps((float)(lv_cimag(_in[3])), (float)(lv_creal(_in[3])), (float)(lv_cimag(_in[2])), (float)(lv_creal(_in[2])), (float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg _mm256_store_ps((float*)_out, a); _in += 4; _out += 4; } _mm256_zeroupper(); - for(i = 0; i < (num_points % 4); ++i) + for (i = 0; i < (num_points % 4); ++i) { *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in)); _in++; @@ -194,7 +194,7 @@ static inline void volk_gnsssdr_16ic_convert_32fc_neon(lv_32fc_t* outputVector, int32x4_t a32x4; float32x4_t f32x4; - for(i = 0; i < sse_iters; i++) + for (i = 0; i < sse_iters; i++) { a16x4 = vld1_s16((const int16_t*)_in); __VOLK_GNSSSDR_PREFETCH(_in + 4); diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_resampler_fast_16ic.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_resampler_fast_16ic.h index 8f35d59b8..cca2efa0d 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_resampler_fast_16ic.h 
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_resampler_fast_16ic.h @@ -78,7 +78,7 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_generic(lv_16sc_t* resu // resample code for current tap local_code_chip_index = round(code_phase_step_chips * (float)n + rem_code_phase_chips - 0.5f); if (local_code_chip_index < 0.0) local_code_chip_index += code_length_chips; - if (local_code_chip_index > (code_length_chips-1)) local_code_chip_index -= code_length_chips; + if (local_code_chip_index > (code_length_chips - 1)) local_code_chip_index -= code_length_chips; result[n] = local_code[local_code_chip_index]; } } @@ -89,61 +89,66 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_generic(lv_16sc_t* resu #ifdef LV_HAVE_SSE2 #include <emmintrin.h> -static inline void volk_gnsssdr_16ic_resampler_fast_16ic_a_sse2(lv_16sc_t* result, const lv_16sc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, int code_length_chips, unsigned int num_output_samples)//, int* scratch_buffer, float* scratch_buffer_float) +static inline void volk_gnsssdr_16ic_resampler_fast_16ic_a_sse2(lv_16sc_t* result, const lv_16sc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, int code_length_chips, unsigned int num_output_samples) //, int* scratch_buffer, float* scratch_buffer_float) { - _MM_SET_ROUNDING_MODE (_MM_ROUND_NEAREST);//_MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO + _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); //_MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO unsigned int number; const unsigned int quarterPoints = num_output_samples / 4; lv_16sc_t* _result = result; - __VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; + __VOLK_ATTR_ALIGNED(16) + int local_code_chip_index[4]; __m128 _rem_code_phase, _code_phase_step_chips; __m128i _code_length_chips, _code_length_chips_minus1; __m128 _code_phase_out, _code_phase_out_with_offset; rem_code_phase_chips = rem_code_phase_chips - 0.5f; - _rem_code_phase = _mm_load1_ps(&rem_code_phase_chips); //load float to all four float values in m128 register - _code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips); //load float to all four float values in m128 register - __VOLK_ATTR_ALIGNED(16) int four_times_code_length_chips_minus1[4]; - four_times_code_length_chips_minus1[0] = code_length_chips-1; - four_times_code_length_chips_minus1[1] = code_length_chips-1; - four_times_code_length_chips_minus1[2] = code_length_chips-1; - four_times_code_length_chips_minus1[3] = code_length_chips-1; + _rem_code_phase = _mm_load1_ps(&rem_code_phase_chips); //load float to all four float values in m128 register + _code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips); //load float to all four float values in m128 register + __VOLK_ATTR_ALIGNED(16) + int four_times_code_length_chips_minus1[4]; + four_times_code_length_chips_minus1[0] = code_length_chips - 1; + four_times_code_length_chips_minus1[1] = code_length_chips - 1; + four_times_code_length_chips_minus1[2] = code_length_chips - 1; + four_times_code_length_chips_minus1[3] = code_length_chips - 1; - __VOLK_ATTR_ALIGNED(16) int four_times_code_length_chips[4]; + __VOLK_ATTR_ALIGNED(16) + int four_times_code_length_chips[4]; four_times_code_length_chips[0] = code_length_chips; four_times_code_length_chips[1] = code_length_chips; four_times_code_length_chips[2] = code_length_chips; four_times_code_length_chips[3] = code_length_chips; - _code_length_chips =
_mm_load_si128((__m128i*)&four_times_code_length_chips); //load float to all four float values in m128 register - _code_length_chips_minus1 = _mm_load_si128((__m128i*)&four_times_code_length_chips_minus1); //load float to all four float values in m128 register + _code_length_chips = _mm_load_si128((__m128i*)&four_times_code_length_chips); //load float to all four float values in m128 register + _code_length_chips_minus1 = _mm_load_si128((__m128i*)&four_times_code_length_chips_minus1); //load float to all four float values in m128 register __m128i negative_indexes, overflow_indexes, _code_phase_out_int, _code_phase_out_int_neg, _code_phase_out_int_over; __m128i zero = _mm_setzero_si128(); - __VOLK_ATTR_ALIGNED(16) float init_idx_float[4] = { 0.0f, 1.0f, 2.0f, 3.0f }; + __VOLK_ATTR_ALIGNED(16) + float init_idx_float[4] = {0.0f, 1.0f, 2.0f, 3.0f}; __m128 _4output_index = _mm_load_ps(init_idx_float); - __VOLK_ATTR_ALIGNED(16) float init_4constant_float[4] = { 4.0f, 4.0f, 4.0f, 4.0f }; + __VOLK_ATTR_ALIGNED(16) + float init_4constant_float[4] = {4.0f, 4.0f, 4.0f, 4.0f}; __m128 _4constant_float = _mm_load_ps(init_4constant_float); - for(number = 0; number < quarterPoints; number++) + for (number = 0; number < quarterPoints; number++) { - _code_phase_out = _mm_mul_ps(_code_phase_step_chips, _4output_index); //compute the code phase point with the phase step - _code_phase_out_with_offset = _mm_add_ps(_code_phase_out, _rem_code_phase); //add the phase offset - _code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset); //convert to integer + _code_phase_out = _mm_mul_ps(_code_phase_step_chips, _4output_index); //compute the code phase point with the phase step + _code_phase_out_with_offset = _mm_add_ps(_code_phase_out, _rem_code_phase); //add the phase offset + _code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset); //convert to integer - negative_indexes = _mm_cmplt_epi32(_code_phase_out_int, zero); //test for negative values - _code_phase_out_int_neg = _mm_add_epi32(_code_phase_out_int, _code_length_chips); //the negative values branch - _code_phase_out_int_neg = _mm_xor_si128(_code_phase_out_int, _mm_and_si128( negative_indexes, _mm_xor_si128( _code_phase_out_int_neg, _code_phase_out_int ))); + negative_indexes = _mm_cmplt_epi32(_code_phase_out_int, zero); //test for negative values + _code_phase_out_int_neg = _mm_add_epi32(_code_phase_out_int, _code_length_chips); //the negative values branch + _code_phase_out_int_neg = _mm_xor_si128(_code_phase_out_int, _mm_and_si128(negative_indexes, _mm_xor_si128(_code_phase_out_int_neg, _code_phase_out_int))); - overflow_indexes = _mm_cmpgt_epi32(_code_phase_out_int_neg, _code_length_chips_minus1); //test for overflow values - _code_phase_out_int_over = _mm_sub_epi32(_code_phase_out_int_neg, _code_length_chips); //the negative values branch - _code_phase_out_int_over = _mm_xor_si128(_code_phase_out_int_neg, _mm_and_si128( overflow_indexes, _mm_xor_si128( _code_phase_out_int_over, _code_phase_out_int_neg ))); + overflow_indexes = _mm_cmpgt_epi32(_code_phase_out_int_neg, _code_length_chips_minus1); //test for overflow values + _code_phase_out_int_over = _mm_sub_epi32(_code_phase_out_int_neg, _code_length_chips); //the negative values branch + _code_phase_out_int_over = _mm_xor_si128(_code_phase_out_int_neg, _mm_and_si128(overflow_indexes, _mm_xor_si128(_code_phase_out_int_over, _code_phase_out_int_neg))); - _mm_store_si128((__m128i*)local_code_chip_index, _code_phase_out_int_over); // Store the results back + 
_mm_store_si128((__m128i*)local_code_chip_index, _code_phase_out_int_over); // Store the results back //todo: optimize the local code lookup table with intrinsics, if possible *_result++ = local_code[local_code_chip_index[0]]; @@ -154,7 +159,7 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_a_sse2(lv_16sc_t* resul _4output_index = _mm_add_ps(_4output_index, _4constant_float); } - for(number = quarterPoints * 4; number < num_output_samples; number++) + for (number = quarterPoints * 4; number < num_output_samples; number++) { local_code_chip_index[0] = (int)(code_phase_step_chips * (float)number + rem_code_phase_chips + 0.5f); if (local_code_chip_index[0] < 0.0) local_code_chip_index[0] += code_length_chips - 1; @@ -169,61 +174,66 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_a_sse2(lv_16sc_t* resul #ifdef LV_HAVE_SSE2 #include <emmintrin.h> -static inline void volk_gnsssdr_16ic_resampler_fast_16ic_u_sse2(lv_16sc_t* result, const lv_16sc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, int code_length_chips, unsigned int num_output_samples)//, int* scratch_buffer, float* scratch_buffer_float) +static inline void volk_gnsssdr_16ic_resampler_fast_16ic_u_sse2(lv_16sc_t* result, const lv_16sc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, int code_length_chips, unsigned int num_output_samples) //, int* scratch_buffer, float* scratch_buffer_float) { - _MM_SET_ROUNDING_MODE (_MM_ROUND_NEAREST);//_MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO + _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); //_MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO unsigned int number; const unsigned int quarterPoints = num_output_samples / 4; lv_16sc_t* _result = result; - __VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; + __VOLK_ATTR_ALIGNED(16) + int local_code_chip_index[4]; __m128 _rem_code_phase, _code_phase_step_chips; __m128i _code_length_chips, _code_length_chips_minus1; __m128 _code_phase_out, _code_phase_out_with_offset; rem_code_phase_chips = rem_code_phase_chips - 0.5f; - _rem_code_phase = _mm_load1_ps(&rem_code_phase_chips); //load float to all four float values in m128 register - _code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips); //load float to all four float values in m128 register - __VOLK_ATTR_ALIGNED(16) int four_times_code_length_chips_minus1[4]; - four_times_code_length_chips_minus1[0] = code_length_chips-1; - four_times_code_length_chips_minus1[1] = code_length_chips-1; - four_times_code_length_chips_minus1[2] = code_length_chips-1; - four_times_code_length_chips_minus1[3] = code_length_chips-1; + _rem_code_phase = _mm_load1_ps(&rem_code_phase_chips); //load float to all four float values in m128 register + _code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips); //load float to all four float values in m128 register + __VOLK_ATTR_ALIGNED(16) + int four_times_code_length_chips_minus1[4]; + four_times_code_length_chips_minus1[0] = code_length_chips - 1; + four_times_code_length_chips_minus1[1] = code_length_chips - 1; + four_times_code_length_chips_minus1[2] = code_length_chips - 1; + four_times_code_length_chips_minus1[3] = code_length_chips - 1; - __VOLK_ATTR_ALIGNED(16) int four_times_code_length_chips[4]; + __VOLK_ATTR_ALIGNED(16) + int four_times_code_length_chips[4]; four_times_code_length_chips[0] = code_length_chips; four_times_code_length_chips[1] = code_length_chips; four_times_code_length_chips[2] = code_length_chips; four_times_code_length_chips[3] =
code_length_chips; - _code_length_chips = _mm_loadu_si128((__m128i*)&four_times_code_length_chips); //load float to all four float values in m128 register - _code_length_chips_minus1 = _mm_loadu_si128((__m128i*)&four_times_code_length_chips_minus1); //load float to all four float values in m128 register + _code_length_chips = _mm_loadu_si128((__m128i*)&four_times_code_length_chips); //load float to all four float values in m128 register + _code_length_chips_minus1 = _mm_loadu_si128((__m128i*)&four_times_code_length_chips_minus1); //load float to all four float values in m128 register __m128i negative_indexes, overflow_indexes, _code_phase_out_int, _code_phase_out_int_neg, _code_phase_out_int_over; __m128i zero = _mm_setzero_si128(); - __VOLK_ATTR_ALIGNED(16) float init_idx_float[4] = { 0.0f, 1.0f, 2.0f, 3.0f }; + __VOLK_ATTR_ALIGNED(16) + float init_idx_float[4] = {0.0f, 1.0f, 2.0f, 3.0f}; __m128 _4output_index = _mm_loadu_ps(init_idx_float); - __VOLK_ATTR_ALIGNED(16) float init_4constant_float[4] = { 4.0f, 4.0f, 4.0f, 4.0f }; + __VOLK_ATTR_ALIGNED(16) + float init_4constant_float[4] = {4.0f, 4.0f, 4.0f, 4.0f}; __m128 _4constant_float = _mm_loadu_ps(init_4constant_float); - for(number = 0; number < quarterPoints; number++) + for (number = 0; number < quarterPoints; number++) { - _code_phase_out = _mm_mul_ps(_code_phase_step_chips, _4output_index); //compute the code phase point with the phase step - _code_phase_out_with_offset = _mm_add_ps(_code_phase_out, _rem_code_phase); //add the phase offset - _code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset); //convert to integer + _code_phase_out = _mm_mul_ps(_code_phase_step_chips, _4output_index); //compute the code phase point with the phase step + _code_phase_out_with_offset = _mm_add_ps(_code_phase_out, _rem_code_phase); //add the phase offset + _code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset); //convert to integer - negative_indexes = _mm_cmplt_epi32(_code_phase_out_int, zero); //test for negative values - _code_phase_out_int_neg = _mm_add_epi32(_code_phase_out_int, _code_length_chips); //the negative values branch - _code_phase_out_int_neg = _mm_xor_si128(_code_phase_out_int, _mm_and_si128( negative_indexes, _mm_xor_si128( _code_phase_out_int_neg, _code_phase_out_int ))); + negative_indexes = _mm_cmplt_epi32(_code_phase_out_int, zero); //test for negative values + _code_phase_out_int_neg = _mm_add_epi32(_code_phase_out_int, _code_length_chips); //the negative values branch + _code_phase_out_int_neg = _mm_xor_si128(_code_phase_out_int, _mm_and_si128(negative_indexes, _mm_xor_si128(_code_phase_out_int_neg, _code_phase_out_int))); - overflow_indexes = _mm_cmpgt_epi32(_code_phase_out_int_neg, _code_length_chips_minus1); //test for overflow values - _code_phase_out_int_over = _mm_sub_epi32(_code_phase_out_int_neg, _code_length_chips); //the negative values branch - _code_phase_out_int_over = _mm_xor_si128(_code_phase_out_int_neg, _mm_and_si128( overflow_indexes, _mm_xor_si128( _code_phase_out_int_over, _code_phase_out_int_neg ))); + overflow_indexes = _mm_cmpgt_epi32(_code_phase_out_int_neg, _code_length_chips_minus1); //test for overflow values + _code_phase_out_int_over = _mm_sub_epi32(_code_phase_out_int_neg, _code_length_chips); //the negative values branch + _code_phase_out_int_over = _mm_xor_si128(_code_phase_out_int_neg, _mm_and_si128(overflow_indexes, _mm_xor_si128(_code_phase_out_int_over, _code_phase_out_int_neg))); - _mm_storeu_si128((__m128i*)local_code_chip_index, _code_phase_out_int_over); // Store 
the results back + _mm_storeu_si128((__m128i*)local_code_chip_index, _code_phase_out_int_over); // Store the results back //todo: optimize the local code lookup table with intrinsics, if possible *_result++ = local_code[local_code_chip_index[0]]; @@ -234,7 +244,7 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_u_sse2(lv_16sc_t* resul _4output_index = _mm_add_ps(_4output_index, _4constant_float); } - for(number = quarterPoints * 4; number < num_output_samples; number++) + for (number = quarterPoints * 4; number < num_output_samples; number++) { local_code_chip_index[0] = (int)(code_phase_step_chips * (float)number + rem_code_phase_chips + 0.5f); if (local_code_chip_index[0] < 0.0) local_code_chip_index[0] += code_length_chips - 1; @@ -249,7 +259,7 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_u_sse2(lv_16sc_t* resul #ifdef LV_HAVE_NEON #include <arm_neon.h> -static inline void volk_gnsssdr_16ic_resampler_fast_16ic_neon(lv_16sc_t* result, const lv_16sc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, int code_length_chips, unsigned int num_output_samples)//, int* scratch_buffer, float* scratch_buffer_float) +static inline void volk_gnsssdr_16ic_resampler_fast_16ic_neon(lv_16sc_t* result, const lv_16sc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, int code_length_chips, unsigned int num_output_samples) //, int* scratch_buffer, float* scratch_buffer_float) { unsigned int number; const unsigned int quarterPoints = num_output_samples / 4; @@ -257,57 +267,62 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_neon(lv_16sc_t* result, lv_16sc_t* _result = result; - __VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; + __VOLK_ATTR_ALIGNED(16) + int local_code_chip_index[4]; float32x4_t _rem_code_phase, _code_phase_step_chips; int32x4_t _code_length_chips, _code_length_chips_minus1; float32x4_t _code_phase_out, _code_phase_out_with_offset; rem_code_phase_chips = rem_code_phase_chips - 0.5f; float32x4_t sign, PlusHalf, Round; - _rem_code_phase = vld1q_dup_f32(&rem_code_phase_chips); //load float to all four float values in m128 register - _code_phase_step_chips = vld1q_dup_f32(&code_phase_step_chips); //load float to all four float values in m128 register - __VOLK_ATTR_ALIGNED(16) int four_times_code_length_chips_minus1[4]; + _rem_code_phase = vld1q_dup_f32(&rem_code_phase_chips); //load float to all four float values in m128 register + _code_phase_step_chips = vld1q_dup_f32(&code_phase_step_chips); //load float to all four float values in m128 register + __VOLK_ATTR_ALIGNED(16) + int four_times_code_length_chips_minus1[4]; four_times_code_length_chips_minus1[0] = code_length_chips - 1; four_times_code_length_chips_minus1[1] = code_length_chips - 1; four_times_code_length_chips_minus1[2] = code_length_chips - 1; four_times_code_length_chips_minus1[3] = code_length_chips - 1; - __VOLK_ATTR_ALIGNED(16) int four_times_code_length_chips[4]; + __VOLK_ATTR_ALIGNED(16) + int four_times_code_length_chips[4]; four_times_code_length_chips[0] = code_length_chips; four_times_code_length_chips[1] = code_length_chips; four_times_code_length_chips[2] = code_length_chips; four_times_code_length_chips[3] = code_length_chips; - _code_length_chips =
vld1q_s32((int32_t*)&four_times_code_length_chips); //load float to all four float values in m128 register + _code_length_chips_minus1 = vld1q_s32((int32_t*)&four_times_code_length_chips_minus1); //load float to all four float values in m128 register - int32x4_t _code_phase_out_int, _code_phase_out_int_neg, _code_phase_out_int_over; + int32x4_t _code_phase_out_int, _code_phase_out_int_neg, _code_phase_out_int_over; uint32x4_t negative_indexes, overflow_indexes; int32x4_t zero = vmovq_n_s32(0); - __VOLK_ATTR_ALIGNED(16) float init_idx_float[4] = { 0.0f, 1.0f, 2.0f, 3.0f }; + __VOLK_ATTR_ALIGNED(16) + float init_idx_float[4] = {0.0f, 1.0f, 2.0f, 3.0f}; float32x4_t _4output_index = vld1q_f32(init_idx_float); - __VOLK_ATTR_ALIGNED(16) float init_4constant_float[4] = { 4.0f, 4.0f, 4.0f, 4.0f }; + __VOLK_ATTR_ALIGNED(16) + float init_4constant_float[4] = {4.0f, 4.0f, 4.0f, 4.0f}; float32x4_t _4constant_float = vld1q_f32(init_4constant_float); - for(number = 0; number < quarterPoints; number++) + for (number = 0; number < quarterPoints; number++) { - _code_phase_out = vmulq_f32(_code_phase_step_chips, _4output_index); //compute the code phase point with the phase step - _code_phase_out_with_offset = vaddq_f32(_code_phase_out, _rem_code_phase); //add the phase offset + _code_phase_out = vmulq_f32(_code_phase_step_chips, _4output_index); //compute the code phase point with the phase step + _code_phase_out_with_offset = vaddq_f32(_code_phase_out, _rem_code_phase); //add the phase offset sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(_code_phase_out_with_offset), 31))); PlusHalf = vaddq_f32(_code_phase_out_with_offset, half); Round = vsubq_f32(PlusHalf, sign); _code_phase_out_int = vcvtq_s32_f32(Round); - negative_indexes = vcltq_s32(_code_phase_out_int, zero); //test for negative values - _code_phase_out_int_neg = vaddq_s32(_code_phase_out_int, _code_length_chips); //the negative values branch - _code_phase_out_int_neg = veorq_s32(_code_phase_out_int, vandq_s32( (int32x4_t)negative_indexes, veorq_s32( _code_phase_out_int_neg, _code_phase_out_int ))); + negative_indexes = vcltq_s32(_code_phase_out_int, zero); //test for negative values + _code_phase_out_int_neg = vaddq_s32(_code_phase_out_int, _code_length_chips); //the negative values branch + _code_phase_out_int_neg = veorq_s32(_code_phase_out_int, vandq_s32((int32x4_t)negative_indexes, veorq_s32(_code_phase_out_int_neg, _code_phase_out_int))); - overflow_indexes = vcgtq_s32(_code_phase_out_int_neg, _code_length_chips_minus1); //test for overflow values - _code_phase_out_int_over = vsubq_s32(_code_phase_out_int_neg, _code_length_chips); //the negative values branch - _code_phase_out_int_over = veorq_s32(_code_phase_out_int_neg, vandq_s32( (int32x4_t)overflow_indexes, veorq_s32( _code_phase_out_int_over, _code_phase_out_int_neg ))); + overflow_indexes = vcgtq_s32(_code_phase_out_int_neg, _code_length_chips_minus1); //test for overflow values + _code_phase_out_int_over = vsubq_s32(_code_phase_out_int_neg, _code_length_chips); //the negative values branch + _code_phase_out_int_over = veorq_s32(_code_phase_out_int_neg, vandq_s32((int32x4_t)overflow_indexes, veorq_s32(_code_phase_out_int_over, _code_phase_out_int_neg))); - vst1q_s32((int32_t*)local_code_chip_index, _code_phase_out_int_over); // Store the results back + vst1q_s32((int32_t*)local_code_chip_index, _code_phase_out_int_over); // Store the results back //todo: optimize the local code lookup table with intrinsics, if possible *_result++ = local_code[local_code_chip_index[0]]; @@ 
-318,7 +333,7 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_neon(lv_16sc_t* result, _4output_index = vaddq_f32(_4output_index, _4constant_float); } - for(number = quarterPoints * 4; number < num_output_samples; number++) + for (number = quarterPoints * 4; number < num_output_samples; number++) { local_code_chip_index[0] = (int)(code_phase_step_chips * (float)number + rem_code_phase_chips + 0.5f); if (local_code_chip_index[0] < 0.0) local_code_chip_index[0] += code_length_chips - 1; diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_resamplerfastpuppet_16ic.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_resamplerfastpuppet_16ic.h index 0b67ce73c..038e70108 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_resamplerfastpuppet_16ic.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_resamplerfastpuppet_16ic.h @@ -44,7 +44,7 @@ static inline void volk_gnsssdr_16ic_resamplerfastpuppet_16ic_generic(lv_16sc_t* float rem_code_phase_chips = -0.123; float code_phase_step_chips = 0.1; int code_length_chips = 1023; - volk_gnsssdr_16ic_resampler_fast_16ic_generic(result, local_code, rem_code_phase_chips, code_phase_step_chips, code_length_chips, num_points); + volk_gnsssdr_16ic_resampler_fast_16ic_generic(result, local_code, rem_code_phase_chips, code_phase_step_chips, code_length_chips, num_points); } #endif /* LV_HAVE_GENERIC */ @@ -55,7 +55,7 @@ static inline void volk_gnsssdr_16ic_resamplerfastpuppet_16ic_a_sse2(lv_16sc_t* float rem_code_phase_chips = -0.123; float code_phase_step_chips = 0.1; int code_length_chips = 1023; - volk_gnsssdr_16ic_resampler_fast_16ic_a_sse2(result, local_code, rem_code_phase_chips, code_phase_step_chips, code_length_chips, num_points ); + volk_gnsssdr_16ic_resampler_fast_16ic_a_sse2(result, local_code, rem_code_phase_chips, code_phase_step_chips, code_length_chips, num_points); } #endif /* LV_HAVE_SSE2 */ @@ -67,7 +67,7 @@ static inline void volk_gnsssdr_16ic_resamplerfastpuppet_16ic_u_sse2(lv_16sc_t* float rem_code_phase_chips = -0.123; float code_phase_step_chips = 0.1; int code_length_chips = 1023; - volk_gnsssdr_16ic_resampler_fast_16ic_u_sse2(result, local_code, rem_code_phase_chips, code_phase_step_chips, code_length_chips, num_points ); + volk_gnsssdr_16ic_resampler_fast_16ic_u_sse2(result, local_code, rem_code_phase_chips, code_phase_step_chips, code_length_chips, num_points); } #endif /* LV_HAVE_SSE2 */ @@ -79,9 +79,9 @@ static inline void volk_gnsssdr_16ic_resamplerfastpuppet_16ic_neon(lv_16sc_t* re float rem_code_phase_chips = -0.123; float code_phase_step_chips = 0.1; int code_length_chips = 1023; - volk_gnsssdr_16ic_resampler_fast_16ic_neon(result, local_code, rem_code_phase_chips, code_phase_step_chips, code_length_chips, num_points ); + volk_gnsssdr_16ic_resampler_fast_16ic_neon(result, local_code, rem_code_phase_chips, code_phase_step_chips, code_length_chips, num_points); } #endif /* LV_HAVE_NEON */ -#endif // INCLUDED_volk_gnsssdr_16ic_resamplerfastpuppet_16ic_H +#endif // INCLUDED_volk_gnsssdr_16ic_resamplerfastpuppet_16ic_H diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_resamplerfastxnpuppet_16ic.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_resamplerfastxnpuppet_16ic.h index bc4c2faa8..934af8e88 100644 --- 
a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_resamplerfastxnpuppet_16ic.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_resamplerfastxnpuppet_16ic.h @@ -49,21 +49,21 @@ static inline void volk_gnsssdr_16ic_resamplerfastxnpuppet_16ic_generic(lv_16sc_ int num_out_vectors = 3; unsigned int n; float* rem_code_phase_chips = (float*)volk_gnsssdr_malloc(sizeof(float) * num_out_vectors, volk_gnsssdr_get_alignment()); - lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); + lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_out_vectors; n++) - { - rem_code_phase_chips[n] = -0.234; - result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); - } + for (n = 0; n < num_out_vectors; n++) + { + rem_code_phase_chips[n] = -0.234; + result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); + } volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_generic(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, code_length_chips, num_out_vectors, num_points); memcpy((lv_16sc_t*)result, (lv_16sc_t*)result_aux[0], sizeof(lv_16sc_t) * num_points); volk_gnsssdr_free(rem_code_phase_chips); - for(n = 0; n < num_out_vectors; n++) - { - volk_gnsssdr_free(result_aux[n]); - } + for (n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } volk_gnsssdr_free(result_aux); } @@ -77,22 +77,22 @@ static inline void volk_gnsssdr_16ic_resamplerfastxnpuppet_16ic_a_sse2(lv_16sc_t int code_length_chips = 2046; int num_out_vectors = 3; unsigned int n; - float * rem_code_phase_chips = (float*)volk_gnsssdr_malloc(sizeof(float) * num_out_vectors, volk_gnsssdr_get_alignment()); - lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); + float* rem_code_phase_chips = (float*)volk_gnsssdr_malloc(sizeof(float) * num_out_vectors, volk_gnsssdr_get_alignment()); + lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_out_vectors; n++) - { - rem_code_phase_chips[n] = -0.234; - result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); - } + for (n = 0; n < num_out_vectors; n++) + { + rem_code_phase_chips[n] = -0.234; + result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); + } volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_a_sse2(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, code_length_chips, num_out_vectors, num_points); memcpy(result, result_aux[0], sizeof(lv_16sc_t) * num_points); volk_gnsssdr_free(rem_code_phase_chips); - for(n = 0; n < num_out_vectors; n++) - { - volk_gnsssdr_free(result_aux[n]); - } + for (n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } volk_gnsssdr_free(result_aux); } @@ -106,22 +106,22 @@ static inline void volk_gnsssdr_16ic_resamplerfastxnpuppet_16ic_u_sse2(lv_16sc_t int code_length_chips = 2046; int num_out_vectors = 3; unsigned int n; - float * rem_code_phase_chips = (float*)volk_gnsssdr_malloc(sizeof(float) * num_out_vectors, volk_gnsssdr_get_alignment()); - lv_16sc_t** result_aux = 
(lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); + float* rem_code_phase_chips = (float*)volk_gnsssdr_malloc(sizeof(float) * num_out_vectors, volk_gnsssdr_get_alignment()); + lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_out_vectors; n++) - { - rem_code_phase_chips[n] = -0.234; - result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); - } + for (n = 0; n < num_out_vectors; n++) + { + rem_code_phase_chips[n] = -0.234; + result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); + } volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_u_sse2(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, code_length_chips, num_out_vectors, num_points); memcpy(result, result_aux[0], sizeof(lv_16sc_t) * num_points); volk_gnsssdr_free(rem_code_phase_chips); - for(n = 0; n < num_out_vectors; n++) - { - volk_gnsssdr_free(result_aux[n]); - } + for (n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } volk_gnsssdr_free(result_aux); } @@ -135,26 +135,26 @@ static inline void volk_gnsssdr_16ic_resamplerfastxnpuppet_16ic_neon(lv_16sc_t* int code_length_chips = 2046; int num_out_vectors = 3; unsigned int n; - float * rem_code_phase_chips = (float*)volk_gnsssdr_malloc(sizeof(float) * num_out_vectors, volk_gnsssdr_get_alignment()); - lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); + float* rem_code_phase_chips = (float*)volk_gnsssdr_malloc(sizeof(float) * num_out_vectors, volk_gnsssdr_get_alignment()); + lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_out_vectors; n++) - { - rem_code_phase_chips[n] = -0.234; - result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); - } + for (n = 0; n < num_out_vectors; n++) + { + rem_code_phase_chips[n] = -0.234; + result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); + } volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_neon(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, code_length_chips, num_out_vectors, num_points); memcpy(result, result_aux[0], sizeof(lv_16sc_t) * num_points); volk_gnsssdr_free(rem_code_phase_chips); - for(n = 0; n < num_out_vectors; n++) - { - volk_gnsssdr_free(result_aux[n]); - } + for (n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } volk_gnsssdr_free(result_aux); } #endif -#endif // INCLUDED_volk_gnsssdr_16ic_resamplerpuppet_16ic_H +#endif // INCLUDED_volk_gnsssdr_16ic_resamplerpuppet_16ic_H diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_resamplerxnpuppet_16ic.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_resamplerxnpuppet_16ic.h index 85e6fcb08..4582d6961 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_resamplerxnpuppet_16ic.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_resamplerxnpuppet_16ic.h @@ -45,56 +45,56 @@ static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_generic(lv_16sc_t* result, const lv_16sc_t* 
local_code, unsigned int num_points) { int code_length_chips = 2046; - float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points); int num_out_vectors = 3; unsigned int n; float rem_code_phase_chips = -0.234; - float shifts_chips[3] = { -0.1, 0.0, 0.1 }; - lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); + float shifts_chips[3] = {-0.1, 0.0, 0.1}; + lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_out_vectors; n++) - { - result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); - } + for (n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); + } volk_gnsssdr_16ic_xn_resampler_16ic_xn_generic(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); memcpy((lv_16sc_t*)result, (lv_16sc_t*)result_aux[0], sizeof(lv_16sc_t) * num_points); - for(n = 0; n < num_out_vectors; n++) - { - volk_gnsssdr_free(result_aux[n]); - } + for (n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } volk_gnsssdr_free(result_aux); } #endif /* LV_HAVE_GENERIC */ - + #ifdef LV_HAVE_SSE3 static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_a_sse3(lv_16sc_t* result, const lv_16sc_t* local_code, unsigned int num_points) { int code_length_chips = 2046; - float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; - float shifts_chips[3] = { -0.1, 0.0, 0.1 }; - lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); + float shifts_chips[3] = {-0.1, 0.0, 0.1}; + lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_out_vectors; n++) - { - result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); - } + for (n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); + } volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_sse3(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); memcpy((lv_16sc_t*)result, (lv_16sc_t*)result_aux[0], sizeof(lv_16sc_t) * num_points); - for(n = 0; n < num_out_vectors; n++) - { - volk_gnsssdr_free(result_aux[n]); - } + for (n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } volk_gnsssdr_free(result_aux); } @@ -104,26 +104,26 @@ static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_a_sse3(lv_16sc_t* re static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_u_sse3(lv_16sc_t* result, const lv_16sc_t* local_code, unsigned int num_points) { int code_length_chips = 2046; - float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points); int 
num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; - float shifts_chips[3] = { -0.1, 0.0, 0.1 }; - lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); + float shifts_chips[3] = {-0.1, 0.0, 0.1}; + lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_out_vectors; n++) - { - result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); - } + for (n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); + } volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_sse3(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); memcpy((lv_16sc_t*)result, (lv_16sc_t*)result_aux[0], sizeof(lv_16sc_t) * num_points); - for(n = 0; n < num_out_vectors; n++) - { - volk_gnsssdr_free(result_aux[n]); - } + for (n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } volk_gnsssdr_free(result_aux); } @@ -134,26 +134,26 @@ static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_u_sse3(lv_16sc_t* re static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_u_sse4_1(lv_16sc_t* result, const lv_16sc_t* local_code, unsigned int num_points) { int code_length_chips = 2046; - float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; - float shifts_chips[3] = { -0.1, 0.0, 0.1 }; - lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); + float shifts_chips[3] = {-0.1, 0.0, 0.1}; + lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_out_vectors; n++) - { - result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); - } + for (n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); + } volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_sse4_1(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); memcpy((lv_16sc_t*)result, (lv_16sc_t*)result_aux[0], sizeof(lv_16sc_t) * num_points); - for(n = 0; n < num_out_vectors; n++) - { - volk_gnsssdr_free(result_aux[n]); - } + for (n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } volk_gnsssdr_free(result_aux); } @@ -164,26 +164,26 @@ static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_u_sse4_1(lv_16sc_t* static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_a_sse4_1(lv_16sc_t* result, const lv_16sc_t* local_code, unsigned int num_points) { int code_length_chips = 2046; - float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; - float shifts_chips[3] = { -0.1, 0.0, 0.1 }; - lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * 
num_out_vectors, volk_gnsssdr_get_alignment()); + float shifts_chips[3] = {-0.1, 0.0, 0.1}; + lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_out_vectors; n++) - { - result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); - } + for (n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); + } volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_sse4_1(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); memcpy((lv_16sc_t*)result, (lv_16sc_t*)result_aux[0], sizeof(lv_16sc_t) * num_points); - for(n = 0; n < num_out_vectors; n++) - { - volk_gnsssdr_free(result_aux[n]); - } + for (n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } volk_gnsssdr_free(result_aux); } @@ -194,26 +194,26 @@ static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_a_sse4_1(lv_16sc_t* static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_u_avx(lv_16sc_t* result, const lv_16sc_t* local_code, unsigned int num_points) { int code_length_chips = 2046; - float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; - float shifts_chips[3] = { -0.1, 0.0, 0.1 }; - lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); + float shifts_chips[3] = {-0.1, 0.0, 0.1}; + lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_out_vectors; n++) - { - result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); - } + for (n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); + } volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_avx(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); memcpy((lv_16sc_t*)result, (lv_16sc_t*)result_aux[0], sizeof(lv_16sc_t) * num_points); - for(n = 0; n < num_out_vectors; n++) - { - volk_gnsssdr_free(result_aux[n]); - } + for (n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } volk_gnsssdr_free(result_aux); } @@ -224,26 +224,26 @@ static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_u_avx(lv_16sc_t* res static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_a_avx(lv_16sc_t* result, const lv_16sc_t* local_code, unsigned int num_points) { int code_length_chips = 2046; - float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; - float shifts_chips[3] = { -0.1, 0.0, 0.1 }; - lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); + float shifts_chips[3] = {-0.1, 0.0, 0.1}; + lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, 
volk_gnsssdr_get_alignment()); - for(n = 0; n < num_out_vectors; n++) - { - result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); - } + for (n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); + } volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_avx(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); memcpy((lv_16sc_t*)result, (lv_16sc_t*)result_aux[0], sizeof(lv_16sc_t) * num_points); - for(n = 0; n < num_out_vectors; n++) - { - volk_gnsssdr_free(result_aux[n]); - } + for (n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } volk_gnsssdr_free(result_aux); } @@ -254,29 +254,29 @@ static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_a_avx(lv_16sc_t* res static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_neon(lv_16sc_t* result, const lv_16sc_t* local_code, unsigned int num_points) { int code_length_chips = 2046; - float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; - float shifts_chips[3] = { -0.1, 0.0, 0.1 }; - lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); + float shifts_chips[3] = {-0.1, 0.0, 0.1}; + lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_out_vectors; n++) - { - result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); - } + for (n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); + } volk_gnsssdr_16ic_xn_resampler_16ic_xn_neon(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); memcpy((lv_16sc_t*)result, (lv_16sc_t*)result_aux[0], sizeof(lv_16sc_t) * num_points); - for(n = 0; n < num_out_vectors; n++) - { - volk_gnsssdr_free(result_aux[n]); - } + for (n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } volk_gnsssdr_free(result_aux); } #endif -#endif // INCLUDED_volk_gnsssdr_16ic_resamplerpuppet_16ic_H +#endif // INCLUDED_volk_gnsssdr_16ic_resamplerpuppet_16ic_H diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_s32fc_x2_rotator_16ic.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_s32fc_x2_rotator_16ic.h index 15303ead5..0de39ebc3 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_s32fc_x2_rotator_16ic.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_s32fc_x2_rotator_16ic.h @@ -70,7 +70,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_generic(lv_16sc_t* ou unsigned int i = 0; lv_16sc_t tmp16; lv_32fc_t tmp32; - for(i = 0; i < (unsigned int)(num_points); ++i) + for (i = 0; i < (unsigned int)(num_points); ++i) { tmp16 = *inVector++; tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase); @@ -111,8 +111,8 @@ static inline void 
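The puppet wrappers above all share one test-harness pattern: allocate num_out_vectors aligned output buffers, run the resampler variant under test, copy the first output vector into result, and release everything. A minimal sketch of that pattern in plain C follows; the function-pointer typedef and the include paths are assumptions made for the sketch, not part of the kernels themselves.

    #include <string.h>                             /* memcpy */
    #include <volk_gnsssdr/volk_gnsssdr_complex.h>  /* lv_16sc_t */
    #include <volk_gnsssdr/volk_gnsssdr_malloc.h>   /* volk_gnsssdr_malloc/free/get_alignment */

    /* hypothetical signature matching the volk_gnsssdr_16ic_xn_resampler_16ic_xn_* variants */
    typedef void (*resampler_xn_fn)(lv_16sc_t**, const lv_16sc_t*, float, float, float*, int, int, unsigned int);

    static inline void puppet_harness_sketch(lv_16sc_t* result, const lv_16sc_t* local_code,
        unsigned int num_points, resampler_xn_fn resample)
    {
        int code_length_chips = 2046;
        float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
        float rem_code_phase_chips = -0.234;
        float shifts_chips[3] = {-0.1, 0.0, 0.1};  /* three code-phase shifts under test */
        int num_out_vectors = 3;
        unsigned int n;
        lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
        for (n = 0; n < (unsigned int)num_out_vectors; n++)
            {
                result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
            }
        resample(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
        memcpy(result, result_aux[0], sizeof(lv_16sc_t) * num_points);  /* only vector 0 is exported */
        for (n = 0; n < (unsigned int)num_out_vectors; n++)
            {
                volk_gnsssdr_free(result_aux[n]);
            }
        volk_gnsssdr_free(result_aux);
    }
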
volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_generic_reload(lv_16s *outVector++ = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32))); (*phase) *= phase_inc; } - // Regenerate phase - //printf("Phase before regeneration %i: %f,%f Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase)); + // Regenerate phase + //printf("Phase before regeneration %i: %f,%f Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase)); #ifdef __cplusplus (*phase) /= std::abs((*phase)); #else @@ -141,11 +141,13 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3(lv_16sc_t* out unsigned int number; __m128 a, b, two_phase_acc_reg, two_phase_inc_reg; __m128i c1, c2, result; - __VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_inc[2]; + __VOLK_ATTR_ALIGNED(16) + lv_32fc_t two_phase_inc[2]; two_phase_inc[0] = phase_inc * phase_inc; two_phase_inc[1] = phase_inc * phase_inc; - two_phase_inc_reg = _mm_load_ps((float*) two_phase_inc); - __VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_acc[2]; + two_phase_inc_reg = _mm_load_ps((float*)two_phase_inc); + __VOLK_ATTR_ALIGNED(16) + lv_32fc_t two_phase_acc[2]; two_phase_acc[0] = (*phase); two_phase_acc[1] = (*phase) * phase_inc; two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc); @@ -157,49 +159,49 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3(lv_16sc_t* out lv_16sc_t tmp16; lv_32fc_t tmp32; - for(number = 0; number < sse_iters; number++) + for (number = 0; number < sse_iters; number++) { - a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg //complex 32fc multiplication b=a*two_phase_acc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = 
_mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di //next two samples _in += 2; - a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg __VOLK_GNSSSDR_PREFETCH(_in + 8); //complex 32fc multiplication b=a*two_phase_acc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di // store four output samples - result = _mm_packs_epi32(c1, c2);// convert from 32ic to 16ic + result = _mm_packs_epi32(c1, c2); // convert from 32ic to 16ic _mm_store_si128((__m128i*)_out, result); // Regenerate phase @@ -232,7 +234,6 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3(lv_16sc_t* out #endif /* LV_HAVE_SSE3 */ - #ifdef LV_HAVE_SSE3 #include @@ -244,11 +245,13 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3_reload(lv_16sc unsigned int j; __m128 a, b, two_phase_acc_reg, two_phase_inc_reg; __m128i c1, c2, result; - __VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_inc[2]; + 
__VOLK_ATTR_ALIGNED(16) + lv_32fc_t two_phase_inc[2]; two_phase_inc[0] = phase_inc * phase_inc; two_phase_inc[1] = phase_inc * phase_inc; - two_phase_inc_reg = _mm_load_ps((float*) two_phase_inc); - __VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_acc[2]; + two_phase_inc_reg = _mm_load_ps((float*)two_phase_inc); + __VOLK_ATTR_ALIGNED(16) + lv_32fc_t two_phase_acc[2]; two_phase_acc[0] = (*phase); two_phase_acc[1] = (*phase) * phase_inc; two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc); @@ -265,47 +268,47 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3_reload(lv_16sc { for (j = 0; j < ROTATOR_RELOAD; j++) { - a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg //complex 32fc multiplication b=a*two_phase_acc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di //next two samples _in += 2; - a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg __VOLK_GNSSSDR_PREFETCH(_in + 8); //complex 32fc multiplication 
b=a*two_phase_acc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di // store four output samples - result = _mm_packs_epi32(c1, c2);// convert from 32ic to 16ic + result = _mm_packs_epi32(c1, c2); // convert from 32ic to 16ic _mm_store_si128((__m128i*)_out, result); //next two samples @@ -322,47 +325,47 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3_reload(lv_16sc for (j = 0; j < sse_iters % ROTATOR_RELOAD; j++) { - a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg //complex 32fc multiplication b=a*two_phase_acc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + a = _mm_shuffle_ps(a, a, 0xB1); // 
Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di //next two samples _in += 2; - a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg __VOLK_GNSSSDR_PREFETCH(_in + 8); //complex 32fc multiplication b=a*two_phase_acc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = 
_mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di // store four output samples - result = _mm_packs_epi32(c1, c2);// convert from 32ic to 16ic + result = _mm_packs_epi32(c1, c2); // convert from 32ic to 16ic _mm_store_si128((__m128i*)_out, result); //next two samples @@ -385,7 +388,6 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3_reload(lv_16sc #endif /* LV_HAVE_SSE3 */ - #ifdef LV_HAVE_SSE3 #include @@ -395,14 +397,16 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3(lv_16sc_t* out unsigned int number; __m128 a, b, two_phase_acc_reg, two_phase_inc_reg; __m128i c1, c2, result; - __VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_inc[2]; + __VOLK_ATTR_ALIGNED(16) + lv_32fc_t two_phase_inc[2]; two_phase_inc[0] = phase_inc * phase_inc; two_phase_inc[1] = phase_inc * phase_inc; - two_phase_inc_reg = _mm_load_ps((float*) two_phase_inc); - __VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_acc[2]; + two_phase_inc_reg = _mm_load_ps((float*)two_phase_inc); + __VOLK_ATTR_ALIGNED(16) + lv_32fc_t two_phase_acc[2]; two_phase_acc[0] = (*phase); two_phase_acc[1] = (*phase) * phase_inc; - two_phase_acc_reg = _mm_load_ps((float*) two_phase_acc); + two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc); const lv_16sc_t* _in = inVector; @@ -412,49 +416,49 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3(lv_16sc_t* out lv_16sc_t tmp16; lv_32fc_t tmp32; - for(number = 0; number < sse_iters; number++) + for (number = 0; number < sse_iters; number++) { - a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg //complex 32fc multiplication b=a*two_phase_acc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = 
ai*ci,ar*ci,bi*di,br*di - two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di //next two samples _in += 2; - a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg __VOLK_GNSSSDR_PREFETCH(_in + 8); //complex 32fc multiplication b=a*two_phase_acc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di // store four output samples - result = _mm_packs_epi32(c1, c2);// convert from 32ic to 16ic + result = _mm_packs_epi32(c1, c2); // convert from 32ic to 16ic _mm_storeu_si128((__m128i*)_out, result); // Regenerate phase @@ -493,147 +497,149 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3(lv_16sc_t* out static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3_reload(lv_16sc_t* outVector, const 
lv_16sc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points) { const unsigned int sse_iters = num_points / 4; - unsigned int ROTATOR_RELOAD = 512; - unsigned int n; - unsigned int j; - __m128 a, b, two_phase_acc_reg, two_phase_inc_reg; - __m128i c1, c2, result; - __VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_inc[2]; - two_phase_inc[0] = phase_inc * phase_inc; - two_phase_inc[1] = phase_inc * phase_inc; - two_phase_inc_reg = _mm_load_ps((float*) two_phase_inc); - __VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_acc[2]; - two_phase_acc[0] = (*phase); - two_phase_acc[1] = (*phase) * phase_inc; - two_phase_acc_reg = _mm_load_ps((float*) two_phase_acc); + unsigned int ROTATOR_RELOAD = 512; + unsigned int n; + unsigned int j; + __m128 a, b, two_phase_acc_reg, two_phase_inc_reg; + __m128i c1, c2, result; + __VOLK_ATTR_ALIGNED(16) + lv_32fc_t two_phase_inc[2]; + two_phase_inc[0] = phase_inc * phase_inc; + two_phase_inc[1] = phase_inc * phase_inc; + two_phase_inc_reg = _mm_load_ps((float*)two_phase_inc); + __VOLK_ATTR_ALIGNED(16) + lv_32fc_t two_phase_acc[2]; + two_phase_acc[0] = (*phase); + two_phase_acc[1] = (*phase) * phase_inc; + two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc); - const lv_16sc_t* _in = inVector; + const lv_16sc_t* _in = inVector; - lv_16sc_t* _out = outVector; + lv_16sc_t* _out = outVector; - __m128 yl, yh, tmp1, tmp2, tmp3; - lv_16sc_t tmp16; - lv_32fc_t tmp32; + __m128 yl, yh, tmp1, tmp2, tmp3; + lv_16sc_t tmp16; + lv_32fc_t tmp32; - for (n = 0; n < sse_iters / ROTATOR_RELOAD; n++) - { - for (j = 0; j < ROTATOR_RELOAD; j++) - { - a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg - //complex 32fc multiplication b=a*two_phase_acc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic + for (n = 0; n < sse_iters / ROTATOR_RELOAD; n++) + { + for (j = 0; j < ROTATOR_RELOAD; j++) + { + a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + //complex 32fc multiplication b=a*two_phase_acc_reg + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic - //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = 
_mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - //next two samples - _in += 2; - a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg - __VOLK_GNSSSDR_PREFETCH(_in + 8); - //complex 32fc multiplication b=a*two_phase_acc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic + //next two samples + _in += 2; + a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + __VOLK_GNSSSDR_PREFETCH(_in + 8); + //complex 32fc multiplication b=a*two_phase_acc_reg + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic - //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - // store four output samples - result = _mm_packs_epi32(c1, c2);// convert from 32ic to 16ic 
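Every SIMD variant of this rotator implements the same arithmetic as the generic kernel at the top of the file: promote each 16-bit complex sample to float, multiply it by a unit-modulus phasor, round back to int16_t, advance the phasor by phase_inc, and periodically renormalize the phasor so floating-point error cannot make its modulus drift (the _mm_sqrt_ps/_mm_div_ps block labelled "Regenerate phase"). The packed complex product uses the SSE3 recipe spelled out in the comments: duplicate the real parts (moveldup), duplicate the imaginary parts (movehdup), two multiplies and one addsub, yielding (ar*cr - ai*ci) + j(ai*cr + ar*ci). A scalar C99 reference sketch of the whole kernel, with an assumed renormalization cadence of 512 samples to mirror ROTATOR_RELOAD:

    #include <complex.h>  /* cabsf */
    #include <math.h>     /* rintf */
    #include <stdint.h>
    #include <volk_gnsssdr/volk_gnsssdr_complex.h>  /* lv_16sc_t, lv_32fc_t, lv_cmake, lv_creal, lv_cimag */

    static inline void rotator_reference_sketch(lv_16sc_t* out, const lv_16sc_t* in,
        const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points)
    {
        const unsigned int reload = 512;  /* renormalization cadence, as in ROTATOR_RELOAD */
        unsigned int i;
        for (i = 0; i < num_points; ++i)
            {
                const lv_16sc_t s = in[i];
                const lv_32fc_t r = lv_cmake((float)lv_creal(s), (float)lv_cimag(s)) * (*phase);  /* rotate in 32fc */
                out[i] = lv_cmake((int16_t)rintf(lv_creal(r)), (int16_t)rintf(lv_cimag(r)));      /* round to 16ic */
                (*phase) *= phase_inc;
                if ((i % reload) == (reload - 1))
                    {
                        (*phase) /= cabsf(*phase);  /* keep |phase| == 1 */
                    }
            }
    }

The vector versions process several samples per step by keeping two (SSE3) or four (NEON) consecutive phasor values in one register and advancing them by phase_inc^2 or phase_inc^4, which is why two_phase_inc holds phase_inc * phase_inc and the NEON code precomputes ___phase4.
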
- _mm_storeu_si128((__m128i*)_out, result); + // store four output samples + result = _mm_packs_epi32(c1, c2); // convert from 32ic to 16ic + _mm_storeu_si128((__m128i*)_out, result); - //next two samples - _in += 2; - _out += 4; - } - // Regenerate phase - tmp1 = _mm_mul_ps(two_phase_acc_reg, two_phase_acc_reg); - tmp2 = _mm_hadd_ps(tmp1, tmp1); - tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8); - tmp2 = _mm_sqrt_ps(tmp1); - two_phase_acc_reg = _mm_div_ps(two_phase_acc_reg, tmp2); - } + //next two samples + _in += 2; + _out += 4; + } + // Regenerate phase + tmp1 = _mm_mul_ps(two_phase_acc_reg, two_phase_acc_reg); + tmp2 = _mm_hadd_ps(tmp1, tmp1); + tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8); + tmp2 = _mm_sqrt_ps(tmp1); + two_phase_acc_reg = _mm_div_ps(two_phase_acc_reg, tmp2); + } - for (j = 0; j < sse_iters % ROTATOR_RELOAD; j++) - { - a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg - //complex 32fc multiplication b=a*two_phase_acc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic + for (j = 0; j < sse_iters % ROTATOR_RELOAD; j++) + { + a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + //complex 32fc multiplication b=a*two_phase_acc_reg + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic - //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - //next two samples - _in += 2; - a = _mm_set_ps((float)(lv_cimag(_in[1])), 
(float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg - __VOLK_GNSSSDR_PREFETCH(_in + 8); - //complex 32fc multiplication b=a*two_phase_acc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic + //next two samples + _in += 2; + a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + __VOLK_GNSSSDR_PREFETCH(_in + 8); + //complex 32fc multiplication b=a*two_phase_acc_reg + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic - //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - // store four output samples - result = _mm_packs_epi32(c1, c2);// convert from 32ic to 16ic - _mm_storeu_si128((__m128i*)_out, result); + // store four output samples + result = _mm_packs_epi32(c1, c2); // convert from 32ic to 16ic + _mm_storeu_si128((__m128i*)_out, result); - //next two samples - _in += 2; - _out += 4; - } + //next two samples + _in += 2; + _out += 4; + } - _mm_store_ps((float*)two_phase_acc, two_phase_acc_reg); - (*phase) = two_phase_acc[0]; + _mm_store_ps((float*)two_phase_acc, two_phase_acc_reg); + (*phase) = two_phase_acc[0]; - for (n = sse_iters * 4; n < num_points; ++n) - { - tmp16 = *_in++; - tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase); - *_out++ = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32))); - (*phase) *= phase_inc; - } + for (n = sse_iters * 4; n < num_points; ++n) + { + tmp16 = *_in++; + tmp32 = 
lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase); + *_out++ = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32))); + (*phase) *= phase_inc; + } } #endif /* LV_HAVE_SSE3 */ @@ -657,8 +663,10 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_neon(lv_16sc_t* outVe lv_16sc_t* _out = outVector; lv_32fc_t ___phase4 = phase_inc * phase_inc * phase_inc * phase_inc; - __VOLK_ATTR_ALIGNED(16) float32_t __phase4_real[4] = { lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4) }; - __VOLK_ATTR_ALIGNED(16) float32_t __phase4_imag[4] = { lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4) }; + __VOLK_ATTR_ALIGNED(16) + float32_t __phase4_real[4] = {lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4)}; + __VOLK_ATTR_ALIGNED(16) + float32_t __phase4_imag[4] = {lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4)}; float32x4_t _phase4_real = vld1q_f32(__phase4_real); float32x4_t _phase4_imag = vld1q_f32(__phase4_imag); @@ -667,8 +675,10 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_neon(lv_16sc_t* outVe lv_32fc_t phase3 = phase2 * phase_inc; lv_32fc_t phase4 = phase3 * phase_inc; - __VOLK_ATTR_ALIGNED(16) float32_t __phase_real[4] = { lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) }; - __VOLK_ATTR_ALIGNED(16) float32_t __phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; + __VOLK_ATTR_ALIGNED(16) + float32_t __phase_real[4] = {lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4)}; + __VOLK_ATTR_ALIGNED(16) + float32_t __phase_imag[4] = {lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4)}; float32x4_t _phase_real = vld1q_f32(__phase_real); float32x4_t _phase_imag = vld1q_f32(__phase_imag); @@ -681,7 +691,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_neon(lv_16sc_t* outVe if (neon_iters > 0) { - for(; i < neon_iters; ++i) + for (; i < neon_iters; ++i) { /* load 4 complex numbers (int 16 bits each component) */ tmp16 = vld2_s16((int16_t*)_in); @@ -745,8 +755,10 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_neon(lv_16sc_t* outVe phase3 = phase2 * phase_inc; phase4 = phase3 * phase_inc; - __VOLK_ATTR_ALIGNED(16) float32_t ____phase_real[4] = { lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) }; - __VOLK_ATTR_ALIGNED(16) float32_t ____phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; + __VOLK_ATTR_ALIGNED(16) + float32_t ____phase_real[4] = {lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4)}; + __VOLK_ATTR_ALIGNED(16) + float32_t ____phase_imag[4] = {lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4)}; _phase_real = vld1q_f32(____phase_real); _phase_imag = vld1q_f32(____phase_imag); @@ -757,7 +769,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_neon(lv_16sc_t* outVe (*phase) = lv_cmake((float32_t)__phase_real[0], (float32_t)__phase_imag[0]); } - for(i = 0; i < neon_iters % 4; ++i) + for (i = 0; i < neon_iters % 4; ++i) { tmp16_ = *_in++; tmp32_ = lv_cmake((float32_t)lv_creal(tmp16_), (float32_t)lv_cimag(tmp16_)) * (*phase); @@ -791,8 +803,10 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_neon_reload(lv_16sc_t lv_16sc_t* _out = outVector; lv_32fc_t ___phase4 = phase_inc * phase_inc * phase_inc * phase_inc; - __VOLK_ATTR_ALIGNED(16) float32_t 
__phase4_real[4] = { lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4) }; - __VOLK_ATTR_ALIGNED(16) float32_t __phase4_imag[4] = { lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4) }; + __VOLK_ATTR_ALIGNED(16) + float32_t __phase4_real[4] = {lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4)}; + __VOLK_ATTR_ALIGNED(16) + float32_t __phase4_imag[4] = {lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4)}; float32x4_t _phase4_real = vld1q_f32(__phase4_real); float32x4_t _phase4_imag = vld1q_f32(__phase4_imag); @@ -801,8 +815,10 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_neon_reload(lv_16sc_t lv_32fc_t phase3 = phase2 * phase_inc; lv_32fc_t phase4 = phase3 * phase_inc; - __VOLK_ATTR_ALIGNED(16) float32_t __phase_real[4] = { lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) }; - __VOLK_ATTR_ALIGNED(16) float32_t __phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; + __VOLK_ATTR_ALIGNED(16) + float32_t __phase_real[4] = {lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4)}; + __VOLK_ATTR_ALIGNED(16) + float32_t __phase_imag[4] = {lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4)}; float32x4_t _phase_real = vld1q_f32(__phase_real); float32x4_t _phase_imag = vld1q_f32(__phase_imag); @@ -879,8 +895,10 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_neon_reload(lv_16sc_t phase3 = phase2 * phase_inc; phase4 = phase3 * phase_inc; - __VOLK_ATTR_ALIGNED(16) float32_t ____phase_real[4] = { lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) }; - __VOLK_ATTR_ALIGNED(16) float32_t ____phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; + __VOLK_ATTR_ALIGNED(16) + float32_t ____phase_real[4] = {lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4)}; + __VOLK_ATTR_ALIGNED(16) + float32_t ____phase_imag[4] = {lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4)}; _phase_real = vld1q_f32(____phase_real); _phase_imag = vld1q_f32(____phase_imag); @@ -945,7 +963,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_neon_reload(lv_16sc_t (*phase) = lv_cmake((float32_t)__phase_real[0], (float32_t)__phase_imag[0]); } - for(i = 0; i < neon_iters % 4; ++i) + for (i = 0; i < neon_iters % 4; ++i) { tmp16_ = *_in++; tmp32_ = lv_cmake((float32_t)lv_creal(tmp16_), (float32_t)lv_cimag(tmp16_)) * (*phase); diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_dot_prod_16ic.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_dot_prod_16ic.h index 7f6219468..313824556 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_dot_prod_16ic.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_dot_prod_16ic.h @@ -73,7 +73,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_generic(lv_16sc_t* result, for (n = 0; n < num_points; n++) { lv_16sc_t tmp = in_a[n] * in_b[n]; - result[0] = lv_cmake(sat_adds16i(lv_creal(result[0]), lv_creal(tmp)), sat_adds16i(lv_cimag(result[0]), lv_cimag(tmp) )); + result[0] = lv_cmake(sat_adds16i(lv_creal(result[0]), lv_creal(tmp)), sat_adds16i(lv_cimag(result[0]), lv_cimag(tmp))); } } @@ -96,7 +96,8 @@ static inline void 
volk_gnsssdr_16ic_x2_dot_prod_16ic_a_sse2(lv_16sc_t* out, con if (sse_iters > 0) { __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, realcacc, imagcacc; - __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) + lv_16sc_t dotProductVector[4]; realcacc = _mm_setzero_si128(); imagcacc = _mm_setzero_si128(); @@ -104,25 +105,25 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_a_sse2(lv_16sc_t* out, con mask_imag = _mm_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); mask_real = _mm_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); - for(number = 0; number < sse_iters; number++) + for (number = 0; number < sse_iters; number++) { // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r] - a = _mm_load_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg + a = _mm_load_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg __VOLK_GNSSSDR_PREFETCH(_in_a + 8); b = _mm_load_si128((__m128i*)_in_b); __VOLK_GNSSSDR_PREFETCH(_in_b + 8); - c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... + c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... - c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. + c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. real = _mm_subs_epi16(c, c_sr); - b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... - a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... + b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... + a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... - imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... - imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... + imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... + imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... - imag = _mm_adds_epi16(imag1, imag2); //with saturation arithmetic! + imag = _mm_adds_epi16(imag1, imag2); //with saturation arithmetic! 
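The saturating adds (_mm_adds_epi16 here, sat_adds16i from volk_gnsssdr/saturation_arithmetic.h in the generic kernel) are what make this dot product safe for long correlations: partial sums clamp at the int16 limits instead of wrapping around and flipping sign. A scalar sketch of the operation (function name is illustrative):

    #include <stdint.h>

    /* Saturating 16-bit addition: the scalar analogue of _mm_adds_epi16. */
    static inline int16_t sat_adds16i_sketch(int16_t x, int16_t y)
    {
        const int32_t sum = (int32_t)x + (int32_t)y;  /* exact in 32 bits */
        if (sum > INT16_MAX) return INT16_MAX;        /* clamp high */
        if (sum < INT16_MIN) return INT16_MIN;        /* clamp low */
        return (int16_t)sum;
    }

The generic kernel applies this independently to the real and imaginary accumulators after every complex product, which is also why the vector variants keep real and imaginary parts in separate accumulators (realcacc, imagcacc) before masking and recombining them.
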
realcacc = _mm_adds_epi16(realcacc, real); imagcacc = _mm_adds_epi16(imagcacc, imag); @@ -136,7 +137,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_a_sse2(lv_16sc_t* out, con a = _mm_or_si128(realcacc, imagcacc); - _mm_store_si128((__m128i*)dotProductVector, a); // Store the results back into the dot product vector + _mm_store_si128((__m128i*)dotProductVector, a); // Store the results back into the dot product vector for (number = 0; number < 4; ++number) { @@ -174,7 +175,8 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_u_sse2(lv_16sc_t* out, con if (sse_iters > 0) { __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, realcacc, imagcacc, result; - __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) + lv_16sc_t dotProductVector[4]; realcacc = _mm_setzero_si128(); imagcacc = _mm_setzero_si128(); @@ -182,27 +184,27 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_u_sse2(lv_16sc_t* out, con mask_imag = _mm_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); mask_real = _mm_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); - for(number = 0; number < sse_iters; number++) + for (number = 0; number < sse_iters; number++) { //std::complex memory structure: real part -> reinterpret_cast(a)[2*i] //imaginery part -> reinterpret_cast(a)[2*i + 1] // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r] - a = _mm_loadu_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg + a = _mm_loadu_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg __VOLK_GNSSSDR_PREFETCH(_in_a + 8); b = _mm_loadu_si128((__m128i*)_in_b); __VOLK_GNSSSDR_PREFETCH(_in_b + 8); - c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... + c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... - c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. + c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. real = _mm_subs_epi16(c, c_sr); - b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... - a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... + b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... + a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... - imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... - imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... + imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... + imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... - imag = _mm_adds_epi16(imag1, imag2); //with saturation arithmetic! + imag = _mm_adds_epi16(imag1, imag2); //with saturation arithmetic! 
realcacc = _mm_adds_epi16(realcacc, real); imagcacc = _mm_adds_epi16(imagcacc, imag); @@ -216,7 +218,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_u_sse2(lv_16sc_t* out, con result = _mm_or_si128(realcacc, imagcacc); - _mm_storeu_si128((__m128i*)dotProductVector, result); // Store the results back into the dot product vector + _mm_storeu_si128((__m128i*)dotProductVector, result); // Store the results back into the dot product vector for (i = 0; i < 4; ++i) { @@ -253,7 +255,8 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_u_axv2(lv_16sc_t* out, con if (avx_iters > 0) { __m256i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, realcacc, imagcacc, result; - __VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8]; + __VOLK_ATTR_ALIGNED(32) + lv_16sc_t dotProductVector[8]; realcacc = _mm256_setzero_si256(); imagcacc = _mm256_setzero_si256(); @@ -261,7 +264,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_u_axv2(lv_16sc_t* out, con mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); - for(number = 0; number < avx_iters; number++) + for (number = 0; number < avx_iters; number++) { a = _mm256_loadu_si256((__m256i*)_in_a); __VOLK_GNSSSDR_PREFETCH(_in_a + 16); @@ -269,7 +272,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_u_axv2(lv_16sc_t* out, con __VOLK_GNSSSDR_PREFETCH(_in_b + 16); c = _mm256_mullo_epi16(a, b); - c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. + c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. real = _mm256_subs_epi16(c, c_sr); b_sl = _mm256_slli_si256(b, 2); @@ -278,7 +281,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_u_axv2(lv_16sc_t* out, con imag1 = _mm256_mullo_epi16(a, b_sl); imag2 = _mm256_mullo_epi16(b, a_sl); - imag = _mm256_adds_epi16(imag1, imag2); //with saturation arithmetic! + imag = _mm256_adds_epi16(imag1, imag2); //with saturation arithmetic! 
realcacc = _mm256_adds_epi16(realcacc, real); imagcacc = _mm256_adds_epi16(imagcacc, imag); @@ -292,7 +295,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_u_axv2(lv_16sc_t* out, con result = _mm256_or_si256(realcacc, imagcacc); - _mm256_storeu_si256((__m256i*)dotProductVector, result); // Store the results back into the dot product vector + _mm256_storeu_si256((__m256i*)dotProductVector, result); // Store the results back into the dot product vector _mm256_zeroupper(); for (i = 0; i < 8; ++i) @@ -330,7 +333,8 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_a_axv2(lv_16sc_t* out, con if (avx_iters > 0) { __m256i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, realcacc, imagcacc, result; - __VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8]; + __VOLK_ATTR_ALIGNED(32) + lv_16sc_t dotProductVector[8]; realcacc = _mm256_setzero_si256(); imagcacc = _mm256_setzero_si256(); @@ -338,7 +342,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_a_axv2(lv_16sc_t* out, con mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); - for(number = 0; number < avx_iters; number++) + for (number = 0; number < avx_iters; number++) { a = _mm256_load_si256((__m256i*)_in_a); __VOLK_GNSSSDR_PREFETCH(_in_a + 16); @@ -346,7 +350,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_a_axv2(lv_16sc_t* out, con __VOLK_GNSSSDR_PREFETCH(_in_b + 16); c = _mm256_mullo_epi16(a, b); - c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. + c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. real = _mm256_subs_epi16(c, c_sr); b_sl = _mm256_slli_si256(b, 2); @@ -355,7 +359,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_a_axv2(lv_16sc_t* out, con imag1 = _mm256_mullo_epi16(a, b_sl); imag2 = _mm256_mullo_epi16(b, a_sl); - imag = _mm256_adds_epi16(imag1, imag2); //with saturation arithmetic! + imag = _mm256_adds_epi16(imag1, imag2); //with saturation arithmetic! 
realcacc = _mm256_adds_epi16(realcacc, real); imagcacc = _mm256_adds_epi16(imagcacc, imag); @@ -369,7 +373,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_a_axv2(lv_16sc_t* out, con result = _mm256_or_si256(realcacc, imagcacc); - _mm256_store_si256((__m256i*)dotProductVector, result); // Store the results back into the dot product vector + _mm256_store_si256((__m256i*)dotProductVector, result); // Store the results back into the dot product vector _mm256_zeroupper(); for (i = 0; i < 8; ++i) @@ -397,8 +401,8 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_neon(lv_16sc_t* out, const unsigned int quarter_points = num_points / 4; unsigned int number; - lv_16sc_t* a_ptr = (lv_16sc_t*) in_a; - lv_16sc_t* b_ptr = (lv_16sc_t*) in_b; + lv_16sc_t* a_ptr = (lv_16sc_t*)in_a; + lv_16sc_t* b_ptr = (lv_16sc_t*)in_b; *out = lv_cmake((int16_t)0, (int16_t)0); if (quarter_points > 0) @@ -407,15 +411,16 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_neon(lv_16sc_t* out, const // 2nd lane holds the imaginary part int16x4x2_t a_val, b_val, c_val, accumulator; int16x4x2_t tmp_real, tmp_imag; - __VOLK_ATTR_ALIGNED(16) lv_16sc_t accum_result[4]; + __VOLK_ATTR_ALIGNED(16) + lv_16sc_t accum_result[4]; accumulator.val[0] = vdup_n_s16(0); accumulator.val[1] = vdup_n_s16(0); lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0); - for(number = 0; number < quarter_points; ++number) + for (number = 0; number < quarter_points; ++number) { - a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i - b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i + a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i + b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i __VOLK_GNSSSDR_PREFETCH(a_ptr + 8); __VOLK_GNSSSDR_PREFETCH(b_ptr + 8); @@ -451,7 +456,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_neon(lv_16sc_t* out, const } // tail case - for(number = quarter_points * 4; number < num_points; ++number) + for (number = quarter_points * 4; number < num_points; ++number) { *out += (*a_ptr++) * (*b_ptr++); } @@ -468,20 +473,21 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_neon_vma(lv_16sc_t* out, c unsigned int quarter_points = num_points / 4; unsigned int number; - lv_16sc_t* a_ptr = (lv_16sc_t*) in_a; - lv_16sc_t* b_ptr = (lv_16sc_t*) in_b; + lv_16sc_t* a_ptr = (lv_16sc_t*)in_a; + lv_16sc_t* b_ptr = (lv_16sc_t*)in_b; // for 2-lane vectors, 1st lane holds the real part, // 2nd lane holds the imaginary part int16x4x2_t a_val, b_val, accumulator; int16x4x2_t tmp; - __VOLK_ATTR_ALIGNED(16) lv_16sc_t accum_result[4]; + __VOLK_ATTR_ALIGNED(16) + lv_16sc_t accum_result[4]; accumulator.val[0] = vdup_n_s16(0); accumulator.val[1] = vdup_n_s16(0); - for(number = 0; number < quarter_points; ++number) + for (number = 0; number < quarter_points; ++number) { - a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i - b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i + a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i + b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i __VOLK_GNSSSDR_PREFETCH(a_ptr + 8); __VOLK_GNSSSDR_PREFETCH(b_ptr + 8); @@ -503,7 +509,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_neon_vma(lv_16sc_t* out, c *out = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3]; // tail case - for(number = quarter_points * 4; number < num_points; ++number) + for (number = quarter_points * 4; 
number < num_points; ++number) { *out += (*a_ptr++) * (*b_ptr++); } @@ -520,22 +526,23 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_neon_optvma(lv_16sc_t* out unsigned int quarter_points = num_points / 4; unsigned int number; - lv_16sc_t* a_ptr = (lv_16sc_t*) in_a; - lv_16sc_t* b_ptr = (lv_16sc_t*) in_b; + lv_16sc_t* a_ptr = (lv_16sc_t*)in_a; + lv_16sc_t* b_ptr = (lv_16sc_t*)in_b; // for 2-lane vectors, 1st lane holds the real part, // 2nd lane holds the imaginary part int16x4x2_t a_val, b_val, accumulator1, accumulator2; - __VOLK_ATTR_ALIGNED(16) lv_16sc_t accum_result[4]; + __VOLK_ATTR_ALIGNED(16) + lv_16sc_t accum_result[4]; accumulator1.val[0] = vdup_n_s16(0); accumulator1.val[1] = vdup_n_s16(0); accumulator2.val[0] = vdup_n_s16(0); accumulator2.val[1] = vdup_n_s16(0); - for(number = 0; number < quarter_points; ++number) + for (number = 0; number < quarter_points; ++number) { - a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i - b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i + a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i + b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i __VOLK_GNSSSDR_PREFETCH(a_ptr + 8); __VOLK_GNSSSDR_PREFETCH(b_ptr + 8); @@ -556,7 +563,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_neon_optvma(lv_16sc_t* out *out = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3]; // tail case - for(number = quarter_points * 4; number < num_points; ++number) + for (number = quarter_points * 4; number < num_points; ++number) { *out += (*a_ptr++) * (*b_ptr++); } diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_dot_prod_16ic_xn.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_dot_prod_16ic_xn.h index c1beceead..065fc75a8 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_dot_prod_16ic_xn.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_dot_prod_16ic_xn.h @@ -74,7 +74,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_generic(lv_16sc_t* resu unsigned int n; for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - result[n_vec] = lv_cmake(0,0); + result[n_vec] = lv_cmake(0, 0); for (n = 0; n < num_points; n++) { //r*a.r - i*a.i, i*a.r + r*a.i @@ -96,11 +96,11 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_generic_sat(lv_16sc_t* unsigned int n; for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - result[n_vec] = lv_cmake(0,0); + result[n_vec] = lv_cmake(0, 0); for (n = 0; n < num_points; n++) { - lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(in_common[n]), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(in_common[n]), lv_cimag(in_a[n_vec][n]))), - sat_adds16i(sat_muls16i(lv_creal(in_common[n]), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(in_common[n]), lv_creal(in_a[n_vec][n])))); + lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(in_common[n]), lv_creal(in_a[n_vec][n])), -sat_muls16i(lv_cimag(in_common[n]), lv_cimag(in_a[n_vec][n]))), + sat_adds16i(sat_muls16i(lv_creal(in_common[n]), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(in_common[n]), lv_creal(in_a[n_vec][n])))); result[n_vec] = lv_cmake(sat_adds16i(lv_creal(result[n_vec]), lv_creal(tmp)), sat_adds16i(lv_cimag(result[n_vec]), lv_cimag(tmp))); } } @@ -112,9 +112,9 @@ static inline void 
volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_generic_sat(lv_16sc_t* #ifdef LV_HAVE_SSE2 #include -static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_sse2(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points) +static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_sse2(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points) { - lv_16sc_t dotProduct = lv_cmake(0,0); + lv_16sc_t dotProduct = lv_cmake(0, 0); int n_vec; unsigned int index; const unsigned int sse_iters = num_points / 4; @@ -125,7 +125,8 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_sse2(lv_16sc_t* resul if (sse_iters > 0) { - __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) + lv_16sc_t dotProductVector[4]; __m128i* realcacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment()); __m128i* imagcacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment()); @@ -141,25 +142,25 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_sse2(lv_16sc_t* resul mask_imag = _mm_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); mask_real = _mm_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); - for(index = 0; index < sse_iters; index++) + for (index = 0; index < sse_iters; index++) { // b[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r] - b = _mm_load_si128((__m128i*)_in_common); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg + b = _mm_load_si128((__m128i*)_in_common); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg __VOLK_GNSSSDR_PREFETCH(_in_common + 8); for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - a = _mm_load_si128((__m128i*)&(_in_a[n_vec][index*4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg + a = _mm_load_si128((__m128i*)&(_in_a[n_vec][index * 4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg - c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... + c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... - c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. + c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. real = _mm_subs_epi16(c, c_sr); - c_sr = _mm_slli_si128(b, 2); // b3.r, b2.i .... - c = _mm_mullo_epi16(a, c_sr); // a3.i*b3.r, .... + c_sr = _mm_slli_si128(b, 2); // b3.r, b2.i .... + c = _mm_mullo_epi16(a, c_sr); // a3.i*b3.r, .... - c_sr = _mm_slli_si128(a, 2); // a3.r, a2.i .... - imag = _mm_mullo_epi16(b, c_sr); // b3.i*a3.r, .... + c_sr = _mm_slli_si128(a, 2); // a3.r, a2.i .... + imag = _mm_mullo_epi16(b, c_sr); // b3.i*a3.r, .... 
imag = _mm_adds_epi16(c, imag); @@ -176,12 +177,12 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_sse2(lv_16sc_t* resul a = _mm_or_si128(realcacc[n_vec], imagcacc[n_vec]); - _mm_store_si128((__m128i*)dotProductVector, a); // Store the results back into the dot product vector - dotProduct = lv_cmake(0,0); + _mm_store_si128((__m128i*)dotProductVector, a); // Store the results back into the dot product vector + dotProduct = lv_cmake(0, 0); for (index = 0; index < 4; ++index) { dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[index])), - sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[index]))); + sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[index]))); } _out[n_vec] = dotProduct; } @@ -191,12 +192,12 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_sse2(lv_16sc_t* resul for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - for(index = sse_iters * 4; index < num_points; index++) + for (index = sse_iters * 4; index < num_points; index++) { lv_16sc_t tmp = in_common[index] * in_a[n_vec][index]; _out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), - sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); + sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); } } } @@ -206,9 +207,9 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_sse2(lv_16sc_t* resul #ifdef LV_HAVE_SSE2 #include -static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_sse2(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points) +static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_sse2(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points) { - lv_16sc_t dotProduct = lv_cmake(0,0); + lv_16sc_t dotProduct = lv_cmake(0, 0); int n_vec; unsigned int index; const unsigned int sse_iters = num_points / 4; @@ -219,7 +220,8 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_sse2(lv_16sc_t* resul if (sse_iters > 0) { - __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) + lv_16sc_t dotProductVector[4]; __m128i* realcacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment()); __m128i* imagcacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment()); @@ -235,25 +237,25 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_sse2(lv_16sc_t* resul mask_imag = _mm_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); mask_real = _mm_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); - for(index = 0; index < sse_iters; index++) + for (index = 0; index < sse_iters; index++) { // b[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r] - b = _mm_loadu_si128((__m128i*)_in_common); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg + b = _mm_loadu_si128((__m128i*)_in_common); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg __VOLK_GNSSSDR_PREFETCH(_in_common + 8); for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - a = _mm_loadu_si128((__m128i*)&(_in_a[n_vec][index*4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg + a = _mm_loadu_si128((__m128i*)&(_in_a[n_vec][index * 4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg - c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... + c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... 
- c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. + c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. real = _mm_subs_epi16(c, c_sr); - c_sr = _mm_slli_si128(b, 2); // b3.r, b2.i .... - c = _mm_mullo_epi16(a, c_sr); // a3.i*b3.r, .... + c_sr = _mm_slli_si128(b, 2); // b3.r, b2.i .... + c = _mm_mullo_epi16(a, c_sr); // a3.i*b3.r, .... - c_sr = _mm_slli_si128(a, 2); // a3.r, a2.i .... - imag = _mm_mullo_epi16(b, c_sr); // b3.i*a3.r, .... + c_sr = _mm_slli_si128(a, 2); // a3.r, a2.i .... + imag = _mm_mullo_epi16(b, c_sr); // b3.i*a3.r, .... imag = _mm_adds_epi16(c, imag); @@ -270,12 +272,12 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_sse2(lv_16sc_t* resul a = _mm_or_si128(realcacc[n_vec], imagcacc[n_vec]); - _mm_store_si128((__m128i*)dotProductVector, a); // Store the results back into the dot product vector - dotProduct = lv_cmake(0,0); + _mm_store_si128((__m128i*)dotProductVector, a); // Store the results back into the dot product vector + dotProduct = lv_cmake(0, 0); for (index = 0; index < 4; ++index) { dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[index])), - sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[index]))); + sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[index]))); } _out[n_vec] = dotProduct; } @@ -285,12 +287,12 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_sse2(lv_16sc_t* resul for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - for(index = sse_iters * 4; index < num_points; index++) + for (index = sse_iters * 4; index < num_points; index++) { lv_16sc_t tmp = in_common[index] * in_a[n_vec][index]; _out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), - sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); + sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); } } } @@ -300,9 +302,9 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_sse2(lv_16sc_t* resul #ifdef LV_HAVE_AVX2 #include -static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_avx2(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points) +static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_avx2(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points) { - lv_16sc_t dotProduct = lv_cmake(0,0); + lv_16sc_t dotProduct = lv_cmake(0, 0); int n_vec; unsigned int index; const unsigned int sse_iters = num_points / 8; @@ -313,7 +315,8 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_avx2(lv_16sc_t* resul if (sse_iters > 0) { - __VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8]; + __VOLK_ATTR_ALIGNED(32) + lv_16sc_t dotProductVector[8]; __m256i* realcacc = (__m256i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256i), volk_gnsssdr_get_alignment()); __m256i* imagcacc = (__m256i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256i), volk_gnsssdr_get_alignment()); @@ -329,24 +332,24 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_avx2(lv_16sc_t* resul mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); - for(index = 0; index < sse_iters; 
index++) + for (index = 0; index < sse_iters; index++) { b = _mm256_load_si256((__m256i*)_in_common); __VOLK_GNSSSDR_PREFETCH(_in_common + 16); for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - a = _mm256_load_si256((__m256i*)&(_in_a[n_vec][index*8])); + a = _mm256_load_si256((__m256i*)&(_in_a[n_vec][index * 8])); c = _mm256_mullo_epi16(a, b); - c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. + c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. real = _mm256_subs_epi16(c, c_sr); - c_sr = _mm256_slli_si256(b, 2); // b3.r, b2.i .... - c = _mm256_mullo_epi16(a, c_sr); // a3.i*b3.r, .... + c_sr = _mm256_slli_si256(b, 2); // b3.r, b2.i .... + c = _mm256_mullo_epi16(a, c_sr); // a3.i*b3.r, .... - c_sr = _mm256_slli_si256(a, 2); // a3.r, a2.i .... - imag = _mm256_mullo_epi16(b, c_sr); // b3.i*a3.r, .... + c_sr = _mm256_slli_si256(a, 2); // a3.r, a2.i .... + imag = _mm256_mullo_epi16(b, c_sr); // b3.i*a3.r, .... imag = _mm256_adds_epi16(c, imag); @@ -363,12 +366,12 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_avx2(lv_16sc_t* resul a = _mm256_or_si256(realcacc[n_vec], imagcacc[n_vec]); - _mm256_store_si256((__m256i*)dotProductVector, a); // Store the results back into the dot product vector - dotProduct = lv_cmake(0,0); + _mm256_store_si256((__m256i*)dotProductVector, a); // Store the results back into the dot product vector + dotProduct = lv_cmake(0, 0); for (index = 0; index < 8; ++index) { dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[index])), - sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[index]))); + sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[index]))); } _out[n_vec] = dotProduct; } @@ -379,12 +382,12 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_avx2(lv_16sc_t* resul for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - for(index = sse_iters * 8; index < num_points; index++) + for (index = sse_iters * 8; index < num_points; index++) { lv_16sc_t tmp = in_common[index] * in_a[n_vec][index]; _out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), - sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); + sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); } } } @@ -394,9 +397,9 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_avx2(lv_16sc_t* resul #ifdef LV_HAVE_AVX2 #include -static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_avx2(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points) +static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_avx2(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points) { - lv_16sc_t dotProduct = lv_cmake(0,0); + lv_16sc_t dotProduct = lv_cmake(0, 0); const unsigned int sse_iters = num_points / 8; int n_vec; @@ -407,7 +410,8 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_avx2(lv_16sc_t* resul if (sse_iters > 0) { - __VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8]; + __VOLK_ATTR_ALIGNED(32) + lv_16sc_t dotProductVector[8]; __m256i* realcacc = (__m256i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256i), volk_gnsssdr_get_alignment()); __m256i* imagcacc = (__m256i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256i), volk_gnsssdr_get_alignment()); @@ -423,24 +427,24 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_avx2(lv_16sc_t* resul 
mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); - for(index = 0; index < sse_iters; index++) + for (index = 0; index < sse_iters; index++) { b = _mm256_loadu_si256((__m256i*)_in_common); __VOLK_GNSSSDR_PREFETCH(_in_common + 16); for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - a = _mm256_loadu_si256((__m256i*)&(_in_a[n_vec][index*8])); + a = _mm256_loadu_si256((__m256i*)&(_in_a[n_vec][index * 8])); c = _mm256_mullo_epi16(a, b); - c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. + c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. real = _mm256_subs_epi16(c, c_sr); - c_sr = _mm256_slli_si256(b, 2); // b3.r, b2.i .... - c = _mm256_mullo_epi16(a, c_sr); // a3.i*b3.r, .... + c_sr = _mm256_slli_si256(b, 2); // b3.r, b2.i .... + c = _mm256_mullo_epi16(a, c_sr); // a3.i*b3.r, .... - c_sr = _mm256_slli_si256(a, 2); // a3.r, a2.i .... - imag = _mm256_mullo_epi16(b, c_sr); // b3.i*a3.r, .... + c_sr = _mm256_slli_si256(a, 2); // a3.r, a2.i .... + imag = _mm256_mullo_epi16(b, c_sr); // b3.i*a3.r, .... imag = _mm256_adds_epi16(c, imag); @@ -457,12 +461,12 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_avx2(lv_16sc_t* resul a = _mm256_or_si256(realcacc[n_vec], imagcacc[n_vec]); - _mm256_store_si256((__m256i*)dotProductVector, a); // Store the results back into the dot product vector - dotProduct = lv_cmake(0,0); + _mm256_store_si256((__m256i*)dotProductVector, a); // Store the results back into the dot product vector + dotProduct = lv_cmake(0, 0); for (index = 0; index < 8; ++index) { dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[index])), - sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[index]))); + sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[index]))); } _out[n_vec] = dotProduct; } @@ -473,12 +477,12 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_avx2(lv_16sc_t* resul for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - for(index = sse_iters * 8; index < num_points; index++) + for (index = sse_iters * 8; index < num_points; index++) { lv_16sc_t tmp = in_common[index] * in_a[n_vec][index]; _out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), - sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); + sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); } } } @@ -488,9 +492,9 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_avx2(lv_16sc_t* resul #ifdef LV_HAVE_NEON #include -static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points) +static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points) { - lv_16sc_t dotProduct = lv_cmake(0,0); + lv_16sc_t dotProduct = lv_cmake(0, 0); int n_vec; unsigned int index; const unsigned int neon_iters = num_points / 4; @@ -501,7 +505,8 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon(lv_16sc_t* result, if (neon_iters > 0) { - __VOLK_ATTR_ALIGNED(16) lv_16sc_t 
dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) + lv_16sc_t dotProductVector[4]; int16x4x2_t a_val, b_val, c_val; @@ -509,19 +514,19 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon(lv_16sc_t* result, int16x4x2_t tmp_real, tmp_imag; - for(n_vec = 0; n_vec < num_a_vectors; n_vec++) + for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { accumulator[n_vec].val[0] = vdup_n_s16(0); accumulator[n_vec].val[1] = vdup_n_s16(0); } - for(index = 0; index < neon_iters; index++) + for (index = 0; index < neon_iters; index++) { - b_val = vld2_s16((int16_t*)_in_common); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg + b_val = vld2_s16((int16_t*)_in_common); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg __VOLK_GNSSSDR_PREFETCH(_in_common + 8); for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - a_val = vld2_s16((int16_t*)&(_in_a[n_vec][index*4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg + a_val = vld2_s16((int16_t*)&(_in_a[n_vec][index * 4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg //__VOLK_GNSSSDR_PREFETCH(&_in_a[n_vec][index*4] + 8); // multiply the real*real and imag*imag to get real result @@ -547,12 +552,12 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon(lv_16sc_t* result, for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - vst2_s16((int16_t*)dotProductVector, accumulator[n_vec]); // Store the results back into the dot product vector - dotProduct = lv_cmake(0,0); + vst2_s16((int16_t*)dotProductVector, accumulator[n_vec]); // Store the results back into the dot product vector + dotProduct = lv_cmake(0, 0); for (index = 0; index < 4; ++index) { dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[index])), - sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[index]))); + sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[index]))); } _out[n_vec] = dotProduct; } @@ -561,12 +566,12 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon(lv_16sc_t* result, for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - for(index = neon_iters * 4; index < num_points; index++) + for (index = neon_iters * 4; index < num_points; index++) { lv_16sc_t tmp = in_common[index] * in_a[n_vec][index]; _out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), - sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); + sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); } } } @@ -576,9 +581,9 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon(lv_16sc_t* result, #ifdef LV_HAVE_NEON #include -static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_vma(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points) +static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_vma(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points) { - lv_16sc_t dotProduct = lv_cmake(0,0); + lv_16sc_t dotProduct = lv_cmake(0, 0); const unsigned int neon_iters = num_points / 4; int n_vec; @@ -589,25 +594,26 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_vma(lv_16sc_t* res if (neon_iters > 0) { - __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) + lv_16sc_t dotProductVector[4]; int16x4x2_t a_val, b_val, tmp; int16x4x2_t* accumulator = (int16x4x2_t*)volk_gnsssdr_malloc(num_a_vectors * sizeof(int16x4x2_t), volk_gnsssdr_get_alignment()); - for(n_vec = 0; n_vec < num_a_vectors; n_vec++) + for (n_vec = 0; n_vec < num_a_vectors; 
n_vec++) { accumulator[n_vec].val[0] = vdup_n_s16(0); accumulator[n_vec].val[1] = vdup_n_s16(0); } - for(index = 0; index < neon_iters; index++) + for (index = 0; index < neon_iters; index++) { - b_val = vld2_s16((int16_t*)_in_common); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg + b_val = vld2_s16((int16_t*)_in_common); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg __VOLK_GNSSSDR_PREFETCH(_in_common + 8); for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - a_val = vld2_s16((int16_t*)&(_in_a[n_vec][index*4])); + a_val = vld2_s16((int16_t*)&(_in_a[n_vec][index * 4])); tmp.val[0] = vmul_s16(a_val.val[0], b_val.val[0]); tmp.val[1] = vmul_s16(a_val.val[1], b_val.val[0]); @@ -624,12 +630,12 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_vma(lv_16sc_t* res for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - vst2_s16((int16_t*)dotProductVector, accumulator[n_vec]); // Store the results back into the dot product vector - dotProduct = lv_cmake(0,0); + vst2_s16((int16_t*)dotProductVector, accumulator[n_vec]); // Store the results back into the dot product vector + dotProduct = lv_cmake(0, 0); for (index = 0; index < 4; ++index) { dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[index])), - sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[index]))); + sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[index]))); } _out[n_vec] = dotProduct; } @@ -638,12 +644,12 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_vma(lv_16sc_t* res for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - for(index = neon_iters * 4; index < num_points; index++) + for (index = neon_iters * 4; index < num_points; index++) { lv_16sc_t tmp = in_common[index] * in_a[n_vec][index]; _out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), - sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); + sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); } } } @@ -653,9 +659,9 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_vma(lv_16sc_t* res #ifdef LV_HAVE_NEON #include -static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_optvma(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points) +static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_optvma(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points) { - lv_16sc_t dotProduct = lv_cmake(0,0); + lv_16sc_t dotProduct = lv_cmake(0, 0); const unsigned int neon_iters = num_points / 4; int n_vec; @@ -666,14 +672,15 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_optvma(lv_16sc_t* if (neon_iters > 0) { - __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) + lv_16sc_t dotProductVector[4]; int16x4x2_t a_val, b_val; int16x4x2_t* accumulator1 = (int16x4x2_t*)volk_gnsssdr_malloc(num_a_vectors * sizeof(int16x4x2_t), volk_gnsssdr_get_alignment()); int16x4x2_t* accumulator2 = (int16x4x2_t*)volk_gnsssdr_malloc(num_a_vectors * sizeof(int16x4x2_t), volk_gnsssdr_get_alignment()); - for(n_vec = 0; n_vec < num_a_vectors; n_vec++) + for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { accumulator1[n_vec].val[0] = vdup_n_s16(0); accumulator1[n_vec].val[1] = vdup_n_s16(0); @@ -681,13 +688,13 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_optvma(lv_16sc_t* accumulator2[n_vec].val[1] = vdup_n_s16(0); } - for(index = 0; index < neon_iters; index++) + for (index = 0; index < neon_iters; 
index++) { - b_val = vld2_s16((int16_t*)_in_common); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg + b_val = vld2_s16((int16_t*)_in_common); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg __VOLK_GNSSSDR_PREFETCH(_in_common + 8); for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - a_val = vld2_s16((int16_t*)&(_in_a[n_vec][index*4])); + a_val = vld2_s16((int16_t*)&(_in_a[n_vec][index * 4])); accumulator1[n_vec].val[0] = vmla_s16(accumulator1[n_vec].val[0], a_val.val[0], b_val.val[0]); accumulator1[n_vec].val[1] = vmla_s16(accumulator1[n_vec].val[1], a_val.val[0], b_val.val[1]); @@ -705,12 +712,12 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_optvma(lv_16sc_t* for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - vst2_s16((int16_t*)dotProductVector, accumulator1[n_vec]); // Store the results back into the dot product vector - dotProduct = lv_cmake(0,0); + vst2_s16((int16_t*)dotProductVector, accumulator1[n_vec]); // Store the results back into the dot product vector + dotProduct = lv_cmake(0, 0); for (index = 0; index < 4; ++index) { dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[index])), - sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[index]))); + sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[index]))); } _out[n_vec] = dotProduct; } @@ -720,12 +727,12 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_optvma(lv_16sc_t* for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - for(index = neon_iters * 4; index < num_points; index++) + for (index = neon_iters * 4; index < num_points; index++) { lv_16sc_t tmp = in_common[index] * in_a[n_vec][index]; _out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), - sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); + sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); } } } diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic.h index 549fff25d..ad2ec4a77 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic.h @@ -47,22 +47,22 @@ static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_generic(lv_16sc_t* int num_a_vectors = 3; lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); unsigned int n; - for(n = 0; n < num_a_vectors; n++) - { - in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); - memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points); - } + for (n = 0; n < num_a_vectors; n++) + { + in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); + memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points); + } - volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_generic(result, local_code, (const lv_16sc_t**) in_a, num_a_vectors, num_points); + volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_generic(result, local_code, (const lv_16sc_t**)in_a, num_a_vectors, num_points); - for(n = 0; n < num_a_vectors; n++) - { - volk_gnsssdr_free(in_a[n]); - } + for (n = 0; n < num_a_vectors; n++) + { + volk_gnsssdr_free(in_a[n]); + } volk_gnsssdr_free(in_a); } -#endif /* 
Generic */ +#endif /* Generic */ #ifdef LV_HAVE_GENERIC @@ -71,22 +71,22 @@ static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_generic_sat(lv_16sc int num_a_vectors = 3; lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); unsigned int n; - for(n = 0; n < num_a_vectors; n++) - { - in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); - memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points); - } + for (n = 0; n < num_a_vectors; n++) + { + in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); + memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points); + } - volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_generic_sat(result, local_code, (const lv_16sc_t**) in_a, num_a_vectors, num_points); + volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_generic_sat(result, local_code, (const lv_16sc_t**)in_a, num_a_vectors, num_points); - for(n = 0; n < num_a_vectors; n++) - { - volk_gnsssdr_free(in_a[n]); - } + for (n = 0; n < num_a_vectors; n++) + { + volk_gnsssdr_free(in_a[n]); + } volk_gnsssdr_free(in_a); } -#endif /* Generic */ +#endif /* Generic */ #ifdef LV_HAVE_SSE2 @@ -95,18 +95,18 @@ static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_a_sse2(lv_16sc_t* r int num_a_vectors = 3; lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); unsigned int n; - for(n = 0; n < num_a_vectors; n++) - { - in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); - memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points); - } + for (n = 0; n < num_a_vectors; n++) + { + in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); + memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points); + } - volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_sse2(result, local_code, (const lv_16sc_t**) in_a, num_a_vectors, num_points); + volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_sse2(result, local_code, (const lv_16sc_t**)in_a, num_a_vectors, num_points); - for(n = 0; n < num_a_vectors; n++) - { - volk_gnsssdr_free(in_a[n]); - } + for (n = 0; n < num_a_vectors; n++) + { + volk_gnsssdr_free(in_a[n]); + } volk_gnsssdr_free(in_a); } @@ -120,18 +120,18 @@ static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_u_sse2(lv_16sc_t* r int num_a_vectors = 3; lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); unsigned int n; - for(n = 0; n < num_a_vectors; n++) - { - in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t)*num_points, volk_gnsssdr_get_alignment()); - memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t)*num_points); - } + for (n = 0; n < num_a_vectors; n++) + { + in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); + memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points); + } - volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_sse2(result, local_code, (const lv_16sc_t**) in_a, num_a_vectors, num_points); + volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_sse2(result, local_code, (const lv_16sc_t**)in_a, num_a_vectors, num_points); - for(n = 0; n < num_a_vectors; n++) - { - volk_gnsssdr_free(in_a[n]); - } + for (n = 0; n < num_a_vectors; n++) + { + volk_gnsssdr_free(in_a[n]); + } volk_gnsssdr_free(in_a); } @@ 
-145,18 +145,18 @@ static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_a_avx2(lv_16sc_t* r int num_a_vectors = 3; lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); unsigned int n; - for(n = 0; n < num_a_vectors; n++) - { - in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t)*num_points, volk_gnsssdr_get_alignment()); - memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t)*num_points); - } + for (n = 0; n < num_a_vectors; n++) + { + in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); + memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points); + } - volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_avx2(result, local_code, (const lv_16sc_t**) in_a, num_a_vectors, num_points); + volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_avx2(result, local_code, (const lv_16sc_t**)in_a, num_a_vectors, num_points); - for(n = 0; n < num_a_vectors; n++) - { - volk_gnsssdr_free(in_a[n]); - } + for (n = 0; n < num_a_vectors; n++) + { + volk_gnsssdr_free(in_a[n]); + } volk_gnsssdr_free(in_a); } @@ -170,18 +170,18 @@ static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_u_avx2(lv_16sc_t* r int num_a_vectors = 3; lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); unsigned int n; - for(n = 0; n < num_a_vectors; n++) - { - in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t)*num_points, volk_gnsssdr_get_alignment()); - memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t)*num_points); - } + for (n = 0; n < num_a_vectors; n++) + { + in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); + memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points); + } - volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_avx2(result, local_code, (const lv_16sc_t**) in_a, num_a_vectors, num_points); + volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_avx2(result, local_code, (const lv_16sc_t**)in_a, num_a_vectors, num_points); - for(n = 0; n < num_a_vectors; n++) - { - volk_gnsssdr_free(in_a[n]); - } + for (n = 0; n < num_a_vectors; n++) + { + volk_gnsssdr_free(in_a[n]); + } volk_gnsssdr_free(in_a); } @@ -195,22 +195,22 @@ static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_neon(lv_16sc_t* res int num_a_vectors = 3; lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); unsigned int n; - for(n = 0; n < num_a_vectors; n++) - { - in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t)*num_points, volk_gnsssdr_get_alignment()); - memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t)*num_points); - } + for (n = 0; n < num_a_vectors; n++) + { + in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); + memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points); + } - volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon(result, local_code, (const lv_16sc_t**) in_a, num_a_vectors, num_points); + volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon(result, local_code, (const lv_16sc_t**)in_a, num_a_vectors, num_points); - for(n = 0; n < num_a_vectors; n++) - { - volk_gnsssdr_free(in_a[n]); - } + for (n = 0; n < num_a_vectors; n++) + { + volk_gnsssdr_free(in_a[n]); + } volk_gnsssdr_free(in_a); } -#endif // NEON +#endif // NEON #ifdef LV_HAVE_NEON @@ -220,22 +220,22 @@ static inline void 
volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_neon_vma(lv_16sc_t* int num_a_vectors = 3; lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); unsigned int n; - for(n = 0; n < num_a_vectors; n++) - { - in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t)*num_points, volk_gnsssdr_get_alignment()); - memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t)*num_points); - } + for (n = 0; n < num_a_vectors; n++) + { + in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); + memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points); + } - volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_vma(result, local_code, (const lv_16sc_t**) in_a, num_a_vectors, num_points); + volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_vma(result, local_code, (const lv_16sc_t**)in_a, num_a_vectors, num_points); - for(n = 0; n < num_a_vectors; n++) - { - volk_gnsssdr_free(in_a[n]); - } + for (n = 0; n < num_a_vectors; n++) + { + volk_gnsssdr_free(in_a[n]); + } volk_gnsssdr_free(in_a); } -#endif // NEON +#endif // NEON #ifdef LV_HAVE_NEON @@ -244,23 +244,21 @@ static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_neon_optvma(lv_16sc int num_a_vectors = 3; lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); unsigned int n; - for(n = 0; n < num_a_vectors; n++) - { - in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t)*num_points, volk_gnsssdr_get_alignment()); - memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t)*num_points); - } + for (n = 0; n < num_a_vectors; n++) + { + in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); + memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points); + } - volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_optvma(result, local_code, (const lv_16sc_t**) in_a, num_a_vectors, num_points); + volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_optvma(result, local_code, (const lv_16sc_t**)in_a, num_a_vectors, num_points); - for(n = 0; n < num_a_vectors; n++) - { - volk_gnsssdr_free(in_a[n]); - } + for (n = 0; n < num_a_vectors; n++) + { + volk_gnsssdr_free(in_a[n]); + } volk_gnsssdr_free(in_a); } -#endif // NEON +#endif // NEON #endif // INCLUDED_volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_H - - diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_multiply_16ic.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_multiply_16ic.h index 2f1036953..596c13bf5 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_multiply_16ic.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_multiply_16ic.h @@ -91,29 +91,29 @@ static inline void volk_gnsssdr_16ic_x2_multiply_16ic_a_sse2(lv_16sc_t* out, con const lv_16sc_t* _in_a = in_a; const lv_16sc_t* _in_b = in_b; lv_16sc_t* _out = out; - for(number = 0; number < sse_iters; number++) + for (number = 0; number < sse_iters; number++) { //std::complex memory structure: real part -> reinterpret_cast(a)[2*i] //imaginery part -> reinterpret_cast(a)[2*i + 1] // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r] - a = _mm_load_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg + a = _mm_load_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits 
reg b = _mm_load_si128((__m128i*)_in_b); - c = _mm_mullo_epi16 (a, b); // a3.i*b3.i, a3.r*b3.r, .... + c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... - c_sr = _mm_srli_si128 (c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. - real = _mm_subs_epi16 (c, c_sr); - real = _mm_and_si128 (real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i + c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. + real = _mm_subs_epi16(c, c_sr); + real = _mm_and_si128(real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i - b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... - a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... + b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... + a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... - imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... - imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... + imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... + imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... imag = _mm_adds_epi16(imag1, imag2); - imag = _mm_and_si128 (imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ... + imag = _mm_and_si128(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ... - result = _mm_or_si128 (real, imag); + result = _mm_or_si128(real, imag); _mm_store_si128((__m128i*)_out, result); @@ -137,7 +137,7 @@ static inline void volk_gnsssdr_16ic_x2_multiply_16ic_u_sse2(lv_16sc_t* out, con { const unsigned int sse_iters = num_points / 4; unsigned int number; - __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1,imag2, b_sl, a_sl, result; + __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, result; mask_imag = _mm_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); mask_real = _mm_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); @@ -145,29 +145,29 @@ static inline void volk_gnsssdr_16ic_x2_multiply_16ic_u_sse2(lv_16sc_t* out, con const lv_16sc_t* _in_a = in_a; const lv_16sc_t* _in_b = in_b; lv_16sc_t* _out = out; - for(number = 0; number < sse_iters; number++) + for (number = 0; number < sse_iters; number++) { //std::complex memory structure: real part -> reinterpret_cast(a)[2*i] //imaginery part -> reinterpret_cast(a)[2*i + 1] // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r] - a = _mm_loadu_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg + a = _mm_loadu_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg b = _mm_loadu_si128((__m128i*)_in_b); - c = _mm_mullo_epi16 (a, b); // a3.i*b3.i, a3.r*b3.r, .... + c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... - c_sr = _mm_srli_si128 (c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. - real = _mm_subs_epi16 (c, c_sr); - real = _mm_and_si128 (real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i + c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. + real = _mm_subs_epi16(c, c_sr); + real = _mm_and_si128(real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i - b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... - a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... + b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... + a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... - imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... - imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... 
+ imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... + imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... imag = _mm_adds_epi16(imag1, imag2); - imag = _mm_and_si128 (imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ... + imag = _mm_and_si128(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ... - result = _mm_or_si128 (real, imag); + result = _mm_or_si128(real, imag); _mm_storeu_si128((__m128i*)_out, result); @@ -196,29 +196,29 @@ static inline void volk_gnsssdr_16ic_x2_multiply_16ic_u_avx2(lv_16sc_t* out, con const lv_16sc_t* _in_b = in_b; lv_16sc_t* _out = out; - __m256i a, b, c, c_sr, real, imag, imag1, imag2, b_sl, a_sl, result; + __m256i a, b, c, c_sr, real, imag, imag1, imag2, b_sl, a_sl, result; const __m256i mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); const __m256i mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); - for(;number < avx2_points; number++) + for (; number < avx2_points; number++) { - a = _mm256_loadu_si256((__m256i*)_in_a); // Load the ar + ai, br + bi as ar,ai,br,bi - b = _mm256_loadu_si256((__m256i*)_in_b); // Load the cr + ci, dr + di as cr,ci,dr,di + a = _mm256_loadu_si256((__m256i*)_in_a); // Load the ar + ai, br + bi as ar,ai,br,bi + b = _mm256_loadu_si256((__m256i*)_in_b); // Load the cr + ci, dr + di as cr,ci,dr,di c = _mm256_mullo_epi16(a, b); - c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. + c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. real = _mm256_subs_epi16(c, c_sr); - real = _mm256_and_si256(real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i + real = _mm256_and_si256(real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i - b_sl = _mm256_slli_si256(b, 2); // b3.r, b2.i .... - a_sl = _mm256_slli_si256(a, 2); // a3.r, a2.i .... + b_sl = _mm256_slli_si256(b, 2); // b3.r, b2.i .... + a_sl = _mm256_slli_si256(a, 2); // a3.r, a2.i .... - imag1 = _mm256_mullo_epi16(a, b_sl); // a3.i*b3.r, .... - imag2 = _mm256_mullo_epi16(b, a_sl); // b3.i*a3.r, .... + imag1 = _mm256_mullo_epi16(a, b_sl); // a3.i*b3.r, .... + imag2 = _mm256_mullo_epi16(b, a_sl); // b3.i*a3.r, .... imag = _mm256_adds_epi16(imag1, imag2); - imag = _mm256_and_si256(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ... + imag = _mm256_and_si256(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ... 
result = _mm256_or_si256(real, imag); @@ -230,7 +230,7 @@ static inline void volk_gnsssdr_16ic_x2_multiply_16ic_u_avx2(lv_16sc_t* out, con } _mm256_zeroupper(); number = avx2_points * 8; - for(;number < num_points; number++) + for (; number < num_points; number++) { *_out++ = (*_in_a++) * (*_in_b++); } @@ -250,29 +250,29 @@ static inline void volk_gnsssdr_16ic_x2_multiply_16ic_a_avx2(lv_16sc_t* out, con const lv_16sc_t* _in_b = in_b; lv_16sc_t* _out = out; - __m256i a, b, c, c_sr, real, imag, imag1, imag2, b_sl, a_sl, result; + __m256i a, b, c, c_sr, real, imag, imag1, imag2, b_sl, a_sl, result; const __m256i mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); const __m256i mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); - for(;number < avx2_points; number++) + for (; number < avx2_points; number++) { - a = _mm256_load_si256((__m256i*)_in_a); // Load the ar + ai, br + bi as ar,ai,br,bi - b = _mm256_load_si256((__m256i*)_in_b); // Load the cr + ci, dr + di as cr,ci,dr,di + a = _mm256_load_si256((__m256i*)_in_a); // Load the ar + ai, br + bi as ar,ai,br,bi + b = _mm256_load_si256((__m256i*)_in_b); // Load the cr + ci, dr + di as cr,ci,dr,di c = _mm256_mullo_epi16(a, b); - c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. + c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. real = _mm256_subs_epi16(c, c_sr); - real = _mm256_and_si256(real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i + real = _mm256_and_si256(real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i - b_sl = _mm256_slli_si256(b, 2); // b3.r, b2.i .... - a_sl = _mm256_slli_si256(a, 2); // a3.r, a2.i .... + b_sl = _mm256_slli_si256(b, 2); // b3.r, b2.i .... + a_sl = _mm256_slli_si256(a, 2); // a3.r, a2.i .... - imag1 = _mm256_mullo_epi16(a, b_sl); // a3.i*b3.r, .... - imag2 = _mm256_mullo_epi16(b, a_sl); // b3.i*a3.r, .... + imag1 = _mm256_mullo_epi16(a, b_sl); // a3.i*b3.r, .... + imag2 = _mm256_mullo_epi16(b, a_sl); // b3.i*a3.r, .... imag = _mm256_adds_epi16(imag1, imag2); - imag = _mm256_and_si256(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ... + imag = _mm256_and_si256(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ... 
result = _mm256_or_si256(real, imag); @@ -284,7 +284,7 @@ static inline void volk_gnsssdr_16ic_x2_multiply_16ic_a_avx2(lv_16sc_t* out, con } _mm256_zeroupper(); number = avx2_points * 8; - for(;number < num_points; number++) + for (; number < num_points; number++) { *_out++ = (*_in_a++) * (*_in_b++); } @@ -292,23 +292,22 @@ static inline void volk_gnsssdr_16ic_x2_multiply_16ic_a_avx2(lv_16sc_t* out, con #endif /* LV_HAVE_AVX2 */ - #ifdef LV_HAVE_NEON #include <arm_neon.h> static inline void volk_gnsssdr_16ic_x2_multiply_16ic_neon(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points) { - lv_16sc_t *a_ptr = (lv_16sc_t*) in_a; - lv_16sc_t *b_ptr = (lv_16sc_t*) in_b; + lv_16sc_t* a_ptr = (lv_16sc_t*)in_a; + lv_16sc_t* b_ptr = (lv_16sc_t*)in_b; unsigned int quarter_points = num_points / 4; int16x4x2_t a_val, b_val, c_val; int16x4x2_t tmp_real, tmp_imag; unsigned int number = 0; - for(number = 0; number < quarter_points; ++number) + for (number = 0; number < quarter_points; ++number) { - a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i - b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i + a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i + b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i __VOLK_GNSSSDR_PREFETCH(a_ptr + 4); __VOLK_GNSSSDR_PREFETCH(b_ptr + 4); @@ -334,7 +333,7 @@ static inline void volk_gnsssdr_16ic_x2_multiply_16ic_neon(lv_16sc_t* out, const out += 4; } - for(number = quarter_points * 4; number < num_points; number++) + for (number = quarter_points * 4; number < num_points; number++) { *out++ = (*a_ptr++) * (*b_ptr++); } diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn.h index 0cfc9df61..60b5b7b38 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn.h @@ -85,11 +85,11 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_generic(lv_16sc unsigned int n; for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - result[n_vec] = lv_cmake(0,0); + result[n_vec] = lv_cmake(0, 0); } for (n = 0; n < num_points; n++) { - tmp16 = *in_common++; //if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); + tmp16 = *in_common++; //if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase); tmp16 = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32))); @@ -130,14 +130,14 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_generic_reload( const unsigned int ROTATOR_RELOAD = 256; for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - result[n_vec] = lv_cmake(0,0); + result[n_vec] = lv_cmake(0, 0); } for (n = 0; n < num_points / ROTATOR_RELOAD; n++) { for (j = 0; j < ROTATOR_RELOAD; j++) { - tmp16 = *in_common++; //if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); + tmp16 = *in_common++; //if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); tmp32 = lv_cmake((float)lv_creal(tmp16),
(float)lv_cimag(tmp16)) * (*phase); tmp16 = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32))); (*phase) *= phase_inc; @@ -148,7 +148,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_generic_reload( result[n_vec] = lv_cmake(sat_adds16i(lv_creal(result[n_vec]), lv_creal(tmp)), sat_adds16i(lv_cimag(result[n_vec]), lv_cimag(tmp))); } } - /* Regenerate phase */ + /* Regenerate phase */ #ifdef __cplusplus (*phase) /= std::abs((*phase)); #else @@ -159,13 +159,13 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_generic_reload( for (j = 0; j < num_points % ROTATOR_RELOAD; j++) { - tmp16 = *in_common++; //if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); + tmp16 = *in_common++; //if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase); tmp16 = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32))); (*phase) *= phase_inc; for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - lv_16sc_t tmp = tmp16 * in_a[n_vec][ (num_points / ROTATOR_RELOAD) * ROTATOR_RELOAD + j ]; + lv_16sc_t tmp = tmp16 * in_a[n_vec][(num_points / ROTATOR_RELOAD) * ROTATOR_RELOAD + j]; //lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n])))); result[n_vec] = lv_cmake(sat_adds16i(lv_creal(result[n_vec]), lv_creal(tmp)), sat_adds16i(lv_cimag(result[n_vec]), lv_cimag(tmp))); } @@ -178,9 +178,9 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_generic_reload( #ifdef LV_HAVE_SSE3 #include <pmmintrin.h> -static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points) +static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points) { - lv_16sc_t dotProduct = lv_cmake(0,0); + lv_16sc_t dotProduct = lv_cmake(0, 0); const unsigned int sse_iters = num_points / 4; int n_vec; @@ -191,7 +191,8 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc_ const lv_16sc_t* _in_common = in_common; lv_16sc_t* _out = result; - __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) + lv_16sc_t dotProductVector[4]; __m128i* realcacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment()); __m128i* imagcacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment()); @@ -210,11 +211,13 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc_ // phase rotation registers __m128 pa, pb, two_phase_acc_reg, two_phase_inc_reg; __m128i pc1, pc2; - __VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_inc[2]; + __VOLK_ATTR_ALIGNED(16) + lv_32fc_t two_phase_inc[2]; two_phase_inc[0] = phase_inc * phase_inc; two_phase_inc[1] = phase_inc * phase_inc; - two_phase_inc_reg = _mm_load_ps((float*) two_phase_inc); - __VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_acc[2]; + two_phase_inc_reg = _mm_load_ps((float*)two_phase_inc); +
__VOLK_ATTR_ALIGNED(16) + lv_32fc_t two_phase_acc[2]; two_phase_acc[0] = (*phase); two_phase_acc[1] = (*phase) * phase_inc; two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc); @@ -222,69 +225,69 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc_ lv_16sc_t tmp16; lv_32fc_t tmp32; - for(number = 0; number < sse_iters; number++) + for (number = 0; number < sse_iters; number++) { // Phase rotation on operand in_common starts here: //printf("generic phase %i: %f,%f\n", n*4,lv_creal(*phase),lv_cimag(*phase)); - pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg //complex 32fc multiplication b=a*two_phase_acc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - pc1 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + pc1 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di //next two samples _in_common += 2; - pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg 
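As in the _mm_set_ps line just above, every rotator kernel widens two 16-bit complex samples to single precision before the phase multiply. _mm_set_ps fills the register from the highest lane down, which is why the imaginary part of sample 1 appears first in the argument list. A scalar picture of the resulting lane order (a sketch; widen_two_samples is an illustrative name, not a library function):

    #include <volk_gnsssdr/volk_gnsssdr_complex.h>

    static void widen_two_samples(const lv_16sc_t* in, float lane[4])
    {
        lane[0] = (float)lv_creal(in[0]); /* lowest lane of pa  */
        lane[1] = (float)lv_cimag(in[0]);
        lane[2] = (float)lv_creal(in[1]);
        lane[3] = (float)lv_cimag(in[1]); /* highest lane of pa */
        /* i.e. pa = _mm_set_ps(lane[3], lane[2], lane[1], lane[0]) */
    }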
__VOLK_GNSSSDR_PREFETCH(_in_common + 8); //complex 32fc multiplication b=a*two_phase_acc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - pc2 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + pc2 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di // store four rotated in_common samples in the register b - b = _mm_packs_epi32(pc1, pc2);// convert from 32ic to 16ic + b = _mm_packs_epi32(pc1, pc2); // convert from 32ic to 16ic //next two samples _in_common += 2; for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - a = _mm_load_si128((__m128i*)&(_in_a[n_vec][number*4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg + a = _mm_load_si128((__m128i*)&(_in_a[n_vec][number * 4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg - c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... + c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... - c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. + c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. real = _mm_subs_epi16(c, c_sr); - b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... - a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... + b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... + a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... - imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... - imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... + imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... + imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... 
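The rotation itself is the standard SSE3 complex product: _mm_moveldup_ps duplicates the real parts of the phase register, _mm_movehdup_ps duplicates its imaginary parts, and _mm_addsub_ps merges the two partial products with alternating signs, as the comments above spell out. Per sample the arithmetic reduces to the following scalar model (a sketch, not part of the patch):

    /* (xr + j*xi) * (cr + j*ci), decomposed the way the intrinsics do it */
    static void cmul_ps_model(float xr, float xi, float cr, float ci,
        float* re, float* im)
    {
        *re = xr * cr - xi * ci; /* even lanes of _mm_addsub_ps: tmp1 - tmp2 */
        *im = xi * cr + xr * ci; /* odd lanes of _mm_addsub_ps:  tmp1 + tmp2 */
    }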
imag = _mm_adds_epi16(imag1, imag2); @@ -309,12 +312,12 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc_ a = _mm_or_si128(realcacc[n_vec], imagcacc[n_vec]); - _mm_store_si128((__m128i*)dotProductVector, a); // Store the results back into the dot product vector - dotProduct = lv_cmake(0,0); + _mm_store_si128((__m128i*)dotProductVector, a); // Store the results back into the dot product vector + dotProduct = lv_cmake(0, 0); for (i = 0; i < 4; ++i) { dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[i])), - sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i]))); + sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i]))); } _out[n_vec] = dotProduct; } @@ -331,7 +334,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc_ //(*phase) = lv_cmake((float*)two_phase_acc[0], (float*)two_phase_acc[1]); (*phase) = two_phase_acc[0]; - for(n = sse_iters * 4; n < num_points; n++) + for (n = sse_iters * 4; n < num_points; n++) { tmp16 = in_common[n]; //printf("a_sse phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase); @@ -343,7 +346,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc_ lv_16sc_t tmp = tmp16 * in_a[n_vec][n]; //lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n])))); _out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), - sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); + sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); } } } @@ -353,9 +356,9 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc_ #ifdef LV_HAVE_SSE3 #include <pmmintrin.h> -static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points) +static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points) { - lv_16sc_t dotProduct = lv_cmake(0,0); + lv_16sc_t dotProduct = lv_cmake(0, 0); const unsigned int sse_iters = num_points / 4; const unsigned int ROTATOR_RELOAD = 128; @@ -369,7 +372,8 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(l const lv_16sc_t* _in_common = in_common; lv_16sc_t* _out = result; - __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) + lv_16sc_t dotProductVector[4]; __m128i* realcacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment()); __m128i* imagcacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment()); @@ -388,11 +392,13 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(l // phase rotation registers __m128 pa, pb, two_phase_acc_reg, two_phase_inc_reg; __m128i pc1, pc2; - __VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_inc[2]; + __VOLK_ATTR_ALIGNED(16) + lv_32fc_t two_phase_inc[2]; two_phase_inc[0] = phase_inc * phase_inc; two_phase_inc[1] = phase_inc * phase_inc; - two_phase_inc_reg =
_mm_load_ps((float*) two_phase_inc); - __VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_acc[2]; + two_phase_inc_reg = _mm_load_ps((float*)two_phase_inc); + __VOLK_ATTR_ALIGNED(16) + lv_32fc_t two_phase_acc[2]; two_phase_acc[0] = (*phase); two_phase_acc[1] = (*phase) * phase_inc; two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc); @@ -400,71 +406,71 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(l lv_16sc_t tmp16; lv_32fc_t tmp32; - for (number = 0; number < sse_iters / ROTATOR_RELOAD; ++number) + for (number = 0; number < sse_iters / ROTATOR_RELOAD; ++number) { for (j = 0; j < ROTATOR_RELOAD; j++) { // Phase rotation on operand in_common starts here: //printf("generic phase %i: %f,%f\n", n*4,lv_creal(*phase),lv_cimag(*phase)); - pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg //complex 32fc multiplication b=a*two_phase_acc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - pc1 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + pc1 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di //next two samples _in_common += 2; - pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + pa = 
_mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg __VOLK_GNSSSDR_PREFETCH(_in_common + 8); //complex 32fc multiplication b=a*two_phase_acc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - pc2 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + pc2 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di // store four rotated in_common samples in the register b - b = _mm_packs_epi32(pc1, pc2);// convert from 32ic to 16ic + b = _mm_packs_epi32(pc1, pc2); // convert from 32ic to 16ic //next two samples _in_common += 2; for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - a = _mm_load_si128((__m128i*)&(_in_a[n_vec][(number * ROTATOR_RELOAD + j) * 4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg + a = _mm_load_si128((__m128i*)&(_in_a[n_vec][(number * ROTATOR_RELOAD + j) * 4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg - c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... + c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... - c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. + c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. real = _mm_subs_epi16(c, c_sr); - b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... - a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... + b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... + a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... - imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... 
- imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... + imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... + imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... imag = _mm_adds_epi16(imag1, imag2); @@ -482,65 +488,65 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(l for (j = 0; j < sse_iters % ROTATOR_RELOAD; j++) { - pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg //complex 32fc multiplication b=a*two_phase_acc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - pc1 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + pc1 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di //next two samples _in_common += 2; - pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg __VOLK_GNSSSDR_PREFETCH(_in_common + 8); //complex 32fc multiplication b=a*two_phase_acc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh 
with ci,ci,di,di - tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - pc2 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + pc2 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di // store four rotated in_common samples in the register b - b = _mm_packs_epi32(pc1, pc2);// convert from 32ic to 16ic + b = _mm_packs_epi32(pc1, pc2); // convert from 32ic to 16ic //next two samples _in_common += 2; for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - a = _mm_load_si128((__m128i*)&(_in_a[n_vec][((sse_iters / ROTATOR_RELOAD) * ROTATOR_RELOAD + j) * 4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg + a = _mm_load_si128((__m128i*)&(_in_a[n_vec][((sse_iters / ROTATOR_RELOAD) * ROTATOR_RELOAD + j) * 4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg - c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... + c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... - c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. + c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. real = _mm_subs_epi16(c, c_sr); - b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... - a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... + b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... + a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... - imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... - imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... + imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... + imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... 
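The _reload kernels differ from the plain ones only in renormalizing the recursively updated phase once per ROTATOR_RELOAD block, so that the rounding error of thousands of successive complex multiplies cannot pull the rotator off the unit circle; the generic kernel does this every 256 samples, the SSE3/AVX2 variants every 128 vector iterations. The idea in scalar form (a sketch; rotate_model is an illustrative name):

    #include <complex.h>

    static void rotate_model(const float complex* in, float complex* out,
        float complex phase_inc, unsigned int num_points)
    {
        float complex phase = 1.0f; /* unit-magnitude rotator */
        unsigned int n;
        for (n = 0; n < num_points; n++)
            {
                out[n] = in[n] * phase;
                phase *= phase_inc;
                if ((n + 1) % 256 == 0)
                    {
                        phase /= cabsf(phase); /* undo accumulated rounding drift */
                    }
            }
    }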
imag = _mm_adds_epi16(imag1, imag2); @@ -556,12 +562,12 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(l a = _mm_or_si128(realcacc[n_vec], imagcacc[n_vec]); - _mm_store_si128((__m128i*)dotProductVector, a); // Store the results back into the dot product vector - dotProduct = lv_cmake(0,0); + _mm_store_si128((__m128i*)dotProductVector, a); // Store the results back into the dot product vector + dotProduct = lv_cmake(0, 0); for (i = 0; i < 4; ++i) { dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[i])), - sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i]))); + sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i]))); } _out[n_vec] = dotProduct; } @@ -579,7 +585,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(l //(*phase) = lv_cmake((float*)two_phase_acc[0], (float*)two_phase_acc[1]); (*phase) = two_phase_acc[0]; - for(n = sse_iters * 4; n < num_points; n++) + for (n = sse_iters * 4; n < num_points; n++) { tmp16 = in_common[n]; //printf("a_sse phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase); @@ -591,10 +597,9 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(l lv_16sc_t tmp = tmp16 * in_a[n_vec][n]; //lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(tmp16), lv_cimag(in_a[n_vec][n]))) , sat_adds16i(sat_muls16i(lv_creal(tmp16), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(tmp16), lv_creal(in_a[n_vec][n])))); _out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), - sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); + sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); } } - } #endif /* LV_HAVE_SSE3 */ @@ -602,9 +607,9 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload( #ifdef LV_HAVE_SSE3 #include <pmmintrin.h> -static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points) +static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points) { - lv_16sc_t dotProduct = lv_cmake(0,0); + lv_16sc_t dotProduct = lv_cmake(0, 0); const unsigned int sse_iters = num_points / 4; int n_vec; @@ -615,7 +620,8 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc_ const lv_16sc_t* _in_common = in_common; lv_16sc_t* _out = result; - __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) + lv_16sc_t dotProductVector[4]; __m128i* realcacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment()); __m128i* imagcacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment()); @@ -634,11 +640,13 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc_ // phase rotation registers __m128 pa, pb, two_phase_acc_reg, two_phase_inc_reg; __m128i pc1, pc2; - __VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_inc[2]; + __VOLK_ATTR_ALIGNED(16) + lv_32fc_t two_phase_inc[2]; two_phase_inc[0] = phase_inc * phase_inc; two_phase_inc[1] = phase_inc * phase_inc; - two_phase_inc_reg = _mm_loadu_ps((float*)
two_phase_inc); - __VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_acc[2]; + two_phase_inc_reg = _mm_loadu_ps((float*)two_phase_inc); + __VOLK_ATTR_ALIGNED(16) + lv_32fc_t two_phase_acc[2]; two_phase_acc[0] = (*phase); two_phase_acc[1] = (*phase) * phase_inc; two_phase_acc_reg = _mm_loadu_ps((float*)two_phase_acc); @@ -646,69 +654,69 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc_ lv_16sc_t tmp16; lv_32fc_t tmp32; - for(number = 0; number < sse_iters; number++) + for (number = 0; number < sse_iters; number++) { // Phase rotation on operand in_common starts here: - pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg __VOLK_GNSSSDR_PREFETCH(_in_common + 8); //complex 32fc multiplication b=a*two_phase_acc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - pc1 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + pc1 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di //next two samples _in_common += 2; - pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + pa = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), 
(float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg __VOLK_GNSSSDR_PREFETCH(_in_common + 8); //complex 32fc multiplication b=a*two_phase_acc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - pc2 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(pa, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + pa = _mm_shuffle_ps(pa, pa, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(pa, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + pb = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + pc2 = _mm_cvtps_epi32(pb); // convert from 32fc to 32ic //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di // store four rotated in_common samples in the register b - b = _mm_packs_epi32(pc1, pc2);// convert from 32ic to 16ic + b = _mm_packs_epi32(pc1, pc2); // convert from 32ic to 16ic //next two samples _in_common += 2; for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - a = _mm_loadu_si128((__m128i*)&(_in_a[n_vec][number*4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg + a = _mm_loadu_si128((__m128i*)&(_in_a[n_vec][number * 4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg - c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... + c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... - c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. + c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. real = _mm_subs_epi16(c, c_sr); - b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... - a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... + b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... + a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... - imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... - imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... + imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... + imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... 
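After the rotation, _mm_packs_epi32(pc1, pc2) narrows the four 32-bit results back to 16-bit components with saturation rather than truncation, which is what keeps strong samples from wrapping. Per element that amounts to the following (scalar sketch, illustration only):

    #include <stdint.h>

    static int16_t packs_epi32_model(int32_t v)
    {
        if (v > INT16_MAX) return INT16_MAX; /* clamp instead of wrap */
        if (v < INT16_MIN) return INT16_MIN;
        return (int16_t)v;
    }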
imag = _mm_adds_epi16(imag1, imag2); @@ -733,12 +741,12 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc_ a = _mm_or_si128(realcacc[n_vec], imagcacc[n_vec]); - _mm_store_si128((__m128i*)dotProductVector, a); // Store the results back into the dot product vector - dotProduct = lv_cmake(0,0); + _mm_store_si128((__m128i*)dotProductVector, a); // Store the results back into the dot product vector + dotProduct = lv_cmake(0, 0); for (j = 0; j < 4; ++j) { dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[j])), - sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[j]))); + sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[j]))); } _out[n_vec] = dotProduct; } @@ -748,7 +756,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc_ _mm_store_ps((float*)two_phase_acc, two_phase_acc_reg); (*phase) = two_phase_acc[0]; - for(n = sse_iters * 4; n < num_points; n++) + for (n = sse_iters * 4; n < num_points; n++) { tmp16 = in_common[n]; tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase); @@ -758,7 +766,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc_ { lv_16sc_t tmp = tmp16 * in_a[n_vec][n]; _out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), - sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); + sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); } } } @@ -768,7 +776,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc_ #ifdef LV_HAVE_AVX2 #include <immintrin.h> -static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points) +static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points) { const unsigned int avx2_iters = num_points / 8; const lv_16sc_t** _in_a = in_a; @@ -781,8 +789,9 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc_ lv_16sc_t tmp16; lv_32fc_t tmp32; - __VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8]; - lv_16sc_t dotProduct = lv_cmake(0,0); + __VOLK_ATTR_ALIGNED(32) + lv_16sc_t dotProductVector[8]; + lv_16sc_t dotProduct = lv_cmake(0, 0); __m256i* realcacc = (__m256i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256i), volk_gnsssdr_get_alignment()); __m256i* imagcacc = (__m256i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256i), volk_gnsssdr_get_alignment()); @@ -798,104 +807,106 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc_ __m128 a, b, two_phase_acc_reg, two_phase_inc_reg; __m128i c1, c2, result1, result2; - __VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_inc[2]; + __VOLK_ATTR_ALIGNED(16) + lv_32fc_t two_phase_inc[2]; two_phase_inc[0] = phase_inc * phase_inc; two_phase_inc[1] = phase_inc * phase_inc; - two_phase_inc_reg = _mm_load_ps((float*) two_phase_inc); - __VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_acc[2]; + two_phase_inc_reg = _mm_load_ps((float*)two_phase_inc); + __VOLK_ATTR_ALIGNED(16) + lv_32fc_t two_phase_acc[2]; two_phase_acc[0] = (*phase); two_phase_acc[1] = (*phase) * phase_inc; - two_phase_acc_reg = _mm_load_ps((float*) two_phase_acc); + two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc); __m256i a2, b2, c, c_sr, real, imag; __m128 yl, yh, tmp1, tmp2,
tmp3; - for(number = 0; number < avx2_iters; number++) + for (number = 0; number < avx2_iters; number++) { - a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg //complex 32fc multiplication b=a*two_phase_acc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di //next two samples _in_common += 2; - a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg //complex 32fc multiplication b=a*two_phase_acc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - c2 = 
_mm_cvtps_epi32(b); // convert from 32fc to 32ic + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di // store four output samples - result1 = _mm_packs_epi32(c1, c2);// convert from 32ic to 16ic + result1 = _mm_packs_epi32(c1, c2); // convert from 32ic to 16ic _in_common += 2; - a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg //complex 32fc multiplication b=a*two_phase_acc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp3 = _mm_shuffle_ps(two_phase_inc_reg, 
two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di //next two samples _in_common += 2; - a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg __VOLK_GNSSSDR_PREFETCH(_in_common + 16); //complex 32fc multiplication b=a*two_phase_acc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di // store four output samples - result2 = _mm_packs_epi32(c1, c2);// convert from 32ic to 16ic + result2 = _mm_packs_epi32(c1, c2); // convert from 32ic to 16ic _in_common += 2; b2 = _mm256_insertf128_si256(_mm256_castsi128_si256(result1), (result2), 1); for 
(n_vec = 0; n_vec < num_a_vectors; n_vec++) @@ -904,7 +915,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc_ c = _mm256_mullo_epi16(a2, b2); - c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. + c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. real = _mm256_subs_epi16(c, c_sr); c_sr = _mm256_slli_si256(b2, 2); @@ -936,12 +947,12 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc_ a2 = _mm256_or_si256(realcacc[n_vec], imagcacc[n_vec]); - _mm256_store_si256((__m256i*)dotProductVector, a2); // Store the results back into the dot product vector - dotProduct = lv_cmake(0,0); + _mm256_store_si256((__m256i*)dotProductVector, a2); // Store the results back into the dot product vector + dotProduct = lv_cmake(0, 0); for (number = 0; number < 8; ++number) { dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])), - sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number]))); + sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number]))); } _out[n_vec] = dotProduct; } @@ -953,7 +964,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc_ _mm_store_ps((float*)two_phase_acc, two_phase_acc_reg); (*phase) = two_phase_acc[0]; - for(n = avx2_iters * 8; n < num_points; n++) + for (n = avx2_iters * 8; n < num_points; n++) { tmp16 = in_common[n]; tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase); @@ -963,10 +974,9 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc_ { lv_16sc_t tmp = tmp16 * in_a[n_vec][n]; _out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), - sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); + sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); } } - } #endif /* LV_HAVE_AVX2 */ @@ -974,7 +984,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc_ #ifdef LV_HAVE_AVX2 #include <immintrin.h> -static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points) +static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points) { const unsigned int avx2_iters = num_points / 8; const unsigned int ROTATOR_RELOAD = 128; @@ -989,8 +999,9 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l lv_16sc_t tmp16; lv_32fc_t tmp32; - __VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8]; - lv_16sc_t dotProduct = lv_cmake(0,0); + __VOLK_ATTR_ALIGNED(32) + lv_16sc_t dotProductVector[8]; + lv_16sc_t dotProduct = lv_cmake(0, 0); __m256i* realcacc = (__m256i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256i), volk_gnsssdr_get_alignment()); __m256i* imagcacc = (__m256i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256i), volk_gnsssdr_get_alignment()); @@ -1006,106 +1017,108 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l __m128 a, b, two_phase_acc_reg, two_phase_inc_reg; __m128i c1, c2, result1, result2; - __VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_inc[2]; + __VOLK_ATTR_ALIGNED(16) + lv_32fc_t two_phase_inc[2]; two_phase_inc[0]
= phase_inc * phase_inc; two_phase_inc[1] = phase_inc * phase_inc; - two_phase_inc_reg = _mm_load_ps((float*) two_phase_inc); - __VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_acc[2]; + two_phase_inc_reg = _mm_load_ps((float*)two_phase_inc); + __VOLK_ATTR_ALIGNED(16) + lv_32fc_t two_phase_acc[2]; two_phase_acc[0] = (*phase); two_phase_acc[1] = (*phase) * phase_inc; - two_phase_acc_reg = _mm_load_ps((float*) two_phase_acc); + two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc); __m256i a2, b2, c, c_sr, real, imag; __m128 yl, yh, tmp1, tmp2, tmp3; - for (number = 0; number < avx2_iters / ROTATOR_RELOAD; ++number) + for (number = 0; number < avx2_iters / ROTATOR_RELOAD; ++number) { for (j = 0; j < ROTATOR_RELOAD; j++) { - a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg //complex 32fc multiplication b=a*two_phase_acc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di //next two samples _in_common += 2; - a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + a = _mm_set_ps((float)(lv_cimag(_in_common[1])), 
(float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg //complex 32fc multiplication b=a*two_phase_acc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di // store four output samples - result1 = _mm_packs_epi32(c1, c2);// convert from 32ic to 16ic + result1 = _mm_packs_epi32(c1, c2); // convert from 32ic to 16ic _in_common += 2; - a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg //complex 32fc multiplication b=a*two_phase_acc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(a, yl); // tmp1 = 
ar*cr,ai*cr,br*dr,bi*dr + a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di //next two samples _in_common += 2; - a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg __VOLK_GNSSSDR_PREFETCH(_in_common + 16); //complex 32fc multiplication b=a*two_phase_acc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = 
_mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di // store four output samples - result2 = _mm_packs_epi32(c1, c2);// convert from 32ic to 16ic + result2 = _mm_packs_epi32(c1, c2); // convert from 32ic to 16ic _in_common += 2; b2 = _mm256_insertf128_si256(_mm256_castsi128_si256(result1), (result2), 1); for (n_vec = 0; n_vec < num_a_vectors; n_vec++) @@ -1114,7 +1127,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l c = _mm256_mullo_epi16(a2, b2); - c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. + c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. real = _mm256_subs_epi16(c, c_sr); c_sr = _mm256_slli_si256(b2, 2); @@ -1139,98 +1152,98 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l for (j = 0; j < avx2_iters % ROTATOR_RELOAD; j++) { - a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg //complex 32fc multiplication b=a*two_phase_acc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = 
_mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di //next two samples _in_common += 2; - a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg //complex 32fc multiplication b=a*two_phase_acc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di // store four output samples - result1 = _mm_packs_epi32(c1, c2);// convert from 32ic to 16ic + result1 = _mm_packs_epi32(c1, c2); // convert from 32ic to 16ic _in_common += 2; - a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg //complex 32fc multiplication b=a*two_phase_acc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = 
_mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di //next two samples _in_common += 2; - a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg + a = _mm_set_ps((float)(lv_cimag(_in_common[1])), (float)(lv_creal(_in_common[1])), (float)(lv_cimag(_in_common[0])), (float)(lv_creal(_in_common[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg __VOLK_GNSSSDR_PREFETCH(_in_common + 16); //complex 32fc multiplication b=a*two_phase_acc_reg - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di - c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg - yl = 
_mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr - yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di - tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr - tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br - tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di - two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di + tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr + tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br + tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di + two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di // store four output samples - result2 = _mm_packs_epi32(c1, c2);// convert from 32ic to 16ic + result2 = _mm_packs_epi32(c1, c2); // convert from 32ic to 16ic _in_common += 2; b2 = _mm256_insertf128_si256(_mm256_castsi128_si256(result1), (result2), 1); for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - a2 = _mm256_load_si256((__m256i*)&(_in_a[n_vec][((avx2_iters / ROTATOR_RELOAD) * ROTATOR_RELOAD + j) * 8])); + a2 = _mm256_load_si256((__m256i*)&(_in_a[n_vec][((avx2_iters / ROTATOR_RELOAD) * ROTATOR_RELOAD + j) * 8])); c = _mm256_mullo_epi16(a2, b2); - c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. + c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. real = _mm256_subs_epi16(c, c_sr); c_sr = _mm256_slli_si256(b2, 2);
@@ -1253,12 +1266,12 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l a2 = _mm256_or_si256(realcacc[n_vec], imagcacc[n_vec]); - _mm256_store_si256((__m256i*)dotProductVector, a2); // Store the results back into the dot product vector - dotProduct = lv_cmake(0,0); + _mm256_store_si256((__m256i*)dotProductVector, a2); // Store the results back into the dot product vector + dotProduct = lv_cmake(0, 0); for (j = 0; j < 8; ++j) { dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[j])), - sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[j]))); + sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[j]))); } _out[n_vec] = dotProduct; }
@@ -1269,7 +1282,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l _mm_store_ps((float*)two_phase_acc, two_phase_acc_reg); (*phase) = two_phase_acc[0]; - for(n = avx2_iters * 8; n < num_points; n++) + for (n = avx2_iters * 8; n < num_points; n++) { tmp16 = in_common[n]; tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase);
@@ -1279,7 +1292,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l { lv_16sc_t tmp = tmp16 * in_a[n_vec][n]; _out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), - sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); + sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); } } }
@@ -1290,7 +1303,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l #ifdef LV_HAVE_NEON #include <arm_neon.h> -static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc,
lv_32fc_t* phase, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points) +static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points) { const unsigned int neon_iters = num_points / 4; @@ -1306,14 +1319,16 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon(lv_16sc_t* if (neon_iters > 0) { - lv_16sc_t dotProduct = lv_cmake(0,0); + lv_16sc_t dotProduct = lv_cmake(0, 0); float arg_phase0 = cargf(*phase); float arg_phase_inc = cargf(phase_inc); float phase_est; lv_32fc_t ___phase4 = phase_inc * phase_inc * phase_inc * phase_inc; - __VOLK_ATTR_ALIGNED(16) float32_t __phase4_real[4] = { lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4) }; - __VOLK_ATTR_ALIGNED(16) float32_t __phase4_imag[4] = { lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4) }; + __VOLK_ATTR_ALIGNED(16) + float32_t __phase4_real[4] = {lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4)}; + __VOLK_ATTR_ALIGNED(16) + float32_t __phase4_imag[4] = {lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4)}; float32x4_t _phase4_real = vld1q_f32(__phase4_real); float32x4_t _phase4_imag = vld1q_f32(__phase4_imag); @@ -1322,14 +1337,17 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon(lv_16sc_t* lv_32fc_t phase3 = phase2 * phase_inc; lv_32fc_t phase4 = phase3 * phase_inc; - __VOLK_ATTR_ALIGNED(16) float32_t __phase_real[4] = { lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) }; - __VOLK_ATTR_ALIGNED(16) float32_t __phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; + __VOLK_ATTR_ALIGNED(16) + float32_t __phase_real[4] = {lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4)}; + __VOLK_ATTR_ALIGNED(16) + float32_t __phase_imag[4] = {lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4)}; float32x4_t _phase_real = vld1q_f32(__phase_real); float32x4_t _phase_imag = vld1q_f32(__phase_imag); int16x4x2_t a_val, b_val, c_val; - __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) + lv_16sc_t dotProductVector[4]; float32x4_t half = vdupq_n_f32(0.5f); int16x4x2_t tmp16; int32x4x2_t tmp32i; @@ -1339,13 +1357,13 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon(lv_16sc_t* int16x4x2_t* accumulator = (int16x4x2_t*)volk_gnsssdr_malloc(num_a_vectors * sizeof(int16x4x2_t), volk_gnsssdr_get_alignment()); - for(n_vec = 0; n_vec < num_a_vectors; n_vec++) + for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { accumulator[n_vec].val[0] = vdup_n_s16(0); accumulator[n_vec].val[1] = vdup_n_s16(0); } - for(number = 0; number < neon_iters; number++) + for (number = 0; number < neon_iters; number++) { /* load 4 complex numbers (int 16 bits each component) */ tmp16 = vld2_s16((int16_t*)_in_common); @@ -1396,7 +1414,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon(lv_16sc_t* for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - a_val = vld2_s16((int16_t*)&(_in_a[n_vec][number*4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg + a_val = vld2_s16((int16_t*)&(_in_a[n_vec][number * 4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg //__VOLK_GNSSSDR_PREFETCH(&_in_a[n_vec][number*4] + 8); // multiply the real*real and imag*imag to 
get real result @@ -1426,8 +1444,10 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon(lv_16sc_t* phase3 = phase2 * phase_inc; phase4 = phase3 * phase_inc; - __VOLK_ATTR_ALIGNED(16) float32_t ____phase_real[4] = { lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) }; - __VOLK_ATTR_ALIGNED(16) float32_t ____phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; + __VOLK_ATTR_ALIGNED(16) + float32_t ____phase_real[4] = {lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4)}; + __VOLK_ATTR_ALIGNED(16) + float32_t ____phase_imag[4] = {lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4)}; _phase_real = vld1q_f32(____phase_real); _phase_imag = vld1q_f32(____phase_imag); @@ -1436,12 +1456,12 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon(lv_16sc_t* for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - vst2_s16((int16_t*)dotProductVector, accumulator[n_vec]); // Store the results back into the dot product vector - dotProduct = lv_cmake(0,0); + vst2_s16((int16_t*)dotProductVector, accumulator[n_vec]); // Store the results back into the dot product vector + dotProduct = lv_cmake(0, 0); for (i = 0; i < 4; ++i) { dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[i])), - sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i]))); + sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i]))); } _out[n_vec] = dotProduct; } @@ -1473,7 +1493,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon(lv_16sc_t* #include #include -static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_vma(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points) +static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_vma(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points) { const unsigned int neon_iters = num_points / 4; @@ -1489,14 +1509,16 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_vma(lv_16s if (neon_iters > 0) { - lv_16sc_t dotProduct = lv_cmake(0,0); + lv_16sc_t dotProduct = lv_cmake(0, 0); float arg_phase0 = cargf(*phase); float arg_phase_inc = cargf(phase_inc); float phase_est; //printf("arg phase0: %f", arg_phase0); lv_32fc_t ___phase4 = phase_inc * phase_inc * phase_inc * phase_inc; - __VOLK_ATTR_ALIGNED(16) float32_t __phase4_real[4] = { lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4) }; - __VOLK_ATTR_ALIGNED(16) float32_t __phase4_imag[4] = { lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4) }; + __VOLK_ATTR_ALIGNED(16) + float32_t __phase4_real[4] = {lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4)}; + __VOLK_ATTR_ALIGNED(16) + float32_t __phase4_imag[4] = {lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4)}; float32x4_t _phase4_real = vld1q_f32(__phase4_real); float32x4_t _phase4_imag = vld1q_f32(__phase4_imag); @@ -1505,14 +1527,17 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_vma(lv_16s lv_32fc_t phase3 = phase2 * phase_inc; lv_32fc_t phase4 = phase3 * phase_inc; - __VOLK_ATTR_ALIGNED(16) float32_t __phase_real[4] = { lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), 
lv_creal(phase4) }; - __VOLK_ATTR_ALIGNED(16) float32_t __phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; + __VOLK_ATTR_ALIGNED(16) + float32_t __phase_real[4] = {lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4)}; + __VOLK_ATTR_ALIGNED(16) + float32_t __phase_imag[4] = {lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4)}; float32x4_t _phase_real = vld1q_f32(__phase_real); float32x4_t _phase_imag = vld1q_f32(__phase_imag); int16x4x2_t a_val, b_val; - __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) + lv_16sc_t dotProductVector[4]; float32x4_t half = vdupq_n_f32(0.5f); int16x4x2_t tmp16; int32x4x2_t tmp32i; @@ -1522,13 +1547,13 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_vma(lv_16s int16x4x2_t* accumulator = (int16x4x2_t*)volk_gnsssdr_malloc(num_a_vectors * sizeof(int16x4x2_t), volk_gnsssdr_get_alignment()); - for(n_vec = 0; n_vec < num_a_vectors; n_vec++) + for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { accumulator[n_vec].val[0] = vdup_n_s16(0); accumulator[n_vec].val[1] = vdup_n_s16(0); } - for(number = 0; number < neon_iters; number++) + for (number = 0; number < neon_iters; number++) { /* load 4 complex numbers (int 16 bits each component) */ tmp16 = vld2_s16((int16_t*)_in_common); @@ -1589,8 +1614,10 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_vma(lv_16s phase3 = phase2 * phase_inc; phase4 = phase3 * phase_inc; - __VOLK_ATTR_ALIGNED(16) float32_t ____phase_real[4] = { lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) }; - __VOLK_ATTR_ALIGNED(16) float32_t ____phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; + __VOLK_ATTR_ALIGNED(16) + float32_t ____phase_real[4] = {lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4)}; + __VOLK_ATTR_ALIGNED(16) + float32_t ____phase_imag[4] = {lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4)}; _phase_real = vld1q_f32(____phase_real); _phase_imag = vld1q_f32(____phase_imag); @@ -1598,19 +1625,18 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_vma(lv_16s // Round = vmulq_f32(_phase_real, _phase_real); // Round = vmlaq_f32(Round, _phase_imag, _phase_imag); // Round = vsqrtq_f32(Round);//printf("sqrt: %f \n", Round[0]); - //Round = vrsqrteq_f32(Round);printf("1/sqtr: %f \n",Round[0]); + //Round = vrsqrteq_f32(Round);printf("1/sqtr: %f \n",Round[0]); //Round = vrecpeq_f32((Round); // _phase_real = vdivq_f32(_phase_real, Round); // _phase_imag = vdivq_f32(_phase_imag, Round); //_phase_real = vmulq_f32(_phase_real, Round); //_phase_imag = vmulq_f32(_phase_imag, Round); //printf("After %i: %f,%f, %f\n\n", number, _phase_real[0], _phase_imag[0], sqrt(_phase_real[0]*_phase_real[0]+_phase_imag[0]*_phase_imag[0])); - } for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - a_val = vld2_s16((int16_t*)&(_in_a[n_vec][number*4])); + a_val = vld2_s16((int16_t*)&(_in_a[n_vec][number * 4])); b_val.val[0] = vmul_s16(a_val.val[0], tmp16.val[0]); b_val.val[1] = vmul_s16(a_val.val[1], tmp16.val[0]); @@ -1626,12 +1652,12 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_vma(lv_16s for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - vst2_s16((int16_t*)dotProductVector, accumulator[n_vec]); // Store the results back into the dot product vector - dotProduct = lv_cmake(0,0); + vst2_s16((int16_t*)dotProductVector, accumulator[n_vec]); // Store 
the results back into the dot product vector + dotProduct = lv_cmake(0, 0); for (i = 0; i < 4; ++i) { dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[i])), - sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i]))); + sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i]))); } _out[n_vec] = dotProduct; } @@ -1664,7 +1690,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_vma(lv_16s #include #include -static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_optvma(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points) +static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_optvma(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points) { const unsigned int neon_iters = num_points / 4; @@ -1680,14 +1706,16 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_optvma(lv_ if (neon_iters > 0) { - lv_16sc_t dotProduct = lv_cmake(0,0); + lv_16sc_t dotProduct = lv_cmake(0, 0); float arg_phase0 = cargf(*phase); float arg_phase_inc = cargf(phase_inc); float phase_est; lv_32fc_t ___phase4 = phase_inc * phase_inc * phase_inc * phase_inc; - __VOLK_ATTR_ALIGNED(16) float32_t __phase4_real[4] = { lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4) }; - __VOLK_ATTR_ALIGNED(16) float32_t __phase4_imag[4] = { lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4) }; + __VOLK_ATTR_ALIGNED(16) + float32_t __phase4_real[4] = {lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4)}; + __VOLK_ATTR_ALIGNED(16) + float32_t __phase4_imag[4] = {lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4)}; float32x4_t _phase4_real = vld1q_f32(__phase4_real); float32x4_t _phase4_imag = vld1q_f32(__phase4_imag); @@ -1696,14 +1724,17 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_optvma(lv_ lv_32fc_t phase3 = phase2 * phase_inc; lv_32fc_t phase4 = phase3 * phase_inc; - __VOLK_ATTR_ALIGNED(16) float32_t __phase_real[4] = { lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) }; - __VOLK_ATTR_ALIGNED(16) float32_t __phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; + __VOLK_ATTR_ALIGNED(16) + float32_t __phase_real[4] = {lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4)}; + __VOLK_ATTR_ALIGNED(16) + float32_t __phase_imag[4] = {lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4)}; float32x4_t _phase_real = vld1q_f32(__phase_real); float32x4_t _phase_imag = vld1q_f32(__phase_imag); int16x4x2_t a_val, b_val; - __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; + __VOLK_ATTR_ALIGNED(16) + lv_16sc_t dotProductVector[4]; float32x4_t half = vdupq_n_f32(0.5f); int32x4x2_t tmp32i; @@ -1713,7 +1744,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_optvma(lv_ int16x4x2_t* accumulator1 = (int16x4x2_t*)volk_gnsssdr_malloc(num_a_vectors * sizeof(int16x4x2_t), volk_gnsssdr_get_alignment()); int16x4x2_t* accumulator2 = (int16x4x2_t*)volk_gnsssdr_malloc(num_a_vectors * sizeof(int16x4x2_t), volk_gnsssdr_get_alignment()); - for(n_vec = 0; n_vec < num_a_vectors; n_vec++) + for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { accumulator1[n_vec].val[0] 
= vdup_n_s16(0); accumulator1[n_vec].val[1] = vdup_n_s16(0); @@ -1721,7 +1752,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_optvma(lv_ accumulator2[n_vec].val[1] = vdup_n_s16(0); } - for(number = 0; number < neon_iters; number++) + for (number = 0; number < neon_iters; number++) { /* load 4 complex numbers (int 16 bits each component) */ b_val = vld2_s16((int16_t*)_in_common); @@ -1782,8 +1813,10 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_optvma(lv_ phase3 = phase2 * phase_inc; phase4 = phase3 * phase_inc; - __VOLK_ATTR_ALIGNED(16) float32_t ____phase_real[4] = { lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) }; - __VOLK_ATTR_ALIGNED(16) float32_t ____phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; + __VOLK_ATTR_ALIGNED(16) + float32_t ____phase_real[4] = {lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4)}; + __VOLK_ATTR_ALIGNED(16) + float32_t ____phase_imag[4] = {lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4)}; _phase_real = vld1q_f32(____phase_real); _phase_imag = vld1q_f32(____phase_imag); @@ -1791,7 +1824,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_optvma(lv_ for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - a_val = vld2_s16((int16_t*)&(_in_a[n_vec][number*4])); + a_val = vld2_s16((int16_t*)&(_in_a[n_vec][number * 4])); // use 2 accumulators to remove inter-instruction data dependencies accumulator1[n_vec].val[0] = vmla_s16(accumulator1[n_vec].val[0], a_val.val[0], b_val.val[0]); @@ -1807,12 +1840,12 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_optvma(lv_ } for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - vst2_s16((int16_t*)dotProductVector, accumulator1[n_vec]); // Store the results back into the dot product vector - dotProduct = lv_cmake(0,0); + vst2_s16((int16_t*)dotProductVector, accumulator1[n_vec]); // Store the results back into the dot product vector + dotProduct = lv_cmake(0, 0); for (i = 0; i < 4; ++i) { dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[i])), - sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i]))); + sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[i]))); } _out[n_vec] = dotProduct; } @@ -1842,4 +1875,3 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_optvma(lv_ #endif /* LV_HAVE_NEON */ #endif /*INCLUDED_volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_H*/ - diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic.h index cf002bf6c..9b30bdbbd 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic.h @@ -41,7 +41,7 @@ #include #ifdef LV_HAVE_GENERIC -static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_generic(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points) +static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_generic(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points) { // phases must be normalized. 
Phase rotator expects a complex exponential input! float rem_carrier_phase_in_rad = 0.345; @@ -53,14 +53,14 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_generic(lv_ unsigned int n; int num_a_vectors = 3; lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points); } - volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_generic(result, local_code, phase_inc[0], phase,(const lv_16sc_t**) in_a, num_a_vectors, num_points); + volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_generic(result, local_code, phase_inc[0], phase, (const lv_16sc_t**)in_a, num_a_vectors, num_points); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { volk_gnsssdr_free(in_a[n]); } @@ -71,7 +71,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_generic(lv_ #ifdef LV_HAVE_GENERIC -static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_generic_reload(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points) +static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_generic_reload(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points) { // phases must be normalized. Phase rotator expects a complex exponential input! float rem_carrier_phase_in_rad = 0.345; @@ -83,14 +83,14 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_generic_rel unsigned int n; int num_a_vectors = 3; lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points); } - volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_generic_reload(result, local_code, phase_inc[0], phase,(const lv_16sc_t**) in_a, num_a_vectors, num_points); + volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_generic_reload(result, local_code, phase_inc[0], phase, (const lv_16sc_t**)in_a, num_a_vectors, num_points); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { volk_gnsssdr_free(in_a[n]); } @@ -113,22 +113,22 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_a_sse3(lv_1 unsigned int n; int num_a_vectors = 3; lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points); } - volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3(result, local_code, phase_inc[0], phase, (const lv_16sc_t**) in_a, num_a_vectors, num_points); + volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3(result, local_code, phase_inc[0], phase, (const lv_16sc_t**)in_a, num_a_vectors, num_points); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { volk_gnsssdr_free(in_a[n]); } volk_gnsssdr_free(in_a); } -#endif // SSE3 +#endif // SSE3 
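
The `_puppet_` wrappers being reformatted in this file all share one shape: the QA harness hands them a single input vector, so each wrapper fabricates `num_a_vectors` aligned copies of it, builds a unit-modulus test phase, forwards everything to the multi-vector `_xn` kernel under test, and then frees the copies. The sketch below condenses that shape under stated assumptions: the `kernel` function pointer and the `phase_step_rad` value are illustrative stand-ins (each real puppet hard-codes one kernel, and part of its setup is elided in this excerpt), while `lv_16sc_t`, `lv_32fc_t`, `lv_cmake`, `volk_gnsssdr_malloc`, `volk_gnsssdr_get_alignment`, and `volk_gnsssdr_free` are used exactly as in the hunks above; the two library header names are assumed from the module layout.

#include <math.h>
#include <string.h>
#include <volk_gnsssdr/volk_gnsssdr_complex.h> /* lv_16sc_t, lv_32fc_t, lv_cmake (assumed header) */
#include <volk_gnsssdr/volk_gnsssdr_malloc.h>  /* volk_gnsssdr_malloc/free (assumed header) */

/* Signature shared by the _xn rotator dot product kernels in this patch. */
typedef void (*rotator_dot_prod_xn_t)(lv_16sc_t*, const lv_16sc_t*, const lv_32fc_t,
    lv_32fc_t*, const lv_16sc_t**, int, unsigned int);

static inline void puppet_shape_sketch(rotator_dot_prod_xn_t kernel, lv_16sc_t* result,
    const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points)
{
    /* phases must be normalized: the rotator expects a complex exponential */
    const float rem_carrier_phase_in_rad = 0.345f; /* same test value as the generic puppet above */
    const float phase_step_rad = 0.1f;             /* illustrative */
    lv_32fc_t phase[1];
    phase[0] = lv_cmake(cosf(rem_carrier_phase_in_rad), sinf(rem_carrier_phase_in_rad));
    const lv_32fc_t phase_inc = lv_cmake(cosf(phase_step_rad), sinf(phase_step_rad));

    /* replicate the single QA input into num_a_vectors aligned copies */
    int num_a_vectors = 3;
    unsigned int n;
    lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
    for (n = 0; n < (unsigned int)num_a_vectors; n++)
        {
            in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
            memcpy(in_a[n], in, sizeof(lv_16sc_t) * num_points);
        }

    /* drive the multi-vector kernel through the single-vector QA interface */
    kernel(result, local_code, phase_inc, phase, (const lv_16sc_t**)in_a, num_a_vectors, num_points);

    for (n = 0; n < (unsigned int)num_a_vectors; n++)
        {
            volk_gnsssdr_free(in_a[n]);
        }
    volk_gnsssdr_free(in_a);
}

Seen this way, the many near-identical hunks that follow differ only in which kernel the wrapper forwards to; the `_a_`/`_u_` suffix reflects whether that kernel assumes aligned or unaligned loads.
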
#ifdef LV_HAVE_SSE3 @@ -144,22 +144,22 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_a_sse3_relo unsigned int n; int num_a_vectors = 3; lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points); } - volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(result, local_code, phase_inc[0], phase, (const lv_16sc_t**) in_a, num_a_vectors, num_points); + volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(result, local_code, phase_inc[0], phase, (const lv_16sc_t**)in_a, num_a_vectors, num_points); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { volk_gnsssdr_free(in_a[n]); } volk_gnsssdr_free(in_a); } -#endif // SSE3 +#endif // SSE3 #ifdef LV_HAVE_SSE3 @@ -175,22 +175,22 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_u_sse3(lv_1 unsigned int n; int num_a_vectors = 3; lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points); } - volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_u_sse3(result, local_code, phase_inc[0], phase, (const lv_16sc_t**) in_a, num_a_vectors, num_points); + volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_u_sse3(result, local_code, phase_inc[0], phase, (const lv_16sc_t**)in_a, num_a_vectors, num_points); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { volk_gnsssdr_free(in_a[n]); } volk_gnsssdr_free(in_a); } -#endif // SSE3 +#endif // SSE3 #ifdef LV_HAVE_AVX2 @@ -206,22 +206,22 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_a_avx2(lv_1 unsigned int n; int num_a_vectors = 3; lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points); } - volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(result, local_code, phase_inc[0], phase, (const lv_16sc_t**) in_a, num_a_vectors, num_points); + volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(result, local_code, phase_inc[0], phase, (const lv_16sc_t**)in_a, num_a_vectors, num_points); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { volk_gnsssdr_free(in_a[n]); } volk_gnsssdr_free(in_a); } -#endif // AVX2 +#endif // AVX2 #ifdef LV_HAVE_AVX2 @@ -237,22 +237,22 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_a_avx2_relo unsigned int n; int num_a_vectors = 3; lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, 
sizeof(lv_16sc_t) * num_points); } - volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(result, local_code, phase_inc[0], phase, (const lv_16sc_t**) in_a, num_a_vectors, num_points); + volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(result, local_code, phase_inc[0], phase, (const lv_16sc_t**)in_a, num_a_vectors, num_points); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { volk_gnsssdr_free(in_a[n]); } volk_gnsssdr_free(in_a); } -#endif // AVX2 +#endif // AVX2 #ifdef LV_HAVE_AVX2 @@ -268,22 +268,22 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_u_avx2(lv_1 unsigned int n; int num_a_vectors = 3; lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points); } - volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(result, local_code, phase_inc[0], phase, (const lv_16sc_t**) in_a, num_a_vectors, num_points); + volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(result, local_code, phase_inc[0], phase, (const lv_16sc_t**)in_a, num_a_vectors, num_points); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { volk_gnsssdr_free(in_a[n]); } volk_gnsssdr_free(in_a); } -#endif // AVX2 +#endif // AVX2 #ifdef LV_HAVE_AVX2 @@ -299,22 +299,22 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_u_avx2_relo unsigned int n; int num_a_vectors = 3; lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points); } - volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(result, local_code, phase_inc[0], phase, (const lv_16sc_t**) in_a, num_a_vectors, num_points); + volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(result, local_code, phase_inc[0], phase, (const lv_16sc_t**)in_a, num_a_vectors, num_points); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { volk_gnsssdr_free(in_a[n]); } volk_gnsssdr_free(in_a); } -#endif // AVX2 +#endif // AVX2 #ifdef LV_HAVE_NEON @@ -330,22 +330,22 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_neon(lv_16s unsigned int n; int num_a_vectors = 3; lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points); } - volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon(result, local_code, phase_inc[0], phase, (const lv_16sc_t**) in_a, num_a_vectors, num_points); + volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon(result, local_code, phase_inc[0], phase, (const lv_16sc_t**)in_a, num_a_vectors, num_points); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { volk_gnsssdr_free(in_a[n]); } volk_gnsssdr_free(in_a); } -#endif // NEON +#endif // NEON #ifdef 
LV_HAVE_NEON @@ -361,23 +361,21 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_neon_vma(lv unsigned int n; int num_a_vectors = 3; lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points); } - volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_vma(result, local_code, phase_inc[0], phase, (const lv_16sc_t**) in_a, num_a_vectors, num_points); + volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_vma(result, local_code, phase_inc[0], phase, (const lv_16sc_t**)in_a, num_a_vectors, num_points); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { volk_gnsssdr_free(in_a[n]); } volk_gnsssdr_free(in_a); } -#endif // NEON +#endif // NEON #endif // INCLUDED_volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_H - - diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_xn_resampler_16ic_xn.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_xn_resampler_16ic_xn.h index 843fa8ed2..661f4ace9 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_xn_resampler_16ic_xn.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_xn_resampler_16ic_xn.h @@ -106,7 +106,8 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_sse4_1(lv_16sc_t** r const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips); const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips); - __VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; + __VOLK_ATTR_ALIGNED(16) + int local_code_chip_index[4]; int local_code_chip_index_; const __m128i zeros = _mm_setzero_si128(); @@ -120,7 +121,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_sse4_1(lv_16sc_t** r shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]); aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); __m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f); - for(n = 0; n < quarterPoints; n++) + for (n = 0; n < quarterPoints; n++) { aux = _mm_mul_ps(code_phase_step_chips_reg, indexn); aux = _mm_add_ps(aux, aux2); @@ -138,13 +139,13 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_sse4_1(lv_16sc_t** r aux_i = _mm_and_si128(code_length_chips_reg_i, negatives); local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i); _mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg); - for(k = 0; k < 4; ++k) + for (k = 0; k < 4; ++k) { _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]]; } indexn = _mm_add_ps(indexn, fours); } - for(n = quarterPoints * 4; n < num_points; n++) + for (n = quarterPoints * 4; n < num_points; n++) { // resample code for current tap local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); @@ -156,7 +157,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_sse4_1(lv_16sc_t** r } } -#endif +#endif #ifdef LV_HAVE_SSE4_1 @@ -172,7 +173,8 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_sse4_1(lv_16sc_t** r const __m128 rem_code_phase_chips_reg = 
_mm_set_ps1(rem_code_phase_chips); const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips); - __VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; + __VOLK_ATTR_ALIGNED(16) + int local_code_chip_index[4]; int local_code_chip_index_; const __m128i zeros = _mm_setzero_si128(); @@ -186,7 +188,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_sse4_1(lv_16sc_t** r shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]); aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); __m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f); - for(n = 0; n < quarterPoints; n++) + for (n = 0; n < quarterPoints; n++) { aux = _mm_mul_ps(code_phase_step_chips_reg, indexn); aux = _mm_add_ps(aux, aux2); @@ -204,13 +206,13 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_sse4_1(lv_16sc_t** r aux_i = _mm_and_si128(code_length_chips_reg_i, negatives); local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i); _mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg); - for(k = 0; k < 4; ++k) + for (k = 0; k < 4; ++k) { _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]]; } indexn = _mm_add_ps(indexn, fours); } - for(n = quarterPoints * 4; n < num_points; n++) + for (n = quarterPoints * 4; n < num_points; n++) { // resample code for current tap local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); @@ -239,7 +241,8 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_sse3(lv_16sc_t** res const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips); const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips); - __VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; + __VOLK_ATTR_ALIGNED(16) + int local_code_chip_index[4]; int local_code_chip_index_; const __m128i zeros = _mm_setzero_si128(); @@ -253,7 +256,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_sse3(lv_16sc_t** res shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]); aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); __m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f); - for(n = 0; n < quarterPoints; n++) + for (n = 0; n < quarterPoints; n++) { aux = _mm_mul_ps(code_phase_step_chips_reg, indexn); aux = _mm_add_ps(aux, aux2); @@ -274,13 +277,13 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_sse3(lv_16sc_t** res aux_i = _mm_and_si128(code_length_chips_reg_i, negatives); local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i); _mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg); - for(k = 0; k < 4; ++k) + for (k = 0; k < 4; ++k) { _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]]; } indexn = _mm_add_ps(indexn, fours); } - for(n = quarterPoints * 4; n < num_points; n++) + for (n = quarterPoints * 4; n < num_points; n++) { // resample code for current tap local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); @@ -309,7 +312,8 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_sse3(lv_16sc_t** res const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips); const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips); - __VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; + __VOLK_ATTR_ALIGNED(16) + int local_code_chip_index[4]; int 
local_code_chip_index_; const __m128i zeros = _mm_setzero_si128(); @@ -323,7 +327,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_sse3(lv_16sc_t** res shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]); aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); __m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f); - for(n = 0; n < quarterPoints; n++) + for (n = 0; n < quarterPoints; n++) { aux = _mm_mul_ps(code_phase_step_chips_reg, indexn); aux = _mm_add_ps(aux, aux2); @@ -344,13 +348,13 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_sse3(lv_16sc_t** res aux_i = _mm_and_si128(code_length_chips_reg_i, negatives); local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i); _mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg); - for(k = 0; k < 4; ++k) + for (k = 0; k < 4; ++k) { _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]]; } indexn = _mm_add_ps(indexn, fours); } - for(n = quarterPoints * 4; n < num_points; n++) + for (n = quarterPoints * 4; n < num_points; n++) { // resample code for current tap local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); @@ -378,7 +382,8 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_avx(lv_16sc_t** resu const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips); const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips); - __VOLK_ATTR_ALIGNED(32) int local_code_chip_index[8]; + __VOLK_ATTR_ALIGNED(32) + int local_code_chip_index[8]; int local_code_chip_index_; const __m256 zeros = _mm256_setzero_ps(); @@ -393,7 +398,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_avx(lv_16sc_t** resu shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]); aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); indexn = n0; - for(n = 0; n < avx_iters; n++) + for (n = 0; n < avx_iters; n++) { __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0); __VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3); @@ -411,13 +416,13 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_avx(lv_16sc_t** resu // no negatives c = _mm256_cvtepi32_ps(local_code_chip_index_reg); - negatives = _mm256_cmp_ps(c, zeros, 0x01 ); + negatives = _mm256_cmp_ps(c, zeros, 0x01); aux3 = _mm256_and_ps(code_length_chips_reg_f, negatives); aux = _mm256_add_ps(c, aux3); local_code_chip_index_reg = _mm256_cvttps_epi32(aux); _mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg); - for(k = 0; k < 8; ++k) + for (k = 0; k < 8; ++k) { _result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]]; } @@ -427,7 +432,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_avx(lv_16sc_t** resu _mm256_zeroupper(); for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++) { - for(n = avx_iters * 8; n < num_points; n++) + for (n = avx_iters * 8; n < num_points; n++) { // resample code for current tap local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); @@ -455,7 +460,8 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_avx(lv_16sc_t** resu const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips); const __m256 code_phase_step_chips_reg = 
_mm256_set1_ps(code_phase_step_chips); - __VOLK_ATTR_ALIGNED(32) int local_code_chip_index[8]; + __VOLK_ATTR_ALIGNED(32) + int local_code_chip_index[8]; int local_code_chip_index_; const __m256 zeros = _mm256_setzero_ps(); @@ -470,7 +476,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_avx(lv_16sc_t** resu shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]); aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); indexn = n0; - for(n = 0; n < avx_iters; n++) + for (n = 0; n < avx_iters; n++) { __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0); __VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3); @@ -488,13 +494,13 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_avx(lv_16sc_t** resu // no negatives c = _mm256_cvtepi32_ps(local_code_chip_index_reg); - negatives = _mm256_cmp_ps(c, zeros, 0x01 ); + negatives = _mm256_cmp_ps(c, zeros, 0x01); aux3 = _mm256_and_ps(code_length_chips_reg_f, negatives); aux = _mm256_add_ps(c, aux3); local_code_chip_index_reg = _mm256_cvttps_epi32(aux); _mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg); - for(k = 0; k < 8; ++k) + for (k = 0; k < 8; ++k) { _result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]]; } @@ -504,7 +510,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_avx(lv_16sc_t** resu _mm256_zeroupper(); for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++) { - for(n = avx_iters * 8; n < num_points; n++) + for (n = avx_iters * 8; n < num_points; n++) { // resample code for current tap local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); @@ -530,7 +536,8 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_neon(lv_16sc_t** resul const float32x4_t rem_code_phase_chips_reg = vdupq_n_f32(rem_code_phase_chips); const float32x4_t code_phase_step_chips_reg = vdupq_n_f32(code_phase_step_chips); - __VOLK_ATTR_ALIGNED(16) int32_t local_code_chip_index[4]; + __VOLK_ATTR_ALIGNED(16) + int32_t local_code_chip_index[4]; int32_t local_code_chip_index_; const int32x4_t zeros = vdupq_n_s32(0); @@ -538,11 +545,12 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_neon(lv_16sc_t** resul const int32x4_t code_length_chips_reg_i = vdupq_n_s32((int32_t)code_length_chips); int32x4_t local_code_chip_index_reg, aux_i, negatives, i; float32x4_t aux, aux2, shifts_chips_reg, fi, c, j, cTrunc, base, indexn, reciprocal; - __VOLK_ATTR_ALIGNED(16) const float vec[4] = { 0.0f, 1.0f, 2.0f, 3.0f }; + __VOLK_ATTR_ALIGNED(16) + const float vec[4] = {0.0f, 1.0f, 2.0f, 3.0f}; uint32x4_t igx; reciprocal = vrecpeq_f32(code_length_chips_reg_f); reciprocal = vmulq_f32(vrecpsq_f32(code_length_chips_reg_f, reciprocal), reciprocal); - reciprocal = vmulq_f32(vrecpsq_f32(code_length_chips_reg_f, reciprocal), reciprocal); // this refinement is required! + reciprocal = vmulq_f32(vrecpsq_f32(code_length_chips_reg_f, reciprocal), reciprocal); // this refinement is required! 
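/*
 * Note on the two vrecpsq_f32 steps above: vrecpeq_f32 only returns a
 * low-precision reciprocal estimate (roughly 8 bits), and each
 * Newton-Raphson refinement step roughly doubles the accurate bits. With a
 * single step the truncated quotient used for the fmod below can come out
 * one chip off near a multiple of code_length_chips, hence the "required"
 * second refinement. A minimal scalar sketch of the same reciprocal-based
 * fmod, assuming x >= 0 and a sufficiently refined recip_len ~= 1.0f / len:
 *
 *     static float fmod_via_reciprocal(float x, float len, float recip_len)
 *     {
 *         float q = x * recip_len;   // approximate x / len
 *         float t = (float)(int)q;   // truncate toward zero, like vcvtq_s32_f32
 *         return x - t * len;        // remainder in [0, len)
 *     }
 */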
float32x4_t n0 = vld1q_f32((float*)vec); int current_correlator_tap; unsigned int n; @@ -552,7 +560,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_neon(lv_16sc_t** resul shifts_chips_reg = vdupq_n_f32((float)shifts_chips[current_correlator_tap]); aux2 = vsubq_f32(shifts_chips_reg, rem_code_phase_chips_reg); indexn = n0; - for(n = 0; n < neon_iters; n++) + for (n = 0; n < neon_iters; n++) { __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][4 * n + 3], 1, 0); __VOLK_GNSSSDR_PREFETCH(&local_code_chip_index[4]); @@ -568,7 +576,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_neon(lv_16sc_t** resul // fmod c = vmulq_f32(aux, reciprocal); - i = vcvtq_s32_f32(c); + i = vcvtq_s32_f32(c); cTrunc = vcvtq_f32_s32(i); base = vmulq_f32(cTrunc, code_length_chips_reg_f); aux = vsubq_f32(aux, base); @@ -580,13 +588,13 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_neon(lv_16sc_t** resul vst1q_s32((int32_t*)local_code_chip_index, local_code_chip_index_reg); - for(k = 0; k < 4; ++k) + for (k = 0; k < 4; ++k) { _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]]; } indexn = vaddq_f32(indexn, fours); } - for(n = neon_iters * 4; n < num_points; n++) + for (n = neon_iters * 4; n < num_points; n++) { __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][n], 1, 0); // resample code for current tap @@ -604,4 +612,3 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_neon(lv_16sc_t** resul #endif /*INCLUDED_volk_gnsssdr_16ic_xn_resampler_16ic_xn_H*/ - diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn.h index a31cba3a5..d583595a4 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn.h @@ -95,69 +95,74 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_generic(lv_16sc_t #ifdef LV_HAVE_SSE2 #include -static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_a_sse2(lv_16sc_t** result, const lv_16sc_t* local_code, float* rem_code_phase_chips ,float code_phase_step_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_output_samples) +static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_a_sse2(lv_16sc_t** result, const lv_16sc_t* local_code, float* rem_code_phase_chips, float code_phase_step_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_output_samples) { - _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);//_MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO + _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); //_MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO unsigned int number; const unsigned int quarterPoints = num_output_samples / 4; lv_16sc_t** _result = result; - __VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; + __VOLK_ATTR_ALIGNED(16) + int local_code_chip_index[4]; float tmp_rem_code_phase_chips; - __m128 _rem_code_phase,_code_phase_step_chips; - __m128i _code_length_chips,_code_length_chips_minus1; - __m128 _code_phase_out,_code_phase_out_with_offset; + __m128 _rem_code_phase, _code_phase_step_chips; + __m128i _code_length_chips, _code_length_chips_minus1; + __m128 _code_phase_out, 
_code_phase_out_with_offset; - _code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips); //load float to all four float values in m128 register - __VOLK_ATTR_ALIGNED(16) int four_times_code_length_chips_minus1[4]; + _code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips); //load float to all four float values in m128 register + __VOLK_ATTR_ALIGNED(16) + int four_times_code_length_chips_minus1[4]; four_times_code_length_chips_minus1[0] = code_length_chips - 1; four_times_code_length_chips_minus1[1] = code_length_chips - 1; four_times_code_length_chips_minus1[2] = code_length_chips - 1; four_times_code_length_chips_minus1[3] = code_length_chips - 1; - __VOLK_ATTR_ALIGNED(16) int four_times_code_length_chips[4]; + __VOLK_ATTR_ALIGNED(16) + int four_times_code_length_chips[4]; four_times_code_length_chips[0] = code_length_chips; four_times_code_length_chips[1] = code_length_chips; four_times_code_length_chips[2] = code_length_chips; four_times_code_length_chips[3] = code_length_chips; - _code_length_chips = _mm_load_si128((__m128i*)&four_times_code_length_chips); //load float to all four float values in m128 register - _code_length_chips_minus1 = _mm_load_si128((__m128i*)&four_times_code_length_chips_minus1); //load float to all four float values in m128 register + _code_length_chips = _mm_load_si128((__m128i*)&four_times_code_length_chips); //load float to all four float values in m128 register + _code_length_chips_minus1 = _mm_load_si128((__m128i*)&four_times_code_length_chips_minus1); //load float to all four float values in m128 register - __m128i negative_indexes, overflow_indexes,_code_phase_out_int, _code_phase_out_int_neg,_code_phase_out_int_over; + __m128i negative_indexes, overflow_indexes, _code_phase_out_int, _code_phase_out_int_neg, _code_phase_out_int_over; __m128i zero = _mm_setzero_si128(); - __VOLK_ATTR_ALIGNED(16) float init_idx_float[4] = { 0.0f, 1.0f, 2.0f, 3.0f }; + __VOLK_ATTR_ALIGNED(16) + float init_idx_float[4] = {0.0f, 1.0f, 2.0f, 3.0f}; __m128 _4output_index = _mm_load_ps(init_idx_float); - __VOLK_ATTR_ALIGNED(16) float init_4constant_float[4] = { 4.0f, 4.0f, 4.0f, 4.0f }; + __VOLK_ATTR_ALIGNED(16) + float init_4constant_float[4] = {4.0f, 4.0f, 4.0f, 4.0f}; __m128 _4constant_float = _mm_load_ps(init_4constant_float); int current_vector = 0; int sample_idx = 0; - for(number = 0; number < quarterPoints; number++) + for (number = 0; number < quarterPoints; number++) { //common to all outputs - _code_phase_out = _mm_mul_ps(_code_phase_step_chips, _4output_index); //compute the code phase point with the phase step + _code_phase_out = _mm_mul_ps(_code_phase_step_chips, _4output_index); //compute the code phase point with the phase step //output vector dependant (different code phase offset) - for(current_vector = 0; current_vector < num_out_vectors; current_vector++) + for (current_vector = 0; current_vector < num_out_vectors; current_vector++) { - tmp_rem_code_phase_chips = rem_code_phase_chips[current_vector] - 0.5f; // adjust offset to perform correct rounding (chip transition at 0) - _rem_code_phase = _mm_load1_ps(&tmp_rem_code_phase_chips); //load float to all four float values in m128 register + tmp_rem_code_phase_chips = rem_code_phase_chips[current_vector] - 0.5f; // adjust offset to perform correct rounding (chip transition at 0) + _rem_code_phase = _mm_load1_ps(&tmp_rem_code_phase_chips); //load float to all four float values in m128 register - _code_phase_out_with_offset = _mm_add_ps(_code_phase_out, _rem_code_phase); //add the phase offset - 
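/*
 * The index wrap a few lines below is branchless: _mm_cmplt_epi32 /
 * _mm_cmpgt_epi32 produce an all-ones or all-zeros mask m per lane, and
 * r = a ^ (m & (b ^ a)) then picks b where the mask is set and a elsewhere,
 * so negative indexes get code_length_chips added and overflowing ones get
 * it subtracted without any per-lane branch. A scalar sketch of the select
 * idiom, assuming m is either 0 or ~0:
 *
 *     static int select_branchless(int m, int a, int b)
 *     {
 *         return a ^ (m & (b ^ a));   // m == 0 -> a, m == ~0 -> b
 *     }
 */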
_code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset); //convert to integer + _code_phase_out_with_offset = _mm_add_ps(_code_phase_out, _rem_code_phase); //add the phase offset + _code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset); //convert to integer - negative_indexes = _mm_cmplt_epi32(_code_phase_out_int, zero); //test for negative values - _code_phase_out_int_neg = _mm_add_epi32(_code_phase_out_int, _code_length_chips); //the negative values branch - _code_phase_out_int_neg = _mm_xor_si128(_code_phase_out_int, _mm_and_si128( negative_indexes, _mm_xor_si128( _code_phase_out_int_neg, _code_phase_out_int ))); + negative_indexes = _mm_cmplt_epi32(_code_phase_out_int, zero); //test for negative values + _code_phase_out_int_neg = _mm_add_epi32(_code_phase_out_int, _code_length_chips); //the negative values branch + _code_phase_out_int_neg = _mm_xor_si128(_code_phase_out_int, _mm_and_si128(negative_indexes, _mm_xor_si128(_code_phase_out_int_neg, _code_phase_out_int))); - overflow_indexes = _mm_cmpgt_epi32(_code_phase_out_int_neg, _code_length_chips_minus1); //test for overflow values - _code_phase_out_int_over = _mm_sub_epi32(_code_phase_out_int_neg, _code_length_chips); //the negative values branch - _code_phase_out_int_over = _mm_xor_si128(_code_phase_out_int_neg, _mm_and_si128( overflow_indexes, _mm_xor_si128( _code_phase_out_int_over, _code_phase_out_int_neg ))); + overflow_indexes = _mm_cmpgt_epi32(_code_phase_out_int_neg, _code_length_chips_minus1); //test for overflow values + _code_phase_out_int_over = _mm_sub_epi32(_code_phase_out_int_neg, _code_length_chips); //the negative values branch + _code_phase_out_int_over = _mm_xor_si128(_code_phase_out_int_neg, _mm_and_si128(overflow_indexes, _mm_xor_si128(_code_phase_out_int_over, _code_phase_out_int_neg))); - _mm_store_si128((__m128i*)local_code_chip_index, _code_phase_out_int_over); // Store the results back + _mm_store_si128((__m128i*)local_code_chip_index, _code_phase_out_int_over); // Store the results back //todo: optimize the local code lookup table with intrinsics, if possible _result[current_vector][sample_idx] = local_code[local_code_chip_index[0]]; @@ -169,9 +174,9 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_a_sse2(lv_16sc_t* sample_idx += 4; } - for(number = quarterPoints * 4; number < num_output_samples; number++) + for (number = quarterPoints * 4; number < num_output_samples; number++) { - for(current_vector = 0; current_vector < num_out_vectors; current_vector++) + for (current_vector = 0; current_vector < num_out_vectors; current_vector++) { local_code_chip_index[0] = (int)(code_phase_step_chips * (float)(number) + rem_code_phase_chips[current_vector]); if (local_code_chip_index[0] < 0.0) local_code_chip_index[0] += code_length_chips - 1; @@ -186,69 +191,74 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_a_sse2(lv_16sc_t* #ifdef LV_HAVE_SSE2 #include -static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_u_sse2(lv_16sc_t** result, const lv_16sc_t* local_code, float* rem_code_phase_chips ,float code_phase_step_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_output_samples) +static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_u_sse2(lv_16sc_t** result, const lv_16sc_t* local_code, float* rem_code_phase_chips, float code_phase_step_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_output_samples) { - _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);//_MM_ROUND_NEAREST, _MM_ROUND_DOWN, 
_MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO + _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); //_MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO unsigned int number; const unsigned int quarterPoints = num_output_samples / 4; lv_16sc_t** _result = result; - __VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; + __VOLK_ATTR_ALIGNED(16) + int local_code_chip_index[4]; float tmp_rem_code_phase_chips; - __m128 _rem_code_phase,_code_phase_step_chips; - __m128i _code_length_chips,_code_length_chips_minus1; - __m128 _code_phase_out,_code_phase_out_with_offset; + __m128 _rem_code_phase, _code_phase_step_chips; + __m128i _code_length_chips, _code_length_chips_minus1; + __m128 _code_phase_out, _code_phase_out_with_offset; - _code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips); //load float to all four float values in m128 register - __VOLK_ATTR_ALIGNED(16) int four_times_code_length_chips_minus1[4]; + _code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips); //load float to all four float values in m128 register + __VOLK_ATTR_ALIGNED(16) + int four_times_code_length_chips_minus1[4]; four_times_code_length_chips_minus1[0] = code_length_chips - 1; four_times_code_length_chips_minus1[1] = code_length_chips - 1; four_times_code_length_chips_minus1[2] = code_length_chips - 1; four_times_code_length_chips_minus1[3] = code_length_chips - 1; - __VOLK_ATTR_ALIGNED(16) int four_times_code_length_chips[4]; + __VOLK_ATTR_ALIGNED(16) + int four_times_code_length_chips[4]; four_times_code_length_chips[0] = code_length_chips; four_times_code_length_chips[1] = code_length_chips; four_times_code_length_chips[2] = code_length_chips; four_times_code_length_chips[3] = code_length_chips; - _code_length_chips = _mm_loadu_si128((__m128i*)&four_times_code_length_chips); //load float to all four float values in m128 register - _code_length_chips_minus1 = _mm_loadu_si128((__m128i*)&four_times_code_length_chips_minus1); //load float to all four float values in m128 register + _code_length_chips = _mm_loadu_si128((__m128i*)&four_times_code_length_chips); //load float to all four float values in m128 register + _code_length_chips_minus1 = _mm_loadu_si128((__m128i*)&four_times_code_length_chips_minus1); //load float to all four float values in m128 register - __m128i negative_indexes, overflow_indexes,_code_phase_out_int, _code_phase_out_int_neg,_code_phase_out_int_over; + __m128i negative_indexes, overflow_indexes, _code_phase_out_int, _code_phase_out_int_neg, _code_phase_out_int_over; __m128i zero = _mm_setzero_si128(); - __VOLK_ATTR_ALIGNED(16) float init_idx_float[4] = { 0.0f, 1.0f, 2.0f, 3.0f }; + __VOLK_ATTR_ALIGNED(16) + float init_idx_float[4] = {0.0f, 1.0f, 2.0f, 3.0f}; __m128 _4output_index = _mm_loadu_ps(init_idx_float); - __VOLK_ATTR_ALIGNED(16) float init_4constant_float[4] = { 4.0f, 4.0f, 4.0f, 4.0f }; + __VOLK_ATTR_ALIGNED(16) + float init_4constant_float[4] = {4.0f, 4.0f, 4.0f, 4.0f}; __m128 _4constant_float = _mm_loadu_ps(init_4constant_float); int current_vector = 0; int sample_idx = 0; - for(number = 0; number < quarterPoints; number++) + for (number = 0; number < quarterPoints; number++) { //common to all outputs - _code_phase_out = _mm_mul_ps(_code_phase_step_chips, _4output_index); //compute the code phase point with the phase step + _code_phase_out = _mm_mul_ps(_code_phase_step_chips, _4output_index); //compute the code phase point with the phase step //output vector dependant (different code phase offset) - for(current_vector = 0; current_vector < num_out_vectors; current_vector++) + 
for (current_vector = 0; current_vector < num_out_vectors; current_vector++) { - tmp_rem_code_phase_chips = rem_code_phase_chips[current_vector] - 0.5f; // adjust offset to perform correct rounding (chip transition at 0) - _rem_code_phase = _mm_load1_ps(&tmp_rem_code_phase_chips); //load float to all four float values in m128 register + tmp_rem_code_phase_chips = rem_code_phase_chips[current_vector] - 0.5f; // adjust offset to perform correct rounding (chip transition at 0) + _rem_code_phase = _mm_load1_ps(&tmp_rem_code_phase_chips); //load float to all four float values in m128 register - _code_phase_out_with_offset = _mm_add_ps(_code_phase_out, _rem_code_phase); //add the phase offset - _code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset); //convert to integer + _code_phase_out_with_offset = _mm_add_ps(_code_phase_out, _rem_code_phase); //add the phase offset + _code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset); //convert to integer - negative_indexes = _mm_cmplt_epi32(_code_phase_out_int, zero); //test for negative values - _code_phase_out_int_neg = _mm_add_epi32(_code_phase_out_int, _code_length_chips); //the negative values branch - _code_phase_out_int_neg = _mm_xor_si128(_code_phase_out_int, _mm_and_si128( negative_indexes, _mm_xor_si128( _code_phase_out_int_neg, _code_phase_out_int ))); + negative_indexes = _mm_cmplt_epi32(_code_phase_out_int, zero); //test for negative values + _code_phase_out_int_neg = _mm_add_epi32(_code_phase_out_int, _code_length_chips); //the negative values branch + _code_phase_out_int_neg = _mm_xor_si128(_code_phase_out_int, _mm_and_si128(negative_indexes, _mm_xor_si128(_code_phase_out_int_neg, _code_phase_out_int))); - overflow_indexes = _mm_cmpgt_epi32(_code_phase_out_int_neg, _code_length_chips_minus1); //test for overflow values - _code_phase_out_int_over = _mm_sub_epi32(_code_phase_out_int_neg, _code_length_chips); //the negative values branch - _code_phase_out_int_over = _mm_xor_si128(_code_phase_out_int_neg, _mm_and_si128( overflow_indexes, _mm_xor_si128( _code_phase_out_int_over, _code_phase_out_int_neg ))); + overflow_indexes = _mm_cmpgt_epi32(_code_phase_out_int_neg, _code_length_chips_minus1); //test for overflow values + _code_phase_out_int_over = _mm_sub_epi32(_code_phase_out_int_neg, _code_length_chips); //the negative values branch + _code_phase_out_int_over = _mm_xor_si128(_code_phase_out_int_neg, _mm_and_si128(overflow_indexes, _mm_xor_si128(_code_phase_out_int_over, _code_phase_out_int_neg))); - _mm_storeu_si128((__m128i*)local_code_chip_index, _code_phase_out_int_over); // Store the results back + _mm_storeu_si128((__m128i*)local_code_chip_index, _code_phase_out_int_over); // Store the results back //todo: optimize the local code lookup table with intrinsics, if possible _result[current_vector][sample_idx] = local_code[local_code_chip_index[0]]; @@ -260,9 +270,9 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_u_sse2(lv_16sc_t* sample_idx += 4; } - for(number = quarterPoints * 4; number < num_output_samples; number++) + for (number = quarterPoints * 4; number < num_output_samples; number++) { - for(current_vector = 0; current_vector < num_out_vectors; current_vector++) + for (current_vector = 0; current_vector < num_out_vectors; current_vector++) { local_code_chip_index[0] = (int)(code_phase_step_chips * (float)(number) + rem_code_phase_chips[current_vector]); if (local_code_chip_index[0] < 0.0) local_code_chip_index[0] += code_length_chips - 1; @@ -278,74 +288,79 @@ static inline void 
volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_u_sse2(lv_16sc_t* #ifdef LV_HAVE_NEON #include -static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_neon(lv_16sc_t** result, const lv_16sc_t* local_code, float* rem_code_phase_chips ,float code_phase_step_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_output_samples) +static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_neon(lv_16sc_t** result, const lv_16sc_t* local_code, float* rem_code_phase_chips, float code_phase_step_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_output_samples) { unsigned int number; const unsigned int quarterPoints = num_output_samples / 4; float32x4_t half = vdupq_n_f32(0.5f); lv_16sc_t** _result = result; - __VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; + __VOLK_ATTR_ALIGNED(16) + int local_code_chip_index[4]; float tmp_rem_code_phase_chips; float32x4_t _rem_code_phase, _code_phase_step_chips; int32x4_t _code_length_chips, _code_length_chips_minus1; float32x4_t _code_phase_out, _code_phase_out_with_offset; float32x4_t sign, PlusHalf, Round; - _code_phase_step_chips = vld1q_dup_f32(&code_phase_step_chips); //load float to all four float values in float32x4_t register - __VOLK_ATTR_ALIGNED(16) int four_times_code_length_chips_minus1[4]; + _code_phase_step_chips = vld1q_dup_f32(&code_phase_step_chips); //load float to all four float values in float32x4_t register + __VOLK_ATTR_ALIGNED(16) + int four_times_code_length_chips_minus1[4]; four_times_code_length_chips_minus1[0] = code_length_chips - 1; four_times_code_length_chips_minus1[1] = code_length_chips - 1; four_times_code_length_chips_minus1[2] = code_length_chips - 1; four_times_code_length_chips_minus1[3] = code_length_chips - 1; - __VOLK_ATTR_ALIGNED(16) int four_times_code_length_chips[4]; + __VOLK_ATTR_ALIGNED(16) + int four_times_code_length_chips[4]; four_times_code_length_chips[0] = code_length_chips; four_times_code_length_chips[1] = code_length_chips; four_times_code_length_chips[2] = code_length_chips; four_times_code_length_chips[3] = code_length_chips; - _code_length_chips = vld1q_s32((int32_t*)&four_times_code_length_chips); //load float to all four float values in float32x4_t register - _code_length_chips_minus1 = vld1q_s32((int32_t*)&four_times_code_length_chips_minus1); //load float to all four float values in float32x4_t register + _code_length_chips = vld1q_s32((int32_t*)&four_times_code_length_chips); //load float to all four float values in float32x4_t register + _code_length_chips_minus1 = vld1q_s32((int32_t*)&four_times_code_length_chips_minus1); //load float to all four float values in float32x4_t register - int32x4_t _code_phase_out_int, _code_phase_out_int_neg, _code_phase_out_int_over; + int32x4_t _code_phase_out_int, _code_phase_out_int_neg, _code_phase_out_int_over; uint32x4_t negative_indexes, overflow_indexes; int32x4_t zero = vmovq_n_s32(0); - __VOLK_ATTR_ALIGNED(16) float init_idx_float[4] = { 0.0f, 1.0f, 2.0f, 3.0f }; + __VOLK_ATTR_ALIGNED(16) + float init_idx_float[4] = {0.0f, 1.0f, 2.0f, 3.0f}; float32x4_t _4output_index = vld1q_f32(init_idx_float); - __VOLK_ATTR_ALIGNED(16) float init_4constant_float[4] = { 4.0f, 4.0f, 4.0f, 4.0f }; + __VOLK_ATTR_ALIGNED(16) + float init_4constant_float[4] = {4.0f, 4.0f, 4.0f, 4.0f}; float32x4_t _4constant_float = vld1q_f32(init_4constant_float); int current_vector = 0; int sample_idx = 0; - for(number = 0; number < quarterPoints; number++) + for (number = 0; number < quarterPoints; number++) { //common to all 
outputs - _code_phase_out = vmulq_f32(_code_phase_step_chips, _4output_index); //compute the code phase point with the phase step + _code_phase_out = vmulq_f32(_code_phase_step_chips, _4output_index); //compute the code phase point with the phase step //output vector dependant (different code phase offset) - for(current_vector = 0; current_vector < num_out_vectors; current_vector++) + for (current_vector = 0; current_vector < num_out_vectors; current_vector++) { - tmp_rem_code_phase_chips = rem_code_phase_chips[current_vector] - 0.5f; // adjust offset to perform correct rounding (chip transition at 0) - _rem_code_phase = vld1q_dup_f32(&tmp_rem_code_phase_chips); //load float to all four float values in float32x4_t register + tmp_rem_code_phase_chips = rem_code_phase_chips[current_vector] - 0.5f; // adjust offset to perform correct rounding (chip transition at 0) + _rem_code_phase = vld1q_dup_f32(&tmp_rem_code_phase_chips); //load float to all four float values in float32x4_t register - _code_phase_out_with_offset = vaddq_f32(_code_phase_out, _rem_code_phase); //add the phase offset + _code_phase_out_with_offset = vaddq_f32(_code_phase_out, _rem_code_phase); //add the phase offset //_code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset); //convert to integer sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(_code_phase_out_with_offset), 31))); PlusHalf = vaddq_f32(_code_phase_out_with_offset, half); Round = vsubq_f32(PlusHalf, sign); _code_phase_out_int = vcvtq_s32_f32(Round); - negative_indexes = vcltq_s32(_code_phase_out_int, zero); //test for negative values - _code_phase_out_int_neg = vaddq_s32(_code_phase_out_int, _code_length_chips); //the negative values branch - _code_phase_out_int_neg = veorq_s32(_code_phase_out_int, vandq_s32( (int32x4_t)negative_indexes, veorq_s32( _code_phase_out_int_neg, _code_phase_out_int ))); + negative_indexes = vcltq_s32(_code_phase_out_int, zero); //test for negative values + _code_phase_out_int_neg = vaddq_s32(_code_phase_out_int, _code_length_chips); //the negative values branch + _code_phase_out_int_neg = veorq_s32(_code_phase_out_int, vandq_s32((int32x4_t)negative_indexes, veorq_s32(_code_phase_out_int_neg, _code_phase_out_int))); - overflow_indexes = vcgtq_s32(_code_phase_out_int_neg, _code_length_chips_minus1); //test for overflow values - _code_phase_out_int_over = vsubq_s32(_code_phase_out_int_neg, _code_length_chips); //the negative values branch - _code_phase_out_int_over = veorq_s32(_code_phase_out_int_neg, vandq_s32( (int32x4_t)overflow_indexes, veorq_s32( _code_phase_out_int_over, _code_phase_out_int_neg ))); + overflow_indexes = vcgtq_s32(_code_phase_out_int_neg, _code_length_chips_minus1); //test for overflow values + _code_phase_out_int_over = vsubq_s32(_code_phase_out_int_neg, _code_length_chips); //the negative values branch + _code_phase_out_int_over = veorq_s32(_code_phase_out_int_neg, vandq_s32((int32x4_t)overflow_indexes, veorq_s32(_code_phase_out_int_over, _code_phase_out_int_neg))); - vst1q_s32((int32_t*)local_code_chip_index, _code_phase_out_int_over); // Store the results back + vst1q_s32((int32_t*)local_code_chip_index, _code_phase_out_int_over); // Store the results back //todo: optimize the local code lookup table with intrinsics, if possible _result[current_vector][sample_idx] = local_code[local_code_chip_index[0]]; @@ -357,9 +372,9 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_neon(lv_16sc_t** sample_idx += 4; } - for(number = quarterPoints * 4; number < num_output_samples; number++) + 
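/*
 * As in the other kernels in this file, the vector loop above covers
 * 4 * quarterPoints samples and the loop below finishes the remaining
 * num_output_samples % 4 samples in plain C, so any input length is handled
 * correctly. A generic sketch of the pattern, with hypothetical process4()
 * (SIMD body) and process1() (scalar body) standing in for the real work:
 *
 *     const unsigned int quarterPoints = num_points / 4;
 *     for (number = 0; number < quarterPoints; number++)
 *         process4(number);                        // 4 samples per iteration
 *     for (number = quarterPoints * 4; number < num_points; number++)
 *         process1(number);                        // scalar tail
 */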
for (number = quarterPoints * 4; number < num_output_samples; number++) { - for(current_vector = 0; current_vector < num_out_vectors; current_vector++) + for (current_vector = 0; current_vector < num_out_vectors; current_vector++) { local_code_chip_index[0] = (int)(code_phase_step_chips * (float)(number) + rem_code_phase_chips[current_vector]); if (local_code_chip_index[0] < 0.0) local_code_chip_index[0] += code_length_chips - 1; diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_index_max_32u.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_index_max_32u.h index af5e609cb..ace8271ea 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_index_max_32u.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_index_max_32u.h @@ -29,7 +29,6 @@ */ - /*! * \page volk_gnsssdr_32f_index_max_32u.h * @@ -63,7 +62,7 @@ static inline void volk_gnsssdr_32f_index_max_32u_a_avx(uint32_t* target, const float* src0, uint32_t num_points) { - if(num_points > 0) + if (num_points > 0) { uint32_t number = 0; const uint32_t quarterPoints = num_points / 8; @@ -71,7 +70,7 @@ static inline void volk_gnsssdr_32f_index_max_32u_a_avx(uint32_t* target, const float* inputPtr = (float*)src0; __m256 indexIncrementValues = _mm256_set1_ps(8); - __m256 currentIndexes = _mm256_set_ps(-1,-2,-3,-4,-5,-6,-7,-8); + __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8); float max = src0[0]; float index = 0; @@ -80,25 +79,28 @@ static inline void volk_gnsssdr_32f_index_max_32u_a_avx(uint32_t* target, const __m256 compareResults; __m256 currentValues; - __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8]; - __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8]; + __VOLK_ATTR_ALIGNED(32) + float maxValuesBuffer[8]; + __VOLK_ATTR_ALIGNED(32) + float maxIndexesBuffer[8]; - for(;number < quarterPoints; number++) + for (; number < quarterPoints; number++) { - currentValues = _mm256_load_ps(inputPtr); inputPtr += 8; + currentValues = _mm256_load_ps(inputPtr); + inputPtr += 8; currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues); compareResults = _mm256_cmp_ps(maxValues, currentValues, 0x1e); maxValuesIndex = _mm256_blendv_ps(currentIndexes, maxValuesIndex, compareResults); - maxValues = _mm256_blendv_ps(currentValues, maxValues, compareResults); + maxValues = _mm256_blendv_ps(currentValues, maxValues, compareResults); } // Calculate the largest value from the remaining 8 points _mm256_store_ps(maxValuesBuffer, maxValues); _mm256_store_ps(maxIndexesBuffer, maxValuesIndex); - for(number = 0; number < 8; number++) + for (number = 0; number < 8; number++) { - if(maxValuesBuffer[number] > max) + if (maxValuesBuffer[number] > max) { index = maxIndexesBuffer[number]; max = maxValuesBuffer[number]; @@ -106,9 +108,9 @@ static inline void volk_gnsssdr_32f_index_max_32u_a_avx(uint32_t* target, const } number = quarterPoints * 8; - for(;number < num_points; number++) + for (; number < num_points; number++) { - if(src0[number] > max) + if (src0[number] > max) { index = number; max = src0[number]; @@ -126,7 +128,7 @@ static inline void volk_gnsssdr_32f_index_max_32u_a_avx(uint32_t* target, const static inline void volk_gnsssdr_32f_index_max_32u_u_avx(uint32_t* target, const float* src0, uint32_t num_points) { - if(num_points > 0) + if (num_points > 0) { uint32_t number = 0; const uint32_t quarterPoints = num_points / 8; @@ -134,7 +136,7 @@ 
static inline void volk_gnsssdr_32f_index_max_32u_u_avx(uint32_t* target, const float* inputPtr = (float*)src0; __m256 indexIncrementValues = _mm256_set1_ps(8); - __m256 currentIndexes = _mm256_set_ps(-1,-2,-3,-4,-5,-6,-7,-8); + __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8); float max = src0[0]; float index = 0; @@ -143,25 +145,28 @@ static inline void volk_gnsssdr_32f_index_max_32u_u_avx(uint32_t* target, const __m256 compareResults; __m256 currentValues; - __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8]; - __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8]; + __VOLK_ATTR_ALIGNED(32) + float maxValuesBuffer[8]; + __VOLK_ATTR_ALIGNED(32) + float maxIndexesBuffer[8]; - for(;number < quarterPoints; number++) + for (; number < quarterPoints; number++) { - currentValues = _mm256_loadu_ps(inputPtr); inputPtr += 8; + currentValues = _mm256_loadu_ps(inputPtr); + inputPtr += 8; currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues); compareResults = _mm256_cmp_ps(maxValues, currentValues, 0x1e); maxValuesIndex = _mm256_blendv_ps(currentIndexes, maxValuesIndex, compareResults); - maxValues = _mm256_blendv_ps(currentValues, maxValues, compareResults); + maxValues = _mm256_blendv_ps(currentValues, maxValues, compareResults); } // Calculate the largest value from the remaining 8 points _mm256_store_ps(maxValuesBuffer, maxValues); _mm256_store_ps(maxIndexesBuffer, maxValuesIndex); - for(number = 0; number < 8; number++) + for (number = 0; number < 8; number++) { - if(maxValuesBuffer[number] > max) + if (maxValuesBuffer[number] > max) { index = maxIndexesBuffer[number]; max = maxValuesBuffer[number]; @@ -169,9 +174,9 @@ static inline void volk_gnsssdr_32f_index_max_32u_u_avx(uint32_t* target, const } number = quarterPoints * 8; - for(;number < num_points; number++) + for (; number < num_points; number++) { - if(src0[number] > max) + if (src0[number] > max) { index = number; max = src0[number]; @@ -185,11 +190,11 @@ static inline void volk_gnsssdr_32f_index_max_32u_u_avx(uint32_t* target, const #ifdef LV_HAVE_SSE4_1 -#include +#include static inline void volk_gnsssdr_32f_index_max_32u_a_sse4_1(uint32_t* target, const float* src0, uint32_t num_points) { - if(num_points > 0) + if (num_points > 0) { uint32_t number = 0; const uint32_t quarterPoints = num_points / 4; @@ -197,7 +202,7 @@ static inline void volk_gnsssdr_32f_index_max_32u_a_sse4_1(uint32_t* target, con float* inputPtr = (float*)src0; __m128 indexIncrementValues = _mm_set1_ps(4); - __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4); + __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4); float max = src0[0]; float index = 0; @@ -206,25 +211,28 @@ static inline void volk_gnsssdr_32f_index_max_32u_a_sse4_1(uint32_t* target, con __m128 compareResults; __m128 currentValues; - __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; - __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; + __VOLK_ATTR_ALIGNED(16) + float maxValuesBuffer[4]; + __VOLK_ATTR_ALIGNED(16) + float maxIndexesBuffer[4]; - for(;number < quarterPoints; number++) + for (; number < quarterPoints; number++) { - currentValues = _mm_load_ps(inputPtr); inputPtr += 4; + currentValues = _mm_load_ps(inputPtr); + inputPtr += 4; currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); compareResults = _mm_cmpgt_ps(maxValues, currentValues); maxValuesIndex = _mm_blendv_ps(currentIndexes, maxValuesIndex, compareResults); - maxValues = _mm_blendv_ps(currentValues, maxValues, compareResults); + maxValues = _mm_blendv_ps(currentValues, maxValues, 
compareResults); } // Calculate the largest value from the remaining 4 points _mm_store_ps(maxValuesBuffer, maxValues); _mm_store_ps(maxIndexesBuffer, maxValuesIndex); - for(number = 0; number < 4; number++) + for (number = 0; number < 4; number++) { - if(maxValuesBuffer[number] > max) + if (maxValuesBuffer[number] > max) { index = maxIndexesBuffer[number]; max = maxValuesBuffer[number]; @@ -232,9 +240,9 @@ static inline void volk_gnsssdr_32f_index_max_32u_a_sse4_1(uint32_t* target, con } number = quarterPoints * 4; - for(;number < num_points; number++) + for (; number < num_points; number++) { - if(src0[number] > max) + if (src0[number] > max) { index = number; max = src0[number]; @@ -248,11 +256,11 @@ static inline void volk_gnsssdr_32f_index_max_32u_a_sse4_1(uint32_t* target, con #ifdef LV_HAVE_SSE4_1 -#include +#include static inline void volk_gnsssdr_32f_index_max_32u_u_sse4_1(uint32_t* target, const float* src0, uint32_t num_points) { - if(num_points > 0) + if (num_points > 0) { uint32_t number = 0; const uint32_t quarterPoints = num_points / 4; @@ -260,7 +268,7 @@ static inline void volk_gnsssdr_32f_index_max_32u_u_sse4_1(uint32_t* target, con float* inputPtr = (float*)src0; __m128 indexIncrementValues = _mm_set1_ps(4); - __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4); + __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4); float max = src0[0]; float index = 0; @@ -269,25 +277,28 @@ static inline void volk_gnsssdr_32f_index_max_32u_u_sse4_1(uint32_t* target, con __m128 compareResults; __m128 currentValues; - __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; - __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; + __VOLK_ATTR_ALIGNED(16) + float maxValuesBuffer[4]; + __VOLK_ATTR_ALIGNED(16) + float maxIndexesBuffer[4]; - for(;number < quarterPoints; number++) + for (; number < quarterPoints; number++) { - currentValues = _mm_loadu_ps(inputPtr); inputPtr += 4; + currentValues = _mm_loadu_ps(inputPtr); + inputPtr += 4; currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); compareResults = _mm_cmpgt_ps(maxValues, currentValues); maxValuesIndex = _mm_blendv_ps(currentIndexes, maxValuesIndex, compareResults); - maxValues = _mm_blendv_ps(currentValues, maxValues, compareResults); + maxValues = _mm_blendv_ps(currentValues, maxValues, compareResults); } // Calculate the largest value from the remaining 4 points _mm_store_ps(maxValuesBuffer, maxValues); _mm_store_ps(maxIndexesBuffer, maxValuesIndex); - for(number = 0; number < 4; number++) + for (number = 0; number < 4; number++) { - if(maxValuesBuffer[number] > max) + if (maxValuesBuffer[number] > max) { index = maxIndexesBuffer[number]; max = maxValuesBuffer[number]; @@ -295,9 +306,9 @@ static inline void volk_gnsssdr_32f_index_max_32u_u_sse4_1(uint32_t* target, con } number = quarterPoints * 4; - for(;number < num_points; number++) + for (; number < num_points; number++) { - if(src0[number] > max) + if (src0[number] > max) { index = number; max = src0[number]; @@ -312,11 +323,11 @@ static inline void volk_gnsssdr_32f_index_max_32u_u_sse4_1(uint32_t* target, con #ifdef LV_HAVE_SSE -#include +#include static inline void volk_gnsssdr_32f_index_max_32u_a_sse(uint32_t* target, const float* src0, uint32_t num_points) { - if(num_points > 0) + if (num_points > 0) { uint32_t number = 0; const uint32_t quarterPoints = num_points / 4; @@ -324,7 +335,7 @@ static inline void volk_gnsssdr_32f_index_max_32u_a_sse(uint32_t* target, const float* inputPtr = (float*)src0; __m128 indexIncrementValues = _mm_set1_ps(4); - __m128 currentIndexes = 
_mm_set_ps(-1,-2,-3,-4); + __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4); float max = src0[0]; float index = 0; @@ -333,25 +344,28 @@ static inline void volk_gnsssdr_32f_index_max_32u_a_sse(uint32_t* target, const __m128 compareResults; __m128 currentValues; - __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; - __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; + __VOLK_ATTR_ALIGNED(16) + float maxValuesBuffer[4]; + __VOLK_ATTR_ALIGNED(16) + float maxIndexesBuffer[4]; - for(;number < quarterPoints; number++) + for (; number < quarterPoints; number++) { - currentValues = _mm_load_ps(inputPtr); inputPtr += 4; + currentValues = _mm_load_ps(inputPtr); + inputPtr += 4; currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); compareResults = _mm_cmpgt_ps(maxValues, currentValues); - maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, maxValuesIndex) , _mm_andnot_ps(compareResults, currentIndexes)); - maxValues = _mm_or_ps(_mm_and_ps(compareResults, maxValues) , _mm_andnot_ps(compareResults, currentValues)); + maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, maxValuesIndex), _mm_andnot_ps(compareResults, currentIndexes)); + maxValues = _mm_or_ps(_mm_and_ps(compareResults, maxValues), _mm_andnot_ps(compareResults, currentValues)); } // Calculate the largest value from the remaining 4 points _mm_store_ps(maxValuesBuffer, maxValues); _mm_store_ps(maxIndexesBuffer, maxValuesIndex); - for(number = 0; number < 4; number++) + for (number = 0; number < 4; number++) { - if(maxValuesBuffer[number] > max) + if (maxValuesBuffer[number] > max) { index = maxIndexesBuffer[number]; max = maxValuesBuffer[number]; @@ -359,9 +373,9 @@ static inline void volk_gnsssdr_32f_index_max_32u_a_sse(uint32_t* target, const } number = quarterPoints * 4; - for(;number < num_points; number++) + for (; number < num_points; number++) { - if(src0[number] > max) + if (src0[number] > max) { index = number; max = src0[number]; @@ -376,11 +390,11 @@ static inline void volk_gnsssdr_32f_index_max_32u_a_sse(uint32_t* target, const #ifdef LV_HAVE_SSE -#include +#include static inline void volk_gnsssdr_32f_index_max_32u_u_sse(uint32_t* target, const float* src0, uint32_t num_points) { - if(num_points > 0) + if (num_points > 0) { uint32_t number = 0; const uint32_t quarterPoints = num_points / 4; @@ -388,7 +402,7 @@ static inline void volk_gnsssdr_32f_index_max_32u_u_sse(uint32_t* target, const float* inputPtr = (float*)src0; __m128 indexIncrementValues = _mm_set1_ps(4); - __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4); + __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4); float max = src0[0]; float index = 0; @@ -397,25 +411,28 @@ static inline void volk_gnsssdr_32f_index_max_32u_u_sse(uint32_t* target, const __m128 compareResults; __m128 currentValues; - __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; - __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; + __VOLK_ATTR_ALIGNED(16) + float maxValuesBuffer[4]; + __VOLK_ATTR_ALIGNED(16) + float maxIndexesBuffer[4]; - for(;number < quarterPoints; number++) + for (; number < quarterPoints; number++) { - currentValues = _mm_loadu_ps(inputPtr); inputPtr += 4; + currentValues = _mm_loadu_ps(inputPtr); + inputPtr += 4; currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); compareResults = _mm_cmpgt_ps(maxValues, currentValues); - maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, maxValuesIndex) , _mm_andnot_ps(compareResults, currentIndexes)); - maxValues = _mm_or_ps(_mm_and_ps(compareResults, maxValues) , _mm_andnot_ps(compareResults, currentValues)); + 
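/*
 * Plain SSE has no blend instruction, so the max/argmax update below
 * emulates _mm_blendv_ps with the classic and/andnot/or select:
 * (mask & keep) | (~mask & take). The SSE4.1 variants earlier in this file
 * express the same update directly with _mm_blendv_ps. A scalar sketch,
 * assuming mask is 0 or ~0 per lane:
 *
 *     static float select_by_mask(int mask, float keep, float take)
 *     {
 *         return mask ? keep : take;   // vector form: (m & keep) | (~m & take)
 *     }
 */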
maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, maxValuesIndex), _mm_andnot_ps(compareResults, currentIndexes)); + maxValues = _mm_or_ps(_mm_and_ps(compareResults, maxValues), _mm_andnot_ps(compareResults, currentValues)); } // Calculate the largest value from the remaining 4 points _mm_store_ps(maxValuesBuffer, maxValues); _mm_store_ps(maxIndexesBuffer, maxValuesIndex); - for(number = 0; number < 4; number++) + for (number = 0; number < 4; number++) { - if(maxValuesBuffer[number] > max) + if (maxValuesBuffer[number] > max) { index = maxIndexesBuffer[number]; max = maxValuesBuffer[number]; @@ -423,9 +440,9 @@ static inline void volk_gnsssdr_32f_index_max_32u_u_sse(uint32_t* target, const } number = quarterPoints * 4; - for(;number < num_points; number++) + for (; number < num_points; number++) { - if(src0[number] > max) + if (src0[number] > max) { index = number; max = src0[number]; @@ -442,16 +459,16 @@ static inline void volk_gnsssdr_32f_index_max_32u_u_sse(uint32_t* target, const static inline void volk_gnsssdr_32f_index_max_32u_generic(uint32_t* target, const float* src0, uint32_t num_points) { - if(num_points > 0) + if (num_points > 0) { float max = src0[0]; uint32_t index = 0; uint32_t i = 1; - for(; i < num_points; ++i) + for (; i < num_points; ++i) { - if(src0[i] > max) + if (src0[i] > max) { index = i; max = src0[i]; @@ -469,14 +486,15 @@ static inline void volk_gnsssdr_32f_index_max_32u_generic(uint32_t* target, cons static inline void volk_gnsssdr_32f_index_max_32u_neon(uint32_t* target, const float* src0, uint32_t num_points) { - if(num_points > 0) + if (num_points > 0) { uint32_t number = 0; const uint32_t quarterPoints = num_points / 4; float* inputPtr = (float*)src0; float32x4_t indexIncrementValues = vdupq_n_f32(4); - __VOLK_ATTR_ALIGNED(16) float currentIndexes_float[4] = { -4.0f, -3.0f, -2.0f, -1.0f }; + __VOLK_ATTR_ALIGNED(16) + float currentIndexes_float[4] = {-4.0f, -3.0f, -2.0f, -1.0f}; float32x4_t currentIndexes = vld1q_f32(currentIndexes_float); float max = src0[0]; @@ -487,25 +505,28 @@ static inline void volk_gnsssdr_32f_index_max_32u_neon(uint32_t* target, const f uint32x4_t currentIndexes_u; float32x4_t currentValues; - __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; - __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; + __VOLK_ATTR_ALIGNED(16) + float maxValuesBuffer[4]; + __VOLK_ATTR_ALIGNED(16) + float maxIndexesBuffer[4]; - for(;number < quarterPoints; number++) + for (; number < quarterPoints; number++) { - currentValues = vld1q_f32(inputPtr); inputPtr += 4; - currentIndexes = vaddq_f32(currentIndexes, indexIncrementValues); + currentValues = vld1q_f32(inputPtr); + inputPtr += 4; + currentIndexes = vaddq_f32(currentIndexes, indexIncrementValues); currentIndexes_u = vcvtq_u32_f32(currentIndexes); - compareResults = vcgtq_f32( maxValues, currentValues); - maxValuesIndex = vorrq_u32( vandq_u32( compareResults, maxValuesIndex ), vbicq_u32(currentIndexes_u, compareResults) ); - maxValues = vmaxq_f32(currentValues, maxValues); + compareResults = vcgtq_f32(maxValues, currentValues); + maxValuesIndex = vorrq_u32(vandq_u32(compareResults, maxValuesIndex), vbicq_u32(currentIndexes_u, compareResults)); + maxValues = vmaxq_f32(currentValues, maxValues); } // Calculate the largest value from the remaining 4 points vst1q_f32(maxValuesBuffer, maxValues); vst1q_f32(maxIndexesBuffer, vcvtq_f32_u32(maxValuesIndex)); - for(number = 0; number < 4; number++) + for (number = 0; number < 4; number++) { - if(maxValuesBuffer[number] > max) + if (maxValuesBuffer[number] > 
max) { index = maxIndexesBuffer[number]; max = maxValuesBuffer[number]; @@ -513,9 +534,9 @@ static inline void volk_gnsssdr_32f_index_max_32u_neon(uint32_t* target, const f } number = quarterPoints * 4; - for(;number < num_points; number++) + for (; number < num_points; number++) { - if(src0[number] > max) + if (src0[number] > max) { index = number; max = src0[number]; @@ -528,4 +549,3 @@ static inline void volk_gnsssdr_32f_index_max_32u_neon(uint32_t* target, const f #endif /*LV_HAVE_NEON*/ #endif /*INCLUDED_volk_gnsssdr_32f_index_max_32u_H*/ - diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_resamplerxnpuppet_32f.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_resamplerxnpuppet_32f.h index cf2a80f52..b425ecb9b 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_resamplerxnpuppet_32f.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_resamplerxnpuppet_32f.h @@ -42,31 +42,30 @@ #include - #ifdef LV_HAVE_GENERIC static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_generic(float* result, const float* local_code, unsigned int num_points) { int code_length_chips = 2046; - float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; - float shifts_chips[3] = { -0.1, 0.0, 0.1 }; + float shifts_chips[3] = {-0.1, 0.0, 0.1}; - float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_out_vectors; n++) - { - result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment()); - } + float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment()); + for (n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment()); + } volk_gnsssdr_32f_xn_resampler_32f_xn_generic(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); memcpy((float*)result, (float*)result_aux[0], sizeof(float) * num_points); - for(n = 0; n < num_out_vectors; n++) - { - volk_gnsssdr_free(result_aux[n]); - } + for (n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } volk_gnsssdr_free(result_aux); } @@ -77,26 +76,26 @@ static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_generic(float* result, static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_a_sse3(float* result, const float* local_code, unsigned int num_points) { int code_length_chips = 2046; - float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; - float shifts_chips[3] = { -0.1, 0.0, 0.1 }; + float shifts_chips[3] = {-0.1, 0.0, 0.1}; - float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_out_vectors; n++) - { - result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment()); - } + float** result_aux = 
(float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment()); + for (n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment()); + } volk_gnsssdr_32f_xn_resampler_32f_xn_a_sse3(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); memcpy((float*)result, (float*)result_aux[0], sizeof(float) * num_points); - for(n = 0; n < num_out_vectors; n++) - { - volk_gnsssdr_free(result_aux[n]); - } + for (n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } volk_gnsssdr_free(result_aux); } @@ -106,26 +105,26 @@ static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_a_sse3(float* result, static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_u_sse3(float* result, const float* local_code, unsigned int num_points) { int code_length_chips = 2046; - float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; - float shifts_chips[3] = { -0.1, 0.0, 0.1 }; + float shifts_chips[3] = {-0.1, 0.0, 0.1}; - float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_out_vectors; n++) - { - result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment()); - } + float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment()); + for (n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment()); + } volk_gnsssdr_32f_xn_resampler_32f_xn_u_sse3(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); memcpy((float*)result, (float*)result_aux[0], sizeof(float) * num_points); - for(n = 0; n < num_out_vectors; n++) - { - volk_gnsssdr_free(result_aux[n]); - } + for (n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } volk_gnsssdr_free(result_aux); } @@ -136,26 +135,26 @@ static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_u_sse3(float* result, static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_u_sse4_1(float* result, const float* local_code, unsigned int num_points) { int code_length_chips = 2046; - float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; - float shifts_chips[3] = { -0.1, 0.0, 0.1 }; + float shifts_chips[3] = {-0.1, 0.0, 0.1}; - float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_out_vectors; n++) - { - result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment()); - } + float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment()); + for (n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment()); + } volk_gnsssdr_32f_xn_resampler_32f_xn_u_sse4_1(result_aux, local_code, rem_code_phase_chips, 
code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); memcpy((float*)result, (float*)result_aux[0], sizeof(float) * num_points); - for(n = 0; n < num_out_vectors; n++) - { - volk_gnsssdr_free(result_aux[n]); - } + for (n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } volk_gnsssdr_free(result_aux); } @@ -165,26 +164,26 @@ static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_u_sse4_1(float* result static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_a_sse4_1(float* result, const float* local_code, unsigned int num_points) { int code_length_chips = 2046; - float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; - float shifts_chips[3] = { -0.1, 0.0, 0.1 }; + float shifts_chips[3] = {-0.1, 0.0, 0.1}; - float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_out_vectors; n++) - { - result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment()); - } + float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment()); + for (n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment()); + } volk_gnsssdr_32f_xn_resampler_32f_xn_a_sse4_1(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); memcpy((float*)result, (float*)result_aux[0], sizeof(float) * num_points); - for(n = 0; n < num_out_vectors; n++) - { - volk_gnsssdr_free(result_aux[n]); - } + for (n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } volk_gnsssdr_free(result_aux); } @@ -194,26 +193,26 @@ static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_a_sse4_1(float* result static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_a_avx(float* result, const float* local_code, unsigned int num_points) { int code_length_chips = 2046; - float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; - float shifts_chips[3] = { -0.1, 0.0, 0.1 }; + float shifts_chips[3] = {-0.1, 0.0, 0.1}; - float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_out_vectors; n++) - { - result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment()); - } + float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment()); + for (n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment()); + } volk_gnsssdr_32f_xn_resampler_32f_xn_a_avx(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); memcpy((float*)result, (float*)result_aux[0], sizeof(float) * num_points); - for(n = 0; n < num_out_vectors; n++) - { - volk_gnsssdr_free(result_aux[n]); - } + for (n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } 
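/*
 * Every volk_gnsssdr_32f_resamplerxnpuppet_32f_* variant in this file
 * follows the shape seen here: the puppet fixes the resampler parameters to
 * constants, allocates num_out_vectors aligned work buffers, runs the
 * multi-output kernel, copies vector 0 into the caller's single result
 * buffer, frees the per-vector buffers (the loop above), and finally
 * releases the pointer array itself (the volk_gnsssdr_free just below).
 * A condensed sketch of the pattern, with kernel() standing in for the
 * real volk_gnsssdr_32f_xn_resampler_32f_xn_* call:
 *
 *     float** aux = (float**)volk_gnsssdr_malloc(
 *         sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment());
 *     for (n = 0; n < num_out_vectors; n++)
 *         aux[n] = (float*)volk_gnsssdr_malloc(
 *             sizeof(float) * num_points, volk_gnsssdr_get_alignment());
 *     kernel(aux, local_code, num_points);   // fixed test parameters elided
 *     memcpy(result, aux[0], sizeof(float) * num_points);
 *     for (n = 0; n < num_out_vectors; n++) volk_gnsssdr_free(aux[n]);
 *     volk_gnsssdr_free(aux);
 */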
volk_gnsssdr_free(result_aux); } #endif @@ -223,26 +222,26 @@ static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_a_avx(float* result, c static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_u_avx(float* result, const float* local_code, unsigned int num_points) { int code_length_chips = 2046; - float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; - float shifts_chips[3] = { -0.1, 0.0, 0.1 }; + float shifts_chips[3] = {-0.1, 0.0, 0.1}; - float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_out_vectors; n++) - { - result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment()); - } + float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment()); + for (n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment()); + } volk_gnsssdr_32f_xn_resampler_32f_xn_u_avx(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); memcpy((float*)result, (float*)result_aux[0], sizeof(float) * num_points); - for(n = 0; n < num_out_vectors; n++) - { - volk_gnsssdr_free(result_aux[n]); - } + for (n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } volk_gnsssdr_free(result_aux); } #endif @@ -251,29 +250,28 @@ static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_u_avx(float* result, c static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_neon(float* result, const float* local_code, unsigned int num_points) { int code_length_chips = 2046; - float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; - float shifts_chips[3] = { -0.1, 0.0, 0.1 }; + float shifts_chips[3] = {-0.1, 0.0, 0.1}; - float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_out_vectors; n++) - { - result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment()); - } + float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment()); + for (n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment()); + } volk_gnsssdr_32f_xn_resampler_32f_xn_neon(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); memcpy((float*)result, (float*)result_aux[0], sizeof(float) * num_points); - for(n = 0; n < num_out_vectors; n++) - { - volk_gnsssdr_free(result_aux[n]); - } + for (n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } volk_gnsssdr_free(result_aux); } #endif -#endif // INCLUDED_volk_gnsssdr_32f_resamplerpuppet_32f_H - +#endif // INCLUDED_volk_gnsssdr_32f_resamplerpuppet_32f_H diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_sincos_32fc.h 
b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_sincos_32fc.h index 5568976e8..b067c5f3d 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_sincos_32fc.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_sincos_32fc.h @@ -97,7 +97,7 @@ static inline void volk_gnsssdr_32f_sincos_32fc_u_sse4_1(lv_32fc_t* out, const f cp4 = _mm_set1_ps(0.49603e-4); cp5 = _mm_set1_ps(0.551e-6); - for(;number < quarterPoints; number++) + for (; number < quarterPoints; number++) { aVal = _mm_loadu_ps(aPtr); __VOLK_GNSSSDR_PREFETCH(aPtr + 8); @@ -108,12 +108,12 @@ static inline void volk_gnsssdr_32f_sincos_32fc_u_sse4_1(lv_32fc_t* out, const f s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A)); s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B)); - s = _mm_div_ps(s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction + s = _mm_div_ps(s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction s = _mm_mul_ps(s, s); // Evaluate Taylor series s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s); - for(i = 0; i < 3; i++) + for (i = 0; i < 3; i++) { s = _mm_mul_ps(s, _mm_sub_ps(ffours, s)); } @@ -145,7 +145,7 @@ static inline void volk_gnsssdr_32f_sincos_32fc_u_sse4_1(lv_32fc_t* out, const f } number = quarterPoints * 4; - for(;number < num_points; number++) + for (; number < num_points; number++) { float _in = *aPtr++; *bPtr++ = lv_cmake(cosf(_in), sinf(_in)); @@ -191,7 +191,7 @@ static inline void volk_gnsssdr_32f_sincos_32fc_a_sse4_1(lv_32fc_t* out, const f cp4 = _mm_set1_ps(0.49603e-4); cp5 = _mm_set1_ps(0.551e-6); - for(;number < quarterPoints; number++) + for (; number < quarterPoints; number++) { aVal = _mm_load_ps(aPtr); __VOLK_GNSSSDR_PREFETCH(aPtr + 8); @@ -202,12 +202,12 @@ static inline void volk_gnsssdr_32f_sincos_32fc_a_sse4_1(lv_32fc_t* out, const f s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A)); s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B)); - s = _mm_div_ps(s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction + s = _mm_div_ps(s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction s = _mm_mul_ps(s, s); // Evaluate Taylor series s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s); - for(i = 0; i < 3; i++) + for (i = 0; i < 3; i++) { s = _mm_mul_ps(s, _mm_sub_ps(ffours, s)); } @@ -239,7 +239,7 @@ static inline void volk_gnsssdr_32f_sincos_32fc_a_sse4_1(lv_32fc_t* out, const f } number = quarterPoints * 4; - for(;number < num_points; number++) + for (; number < num_points; number++) { float _in = *aPtr++; *bPtr++ = lv_cmake(cosf(_in), sinf(_in)); @@ -265,31 +265,49 @@ static inline void volk_gnsssdr_32f_sincos_32fc_a_sse2(lv_32fc_t* out, const flo __m128 sine, cosine, aux, x; __m128 xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y; - __m128i emm0, emm2, emm4; + __m128i emm0, emm2, emm4; /* declare some SSE constants */ - __VOLK_ATTR_ALIGNED(16) static const int _ps_inv_sign_mask[4] = { ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000 }; - __VOLK_ATTR_ALIGNED(16) static const int _ps_sign_mask[4] = { (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000 }; + __VOLK_ATTR_ALIGNED(16) + static const int _ps_inv_sign_mask[4] = {~0x80000000, 
~0x80000000, ~0x80000000, ~0x80000000}; + __VOLK_ATTR_ALIGNED(16) + static const int _ps_sign_mask[4] = {(int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000}; - __VOLK_ATTR_ALIGNED(16) static const float _ps_cephes_FOPI[4] = { 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516 }; - __VOLK_ATTR_ALIGNED(16) static const int _pi32_1[4] = { 1, 1, 1, 1 }; - __VOLK_ATTR_ALIGNED(16) static const int _pi32_inv1[4] = { ~1, ~1, ~1, ~1 }; - __VOLK_ATTR_ALIGNED(16) static const int _pi32_2[4] = { 2, 2, 2, 2}; - __VOLK_ATTR_ALIGNED(16) static const int _pi32_4[4] = { 4, 4, 4, 4}; + __VOLK_ATTR_ALIGNED(16) + static const float _ps_cephes_FOPI[4] = {1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516}; + __VOLK_ATTR_ALIGNED(16) + static const int _pi32_1[4] = {1, 1, 1, 1}; + __VOLK_ATTR_ALIGNED(16) + static const int _pi32_inv1[4] = {~1, ~1, ~1, ~1}; + __VOLK_ATTR_ALIGNED(16) + static const int _pi32_2[4] = {2, 2, 2, 2}; + __VOLK_ATTR_ALIGNED(16) + static const int _pi32_4[4] = {4, 4, 4, 4}; - __VOLK_ATTR_ALIGNED(16) static const float _ps_minus_cephes_DP1[4] = { -0.78515625, -0.78515625, -0.78515625, -0.78515625 }; - __VOLK_ATTR_ALIGNED(16) static const float _ps_minus_cephes_DP2[4] = { -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4 }; - __VOLK_ATTR_ALIGNED(16) static const float _ps_minus_cephes_DP3[4] = { -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8 }; - __VOLK_ATTR_ALIGNED(16) static const float _ps_coscof_p0[4] = { 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005 }; - __VOLK_ATTR_ALIGNED(16) static const float _ps_coscof_p1[4] = { -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003 }; - __VOLK_ATTR_ALIGNED(16) static const float _ps_coscof_p2[4] = { 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002 }; - __VOLK_ATTR_ALIGNED(16) static const float _ps_sincof_p0[4] = { -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4 }; - __VOLK_ATTR_ALIGNED(16) static const float _ps_sincof_p1[4] = { 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3 }; - __VOLK_ATTR_ALIGNED(16) static const float _ps_sincof_p2[4] = { -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1 }; - __VOLK_ATTR_ALIGNED(16) static const float _ps_0p5[4] = { 0.5f, 0.5f, 0.5f, 0.5f }; - __VOLK_ATTR_ALIGNED(16) static const float _ps_1[4] = { 1.0f, 1.0f, 1.0f, 1.0f }; + __VOLK_ATTR_ALIGNED(16) + static const float _ps_minus_cephes_DP1[4] = {-0.78515625, -0.78515625, -0.78515625, -0.78515625}; + __VOLK_ATTR_ALIGNED(16) + static const float _ps_minus_cephes_DP2[4] = {-2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4}; + __VOLK_ATTR_ALIGNED(16) + static const float _ps_minus_cephes_DP3[4] = {-3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8}; + __VOLK_ATTR_ALIGNED(16) + static const float _ps_coscof_p0[4] = {2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005}; + __VOLK_ATTR_ALIGNED(16) + static const float _ps_coscof_p1[4] = {-1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003}; + __VOLK_ATTR_ALIGNED(16) + static const float _ps_coscof_p2[4] = {4.166664568298827E-002, 4.166664568298827E-002, 
4.166664568298827E-002, 4.166664568298827E-002}; + __VOLK_ATTR_ALIGNED(16) + static const float _ps_sincof_p0[4] = {-1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4}; + __VOLK_ATTR_ALIGNED(16) + static const float _ps_sincof_p1[4] = {8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3}; + __VOLK_ATTR_ALIGNED(16) + static const float _ps_sincof_p2[4] = {-1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1}; + __VOLK_ATTR_ALIGNED(16) + static const float _ps_0p5[4] = {0.5f, 0.5f, 0.5f, 0.5f}; + __VOLK_ATTR_ALIGNED(16) + static const float _ps_1[4] = {1.0f, 1.0f, 1.0f, 1.0f}; - for(;number < sse_iters; number++) + for (; number < sse_iters; number++) { x = _mm_load_ps(aPtr); __VOLK_GNSSSDR_PREFETCH(aPtr + 8); @@ -307,19 +325,19 @@ static inline void volk_gnsssdr_32f_sincos_32fc_a_sse2(lv_32fc_t* out, const flo emm2 = _mm_cvttps_epi32(y); /* j=(j+1) & (~1) (see the cephes sources) */ - emm2 = _mm_add_epi32(emm2, *(__m128i *)_pi32_1); - emm2 = _mm_and_si128(emm2, *(__m128i *)_pi32_inv1); + emm2 = _mm_add_epi32(emm2, *(__m128i*)_pi32_1); + emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_inv1); y = _mm_cvtepi32_ps(emm2); emm4 = emm2; /* get the swap sign flag for the sine */ - emm0 = _mm_and_si128(emm2, *(__m128i *)_pi32_4); + emm0 = _mm_and_si128(emm2, *(__m128i*)_pi32_4); emm0 = _mm_slli_epi32(emm0, 29); __m128 swap_sign_bit_sin = _mm_castsi128_ps(emm0); /* get the polynom selection mask for the sine*/ - emm2 = _mm_and_si128(emm2, *(__m128i *)_pi32_2); + emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_2); emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128()); __m128 poly_mask = _mm_castsi128_ps(emm2); @@ -335,15 +353,15 @@ static inline void volk_gnsssdr_32f_sincos_32fc_a_sse2(lv_32fc_t* out, const flo x = _mm_add_ps(x, xmm2); x = _mm_add_ps(x, xmm3); - emm4 = _mm_sub_epi32(emm4, *(__m128i *)_pi32_2); - emm4 = _mm_andnot_si128(emm4, *(__m128i *)_pi32_4); + emm4 = _mm_sub_epi32(emm4, *(__m128i*)_pi32_2); + emm4 = _mm_andnot_si128(emm4, *(__m128i*)_pi32_4); emm4 = _mm_slli_epi32(emm4, 29); __m128 sign_bit_cos = _mm_castsi128_ps(emm4); sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin); /* Evaluate the first polynom (0 <= x <= Pi/4) */ - __m128 z = _mm_mul_ps(x,x); + __m128 z = _mm_mul_ps(x, x); y = *(__m128*)_ps_coscof_p0; y = _mm_mul_ps(y, z); @@ -371,11 +389,11 @@ static inline void volk_gnsssdr_32f_sincos_32fc_a_sse2(lv_32fc_t* out, const flo xmm3 = poly_mask; __m128 ysin2 = _mm_and_ps(xmm3, y2); __m128 ysin1 = _mm_andnot_ps(xmm3, y); - y2 = _mm_sub_ps(y2,ysin2); + y2 = _mm_sub_ps(y2, ysin2); y = _mm_sub_ps(y, ysin1); - xmm1 = _mm_add_ps(ysin1,ysin2); - xmm2 = _mm_add_ps(y,y2); + xmm1 = _mm_add_ps(ysin1, ysin2); + xmm2 = _mm_add_ps(y, y2); /* update the sign */ sine = _mm_xor_ps(xmm1, sign_bit_sin); @@ -392,12 +410,11 @@ static inline void volk_gnsssdr_32f_sincos_32fc_a_sse2(lv_32fc_t* out, const flo aPtr += 4; } - for(number = sse_iters * 4; number < num_points; number++) + for (number = sse_iters * 4; number < num_points; number++) { _in = *aPtr++; - *bPtr++ = lv_cmake((float)cosf(_in), (float)sinf(_in) ); + *bPtr++ = lv_cmake((float)cosf(_in), (float)sinf(_in)); } - } #endif /* LV_HAVE_SSE2 */ @@ -418,31 +435,49 @@ static inline void volk_gnsssdr_32f_sincos_32fc_u_sse2(lv_32fc_t* out, const flo __m128 sine, cosine, aux, x; __m128 xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y; - __m128i emm0, emm2, emm4; + __m128i emm0, emm2, emm4; /* declare some SSE constants */ - __VOLK_ATTR_ALIGNED(16) static const int _ps_inv_sign_mask[4] = { 
~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000 }; - __VOLK_ATTR_ALIGNED(16) static const int _ps_sign_mask[4] = { (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000 }; + __VOLK_ATTR_ALIGNED(16) + static const int _ps_inv_sign_mask[4] = {~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000}; + __VOLK_ATTR_ALIGNED(16) + static const int _ps_sign_mask[4] = {(int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000}; - __VOLK_ATTR_ALIGNED(16) static const float _ps_cephes_FOPI[4] = { 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516 }; - __VOLK_ATTR_ALIGNED(16) static const int _pi32_1[4] = { 1, 1, 1, 1 }; - __VOLK_ATTR_ALIGNED(16) static const int _pi32_inv1[4] = { ~1, ~1, ~1, ~1 }; - __VOLK_ATTR_ALIGNED(16) static const int _pi32_2[4] = { 2, 2, 2, 2}; - __VOLK_ATTR_ALIGNED(16) static const int _pi32_4[4] = { 4, 4, 4, 4}; + __VOLK_ATTR_ALIGNED(16) + static const float _ps_cephes_FOPI[4] = {1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516}; + __VOLK_ATTR_ALIGNED(16) + static const int _pi32_1[4] = {1, 1, 1, 1}; + __VOLK_ATTR_ALIGNED(16) + static const int _pi32_inv1[4] = {~1, ~1, ~1, ~1}; + __VOLK_ATTR_ALIGNED(16) + static const int _pi32_2[4] = {2, 2, 2, 2}; + __VOLK_ATTR_ALIGNED(16) + static const int _pi32_4[4] = {4, 4, 4, 4}; - __VOLK_ATTR_ALIGNED(16) static const float _ps_minus_cephes_DP1[4] = { -0.78515625, -0.78515625, -0.78515625, -0.78515625 }; - __VOLK_ATTR_ALIGNED(16) static const float _ps_minus_cephes_DP2[4] = { -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4 }; - __VOLK_ATTR_ALIGNED(16) static const float _ps_minus_cephes_DP3[4] = { -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8 }; - __VOLK_ATTR_ALIGNED(16) static const float _ps_coscof_p0[4] = { 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005 }; - __VOLK_ATTR_ALIGNED(16) static const float _ps_coscof_p1[4] = { -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003 }; - __VOLK_ATTR_ALIGNED(16) static const float _ps_coscof_p2[4] = { 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002 }; - __VOLK_ATTR_ALIGNED(16) static const float _ps_sincof_p0[4] = { -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4 }; - __VOLK_ATTR_ALIGNED(16) static const float _ps_sincof_p1[4] = { 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3 }; - __VOLK_ATTR_ALIGNED(16) static const float _ps_sincof_p2[4] = { -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1 }; - __VOLK_ATTR_ALIGNED(16) static const float _ps_0p5[4] = { 0.5f, 0.5f, 0.5f, 0.5f }; - __VOLK_ATTR_ALIGNED(16) static const float _ps_1[4] = { 1.0f, 1.0f, 1.0f, 1.0f }; + __VOLK_ATTR_ALIGNED(16) + static const float _ps_minus_cephes_DP1[4] = {-0.78515625, -0.78515625, -0.78515625, -0.78515625}; + __VOLK_ATTR_ALIGNED(16) + static const float _ps_minus_cephes_DP2[4] = {-2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4}; + __VOLK_ATTR_ALIGNED(16) + static const float _ps_minus_cephes_DP3[4] = {-3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8}; + __VOLK_ATTR_ALIGNED(16) + static const float _ps_coscof_p0[4] = {2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005}; + 
__VOLK_ATTR_ALIGNED(16) + static const float _ps_coscof_p1[4] = {-1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003}; + __VOLK_ATTR_ALIGNED(16) + static const float _ps_coscof_p2[4] = {4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002}; + __VOLK_ATTR_ALIGNED(16) + static const float _ps_sincof_p0[4] = {-1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4}; + __VOLK_ATTR_ALIGNED(16) + static const float _ps_sincof_p1[4] = {8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3}; + __VOLK_ATTR_ALIGNED(16) + static const float _ps_sincof_p2[4] = {-1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1}; + __VOLK_ATTR_ALIGNED(16) + static const float _ps_0p5[4] = {0.5f, 0.5f, 0.5f, 0.5f}; + __VOLK_ATTR_ALIGNED(16) + static const float _ps_1[4] = {1.0f, 1.0f, 1.0f, 1.0f}; - for(;number < sse_iters; number++) + for (; number < sse_iters; number++) { x = _mm_loadu_ps(aPtr); __VOLK_GNSSSDR_PREFETCH(aPtr + 8); @@ -460,19 +495,19 @@ static inline void volk_gnsssdr_32f_sincos_32fc_u_sse2(lv_32fc_t* out, const flo emm2 = _mm_cvttps_epi32(y); /* j=(j+1) & (~1) (see the cephes sources) */ - emm2 = _mm_add_epi32(emm2, *(__m128i *)_pi32_1); - emm2 = _mm_and_si128(emm2, *(__m128i *)_pi32_inv1); + emm2 = _mm_add_epi32(emm2, *(__m128i*)_pi32_1); + emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_inv1); y = _mm_cvtepi32_ps(emm2); emm4 = emm2; /* get the swap sign flag for the sine */ - emm0 = _mm_and_si128(emm2, *(__m128i *)_pi32_4); + emm0 = _mm_and_si128(emm2, *(__m128i*)_pi32_4); emm0 = _mm_slli_epi32(emm0, 29); __m128 swap_sign_bit_sin = _mm_castsi128_ps(emm0); /* get the polynom selection mask for the sine*/ - emm2 = _mm_and_si128(emm2, *(__m128i *)_pi32_2); + emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_2); emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128()); __m128 poly_mask = _mm_castsi128_ps(emm2); @@ -488,15 +523,15 @@ static inline void volk_gnsssdr_32f_sincos_32fc_u_sse2(lv_32fc_t* out, const flo x = _mm_add_ps(x, xmm2); x = _mm_add_ps(x, xmm3); - emm4 = _mm_sub_epi32(emm4, *(__m128i *)_pi32_2); - emm4 = _mm_andnot_si128(emm4, *(__m128i *)_pi32_4); + emm4 = _mm_sub_epi32(emm4, *(__m128i*)_pi32_2); + emm4 = _mm_andnot_si128(emm4, *(__m128i*)_pi32_4); emm4 = _mm_slli_epi32(emm4, 29); __m128 sign_bit_cos = _mm_castsi128_ps(emm4); sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin); /* Evaluate the first polynom (0 <= x <= Pi/4) */ - __m128 z = _mm_mul_ps(x,x); + __m128 z = _mm_mul_ps(x, x); y = *(__m128*)_ps_coscof_p0; y = _mm_mul_ps(y, z); @@ -524,11 +559,11 @@ static inline void volk_gnsssdr_32f_sincos_32fc_u_sse2(lv_32fc_t* out, const flo xmm3 = poly_mask; __m128 ysin2 = _mm_and_ps(xmm3, y2); __m128 ysin1 = _mm_andnot_ps(xmm3, y); - y2 = _mm_sub_ps(y2,ysin2); + y2 = _mm_sub_ps(y2, ysin2); y = _mm_sub_ps(y, ysin1); - xmm1 = _mm_add_ps(ysin1,ysin2); - xmm2 = _mm_add_ps(y,y2); + xmm1 = _mm_add_ps(ysin1, ysin2); + xmm2 = _mm_add_ps(y, y2); /* update the sign */ sine = _mm_xor_ps(xmm1, sign_bit_sin); @@ -545,12 +580,11 @@ static inline void volk_gnsssdr_32f_sincos_32fc_u_sse2(lv_32fc_t* out, const flo aPtr += 4; } - for(number = sse_iters * 4; number < num_points; number++) + for (number = sse_iters * 4; number < num_points; number++) { _in = *aPtr++; - *bPtr++ = lv_cmake((float)cosf(_in), (float)sinf(_in) ); + *bPtr++ = lv_cmake((float)cosf(_in), (float)sinf(_in)); } - } #endif /* LV_HAVE_SSE2 */ @@ -561,10 +595,10 @@ static inline void 
volk_gnsssdr_32f_sincos_32fc_generic(lv_32fc_t* out, const fl { float _in; unsigned int i; - for(i = 0; i < num_points; i++) + for (i = 0; i < num_points; i++) { _in = *in++; - *out++ = lv_cmake((float)cosf(_in), (float)sinf(_in) ); + *out++ = lv_cmake((float)cosf(_in), (float)sinf(_in)); } } @@ -586,12 +620,12 @@ static inline void volk_gnsssdr_32f_sincos_32fc_generic_fxpt(lv_32fc_t* out, con const int32_t diffbits = bitlength - Nbits; uint32_t ux; unsigned int i; - for(i = 0; i < num_points; i++) + for (i = 0; i < num_points; i++) { _in = *in++; d = (int32_t)floor(_in / TWO_PI + 0.5); _in -= d * TWO_PI; - x = (int32_t) ((float)_in * TWO_TO_THE_31_DIV_PI); + x = (int32_t)((float)_in * TWO_TO_THE_31_DIV_PI); ux = x; sin_index = ux >> diffbits; @@ -601,7 +635,7 @@ static inline void volk_gnsssdr_32f_sincos_32fc_generic_fxpt(lv_32fc_t* out, con cos_index = ux >> diffbits; c = sine_table_10bits[cos_index][0] * (ux >> 1) + sine_table_10bits[cos_index][1]; - *out++ = lv_cmake((float)c, (float)s ); + *out++ = lv_cmake((float)c, (float)s); } } @@ -637,7 +671,7 @@ static inline void volk_gnsssdr_32f_sincos_32fc_neon(lv_32fc_t* out, const float uint32x4_t emm2, poly_mask, sign_mask_sin, sign_mask_cos; - for(;number < neon_iters; number++) + for (; number < neon_iters; number++) { x = vld1q_f32(aPtr); __VOLK_GNSSSDR_PREFETCH(aPtr + 8); @@ -677,7 +711,7 @@ static inline void volk_gnsssdr_32f_sincos_32fc_neon(lv_32fc_t* out, const float /* Evaluate the first polynom (0 <= x <= Pi/4) in y1, and the second polynom (Pi/4 <= x <= 0) in y2 */ - z = vmulq_f32(x,x); + z = vmulq_f32(x, x); y1 = vmulq_n_f32(z, c_coscof_p0); y2 = vmulq_n_f32(z, c_sincof_p0); @@ -706,10 +740,10 @@ static inline void volk_gnsssdr_32f_sincos_32fc_neon(lv_32fc_t* out, const float aPtr += 4; } - for(number = neon_iters * 4; number < num_points; number++) + for (number = neon_iters * 4; number < num_points; number++) { _in = *aPtr++; - *bPtr++ = lv_cmake((float)cosf(_in), (float)sinf(_in) ); + *bPtr++ = lv_cmake((float)cosf(_in), (float)sinf(_in)); } } diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_xn_resampler_32f_xn.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_xn_resampler_32f_xn.h index 1fa95e0e6..f130032ea 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_xn_resampler_32f_xn.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32f_xn_resampler_32f_xn.h @@ -110,7 +110,8 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_a_sse3(float** result, c const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips); const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips); - __VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; + __VOLK_ATTR_ALIGNED(16) + int local_code_chip_index[4]; int local_code_chip_index_; const __m128i zeros = _mm_setzero_si128(); @@ -124,7 +125,7 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_a_sse3(float** result, c shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]); aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); __m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f); - for(n = 0; n < quarterPoints; n++) + for (n = 0; n < quarterPoints; n++) { aux = _mm_mul_ps(code_phase_step_chips_reg, indexn); aux = _mm_add_ps(aux, aux2); @@ -145,25 +146,25 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_a_sse3(float** result, c 
aux_i = _mm_and_si128(code_length_chips_reg_i, negatives); local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i); _mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg); - for(k = 0; k < 4; ++k) + for (k = 0; k < 4; ++k) { _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]]; } indexn = _mm_add_ps(indexn, fours); } - for(n = quarterPoints * 4; n < num_points; n++) + for (n = quarterPoints * 4; n < num_points; n++) { // resample code for current tap local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); //Take into account that in multitap correlators, the shifts can be negative! - if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ; + if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); local_code_chip_index_ = local_code_chip_index_ % code_length_chips; _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; } } } -#endif +#endif #ifdef LV_HAVE_SSE3 @@ -180,7 +181,8 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_u_sse3(float** result, c const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips); const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips); - __VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; + __VOLK_ATTR_ALIGNED(16) + int local_code_chip_index[4]; int local_code_chip_index_; const __m128i zeros = _mm_setzero_si128(); @@ -194,7 +196,7 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_u_sse3(float** result, c shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]); aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); __m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f); - for(n = 0; n < quarterPoints; n++) + for (n = 0; n < quarterPoints; n++) { aux = _mm_mul_ps(code_phase_step_chips_reg, indexn); aux = _mm_add_ps(aux, aux2); @@ -215,18 +217,18 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_u_sse3(float** result, c aux_i = _mm_and_si128(code_length_chips_reg_i, negatives); local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i); _mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg); - for(k = 0; k < 4; ++k) + for (k = 0; k < 4; ++k) { _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]]; } indexn = _mm_add_ps(indexn, fours); } - for(n = quarterPoints * 4; n < num_points; n++) + for (n = quarterPoints * 4; n < num_points; n++) { // resample code for current tap local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); //Take into account that in multitap correlators, the shifts can be negative! 
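/* Annotation on the scalar tails above and below: the wrap is needed because C's % operator truncates toward zero, so for a negative index i, i % L lies in (-L, 0]. Adding a sufficiently large multiple of L first makes the operand non-negative, and the final % then lands in [0, L). A minimal scalar sketch of the same wrap (helper name is mine; abs requires <stdlib.h>): */
static inline int wrap_code_chip_index(int i, int L)
{
    if (i < 0) i += L * (abs(i) / L + 1); /* lift into the non-negative range */
    return i % L;                         /* now guaranteed to be in [0, L) */
}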
- if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ; + if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); local_code_chip_index_ = local_code_chip_index_ % code_length_chips; _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; } @@ -248,7 +250,8 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_a_sse4_1(float** result, const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips); const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips); - __VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; + __VOLK_ATTR_ALIGNED(16) + int local_code_chip_index[4]; int local_code_chip_index_; const __m128i zeros = _mm_setzero_si128(); @@ -262,7 +265,7 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_a_sse4_1(float** result, shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]); aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); __m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f); - for(n = 0; n < quarterPoints; n++) + for (n = 0; n < quarterPoints; n++) { aux = _mm_mul_ps(code_phase_step_chips_reg, indexn); aux = _mm_add_ps(aux, aux2); @@ -280,25 +283,25 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_a_sse4_1(float** result, aux_i = _mm_and_si128(code_length_chips_reg_i, negatives); local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i); _mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg); - for(k = 0; k < 4; ++k) + for (k = 0; k < 4; ++k) { _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]]; } indexn = _mm_add_ps(indexn, fours); } - for(n = quarterPoints * 4; n < num_points; n++) + for (n = quarterPoints * 4; n < num_points; n++) { // resample code for current tap local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); //Take into account that in multitap correlators, the shifts can be negative! 
- if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ; + if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); local_code_chip_index_ = local_code_chip_index_ % code_length_chips; _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; } } } -#endif +#endif #ifdef LV_HAVE_SSE4_1 @@ -314,7 +317,8 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_u_sse4_1(float** result, const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips); const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips); - __VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; + __VOLK_ATTR_ALIGNED(16) + int local_code_chip_index[4]; int local_code_chip_index_; const __m128i zeros = _mm_setzero_si128(); @@ -328,7 +332,7 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_u_sse4_1(float** result, shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]); aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); __m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f); - for(n = 0; n < quarterPoints; n++) + for (n = 0; n < quarterPoints; n++) { aux = _mm_mul_ps(code_phase_step_chips_reg, indexn); aux = _mm_add_ps(aux, aux2); @@ -346,18 +350,18 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_u_sse4_1(float** result, aux_i = _mm_and_si128(code_length_chips_reg_i, negatives); local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i); _mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg); - for(k = 0; k < 4; ++k) + for (k = 0; k < 4; ++k) { _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]]; } indexn = _mm_add_ps(indexn, fours); } - for(n = quarterPoints * 4; n < num_points; n++) + for (n = quarterPoints * 4; n < num_points; n++) { // resample code for current tap local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); //Take into account that in multitap correlators, the shifts can be negative! 
- if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ; + if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); local_code_chip_index_ = local_code_chip_index_ % code_length_chips; _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; } @@ -380,7 +384,8 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_a_avx(float** result, co const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips); const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips); - __VOLK_ATTR_ALIGNED(32) int local_code_chip_index[8]; + __VOLK_ATTR_ALIGNED(32) + int local_code_chip_index[8]; int local_code_chip_index_; const __m256 zeros = _mm256_setzero_ps(); @@ -395,7 +400,7 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_a_avx(float** result, co shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]); aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); indexn = n0; - for(n = 0; n < avx_iters; n++) + for (n = 0; n < avx_iters; n++) { __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0); __VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3); @@ -413,13 +418,13 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_a_avx(float** result, co // no negatives c = _mm256_cvtepi32_ps(local_code_chip_index_reg); - negatives = _mm256_cmp_ps(c, zeros, 0x01 ); + negatives = _mm256_cmp_ps(c, zeros, 0x01); aux3 = _mm256_and_ps(code_length_chips_reg_f, negatives); aux = _mm256_add_ps(c, aux3); local_code_chip_index_reg = _mm256_cvttps_epi32(aux); _mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg); - for(k = 0; k < 8; ++k) + for (k = 0; k < 8; ++k) { _result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]]; } @@ -429,12 +434,12 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_a_avx(float** result, co _mm256_zeroupper(); for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++) { - for(n = avx_iters * 8; n < num_points; n++) + for (n = avx_iters * 8; n < num_points; n++) { // resample code for current tap local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); //Take into account that in multitap correlators, the shifts can be negative! 
- if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ; + if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); local_code_chip_index_ = local_code_chip_index_ % code_length_chips; _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; } @@ -457,7 +462,8 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_u_avx(float** result, co const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips); const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips); - __VOLK_ATTR_ALIGNED(32) int local_code_chip_index[8]; + __VOLK_ATTR_ALIGNED(32) + int local_code_chip_index[8]; int local_code_chip_index_; const __m256 zeros = _mm256_setzero_ps(); @@ -472,7 +478,7 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_u_avx(float** result, co shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]); aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); indexn = n0; - for(n = 0; n < avx_iters; n++) + for (n = 0; n < avx_iters; n++) { __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0); __VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3); @@ -490,13 +496,13 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_u_avx(float** result, co // no negatives c = _mm256_cvtepi32_ps(local_code_chip_index_reg); - negatives = _mm256_cmp_ps(c, zeros, 0x01 ); + negatives = _mm256_cmp_ps(c, zeros, 0x01); aux3 = _mm256_and_ps(code_length_chips_reg_f, negatives); aux = _mm256_add_ps(c, aux3); local_code_chip_index_reg = _mm256_cvttps_epi32(aux); _mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg); - for(k = 0; k < 8; ++k) + for (k = 0; k < 8; ++k) { _result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]]; } @@ -506,12 +512,12 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_u_avx(float** result, co _mm256_zeroupper(); for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++) { - for(n = avx_iters * 8; n < num_points; n++) + for (n = avx_iters * 8; n < num_points; n++) { // resample code for current tap local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); //Take into account that in multitap correlators, the shifts can be negative! 
- if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ; + if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); local_code_chip_index_ = local_code_chip_index_ % code_length_chips; _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; } @@ -536,19 +542,21 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_neon(float** result, con const float32x4_t rem_code_phase_chips_reg = vdupq_n_f32(rem_code_phase_chips); const float32x4_t code_phase_step_chips_reg = vdupq_n_f32(code_phase_step_chips); - __VOLK_ATTR_ALIGNED(16) int32_t local_code_chip_index[4]; + __VOLK_ATTR_ALIGNED(16) + int32_t local_code_chip_index[4]; int32_t local_code_chip_index_; const int32x4_t zeros = vdupq_n_s32(0); const float32x4_t code_length_chips_reg_f = vdupq_n_f32((float)code_length_chips); const int32x4_t code_length_chips_reg_i = vdupq_n_s32((int32_t)code_length_chips); - int32x4_t local_code_chip_index_reg, aux_i, negatives, i; + int32x4_t local_code_chip_index_reg, aux_i, negatives, i; float32x4_t aux, aux2, shifts_chips_reg, fi, c, j, cTrunc, base, indexn, reciprocal; - __VOLK_ATTR_ALIGNED(16) const float vec[4] = { 0.0f, 1.0f, 2.0f, 3.0f }; + __VOLK_ATTR_ALIGNED(16) + const float vec[4] = {0.0f, 1.0f, 2.0f, 3.0f}; uint32x4_t igx; reciprocal = vrecpeq_f32(code_length_chips_reg_f); reciprocal = vmulq_f32(vrecpsq_f32(code_length_chips_reg_f, reciprocal), reciprocal); - reciprocal = vmulq_f32(vrecpsq_f32(code_length_chips_reg_f, reciprocal), reciprocal); // this refinement is required! + reciprocal = vmulq_f32(vrecpsq_f32(code_length_chips_reg_f, reciprocal), reciprocal); // this refinement is required! 
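/* Note on the "this refinement is required!" comment above: vrecpeq_f32 only returns a coarse estimate of 1/x (roughly 8 significant bits), and each vrecpsq_f32/vmulq_f32 pair performs one Newton-Raphson iteration, roughly doubling the number of correct bits; hence two iterations before the reciprocal is accurate enough for the truncation-based fmod below. A scalar sketch of the same refinement (helper name is mine; r0 stands in for the hardware estimate): */
static inline float refine_reciprocal(float d, float r0)
{
    float r = r0 * (2.0f - d * r0); /* vrecpsq_f32(d, r) computes 2 - d*r */
    r = r * (2.0f - d * r);         /* second iteration: near full single precision */
    return r;
}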
float32x4_t n0 = vld1q_f32((float*)vec); for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++) @@ -556,7 +564,7 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_neon(float** result, con shifts_chips_reg = vdupq_n_f32((float)shifts_chips[current_correlator_tap]); aux2 = vsubq_f32(shifts_chips_reg, rem_code_phase_chips_reg); indexn = n0; - for(n = 0; n < neon_iters; n++) + for (n = 0; n < neon_iters; n++) { __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][4 * n + 3], 1, 0); __VOLK_GNSSSDR_PREFETCH(&local_code_chip_index[4]); @@ -572,7 +580,7 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_neon(float** result, con // fmod c = vmulq_f32(aux, reciprocal); - i = vcvtq_s32_f32(c); + i = vcvtq_s32_f32(c); cTrunc = vcvtq_f32_s32(i); base = vmulq_f32(cTrunc, code_length_chips_reg_f); aux = vsubq_f32(aux, base); @@ -584,13 +592,13 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_neon(float** result, con vst1q_s32((int32_t*)local_code_chip_index, local_code_chip_index_reg); - for(k = 0; k < 4; ++k) + for (k = 0; k < 4; ++k) { _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]]; } indexn = vaddq_f32(indexn, fours); } - for(n = neon_iters * 4; n < num_points; n++) + for (n = neon_iters * 4; n < num_points; n++) { __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][n], 1, 0); // resample code for current tap @@ -606,5 +614,3 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_neon(float** result, con #endif #endif /*INCLUDED_volk_gnsssdr_32f_xn_resampler_32f_xn_H*/ - - diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn.h index e8831a97f..211d979cf 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn.h @@ -85,11 +85,11 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic(lv_32f unsigned int n; for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - result[n_vec] = lv_cmake(0,0); + result[n_vec] = lv_cmake(0, 0); } for (n = 0; n < num_points; n++) { - tmp32_1 = *in_common++ * (*phase);//if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); + tmp32_1 = *in_common++ * (*phase); //if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); // Regenerate phase if (n % 256 == 0) @@ -126,7 +126,7 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic_reload unsigned int j; for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - result[n_vec] = lv_cmake(0,0); + result[n_vec] = lv_cmake(0, 0); } for (n = 0; n < num_points / ROTATOR_RELOAD; n++) @@ -141,7 +141,7 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic_reload result[n_vec] += tmp32_2; } } - /* Regenerate phase */ + /* Regenerate phase */ #ifdef __cplusplus (*phase) /= std::abs((*phase)); #else @@ -175,8 +175,8 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_ const unsigned int sixteenthPoints = num_points / 16; const float* aPtr = (float*)in_common; - const float* bPtr[ num_a_vectors]; - for( vec_ind = 0; vec_ind < 
num_a_vectors; ++vec_ind ) + const float* bPtr[num_a_vectors]; + for (vec_ind = 0; vec_ind < num_a_vectors; ++vec_ind) { bPtr[vec_ind] = in_a[vec_ind]; } @@ -194,7 +194,7 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_ __m256 dotProdVal2[num_a_vectors]; __m256 dotProdVal3[num_a_vectors]; - for( vec_ind = 0; vec_ind < num_a_vectors; vec_ind++ ) + for (vec_ind = 0; vec_ind < num_a_vectors; vec_ind++) { dotProdVal0[vec_ind] = _mm256_setzero_ps(); dotProdVal1[vec_ind] = _mm256_setzero_ps(); @@ -204,57 +204,62 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_ // Set up the complex rotator __m256 z0, z1, z2, z3; - __VOLK_ATTR_ALIGNED(32) lv_32fc_t phase_vec[16]; - for( vec_ind = 0; vec_ind < 16; ++vec_ind ) + __VOLK_ATTR_ALIGNED(32) + lv_32fc_t phase_vec[16]; + for (vec_ind = 0; vec_ind < 16; ++vec_ind) { phase_vec[vec_ind] = _phase; _phase *= phase_inc; } - z0 = _mm256_load_ps( (float *)phase_vec ); - z1 = _mm256_load_ps( (float *)(phase_vec + 4) ); - z2 = _mm256_load_ps( (float *)(phase_vec + 8) ); - z3 = _mm256_load_ps( (float *)(phase_vec + 12) ); + z0 = _mm256_load_ps((float*)phase_vec); + z1 = _mm256_load_ps((float*)(phase_vec + 4)); + z2 = _mm256_load_ps((float*)(phase_vec + 8)); + z3 = _mm256_load_ps((float*)(phase_vec + 12)); - lv_32fc_t dz = phase_inc; dz *= dz; dz *= dz; dz *= dz; dz *= dz; // dz = phase_inc^16; + lv_32fc_t dz = phase_inc; + dz *= dz; + dz *= dz; + dz *= dz; + dz *= dz; // dz = phase_inc^16; - for( vec_ind = 0; vec_ind < 4; ++vec_ind ) + for (vec_ind = 0; vec_ind < 4; ++vec_ind) { phase_vec[vec_ind] = dz; } - __m256 dz_reg = _mm256_load_ps( (float *)phase_vec ); - dz_reg = _mm256_complexnormalise_ps( dz_reg ); + __m256 dz_reg = _mm256_load_ps((float*)phase_vec); + dz_reg = _mm256_complexnormalise_ps(dz_reg); - for(;number < sixteenthPoints; number++) + for (; number < sixteenthPoints; number++) { a0Val = _mm256_loadu_ps(aPtr); - a1Val = _mm256_loadu_ps(aPtr+8); - a2Val = _mm256_loadu_ps(aPtr+16); - a3Val = _mm256_loadu_ps(aPtr+24); + a1Val = _mm256_loadu_ps(aPtr + 8); + a2Val = _mm256_loadu_ps(aPtr + 16); + a3Val = _mm256_loadu_ps(aPtr + 24); - a0Val = _mm256_complexmul_ps( a0Val, z0 ); - a1Val = _mm256_complexmul_ps( a1Val, z1 ); - a2Val = _mm256_complexmul_ps( a2Val, z2 ); - a3Val = _mm256_complexmul_ps( a3Val, z3 ); + a0Val = _mm256_complexmul_ps(a0Val, z0); + a1Val = _mm256_complexmul_ps(a1Val, z1); + a2Val = _mm256_complexmul_ps(a2Val, z2); + a3Val = _mm256_complexmul_ps(a3Val, z3); - z0 = _mm256_complexmul_ps( z0, dz_reg ); - z1 = _mm256_complexmul_ps( z1, dz_reg ); - z2 = _mm256_complexmul_ps( z2, dz_reg ); - z3 = _mm256_complexmul_ps( z3, dz_reg ); + z0 = _mm256_complexmul_ps(z0, dz_reg); + z1 = _mm256_complexmul_ps(z1, dz_reg); + z2 = _mm256_complexmul_ps(z2, dz_reg); + z3 = _mm256_complexmul_ps(z3, dz_reg); - for( vec_ind = 0; vec_ind < num_a_vectors; ++vec_ind ) + for (vec_ind = 0; vec_ind < num_a_vectors; ++vec_ind) { - x0Val[vec_ind] = _mm256_loadu_ps(bPtr[vec_ind]); // t0|t1|t2|t3|t4|t5|t6|t7 - x1Val[vec_ind] = _mm256_loadu_ps(bPtr[vec_ind]+8); - x0loVal[vec_ind] = _mm256_unpacklo_ps(x0Val[vec_ind], x0Val[vec_ind]); // t0|t0|t1|t1|t4|t4|t5|t5 - x0hiVal[vec_ind] = _mm256_unpackhi_ps(x0Val[vec_ind], x0Val[vec_ind]); // t2|t2|t3|t3|t6|t6|t7|t7 + x0Val[vec_ind] = _mm256_loadu_ps(bPtr[vec_ind]); // t0|t1|t2|t3|t4|t5|t6|t7 + x1Val[vec_ind] = _mm256_loadu_ps(bPtr[vec_ind] + 8); + x0loVal[vec_ind] = _mm256_unpacklo_ps(x0Val[vec_ind], x0Val[vec_ind]); // t0|t0|t1|t1|t4|t4|t5|t5 + 
x0hiVal[vec_ind] = _mm256_unpackhi_ps(x0Val[vec_ind], x0Val[vec_ind]); // t2|t2|t3|t3|t6|t6|t7|t7 x1loVal[vec_ind] = _mm256_unpacklo_ps(x1Val[vec_ind], x1Val[vec_ind]); x1hiVal[vec_ind] = _mm256_unpackhi_ps(x1Val[vec_ind], x1Val[vec_ind]); // TODO: it may be possible to rearrange swizzling to better pipeline data - b0Val[vec_ind] = _mm256_permute2f128_ps(x0loVal[vec_ind], x0hiVal[vec_ind], 0x20); // t0|t0|t1|t1|t2|t2|t3|t3 - b1Val[vec_ind] = _mm256_permute2f128_ps(x0loVal[vec_ind], x0hiVal[vec_ind], 0x31); // t4|t4|t5|t5|t6|t6|t7|t7 + b0Val[vec_ind] = _mm256_permute2f128_ps(x0loVal[vec_ind], x0hiVal[vec_ind], 0x20); // t0|t0|t1|t1|t2|t2|t3|t3 + b1Val[vec_ind] = _mm256_permute2f128_ps(x0loVal[vec_ind], x0hiVal[vec_ind], 0x31); // t4|t4|t5|t5|t6|t6|t7|t7 b2Val[vec_ind] = _mm256_permute2f128_ps(x1loVal[vec_ind], x1hiVal[vec_ind], 0x20); b3Val[vec_ind] = _mm256_permute2f128_ps(x1loVal[vec_ind], x1hiVal[vec_ind], 0x31); @@ -274,43 +279,44 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_ // Force the rotators back onto the unit circle if ((number % 64) == 0) { - z0 = _mm256_complexnormalise_ps( z0 ); - z1 = _mm256_complexnormalise_ps( z1 ); - z2 = _mm256_complexnormalise_ps( z2 ); - z3 = _mm256_complexnormalise_ps( z3 ); + z0 = _mm256_complexnormalise_ps(z0); + z1 = _mm256_complexnormalise_ps(z1); + z2 = _mm256_complexnormalise_ps(z2); + z3 = _mm256_complexnormalise_ps(z3); } aPtr += 32; } - __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4]; + __VOLK_ATTR_ALIGNED(32) + lv_32fc_t dotProductVector[4]; - for( vec_ind = 0; vec_ind < num_a_vectors; ++vec_ind ) + for (vec_ind = 0; vec_ind < num_a_vectors; ++vec_ind) { dotProdVal0[vec_ind] = _mm256_add_ps(dotProdVal0[vec_ind], dotProdVal1[vec_ind]); dotProdVal0[vec_ind] = _mm256_add_ps(dotProdVal0[vec_ind], dotProdVal2[vec_ind]); dotProdVal0[vec_ind] = _mm256_add_ps(dotProdVal0[vec_ind], dotProdVal3[vec_ind]); - _mm256_store_ps((float *)dotProductVector, dotProdVal0[vec_ind]); // Store the results back into the dot product vector + _mm256_store_ps((float*)dotProductVector, dotProdVal0[vec_ind]); // Store the results back into the dot product vector - result[ vec_ind ] = lv_cmake( 0, 0 ); - for( i = 0; i < 4; ++i ) + result[vec_ind] = lv_cmake(0, 0); + for (i = 0; i < 4; ++i) { result[vec_ind] += dotProductVector[i]; } } - z0 = _mm256_complexnormalise_ps( z0 ); + z0 = _mm256_complexnormalise_ps(z0); _mm256_store_ps((float*)phase_vec, z0); _phase = phase_vec[0]; _mm256_zeroupper(); - number = sixteenthPoints*16; - for(;number < num_points; number++) + number = sixteenthPoints * 16; + for (; number < num_points; number++) { - wo = (*aPtr++)*_phase; + wo = (*aPtr++) * _phase; _phase *= phase_inc; - for( vec_ind = 0; vec_ind < num_a_vectors; ++vec_ind ) + for (vec_ind = 0; vec_ind < num_a_vectors; ++vec_ind) { result[vec_ind] += wo * in_a[vec_ind][number]; } @@ -333,8 +339,8 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_ const unsigned int sixteenthPoints = num_points / 16; const float* aPtr = (float*)in_common; - const float* bPtr[ num_a_vectors]; - for( vec_ind = 0; vec_ind < num_a_vectors; ++vec_ind ) + const float* bPtr[num_a_vectors]; + for (vec_ind = 0; vec_ind < num_a_vectors; ++vec_ind) { bPtr[vec_ind] = in_a[vec_ind]; } @@ -352,7 +358,7 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_ __m256 dotProdVal2[num_a_vectors]; __m256 dotProdVal3[num_a_vectors]; - for( vec_ind = 0; vec_ind < num_a_vectors; vec_ind++ ) + for (vec_ind = 0; vec_ind < 
num_a_vectors; vec_ind++) { dotProdVal0[vec_ind] = _mm256_setzero_ps(); dotProdVal1[vec_ind] = _mm256_setzero_ps(); @@ -362,58 +368,63 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_ // Set up the complex rotator __m256 z0, z1, z2, z3; - __VOLK_ATTR_ALIGNED(32) lv_32fc_t phase_vec[16]; - for( vec_ind = 0; vec_ind < 16; ++vec_ind ) + __VOLK_ATTR_ALIGNED(32) + lv_32fc_t phase_vec[16]; + for (vec_ind = 0; vec_ind < 16; ++vec_ind) { phase_vec[vec_ind] = _phase; _phase *= phase_inc; } - z0 = _mm256_load_ps( (float *)phase_vec ); - z1 = _mm256_load_ps( (float *)(phase_vec + 4) ); - z2 = _mm256_load_ps( (float *)(phase_vec + 8) ); - z3 = _mm256_load_ps( (float *)(phase_vec + 12) ); + z0 = _mm256_load_ps((float*)phase_vec); + z1 = _mm256_load_ps((float*)(phase_vec + 4)); + z2 = _mm256_load_ps((float*)(phase_vec + 8)); + z3 = _mm256_load_ps((float*)(phase_vec + 12)); - lv_32fc_t dz = phase_inc; dz *= dz; dz *= dz; dz *= dz; dz *= dz; // dz = phase_inc^16; + lv_32fc_t dz = phase_inc; + dz *= dz; + dz *= dz; + dz *= dz; + dz *= dz; // dz = phase_inc^16; - for( vec_ind = 0; vec_ind < 4; ++vec_ind ) + for (vec_ind = 0; vec_ind < 4; ++vec_ind) { phase_vec[vec_ind] = dz; } - __m256 dz_reg = _mm256_load_ps( (float *)phase_vec ); - dz_reg = _mm256_complexnormalise_ps( dz_reg ); + __m256 dz_reg = _mm256_load_ps((float*)phase_vec); + dz_reg = _mm256_complexnormalise_ps(dz_reg); - for(;number < sixteenthPoints; number++) + for (; number < sixteenthPoints; number++) { - a0Val = _mm256_load_ps(aPtr); - a1Val = _mm256_load_ps(aPtr+8); - a2Val = _mm256_load_ps(aPtr+16); - a3Val = _mm256_load_ps(aPtr+24); + a0Val = _mm256_load_ps(aPtr); + a1Val = _mm256_load_ps(aPtr + 8); + a2Val = _mm256_load_ps(aPtr + 16); + a3Val = _mm256_load_ps(aPtr + 24); - a0Val = _mm256_complexmul_ps( a0Val, z0 ); - a1Val = _mm256_complexmul_ps( a1Val, z1 ); - a2Val = _mm256_complexmul_ps( a2Val, z2 ); - a3Val = _mm256_complexmul_ps( a3Val, z3 ); + a0Val = _mm256_complexmul_ps(a0Val, z0); + a1Val = _mm256_complexmul_ps(a1Val, z1); + a2Val = _mm256_complexmul_ps(a2Val, z2); + a3Val = _mm256_complexmul_ps(a3Val, z3); - z0 = _mm256_complexmul_ps( z0, dz_reg ); - z1 = _mm256_complexmul_ps( z1, dz_reg ); - z2 = _mm256_complexmul_ps( z2, dz_reg ); - z3 = _mm256_complexmul_ps( z3, dz_reg ); + z0 = _mm256_complexmul_ps(z0, dz_reg); + z1 = _mm256_complexmul_ps(z1, dz_reg); + z2 = _mm256_complexmul_ps(z2, dz_reg); + z3 = _mm256_complexmul_ps(z3, dz_reg); - for( vec_ind = 0; vec_ind < num_a_vectors; ++vec_ind ) + for (vec_ind = 0; vec_ind < num_a_vectors; ++vec_ind) { - x0Val[vec_ind] = _mm256_loadu_ps(bPtr[vec_ind]); // t0|t1|t2|t3|t4|t5|t6|t7 - x1Val[vec_ind] = _mm256_loadu_ps(bPtr[vec_ind]+8); - x0loVal[vec_ind] = _mm256_unpacklo_ps(x0Val[vec_ind], x0Val[vec_ind]); // t0|t0|t1|t1|t4|t4|t5|t5 - x0hiVal[vec_ind] = _mm256_unpackhi_ps(x0Val[vec_ind], x0Val[vec_ind]); // t2|t2|t3|t3|t6|t6|t7|t7 + x0Val[vec_ind] = _mm256_loadu_ps(bPtr[vec_ind]); // t0|t1|t2|t3|t4|t5|t6|t7 + x1Val[vec_ind] = _mm256_loadu_ps(bPtr[vec_ind] + 8); + x0loVal[vec_ind] = _mm256_unpacklo_ps(x0Val[vec_ind], x0Val[vec_ind]); // t0|t0|t1|t1|t4|t4|t5|t5 + x0hiVal[vec_ind] = _mm256_unpackhi_ps(x0Val[vec_ind], x0Val[vec_ind]); // t2|t2|t3|t3|t6|t6|t7|t7 x1loVal[vec_ind] = _mm256_unpacklo_ps(x1Val[vec_ind], x1Val[vec_ind]); x1hiVal[vec_ind] = _mm256_unpackhi_ps(x1Val[vec_ind], x1Val[vec_ind]); // TODO: it may be possible to rearrange swizzling to better pipeline data - b0Val[vec_ind] = _mm256_permute2f128_ps(x0loVal[vec_ind], x0hiVal[vec_ind], 0x20); // t0|t0|t1|t1|t2|t2|t3|t3 -
b1Val[vec_ind] = _mm256_permute2f128_ps(x0loVal[vec_ind], x0hiVal[vec_ind], 0x31); // t4|t4|t5|t5|t6|t6|t7|t7 + b0Val[vec_ind] = _mm256_permute2f128_ps(x0loVal[vec_ind], x0hiVal[vec_ind], 0x20); // t0|t0|t1|t1|t2|t2|t3|t3 + b1Val[vec_ind] = _mm256_permute2f128_ps(x0loVal[vec_ind], x0hiVal[vec_ind], 0x31); // t4|t4|t5|t5|t6|t6|t7|t7 b2Val[vec_ind] = _mm256_permute2f128_ps(x1loVal[vec_ind], x1hiVal[vec_ind], 0x20); b3Val[vec_ind] = _mm256_permute2f128_ps(x1loVal[vec_ind], x1hiVal[vec_ind], 0x31); @@ -433,43 +443,44 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_ // Force the rotators back onto the unit circle if ((number % 64) == 0) { - z0 = _mm256_complexnormalise_ps( z0 ); - z1 = _mm256_complexnormalise_ps( z1 ); - z2 = _mm256_complexnormalise_ps( z2 ); - z3 = _mm256_complexnormalise_ps( z3 ); + z0 = _mm256_complexnormalise_ps(z0); + z1 = _mm256_complexnormalise_ps(z1); + z2 = _mm256_complexnormalise_ps(z2); + z3 = _mm256_complexnormalise_ps(z3); } aPtr += 32; } - __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4]; + __VOLK_ATTR_ALIGNED(32) + lv_32fc_t dotProductVector[4]; - for( vec_ind = 0; vec_ind < num_a_vectors; ++vec_ind ) + for (vec_ind = 0; vec_ind < num_a_vectors; ++vec_ind) { dotProdVal0[vec_ind] = _mm256_add_ps(dotProdVal0[vec_ind], dotProdVal1[vec_ind]); dotProdVal0[vec_ind] = _mm256_add_ps(dotProdVal0[vec_ind], dotProdVal2[vec_ind]); dotProdVal0[vec_ind] = _mm256_add_ps(dotProdVal0[vec_ind], dotProdVal3[vec_ind]); - _mm256_store_ps((float *)dotProductVector, dotProdVal0[vec_ind]); // Store the results back into the dot product vector + _mm256_store_ps((float*)dotProductVector, dotProdVal0[vec_ind]); // Store the results back into the dot product vector - result[ vec_ind ] = lv_cmake( 0, 0 ); - for( i = 0; i < 4; ++i ) + result[vec_ind] = lv_cmake(0, 0); + for (i = 0; i < 4; ++i) { result[vec_ind] += dotProductVector[i]; } } - z0 = _mm256_complexnormalise_ps( z0 ); + z0 = _mm256_complexnormalise_ps(z0); _mm256_store_ps((float*)phase_vec, z0); _phase = phase_vec[0]; _mm256_zeroupper(); - number = sixteenthPoints*16; - for(;number < num_points; number++) + number = sixteenthPoints * 16; + for (; number < num_points; number++) { - wo = (*aPtr++)*_phase; + wo = (*aPtr++) * _phase; _phase *= phase_inc; - for( vec_ind = 0; vec_ind < num_a_vectors; ++vec_ind ) + for (vec_ind = 0; vec_ind < num_a_vectors; ++vec_ind) { result[vec_ind] += wo * in_a[vec_ind][number]; } @@ -482,5 +493,3 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_ #endif /* LV_HAVE_AVX */ #endif /* INCLUDED_volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_H */ - - diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc.h index ca684e30b..0804dd651 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc.h @@ -42,7 +42,7 @@ #ifdef LV_HAVE_GENERIC -static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_generic(lv_32fc_t* result, const lv_32fc_t* local_code, const float* in, unsigned int num_points) +static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_generic(lv_32fc_t* result, const lv_32fc_t* 
local_code, const float* in, unsigned int num_points) { // phases must be normalized. Phase rotator expects a complex exponential input! float rem_carrier_phase_in_rad = 0.25; @@ -53,15 +53,15 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_generic(lv phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad)); unsigned int n; int num_a_vectors = 3; - float ** in_a = (float **)volk_gnsssdr_malloc(sizeof(float *) * num_a_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_a_vectors; n++) + float** in_a = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_a_vectors, volk_gnsssdr_get_alignment()); + for (n = 0; n < num_a_vectors; n++) { - in_a[n] = (float *)volk_gnsssdr_malloc(sizeof(float ) * num_points, volk_gnsssdr_get_alignment()); + in_a[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment()); memcpy((float*)in_a[n], (float*)in, sizeof(float) * num_points); } - volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic(result, local_code, phase_inc[0], phase, (const float**) in_a, num_a_vectors, num_points); + volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic(result, local_code, phase_inc[0], phase, (const float**)in_a, num_a_vectors, num_points); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { volk_gnsssdr_free(in_a[n]); } @@ -71,7 +71,7 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_generic(lv #ifdef LV_HAVE_GENERIC -static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_generic_reload(lv_32fc_t* result, const lv_32fc_t* local_code, const float* in, unsigned int num_points) +static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_generic_reload(lv_32fc_t* result, const lv_32fc_t* local_code, const float* in, unsigned int num_points) { // phases must be normalized. Phase rotator expects a complex exponential input! 
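/* Note on the comment above: the rotator kernels multiply an accumulated phasor by phase_inc once per sample, so any deviation of |phase_inc| from 1 compounds exponentially; that is also why the kernels renormalize the running phase periodically (every 256 samples in the generic versions, every 64 sixteen-sample iterations in the AVX versions). A minimal sketch of building and renormalizing a unit-modulus rotator (helper names are mine; cosf, sinf and hypotf come from <math.h>): */
static inline lv_32fc_t make_unit_rotator(float phase_step_rad)
{
    return lv_cmake(cosf(phase_step_rad), sinf(phase_step_rad)); /* |result| == 1 up to rounding */
}
static inline lv_32fc_t renormalize_phase(lv_32fc_t p)
{
    return p / hypotf(lv_creal(p), lv_cimag(p)); /* force the modulus back to 1 */
}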
float rem_carrier_phase_in_rad = 0.25; @@ -82,15 +82,15 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_generic_re phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad)); unsigned int n; int num_a_vectors = 3; - float ** in_a = (float **)volk_gnsssdr_malloc(sizeof(float *) * num_a_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_a_vectors; n++) + float** in_a = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_a_vectors, volk_gnsssdr_get_alignment()); + for (n = 0; n < num_a_vectors; n++) { - in_a[n] = (float *)volk_gnsssdr_malloc(sizeof(float ) * num_points, volk_gnsssdr_get_alignment()); + in_a[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment()); memcpy((float*)in_a[n], (float*)in, sizeof(float) * num_points); } - volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic_reload(result, local_code, phase_inc[0], phase, (const float**) in_a, num_a_vectors, num_points); + volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic_reload(result, local_code, phase_inc[0], phase, (const float**)in_a, num_a_vectors, num_points); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { volk_gnsssdr_free(in_a[n]); } @@ -100,7 +100,7 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_generic_re #endif // Generic #ifdef LV_HAVE_AVX -static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_u_avx(lv_32fc_t* result, const lv_32fc_t* local_code, const float* in, unsigned int num_points) +static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_u_avx(lv_32fc_t* result, const lv_32fc_t* local_code, const float* in, unsigned int num_points) { // phases must be normalized. Phase rotator expects a complex exponential input! float rem_carrier_phase_in_rad = 0.25; @@ -111,15 +111,15 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_u_avx(lv_3 phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad)); unsigned int n; int num_a_vectors = 3; - float ** in_a = (float **)volk_gnsssdr_malloc(sizeof(float *) * num_a_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_a_vectors; n++) + float** in_a = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_a_vectors, volk_gnsssdr_get_alignment()); + for (n = 0; n < num_a_vectors; n++) { - in_a[n] = (float *)volk_gnsssdr_malloc(sizeof(float ) * num_points, volk_gnsssdr_get_alignment()); + in_a[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment()); memcpy((float*)in_a[n], (float*)in, sizeof(float) * num_points); } - volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_u_avx(result, local_code, phase_inc[0], phase, (const float**) in_a, num_a_vectors, num_points); + volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_u_avx(result, local_code, phase_inc[0], phase, (const float**)in_a, num_a_vectors, num_points); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { volk_gnsssdr_free(in_a[n]); } @@ -130,7 +130,7 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_u_avx(lv_3 #ifdef LV_HAVE_AVX -static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_a_avx(lv_32fc_t* result, const lv_32fc_t* local_code, const float* in, unsigned int num_points) +static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_a_avx(lv_32fc_t* result, const lv_32fc_t* local_code, const float* in, unsigned int num_points) { // phases must be normalized. Phase rotator expects a complex exponential input! 
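Every puppet wrapper in these hunks repeats the same allocate–replicate–free pattern so the QA harness can drive a multi-vector kernel through a single-input signature. A condensed sketch of that pattern, assuming the real volk_gnsssdr allocation API (volk_gnsssdr_malloc, volk_gnsssdr_get_alignment, volk_gnsssdr_free, declared by the library's headers) and a placeholder where the kernel under test would be invoked:

#include <string.h>

static void puppet_pattern_sketch(const float* in, unsigned int num_points)
{
    const unsigned int num_a_vectors = 3; /* three replicas, as in the wrappers above */
    unsigned int n;
    /* Aligned allocation so the _a_ (aligned) kernel variants may use aligned loads. */
    float** in_a = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_a_vectors, volk_gnsssdr_get_alignment());
    for (n = 0; n < num_a_vectors; n++)
        {
            in_a[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment());
            memcpy(in_a[n], in, sizeof(float) * num_points); /* replicate the input */
        }
    /* ... invoke the kernel under test on (const float**)in_a here ... */
    for (n = 0; n < num_a_vectors; n++)
        {
            volk_gnsssdr_free(in_a[n]); /* free each buffer, then the pointer table */
        }
    volk_gnsssdr_free(in_a);
}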
float rem_carrier_phase_in_rad = 0.25; @@ -141,15 +141,15 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_a_avx(lv_3 phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad)); unsigned int n; int num_a_vectors = 3; - float ** in_a = (float **)volk_gnsssdr_malloc(sizeof(float *) * num_a_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_a_vectors; n++) + float** in_a = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_a_vectors, volk_gnsssdr_get_alignment()); + for (n = 0; n < num_a_vectors; n++) { - in_a[n] = (float *)volk_gnsssdr_malloc(sizeof(float ) * num_points, volk_gnsssdr_get_alignment()); + in_a[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment()); memcpy((float*)in_a[n], (float*)in, sizeof(float) * num_points); } - volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_a_avx(result, local_code, phase_inc[0], phase, (const float**) in_a, num_a_vectors, num_points); + volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_a_avx(result, local_code, phase_inc[0], phase, (const float**)in_a, num_a_vectors, num_points); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { volk_gnsssdr_free(in_a[n]); } @@ -159,4 +159,3 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_a_avx(lv_3 #endif // AVX #endif // INCLUDED_volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_H - diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_16ic.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_16ic.h index b04a93c4b..892a7c0e8 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_16ic.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_16ic.h @@ -80,10 +80,12 @@ static inline void volk_gnsssdr_32fc_convert_16ic_u_sse2(lv_16sc_t* outputVector const __m128 vmin_val = _mm_set_ps1(min_val); const __m128 vmax_val = _mm_set_ps1(max_val); - for(i = 0; i < sse_iters; i++) + for (i = 0; i < sse_iters; i++) { - inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; - inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; + inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); + inputVectorPtr += 4; + inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); + inputVectorPtr += 4; __VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 8); // Clip @@ -99,12 +101,12 @@ static inline void volk_gnsssdr_32fc_convert_16ic_u_sse2(lv_16sc_t* outputVector outputVectorPtr += 8; } - for(i = sse_iters * 8; i < num_points * 2; i++) + for (i = sse_iters * 8; i < num_points * 2; i++) { aux = *inputVectorPtr++; - if(aux > max_val) + if (aux > max_val) aux = max_val; - else if(aux < min_val) + else if (aux < min_val) aux = min_val; *outputVectorPtr++ = (int16_t)rintf(aux); } @@ -128,15 +130,17 @@ static inline void volk_gnsssdr_32fc_convert_16ic_u_sse(lv_16sc_t* outputVector, const float max_val = (float)SHRT_MAX; __m128 inputVal1, inputVal2; - __m128i intInputVal1, intInputVal2; // is __m128i defined in xmmintrin.h? + __m128i intInputVal1, intInputVal2; // is __m128i defined in xmmintrin.h? 
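(Answering the question left in the comment above: __m128i is declared in emmintrin.h, i.e. with SSE2, not in xmmintrin.h.) All of the SIMD converters in this file must reproduce the generic kernel's tail behaviour: saturate to the int16 range, then round to nearest with rintf(). A scalar sketch of that reference step:

#include <limits.h>
#include <math.h>
#include <stdint.h>

static int16_t clip_and_round_16(float x)
{
    const float max_val = (float)SHRT_MAX;
    const float min_val = (float)SHRT_MIN;
    if (x > max_val)
        x = max_val; /* saturate instead of wrapping on overflow */
    else if (x < min_val)
        x = min_val;
    return (int16_t)rintf(x); /* round to nearest in the current rounding mode */
}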
__m128 ret1, ret2; const __m128 vmin_val = _mm_set_ps1(min_val); const __m128 vmax_val = _mm_set_ps1(max_val); - for(i = 0;i < sse_iters; i++) + for (i = 0; i < sse_iters; i++) { - inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; - inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; + inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); + inputVectorPtr += 4; + inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); + inputVectorPtr += 4; __VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 8); // Clip @@ -152,12 +156,12 @@ static inline void volk_gnsssdr_32fc_convert_16ic_u_sse(lv_16sc_t* outputVector, outputVectorPtr += 8; } - for(i = sse_iters * 8; i < num_points*2; i++) + for (i = sse_iters * 8; i < num_points * 2; i++) { aux = *inputVectorPtr++; - if(aux > max_val) + if (aux > max_val) aux = max_val; - else if(aux < min_val) + else if (aux < min_val) aux = min_val; *outputVectorPtr++ = (int16_t)rintf(aux); } @@ -175,7 +179,7 @@ static inline void volk_gnsssdr_32fc_convert_16ic_u_avx2(lv_16sc_t* outputVector int16_t* outputVectorPtr = (int16_t*)outputVector; float aux; unsigned int i; - const float min_val = (float)SHRT_MIN; ///todo Something off here, compiler does not perform right cast + const float min_val = (float)SHRT_MIN; ///todo Something off here, compiler does not perform right cast const float max_val = (float)SHRT_MAX; __m256 inputVal1, inputVal2; @@ -184,10 +188,12 @@ static inline void volk_gnsssdr_32fc_convert_16ic_u_avx2(lv_16sc_t* outputVector const __m256 vmin_val = _mm256_set1_ps(min_val); const __m256 vmax_val = _mm256_set1_ps(max_val); - for(i = 0; i < avx2_iters; i++) + for (i = 0; i < avx2_iters; i++) { - inputVal1 = _mm256_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 8; - inputVal2 = _mm256_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 8; + inputVal1 = _mm256_loadu_ps((float*)inputVectorPtr); + inputVectorPtr += 8; + inputVal2 = _mm256_loadu_ps((float*)inputVectorPtr); + inputVectorPtr += 8; __VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 16); // Clip @@ -204,12 +210,12 @@ static inline void volk_gnsssdr_32fc_convert_16ic_u_avx2(lv_16sc_t* outputVector outputVectorPtr += 16; } - for(i = avx2_iters * 16; i < num_points * 2; i++) + for (i = avx2_iters * 16; i < num_points * 2; i++) { aux = *inputVectorPtr++; - if(aux > max_val) + if (aux > max_val) aux = max_val; - else if(aux < min_val) + else if (aux < min_val) aux = min_val; *outputVectorPtr++ = (int16_t)rintf(aux); } @@ -238,10 +244,12 @@ static inline void volk_gnsssdr_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector const __m128 vmin_val = _mm_set_ps1(min_val); const __m128 vmax_val = _mm_set_ps1(max_val); - for(i = 0; i < sse_iters; i++) + for (i = 0; i < sse_iters; i++) { - inputVal1 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; - inputVal2 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; + inputVal1 = _mm_load_ps((float*)inputVectorPtr); + inputVectorPtr += 4; + inputVal2 = _mm_load_ps((float*)inputVectorPtr); + inputVectorPtr += 4; __VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 8); // Clip @@ -257,12 +265,12 @@ static inline void volk_gnsssdr_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector outputVectorPtr += 8; } - for(i = sse_iters * 8; i < num_points * 2; i++) + for (i = sse_iters * 8; i < num_points * 2; i++) { aux = *inputVectorPtr++; - if(aux > max_val) + if (aux > max_val) aux = max_val; - else if(aux < min_val) + else if (aux < min_val) aux = min_val; *outputVectorPtr++ = (int16_t)rintf(aux); } @@ -289,10 +297,12 @@ static inline void 
volk_gnsssdr_32fc_convert_16ic_a_sse(lv_16sc_t* outputVector, const __m128 vmin_val = _mm_set_ps1(min_val); const __m128 vmax_val = _mm_set_ps1(max_val); - for(i = 0; i < sse_iters; i++) + for (i = 0; i < sse_iters; i++) { - inputVal1 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; - inputVal2 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; + inputVal1 = _mm_load_ps((float*)inputVectorPtr); + inputVectorPtr += 4; + inputVal2 = _mm_load_ps((float*)inputVectorPtr); + inputVectorPtr += 4; __VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 8); // Clip @@ -308,12 +318,12 @@ static inline void volk_gnsssdr_32fc_convert_16ic_a_sse(lv_16sc_t* outputVector, outputVectorPtr += 8; } - for(i = sse_iters * 8; i < num_points * 2; i++) + for (i = sse_iters * 8; i < num_points * 2; i++) { aux = *inputVectorPtr++; - if(aux > max_val) + if (aux > max_val) aux = max_val; - else if(aux < min_val) + else if (aux < min_val) aux = min_val; *outputVectorPtr++ = (int16_t)rintf(aux); } @@ -332,7 +342,7 @@ static inline void volk_gnsssdr_32fc_convert_16ic_a_avx2(lv_16sc_t* outputVector int16_t* outputVectorPtr = (int16_t*)outputVector; float aux; unsigned int i; - const float min_val = (float)SHRT_MIN; ///todo Something off here, compiler does not perform right cast + const float min_val = (float)SHRT_MIN; ///todo Something off here, compiler does not perform right cast const float max_val = (float)SHRT_MAX; __m256 inputVal1, inputVal2; @@ -341,10 +351,12 @@ static inline void volk_gnsssdr_32fc_convert_16ic_a_avx2(lv_16sc_t* outputVector const __m256 vmin_val = _mm256_set1_ps(min_val); const __m256 vmax_val = _mm256_set1_ps(max_val); - for(i = 0; i < avx2_iters; i++) + for (i = 0; i < avx2_iters; i++) { - inputVal1 = _mm256_load_ps((float*)inputVectorPtr); inputVectorPtr += 8; - inputVal2 = _mm256_load_ps((float*)inputVectorPtr); inputVectorPtr += 8; + inputVal1 = _mm256_load_ps((float*)inputVectorPtr); + inputVectorPtr += 8; + inputVal2 = _mm256_load_ps((float*)inputVectorPtr); + inputVectorPtr += 8; __VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 16); // Clip @@ -361,12 +373,12 @@ static inline void volk_gnsssdr_32fc_convert_16ic_a_avx2(lv_16sc_t* outputVector outputVectorPtr += 16; } - for(i = avx2_iters * 16; i < num_points * 2; i++) + for (i = avx2_iters * 16; i < num_points * 2; i++) { aux = *inputVectorPtr++; - if(aux > max_val) + if (aux > max_val) aux = max_val; - else if(aux < min_val) + else if (aux < min_val) aux = min_val; *outputVectorPtr++ = (int16_t)rintf(aux); } @@ -397,10 +409,12 @@ static inline void volk_gnsssdr_32fc_convert_16ic_neon(lv_16sc_t* outputVector, int16x4_t intInputVal1, intInputVal2; int16x8_t res; - for(i = 0; i < neon_iters; i++) + for (i = 0; i < neon_iters; i++) { - a = vld1q_f32((const float32_t*)(inputVectorPtr)); inputVectorPtr += 4; - b = vld1q_f32((const float32_t*)(inputVectorPtr)); inputVectorPtr += 4; + a = vld1q_f32((const float32_t*)(inputVectorPtr)); + inputVectorPtr += 4; + b = vld1q_f32((const float32_t*)(inputVectorPtr)); + inputVectorPtr += 4; __VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 8); ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val); @@ -425,12 +439,12 @@ static inline void volk_gnsssdr_32fc_convert_16ic_neon(lv_16sc_t* outputVector, outputVectorPtr += 8; } - for(i = neon_iters * 8; i < num_points * 2; i++) + for (i = neon_iters * 8; i < num_points * 2; i++) { aux = *inputVectorPtr++; - if(aux > max_val_f) + if (aux > max_val_f) aux = max_val_f; - else if(aux < min_val_f) + else if (aux < min_val_f) aux = min_val_f; *outputVectorPtr++ = 
(int16_t)rintf(aux); } @@ -449,14 +463,14 @@ static inline void volk_gnsssdr_32fc_convert_16ic_generic(lv_16sc_t* outputVecto const float max_val = (float)SHRT_MAX; float aux; unsigned int i; - for(i = 0; i < num_points * 2; i++) + for (i = 0; i < num_points * 2; i++) { aux = *inputVectorPtr++; - if(aux > max_val) + if (aux > max_val) aux = max_val; - else if(aux < min_val) + else if (aux < min_val) aux = min_val; - *outputVectorPtr++ = (int16_t)rintf(aux); + *outputVectorPtr++ = (int16_t)rintf(aux); } } #endif /* LV_HAVE_GENERIC */ diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_8ic.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_8ic.h old mode 100755 new mode 100644 index ca5f13f22..ab8d32e32 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_8ic.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_8ic.h @@ -72,12 +72,12 @@ static inline void volk_gnsssdr_32fc_convert_8ic_generic(lv_8sc_t* outputVector, const float max_val = (float)SCHAR_MAX; float aux; unsigned int i; - for(i = 0; i < num_points * 2; i++) + for (i = 0; i < num_points * 2; i++) { aux = *inputVectorPtr++ * max_val; - if(aux > max_val) + if (aux > max_val) aux = max_val; - else if(aux < min_val) + else if (aux < min_val) aux = min_val; *outputVectorPtr++ = (int8_t)rintf(aux); } @@ -107,12 +107,16 @@ static inline void volk_gnsssdr_32fc_convert_8ic_u_avx2(lv_8sc_t* outputVector, const __m256 vmin_val = _mm256_set1_ps(min_val); const __m256 vmax_val = _mm256_set1_ps(max_val); - for(i = 0; i < avx2_iters; i++) + for (i = 0; i < avx2_iters; i++) { - inputVal1 = _mm256_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 8; - inputVal2 = _mm256_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 8; - inputVal3 = _mm256_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 8; - inputVal4 = _mm256_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 8; + inputVal1 = _mm256_loadu_ps((float*)inputVectorPtr); + inputVectorPtr += 8; + inputVal2 = _mm256_loadu_ps((float*)inputVectorPtr); + inputVectorPtr += 8; + inputVal3 = _mm256_loadu_ps((float*)inputVectorPtr); + inputVectorPtr += 8; + inputVal4 = _mm256_loadu_ps((float*)inputVectorPtr); + inputVectorPtr += 8; __VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 32); inputVal1 = _mm256_mul_ps(inputVal1, vmax_val); @@ -142,12 +146,12 @@ static inline void volk_gnsssdr_32fc_convert_8ic_u_avx2(lv_8sc_t* outputVector, outputVectorPtr += 32; } - for(i = avx2_iters * 32; i < num_points * 2; i++) + for (i = avx2_iters * 32; i < num_points * 2; i++) { aux = *inputVectorPtr++ * max_val; - if(aux > max_val) + if (aux > max_val) aux = max_val; - else if(aux < min_val) + else if (aux < min_val) aux = min_val; *outputVectorPtr++ = (int8_t)rintf(aux); } @@ -177,12 +181,16 @@ static inline void volk_gnsssdr_32fc_convert_8ic_a_avx2(lv_8sc_t* outputVector, const __m256 vmin_val = _mm256_set1_ps(min_val); const __m256 vmax_val = _mm256_set1_ps(max_val); - for(i = 0; i < avx2_iters; i++) + for (i = 0; i < avx2_iters; i++) { - inputVal1 = _mm256_load_ps((float*)inputVectorPtr); inputVectorPtr += 8; - inputVal2 = _mm256_load_ps((float*)inputVectorPtr); inputVectorPtr += 8; - inputVal3 = _mm256_load_ps((float*)inputVectorPtr); inputVectorPtr += 8; - inputVal4 = _mm256_load_ps((float*)inputVectorPtr); inputVectorPtr += 8; + inputVal1 = _mm256_load_ps((float*)inputVectorPtr); + inputVectorPtr 
+= 8; + inputVal2 = _mm256_load_ps((float*)inputVectorPtr); + inputVectorPtr += 8; + inputVal3 = _mm256_load_ps((float*)inputVectorPtr); + inputVectorPtr += 8; + inputVal4 = _mm256_load_ps((float*)inputVectorPtr); + inputVectorPtr += 8; __VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 32); inputVal1 = _mm256_mul_ps(inputVal1, vmax_val); @@ -212,12 +220,12 @@ static inline void volk_gnsssdr_32fc_convert_8ic_a_avx2(lv_8sc_t* outputVector, outputVectorPtr += 32; } - for(i = avx2_iters * 32; i < num_points * 2; i++) + for (i = avx2_iters * 32; i < num_points * 2; i++) { aux = *inputVectorPtr++ * max_val; - if(aux > max_val) + if (aux > max_val) aux = max_val; - else if(aux < min_val) + else if (aux < min_val) aux = min_val; *outputVectorPtr++ = (int8_t)rintf(aux); } @@ -247,12 +255,16 @@ static inline void volk_gnsssdr_32fc_convert_8ic_u_sse2(lv_8sc_t* outputVector, const __m128 vmin_val = _mm_set_ps1(min_val); const __m128 vmax_val = _mm_set_ps1(max_val); - for(i = 0; i < sse_iters; i++) + for (i = 0; i < sse_iters; i++) { - inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; - inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; - inputVal3 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; - inputVal4 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; + inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); + inputVectorPtr += 4; + inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); + inputVectorPtr += 4; + inputVal3 = _mm_loadu_ps((float*)inputVectorPtr); + inputVectorPtr += 4; + inputVal4 = _mm_loadu_ps((float*)inputVectorPtr); + inputVectorPtr += 4; inputVal1 = _mm_mul_ps(inputVal1, vmax_val); inputVal2 = _mm_mul_ps(inputVal2, vmax_val); @@ -278,12 +290,12 @@ static inline void volk_gnsssdr_32fc_convert_8ic_u_sse2(lv_8sc_t* outputVector, outputVectorPtr += 16; } - for(i = sse_iters * 16; i < num_points * 2; i++) + for (i = sse_iters * 16; i < num_points * 2; i++) { aux = *inputVectorPtr++ * max_val; - if(aux > max_val) + if (aux > max_val) aux = max_val; - else if(aux < min_val) + else if (aux < min_val) aux = min_val; *outputVectorPtr++ = (int8_t)rintf(aux); } @@ -313,12 +325,16 @@ static inline void volk_gnsssdr_32fc_convert_8ic_a_sse2(lv_8sc_t* outputVector, const __m128 vmin_val = _mm_set_ps1(min_val); const __m128 vmax_val = _mm_set_ps1(max_val); - for(i = 0; i < sse_iters; i++) + for (i = 0; i < sse_iters; i++) { - inputVal1 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; - inputVal2 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; - inputVal3 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; - inputVal4 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; + inputVal1 = _mm_load_ps((float*)inputVectorPtr); + inputVectorPtr += 4; + inputVal2 = _mm_load_ps((float*)inputVectorPtr); + inputVectorPtr += 4; + inputVal3 = _mm_load_ps((float*)inputVectorPtr); + inputVectorPtr += 4; + inputVal4 = _mm_load_ps((float*)inputVectorPtr); + inputVectorPtr += 4; inputVal1 = _mm_mul_ps(inputVal1, vmax_val); inputVal2 = _mm_mul_ps(inputVal2, vmax_val); @@ -344,12 +360,12 @@ static inline void volk_gnsssdr_32fc_convert_8ic_a_sse2(lv_8sc_t* outputVector, outputVectorPtr += 16; } - for(i = sse_iters * 16; i < num_points * 2; i++) + for (i = sse_iters * 16; i < num_points * 2; i++) { aux = *inputVectorPtr++ * max_val; - if(aux > max_val) + if (aux > max_val) aux = max_val; - else if(aux < min_val) + else if (aux < min_val) aux = min_val; *outputVectorPtr++ = (int8_t)rintf(aux); } @@ -383,9 +399,10 @@ static inline void 
volk_gnsssdr_32fc_convert_8ic_neon(lv_8sc_t* outputVector, co int8x8_t res8_1, res8_2; int8x16_t outputVal; - for(i = 0; i < neon_iters; i++) + for (i = 0; i < neon_iters; i++) { - a = vld1q_f32((const float32_t*)inputVectorPtr); inputVectorPtr += 4; + a = vld1q_f32((const float32_t*)inputVectorPtr); + inputVectorPtr += 4; a = vmulq_f32(a, max_val); ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val); sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(ret1), 31))); @@ -394,7 +411,8 @@ static inline void volk_gnsssdr_32fc_convert_8ic_neon(lv_8sc_t* outputVector, co toint_a = vcvtq_s32_f32(Round); intInputVal1 = vqmovn_s32(toint_a); - a = vld1q_f32((const float32_t*)inputVectorPtr); inputVectorPtr += 4; + a = vld1q_f32((const float32_t*)inputVectorPtr); + inputVectorPtr += 4; a = vmulq_f32(a, max_val); ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val); sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(ret1), 31))); @@ -406,7 +424,8 @@ static inline void volk_gnsssdr_32fc_convert_8ic_neon(lv_8sc_t* outputVector, co pack16_8_1 = vcombine_s16(intInputVal1, intInputVal2); res8_1 = vqmovn_s16(pack16_8_1); - a = vld1q_f32((const float32_t*)inputVectorPtr); inputVectorPtr += 4; + a = vld1q_f32((const float32_t*)inputVectorPtr); + inputVectorPtr += 4; a = vmulq_f32(a, max_val); ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val); sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(ret1), 31))); @@ -415,7 +434,8 @@ static inline void volk_gnsssdr_32fc_convert_8ic_neon(lv_8sc_t* outputVector, co toint_a = vcvtq_s32_f32(Round); intInputVal1 = vqmovn_s32(toint_a); - a = vld1q_f32((const float32_t*)inputVectorPtr); inputVectorPtr += 4; + a = vld1q_f32((const float32_t*)inputVectorPtr); + inputVectorPtr += 4; a = vmulq_f32(a, max_val); ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val); sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(ret1), 31))); @@ -433,12 +453,12 @@ static inline void volk_gnsssdr_32fc_convert_8ic_neon(lv_8sc_t* outputVector, co outputVectorPtr += 16; } - for(i = neon_iters * 16; i < num_points * 2; i++) + for (i = neon_iters * 16; i < num_points * 2; i++) { aux = *inputVectorPtr++ * max_val_f; - if(aux > max_val_f) + if (aux > max_val_f) aux = max_val_f; - else if(aux < min_val_f) + else if (aux < min_val_f) aux = min_val_f; *outputVectorPtr++ = (int8_t)rintf(aux); } diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_resamplerxnpuppet_32fc.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_resamplerxnpuppet_32fc.h index 9348c09fc..1655b5ccd 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_resamplerxnpuppet_32fc.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_resamplerxnpuppet_32fc.h @@ -42,31 +42,30 @@ #include - #ifdef LV_HAVE_GENERIC static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_generic(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points) { int code_length_chips = 2046; - float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; - float shifts_chips[3] = { -0.1, 0.0, 0.1 }; + float shifts_chips[3] = {-0.1, 0.0, 0.1}; - lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, 
volk_gnsssdr_get_alignment()); - for(n = 0; n < num_out_vectors; n++) - { - result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment()); - } + lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); + for (n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment()); + } volk_gnsssdr_32fc_xn_resampler_32fc_xn_generic(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points); - for(n = 0; n < num_out_vectors; n++) - { - volk_gnsssdr_free(result_aux[n]); - } + for (n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } volk_gnsssdr_free(result_aux); } @@ -78,26 +77,26 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_generic(lv_32fc_t* r static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_a_sse3(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points) { int code_length_chips = 2046; - float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; - float shifts_chips[3] = { -0.1, 0.0, 0.1 }; + float shifts_chips[3] = {-0.1, 0.0, 0.1}; - lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_out_vectors; n++) - { - result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment()); - } + lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); + for (n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment()); + } volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_sse3(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points); - for(n = 0; n < num_out_vectors; n++) - { - volk_gnsssdr_free(result_aux[n]); - } + for (n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } volk_gnsssdr_free(result_aux); } @@ -107,26 +106,26 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_a_sse3(lv_32fc_t* re static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_u_sse3(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points) { int code_length_chips = 2046; - float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; - float shifts_chips[3] = { -0.1, 0.0, 0.1 }; + float shifts_chips[3] = {-0.1, 0.0, 0.1}; - lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_out_vectors; n++) - { - result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment()); - } + lv_32fc_t** result_aux = 
(lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); + for (n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment()); + } volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_sse3(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points); - for(n = 0; n < num_out_vectors; n++) - { - volk_gnsssdr_free(result_aux[n]); - } + for (n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } volk_gnsssdr_free(result_aux); } @@ -137,26 +136,26 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_u_sse3(lv_32fc_t* re static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_u_sse4_1(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points) { int code_length_chips = 2046; - float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; - float shifts_chips[3] = { -0.1, 0.0, 0.1 }; + float shifts_chips[3] = {-0.1, 0.0, 0.1}; - lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_out_vectors; n++) - { - result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment()); - } + lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); + for (n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment()); + } volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_sse4_1(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points); - for(n = 0; n < num_out_vectors; n++) - { - volk_gnsssdr_free(result_aux[n]); - } + for (n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } volk_gnsssdr_free(result_aux); } @@ -166,26 +165,26 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_u_sse4_1(lv_32fc_t* static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_a_sse4_1(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points) { int code_length_chips = 2046; - float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; - float shifts_chips[3] = { -0.1, 0.0, 0.1 }; + float shifts_chips[3] = {-0.1, 0.0, 0.1}; - lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_out_vectors; n++) - { - result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment()); - } + lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); + for (n = 0; n < num_out_vectors; n++) + { + result_aux[n] = 
(lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment()); + } volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_sse4_1(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points); - for(n = 0; n < num_out_vectors; n++) - { - volk_gnsssdr_free(result_aux[n]); - } + for (n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } volk_gnsssdr_free(result_aux); } @@ -195,26 +194,26 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_a_sse4_1(lv_32fc_t* static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_a_avx(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points) { int code_length_chips = 2046; - float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; - float shifts_chips[3] = { -0.1, 0.0, 0.1 }; + float shifts_chips[3] = {-0.1, 0.0, 0.1}; - lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_out_vectors; n++) - { - result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment()); - } + lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); + for (n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment()); + } volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points); - for(n = 0; n < num_out_vectors; n++) - { - volk_gnsssdr_free(result_aux[n]); - } + for (n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } volk_gnsssdr_free(result_aux); } #endif @@ -224,26 +223,26 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_a_avx(lv_32fc_t* res static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_u_avx(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points) { int code_length_chips = 2046; - float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; - float shifts_chips[3] = { -0.1, 0.0, 0.1 }; + float shifts_chips[3] = {-0.1, 0.0, 0.1}; - lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_out_vectors; n++) - { - result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment()); - } + lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); + for (n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment()); + } volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx(result_aux, local_code, rem_code_phase_chips, 
code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points); - for(n = 0; n < num_out_vectors; n++) - { - volk_gnsssdr_free(result_aux[n]); - } + for (n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } volk_gnsssdr_free(result_aux); } #endif @@ -253,26 +252,26 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_u_avx(lv_32fc_t* res static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_a_avx2(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points) { int code_length_chips = 2046; - float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; - float shifts_chips[3] = { -0.1, 0.0, 0.1 }; + float shifts_chips[3] = {-0.1, 0.0, 0.1}; - lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_out_vectors; n++) - { - result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment()); - } + lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); + for (n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment()); + } volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx2(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points); - for(n = 0; n < num_out_vectors; n++) - { - volk_gnsssdr_free(result_aux[n]); - } + for (n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } volk_gnsssdr_free(result_aux); } #endif @@ -282,26 +281,26 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_a_avx2(lv_32fc_t* re static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_u_avx2(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points) { int code_length_chips = 2046; - float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; - float shifts_chips[3] = { -0.1, 0.0, 0.1 }; + float shifts_chips[3] = {-0.1, 0.0, 0.1}; - lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_out_vectors; n++) - { - result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment()); - } + lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); + for (n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment()); + } volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx2(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points); - for(n = 0; n < 
num_out_vectors; n++) - { - volk_gnsssdr_free(result_aux[n]); - } + for (n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } volk_gnsssdr_free(result_aux); } #endif @@ -311,28 +310,28 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_u_avx2(lv_32fc_t* re static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_neon(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points) { int code_length_chips = 2046; - float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); + float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points); int num_out_vectors = 3; float rem_code_phase_chips = -0.234; unsigned int n; - float shifts_chips[3] = { -0.1, 0.0, 0.1 }; + float shifts_chips[3] = {-0.1, 0.0, 0.1}; - lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_out_vectors; n++) - { - result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment()); - } + lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); + for (n = 0; n < num_out_vectors; n++) + { + result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment()); + } volk_gnsssdr_32fc_xn_resampler_32fc_xn_neon(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points); - for(n = 0; n < num_out_vectors; n++) - { - volk_gnsssdr_free(result_aux[n]); - } + for (n = 0; n < num_out_vectors; n++) + { + volk_gnsssdr_free(result_aux[n]); + } volk_gnsssdr_free(result_aux); } #endif -#endif // INCLUDED_volk_gnsssdr_32fc_resamplerpuppet_32fc_H +#endif // INCLUDED_volk_gnsssdr_32fc_resamplerpuppet_32fc_H diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn.h index a25715749..c3c77233a 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn.h @@ -85,11 +85,11 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_generic(lv_32fc unsigned int n; for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - result[n_vec] = lv_cmake(0,0); + result[n_vec] = lv_cmake(0, 0); } for (n = 0; n < num_points; n++) { - tmp32_1 = *in_common++ * (*phase);//if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); + tmp32_1 = *in_common++ * (*phase); //if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); // Regenerate phase if (n % 256 == 0) @@ -126,7 +126,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_generic_reload( unsigned int j; for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - result[n_vec] = lv_cmake(0,0); + result[n_vec] = lv_cmake(0, 0); } for (n = 0; n < num_points / ROTATOR_RELOAD; n++) @@ -141,7 +141,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_generic_reload( result[n_vec] += tmp32_2; } } - /* Regenerate phase */ + 
/* Regenerate phase */ #ifdef __cplusplus (*phase) /= std::abs((*phase)); #else @@ -169,7 +169,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_generic_reload( #include static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_32fc_t** in_a, int num_a_vectors, unsigned int num_points) { - lv_32fc_t dotProduct = lv_cmake(0,0); + lv_32fc_t dotProduct = lv_cmake(0, 0); lv_32fc_t tmp32_1, tmp32_2; const unsigned int sse_iters = num_points / 2; int n_vec; @@ -179,7 +179,8 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(lv_32fc_ const lv_32fc_t** _in_a = in_a; const lv_32fc_t* _in_common = in_common; - __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2]; + __VOLK_ATTR_ALIGNED(16) + lv_32fc_t dotProductVector[2]; __m128* acc = (__m128*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128), volk_gnsssdr_get_alignment()); @@ -191,11 +192,13 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(lv_32fc_ // phase rotation registers __m128 a, two_phase_acc_reg, two_phase_inc_reg, yl, yh, tmp1, tmp1p, tmp2, tmp2p, z1; - __VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_inc[2]; + __VOLK_ATTR_ALIGNED(16) + lv_32fc_t two_phase_inc[2]; two_phase_inc[0] = phase_inc * phase_inc; two_phase_inc[1] = phase_inc * phase_inc; - two_phase_inc_reg = _mm_load_ps((float*) two_phase_inc); - __VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_acc[2]; + two_phase_inc_reg = _mm_load_ps((float*)two_phase_inc); + __VOLK_ATTR_ALIGNED(16) + lv_32fc_t two_phase_acc[2]; two_phase_acc[0] = (*phase); two_phase_acc[1] = (*phase) * phase_inc; two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc); @@ -203,12 +206,12 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(lv_32fc_ const __m128 ylp = _mm_moveldup_ps(two_phase_inc_reg); const __m128 yhp = _mm_movehdup_ps(two_phase_inc_reg); - for(number = 0; number < sse_iters; number++) + for (number = 0; number < sse_iters; number++) { // Phase rotation on operand in_common starts here: a = _mm_loadu_ps((float*)_in_common); - // __VOLK_GNSSSDR_PREFETCH(_in_common + 4); - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + // __VOLK_GNSSSDR_PREFETCH(_in_common + 4); + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(two_phase_acc_reg); tmp1 = _mm_mul_ps(a, yl); tmp1p = _mm_mul_ps(two_phase_acc_reg, ylp); @@ -219,7 +222,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(lv_32fc_ z1 = _mm_addsub_ps(tmp1, tmp2); two_phase_acc_reg = _mm_addsub_ps(tmp1p, tmp2p); - yl = _mm_moveldup_ps(z1); // Load yl with cr,cr,dr,dr + yl = _mm_moveldup_ps(z1); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(z1); //next two samples @@ -227,7 +230,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(lv_32fc_ for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - a = _mm_loadu_ps((float*)&(_in_a[n_vec][number*2])); + a = _mm_loadu_ps((float*)&(_in_a[n_vec][number * 2])); tmp1 = _mm_mul_ps(a, yl); a = _mm_shuffle_ps(a, a, 0xB1); tmp2 = _mm_mul_ps(a, yh); @@ -247,8 +250,8 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(lv_32fc_ for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - _mm_store_ps((float*)dotProductVector, acc[n_vec]); // Store the results back into the dot product vector - dotProduct = lv_cmake(0,0); + _mm_store_ps((float*)dotProductVector, acc[n_vec]); // Store the results back 
into the dot product vector + dotProduct = lv_cmake(0, 0); for (i = 0; i < 2; ++i) { dotProduct = dotProduct + dotProductVector[i]; @@ -260,7 +263,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(lv_32fc_ _mm_store_ps((float*)two_phase_acc, two_phase_acc_reg); (*phase) = two_phase_acc[0]; - for(n = sse_iters * 2; n < num_points; n++) + for (n = sse_iters * 2; n < num_points; n++) { tmp32_1 = in_common[n] * (*phase); (*phase) *= phase_inc; @@ -278,7 +281,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(lv_32fc_ #include static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_32fc_t** in_a, int num_a_vectors, unsigned int num_points) { - lv_32fc_t dotProduct = lv_cmake(0,0); + lv_32fc_t dotProduct = lv_cmake(0, 0); lv_32fc_t tmp32_1, tmp32_2; const unsigned int sse_iters = num_points / 2; int n_vec; @@ -288,7 +291,8 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(lv_32fc_ const lv_32fc_t** _in_a = in_a; const lv_32fc_t* _in_common = in_common; - __VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2]; + __VOLK_ATTR_ALIGNED(16) + lv_32fc_t dotProductVector[2]; __m128* acc = (__m128*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128), volk_gnsssdr_get_alignment()); @@ -300,11 +304,13 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(lv_32fc_ // phase rotation registers __m128 a, two_phase_acc_reg, two_phase_inc_reg, yl, yh, tmp1, tmp1p, tmp2, tmp2p, z1; - __VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_inc[2]; + __VOLK_ATTR_ALIGNED(16) + lv_32fc_t two_phase_inc[2]; two_phase_inc[0] = phase_inc * phase_inc; two_phase_inc[1] = phase_inc * phase_inc; - two_phase_inc_reg = _mm_load_ps((float*) two_phase_inc); - __VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_acc[2]; + two_phase_inc_reg = _mm_load_ps((float*)two_phase_inc); + __VOLK_ATTR_ALIGNED(16) + lv_32fc_t two_phase_acc[2]; two_phase_acc[0] = (*phase); two_phase_acc[1] = (*phase) * phase_inc; two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc); @@ -312,12 +318,12 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(lv_32fc_ const __m128 ylp = _mm_moveldup_ps(two_phase_inc_reg); const __m128 yhp = _mm_movehdup_ps(two_phase_inc_reg); - for(number = 0; number < sse_iters; number++) + for (number = 0; number < sse_iters; number++) { // Phase rotation on operand in_common starts here: a = _mm_load_ps((float*)_in_common); - // __VOLK_GNSSSDR_PREFETCH(_in_common + 4); - yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr + // __VOLK_GNSSSDR_PREFETCH(_in_common + 4); + yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(two_phase_acc_reg); tmp1 = _mm_mul_ps(a, yl); tmp1p = _mm_mul_ps(two_phase_acc_reg, ylp); @@ -328,7 +334,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(lv_32fc_ z1 = _mm_addsub_ps(tmp1, tmp2); two_phase_acc_reg = _mm_addsub_ps(tmp1p, tmp2p); - yl = _mm_moveldup_ps(z1); // Load yl with cr,cr,dr,dr + yl = _mm_moveldup_ps(z1); // Load yl with cr,cr,dr,dr yh = _mm_movehdup_ps(z1); //next two samples @@ -336,7 +342,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(lv_32fc_ for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - a = _mm_load_ps((float*)&(_in_a[n_vec][number*2])); + a = _mm_load_ps((float*)&(_in_a[n_vec][number * 2])); tmp1 = _mm_mul_ps(a, yl); a = _mm_shuffle_ps(a, a, 0xB1); tmp2 = 
_mm_mul_ps(a, yh); @@ -356,8 +362,8 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(lv_32fc_ for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - _mm_store_ps((float*)dotProductVector, acc[n_vec]); // Store the results back into the dot product vector - dotProduct = lv_cmake(0,0); + _mm_store_ps((float*)dotProductVector, acc[n_vec]); // Store the results back into the dot product vector + dotProduct = lv_cmake(0, 0); for (i = 0; i < 2; ++i) { dotProduct = dotProduct + dotProductVector[i]; @@ -369,7 +375,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(lv_32fc_ _mm_store_ps((float*)two_phase_acc, two_phase_acc_reg); (*phase) = two_phase_acc[0]; - for(n = sse_iters * 2; n < num_points; n++) + for (n = sse_iters * 2; n < num_points; n++) { tmp32_1 = in_common[n] * (*phase); (*phase) *= phase_inc; @@ -387,7 +393,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(lv_32fc_ #include static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_32fc_t** in_a, int num_a_vectors, unsigned int num_points) { - lv_32fc_t dotProduct = lv_cmake(0,0); + lv_32fc_t dotProduct = lv_cmake(0, 0); lv_32fc_t tmp32_1, tmp32_2; const unsigned int avx_iters = num_points / 4; int n_vec; @@ -398,7 +404,8 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t const lv_32fc_t* _in_common = in_common; lv_32fc_t _phase = (*phase); - __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4]; + __VOLK_ATTR_ALIGNED(32) + lv_32fc_t dotProductVector[4]; __m256* acc = (__m256*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256), volk_gnsssdr_get_alignment()); @@ -431,12 +438,12 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t const __m256 ylp = _mm256_moveldup_ps(four_phase_inc_reg); const __m256 yhp = _mm256_movehdup_ps(four_phase_inc_reg); - for(number = 0; number < avx_iters; number++) + for (number = 0; number < avx_iters; number++) { // Phase rotation on operand in_common starts here: a = _mm256_loadu_ps((float*)_in_common); __VOLK_GNSSSDR_PREFETCH(_in_common + 16); - yl = _mm256_moveldup_ps(four_phase_acc_reg); // Load yl with cr,cr,dr,dr + yl = _mm256_moveldup_ps(four_phase_acc_reg); // Load yl with cr,cr,dr,dr yh = _mm256_movehdup_ps(four_phase_acc_reg); tmp1 = _mm256_mul_ps(a, yl); tmp1p = _mm256_mul_ps(four_phase_acc_reg, ylp); @@ -447,7 +454,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t z = _mm256_addsub_ps(tmp1, tmp2); four_phase_acc_reg = _mm256_addsub_ps(tmp1p, tmp2p); - yl = _mm256_moveldup_ps(z); // Load yl with cr,cr,dr,dr + yl = _mm256_moveldup_ps(z); // Load yl with cr,cr,dr,dr yh = _mm256_movehdup_ps(z); //next two samples @@ -475,8 +482,8 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - _mm256_store_ps((float*)dotProductVector, acc[n_vec]); // Store the results back into the dot product vector - dotProduct = lv_cmake(0,0); + _mm256_store_ps((float*)dotProductVector, acc[n_vec]); // Store the results back into the dot product vector + dotProduct = lv_cmake(0, 0); for (i = 0; i < 4; ++i) { dotProduct = dotProduct + dotProductVector[i]; @@ -492,10 +499,10 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t four_phase_acc_reg = _mm256_div_ps(four_phase_acc_reg, tmp2); _mm256_store_ps((float*)four_phase_acc, 
four_phase_acc_reg); - _phase = four_phase_acc[0]; + _phase = four_phase_acc[0]; _mm256_zeroupper(); - for(n = avx_iters * 4; n < num_points; n++) + for (n = avx_iters * 4; n < num_points; n++) { tmp32_1 = *_in_common++ * _phase; _phase *= phase_inc; @@ -514,7 +521,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t #include static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_32fc_t** in_a, int num_a_vectors, unsigned int num_points) { - lv_32fc_t dotProduct = lv_cmake(0,0); + lv_32fc_t dotProduct = lv_cmake(0, 0); lv_32fc_t tmp32_1, tmp32_2; const unsigned int avx_iters = num_points / 4; int n_vec; @@ -525,7 +532,8 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t const lv_32fc_t* _in_common = in_common; lv_32fc_t _phase = (*phase); - __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4]; + __VOLK_ATTR_ALIGNED(32) + lv_32fc_t dotProductVector[4]; __m256* acc = (__m256*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256), volk_gnsssdr_get_alignment()); @@ -538,7 +546,8 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t // phase rotation registers __m256 a, four_phase_acc_reg, yl, yh, tmp1, tmp1p, tmp2, tmp2p, z; - __VOLK_ATTR_ALIGNED(32) lv_32fc_t four_phase_inc[4]; + __VOLK_ATTR_ALIGNED(32) + lv_32fc_t four_phase_inc[4]; const lv_32fc_t phase_inc2 = phase_inc * phase_inc; const lv_32fc_t phase_inc3 = phase_inc2 * phase_inc; const lv_32fc_t phase_inc4 = phase_inc3 * phase_inc; @@ -548,7 +557,8 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t four_phase_inc[3] = phase_inc4; const __m256 four_phase_inc_reg = _mm256_load_ps((float*)four_phase_inc); - __VOLK_ATTR_ALIGNED(32) lv_32fc_t four_phase_acc[4]; + __VOLK_ATTR_ALIGNED(32) + lv_32fc_t four_phase_acc[4]; four_phase_acc[0] = _phase; four_phase_acc[1] = _phase * phase_inc; four_phase_acc[2] = _phase * phase_inc2; @@ -558,12 +568,12 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t const __m256 ylp = _mm256_moveldup_ps(four_phase_inc_reg); const __m256 yhp = _mm256_movehdup_ps(four_phase_inc_reg); - for(number = 0; number < avx_iters; number++) + for (number = 0; number < avx_iters; number++) { // Phase rotation on operand in_common starts here: a = _mm256_load_ps((float*)_in_common); __VOLK_GNSSSDR_PREFETCH(_in_common + 16); - yl = _mm256_moveldup_ps(four_phase_acc_reg); // Load yl with cr,cr,dr,dr + yl = _mm256_moveldup_ps(four_phase_acc_reg); // Load yl with cr,cr,dr,dr yh = _mm256_movehdup_ps(four_phase_acc_reg); tmp1 = _mm256_mul_ps(a, yl); tmp1p = _mm256_mul_ps(four_phase_acc_reg, ylp); @@ -574,7 +584,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t z = _mm256_addsub_ps(tmp1, tmp2); four_phase_acc_reg = _mm256_addsub_ps(tmp1p, tmp2p); - yl = _mm256_moveldup_ps(z); // Load yl with cr,cr,dr,dr + yl = _mm256_moveldup_ps(z); // Load yl with cr,cr,dr,dr yh = _mm256_movehdup_ps(z); //next two samples @@ -602,8 +612,8 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - _mm256_store_ps((float*)dotProductVector, acc[n_vec]); // Store the results back into the dot product vector - dotProduct = lv_cmake(0,0); + _mm256_store_ps((float*)dotProductVector, acc[n_vec]); // Store the results back into the dot product vector + dotProduct = lv_cmake(0, 
0); for (i = 0; i < 4; ++i) { dotProduct = dotProduct + dotProductVector[i]; @@ -619,10 +629,10 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t four_phase_acc_reg = _mm256_div_ps(four_phase_acc_reg, tmp2); _mm256_store_ps((float*)four_phase_acc, four_phase_acc_reg); - _phase = four_phase_acc[0]; + _phase = four_phase_acc[0]; _mm256_zeroupper(); - for(n = avx_iters * 4; n < num_points; n++) + for (n = avx_iters * 4; n < num_points; n++) { tmp32_1 = *_in_common++ * _phase; _phase *= phase_inc; @@ -646,7 +656,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_neon(lv_32fc_t* int n_vec; int i; unsigned int number; - unsigned int n ; + unsigned int n; const lv_32fc_t** _in_a = in_a; const lv_32fc_t* _in_common = in_common; lv_32fc_t* _out = result; @@ -656,36 +666,41 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_neon(lv_32fc_t* if (neon_iters > 0) { - lv_32fc_t dotProduct = lv_cmake(0,0); + lv_32fc_t dotProduct = lv_cmake(0, 0); float32_t arg_phase0 = cargf(_phase); float32_t arg_phase_inc = cargf(phase_inc); float32_t phase_est; lv_32fc_t ___phase4 = phase_inc * phase_inc * phase_inc * phase_inc; - __VOLK_ATTR_ALIGNED(16) float32_t __phase4_real[4] = { lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4) }; - __VOLK_ATTR_ALIGNED(16) float32_t __phase4_imag[4] = { lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4) }; + __VOLK_ATTR_ALIGNED(16) + float32_t __phase4_real[4] = {lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4)}; + __VOLK_ATTR_ALIGNED(16) + float32_t __phase4_imag[4] = {lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4)}; float32x4_t _phase4_real = vld1q_f32(__phase4_real); float32x4_t _phase4_imag = vld1q_f32(__phase4_imag); - lv_32fc_t phase2 = (lv_32fc_t)(_phase) * phase_inc; + lv_32fc_t phase2 = (lv_32fc_t)(_phase)*phase_inc; lv_32fc_t phase3 = phase2 * phase_inc; lv_32fc_t phase4 = phase3 * phase_inc; - __VOLK_ATTR_ALIGNED(16) float32_t __phase_real[4] = { lv_creal((_phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) }; - __VOLK_ATTR_ALIGNED(16) float32_t __phase_imag[4] = { lv_cimag((_phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; + __VOLK_ATTR_ALIGNED(16) + float32_t __phase_real[4] = {lv_creal((_phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4)}; + __VOLK_ATTR_ALIGNED(16) + float32_t __phase_imag[4] = {lv_cimag((_phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4)}; float32x4_t _phase_real = vld1q_f32(__phase_real); float32x4_t _phase_imag = vld1q_f32(__phase_imag); - __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4]; + __VOLK_ATTR_ALIGNED(32) + lv_32fc_t dotProductVector[4]; float32x4x2_t a_val, b_val, tmp32_real, tmp32_imag; float32x4x2_t* accumulator1 = (float32x4x2_t*)volk_gnsssdr_malloc(num_a_vectors * sizeof(float32x4x2_t), volk_gnsssdr_get_alignment()); float32x4x2_t* accumulator2 = (float32x4x2_t*)volk_gnsssdr_malloc(num_a_vectors * sizeof(float32x4x2_t), volk_gnsssdr_get_alignment()); - for(n_vec = 0; n_vec < num_a_vectors; n_vec++) + for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { accumulator1[n_vec].val[0] = vdupq_n_f32(0.0f); accumulator1[n_vec].val[1] = vdupq_n_f32(0.0f); @@ -693,7 +708,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_neon(lv_32fc_t* accumulator2[n_vec].val[1] = vdupq_n_f32(0.0f); } - for(number = 0; number < neon_iters; number++) + for (number = 0; number < 
neon_iters; number++) { /* load 4 complex numbers (float 32 bits each component) */ b_val = vld2q_f32((float32_t*)_in_common); @@ -728,8 +743,10 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_neon(lv_32fc_t* phase3 = phase2 * phase_inc; phase4 = phase3 * phase_inc; - __VOLK_ATTR_ALIGNED(16) float32_t ____phase_real[4] = { lv_creal((_phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) }; - __VOLK_ATTR_ALIGNED(16) float32_t ____phase_imag[4] = { lv_cimag((_phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; + __VOLK_ATTR_ALIGNED(16) + float32_t ____phase_real[4] = {lv_creal((_phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4)}; + __VOLK_ATTR_ALIGNED(16) + float32_t ____phase_imag[4] = {lv_cimag((_phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4)}; _phase_real = vld1q_f32(____phase_real); _phase_imag = vld1q_f32(____phase_imag); @@ -753,8 +770,8 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_neon(lv_32fc_t* } for (n_vec = 0; n_vec < num_a_vectors; n_vec++) { - vst2q_f32((float32_t*)dotProductVector, accumulator1[n_vec]); // Store the results back into the dot product vector - dotProduct = lv_cmake(0,0); + vst2q_f32((float32_t*)dotProductVector, accumulator1[n_vec]); // Store the results back into the dot product vector + dotProduct = lv_cmake(0, 0); for (i = 0; i < 4; ++i) { dotProduct = dotProduct + dotProductVector[i]; @@ -770,7 +787,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_neon(lv_32fc_t* _phase = lv_cmake((float32_t)__phase_real[0], (float32_t)__phase_imag[0]); } - for(n = neon_iters * 4; n < num_points; n++) + for (n = neon_iters * 4; n < num_points; n++) { tmp32_1 = in_common[n] * _phase; _phase *= phase_inc; @@ -786,4 +803,3 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_neon(lv_32fc_t* #endif /* LV_HAVE_NEON */ #endif /* INCLUDED_volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_H */ - diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc.h index 3072542cf..846539fc9 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc.h @@ -41,7 +41,7 @@ #include #ifdef LV_HAVE_GENERIC -static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_generic(lv_32fc_t* result, const lv_32fc_t* local_code, const lv_32fc_t* in, unsigned int num_points) +static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_generic(lv_32fc_t* result, const lv_32fc_t* local_code, const lv_32fc_t* in, unsigned int num_points) { // phases must be normalized. Phase rotator expects a complex exponential input! 
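/*
 * Note on the normalization requirement stated above: both the initial phase
 * and the phase increment must have unit modulus, so that repeated complex
 * multiplication only rotates the input samples and never rescales them.
 * A minimal sketch of how such inputs are typically built (the 0.25 rad value
 * matches the line below; the 0.125 rad step is illustrative only, and
 * <math.h> is assumed):
 *
 *     float rem_carrier_phase_in_rad = 0.25;
 *     float phase_step_rad = 0.125;
 *     lv_32fc_t phase[1] = {lv_cmake(cosf(rem_carrier_phase_in_rad), sinf(rem_carrier_phase_in_rad))};
 *     lv_32fc_t phase_inc[1] = {lv_cmake(cosf(phase_step_rad), sinf(phase_step_rad))};
 *
 * Both values then lie on the unit circle, as the rotator kernels require.
 */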
float rem_carrier_phase_in_rad = 0.25; @@ -53,14 +53,14 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_generic(lv_ unsigned int n; int num_a_vectors = 3; lv_32fc_t** in_a = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { in_a[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment()); memcpy((lv_32fc_t*)in_a[n], (lv_32fc_t*)in, sizeof(lv_32fc_t) * num_points); } - volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_generic_reload(result, local_code, phase_inc[0], phase, (const lv_32fc_t**) in_a, num_a_vectors, num_points); + volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_generic_reload(result, local_code, phase_inc[0], phase, (const lv_32fc_t**)in_a, num_a_vectors, num_points); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { volk_gnsssdr_free(in_a[n]); } @@ -71,7 +71,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_generic(lv_ #ifdef LV_HAVE_GENERIC -static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_generic_reload(lv_32fc_t* result, const lv_32fc_t* local_code, const lv_32fc_t* in, unsigned int num_points) +static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_generic_reload(lv_32fc_t* result, const lv_32fc_t* local_code, const lv_32fc_t* in, unsigned int num_points) { // phases must be normalized. Phase rotator expects a complex exponential input! float rem_carrier_phase_in_rad = 0.25; @@ -83,14 +83,14 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_generic_rel unsigned int n; int num_a_vectors = 3; lv_32fc_t** in_a = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { in_a[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment()); memcpy((lv_32fc_t*)in_a[n], (lv_32fc_t*)in, sizeof(lv_32fc_t) * num_points); } - volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_generic_reload(result, local_code, phase_inc[0], phase, (const lv_32fc_t**) in_a, num_a_vectors, num_points); + volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_generic_reload(result, local_code, phase_inc[0], phase, (const lv_32fc_t**)in_a, num_a_vectors, num_points); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { volk_gnsssdr_free(in_a[n]); } @@ -101,7 +101,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_generic_rel #ifdef LV_HAVE_SSE3 -static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_u_sse3(lv_32fc_t* result, const lv_32fc_t* local_code, const lv_32fc_t* in, unsigned int num_points) +static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_u_sse3(lv_32fc_t* result, const lv_32fc_t* local_code, const lv_32fc_t* in, unsigned int num_points) { // phases must be normalized. Phase rotator expects a complex exponential input! 
float rem_carrier_phase_in_rad = 0.25; @@ -113,14 +113,14 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_u_sse3(lv_3 unsigned int n; int num_a_vectors = 3; lv_32fc_t** in_a = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { in_a[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment()); memcpy((lv_32fc_t*)in_a[n], (lv_32fc_t*)in, sizeof(lv_32fc_t) * num_points); } - volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(result, local_code, phase_inc[0], phase, (const lv_32fc_t**) in_a, num_a_vectors, num_points); + volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(result, local_code, phase_inc[0], phase, (const lv_32fc_t**)in_a, num_a_vectors, num_points); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { volk_gnsssdr_free(in_a[n]); } @@ -131,7 +131,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_u_sse3(lv_3 #ifdef LV_HAVE_SSE3 -static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_a_sse3(lv_32fc_t* result, const lv_32fc_t* local_code, const lv_32fc_t* in, unsigned int num_points) +static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_a_sse3(lv_32fc_t* result, const lv_32fc_t* local_code, const lv_32fc_t* in, unsigned int num_points) { // phases must be normalized. Phase rotator expects a complex exponential input! float rem_carrier_phase_in_rad = 0.25; @@ -143,14 +143,14 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_a_sse3(lv_3 unsigned int n; int num_a_vectors = 3; lv_32fc_t** in_a = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { in_a[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment()); memcpy((lv_32fc_t*)in_a[n], (lv_32fc_t*)in, sizeof(lv_32fc_t) * num_points); } - volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(result, local_code, phase_inc[0], phase, (const lv_32fc_t**) in_a, num_a_vectors, num_points); + volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(result, local_code, phase_inc[0], phase, (const lv_32fc_t**)in_a, num_a_vectors, num_points); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { volk_gnsssdr_free(in_a[n]); } @@ -161,7 +161,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_a_sse3(lv_3 #ifdef LV_HAVE_AVX -static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_u_avx(lv_32fc_t* result, const lv_32fc_t* local_code, const lv_32fc_t* in, unsigned int num_points) +static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_u_avx(lv_32fc_t* result, const lv_32fc_t* local_code, const lv_32fc_t* in, unsigned int num_points) { // phases must be normalized. Phase rotator expects a complex exponential input! 
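/*
 * Note: every *_dotprodxnpuppet_* wrapper in this file follows the same
 * pattern visible here: it allocates num_a_vectors = 3 aligned copies of the
 * input with volk_gnsssdr_malloc()/memcpy(), forwards them to the matching
 * *_dot_prod_*_xn kernel together with the fixed phase setup, and releases
 * them with volk_gnsssdr_free(). The puppets exist so that the QA harness,
 * which only knows single-input signatures, can still exercise kernels whose
 * real interface takes an array of input vectors.
 */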
float rem_carrier_phase_in_rad = 0.25; @@ -173,14 +173,14 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_u_avx(lv_32 unsigned int n; int num_a_vectors = 3; lv_32fc_t** in_a = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { in_a[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment()); memcpy((lv_32fc_t*)in_a[n], (lv_32fc_t*)in, sizeof(lv_32fc_t) * num_points); } - volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(result, local_code, phase_inc[0], phase, (const lv_32fc_t**) in_a, num_a_vectors, num_points); + volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(result, local_code, phase_inc[0], phase, (const lv_32fc_t**)in_a, num_a_vectors, num_points); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { volk_gnsssdr_free(in_a[n]); } @@ -191,7 +191,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_u_avx(lv_32 #ifdef LV_HAVE_AVX -static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_a_avx(lv_32fc_t* result, const lv_32fc_t* local_code, const lv_32fc_t* in, unsigned int num_points) +static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_a_avx(lv_32fc_t* result, const lv_32fc_t* local_code, const lv_32fc_t* in, unsigned int num_points) { // phases must be normalized. Phase rotator expects a complex exponential input! float rem_carrier_phase_in_rad = 0.25; @@ -203,14 +203,14 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_a_avx(lv_32 unsigned int n; int num_a_vectors = 3; lv_32fc_t** in_a = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { in_a[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment()); memcpy((lv_32fc_t*)in_a[n], (lv_32fc_t*)in, sizeof(lv_32fc_t) * num_points); } - volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(result, local_code, phase_inc[0], phase, (const lv_32fc_t**) in_a, num_a_vectors, num_points); + volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(result, local_code, phase_inc[0], phase, (const lv_32fc_t**)in_a, num_a_vectors, num_points); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { volk_gnsssdr_free(in_a[n]); } @@ -221,7 +221,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_a_avx(lv_32 #ifdef LV_HAVE_NEON -static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_neon(lv_32fc_t* result, const lv_32fc_t* local_code, const lv_32fc_t* in, unsigned int num_points) +static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_neon(lv_32fc_t* result, const lv_32fc_t* local_code, const lv_32fc_t* in, unsigned int num_points) { // phases must be normalized. Phase rotator expects a complex exponential input! 
float rem_carrier_phase_in_rad = 0.25; @@ -233,14 +233,14 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_neon(lv_32f unsigned int n; int num_a_vectors = 3; lv_32fc_t** in_a = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { in_a[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment()); memcpy((lv_32fc_t*)in_a[n], (lv_32fc_t*)in, sizeof(lv_32fc_t) * num_points); } - volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_neon(result, local_code, phase_inc[0], phase, (const lv_32fc_t**) in_a, num_a_vectors, num_points); + volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_neon(result, local_code, phase_inc[0], phase, (const lv_32fc_t**)in_a, num_a_vectors, num_points); - for(n = 0; n < num_a_vectors; n++) + for (n = 0; n < num_a_vectors; n++) { volk_gnsssdr_free(in_a[n]); } diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_xn_resampler_32fc_xn.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_xn_resampler_32fc_xn.h index f8db65944..3e6227a17 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_xn_resampler_32fc_xn.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_xn_resampler_32fc_xn.h @@ -107,7 +107,8 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_sse3(lv_32fc_t** res const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips); const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips); - __VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; + __VOLK_ATTR_ALIGNED(16) + int local_code_chip_index[4]; int local_code_chip_index_; const __m128i zeros = _mm_setzero_si128(); @@ -121,7 +122,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_sse3(lv_32fc_t** res shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]); aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); __m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f); - for(n = 0; n < quarterPoints; n++) + for (n = 0; n < quarterPoints; n++) { aux = _mm_mul_ps(code_phase_step_chips_reg, indexn); aux = _mm_add_ps(aux, aux2); @@ -142,18 +143,18 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_sse3(lv_32fc_t** res aux_i = _mm_and_si128(code_length_chips_reg_i, negatives); local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i); _mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg); - for(k = 0; k < 4; ++k) + for (k = 0; k < 4; ++k) { _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]]; } indexn = _mm_add_ps(indexn, fours); } - for(n = quarterPoints * 4; n < num_points; n++) + for (n = quarterPoints * 4; n < num_points; n++) { // resample code for current tap local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); //Take into account that in multitap correlators, the shifts can be negative! 
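/*
 * Note on the wrap-around correction below: C's % operator keeps the sign of
 * the dividend, so a negative tentative index must first be raised by a whole
 * number of code periods. A worked example with illustrative numbers: for
 * local_code_chip_index_ = -3 and code_length_chips = 1023,
 * abs(-3) / 1023 + 1 == 1, so the index becomes -3 + 1023 = 1020, and
 * 1020 % 1023 == 1020, a valid position inside local_code.
 */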
- if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ; + if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); local_code_chip_index_ = local_code_chip_index_ % code_length_chips; _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; } @@ -177,7 +178,8 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_sse3(lv_32fc_t** res const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips); const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips); - __VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; + __VOLK_ATTR_ALIGNED(16) + int local_code_chip_index[4]; int local_code_chip_index_; const __m128i zeros = _mm_setzero_si128(); @@ -191,7 +193,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_sse3(lv_32fc_t** res shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]); aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); __m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f); - for(n = 0; n < quarterPoints; n++) + for (n = 0; n < quarterPoints; n++) { aux = _mm_mul_ps(code_phase_step_chips_reg, indexn); aux = _mm_add_ps(aux, aux2); @@ -212,18 +214,18 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_sse3(lv_32fc_t** res aux_i = _mm_and_si128(code_length_chips_reg_i, negatives); local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i); _mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg); - for(k = 0; k < 4; ++k) + for (k = 0; k < 4; ++k) { _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]]; } indexn = _mm_add_ps(indexn, fours); } - for(n = quarterPoints * 4; n < num_points; n++) + for (n = quarterPoints * 4; n < num_points; n++) { // resample code for current tap local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); //Take into account that in multitap correlators, the shifts can be negative! 
- if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ; + if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); local_code_chip_index_ = local_code_chip_index_ % code_length_chips; _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; } @@ -245,7 +247,8 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_sse4_1(lv_32fc_t** r const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips); const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips); - __VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; + __VOLK_ATTR_ALIGNED(16) + int local_code_chip_index[4]; int local_code_chip_index_; const __m128i zeros = _mm_setzero_si128(); @@ -259,7 +262,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_sse4_1(lv_32fc_t** r shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]); aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); __m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f); - for(n = 0; n < quarterPoints; n++) + for (n = 0; n < quarterPoints; n++) { aux = _mm_mul_ps(code_phase_step_chips_reg, indexn); aux = _mm_add_ps(aux, aux2); @@ -277,18 +280,18 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_sse4_1(lv_32fc_t** r aux_i = _mm_and_si128(code_length_chips_reg_i, negatives); local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i); _mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg); - for(k = 0; k < 4; ++k) + for (k = 0; k < 4; ++k) { _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]]; } indexn = _mm_add_ps(indexn, fours); } - for(n = quarterPoints * 4; n < num_points; n++) + for (n = quarterPoints * 4; n < num_points; n++) { // resample code for current tap local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); //Take into account that in multitap correlators, the shifts can be negative! 
- if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ; + if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); local_code_chip_index_ = local_code_chip_index_ % code_length_chips; _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; } @@ -311,7 +314,8 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_sse4_1(lv_32fc_t** r const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips); const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips); - __VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; + __VOLK_ATTR_ALIGNED(16) + int local_code_chip_index[4]; int local_code_chip_index_; const __m128i zeros = _mm_setzero_si128(); @@ -325,7 +329,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_sse4_1(lv_32fc_t** r shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]); aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); __m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f); - for(n = 0; n < quarterPoints; n++) + for (n = 0; n < quarterPoints; n++) { aux = _mm_mul_ps(code_phase_step_chips_reg, indexn); aux = _mm_add_ps(aux, aux2); @@ -343,18 +347,18 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_sse4_1(lv_32fc_t** r aux_i = _mm_and_si128(code_length_chips_reg_i, negatives); local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i); _mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg); - for(k = 0; k < 4; ++k) + for (k = 0; k < 4; ++k) { _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]]; } indexn = _mm_add_ps(indexn, fours); } - for(n = quarterPoints * 4; n < num_points; n++) + for (n = quarterPoints * 4; n < num_points; n++) { // resample code for current tap local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); //Take into account that in multitap correlators, the shifts can be negative! 
- if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ; + if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); local_code_chip_index_ = local_code_chip_index_ % code_length_chips; _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; } @@ -377,7 +381,8 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx(lv_32fc_t** resu const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips); const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips); - __VOLK_ATTR_ALIGNED(32) int local_code_chip_index[8]; + __VOLK_ATTR_ALIGNED(32) + int local_code_chip_index[8]; int local_code_chip_index_; const __m256 zeros = _mm256_setzero_ps(); @@ -392,7 +397,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx(lv_32fc_t** resu shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]); aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); indexn = n0; - for(n = 0; n < avx_iters; n++) + for (n = 0; n < avx_iters; n++) { __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0); __VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3); @@ -410,13 +415,13 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx(lv_32fc_t** resu // no negatives c = _mm256_cvtepi32_ps(local_code_chip_index_reg); - negatives = _mm256_cmp_ps(c, zeros, 0x01 ); + negatives = _mm256_cmp_ps(c, zeros, 0x01); aux3 = _mm256_and_ps(code_length_chips_reg_f, negatives); aux = _mm256_add_ps(c, aux3); local_code_chip_index_reg = _mm256_cvttps_epi32(aux); _mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg); - for(k = 0; k < 8; ++k) + for (k = 0; k < 8; ++k) { _result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]]; } @@ -426,12 +431,12 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx(lv_32fc_t** resu _mm256_zeroupper(); for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++) { - for(n = avx_iters * 8; n < num_points; n++) + for (n = avx_iters * 8; n < num_points; n++) { // resample code for current tap local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); //Take into account that in multitap correlators, the shifts can be negative! 
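/*
 * Note: the __VOLK_GNSSSDR_PREFETCH_LOCALITY(addr, rw, locality) hints used
 * in the AVX loops map to the compiler's prefetch builtin on GCC-compatible
 * compilers (__builtin_prefetch): rw = 1 requests a prefetch for writing, and
 * locality 0..3 grades the expected temporal reuse, with 0 meaning "streamed,
 * do not keep in cache" and 3 meaning "keep in all cache levels". Prefetching
 * &_result[current_correlator_tap][8 * n + 7] therefore warms the destination
 * of the upcoming stores without polluting the cache.
 */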
- if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ; + if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); local_code_chip_index_ = local_code_chip_index_ % code_length_chips; _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; } @@ -454,7 +459,8 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx(lv_32fc_t** resu const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips); const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips); - __VOLK_ATTR_ALIGNED(32) int local_code_chip_index[8]; + __VOLK_ATTR_ALIGNED(32) + int local_code_chip_index[8]; int local_code_chip_index_; const __m256 zeros = _mm256_setzero_ps(); @@ -469,7 +475,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx(lv_32fc_t** resu shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]); aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); indexn = n0; - for(n = 0; n < avx_iters; n++) + for (n = 0; n < avx_iters; n++) { __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0); __VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3); @@ -487,13 +493,13 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx(lv_32fc_t** resu // no negatives c = _mm256_cvtepi32_ps(local_code_chip_index_reg); - negatives = _mm256_cmp_ps(c, zeros, 0x01 ); + negatives = _mm256_cmp_ps(c, zeros, 0x01); aux3 = _mm256_and_ps(code_length_chips_reg_f, negatives); aux = _mm256_add_ps(c, aux3); local_code_chip_index_reg = _mm256_cvttps_epi32(aux); _mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg); - for(k = 0; k < 8; ++k) + for (k = 0; k < 8; ++k) { _result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]]; } @@ -503,12 +509,12 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx(lv_32fc_t** resu _mm256_zeroupper(); for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++) { - for(n = avx_iters * 8; n < num_points; n++) + for (n = avx_iters * 8; n < num_points; n++) { // resample code for current tap local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); //Take into account that in multitap correlators, the shifts can be negative! 
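/*
 * Note: the literal 0x01 passed to _mm256_cmp_ps in the AVX loops above is
 * the predicate _CMP_LT_OS (less-than, ordered, signaling), so `negatives`
 * carries an all-ones lane mask wherever the tentative chip index is below
 * zero. ANDing that mask with code_length_chips_reg_f and adding the result
 * back is the branch-free vector counterpart of the scalar correction in the
 * tail loop. The named constant would be equivalent and more readable:
 *
 *     negatives = _mm256_cmp_ps(c, zeros, _CMP_LT_OS);
 */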
- if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ; + if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); local_code_chip_index_ = local_code_chip_index_ % code_length_chips; _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; } @@ -531,7 +537,8 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx2(lv_32fc_t** res const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips); const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips); - __VOLK_ATTR_ALIGNED(32) int local_code_chip_index[8]; + __VOLK_ATTR_ALIGNED(32) + int local_code_chip_index[8]; int local_code_chip_index_; const __m256 zeros = _mm256_setzero_ps(); @@ -546,7 +553,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx2(lv_32fc_t** res shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]); aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); indexn = n0; - for(n = 0; n < avx_iters; n++) + for (n = 0; n < avx_iters; n++) { __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0); __VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3); @@ -565,13 +572,13 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx2(lv_32fc_t** res // no negatives c = _mm256_cvtepi32_ps(local_code_chip_index_reg); - negatives = _mm256_cmp_ps(c, zeros, 0x01 ); + negatives = _mm256_cmp_ps(c, zeros, 0x01); aux3 = _mm256_and_ps(code_length_chips_reg_f, negatives); aux = _mm256_add_ps(c, aux3); local_code_chip_index_reg = _mm256_cvttps_epi32(aux); _mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg); - for(k = 0; k < 8; ++k) + for (k = 0; k < 8; ++k) { _result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]]; } @@ -581,12 +588,12 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx2(lv_32fc_t** res _mm256_zeroupper(); for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++) { - for(n = avx_iters * 8; n < num_points; n++) + for (n = avx_iters * 8; n < num_points; n++) { // resample code for current tap local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); //Take into account that in multitap correlators, the shifts can be negative! 
- if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ; + if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); local_code_chip_index_ = local_code_chip_index_ % code_length_chips; _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; } @@ -609,7 +616,8 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx2(lv_32fc_t** res const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips); const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips); - __VOLK_ATTR_ALIGNED(32) int local_code_chip_index[8]; + __VOLK_ATTR_ALIGNED(32) + int local_code_chip_index[8]; int local_code_chip_index_; const __m256 zeros = _mm256_setzero_ps(); @@ -624,7 +632,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx2(lv_32fc_t** res shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]); aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); indexn = n0; - for(n = 0; n < avx_iters; n++) + for (n = 0; n < avx_iters; n++) { __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0); __VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3); @@ -643,13 +651,13 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx2(lv_32fc_t** res // no negatives c = _mm256_cvtepi32_ps(local_code_chip_index_reg); - negatives = _mm256_cmp_ps(c, zeros, 0x01 ); + negatives = _mm256_cmp_ps(c, zeros, 0x01); aux3 = _mm256_and_ps(code_length_chips_reg_f, negatives); aux = _mm256_add_ps(c, aux3); local_code_chip_index_reg = _mm256_cvttps_epi32(aux); _mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg); - for(k = 0; k < 8; ++k) + for (k = 0; k < 8; ++k) { _result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]]; } @@ -659,12 +667,12 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx2(lv_32fc_t** res _mm256_zeroupper(); for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++) { - for(n = avx_iters * 8; n < num_points; n++) + for (n = avx_iters * 8; n < num_points; n++) { // resample code for current tap local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); //Take into account that in multitap correlators, the shifts can be negative! 
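/*
 * Note: the _mm256_zeroupper() calls issued when the AVX loops finish clear
 * the upper 128 bits of the YMM registers. This avoids the AVX-to-SSE
 * transition penalty that older Intel microarchitectures impose when legacy
 * SSE instructions (such as those in the scalar tail or in the caller) run
 * while the upper register halves are still dirty.
 */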
- if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ; + if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1); local_code_chip_index_ = local_code_chip_index_ % code_length_chips; _result[current_correlator_tap][n] = local_code[local_code_chip_index_]; } @@ -689,19 +697,21 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_neon(lv_32fc_t** resul const float32x4_t rem_code_phase_chips_reg = vdupq_n_f32(rem_code_phase_chips); const float32x4_t code_phase_step_chips_reg = vdupq_n_f32(code_phase_step_chips); - __VOLK_ATTR_ALIGNED(16) int32_t local_code_chip_index[4]; + __VOLK_ATTR_ALIGNED(16) + int32_t local_code_chip_index[4]; int32_t local_code_chip_index_; const int32x4_t zeros = vdupq_n_s32(0); const float32x4_t code_length_chips_reg_f = vdupq_n_f32((float)code_length_chips); const int32x4_t code_length_chips_reg_i = vdupq_n_s32((int32_t)code_length_chips); - int32x4_t local_code_chip_index_reg, aux_i, negatives, i; + int32x4_t local_code_chip_index_reg, aux_i, negatives, i; float32x4_t aux, aux2, shifts_chips_reg, fi, c, j, cTrunc, base, indexn, reciprocal; - __VOLK_ATTR_ALIGNED(16) const float vec[4] = { 0.0f, 1.0f, 2.0f, 3.0f }; + __VOLK_ATTR_ALIGNED(16) + const float vec[4] = {0.0f, 1.0f, 2.0f, 3.0f}; uint32x4_t igx; reciprocal = vrecpeq_f32(code_length_chips_reg_f); reciprocal = vmulq_f32(vrecpsq_f32(code_length_chips_reg_f, reciprocal), reciprocal); - reciprocal = vmulq_f32(vrecpsq_f32(code_length_chips_reg_f, reciprocal), reciprocal); // this refinement is required! + reciprocal = vmulq_f32(vrecpsq_f32(code_length_chips_reg_f, reciprocal), reciprocal); // this refinement is required! 
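/*
 * Note: vrecpeq_f32() returns only a rough reciprocal estimate (about 8
 * significant bits), so each vrecpsq_f32() line above is one Newton-Raphson
 * step, x' = x * (2 - d * x), roughly doubling the number of correct bits.
 * The refined 1/code_length_chips feeds the truncation-based fmod in the loop
 * below; with a single step the computed index could land one chip off near a
 * code period boundary, which is why the source marks the second refinement
 * as required.
 */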
float32x4_t n0 = vld1q_f32((float*)vec); for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++) @@ -709,7 +719,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_neon(lv_32fc_t** resul shifts_chips_reg = vdupq_n_f32((float)shifts_chips[current_correlator_tap]); aux2 = vsubq_f32(shifts_chips_reg, rem_code_phase_chips_reg); indexn = n0; - for(n = 0; n < neon_iters; n++) + for (n = 0; n < neon_iters; n++) { __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][4 * n + 3], 1, 0); __VOLK_GNSSSDR_PREFETCH(&local_code_chip_index[4]); @@ -725,7 +735,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_neon(lv_32fc_t** resul // fmod c = vmulq_f32(aux, reciprocal); - i = vcvtq_s32_f32(c); + i = vcvtq_s32_f32(c); cTrunc = vcvtq_f32_s32(i); base = vmulq_f32(cTrunc, code_length_chips_reg_f); aux = vsubq_f32(aux, base); @@ -737,13 +747,13 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_neon(lv_32fc_t** resul vst1q_s32((int32_t*)local_code_chip_index, local_code_chip_index_reg); - for(k = 0; k < 4; ++k) + for (k = 0; k < 4; ++k) { _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]]; } indexn = vaddq_f32(indexn, fours); } - for(n = neon_iters * 4; n < num_points; n++) + for (n = neon_iters * 4; n < num_points; n++) { __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][n], 1, 0); // resample code for current tap diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_64f_accumulator_64f.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_64f_accumulator_64f.h index e1d577c1e..b686b6c5d 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_64f_accumulator_64f.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_64f_accumulator_64f.h @@ -69,11 +69,12 @@ static inline void volk_gnsssdr_64f_accumulator_64f_u_avx(double* result, const unsigned int i; const double* aPtr = inputBuffer; - __VOLK_ATTR_ALIGNED(32) double tempBuffer[4]; + __VOLK_ATTR_ALIGNED(32) + double tempBuffer[4]; __m256d accumulator = _mm256_setzero_pd(); __m256d aVal = _mm256_setzero_pd(); - for(number = 0; number < sse_iters; number++) + for (number = 0; number < sse_iters; number++) { aVal = _mm256_loadu_pd(aPtr); accumulator = _mm256_add_pd(accumulator, aVal); @@ -82,12 +83,12 @@ static inline void volk_gnsssdr_64f_accumulator_64f_u_avx(double* result, const _mm256_storeu_pd((double*)tempBuffer, accumulator); - for(i = 0; i < 4; ++i) + for (i = 0; i < 4; ++i) { returnValue += tempBuffer[i]; } - for(i = 0; i < (num_points % 4); ++i) + for (i = 0; i < (num_points % 4); ++i) { returnValue += (*aPtr++); } @@ -100,7 +101,7 @@ static inline void volk_gnsssdr_64f_accumulator_64f_u_avx(double* result, const #ifdef LV_HAVE_SSE3 #include -static inline void volk_gnsssdr_64f_accumulator_64f_u_sse3(double* result,const double* inputBuffer, unsigned int num_points) +static inline void volk_gnsssdr_64f_accumulator_64f_u_sse3(double* result, const double* inputBuffer, unsigned int num_points) { double returnValue = 0; const unsigned int sse_iters = num_points / 2; @@ -108,11 +109,12 @@ static inline void volk_gnsssdr_64f_accumulator_64f_u_sse3(double* result,const unsigned int i; const double* aPtr = inputBuffer; - __VOLK_ATTR_ALIGNED(16) double tempBuffer[2]; + __VOLK_ATTR_ALIGNED(16) + double tempBuffer[2]; __m128d accumulator = _mm_setzero_pd(); 
__m128d aVal = _mm_setzero_pd(); - for(number = 0; number < sse_iters; number++) + for (number = 0; number < sse_iters; number++) { aVal = _mm_loadu_pd(aPtr); accumulator = _mm_add_pd(accumulator, aVal); @@ -121,12 +123,12 @@ static inline void volk_gnsssdr_64f_accumulator_64f_u_sse3(double* result,const _mm_storeu_pd((double*)tempBuffer, accumulator); - for(i = 0; i < 2; ++i) + for (i = 0; i < 2; ++i) { returnValue += tempBuffer[i]; } - for(i = 0; i < (num_points % 2); ++i) + for (i = 0; i < (num_points % 2); ++i) { returnValue += (*aPtr++); } @@ -138,13 +140,13 @@ static inline void volk_gnsssdr_64f_accumulator_64f_u_sse3(double* result,const #ifdef LV_HAVE_GENERIC -static inline void volk_gnsssdr_64f_accumulator_64f_generic(double* result,const double* inputBuffer, unsigned int num_points) +static inline void volk_gnsssdr_64f_accumulator_64f_generic(double* result, const double* inputBuffer, unsigned int num_points) { const double* aPtr = inputBuffer; double returnValue = 0; unsigned int number; - for(number = 0; number < num_points; number++) + for (number = 0; number < num_points; number++) { returnValue += (*aPtr++); } @@ -156,7 +158,7 @@ static inline void volk_gnsssdr_64f_accumulator_64f_generic(double* result,const #ifdef LV_HAVE_AVX #include -static inline void volk_gnsssdr_64f_accumulator_64f_a_avx(double* result,const double* inputBuffer, unsigned int num_points) +static inline void volk_gnsssdr_64f_accumulator_64f_a_avx(double* result, const double* inputBuffer, unsigned int num_points) { double returnValue = 0; const unsigned int sse_iters = num_points / 4; @@ -164,11 +166,12 @@ static inline void volk_gnsssdr_64f_accumulator_64f_a_avx(double* result,const d unsigned int i; const double* aPtr = inputBuffer; - __VOLK_ATTR_ALIGNED(32) double tempBuffer[4]; + __VOLK_ATTR_ALIGNED(32) + double tempBuffer[4]; __m256d accumulator = _mm256_setzero_pd(); __m256d aVal = _mm256_setzero_pd(); - for(number = 0; number < sse_iters; number++) + for (number = 0; number < sse_iters; number++) { aVal = _mm256_load_pd(aPtr); accumulator = _mm256_add_pd(accumulator, aVal); @@ -177,12 +180,12 @@ static inline void volk_gnsssdr_64f_accumulator_64f_a_avx(double* result,const d _mm256_store_pd((double*)tempBuffer, accumulator); - for(i = 0; i < 4; ++i) + for (i = 0; i < 4; ++i) { returnValue += tempBuffer[i]; } - for(i = 0; i < (num_points % 4); ++i) + for (i = 0; i < (num_points % 4); ++i) { returnValue += (*aPtr++); } @@ -195,7 +198,7 @@ static inline void volk_gnsssdr_64f_accumulator_64f_a_avx(double* result,const d #ifdef LV_HAVE_SSE3 #include -static inline void volk_gnsssdr_64f_accumulator_64f_a_sse3(double* result,const double* inputBuffer, unsigned int num_points) +static inline void volk_gnsssdr_64f_accumulator_64f_a_sse3(double* result, const double* inputBuffer, unsigned int num_points) { double returnValue = 0; const unsigned int sse_iters = num_points / 2; @@ -203,11 +206,12 @@ static inline void volk_gnsssdr_64f_accumulator_64f_a_sse3(double* result,const unsigned int i; const double* aPtr = inputBuffer; - __VOLK_ATTR_ALIGNED(16) double tempBuffer[2]; + __VOLK_ATTR_ALIGNED(16) + double tempBuffer[2]; __m128d accumulator = _mm_setzero_pd(); __m128d aVal = _mm_setzero_pd(); - for(number = 0; number < sse_iters; number++) + for (number = 0; number < sse_iters; number++) { aVal = _mm_load_pd(aPtr); accumulator = _mm_add_pd(accumulator, aVal); @@ -216,12 +220,12 @@ static inline void volk_gnsssdr_64f_accumulator_64f_a_sse3(double* result,const _mm_store_pd((double*)tempBuffer, accumulator); 
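/*
 * Note: all accumulator variants in this file end with the same two-stage
 * reduction shown here: the packed partial sums are spilled to a scratch
 * buffer whose alignment matches the register width (__VOLK_ATTR_ALIGNED(16)
 * for the two-lane __m128d, __VOLK_ATTR_ALIGNED(32) for the four-lane
 * __m256d), the lanes are added in a short scalar loop, and a final loop over
 * num_points % lanes picks up the tail that the vector loop could not cover.
 * The _a_ kernels may use aligned loads such as _mm_load_pd because VOLK
 * dispatches them only for suitably aligned buffers; the _u_ kernels fall
 * back to _mm_loadu_pd and accept unaligned input.
 */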
- for(i = 0; i < 2; ++i) + for (i = 0; i < 2; ++i) { returnValue += tempBuffer[i]; } - for(i = 0; i < (num_points % 2); ++i) + for (i = 0; i < (num_points % 2); ++i) { returnValue += (*aPtr++); } diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_accumulator_s8i.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_accumulator_s8i.h index 8c2830cdc..9e141c6c4 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_accumulator_s8i.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_accumulator_s8i.h @@ -70,11 +70,12 @@ static inline void volk_gnsssdr_8i_accumulator_s8i_u_sse3(char* result, const ch unsigned int i; const char* aPtr = inputBuffer; - __VOLK_ATTR_ALIGNED(16) char tempBuffer[16]; + __VOLK_ATTR_ALIGNED(16) + char tempBuffer[16]; __m128i accumulator = _mm_setzero_si128(); __m128i aVal = _mm_setzero_si128(); - for(number = 0; number < sse_iters; number++) + for (number = 0; number < sse_iters; number++) { aVal = _mm_lddqu_si128((__m128i*)aPtr); accumulator = _mm_add_epi8(accumulator, aVal); @@ -82,12 +83,12 @@ static inline void volk_gnsssdr_8i_accumulator_s8i_u_sse3(char* result, const ch } _mm_storeu_si128((__m128i*)tempBuffer, accumulator); - for(i = 0; i < 16; ++i) + for (i = 0; i < 16; ++i) { returnValue += tempBuffer[i]; } - for(i = 0; i < (num_points % 16); ++i) + for (i = 0; i < (num_points % 16); ++i) { returnValue += (*aPtr++); } @@ -104,7 +105,7 @@ static inline void volk_gnsssdr_8i_accumulator_s8i_generic(char* result, const c const char* aPtr = inputBuffer; char returnValue = 0; unsigned int number; - for(number = 0;number < num_points; number++) + for (number = 0; number < num_points; number++) { returnValue += (*aPtr++); } @@ -125,24 +126,25 @@ static inline void volk_gnsssdr_8i_accumulator_s8i_a_sse3(char* result, const ch const char* aPtr = inputBuffer; - __VOLK_ATTR_ALIGNED(16) char tempBuffer[16]; + __VOLK_ATTR_ALIGNED(16) + char tempBuffer[16]; __m128i accumulator = _mm_setzero_si128(); __m128i aVal = _mm_setzero_si128(); - for(number = 0; number < sse_iters; number++) + for (number = 0; number < sse_iters; number++) { aVal = _mm_load_si128((__m128i*)aPtr); accumulator = _mm_add_epi8(accumulator, aVal); aPtr += 16; } - _mm_store_si128((__m128i*)tempBuffer,accumulator); + _mm_store_si128((__m128i*)tempBuffer, accumulator); - for(i = 0; i < 16; ++i) + for (i = 0; i < 16; ++i) { returnValue += tempBuffer[i]; } - for(i = 0; i < (num_points % 16); ++i) + for (i = 0; i < (num_points % 16); ++i) { returnValue += (*aPtr++); } @@ -164,24 +166,25 @@ static inline void volk_gnsssdr_8i_accumulator_s8i_a_avx2(char* result, const ch const char* aPtr = inputBuffer; - __VOLK_ATTR_ALIGNED(32) char tempBuffer[32]; + __VOLK_ATTR_ALIGNED(32) + char tempBuffer[32]; __m256i accumulator = _mm256_setzero_si256(); __m256i aVal = _mm256_setzero_si256(); - for(number = 0; number < sse_iters; number++) + for (number = 0; number < sse_iters; number++) { aVal = _mm256_load_si256((__m256i*)aPtr); accumulator = _mm256_add_epi8(accumulator, aVal); aPtr += 32; } - _mm256_store_si256((__m256i*)tempBuffer,accumulator); + _mm256_store_si256((__m256i*)tempBuffer, accumulator); - for(i = 0; i < 32; ++i) + for (i = 0; i < 32; ++i) { returnValue += tempBuffer[i]; } - for(i = 0; i < (num_points % 32); ++i) + for (i = 0; i < (num_points % 32); ++i) { returnValue += (*aPtr++); } @@ -202,11 +205,12 @@ static inline void 
volk_gnsssdr_8i_accumulator_s8i_u_avx2(char* result, const ch unsigned int i; const char* aPtr = inputBuffer; - __VOLK_ATTR_ALIGNED(32) char tempBuffer[32]; + __VOLK_ATTR_ALIGNED(32) + char tempBuffer[32]; __m256i accumulator = _mm256_setzero_si256(); __m256i aVal = _mm256_setzero_si256(); - for(number = 0; number < sse_iters; number++) + for (number = 0; number < sse_iters; number++) { aVal = _mm256_lddqu_si256((__m256i*)aPtr); accumulator = _mm256_add_epi8(accumulator, aVal); @@ -214,12 +218,12 @@ static inline void volk_gnsssdr_8i_accumulator_s8i_u_avx2(char* result, const ch } _mm256_storeu_si256((__m256i*)tempBuffer, accumulator); - for(i = 0; i < 32; ++i) + for (i = 0; i < 32; ++i) { returnValue += tempBuffer[i]; } - for(i = 0; i < (num_points % 32); ++i) + for (i = 0; i < (num_points % 32); ++i) { returnValue += (*aPtr++); } diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_index_max_16u.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_index_max_16u.h index 1f053f239..2af8c55d9 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_index_max_16u.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_index_max_16u.h @@ -60,11 +60,11 @@ #ifdef LV_HAVE_AVX2 -#include +#include static inline void volk_gnsssdr_8i_index_max_16u_u_avx2(unsigned int* target, const char* src0, unsigned int num_points) { - if(num_points > 0) + if (num_points > 0) { const unsigned int avx2_iters = num_points / 32; unsigned int number; @@ -74,14 +74,15 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_avx2(unsigned int* target, co char max = src0[0]; unsigned int index = 0; unsigned int mask; - __VOLK_ATTR_ALIGNED(32) char currentValuesBuffer[32]; + __VOLK_ATTR_ALIGNED(32) + char currentValuesBuffer[32]; __m256i maxValues, compareResults, currentValues; maxValues = _mm256_set1_epi8(max); - for(number = 0; number < avx2_iters; number++) + for (number = 0; number < avx2_iters; number++) { - currentValues = _mm256_loadu_si256((__m256i*)inputPtr); + currentValues = _mm256_loadu_si256((__m256i*)inputPtr); compareResults = _mm256_cmpgt_epi8(maxValues, currentValues); mask = _mm256_movemask_epi8(compareResults); @@ -94,7 +95,7 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_avx2(unsigned int* target, co { if ((mask & 1) == 1) { - if(currentValuesBuffer[i] > max) + if (currentValuesBuffer[i] > max) { index = inputPtr - basePtr + i; max = currentValuesBuffer[i]; @@ -108,9 +109,9 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_avx2(unsigned int* target, co inputPtr += 32; } - for(i = 0; i<(num_points % 32); ++i) + for (i = 0; i < (num_points % 32); ++i) { - if(src0[i] > max) + if (src0[i] > max) { index = i; max = src0[i]; @@ -128,7 +129,7 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_avx2(unsigned int* target, co static inline void volk_gnsssdr_8i_index_max_16u_u_avx(unsigned int* target, const char* src0, unsigned int num_points) { - if(num_points > 0) + if (num_points > 0) { const unsigned int sse_iters = num_points / 32; unsigned int number; @@ -137,33 +138,34 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_avx(unsigned int* target, con char* inputPtr = (char*)src0; char max = src0[0]; unsigned int index = 0; - __VOLK_ATTR_ALIGNED(32) char currentValuesBuffer[32]; + __VOLK_ATTR_ALIGNED(32) + char currentValuesBuffer[32]; __m256i ones, compareResults, currentValues; __m128i compareResultslo, 
compareResultshi, maxValues, lo, hi;
ones = _mm256_set1_epi8(0xFF);
maxValues = _mm_set1_epi8(max);
- for(number = 0; number < sse_iters; number++)
+ for (number = 0; number < sse_iters; number++)
{
- currentValues = _mm256_lddqu_si256((__m256i*)inputPtr);
+ currentValues = _mm256_lddqu_si256((__m256i*)inputPtr);
lo = _mm256_castsi256_si128(currentValues);
- hi = _mm256_extractf128_si256(currentValues,1);
+ hi = _mm256_extractf128_si256(currentValues, 1);
compareResultslo = _mm_cmpgt_epi8(maxValues, lo);
compareResultshi = _mm_cmpgt_epi8(maxValues, hi);
//compareResults = _mm256_set_m128i(compareResultshi , compareResultslo); //not defined in some versions of immintrin.h
- compareResults = _mm256_insertf128_si256(_mm256_castsi128_si256(compareResultslo),(compareResultshi),1);
+ compareResults = _mm256_insertf128_si256(_mm256_castsi128_si256(compareResultslo), (compareResultshi), 1);
if (!_mm256_testc_si256(compareResults, ones))
{
_mm256_storeu_si256((__m256i*)&currentValuesBuffer, currentValues);
- for(i = 0; i < 32; i++)
+ for (i = 0; i < 32; i++)
{
- if(currentValuesBuffer[i] > max)
+ if (currentValuesBuffer[i] > max)
{
index = inputPtr - basePtr + i;
max = currentValuesBuffer[i];
@@ -175,9 +177,9 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_avx(unsigned int* target, con
- for(i = 0; i<(num_points % 32); ++i)
+ for (i = 0; i < (num_points % 32); ++i)
{
- if(src0[i] > max)
+ if (src0[i] > max)
{
index = i;
max = src0[i];
@@ -195,7 +197,7 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_avx(unsigned int* target, con
static inline void volk_gnsssdr_8i_index_max_16u_u_sse4_1(unsigned int* target, const char* src0, unsigned int num_points)
{
- if(num_points > 0)
+ if (num_points > 0)
{
const unsigned int sse_iters = num_points / 16;
unsigned int number;
@@ -204,14 +206,15 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_sse4_1(unsigned int* target,
char* inputPtr = (char*)src0;
char max = src0[0];
unsigned int index = 0;
- __VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16];
+ __VOLK_ATTR_ALIGNED(16)
+ char currentValuesBuffer[16];
__m128i maxValues, compareResults, currentValues;
maxValues = _mm_set1_epi8(max);
- for(number = 0; number < sse_iters; number++)
+ for (number = 0; number < sse_iters; number++)
{
- currentValues = _mm_lddqu_si128((__m128i*)inputPtr);
+ currentValues = _mm_lddqu_si128((__m128i*)inputPtr);
compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
@@ -219,9 +222,9 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_sse4_1(unsigned int* target,
{
_mm_storeu_si128((__m128i*)&currentValuesBuffer, currentValues);
- for(i = 0; i < 16; i++)
+ for (i = 0; i < 16; i++)
{
- if(currentValuesBuffer[i] > max)
+ if (currentValuesBuffer[i] > max)
{
index = inputPtr - basePtr + i;
max = currentValuesBuffer[i];
@@ -233,9 +236,9 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_sse4_1(unsigned int* target,
- for(i = 0; i<(num_points % 16); ++i)
+ for (i = 0; i < (num_points % 16); ++i)
{
- if(src0[i] > max)
+ if (src0[i] > max)
{
index = i;
max = src0[i];
@@ -249,11 +252,11 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_sse4_1(unsigned int* target,
#ifdef LV_HAVE_SSE2
-#include <emmintrin.h>
+#include <emmintrin.h>
static inline void volk_gnsssdr_8i_index_max_16u_u_sse2(unsigned int* target, const char* src0, unsigned int num_points)
{
- if(num_points > 0)
+ if (num_points > 0)
{
const unsigned int sse_iters = num_points / 16;
unsigned int number;
@@ -263,14 +266,15 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_sse2(unsigned int* target, co
char max = src0[0]; unsigned int index = 0; unsigned short mask; - __VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16]; + __VOLK_ATTR_ALIGNED(16) + char currentValuesBuffer[16]; __m128i maxValues, compareResults, currentValues; maxValues = _mm_set1_epi8(max); - for(number = 0; number < sse_iters; number++) + for (number = 0; number < sse_iters; number++) { - currentValues = _mm_loadu_si128((__m128i*)inputPtr); + currentValues = _mm_loadu_si128((__m128i*)inputPtr); compareResults = _mm_cmpgt_epi8(maxValues, currentValues); mask = _mm_movemask_epi8(compareResults); @@ -283,7 +287,7 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_sse2(unsigned int* target, co { if ((mask & 1) == 1) { - if(currentValuesBuffer[i] > max) + if (currentValuesBuffer[i] > max) { index = inputPtr - basePtr + i; max = currentValuesBuffer[i]; @@ -297,9 +301,9 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_sse2(unsigned int* target, co inputPtr += 16; } - for(i = 0; i<(num_points % 16); ++i) + for (i = 0; i < (num_points % 16); ++i) { - if(src0[i] > max) + if (src0[i] > max) { index = i; max = src0[i]; @@ -316,14 +320,14 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_sse2(unsigned int* target, co static inline void volk_gnsssdr_8i_index_max_16u_generic(unsigned int* target, const char* src0, unsigned int num_points) { - if(num_points > 0) + if (num_points > 0) { char max = src0[0]; unsigned int index = 0; unsigned int i; - for(i = 1; i < num_points; ++i) + for (i = 1; i < num_points; ++i) { - if(src0[i] > max) + if (src0[i] > max) { index = i; max = src0[i]; @@ -337,11 +341,11 @@ static inline void volk_gnsssdr_8i_index_max_16u_generic(unsigned int* target, c #ifdef LV_HAVE_AVX2 -#include +#include static inline void volk_gnsssdr_8i_index_max_16u_a_avx2(unsigned int* target, const char* src0, unsigned int num_points) { - if(num_points > 0) + if (num_points > 0) { const unsigned int avx2_iters = num_points / 32; unsigned int number; @@ -351,14 +355,15 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_avx2(unsigned int* target, co char max = src0[0]; unsigned int index = 0; unsigned int mask; - __VOLK_ATTR_ALIGNED(32) char currentValuesBuffer[32]; + __VOLK_ATTR_ALIGNED(32) + char currentValuesBuffer[32]; __m256i maxValues, compareResults, currentValues; maxValues = _mm256_set1_epi8(max); - for(number = 0; number < avx2_iters; number++) + for (number = 0; number < avx2_iters; number++) { - currentValues = _mm256_load_si256((__m256i*)inputPtr); + currentValues = _mm256_load_si256((__m256i*)inputPtr); compareResults = _mm256_cmpgt_epi8(maxValues, currentValues); mask = _mm256_movemask_epi8(compareResults); @@ -371,7 +376,7 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_avx2(unsigned int* target, co { if ((mask & 1) == 1) { - if(currentValuesBuffer[i] > max) + if (currentValuesBuffer[i] > max) { index = inputPtr - basePtr + i; max = currentValuesBuffer[i]; @@ -385,9 +390,9 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_avx2(unsigned int* target, co inputPtr += 32; } - for(i = 0; i<(num_points % 32); ++i) + for (i = 0; i < (num_points % 32); ++i) { - if(src0[i] > max) + if (src0[i] > max) { index = i; max = src0[i]; @@ -405,7 +410,7 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_avx2(unsigned int* target, co static inline void volk_gnsssdr_8i_index_max_16u_a_avx(unsigned int* target, const char* src0, unsigned int num_points) { - if(num_points > 0) + if (num_points > 0) { const unsigned int sse_iters = num_points / 32; unsigned int number; @@ -414,19 +419,20 @@ static inline void 
volk_gnsssdr_8i_index_max_16u_a_avx(unsigned int* target, con
char* inputPtr = (char*)src0;
char max = src0[0];
unsigned int index = 0;
- __VOLK_ATTR_ALIGNED(32) char currentValuesBuffer[32];
+ __VOLK_ATTR_ALIGNED(32)
+ char currentValuesBuffer[32];
__m256i ones, compareResults, currentValues;
__m128i compareResultslo, compareResultshi, maxValues, lo, hi;
ones = _mm256_set1_epi8(0xFF);
maxValues = _mm_set1_epi8(max);
- for(number = 0; number < sse_iters; number++)
+ for (number = 0; number < sse_iters; number++)
{
- currentValues = _mm256_load_si256((__m256i*)inputPtr);
+ currentValues = _mm256_load_si256((__m256i*)inputPtr);
lo = _mm256_castsi256_si128(currentValues);
- hi = _mm256_extractf128_si256(currentValues,1);
+ hi = _mm256_extractf128_si256(currentValues, 1);
compareResultslo = _mm_cmpgt_epi8(maxValues, lo);
compareResultshi = _mm_cmpgt_epi8(maxValues, hi);
@@ -438,9 +444,9 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_avx(unsigned int* target, con
{
_mm256_store_si256((__m256i*)&currentValuesBuffer, currentValues);
- for(i = 0; i < 32; i++)
+ for (i = 0; i < 32; i++)
{
- if(currentValuesBuffer[i] > max)
+ if (currentValuesBuffer[i] > max)
{
index = inputPtr - basePtr + i;
max = currentValuesBuffer[i];
@@ -452,9 +458,9 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_avx(unsigned int* target, con
inputPtr += 32;
}
- for(i = 0; i<(num_points % 32); ++i)
+ for (i = 0; i < (num_points % 32); ++i)
{
- if(src0[i] > max)
+ if (src0[i] > max)
{
index = i;
max = src0[i];
@@ -472,7 +478,7 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_avx(unsigned int* target, con
static inline void volk_gnsssdr_8i_index_max_16u_a_sse4_1(unsigned int* target, const char* src0, unsigned int num_points)
{
- if(num_points > 0)
+ if (num_points > 0)
{
const unsigned int sse_iters = num_points / 16;
unsigned int number;
@@ -481,14 +487,15 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_sse4_1(unsigned int* target,
char* inputPtr = (char*)src0;
char max = src0[0];
unsigned int index = 0;
- __VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16];
+ __VOLK_ATTR_ALIGNED(16)
+ char currentValuesBuffer[16];
__m128i maxValues, compareResults, currentValues;
maxValues = _mm_set1_epi8(max);
- for(number = 0; number < sse_iters; number++)
+ for (number = 0; number < sse_iters; number++)
{
- currentValues = _mm_load_si128((__m128i*)inputPtr);
+ currentValues = _mm_load_si128((__m128i*)inputPtr);
compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
@@ -496,9 +503,9 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_sse4_1(unsigned int* target,
{
_mm_store_si128((__m128i*)&currentValuesBuffer, currentValues);
- for(i = 0; i < 16; i++)
+ for (i = 0; i < 16; i++)
{
- if(currentValuesBuffer[i] > max)
+ if (currentValuesBuffer[i] > max)
{
index = inputPtr - basePtr + i;
max = currentValuesBuffer[i];
@@ -510,9 +517,9 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_sse4_1(unsigned int* target,
- for(i = 0; i<(num_points % 16); ++i)
+ for (i = 0; i < (num_points % 16); ++i)
{
- if(src0[i] > max)
+ if (src0[i] > max)
{
index = i;
max = src0[i];
@@ -530,7 +537,7 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_sse2(unsigned int* target, const char* src0, unsigned int num_points)
{
- if(num_points > 0)
+ if (num_points > 0)
{
const unsigned int sse_iters = num_points / 16;
unsigned int number;
@@ -540,14 +547,15 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_sse2(unsigned int* target, co
char max = 
src0[0]; unsigned int index = 0; unsigned short mask; - __VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16]; + __VOLK_ATTR_ALIGNED(16) + char currentValuesBuffer[16]; __m128i maxValues, compareResults, currentValues; maxValues = _mm_set1_epi8(max); - for(number = 0; number < sse_iters; number++) + for (number = 0; number < sse_iters; number++) { - currentValues = _mm_load_si128((__m128i*)inputPtr); + currentValues = _mm_load_si128((__m128i*)inputPtr); compareResults = _mm_cmpgt_epi8(maxValues, currentValues); mask = _mm_movemask_epi8(compareResults); @@ -560,7 +568,7 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_sse2(unsigned int* target, co { if ((mask & 1) == 1) { - if(currentValuesBuffer[i] > max) + if (currentValuesBuffer[i] > max) { index = inputPtr - basePtr + i; max = currentValuesBuffer[i]; @@ -574,9 +582,9 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_sse2(unsigned int* target, co inputPtr += 16; } - for(i = 0; i<(num_points % 16); ++i) + for (i = 0; i < (num_points % 16); ++i) { - if(src0[i] > max) + if (src0[i] > max) { index = i; max = src0[i]; diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_max_s8i.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_max_s8i.h index 109c4f779..d748281c3 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_max_s8i.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_max_s8i.h @@ -63,21 +63,22 @@ static inline void volk_gnsssdr_8i_max_s8i_u_avx2(char* target, const char* src0, unsigned int num_points) { - if(num_points > 0) + if (num_points > 0) { const unsigned int avx_iters = num_points / 32; unsigned int number; unsigned int i; char* inputPtr = (char*)src0; char max = src0[0]; - __VOLK_ATTR_ALIGNED(32) char maxValuesBuffer[32]; + __VOLK_ATTR_ALIGNED(32) + char maxValuesBuffer[32]; __m256i maxValues, compareResults, currentValues; maxValues = _mm256_set1_epi8(max); - for(number = 0; number < avx_iters; number++) + for (number = 0; number < avx_iters; number++) { - currentValues = _mm256_loadu_si256((__m256i*)inputPtr); + currentValues = _mm256_loadu_si256((__m256i*)inputPtr); compareResults = _mm256_max_epi8(maxValues, currentValues); maxValues = compareResults; inputPtr += 32; @@ -85,17 +86,17 @@ static inline void volk_gnsssdr_8i_max_s8i_u_avx2(char* target, const char* src0 _mm256_storeu_si256((__m256i*)maxValuesBuffer, maxValues); - for(i = 0; i < 32; ++i) + for (i = 0; i < 32; ++i) { - if(maxValuesBuffer[i] > max) + if (maxValuesBuffer[i] > max) { max = maxValuesBuffer[i]; } } - for(i = avx_iters * 32; i < num_points; ++i) + for (i = avx_iters * 32; i < num_points; ++i) { - if(src0[i] > max) + if (src0[i] > max) { max = src0[i]; } @@ -112,21 +113,22 @@ static inline void volk_gnsssdr_8i_max_s8i_u_avx2(char* target, const char* src0 static inline void volk_gnsssdr_8i_max_s8i_u_sse4_1(char* target, const char* src0, unsigned int num_points) { - if(num_points > 0) + if (num_points > 0) { const unsigned int sse_iters = num_points / 16; unsigned int number; unsigned int i; char* inputPtr = (char*)src0; char max = src0[0]; - __VOLK_ATTR_ALIGNED(16) char maxValuesBuffer[16]; + __VOLK_ATTR_ALIGNED(16) + char maxValuesBuffer[16]; __m128i maxValues, compareResults, currentValues; maxValues = _mm_set1_epi8(max); - for(number = 0; number < sse_iters; number++) + for (number = 0; number < sse_iters; number++) { - currentValues = 
_mm_loadu_si128((__m128i*)inputPtr); + currentValues = _mm_loadu_si128((__m128i*)inputPtr); compareResults = _mm_cmpgt_epi8(maxValues, currentValues); maxValues = _mm_blendv_epi8(currentValues, maxValues, compareResults); inputPtr += 16; @@ -134,17 +136,17 @@ static inline void volk_gnsssdr_8i_max_s8i_u_sse4_1(char* target, const char* sr _mm_storeu_si128((__m128i*)maxValuesBuffer, maxValues); - for(i = 0; i < 16; ++i) + for (i = 0; i < 16; ++i) { - if(maxValuesBuffer[i] > max) + if (maxValuesBuffer[i] > max) { max = maxValuesBuffer[i]; } } - for(i = sse_iters * 16; i < num_points; ++i) + for (i = sse_iters * 16; i < num_points; ++i) { - if(src0[i] > max) + if (src0[i] > max) { max = src0[i]; } @@ -157,11 +159,11 @@ static inline void volk_gnsssdr_8i_max_s8i_u_sse4_1(char* target, const char* sr #ifdef LV_HAVE_SSE2 -#include<emmintrin.h> +#include <emmintrin.h> static inline void volk_gnsssdr_8i_max_s8i_u_sse2(char* target, const char* src0, unsigned int num_points) { - if(num_points > 0) + if (num_points > 0) { const unsigned int sse_iters = num_points / 16; unsigned int number; @@ -169,14 +171,15 @@ static inline void volk_gnsssdr_8i_max_s8i_u_sse2(char* target, const char* src0 char* inputPtr = (char*)src0; char max = src0[0]; unsigned short mask; - __VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16]; + __VOLK_ATTR_ALIGNED(16) + char currentValuesBuffer[16]; __m128i maxValues, compareResults, currentValues; maxValues = _mm_set1_epi8(max); - for(number = 0; number < sse_iters; number++) + for (number = 0; number < sse_iters; number++) { - currentValues = _mm_loadu_si128((__m128i*)inputPtr); + currentValues = _mm_loadu_si128((__m128i*)inputPtr); compareResults = _mm_cmpgt_epi8(maxValues, currentValues); mask = _mm_movemask_epi8(compareResults); @@ -189,7 +192,7 @@ static inline void volk_gnsssdr_8i_max_s8i_u_sse2(char* target, const char* src0 { if ((mask & 1) == 1) { - if(currentValuesBuffer[i] > max) + if (currentValuesBuffer[i] > max) { max = currentValuesBuffer[i]; } @@ -202,9 +205,9 @@ static inline void volk_gnsssdr_8i_max_s8i_u_sse2(char* target, const char* src0 inputPtr += 16; } - for(i = sse_iters * 16; i < num_points; ++i) + for (i = sse_iters * 16; i < num_points; ++i) { - if(src0[i] > max) + if (src0[i] > max) { max = src0[i]; } @@ -220,13 +223,13 @@ static inline void volk_gnsssdr_8i_max_s8i_u_sse2(char* target, const char* src0 static inline void volk_gnsssdr_8i_max_s8i_generic(char* target, const char* src0, unsigned int num_points) { - if(num_points > 0) + if (num_points > 0) { char max = src0[0]; unsigned int i; - for(i = 1; i < num_points; ++i) + for (i = 1; i < num_points; ++i) { - if(src0[i] > max) + if (src0[i] > max) { max = src0[i]; } @@ -243,21 +246,22 @@ static inline void volk_gnsssdr_8i_max_s8i_generic(char* target, const char* src static inline void volk_gnsssdr_8i_max_s8i_a_sse4_1(char* target, const char* src0, unsigned int num_points) { - if(num_points > 0) + if (num_points > 0) { const unsigned int sse_iters = num_points / 16; unsigned int number; unsigned int i; char* inputPtr = (char*)src0; char max = src0[0]; - __VOLK_ATTR_ALIGNED(16) char maxValuesBuffer[16]; + __VOLK_ATTR_ALIGNED(16) + char maxValuesBuffer[16]; __m128i maxValues, compareResults, currentValues; maxValues = _mm_set1_epi8(max); - for(number = 0; number < sse_iters; number++) + for (number = 0; number < sse_iters; number++) { - currentValues = _mm_load_si128((__m128i*)inputPtr); + currentValues = _mm_load_si128((__m128i*)inputPtr); compareResults = _mm_cmpgt_epi8(maxValues, currentValues); maxValues = 
_mm_blendv_epi8(currentValues, maxValues, compareResults); inputPtr += 16; @@ -265,17 +269,17 @@ static inline void volk_gnsssdr_8i_max_s8i_a_sse4_1(char* target, const char* sr _mm_store_si128((__m128i*)maxValuesBuffer, maxValues); - for(i = 0; i < 16; ++i) + for (i = 0; i < 16; ++i) { - if(maxValuesBuffer[i] > max) + if (maxValuesBuffer[i] > max) { max = maxValuesBuffer[i]; } } - for(i = sse_iters * 16; i < num_points; ++i) + for (i = sse_iters * 16; i < num_points; ++i) { - if(src0[i] > max) + if (src0[i] > max) { max = src0[i]; } @@ -292,39 +296,40 @@ static inline void volk_gnsssdr_8i_max_s8i_a_sse4_1(char* target, const char* sr static inline void volk_gnsssdr_8i_max_s8i_a_avx2(char* target, const char* src0, unsigned int num_points) { - if(num_points > 0) + if (num_points > 0) { const unsigned int avx_iters = num_points / 32; unsigned int number; unsigned int i; char* inputPtr = (char*)src0; char max = src0[0]; - __VOLK_ATTR_ALIGNED(32) char maxValuesBuffer[32]; + __VOLK_ATTR_ALIGNED(32) + char maxValuesBuffer[32]; __m256i maxValues, compareResults, currentValues; maxValues = _mm256_set1_epi8(max); - for(number = 0; number < avx_iters; number++) + for (number = 0; number < avx_iters; number++) { - currentValues = _mm256_load_si256((__m256i*)inputPtr); + currentValues = _mm256_load_si256((__m256i*)inputPtr); compareResults = _mm256_max_epi8(maxValues, currentValues); - maxValues = compareResults; //_mm256_blendv_epi8(currentValues, maxValues, compareResults); + maxValues = compareResults; //_mm256_blendv_epi8(currentValues, maxValues, compareResults); inputPtr += 32; } _mm256_store_si256((__m256i*)maxValuesBuffer, maxValues); - for(i = 0; i < 32; ++i) + for (i = 0; i < 32; ++i) { - if(maxValuesBuffer[i] > max) + if (maxValuesBuffer[i] > max) { max = maxValuesBuffer[i]; } } - for(i = avx_iters * 32; i < num_points; ++i) + for (i = avx_iters * 32; i < num_points; ++i) { - if(src0[i] > max) + if (src0[i] > max) { max = src0[i]; } @@ -341,7 +346,7 @@ static inline void volk_gnsssdr_8i_max_s8i_a_avx2(char* target, const char* src0 static inline void volk_gnsssdr_8i_max_s8i_a_sse2(char* target, const char* src0, unsigned int num_points) { - if(num_points > 0) + if (num_points > 0) { const unsigned int sse_iters = num_points / 16; unsigned int number; @@ -349,14 +354,15 @@ static inline void volk_gnsssdr_8i_max_s8i_a_sse2(char* target, const char* src0 char* inputPtr = (char*)src0; char max = src0[0]; unsigned short mask; - __VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16]; + __VOLK_ATTR_ALIGNED(16) + char currentValuesBuffer[16]; __m128i maxValues, compareResults, currentValues; maxValues = _mm_set1_epi8(max); - for(number = 0; number < sse_iters; number++) + for (number = 0; number < sse_iters; number++) { - currentValues = _mm_load_si128((__m128i*)inputPtr); + currentValues = _mm_load_si128((__m128i*)inputPtr); compareResults = _mm_cmpgt_epi8(maxValues, currentValues); mask = _mm_movemask_epi8(compareResults); @@ -369,7 +375,7 @@ static inline void volk_gnsssdr_8i_max_s8i_a_sse2(char* target, const char* src0 { if ((mask & 1) == 1) { - if(currentValuesBuffer[i] > max) + if (currentValuesBuffer[i] > max) { max = currentValuesBuffer[i]; } @@ -382,9 +388,9 @@ static inline void volk_gnsssdr_8i_max_s8i_a_sse2(char* target, const char* src0 inputPtr += 16; } - for(i = sse_iters * 16; i < num_points; ++i) + for (i = sse_iters * 16; i < num_points; ++i) { - if(src0[i] > max) + if (src0[i] > max) { max = src0[i]; } diff --git 
a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_x2_add_8i.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_x2_add_8i.h index 3854319fd..4d25cf923 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_x2_add_8i.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8i_x2_add_8i.h @@ -72,21 +72,21 @@ static inline void volk_gnsssdr_8i_x2_add_8i_u_sse2(char* cVector, const char* a __m128i aVal, bVal, cVal; - for(number = 0; number < sse_iters; number++) + for (number = 0; number < sse_iters; number++) { aVal = _mm_loadu_si128((__m128i*)aPtr); bVal = _mm_loadu_si128((__m128i*)bPtr); cVal = _mm_add_epi8(aVal, bVal); - _mm_storeu_si128((__m128i*)cPtr, cVal); // Store the results back into the C container + _mm_storeu_si128((__m128i*)cPtr, cVal); // Store the results back into the C container aPtr += 16; bPtr += 16; cPtr += 16; } - for(i = sse_iters * 16; i < num_points; ++i) + for (i = sse_iters * 16; i < num_points; ++i) { *cPtr++ = (*aPtr++) + (*bPtr++); } @@ -108,21 +108,21 @@ static inline void volk_gnsssdr_8i_x2_add_8i_u_avx2(char* cVector, const char* a __m256i aVal, bVal, cVal; - for(number = 0; number < avx_iters; number++) + for (number = 0; number < avx_iters; number++) { aVal = _mm256_loadu_si256((__m256i*)aPtr); bVal = _mm256_loadu_si256((__m256i*)bPtr); cVal = _mm256_add_epi8(aVal, bVal); - _mm256_storeu_si256((__m256i*)cPtr, cVal); // Store the results back into the C container + _mm256_storeu_si256((__m256i*)cPtr, cVal); // Store the results back into the C container aPtr += 32; bPtr += 32; cPtr += 32; } - for(i = avx_iters * 32; i < num_points; ++i) + for (i = avx_iters * 32; i < num_points; ++i) { *cPtr++ = (*aPtr++) + (*bPtr++); } @@ -139,7 +139,7 @@ static inline void volk_gnsssdr_8i_x2_add_8i_generic(char* cVector, const char* const char* bPtr = bVector; unsigned int number; - for(number = 0; number < num_points; number++) + for (number = 0; number < num_points; number++) { *cPtr++ = (*aPtr++) + (*bPtr++); } @@ -161,21 +161,21 @@ static inline void volk_gnsssdr_8i_x2_add_8i_a_sse2(char* cVector, const char* a __m128i aVal, bVal, cVal; - for(number = 0; number < sse_iters; number++) + for (number = 0; number < sse_iters; number++) { aVal = _mm_load_si128((__m128i*)aPtr); bVal = _mm_load_si128((__m128i*)bPtr); cVal = _mm_add_epi8(aVal, bVal); - _mm_store_si128((__m128i*)cPtr, cVal); // Store the results back into the C container + _mm_store_si128((__m128i*)cPtr, cVal); // Store the results back into the C container aPtr += 16; bPtr += 16; cPtr += 16; } - for(i = sse_iters * 16; i < num_points; ++i) + for (i = sse_iters * 16; i < num_points; ++i) { *cPtr++ = (*aPtr++) + (*bPtr++); } @@ -197,21 +197,21 @@ static inline void volk_gnsssdr_8i_x2_add_8i_a_avx2(char* cVector, const char* a __m256i aVal, bVal, cVal; - for(number = 0; number < avx_iters; number++) + for (number = 0; number < avx_iters; number++) { aVal = _mm256_load_si256((__m256i*)aPtr); bVal = _mm256_load_si256((__m256i*)bPtr); cVal = _mm256_add_epi8(aVal, bVal); - _mm256_store_si256((__m256i*)cPtr, cVal); // Store the results back into the C container + _mm256_store_si256((__m256i*)cPtr, cVal); // Store the results back into the C container aPtr += 32; bPtr += 32; cPtr += 32; } - for(i = avx_iters * 32; i < num_points; ++i) + for (i = avx_iters * 32; i < num_points; ++i) { *cPtr++ = (*aPtr++) + (*bPtr++); } diff --git 
a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_conjugate_8ic.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_conjugate_8ic.h index 830128a83..177b1114d 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_conjugate_8ic.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_conjugate_8ic.h @@ -111,10 +111,10 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_u_avx(lv_8sc_t* cVector, const tmp = _mm256_xor_ps(tmp, conjugator1); tmp128lo = _mm256_castsi256_si128(_mm256_castps_si256(tmp)); tmp128lo = _mm_add_epi8(tmp128lo, conjugator2); - tmp128hi = _mm256_extractf128_si256(_mm256_castps_si256(tmp),1); + tmp128hi = _mm256_extractf128_si256(_mm256_castps_si256(tmp), 1); tmp128hi = _mm_add_epi8(tmp128hi, conjugator2); //tmp = _mm256_set_m128i(tmp128hi , tmp128lo); //not defined in some versions of immintrin.h - tmp = _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(tmp128lo),(tmp128hi),1)); + tmp = _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(tmp128lo), (tmp128hi), 1)); _mm256_storeu_ps((float*)c, tmp); a += 16; @@ -155,7 +155,6 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_u_ssse3(lv_8sc_t* cVector, con { *c++ = lv_conj(*a++); } - } #endif /* LV_HAVE_SSSE3 */ @@ -188,7 +187,6 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_u_sse3(lv_8sc_t* cVector, cons { *c++ = lv_conj(*a++); } - } #endif /* LV_HAVE_SSE3 */ @@ -201,7 +199,7 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_generic(lv_8sc_t* cVector, con const lv_8sc_t* aPtr = aVector; unsigned int number; - for(number = 0; number < num_points; number++) + for (number = 0; number < num_points; number++) { *cPtr++ = lv_conj(*aPtr++); } @@ -230,10 +228,10 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_a_avx(lv_8sc_t* cVector, const tmp = _mm256_xor_ps(tmp, conjugator1); tmp128lo = _mm256_castsi256_si128(_mm256_castps_si256(tmp)); tmp128lo = _mm_add_epi8(tmp128lo, conjugator2); - tmp128hi = _mm256_extractf128_si256(_mm256_castps_si256(tmp),1); + tmp128hi = _mm256_extractf128_si256(_mm256_castps_si256(tmp), 1); tmp128hi = _mm_add_epi8(tmp128hi, conjugator2); //tmp = _mm256_set_m128i(tmp128hi , tmp128lo); //not defined in some versions of immintrin.h - tmp = _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(tmp128lo),(tmp128hi),1)); + tmp = _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(tmp128lo), (tmp128hi), 1)); _mm256_store_ps((float*)c, tmp); a += 16; @@ -336,7 +334,6 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_a_sse3(lv_8sc_t* cVector, cons { *c++ = lv_conj(*a++); } - } #endif /* LV_HAVE_SSE3 */ diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_magnitude_squared_8i.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_magnitude_squared_8i.h index 7152b0f29..d9dd67716 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_magnitude_squared_8i.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_magnitude_squared_8i.h @@ -78,23 +78,23 @@ static inline void volk_gnsssdr_8ic_magnitude_squared_8i_u_sse3(char* magnitudeV maska = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); maskb = _mm_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); - for(number = 0; number < sse_iters; number++) + for (number = 0; number < sse_iters; number++) { avector = _mm_lddqu_si128((__m128i*)complexVectorPtr); - avectorlo = _mm_unpacklo_epi8 (avector, zero); - avectorhi = _mm_unpackhi_epi8 (avector, zero); - avectorlomult = _mm_mullo_epi16 (avectorlo, avectorlo); - avectorhimult = _mm_mullo_epi16 (avectorhi, avectorhi); - aadded = _mm_hadd_epi16 (avectorlomult, avectorhimult); + avectorlo = _mm_unpacklo_epi8(avector, zero); + avectorhi = _mm_unpackhi_epi8(avector, zero); + avectorlomult = _mm_mullo_epi16(avectorlo, avectorlo); + avectorhimult = _mm_mullo_epi16(avectorhi, avectorhi); + aadded = _mm_hadd_epi16(avectorlomult, avectorhimult); complexVectorPtr += 16; bvector = _mm_lddqu_si128((__m128i*)complexVectorPtr); - bvectorlo = _mm_unpacklo_epi8 (bvector, zero); - bvectorhi = _mm_unpackhi_epi8 (bvector, zero); - bvectorlomult = _mm_mullo_epi16 (bvectorlo, bvectorlo); - bvectorhimult = _mm_mullo_epi16 (bvectorhi, bvectorhi); - badded = _mm_hadd_epi16 (bvectorlomult, bvectorhimult); + bvectorlo = _mm_unpacklo_epi8(bvector, zero); + bvectorhi = _mm_unpackhi_epi8(bvector, zero); + bvectorlomult = _mm_mullo_epi16(bvectorlo, bvectorlo); + bvectorhimult = _mm_mullo_epi16(bvectorhi, bvectorhi); + badded = _mm_hadd_epi16(bvectorlomult, bvectorhimult); complexVectorPtr += 16; @@ -162,11 +162,11 @@ static inline void volk_gnsssdr_8ic_magnitude_squared_8i_generic(char* magnitude const char* complexVectorPtr = (char*)complexVector; char* magnitudeVectorPtr = magnitudeVector; unsigned int number; - for(number = 0; number < num_points; number++) + for (number = 0; number < num_points; number++) { const char real = *complexVectorPtr++; const char imag = *complexVectorPtr++; - *magnitudeVectorPtr++ = (real*real) + (imag*imag); + *magnitudeVectorPtr++ = (real * real) + (imag * imag); } } #endif /* LV_HAVE_GENERIC */ @@ -192,23 +192,23 @@ static inline void volk_gnsssdr_8ic_magnitude_squared_8i_a_sse3(char* magnitudeV maska = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0); maskb = _mm_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80); - for(number = 0; number < sse_iters; number++) + for (number = 0; number < sse_iters; number++) { avector = _mm_load_si128((__m128i*)complexVectorPtr); - avectorlo = _mm_unpacklo_epi8 (avector, zero); - avectorhi = _mm_unpackhi_epi8 (avector, zero); - avectorlomult = _mm_mullo_epi16 (avectorlo, avectorlo); - avectorhimult = _mm_mullo_epi16 (avectorhi, avectorhi); - aadded = _mm_hadd_epi16 (avectorlomult, avectorhimult); + avectorlo = _mm_unpacklo_epi8(avector, zero); + avectorhi = _mm_unpackhi_epi8(avector, zero); + avectorlomult = _mm_mullo_epi16(avectorlo, avectorlo); + avectorhimult = _mm_mullo_epi16(avectorhi, avectorhi); + aadded = _mm_hadd_epi16(avectorlomult, avectorhimult); complexVectorPtr += 16; bvector = _mm_load_si128((__m128i*)complexVectorPtr); - bvectorlo = _mm_unpacklo_epi8 (bvector, zero); - bvectorhi = _mm_unpackhi_epi8 (bvector, zero); - bvectorlomult = _mm_mullo_epi16 (bvectorlo, bvectorlo); - bvectorhimult = _mm_mullo_epi16 (bvectorhi, bvectorhi); - badded = _mm_hadd_epi16 (bvectorlomult, bvectorhimult); + bvectorlo = _mm_unpacklo_epi8(bvector, zero); + bvectorhi = _mm_unpackhi_epi8(bvector, zero); + bvectorlomult = _mm_mullo_epi16(bvectorlo, bvectorlo); + bvectorhimult = _mm_mullo_epi16(bvectorhi, bvectorhi); + badded = _mm_hadd_epi16(bvectorlomult, bvectorhimult); complexVectorPtr += 16; 
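The magnitude-squared hunks above all carry the same data flow: each 16-byte load pulls eight interleaved (real, imag) int8 pairs, the unpack intrinsics widen them to 16-bit lanes against a zero register, _mm_mullo_epi16 squares each lane, and _mm_hadd_epi16 sums adjacent real/imag products so every output byte holds re*re + im*im. Below is a minimal scalar sketch of that computation for reference while reading the SIMD hunks; it is illustrative only, and the function and variable names are not part of the patch.

#include <stdio.h>

/* Scalar model of the volk_gnsssdr_8ic_magnitude_squared_8i kernels:
   widening to short mirrors _mm_unpacklo_epi8/_mm_unpackhi_epi8,
   the two products mirror _mm_mullo_epi16, and the re*re + im*im sum
   mirrors the adjacent-pair _mm_hadd_epi16. */
static void magnitude_squared_8i_model(char* magnitude, const char* complex_in, unsigned int num_points)
{
    unsigned int n;
    for (n = 0; n < num_points; n++)
        {
            const short re = complex_in[2 * n];     /* widen int8 -> int16 */
            const short im = complex_in[2 * n + 1];
            magnitude[n] = (char)(re * re + im * im); /* keep the low byte, as the SIMD shuffle step does */
        }
}

int main(void)
{
    const char in[8] = {3, 4, 1, -2, 0, 5, -3, -4}; /* four interleaved complex samples */
    char out[4];
    magnitude_squared_8i_model(out, in, 4);
    printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]); /* prints: 25 5 25 25 */
    return 0;
}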
diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_s8ic_multiply_8ic.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_s8ic_multiply_8ic.h index 21b1abb1b..3c949b3db 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_s8ic_multiply_8ic.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_s8ic_multiply_8ic.h @@ -80,7 +80,7 @@ static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_u_sse3(lv_8sc_t* cVector, imagy = _mm_and_si128(imagy, mult1); realy = _mm_and_si128(y, mult1); - for(; number < sse_iters; number++) + for (; number < sse_iters; number++) { x = _mm_lddqu_si128((__m128i*)a); @@ -111,7 +111,6 @@ static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_u_sse3(lv_8sc_t* cVector, { *c++ = (*a++) * scalar; } - } #endif /* LV_HAVE_SSE3 */ @@ -173,7 +172,7 @@ static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_a_sse3(lv_8sc_t* cVector, imagy = _mm_and_si128(imagy, mult1); realy = _mm_and_si128(y, mult1); - for(; number < sse_iters; number++) + for (; number < sse_iters; number++) { x = _mm_load_si128((__m128i*)a); @@ -204,7 +203,6 @@ static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_a_sse3(lv_8sc_t* cVector, { *c++ = (*a++) * scalar; } - } #endif /* LV_HAVE_SSE3 */ diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_dot_prod_8ic.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_dot_prod_8ic.h index e9633d682..88a689f8b 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_dot_prod_8ic.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_dot_prod_8ic.h @@ -75,17 +75,17 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_generic(lv_8sc_t* result, co *cPtr += (*aPtr++) * (*bPtr++); }*/ - char * res = (char*) result; - char * in = (char*) in_a; - char * tp = (char*) in_b; - unsigned int n_2_ccomplex_blocks = num_points/2; + char* res = (char*)result; + char* in = (char*)in_a; + char* tp = (char*)in_b; + unsigned int n_2_ccomplex_blocks = num_points / 2; unsigned int isodd = num_points & 1; - char sum0[2] = {0,0}; - char sum1[2] = {0,0}; + char sum0[2] = {0, 0}; + char sum1[2] = {0, 0}; unsigned int i = 0; - for(i = 0; i < n_2_ccomplex_blocks; ++i) + for (i = 0; i < n_2_ccomplex_blocks; ++i) { sum0[0] += in[0] * tp[0] - in[1] * tp[1]; sum0[1] += in[0] * tp[1] + in[1] * tp[0]; @@ -100,7 +100,7 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_generic(lv_8sc_t* result, co res[1] = sum0[1] + sum1[1]; // Cleanup if we had an odd number of points - for(i = 0; i < isodd; ++i) + for (i = 0; i < isodd; ++i) { *result += in_a[num_points - 1] * in_b[num_points - 1]; } @@ -115,13 +115,13 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_generic(lv_8sc_t* result, co static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse2(lv_8sc_t* result, const lv_8sc_t* in_a, const lv_8sc_t* in_b, unsigned int num_points) { lv_8sc_t dotProduct; - memset(&dotProduct, 0x0, 2*sizeof(char)); + memset(&dotProduct, 0x0, 2 * sizeof(char)); unsigned int number; unsigned int i; const lv_8sc_t* a = in_a; const lv_8sc_t* b = in_b; - const unsigned int sse_iters = num_points/8; + const unsigned int sse_iters = num_points / 8; if (sse_iters > 0) { @@ -131,7 +131,7 @@ static inline void 
volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse2(lv_8sc_t* result, con realcacc = _mm_setzero_si128(); imagcacc = _mm_setzero_si128(); - for(number = 0; number < sse_iters; number++) + for (number = 0; number < sse_iters; number++) { x = _mm_loadu_si128((__m128i*)a); y = _mm_loadu_si128((__m128i*)b); @@ -165,9 +165,10 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse2(lv_8sc_t* result, con totalc = _mm_or_si128(realcacc, imagcacc); - __VOLK_ATTR_ALIGNED(16) lv_8sc_t dotProductVector[8]; + __VOLK_ATTR_ALIGNED(16) + lv_8sc_t dotProductVector[8]; - _mm_storeu_si128((__m128i*)dotProductVector, totalc); // Store the results back into the dot product vector + _mm_storeu_si128((__m128i*)dotProductVector, totalc); // Store the results back into the dot product vector for (i = 0; i < 8; ++i) { @@ -192,13 +193,13 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse2(lv_8sc_t* result, con static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse4_1(lv_8sc_t* result, const lv_8sc_t* in_a, const lv_8sc_t* in_b, unsigned int num_points) { lv_8sc_t dotProduct; - memset(&dotProduct, 0x0, 2*sizeof(char)); + memset(&dotProduct, 0x0, 2 * sizeof(char)); unsigned int number; unsigned int i; const lv_8sc_t* a = in_a; const lv_8sc_t* b = in_b; - const unsigned int sse_iters = num_points/8; + const unsigned int sse_iters = num_points / 8; if (sse_iters > 0) { @@ -208,7 +209,7 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse4_1(lv_8sc_t* result, c realcacc = _mm_setzero_si128(); imagcacc = _mm_setzero_si128(); - for(number = 0; number < sse_iters; number++) + for (number = 0; number < sse_iters; number++) { x = _mm_lddqu_si128((__m128i*)a); y = _mm_lddqu_si128((__m128i*)b); @@ -236,13 +237,14 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse4_1(lv_8sc_t* result, c b += 8; } - imagcacc = _mm_slli_si128 (imagcacc, 1); + imagcacc = _mm_slli_si128(imagcacc, 1); - totalc = _mm_blendv_epi8 (imagcacc, realcacc, mult1); + totalc = _mm_blendv_epi8(imagcacc, realcacc, mult1); - __VOLK_ATTR_ALIGNED(16) lv_8sc_t dotProductVector[8]; + __VOLK_ATTR_ALIGNED(16) + lv_8sc_t dotProductVector[8]; - _mm_storeu_si128((__m128i*)dotProductVector, totalc); // Store the results back into the dot product vector + _mm_storeu_si128((__m128i*)dotProductVector, totalc); // Store the results back into the dot product vector for (i = 0; i < 8; ++i) { @@ -267,13 +269,13 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse4_1(lv_8sc_t* result, c static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_sse2(lv_8sc_t* result, const lv_8sc_t* in_a, const lv_8sc_t* in_b, unsigned int num_points) { lv_8sc_t dotProduct; - memset(&dotProduct, 0x0, 2*sizeof(char)); + memset(&dotProduct, 0x0, 2 * sizeof(char)); unsigned int number; unsigned int i; const lv_8sc_t* a = in_a; const lv_8sc_t* b = in_b; - const unsigned int sse_iters = num_points/8; + const unsigned int sse_iters = num_points / 8; if (sse_iters > 0) { @@ -283,7 +285,7 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_sse2(lv_8sc_t* result, con realcacc = _mm_setzero_si128(); imagcacc = _mm_setzero_si128(); - for(number = 0; number < sse_iters; number++) + for (number = 0; number < sse_iters; number++) { x = _mm_load_si128((__m128i*)a); y = _mm_load_si128((__m128i*)b); @@ -317,9 +319,10 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_sse2(lv_8sc_t* result, con totalc = _mm_or_si128(realcacc, imagcacc); - __VOLK_ATTR_ALIGNED(16) lv_8sc_t dotProductVector[8]; + __VOLK_ATTR_ALIGNED(16) + lv_8sc_t dotProductVector[8]; - 
_mm_store_si128((__m128i*)dotProductVector, totalc); // Store the results back into the dot product vector + _mm_store_si128((__m128i*)dotProductVector, totalc); // Store the results back into the dot product vector for (i = 0; i < 8; ++i) { @@ -343,7 +346,7 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_sse2(lv_8sc_t* result, con static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_sse4_1(lv_8sc_t* result, const lv_8sc_t* in_a, const lv_8sc_t* in_b, unsigned int num_points) { lv_8sc_t dotProduct; - memset(&dotProduct, 0x0, 2*sizeof(char)); + memset(&dotProduct, 0x0, 2 * sizeof(char)); unsigned int number; unsigned int i; const lv_8sc_t* a = in_a; @@ -359,7 +362,7 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_sse4_1(lv_8sc_t* result, c realcacc = _mm_setzero_si128(); imagcacc = _mm_setzero_si128(); - for(number = 0; number < sse_iters; number++) + for (number = 0; number < sse_iters; number++) { x = _mm_load_si128((__m128i*)a); y = _mm_load_si128((__m128i*)b); @@ -387,13 +390,14 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_sse4_1(lv_8sc_t* result, c b += 8; } - imagcacc = _mm_slli_si128 (imagcacc, 1); + imagcacc = _mm_slli_si128(imagcacc, 1); - totalc = _mm_blendv_epi8 (imagcacc, realcacc, mult1); + totalc = _mm_blendv_epi8(imagcacc, realcacc, mult1); - __VOLK_ATTR_ALIGNED(16) lv_8sc_t dotProductVector[8]; + __VOLK_ATTR_ALIGNED(16) + lv_8sc_t dotProductVector[8]; - _mm_store_si128((__m128i*)dotProductVector, totalc); // Store the results back into the dot product vector + _mm_store_si128((__m128i*)dotProductVector, totalc); // Store the results back into the dot product vector for (i = 0; i < 8; ++i) { @@ -438,22 +442,23 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_orc(lv_8sc_t* result, cons static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_neon(lv_8sc_t* result, const lv_8sc_t* in_a, const lv_8sc_t* in_b, unsigned int num_points) { lv_8sc_t dotProduct; - dotProduct = lv_cmake(0,0); - *result = lv_cmake(0,0); + dotProduct = lv_cmake(0, 0); + *result = lv_cmake(0, 0); const lv_8sc_t* a = in_a; const lv_8sc_t* b = in_b; // for 2-lane vectors, 1st lane holds the real part, // 2nd lane holds the imaginary part int8x8x2_t a_val, b_val, c_val, accumulator, tmp_real, tmp_imag; - __VOLK_ATTR_ALIGNED(16) lv_8sc_t accum_result[8] = { lv_cmake(0,0) }; + __VOLK_ATTR_ALIGNED(16) + lv_8sc_t accum_result[8] = {lv_cmake(0, 0)}; accumulator.val[0] = vdup_n_s8(0); accumulator.val[1] = vdup_n_s8(0); unsigned int number; const unsigned int neon_iters = num_points / 8; - for(number = 0; number < neon_iters; ++number) + for (number = 0; number < neon_iters; ++number) { a_val = vld2_s8((const int8_t*)a); b_val = vld2_s8((const int8_t*)b); @@ -478,7 +483,7 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_neon(lv_8sc_t* result, const b += 8; } vst2_s8((int8_t*)accum_result, accumulator); - for(number = 0; number < 8; ++number) + for (number = 0; number < 8; ++number) { *result += accum_result[number]; } @@ -490,6 +495,6 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_neon(lv_8sc_t* result, const *result += dotProduct; } -#endif /* LV_HAVE_NEON */ +#endif /* LV_HAVE_NEON */ #endif /*INCLUDED_volk_gnsssdr_8ic_x2_dot_prod_8ic_H*/ diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_multiply_8ic.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_multiply_8ic.h index 1b3fd5532..0d8c1d6b3 100644 --- 
a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_multiply_8ic.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8ic_x2_multiply_8ic.h @@ -75,7 +75,7 @@ static inline void volk_gnsssdr_8ic_x2_multiply_8ic_u_sse2(lv_8sc_t* cVector, co mult1 = _mm_set_epi8(0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF); - for(number = 0; number < sse_iters; number++) + for (number = 0; number < sse_iters; number++) { x = _mm_loadu_si128((__m128i*)a); y = _mm_loadu_si128((__m128i*)b); @@ -133,7 +133,7 @@ static inline void volk_gnsssdr_8ic_x2_multiply_8ic_u_sse4_1(lv_8sc_t* cVector, _mm_setzero_si128(); mult1 = _mm_set_epi8(0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF); - for(number = 0; number < sse_iters; number++) + for (number = 0; number < sse_iters; number++) { x = _mm_lddqu_si128((__m128i*)a); y = _mm_lddqu_si128((__m128i*)b); @@ -181,7 +181,7 @@ static inline void volk_gnsssdr_8ic_x2_multiply_8ic_generic(lv_8sc_t* cVector, c const lv_8sc_t* bPtr = bVector; unsigned int number; - for(number = 0; number < num_points; number++) + for (number = 0; number < num_points; number++) { *cPtr++ = (*aPtr++) * (*bPtr++); } @@ -204,7 +204,7 @@ static inline void volk_gnsssdr_8ic_x2_multiply_8ic_a_sse2(lv_8sc_t* cVector, co mult1 = _mm_set_epi8(0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF); - for(number = 0; number < sse_iters; number++) + for (number = 0; number < sse_iters; number++) { x = _mm_load_si128((__m128i*)a); y = _mm_load_si128((__m128i*)b); @@ -228,7 +228,7 @@ static inline void volk_gnsssdr_8ic_x2_multiply_8ic_a_sse2(lv_8sc_t* cVector, co imagc = _mm_and_si128(imagc, mult1); imagc = _mm_slli_si128(imagc, 1); - totalc = _mm_or_si128 (realc, imagc); + totalc = _mm_or_si128(realc, imagc); _mm_store_si128((__m128i*)c, totalc); @@ -262,7 +262,7 @@ static inline void volk_gnsssdr_8ic_x2_multiply_8ic_a_sse4_1(lv_8sc_t* cVector, _mm_setzero_si128(); mult1 = _mm_set_epi8(0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF); - for(number = 0; number < sse_iters; number++) + for (number = 0; number < sse_iters; number++) { x = _mm_load_si128((__m128i*)a); y = _mm_load_si128((__m128i*)b); diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8u_x2_multiply_8u.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8u_x2_multiply_8u.h index 8457b7f14..e953954f0 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8u_x2_multiply_8u.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_8u_x2_multiply_8u.h @@ -72,7 +72,7 @@ static inline void volk_gnsssdr_8u_x2_multiply_8u_u_avx2(unsigned char* cChar, c const unsigned char* a = aChar; const unsigned char* b = bChar; - for(number = 0; number < avx2_iters; number++) + for (number = 0; number < avx2_iters; number++) { x = _mm256_loadu_si256((__m256i*)a); y = _mm256_loadu_si256((__m256i*)b); @@ -101,7 +101,7 @@ static inline void volk_gnsssdr_8u_x2_multiply_8u_u_avx2(unsigned char* cChar, c c += 32; } - for (i = avx2_iters * 32; i < num_points ; ++i) + for (i = avx2_iters * 32; i < num_points; ++i) { *c++ = (*a++) * (*b++); } @@ -123,7 +123,7 @@ static inline void volk_gnsssdr_8u_x2_multiply_8u_u_sse3(unsigned char* cChar, c const unsigned char* a = aChar; const unsigned char* b = bChar; - for(number = 0; number < 
sse_iters; number++) + for (number = 0; number < sse_iters; number++) { x = _mm_lddqu_si128((__m128i*)a); y = _mm_lddqu_si128((__m128i*)b); @@ -152,7 +152,7 @@ static inline void volk_gnsssdr_8u_x2_multiply_8u_u_sse3(unsigned char* cChar, c c += 16; } - for (i = sse_iters * 16; i < num_points ; ++i) + for (i = sse_iters * 16; i < num_points; ++i) { *c++ = (*a++) * (*b++); } @@ -168,7 +168,7 @@ static inline void volk_gnsssdr_8u_x2_multiply_8u_generic(unsigned char* cChar, const unsigned char* bPtr = bChar; unsigned int number; - for(number = 0; number < num_points; number++) + for (number = 0; number < num_points; number++) { *cPtr++ = (*aPtr++) * (*bPtr++); } @@ -189,7 +189,7 @@ static inline void volk_gnsssdr_8u_x2_multiply_8u_a_sse3(unsigned char* cChar, c const unsigned char* a = aChar; const unsigned char* b = bChar; - for(number = 0; number < sse_iters; number++) + for (number = 0; number < sse_iters; number++) { x = _mm_load_si128((__m128i*)a); y = _mm_load_si128((__m128i*)b); @@ -240,7 +240,7 @@ static inline void volk_gnsssdr_8u_x2_multiply_8u_a_avx2(unsigned char* cChar, c const unsigned char* a = aChar; const unsigned char* b = bChar; - for(number = 0; number < avx2_iters; number++) + for (number = 0; number < avx2_iters; number++) { x = _mm256_load_si256((__m256i*)a); y = _mm256_load_si256((__m256i*)b); @@ -269,7 +269,7 @@ static inline void volk_gnsssdr_8u_x2_multiply_8u_a_avx2(unsigned char* cChar, c c += 32; } - for (i = avx2_iters * 32; i < num_points ; ++i) + for (i = avx2_iters * 32; i < num_points; ++i) { *c++ = (*a++) * (*b++); } diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_s32f_sincos_32fc.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_s32f_sincos_32fc.h index e7e1153e3..d6d58e4d0 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_s32f_sincos_32fc.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_s32f_sincos_32fc.h @@ -71,9 +71,9 @@ #include <emmintrin.h> /* Adapted from http://gruntthepeon.free.fr/ssemath/sse_mathfun.h, original code from Julien Pommier */ /* Based on algorithms from the cephes library http://www.netlib.org/cephes/ */ -static inline void volk_gnsssdr_s32f_sincos_32fc_a_sse2(lv_32fc_t* out, const float phase_inc, float* phase, unsigned int num_points) +static inline void volk_gnsssdr_s32f_sincos_32fc_a_sse2(lv_32fc_t *out, const float phase_inc, float *phase, unsigned int num_points) { - lv_32fc_t* bPtr = out; + lv_32fc_t *bPtr = out; const unsigned int sse_iters = num_points / 4; unsigned int number = 0; @@ -84,44 +84,44 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_a_sse2(lv_32fc_t* out, const fl __m128i emm0, emm2, emm4; /* declare some SSE constants */ - static const int _ps_inv_sign_mask[4] = { ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000 }; - static const int _ps_sign_mask[4] = { (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000 }; + static const int _ps_inv_sign_mask[4] = {~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000}; + static const int _ps_sign_mask[4] = {(int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000}; - static const float _ps_cephes_FOPI[4] = { 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516 }; - static const int _pi32_1[4] = { 1, 1, 1, 1 }; - static const int _pi32_inv1[4] = { ~1, ~1, ~1, ~1 }; - static const int _pi32_2[4] = { 2, 2, 2, 2}; - static const int _pi32_4[4] = { 4, 4, 4, 4}; + 
static const float _ps_cephes_FOPI[4] = {1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516}; + static const int _pi32_1[4] = {1, 1, 1, 1}; + static const int _pi32_inv1[4] = {~1, ~1, ~1, ~1}; + static const int _pi32_2[4] = {2, 2, 2, 2}; + static const int _pi32_4[4] = {4, 4, 4, 4}; - static const float _ps_minus_cephes_DP1[4] = { -0.78515625, -0.78515625, -0.78515625, -0.78515625 }; - static const float _ps_minus_cephes_DP2[4] = { -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4 }; - static const float _ps_minus_cephes_DP3[4] = { -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8 }; - static const float _ps_coscof_p0[4] = { 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005 }; - static const float _ps_coscof_p1[4] = { -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003 }; - static const float _ps_coscof_p2[4] = { 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002 }; - static const float _ps_sincof_p0[4] = { -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4 }; - static const float _ps_sincof_p1[4] = { 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3 }; - static const float _ps_sincof_p2[4] = { -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1 }; - static const float _ps_0p5[4] = { 0.5f, 0.5f, 0.5f, 0.5f }; - static const float _ps_1[4] = { 1.0f, 1.0f, 1.0f, 1.0f }; + static const float _ps_minus_cephes_DP1[4] = {-0.78515625, -0.78515625, -0.78515625, -0.78515625}; + static const float _ps_minus_cephes_DP2[4] = {-2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4}; + static const float _ps_minus_cephes_DP3[4] = {-3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8}; + static const float _ps_coscof_p0[4] = {2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005}; + static const float _ps_coscof_p1[4] = {-1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003}; + static const float _ps_coscof_p2[4] = {4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002}; + static const float _ps_sincof_p0[4] = {-1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4}; + static const float _ps_sincof_p1[4] = {8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3}; + static const float _ps_sincof_p2[4] = {-1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1}; + static const float _ps_0p5[4] = {0.5f, 0.5f, 0.5f, 0.5f}; + static const float _ps_1[4] = {1.0f, 1.0f, 1.0f, 1.0f}; - float four_phases[4] = { _phase, _phase + phase_inc, _phase + 2 * phase_inc, _phase + 3 * phase_inc }; - float four_phases_inc[4] = { 4 * phase_inc, 4 * phase_inc, 4 * phase_inc, 4 * phase_inc }; + float four_phases[4] = {_phase, _phase + phase_inc, _phase + 2 * phase_inc, _phase + 3 * phase_inc}; + float four_phases_inc[4] = {4 * phase_inc, 4 * phase_inc, 4 * phase_inc, 4 * phase_inc}; four_phases_reg = _mm_load_ps(four_phases); const __m128 four_phases_inc_reg = _mm_load_ps(four_phases_inc); - for(;number < sse_iters; number++) + for (; number < sse_iters; number++) { x = four_phases_reg; sign_bit_sin = x; /* take the absolute value */ - x = 
_mm_and_ps(x, *(__m128*)_ps_inv_sign_mask); + x = _mm_and_ps(x, *(__m128 *)_ps_inv_sign_mask); /* extract the sign bit (upper one) */ - sign_bit_sin = _mm_and_ps(sign_bit_sin, *(__m128*)_ps_sign_mask); + sign_bit_sin = _mm_and_ps(sign_bit_sin, *(__m128 *)_ps_sign_mask); /* scale by 4/Pi */ - y = _mm_mul_ps(x, *(__m128*)_ps_cephes_FOPI); + y = _mm_mul_ps(x, *(__m128 *)_ps_cephes_FOPI); /* store the integer part of y in emm2 */ emm2 = _mm_cvttps_epi32(y); @@ -145,9 +145,9 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_a_sse2(lv_32fc_t* out, const fl /* The magic pass: "Extended precision modular arithmetic" x = ((x - y * DP1) - y * DP2) - y * DP3; */ - xmm1 = *(__m128*)_ps_minus_cephes_DP1; - xmm2 = *(__m128*)_ps_minus_cephes_DP2; - xmm3 = *(__m128*)_ps_minus_cephes_DP3; + xmm1 = *(__m128 *)_ps_minus_cephes_DP1; + xmm2 = *(__m128 *)_ps_minus_cephes_DP2; + xmm3 = *(__m128 *)_ps_minus_cephes_DP3; xmm1 = _mm_mul_ps(y, xmm1); xmm2 = _mm_mul_ps(y, xmm2); xmm3 = _mm_mul_ps(y, xmm3); @@ -163,25 +163,25 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_a_sse2(lv_32fc_t* out, const fl sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin); /* Evaluate the first polynom (0 <= x <= Pi/4) */ - __m128 z = _mm_mul_ps(x,x); - y = *(__m128*)_ps_coscof_p0; + __m128 z = _mm_mul_ps(x, x); + y = *(__m128 *)_ps_coscof_p0; y = _mm_mul_ps(y, z); - y = _mm_add_ps(y, *(__m128*)_ps_coscof_p1); + y = _mm_add_ps(y, *(__m128 *)_ps_coscof_p1); y = _mm_mul_ps(y, z); - y = _mm_add_ps(y, *(__m128*)_ps_coscof_p2); + y = _mm_add_ps(y, *(__m128 *)_ps_coscof_p2); y = _mm_mul_ps(y, z); y = _mm_mul_ps(y, z); - __m128 tmp = _mm_mul_ps(z, *(__m128*)_ps_0p5); + __m128 tmp = _mm_mul_ps(z, *(__m128 *)_ps_0p5); y = _mm_sub_ps(y, tmp); - y = _mm_add_ps(y, *(__m128*)_ps_1); + y = _mm_add_ps(y, *(__m128 *)_ps_1); /* Evaluate the second polynom (Pi/4 <= x <= 0) */ - __m128 y2 = *(__m128*)_ps_sincof_p0; + __m128 y2 = *(__m128 *)_ps_sincof_p0; y2 = _mm_mul_ps(y2, z); - y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p1); + y2 = _mm_add_ps(y2, *(__m128 *)_ps_sincof_p1); y2 = _mm_mul_ps(y2, z); - y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p2); + y2 = _mm_add_ps(y2, *(__m128 *)_ps_sincof_p2); y2 = _mm_mul_ps(y2, z); y2 = _mm_mul_ps(y2, x); y2 = _mm_add_ps(y2, x); @@ -190,11 +190,11 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_a_sse2(lv_32fc_t* out, const fl xmm3 = poly_mask; __m128 ysin2 = _mm_and_ps(xmm3, y2); __m128 ysin1 = _mm_andnot_ps(xmm3, y); - y2 = _mm_sub_ps(y2,ysin2); + y2 = _mm_sub_ps(y2, ysin2); y = _mm_sub_ps(y, ysin1); - xmm1 = _mm_add_ps(ysin1,ysin2); - xmm2 = _mm_add_ps(y,y2); + xmm1 = _mm_add_ps(ysin1, ysin2); + xmm2 = _mm_add_ps(y, y2); /* update the sign */ sine = _mm_xor_ps(xmm1, sign_bit_sin); @@ -202,19 +202,19 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_a_sse2(lv_32fc_t* out, const fl /* write the output */ aux = _mm_unpacklo_ps(cosine, sine); - _mm_store_ps((float*)bPtr, aux); + _mm_store_ps((float *)bPtr, aux); bPtr += 2; aux = _mm_unpackhi_ps(cosine, sine); - _mm_store_ps((float*)bPtr, aux); + _mm_store_ps((float *)bPtr, aux); bPtr += 2; four_phases_reg = _mm_add_ps(four_phases_reg, four_phases_inc_reg); } _phase = _phase + phase_inc * (sse_iters * 4); - for(number = sse_iters * 4; number < num_points; number++) + for (number = sse_iters * 4; number < num_points; number++) { - *bPtr++ = lv_cmake((float)cosf((_phase)), (float)sinf((_phase)) ); + *bPtr++ = lv_cmake((float)cosf((_phase)), (float)sinf((_phase))); _phase += phase_inc; } (*phase) = _phase; @@ -227,9 +227,9 @@ static inline void 
volk_gnsssdr_s32f_sincos_32fc_a_sse2(lv_32fc_t* out, const fl #include <emmintrin.h> /* Adapted from http://gruntthepeon.free.fr/ssemath/sse_mathfun.h, original code from Julien Pommier */ /* Based on algorithms from the cephes library http://www.netlib.org/cephes/ */ -static inline void volk_gnsssdr_s32f_sincos_32fc_u_sse2(lv_32fc_t* out, const float phase_inc, float* phase, unsigned int num_points) +static inline void volk_gnsssdr_s32f_sincos_32fc_u_sse2(lv_32fc_t *out, const float phase_inc, float *phase, unsigned int num_points) { - lv_32fc_t* bPtr = out; + lv_32fc_t *bPtr = out; const unsigned int sse_iters = num_points / 4; unsigned int number = 0; @@ -241,44 +241,64 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_u_sse2(lv_32fc_t* out, const fl __m128i emm0, emm2, emm4; /* declare some SSE constants */ - __VOLK_ATTR_ALIGNED(16) static const int _ps_inv_sign_mask[4] = { ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000 }; - __VOLK_ATTR_ALIGNED(16) static const int _ps_sign_mask[4] = { (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000 }; + __VOLK_ATTR_ALIGNED(16) + static const int _ps_inv_sign_mask[4] = {~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000}; + __VOLK_ATTR_ALIGNED(16) + static const int _ps_sign_mask[4] = {(int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000}; - __VOLK_ATTR_ALIGNED(16) static const float _ps_cephes_FOPI[4] = { 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516 }; - __VOLK_ATTR_ALIGNED(16) static const int _pi32_1[4] = { 1, 1, 1, 1 }; - __VOLK_ATTR_ALIGNED(16) static const int _pi32_inv1[4] = { ~1, ~1, ~1, ~1 }; - __VOLK_ATTR_ALIGNED(16) static const int _pi32_2[4] = { 2, 2, 2, 2}; - __VOLK_ATTR_ALIGNED(16) static const int _pi32_4[4] = { 4, 4, 4, 4}; + __VOLK_ATTR_ALIGNED(16) + static const float _ps_cephes_FOPI[4] = {1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516}; + __VOLK_ATTR_ALIGNED(16) + static const int _pi32_1[4] = {1, 1, 1, 1}; + __VOLK_ATTR_ALIGNED(16) + static const int _pi32_inv1[4] = {~1, ~1, ~1, ~1}; + __VOLK_ATTR_ALIGNED(16) + static const int _pi32_2[4] = {2, 2, 2, 2}; + __VOLK_ATTR_ALIGNED(16) + static const int _pi32_4[4] = {4, 4, 4, 4}; - __VOLK_ATTR_ALIGNED(16) static const float _ps_minus_cephes_DP1[4] = { -0.78515625, -0.78515625, -0.78515625, -0.78515625 }; - __VOLK_ATTR_ALIGNED(16) static const float _ps_minus_cephes_DP2[4] = { -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4 }; - __VOLK_ATTR_ALIGNED(16) static const float _ps_minus_cephes_DP3[4] = { -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8 }; - __VOLK_ATTR_ALIGNED(16) static const float _ps_coscof_p0[4] = { 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005 }; - __VOLK_ATTR_ALIGNED(16) static const float _ps_coscof_p1[4] = { -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003 }; - __VOLK_ATTR_ALIGNED(16) static const float _ps_coscof_p2[4] = { 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002 }; - __VOLK_ATTR_ALIGNED(16) static const float _ps_sincof_p0[4] = { -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4 }; - __VOLK_ATTR_ALIGNED(16) static const float _ps_sincof_p1[4] = { 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3 }; - __VOLK_ATTR_ALIGNED(16) static const float _ps_sincof_p2[4] = { -1.6666654611E-1, -1.6666654611E-1, 
-1.6666654611E-1, -1.6666654611E-1 }; - __VOLK_ATTR_ALIGNED(16) static const float _ps_0p5[4] = { 0.5f, 0.5f, 0.5f, 0.5f }; - __VOLK_ATTR_ALIGNED(16) static const float _ps_1[4] = { 1.0f, 1.0f, 1.0f, 1.0f }; + __VOLK_ATTR_ALIGNED(16) + static const float _ps_minus_cephes_DP1[4] = {-0.78515625, -0.78515625, -0.78515625, -0.78515625}; + __VOLK_ATTR_ALIGNED(16) + static const float _ps_minus_cephes_DP2[4] = {-2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4}; + __VOLK_ATTR_ALIGNED(16) + static const float _ps_minus_cephes_DP3[4] = {-3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8}; + __VOLK_ATTR_ALIGNED(16) + static const float _ps_coscof_p0[4] = {2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005}; + __VOLK_ATTR_ALIGNED(16) + static const float _ps_coscof_p1[4] = {-1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003}; + __VOLK_ATTR_ALIGNED(16) + static const float _ps_coscof_p2[4] = {4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002}; + __VOLK_ATTR_ALIGNED(16) + static const float _ps_sincof_p0[4] = {-1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4}; + __VOLK_ATTR_ALIGNED(16) + static const float _ps_sincof_p1[4] = {8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3}; + __VOLK_ATTR_ALIGNED(16) + static const float _ps_sincof_p2[4] = {-1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1}; + __VOLK_ATTR_ALIGNED(16) + static const float _ps_0p5[4] = {0.5f, 0.5f, 0.5f, 0.5f}; + __VOLK_ATTR_ALIGNED(16) + static const float _ps_1[4] = {1.0f, 1.0f, 1.0f, 1.0f}; - __VOLK_ATTR_ALIGNED(16) float four_phases[4] = { _phase, _phase + phase_inc, _phase + 2 * phase_inc, _phase + 3 * phase_inc }; - __VOLK_ATTR_ALIGNED(16) float four_phases_inc[4] = { 4 * phase_inc, 4 * phase_inc, 4 * phase_inc, 4 * phase_inc }; + __VOLK_ATTR_ALIGNED(16) + float four_phases[4] = {_phase, _phase + phase_inc, _phase + 2 * phase_inc, _phase + 3 * phase_inc}; + __VOLK_ATTR_ALIGNED(16) + float four_phases_inc[4] = {4 * phase_inc, 4 * phase_inc, 4 * phase_inc, 4 * phase_inc}; four_phases_reg = _mm_load_ps(four_phases); const __m128 four_phases_inc_reg = _mm_load_ps(four_phases_inc); - for(;number < sse_iters; number++) + for (; number < sse_iters; number++) { x = four_phases_reg; sign_bit_sin = x; /* take the absolute value */ - x = _mm_and_ps(x, *(__m128*)_ps_inv_sign_mask); + x = _mm_and_ps(x, *(__m128 *)_ps_inv_sign_mask); /* extract the sign bit (upper one) */ - sign_bit_sin = _mm_and_ps(sign_bit_sin, *(__m128*)_ps_sign_mask); + sign_bit_sin = _mm_and_ps(sign_bit_sin, *(__m128 *)_ps_sign_mask); /* scale by 4/Pi */ - y = _mm_mul_ps(x, *(__m128*)_ps_cephes_FOPI); + y = _mm_mul_ps(x, *(__m128 *)_ps_cephes_FOPI); /* store the integer part of y in emm2 */ emm2 = _mm_cvttps_epi32(y); @@ -302,9 +322,9 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_u_sse2(lv_32fc_t* out, const fl /* The magic pass: "Extended precision modular arithmetic" x = ((x - y * DP1) - y * DP2) - y * DP3; */ - xmm1 = *(__m128*)_ps_minus_cephes_DP1; - xmm2 = *(__m128*)_ps_minus_cephes_DP2; - xmm3 = *(__m128*)_ps_minus_cephes_DP3; + xmm1 = *(__m128 *)_ps_minus_cephes_DP1; + xmm2 = *(__m128 *)_ps_minus_cephes_DP2; + xmm3 = *(__m128 *)_ps_minus_cephes_DP3; xmm1 = _mm_mul_ps(y, xmm1); xmm2 = _mm_mul_ps(y, xmm2); xmm3 = _mm_mul_ps(y, xmm3); @@ -320,25 +340,25 @@ static 
inline void volk_gnsssdr_s32f_sincos_32fc_u_sse2(lv_32fc_t* out, const fl sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin); /* Evaluate the first polynom (0 <= x <= Pi/4) */ - __m128 z = _mm_mul_ps(x,x); - y = *(__m128*)_ps_coscof_p0; + __m128 z = _mm_mul_ps(x, x); + y = *(__m128 *)_ps_coscof_p0; y = _mm_mul_ps(y, z); - y = _mm_add_ps(y, *(__m128*)_ps_coscof_p1); + y = _mm_add_ps(y, *(__m128 *)_ps_coscof_p1); y = _mm_mul_ps(y, z); - y = _mm_add_ps(y, *(__m128*)_ps_coscof_p2); + y = _mm_add_ps(y, *(__m128 *)_ps_coscof_p2); y = _mm_mul_ps(y, z); y = _mm_mul_ps(y, z); - __m128 tmp = _mm_mul_ps(z, *(__m128*)_ps_0p5); + __m128 tmp = _mm_mul_ps(z, *(__m128 *)_ps_0p5); y = _mm_sub_ps(y, tmp); - y = _mm_add_ps(y, *(__m128*)_ps_1); + y = _mm_add_ps(y, *(__m128 *)_ps_1); /* Evaluate the second polynom (Pi/4 <= x <= 0) */ - __m128 y2 = *(__m128*)_ps_sincof_p0; + __m128 y2 = *(__m128 *)_ps_sincof_p0; y2 = _mm_mul_ps(y2, z); - y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p1); + y2 = _mm_add_ps(y2, *(__m128 *)_ps_sincof_p1); y2 = _mm_mul_ps(y2, z); - y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p2); + y2 = _mm_add_ps(y2, *(__m128 *)_ps_sincof_p2); y2 = _mm_mul_ps(y2, z); y2 = _mm_mul_ps(y2, x); y2 = _mm_add_ps(y2, x); @@ -347,11 +367,11 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_u_sse2(lv_32fc_t* out, const fl xmm3 = poly_mask; __m128 ysin2 = _mm_and_ps(xmm3, y2); __m128 ysin1 = _mm_andnot_ps(xmm3, y); - y2 = _mm_sub_ps(y2,ysin2); + y2 = _mm_sub_ps(y2, ysin2); y = _mm_sub_ps(y, ysin1); - xmm1 = _mm_add_ps(ysin1,ysin2); - xmm2 = _mm_add_ps(y,y2); + xmm1 = _mm_add_ps(ysin1, ysin2); + xmm2 = _mm_add_ps(y, y2); /* update the sign */ sine = _mm_xor_ps(xmm1, sign_bit_sin); @@ -359,19 +379,19 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_u_sse2(lv_32fc_t* out, const fl /* write the output */ aux = _mm_unpacklo_ps(cosine, sine); - _mm_storeu_ps((float*)bPtr, aux); + _mm_storeu_ps((float *)bPtr, aux); bPtr += 2; aux = _mm_unpackhi_ps(cosine, sine); - _mm_storeu_ps((float*)bPtr, aux); + _mm_storeu_ps((float *)bPtr, aux); bPtr += 2; four_phases_reg = _mm_add_ps(four_phases_reg, four_phases_inc_reg); } _phase = _phase + phase_inc * (sse_iters * 4); - for(number = sse_iters * 4; number < num_points; number++) + for (number = sse_iters * 4; number < num_points; number++) { - *bPtr++ = lv_cmake((float)cosf(_phase), (float)sinf(_phase) ); + *bPtr++ = lv_cmake((float)cosf(_phase), (float)sinf(_phase)); _phase += phase_inc; } (*phase) = _phase; @@ -382,13 +402,13 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_u_sse2(lv_32fc_t* out, const fl #ifdef LV_HAVE_GENERIC -static inline void volk_gnsssdr_s32f_sincos_32fc_generic(lv_32fc_t* out, const float phase_inc, float* phase, unsigned int num_points) +static inline void volk_gnsssdr_s32f_sincos_32fc_generic(lv_32fc_t *out, const float phase_inc, float *phase, unsigned int num_points) { float _phase = (*phase); unsigned int i; - for(i = 0; i < num_points; i++) + for (i = 0; i < num_points; i++) { - *out++ = lv_cmake((float)cosf(_phase), (float)sinf(_phase) ); + *out++ = lv_cmake((float)cosf(_phase), (float)sinf(_phase)); _phase += phase_inc; } (*phase) = _phase; @@ -400,7 +420,7 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_generic(lv_32fc_t* out, const f #ifdef LV_HAVE_GENERIC #include <volk_gnsssdr/volk_gnsssdr_sine_table.h> #include <stdint.h> -static inline void volk_gnsssdr_s32f_sincos_32fc_generic_fxpt(lv_32fc_t* out, const float phase_inc, float* phase, 
unsigned int num_points) { float _in, s, c; unsigned int i; @@ -413,12 +433,12 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_generic_fxpt(lv_32fc_t* out, co const int32_t diffbits = bitlength - Nbits; uint32_t ux; float _phase = (*phase); - for(i = 0; i < num_points; i++) + for (i = 0; i < num_points; i++) { _in = _phase; d = (int32_t)floor(_in / TWO_PI + 0.5); _in -= d * TWO_PI; - x = (int32_t) ((float)_in * TWO_TO_THE_31_DIV_PI); + x = (int32_t)((float)_in * TWO_TO_THE_31_DIV_PI); ux = x; sin_index = ux >> diffbits; @@ -428,7 +448,7 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_generic_fxpt(lv_32fc_t* out, co cos_index = ux >> diffbits; c = sine_table_10bits[cos_index][0] * (ux >> 1) + sine_table_10bits[cos_index][1]; - *out++ = lv_cmake((float)c, (float)s ); + *out++ = lv_cmake((float)c, (float)s); _phase += phase_inc; } (*phase) = _phase; @@ -441,9 +461,9 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_generic_fxpt(lv_32fc_t* out, co #include <immintrin.h> /* Based on algorithms from the cephes library http://www.netlib.org/cephes/ * Adapted to AVX2 by Carles Fernandez, based on original SSE2 code by Julien Pommier*/ -static inline void volk_gnsssdr_s32f_sincos_32fc_a_avx2(lv_32fc_t* out, const float phase_inc, float* phase, unsigned int num_points) +static inline void volk_gnsssdr_s32f_sincos_32fc_a_avx2(lv_32fc_t *out, const float phase_inc, float *phase, unsigned int num_points) { - lv_32fc_t* bPtr = out; + lv_32fc_t *bPtr = out; const unsigned int avx_iters = num_points / 8; unsigned int number = 0; @@ -456,44 +476,64 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_a_avx2(lv_32fc_t* out, const fl __m128 aux, c1, s1; /* declare some AXX2 constants */ - __VOLK_ATTR_ALIGNED(32) static const int _ps_inv_sign_mask[8] = { ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000 }; - __VOLK_ATTR_ALIGNED(32) static const int _ps_sign_mask[8] = { (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000 }; + __VOLK_ATTR_ALIGNED(32) + static const int _ps_inv_sign_mask[8] = {~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000}; + __VOLK_ATTR_ALIGNED(32) + static const int _ps_sign_mask[8] = {(int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000}; - __VOLK_ATTR_ALIGNED(32) static const float _ps_cephes_FOPI[8] = { 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516 }; - __VOLK_ATTR_ALIGNED(32) static const int _pi32_1[8] = { 1, 1, 1, 1, 1, 1, 1, 1 }; - __VOLK_ATTR_ALIGNED(32) static const int _pi32_inv1[8] = { ~1, ~1, ~1, ~1, ~1, ~1, ~1, ~1 }; - __VOLK_ATTR_ALIGNED(32) static const int _pi32_2[8] = { 2, 2, 2, 2, 2, 2, 2, 2 }; - __VOLK_ATTR_ALIGNED(32) static const int _pi32_4[8] = { 4, 4, 4, 4, 4, 4, 4, 4 }; + __VOLK_ATTR_ALIGNED(32) + static const float _ps_cephes_FOPI[8] = {1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516}; + __VOLK_ATTR_ALIGNED(32) + static const int _pi32_1[8] = {1, 1, 1, 1, 1, 1, 1, 1}; + __VOLK_ATTR_ALIGNED(32) + static const int _pi32_inv1[8] = {~1, ~1, ~1, ~1, ~1, ~1, ~1, ~1}; + __VOLK_ATTR_ALIGNED(32) + static const int _pi32_2[8] = {2, 2, 2, 2, 2, 2, 2, 2}; + __VOLK_ATTR_ALIGNED(32) + static const int _pi32_4[8] = {4, 4, 4, 4, 
4, 4, 4, 4}; - __VOLK_ATTR_ALIGNED(32) static const float _ps_minus_cephes_DP1[8] = { -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625 }; - __VOLK_ATTR_ALIGNED(32) static const float _ps_minus_cephes_DP2[8] = { -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4 }; - __VOLK_ATTR_ALIGNED(32) static const float _ps_minus_cephes_DP3[8] = { -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8 }; - __VOLK_ATTR_ALIGNED(32) static const float _ps_coscof_p0[8] = { 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005 }; - __VOLK_ATTR_ALIGNED(32) static const float _ps_coscof_p1[8] = { -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003 }; - __VOLK_ATTR_ALIGNED(32) static const float _ps_coscof_p2[8] = { 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002 }; - __VOLK_ATTR_ALIGNED(32) static const float _ps_sincof_p0[8] = { -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4 }; - __VOLK_ATTR_ALIGNED(32) static const float _ps_sincof_p1[8] = { 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3 }; - __VOLK_ATTR_ALIGNED(32) static const float _ps_sincof_p2[8] = { -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1 }; - __VOLK_ATTR_ALIGNED(32) static const float _ps_0p5[8] = { 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f }; - __VOLK_ATTR_ALIGNED(32) static const float _ps_1[8] = { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f }; + __VOLK_ATTR_ALIGNED(32) + static const float _ps_minus_cephes_DP1[8] = {-0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625}; + __VOLK_ATTR_ALIGNED(32) + static const float _ps_minus_cephes_DP2[8] = {-2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4}; + __VOLK_ATTR_ALIGNED(32) + static const float _ps_minus_cephes_DP3[8] = {-3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8}; + __VOLK_ATTR_ALIGNED(32) + static const float _ps_coscof_p0[8] = {2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005}; + __VOLK_ATTR_ALIGNED(32) + static const float _ps_coscof_p1[8] = {-1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, 
-1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003}; + __VOLK_ATTR_ALIGNED(32) + static const float _ps_coscof_p2[8] = {4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002}; + __VOLK_ATTR_ALIGNED(32) + static const float _ps_sincof_p0[8] = {-1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4}; + __VOLK_ATTR_ALIGNED(32) + static const float _ps_sincof_p1[8] = {8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3}; + __VOLK_ATTR_ALIGNED(32) + static const float _ps_sincof_p2[8] = {-1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1}; + __VOLK_ATTR_ALIGNED(32) + static const float _ps_0p5[8] = {0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f}; + __VOLK_ATTR_ALIGNED(32) + static const float _ps_1[8] = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}; - __VOLK_ATTR_ALIGNED(32) float eight_phases[8] = { _phase, _phase + phase_inc, _phase + 2 * phase_inc, _phase + 3 * phase_inc, _phase + 4 * phase_inc, _phase + 5 * phase_inc, _phase + 6 * phase_inc, _phase + 7 * phase_inc }; - __VOLK_ATTR_ALIGNED(32) float eight_phases_inc[8] = { 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc }; + __VOLK_ATTR_ALIGNED(32) + float eight_phases[8] = {_phase, _phase + phase_inc, _phase + 2 * phase_inc, _phase + 3 * phase_inc, _phase + 4 * phase_inc, _phase + 5 * phase_inc, _phase + 6 * phase_inc, _phase + 7 * phase_inc}; + __VOLK_ATTR_ALIGNED(32) + float eight_phases_inc[8] = {8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc}; eight_phases_reg = _mm256_load_ps(eight_phases); const __m256 eight_phases_inc_reg = _mm256_load_ps(eight_phases_inc); - for(;number < avx_iters; number++) + for (; number < avx_iters; number++) { x = eight_phases_reg; sign_bit_sin = x; /* take the absolute value */ - x = _mm256_and_ps(x, *(__m256*)_ps_inv_sign_mask); + x = _mm256_and_ps(x, *(__m256 *)_ps_inv_sign_mask); /* extract the sign bit (upper one) */ - sign_bit_sin = _mm256_and_ps(sign_bit_sin, *(__m256*)_ps_sign_mask); + sign_bit_sin = _mm256_and_ps(sign_bit_sin, *(__m256 *)_ps_sign_mask); /* scale by 4/Pi */ - y = _mm256_mul_ps(x, *(__m256*)_ps_cephes_FOPI); + y = _mm256_mul_ps(x, *(__m256 *)_ps_cephes_FOPI); /* store the integer part of y in emm2 */ emm2 = _mm256_cvttps_epi32(y); @@ -517,9 +557,9 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_a_avx2(lv_32fc_t* out, const fl /* The magic pass: "Extended precision modular arithmetic” x = ((x - y * DP1) - y * DP2) - y * DP3; */ - xmm1 = *(__m256*)_ps_minus_cephes_DP1; - xmm2 = *(__m256*)_ps_minus_cephes_DP2; - xmm3 = *(__m256*)_ps_minus_cephes_DP3; + xmm1 = *(__m256 *)_ps_minus_cephes_DP1; + xmm2 = *(__m256 *)_ps_minus_cephes_DP2; + xmm3 = *(__m256 *)_ps_minus_cephes_DP3; xmm1 = _mm256_mul_ps(y, xmm1); xmm2 = _mm256_mul_ps(y, xmm2); xmm3 = _mm256_mul_ps(y, xmm3); @@ -536,24 +576,24 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_a_avx2(lv_32fc_t* out, const fl /* Evaluate the first polynom (0 <= x <= Pi/4) */ __m256 z = _mm256_mul_ps(x, x); - y = *(__m256*)_ps_coscof_p0; 
+ y = *(__m256 *)_ps_coscof_p0; y = _mm256_mul_ps(y, z); - y = _mm256_add_ps(y, *(__m256*)_ps_coscof_p1); + y = _mm256_add_ps(y, *(__m256 *)_ps_coscof_p1); y = _mm256_mul_ps(y, z); - y = _mm256_add_ps(y, *(__m256*)_ps_coscof_p2); + y = _mm256_add_ps(y, *(__m256 *)_ps_coscof_p2); y = _mm256_mul_ps(y, z); y = _mm256_mul_ps(y, z); - __m256 tmp = _mm256_mul_ps(z, *(__m256*)_ps_0p5); + __m256 tmp = _mm256_mul_ps(z, *(__m256 *)_ps_0p5); y = _mm256_sub_ps(y, tmp); - y = _mm256_add_ps(y, *(__m256*)_ps_1); + y = _mm256_add_ps(y, *(__m256 *)_ps_1); /* Evaluate the second polynom (Pi/4 <= x <= 0) */ - __m256 y2 = *(__m256*)_ps_sincof_p0; + __m256 y2 = *(__m256 *)_ps_sincof_p0; y2 = _mm256_mul_ps(y2, z); - y2 = _mm256_add_ps(y2, *(__m256*)_ps_sincof_p1); + y2 = _mm256_add_ps(y2, *(__m256 *)_ps_sincof_p1); y2 = _mm256_mul_ps(y2, z); - y2 = _mm256_add_ps(y2, *(__m256*)_ps_sincof_p2); + y2 = _mm256_add_ps(y2, *(__m256 *)_ps_sincof_p2); y2 = _mm256_mul_ps(y2, z); y2 = _mm256_mul_ps(y2, x); y2 = _mm256_add_ps(y2, x); @@ -576,27 +616,27 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_a_avx2(lv_32fc_t* out, const fl s1 = _mm256_extractf128_ps(sine, 0); c1 = _mm256_extractf128_ps(cosine, 0); aux = _mm_unpacklo_ps(c1, s1); - _mm_store_ps((float*)bPtr, aux); + _mm_store_ps((float *)bPtr, aux); bPtr += 2; aux = _mm_unpackhi_ps(c1, s1); - _mm_store_ps((float*)bPtr, aux); + _mm_store_ps((float *)bPtr, aux); bPtr += 2; s1 = _mm256_extractf128_ps(sine, 1); c1 = _mm256_extractf128_ps(cosine, 1); aux = _mm_unpacklo_ps(c1, s1); - _mm_store_ps((float*)bPtr, aux); + _mm_store_ps((float *)bPtr, aux); bPtr += 2; aux = _mm_unpackhi_ps(c1, s1); - _mm_store_ps((float*)bPtr, aux); + _mm_store_ps((float *)bPtr, aux); bPtr += 2; eight_phases_reg = _mm256_add_ps(eight_phases_reg, eight_phases_inc_reg); } _mm256_zeroupper(); _phase = _phase + phase_inc * (avx_iters * 8); - for(number = avx_iters * 8; number < num_points; number++) + for (number = avx_iters * 8; number < num_points; number++) { - out[number] = lv_cmake((float)cosf(_phase), (float)sinf(_phase) ); + out[number] = lv_cmake((float)cosf(_phase), (float)sinf(_phase)); _phase += phase_inc; } (*phase) = _phase; @@ -609,9 +649,9 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_a_avx2(lv_32fc_t* out, const fl #include /* Based on algorithms from the cephes library http://www.netlib.org/cephes/ * Adapted to AVX2 by Carles Fernandez, based on original SSE2 code by Julien Pommier*/ -static inline void volk_gnsssdr_s32f_sincos_32fc_u_avx2(lv_32fc_t* out, const float phase_inc, float* phase, unsigned int num_points) +static inline void volk_gnsssdr_s32f_sincos_32fc_u_avx2(lv_32fc_t *out, const float phase_inc, float *phase, unsigned int num_points) { - lv_32fc_t* bPtr = out; + lv_32fc_t *bPtr = out; const unsigned int avx_iters = num_points / 8; unsigned int number = 0; @@ -624,44 +664,64 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_u_avx2(lv_32fc_t* out, const fl __m128 aux, c1, s1; /* declare some AXX2 constants */ - __VOLK_ATTR_ALIGNED(32) static const int _ps_inv_sign_mask[8] = { ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000 }; - __VOLK_ATTR_ALIGNED(32) static const int _ps_sign_mask[8] = { (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000 }; + __VOLK_ATTR_ALIGNED(32) + static const int _ps_inv_sign_mask[8] = {~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000}; + 
__VOLK_ATTR_ALIGNED(32) + static const int _ps_sign_mask[8] = {(int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000}; - __VOLK_ATTR_ALIGNED(32) static const float _ps_cephes_FOPI[8] = { 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516 }; - __VOLK_ATTR_ALIGNED(32) static const int _pi32_1[8] = { 1, 1, 1, 1, 1, 1, 1, 1 }; - __VOLK_ATTR_ALIGNED(32) static const int _pi32_inv1[8] = { ~1, ~1, ~1, ~1, ~1, ~1, ~1, ~1 }; - __VOLK_ATTR_ALIGNED(32) static const int _pi32_2[8] = { 2, 2, 2, 2, 2, 2, 2, 2 }; - __VOLK_ATTR_ALIGNED(32) static const int _pi32_4[8] = { 4, 4, 4, 4, 4, 4, 4, 4 }; + __VOLK_ATTR_ALIGNED(32) + static const float _ps_cephes_FOPI[8] = {1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516}; + __VOLK_ATTR_ALIGNED(32) + static const int _pi32_1[8] = {1, 1, 1, 1, 1, 1, 1, 1}; + __VOLK_ATTR_ALIGNED(32) + static const int _pi32_inv1[8] = {~1, ~1, ~1, ~1, ~1, ~1, ~1, ~1}; + __VOLK_ATTR_ALIGNED(32) + static const int _pi32_2[8] = {2, 2, 2, 2, 2, 2, 2, 2}; + __VOLK_ATTR_ALIGNED(32) + static const int _pi32_4[8] = {4, 4, 4, 4, 4, 4, 4, 4}; - __VOLK_ATTR_ALIGNED(32) static const float _ps_minus_cephes_DP1[8] = { -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625 }; - __VOLK_ATTR_ALIGNED(32) static const float _ps_minus_cephes_DP2[8] = { -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4 }; - __VOLK_ATTR_ALIGNED(32) static const float _ps_minus_cephes_DP3[8] = { -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8 }; - __VOLK_ATTR_ALIGNED(32) static const float _ps_coscof_p0[8] = { 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005 }; - __VOLK_ATTR_ALIGNED(32) static const float _ps_coscof_p1[8] = { -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003 }; - __VOLK_ATTR_ALIGNED(32) static const float _ps_coscof_p2[8] = { 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002 }; - __VOLK_ATTR_ALIGNED(32) static const float _ps_sincof_p0[8] = { -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4 }; - __VOLK_ATTR_ALIGNED(32) static const float _ps_sincof_p1[8] = { 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3 }; - __VOLK_ATTR_ALIGNED(32) static const float _ps_sincof_p2[8] = { -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1 }; - __VOLK_ATTR_ALIGNED(32) static const float _ps_0p5[8] = { 0.5f, 0.5f, 
0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f }; - __VOLK_ATTR_ALIGNED(32) static const float _ps_1[8] = { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f }; + __VOLK_ATTR_ALIGNED(32) + static const float _ps_minus_cephes_DP1[8] = {-0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625}; + __VOLK_ATTR_ALIGNED(32) + static const float _ps_minus_cephes_DP2[8] = {-2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4}; + __VOLK_ATTR_ALIGNED(32) + static const float _ps_minus_cephes_DP3[8] = {-3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8}; + __VOLK_ATTR_ALIGNED(32) + static const float _ps_coscof_p0[8] = {2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005}; + __VOLK_ATTR_ALIGNED(32) + static const float _ps_coscof_p1[8] = {-1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003}; + __VOLK_ATTR_ALIGNED(32) + static const float _ps_coscof_p2[8] = {4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002}; + __VOLK_ATTR_ALIGNED(32) + static const float _ps_sincof_p0[8] = {-1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4}; + __VOLK_ATTR_ALIGNED(32) + static const float _ps_sincof_p1[8] = {8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3}; + __VOLK_ATTR_ALIGNED(32) + static const float _ps_sincof_p2[8] = {-1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1}; + __VOLK_ATTR_ALIGNED(32) + static const float _ps_0p5[8] = {0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f}; + __VOLK_ATTR_ALIGNED(32) + static const float _ps_1[8] = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}; - __VOLK_ATTR_ALIGNED(32) float eight_phases[8] = { _phase, _phase + phase_inc, _phase + 2 * phase_inc, _phase + 3 * phase_inc, _phase + 4 * phase_inc, _phase + 5 * phase_inc, _phase + 6 * phase_inc, _phase + 7 * phase_inc }; - __VOLK_ATTR_ALIGNED(32) float eight_phases_inc[8] = { 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc }; + __VOLK_ATTR_ALIGNED(32) + float eight_phases[8] = {_phase, _phase + phase_inc, _phase + 2 * phase_inc, _phase + 3 * phase_inc, _phase + 4 * phase_inc, _phase + 5 * phase_inc, _phase + 6 * phase_inc, _phase + 7 * phase_inc}; + __VOLK_ATTR_ALIGNED(32) + float eight_phases_inc[8] = {8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc}; eight_phases_reg = _mm256_load_ps(eight_phases); const __m256 eight_phases_inc_reg = _mm256_load_ps(eight_phases_inc); - for(;number < avx_iters; number++) + for (; number < avx_iters; number++) { x = eight_phases_reg; 
sign_bit_sin = x; /* take the absolute value */ - x = _mm256_and_ps(x, *(__m256*)_ps_inv_sign_mask); + x = _mm256_and_ps(x, *(__m256 *)_ps_inv_sign_mask); /* extract the sign bit (upper one) */ - sign_bit_sin = _mm256_and_ps(sign_bit_sin, *(__m256*)_ps_sign_mask); + sign_bit_sin = _mm256_and_ps(sign_bit_sin, *(__m256 *)_ps_sign_mask); /* scale by 4/Pi */ - y = _mm256_mul_ps(x, *(__m256*)_ps_cephes_FOPI); + y = _mm256_mul_ps(x, *(__m256 *)_ps_cephes_FOPI); /* store the integer part of y in emm2 */ emm2 = _mm256_cvttps_epi32(y); @@ -685,9 +745,9 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_u_avx2(lv_32fc_t* out, const fl /* The magic pass: "Extended precision modular arithmetic” x = ((x - y * DP1) - y * DP2) - y * DP3; */ - xmm1 = *(__m256*)_ps_minus_cephes_DP1; - xmm2 = *(__m256*)_ps_minus_cephes_DP2; - xmm3 = *(__m256*)_ps_minus_cephes_DP3; + xmm1 = *(__m256 *)_ps_minus_cephes_DP1; + xmm2 = *(__m256 *)_ps_minus_cephes_DP2; + xmm3 = *(__m256 *)_ps_minus_cephes_DP3; xmm1 = _mm256_mul_ps(y, xmm1); xmm2 = _mm256_mul_ps(y, xmm2); xmm3 = _mm256_mul_ps(y, xmm3); @@ -704,24 +764,24 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_u_avx2(lv_32fc_t* out, const fl /* Evaluate the first polynom (0 <= x <= Pi/4) */ __m256 z = _mm256_mul_ps(x, x); - y = *(__m256*)_ps_coscof_p0; + y = *(__m256 *)_ps_coscof_p0; y = _mm256_mul_ps(y, z); - y = _mm256_add_ps(y, *(__m256*)_ps_coscof_p1); + y = _mm256_add_ps(y, *(__m256 *)_ps_coscof_p1); y = _mm256_mul_ps(y, z); - y = _mm256_add_ps(y, *(__m256*)_ps_coscof_p2); + y = _mm256_add_ps(y, *(__m256 *)_ps_coscof_p2); y = _mm256_mul_ps(y, z); y = _mm256_mul_ps(y, z); - __m256 tmp = _mm256_mul_ps(z, *(__m256*)_ps_0p5); + __m256 tmp = _mm256_mul_ps(z, *(__m256 *)_ps_0p5); y = _mm256_sub_ps(y, tmp); - y = _mm256_add_ps(y, *(__m256*)_ps_1); + y = _mm256_add_ps(y, *(__m256 *)_ps_1); /* Evaluate the second polynom (Pi/4 <= x <= 0) */ - __m256 y2 = *(__m256*)_ps_sincof_p0; + __m256 y2 = *(__m256 *)_ps_sincof_p0; y2 = _mm256_mul_ps(y2, z); - y2 = _mm256_add_ps(y2, *(__m256*)_ps_sincof_p1); + y2 = _mm256_add_ps(y2, *(__m256 *)_ps_sincof_p1); y2 = _mm256_mul_ps(y2, z); - y2 = _mm256_add_ps(y2, *(__m256*)_ps_sincof_p2); + y2 = _mm256_add_ps(y2, *(__m256 *)_ps_sincof_p2); y2 = _mm256_mul_ps(y2, z); y2 = _mm256_mul_ps(y2, x); y2 = _mm256_add_ps(y2, x); @@ -744,27 +804,27 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_u_avx2(lv_32fc_t* out, const fl s1 = _mm256_extractf128_ps(sine, 0); c1 = _mm256_extractf128_ps(cosine, 0); aux = _mm_unpacklo_ps(c1, s1); - _mm_storeu_ps((float*)bPtr, aux); + _mm_storeu_ps((float *)bPtr, aux); bPtr += 2; aux = _mm_unpackhi_ps(c1, s1); - _mm_storeu_ps((float*)bPtr, aux); + _mm_storeu_ps((float *)bPtr, aux); bPtr += 2; s1 = _mm256_extractf128_ps(sine, 1); c1 = _mm256_extractf128_ps(cosine, 1); aux = _mm_unpacklo_ps(c1, s1); - _mm_storeu_ps((float*)bPtr, aux); + _mm_storeu_ps((float *)bPtr, aux); bPtr += 2; aux = _mm_unpackhi_ps(c1, s1); - _mm_storeu_ps((float*)bPtr, aux); + _mm_storeu_ps((float *)bPtr, aux); bPtr += 2; eight_phases_reg = _mm256_add_ps(eight_phases_reg, eight_phases_inc_reg); } _mm256_zeroupper(); _phase = _phase + phase_inc * (avx_iters * 8); - for(number = avx_iters * 8; number < num_points; number++) + for (number = avx_iters * 8; number < num_points; number++) { - out[number] = lv_cmake((float)cosf(_phase), (float)sinf(_phase) ); + out[number] = lv_cmake((float)cosf(_phase), (float)sinf(_phase)); _phase += phase_inc; } (*phase) = _phase; @@ -777,15 +837,17 @@ static inline void 
volk_gnsssdr_s32f_sincos_32fc_u_avx2(lv_32fc_t* out, const fl #include /* Adapted from http://gruntthepeon.free.fr/ssemath/neon_mathfun.h, original code from Julien Pommier */ /* Based on algorithms from the cephes library http://www.netlib.org/cephes/ */ -static inline void volk_gnsssdr_s32f_sincos_32fc_neon(lv_32fc_t* out, const float phase_inc, float* phase, unsigned int num_points) +static inline void volk_gnsssdr_s32f_sincos_32fc_neon(lv_32fc_t *out, const float phase_inc, float *phase, unsigned int num_points) { - lv_32fc_t* bPtr = out; + lv_32fc_t *bPtr = out; const unsigned int neon_iters = num_points / 4; float _phase = (*phase); - __VOLK_ATTR_ALIGNED(16) float32_t four_phases[4] = { _phase, _phase + phase_inc, _phase + 2 * phase_inc, _phase + 3 * phase_inc }; + __VOLK_ATTR_ALIGNED(16) + float32_t four_phases[4] = {_phase, _phase + phase_inc, _phase + 2 * phase_inc, _phase + 3 * phase_inc}; float four_inc = 4 * phase_inc; - __VOLK_ATTR_ALIGNED(16) float32_t four_phases_inc[4] = { four_inc, four_inc, four_inc, four_inc }; + __VOLK_ATTR_ALIGNED(16) + float32_t four_phases_inc[4] = {four_inc, four_inc, four_inc, four_inc}; float32x4_t four_phases_reg = vld1q_f32(four_phases); float32x4_t four_phases_inc_reg = vld1q_f32(four_phases_inc); @@ -808,7 +870,7 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_neon(lv_32fc_t* out, const floa uint32x4_t emm2, poly_mask, sign_mask_sin, sign_mask_cos; - for(;number < neon_iters; number++) + for (; number < neon_iters; number++) { x = four_phases_reg; @@ -847,7 +909,7 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_neon(lv_32fc_t* out, const floa /* Evaluate the first polynom (0 <= x <= Pi/4) in y1, and the second polynom (Pi/4 <= x <= 0) in y2 */ - z = vmulq_f32(x,x); + z = vmulq_f32(x, x); y1 = vmulq_n_f32(z, c_coscof_p0); y2 = vmulq_n_f32(z, c_sincof_p0); @@ -871,16 +933,16 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_neon(lv_32fc_t* out, const floa result.val[1] = vbslq_f32(sign_mask_sin, vnegq_f32(ys), ys); result.val[0] = vbslq_f32(sign_mask_cos, yc, vnegq_f32(yc)); - vst2q_f32((float32_t*)bPtr, result); + vst2q_f32((float32_t *)bPtr, result); bPtr += 4; four_phases_reg = vaddq_f32(four_phases_reg, four_phases_inc_reg); } _phase = _phase + phase_inc * (neon_iters * 4); - for(number = neon_iters * 4; number < num_points; number++) + for (number = neon_iters * 4; number < num_points; number++) { - *bPtr++ = lv_cmake((float)cosf(_phase), (float)sinf(_phase) ); + *bPtr++ = lv_cmake((float)cosf(_phase), (float)sinf(_phase)); _phase += phase_inc; } (*phase) = _phase; diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_s32f_sincospuppet_32fc.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_s32f_sincospuppet_32fc.h index 07d3bf5d2..e4f7c942f 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_s32f_sincospuppet_32fc.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_s32f_sincospuppet_32fc.h @@ -49,7 +49,7 @@ static inline void volk_gnsssdr_s32f_sincospuppet_32fc_generic(lv_32fc_t* out, c volk_gnsssdr_s32f_sincos_32fc_generic(out, phase_inc, phase, num_points); } -#endif /* LV_HAVE_GENERIC */ +#endif /* LV_HAVE_GENERIC */ #ifdef LV_HAVE_GENERIC @@ -60,7 +60,7 @@ static inline void volk_gnsssdr_s32f_sincospuppet_32fc_generic_fxpt(lv_32fc_t* o volk_gnsssdr_s32f_sincos_32fc_generic_fxpt(out, phase_inc, phase, num_points); } -#endif /* LV_HAVE_GENERIC */ +#endif 
/* LV_HAVE_GENERIC */ #ifdef LV_HAVE_SSE2 @@ -70,7 +70,7 @@ static inline void volk_gnsssdr_s32f_sincospuppet_32fc_a_sse2(lv_32fc_t* out, co phase[0] = 3; volk_gnsssdr_s32f_sincos_32fc_a_sse2(out, phase_inc, phase, num_points); } -#endif /* LV_HAVE_SSE2 */ +#endif /* LV_HAVE_SSE2 */ #ifdef LV_HAVE_SSE2 @@ -80,7 +80,7 @@ static inline void volk_gnsssdr_s32f_sincospuppet_32fc_u_sse2(lv_32fc_t* out, co phase[0] = 3; volk_gnsssdr_s32f_sincos_32fc_u_sse2(out, phase_inc, phase, num_points); } -#endif /* LV_HAVE_SSE2 */ +#endif /* LV_HAVE_SSE2 */ #ifdef LV_HAVE_AVX2 @@ -90,7 +90,7 @@ static inline void volk_gnsssdr_s32f_sincospuppet_32fc_a_avx2(lv_32fc_t* out, co phase[0] = 3; volk_gnsssdr_s32f_sincos_32fc_a_avx2(out, phase_inc, phase, num_points); } -#endif /* LV_HAVE_AVX2 */ +#endif /* LV_HAVE_AVX2 */ #ifdef LV_HAVE_AVX2 @@ -100,7 +100,7 @@ static inline void volk_gnsssdr_s32f_sincospuppet_32fc_u_avx2(lv_32fc_t* out, co phase[0] = 3; volk_gnsssdr_s32f_sincos_32fc_u_avx2(out, phase_inc, phase, num_points); } -#endif /* LV_HAVE_AVX2 */ +#endif /* LV_HAVE_AVX2 */ #ifdef LV_HAVE_NEON @@ -110,6 +110,6 @@ static inline void volk_gnsssdr_s32f_sincospuppet_32fc_neon(lv_32fc_t* out, cons phase[0] = 3; volk_gnsssdr_s32f_sincos_32fc_neon(out, phase_inc, phase, num_points); } -#endif /* LV_HAVE_NEON */ +#endif /* LV_HAVE_NEON */ -#endif /* INCLUDED_volk_gnsssdr_s32f_sincospuppet_32fc_H */ +#endif /* INCLUDED_volk_gnsssdr_s32f_sincospuppet_32fc_H */ diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/kernel_tests.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/kernel_tests.h index 5861d052f..733ca74bb 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/kernel_tests.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/kernel_tests.h @@ -38,32 +38,31 @@ // for puppets we need to get all the func_variants for the puppet and just // keep track of the actual function name to write to results -#define VOLK_INIT_PUPP(func, puppet_master_func, test_params)\ - volk_gnsssdr_test_case_t(func##_get_func_desc(), (void(*)())func##_manual, std::string(#func),\ - std::string(#puppet_master_func), test_params) +#define VOLK_INIT_PUPP(func, puppet_master_func, test_params) \ + volk_gnsssdr_test_case_t(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), \ + std::string(#puppet_master_func), test_params) -#define VOLK_INIT_TEST(func, test_params)\ - volk_gnsssdr_test_case_t(func##_get_func_desc(), (void(*)())func##_manual, std::string(#func),\ - test_params) +#define VOLK_INIT_TEST(func, test_params) \ + volk_gnsssdr_test_case_t(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), \ + test_params) #define QA(test) test_cases.push_back(test); std::vector init_test_list(volk_gnsssdr_test_params_t test_params) { - // Some kernels need a lower tolerance volk_gnsssdr_test_params_t test_params_inacc = volk_gnsssdr_test_params_t(1e-3, test_params.scalar(), - test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex()); + test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex()); volk_gnsssdr_test_params_t test_params_int1 = volk_gnsssdr_test_params_t(1, test_params.scalar(), - test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex()); + test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex()); // some others need more iterations ***** ADDED BY GNSS-SDR volk_gnsssdr_test_params_t 
test_params_more_iters = volk_gnsssdr_test_params_t(test_params.tol(), test_params.scalar(), - test_params.vlen(), 100000, test_params.benchmark_mode(), test_params.kernel_regex()); + test_params.vlen(), 100000, test_params.benchmark_mode(), test_params.kernel_regex()); // ... or more tolerance ***** ADDED BY GNSS-SDR volk_gnsssdr_test_params_t test_params_int16 = volk_gnsssdr_test_params_t(16, test_params.scalar(), - test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex()); + test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex()); volk_gnsssdr_test_params_t test_params_inacc2 = volk_gnsssdr_test_params_t(2e-1, test_params.scalar(), - test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex()); + test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex()); std::vector<volk_gnsssdr_test_case_t> test_cases; @@ -98,8 +97,7 @@ std::vector<volk_gnsssdr_test_case_t> init_test_list(volk_gnsssdr_test_params_t QA(VOLK_INIT_PUPP(volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic, volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn, test_params_int16)) QA(VOLK_INIT_PUPP(volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic, volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn, test_params_int16)) QA(VOLK_INIT_PUPP(volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc, volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn, test_params_int1)) - QA(VOLK_INIT_PUPP(volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc, volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn, test_params_int1)) - ; + QA(VOLK_INIT_PUPP(volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc, volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn, test_params_int1)); return test_cases; } diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/qa_utils.cc b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/qa_utils.cc index 35e60b2f4..18a4919e2 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/qa_utils.cc +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/qa_utils.cc @@ -17,38 +17,39 @@ */ #include "qa_utils.h" -#include "volk_gnsssdr/volk_gnsssdr.h" // for volk_gnsssdr_func_desc_t -#include "volk_gnsssdr/volk_gnsssdr_malloc.h" // for volk_gnsssdr_free, volk_gnsssdr_malloc -#include <boost/foreach.hpp> // for auto_any_base -#include <boost/lexical_cast.hpp> // for lexical_cast -#include <boost/token_functions.hpp> // for char_separator -#include <boost/token_iterator.hpp> // for token_iterator -#include <boost/tokenizer.hpp> // for tokenizer -#include <cassert> // for assert -#include <chrono> // for system_clock, duration,... -#include <cmath> // for sqrt, fabs, abs -#include <cstdint> // for uint16_t, uint64_t,int16_t, int32_t -#include <cstring> // for memcpy, memset -#include <fstream> // for operator<< -#include <iostream> // for cout, cerr -#include <limits> // for numeric_limits -#include <map> // for map -#include <random> // for random_device, default_random_engine, uniform_real_distribution -#include <vector> // for vector +#include "volk_gnsssdr/volk_gnsssdr.h" // for volk_gnsssdr_func_desc_t +#include "volk_gnsssdr/volk_gnsssdr_malloc.h" // for volk_gnsssdr_free, volk_gnsssdr_malloc +#include <boost/foreach.hpp> // for auto_any_base +#include <boost/lexical_cast.hpp> // for lexical_cast +#include <boost/token_functions.hpp> // for char_separator +#include <boost/token_iterator.hpp> // for token_iterator +#include <boost/tokenizer.hpp> // for tokenizer +#include <cassert> // for assert +#include <chrono> // for system_clock, duration,...
+#include <cmath> // for sqrt, fabs, abs +#include <cstdint> // for uint16_t, uint64_t,int16_t, int32_t +#include <cstring> // for memcpy, memset +#include <fstream> // for operator<< +#include <iostream> // for cout, cerr +#include <limits> // for numeric_limits +#include <map> // for map +#include <random> // for random_device, default_random_engine, uniform_real_distribution +#include <vector> // for vector -float uniform() { +float uniform() +{ std::random_device r; std::default_random_engine e1(r()); std::uniform_real_distribution<float> uniform_dist(-1, 1); - return uniform_dist(e1); // uniformly (-1, 1) + return uniform_dist(e1); // uniformly (-1, 1) } template <class t> -void random_floats (t *buf, unsigned n) +void random_floats(t *buf, unsigned n) { - for (unsigned i = 0; i < n; i++) - buf[i] = uniform (); + for (unsigned i = 0; i < n; i++) + buf[i] = uniform(); } void load_random_data(void *data, volk_gnsssdr_type_t type, unsigned int n) @@ -56,60 +57,73 @@ std::random_device r; std::default_random_engine e2(r()); - if(type.is_complex) n *= 2; + if (type.is_complex) n *= 2; - if(type.is_float) + if (type.is_float) { - if(type.size == 8) random_floats((double *)data, n); - else random_floats((float *)data, n); + if (type.size == 8) + random_floats((double *)data, n); + else + random_floats((float *)data, n); } else { - float int_max = float(uint64_t(2) << (type.size*8)); - if(type.is_signed) int_max /= 2.0; + float int_max = float(uint64_t(2) << (type.size * 8)); + if (type.is_signed) int_max /= 2.0; std::uniform_real_distribution<float> uniform_dist(-int_max, int_max); - for(unsigned int i = 0; i < n; i++) + for (unsigned int i = 0; i < n; i++) { float scaled_rand = uniform_dist(e2); - switch(type.size) - { - case 8: - if(type.is_signed) ((int64_t *)data)[i] = (int64_t) scaled_rand; - else ((uint64_t *)data)[i] = (uint64_t) scaled_rand; - break; - case 4: - if(type.is_signed) ((int32_t *)data)[i] = (int32_t) scaled_rand; - else ((uint32_t *)data)[i] = (uint32_t) scaled_rand; - break; - case 2: - // 16 bit multiplication saturates very fast - // we produce here only 3 bits input range - if(type.is_signed) ((int16_t *)data)[i] = (int16_t)((int16_t) scaled_rand % 8); - else ((uint16_t *)data)[i] = (uint16_t) (int16_t)((int16_t) scaled_rand % 8); - break; - case 1: - if(type.is_signed) ((int8_t *)data)[i] = (int8_t) scaled_rand; - else ((uint8_t *)data)[i] = (uint8_t) scaled_rand; - break; - default: - throw "load_random_data: no support for data size > 8 or < 1"; //no shenanigans here - } + switch (type.size) + { + case 8: + if (type.is_signed) + ((int64_t *)data)[i] = (int64_t)scaled_rand; + else + ((uint64_t *)data)[i] = (uint64_t)scaled_rand; + break; + case 4: + if (type.is_signed) + ((int32_t *)data)[i] = (int32_t)scaled_rand; + else + ((uint32_t *)data)[i] = (uint32_t)scaled_rand; + break; + case 2: + // 16 bit multiplication saturates very fast + // we produce here only 3 bits input range + if (type.is_signed) + ((int16_t *)data)[i] = (int16_t)((int16_t)scaled_rand % 8); + else + ((uint16_t *)data)[i] = (uint16_t)(int16_t)((int16_t)scaled_rand % 8); + break; + case 1: + if (type.is_signed) + ((int8_t *)data)[i] = (int8_t)scaled_rand; + else + ((uint8_t *)data)[i] = (uint8_t)scaled_rand; + break; + default: + throw "load_random_data: no support for data size > 8 or < 1"; //no shenanigans here + } } } } -static std::vector<std::string> get_arch_list(volk_gnsssdr_func_desc_t desc) { +static std::vector<std::string> get_arch_list(volk_gnsssdr_func_desc_t desc) +{ std::vector<std::string> archlist; - for(size_t i = 0; i < desc.n_impls; i++) { -
archlist.push_back(std::string(desc.impl_names[i])); - } + for (size_t i = 0; i < desc.n_impls; i++) + { + archlist.push_back(std::string(desc.impl_names[i])); + } return archlist; } -volk_gnsssdr_type_t volk_gnsssdr_type_from_string(std::string name) { +volk_gnsssdr_type_t volk_gnsssdr_type_from_string(std::string name) +{ volk_gnsssdr_type_t type; type.is_float = false; type.is_scalar = false; @@ -118,52 +132,58 @@ volk_gnsssdr_type_t volk_gnsssdr_type_from_string(std::string name) { type.size = 0; type.str = name; - if(name.size() < 2) { - throw std::string("name too short to be a datatype"); - } + if (name.size() < 2) + { + throw std::string("name too short to be a datatype"); + } //is it a scalar? - if(name[0] == 's') { - type.is_scalar = true; - name = name.substr(1, name.size()-1); - } + if (name[0] == 's') + { + type.is_scalar = true; + name = name.substr(1, name.size() - 1); + } //get the data size size_t last_size_pos = name.find_last_of("0123456789"); - if(last_size_pos == std::string::npos) { - throw std::string("no size spec in type ").append(name); - } + if (last_size_pos == std::string::npos) + { + throw std::string("no size spec in type ").append(name); + } //will throw if malformed - int size = boost::lexical_cast(name.substr(0, last_size_pos+1)); + int size = boost::lexical_cast(name.substr(0, last_size_pos + 1)); assert(((size % 8) == 0) && (size <= 64) && (size != 0)); - type.size = size/8; //in bytes + type.size = size / 8; //in bytes - for(size_t i=last_size_pos+1; i < name.size(); i++) { - switch (name[i]) { - case 'f': - type.is_float = true; - break; - case 'i': - type.is_signed = true; - break; - case 'c': - type.is_complex = true; - break; - case 'u': - type.is_signed = false; - break; - default: - throw; + for (size_t i = last_size_pos + 1; i < name.size(); i++) + { + switch (name[i]) + { + case 'f': + type.is_float = true; + break; + case 'i': + type.is_signed = true; + break; + case 'c': + type.is_complex = true; + break; + case 'u': + type.is_signed = false; + break; + default: + throw; + } } - } return type; } static void get_signatures_from_name(std::vector &inputsig, - std::vector &outputsig, - std::string name) { + std::vector &outputsig, + std::string name) +{ boost::char_separator sep("_"); boost::tokenizer > tok(name, sep); std::vector toked; @@ -176,233 +196,282 @@ static void get_signatures_from_name(std::vector &inputsig, //ok. we're assuming a string in the form //(sig)_(multiplier-opt)_..._(name)_(sig)_(multiplier-opt)_..._(alignment) - enum { SIDE_INPUT, SIDE_NAME, SIDE_OUTPUT } side = SIDE_INPUT; + enum + { + SIDE_INPUT, + SIDE_NAME, + SIDE_OUTPUT + } side = SIDE_INPUT; std::string fn_name; volk_gnsssdr_type_t type; - BOOST_FOREACH(std::string token, toked) { - try { - type = volk_gnsssdr_type_from_string(token); - if(side == SIDE_NAME) side = SIDE_OUTPUT; //if this is the first one after the name... + BOOST_FOREACH (std::string token, toked) + { + try + { + type = volk_gnsssdr_type_from_string(token); + if (side == SIDE_NAME) side = SIDE_OUTPUT; //if this is the first one after the name... 
- if(side == SIDE_INPUT) inputsig.push_back(type); - else outputsig.push_back(type); - } catch (...){ - if(token[0] == 'x' && (token.size() > 1) && (token[1] > '0' || token[1] < '9')) { - if(side == SIDE_INPUT) assert(inputsig.size() > 0); - else assert(outputsig.size() > 0); - int multiplier = boost::lexical_cast(token.substr(1, token.size()-1)); //will throw if invalid /////////// - for(int i=1; i 1) && (token[1] > '0' || token[1] < '9')) + { + if (side == SIDE_INPUT) + assert(inputsig.size() > 0); + else + assert(outputsig.size() > 0); + int multiplier = boost::lexical_cast(token.substr(1, token.size() - 1)); //will throw if invalid /////////// + for (int i = 1; i < multiplier; i++) + { + if (side == SIDE_INPUT) + inputsig.push_back(inputsig.back()); + else + outputsig.push_back(outputsig.back()); + } + } - else if(side == SIDE_INPUT) { //it's the function name, at least it better be - side = SIDE_NAME; - fn_name.append("_"); - fn_name.append(token); - } - else if(side == SIDE_OUTPUT) { - if(token != toked.back()) throw; //the last token in the name is the alignment - } + else if (side == SIDE_INPUT) + { //it's the function name, at least it better be + side = SIDE_NAME; + fn_name.append("_"); + fn_name.append(token); + } + else if (side == SIDE_OUTPUT) + { + if (token != toked.back()) throw; //the last token in the name is the alignment + } + } } - } //we don't need an output signature (some fn's operate on the input data, "in place"), but we do need at least one input! assert(inputsig.size() != 0); - } -inline void run_cast_test1(volk_gnsssdr_fn_1arg func, std::vector &buffs, unsigned int vlen, unsigned int iter, std::string arch) { - while(iter--) func(buffs[0], vlen, arch.c_str()); +inline void run_cast_test1(volk_gnsssdr_fn_1arg func, std::vector &buffs, unsigned int vlen, unsigned int iter, std::string arch) +{ + while (iter--) func(buffs[0], vlen, arch.c_str()); } -inline void run_cast_test2(volk_gnsssdr_fn_2arg func, std::vector &buffs, unsigned int vlen, unsigned int iter, std::string arch) { - while(iter--) func(buffs[0], buffs[1], vlen, arch.c_str()); +inline void run_cast_test2(volk_gnsssdr_fn_2arg func, std::vector &buffs, unsigned int vlen, unsigned int iter, std::string arch) +{ + while (iter--) func(buffs[0], buffs[1], vlen, arch.c_str()); } -inline void run_cast_test3(volk_gnsssdr_fn_3arg func, std::vector &buffs, unsigned int vlen, unsigned int iter, std::string arch) { - while(iter--) func(buffs[0], buffs[1], buffs[2], vlen, arch.c_str()); +inline void run_cast_test3(volk_gnsssdr_fn_3arg func, std::vector &buffs, unsigned int vlen, unsigned int iter, std::string arch) +{ + while (iter--) func(buffs[0], buffs[1], buffs[2], vlen, arch.c_str()); } -inline void run_cast_test4(volk_gnsssdr_fn_4arg func, std::vector &buffs, unsigned int vlen, unsigned int iter, std::string arch) { - while(iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], vlen, arch.c_str()); +inline void run_cast_test4(volk_gnsssdr_fn_4arg func, std::vector &buffs, unsigned int vlen, unsigned int iter, std::string arch) +{ + while (iter--) func(buffs[0], buffs[1], buffs[2], buffs[3], vlen, arch.c_str()); } -inline void run_cast_test1_s32f(volk_gnsssdr_fn_1arg_s32f func, std::vector &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) { - while(iter--) func(buffs[0], scalar, vlen, arch.c_str()); +inline void run_cast_test1_s32f(volk_gnsssdr_fn_1arg_s32f func, std::vector &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) +{ + while (iter--) 
func(buffs[0], scalar, vlen, arch.c_str()); } -inline void run_cast_test2_s32f(volk_gnsssdr_fn_2arg_s32f func, std::vector &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) { - while(iter--) func(buffs[0], buffs[1], scalar, vlen, arch.c_str()); +inline void run_cast_test2_s32f(volk_gnsssdr_fn_2arg_s32f func, std::vector &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) +{ + while (iter--) func(buffs[0], buffs[1], scalar, vlen, arch.c_str()); } -inline void run_cast_test3_s32f(volk_gnsssdr_fn_3arg_s32f func, std::vector &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) { - while(iter--) func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str()); +inline void run_cast_test3_s32f(volk_gnsssdr_fn_3arg_s32f func, std::vector &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch) +{ + while (iter--) func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str()); } -inline void run_cast_test1_s32fc(volk_gnsssdr_fn_1arg_s32fc func, std::vector &buffs, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { - while(iter--) func(buffs[0], scalar, vlen, arch.c_str()); +inline void run_cast_test1_s32fc(volk_gnsssdr_fn_1arg_s32fc func, std::vector &buffs, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) +{ + while (iter--) func(buffs[0], scalar, vlen, arch.c_str()); } -inline void run_cast_test2_s32fc(volk_gnsssdr_fn_2arg_s32fc func, std::vector &buffs, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { - while(iter--) func(buffs[0], buffs[1], scalar, vlen, arch.c_str()); +inline void run_cast_test2_s32fc(volk_gnsssdr_fn_2arg_s32fc func, std::vector &buffs, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) +{ + while (iter--) func(buffs[0], buffs[1], scalar, vlen, arch.c_str()); } -inline void run_cast_test3_s32fc(volk_gnsssdr_fn_3arg_s32fc func, std::vector &buffs, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { - while(iter--) func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str()); +inline void run_cast_test3_s32fc(volk_gnsssdr_fn_3arg_s32fc func, std::vector &buffs, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) +{ + while (iter--) func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str()); } // *************** ADDED BY GNSS-SDR. 
START inline void run_cast_test1_s8i(volk_gnsssdr_fn_1arg_s8i func, std::vector &buffs, char scalar, unsigned int vlen, unsigned int iter, std::string arch) { - while(iter--) func(buffs[0], scalar, vlen, arch.c_str()); + while (iter--) func(buffs[0], scalar, vlen, arch.c_str()); } inline void run_cast_test2_s8i(volk_gnsssdr_fn_2arg_s8i func, std::vector &buffs, char scalar, unsigned int vlen, unsigned int iter, std::string arch) { - while(iter--) func(buffs[0], buffs[1], scalar, vlen, arch.c_str()); + while (iter--) func(buffs[0], buffs[1], scalar, vlen, arch.c_str()); } inline void run_cast_test3_s8i(volk_gnsssdr_fn_3arg_s8i func, std::vector &buffs, char scalar, unsigned int vlen, unsigned int iter, std::string arch) { - while(iter--) func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str()); + while (iter--) func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str()); } inline void run_cast_test1_s8ic(volk_gnsssdr_fn_1arg_s8ic func, std::vector &buffs, lv_8sc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { - while(iter--) func(buffs[0], scalar, vlen, arch.c_str()); + while (iter--) func(buffs[0], scalar, vlen, arch.c_str()); } inline void run_cast_test2_s8ic(volk_gnsssdr_fn_2arg_s8ic func, std::vector &buffs, lv_8sc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { - while(iter--) func(buffs[0], buffs[1], scalar, vlen, arch.c_str()); + while (iter--) func(buffs[0], buffs[1], scalar, vlen, arch.c_str()); } inline void run_cast_test3_s8ic(volk_gnsssdr_fn_3arg_s8ic func, std::vector &buffs, lv_8sc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { - while(iter--) func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str()); + while (iter--) func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str()); } inline void run_cast_test1_s16ic(volk_gnsssdr_fn_1arg_s16ic func, std::vector &buffs, lv_16sc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { - while(iter--) func(buffs[0], scalar, vlen, arch.c_str()); + while (iter--) func(buffs[0], scalar, vlen, arch.c_str()); } inline void run_cast_test2_s16ic(volk_gnsssdr_fn_2arg_s16ic func, std::vector &buffs, lv_16sc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { - while(iter--) func(buffs[0], buffs[1], scalar, vlen, arch.c_str()); + while (iter--) func(buffs[0], buffs[1], scalar, vlen, arch.c_str()); } inline void run_cast_test3_s16ic(volk_gnsssdr_fn_3arg_s16ic func, std::vector &buffs, lv_16sc_t scalar, unsigned int vlen, unsigned int iter, std::string arch) { - while(iter--) func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str()); + while (iter--) func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch.c_str()); } // *************** ADDED BY GNSS-SDR. END template -bool fcompare(t *in1, t *in2, unsigned int vlen, float tol) { +bool fcompare(t *in1, t *in2, unsigned int vlen, float tol) +{ bool fail = false; int print_max_errs = 10; - for(unsigned int i=0; i tol ) - { - fail=true; - if(print_max_errs-- > 0) { - std::cout << "offset " << i << " in1: " << t(((t *)(in1))[i]) << " in2: " << t(((t *)(in2))[i]); - std::cout << " tolerance was: " << tol << std::endl; + for (unsigned int i = 0; i < vlen; i++) + { + // for very small numbers we'll see round off errors due to limited + // precision. So a special test case... 
+ if (fabs(((t *)(in1))[i]) < 1e-30) + { + if (fabs(((t *)(in2))[i]) > tol) + { + fail = true; + if (print_max_errs-- > 0) + { + std::cout << "offset " << i << " in1: " << t(((t *)(in1))[i]) << " in2: " << t(((t *)(in2))[i]); + std::cout << " tolerance was: " << tol << std::endl; + } + } + } + // the primary test is the percent different greater than given tol + else if (fabs(((t *)(in1))[i] - ((t *)(in2))[i]) / fabs(((t *)in1)[i]) > tol) + { + fail = true; + if (print_max_errs-- > 0) + { + std::cout << "offset " << i << " in1: " << t(((t *)(in1))[i]) << " in2: " << t(((t *)(in2))[i]); + std::cout << " tolerance was: " << tol << std::endl; + } } - } } - // the primary test is the percent different greater than given tol - else if(fabs(((t *)(in1))[i] - ((t *)(in2))[i])/fabs(((t *)in1)[i]) > tol) { - fail=true; - if(print_max_errs-- > 0) { - std::cout << "offset " << i << " in1: " << t(((t *)(in1))[i]) << " in2: " << t(((t *)(in2))[i]); - std::cout << " tolerance was: " << tol << std::endl; - } - } - } return fail; } template -bool ccompare(t *in1, t *in2, unsigned int vlen, float tol) { +bool ccompare(t *in1, t *in2, unsigned int vlen, float tol) +{ bool fail = false; int print_max_errs = 10; - for(unsigned int i=0; i<2*vlen; i+=2) { - t diff[2] = { in1[i] - in2[i], in1[i+1] - in2[i+1] }; - t err = std::sqrt(diff[0] * diff[0] + diff[1] * diff[1]); - t norm = std::sqrt(in1[i] * in1[i] + in1[i+1] * in1[i+1]); + for (unsigned int i = 0; i < 2 * vlen; i += 2) + { + t diff[2] = {in1[i] - in2[i], in1[i + 1] - in2[i + 1]}; + t err = std::sqrt(diff[0] * diff[0] + diff[1] * diff[1]); + t norm = std::sqrt(in1[i] * in1[i] + in1[i + 1] * in1[i + 1]); - // for very small numbers we'll see round off errors due to limited - // precision. So a special test case... - if (norm < 1e-30) { - if (err > tol) - { - fail=true; - if(print_max_errs-- > 0) { - std::cout << "offset " << i/2 << " in1: " << in1[i] << " + " << in1[i+1] << "j in2: " << in2[i] << " + " << in2[i+1] << "j"; - std::cout << " tolerance was: " << tol << std::endl; + // for very small numbers we'll see round off errors due to limited + // precision. So a special test case... 
+ if (norm < 1e-30) + { + if (err > tol) + { + fail = true; + if (print_max_errs-- > 0) + { + std::cout << "offset " << i / 2 << " in1: " << in1[i] << " + " << in1[i + 1] << "j in2: " << in2[i] << " + " << in2[i + 1] << "j"; + std::cout << " tolerance was: " << tol << std::endl; + } + } + } + // the primary test is the percent different greater than given tol + else if ((err / norm) > tol) + { + fail = true; + if (print_max_errs-- > 0) + { + std::cout << "offset " << i / 2 << " in1: " << in1[i] << " + " << in1[i + 1] << "j in2: " << in2[i] << " + " << in2[i + 1] << "j"; + std::cout << " tolerance was: " << tol << std::endl; + } } - } } - // the primary test is the percent different greater than given tol - else if((err / norm) > tol) { - fail=true; - if(print_max_errs-- > 0) { - std::cout << "offset " << i/2 << " in1: " << in1[i] << " + " << in1[i+1] << "j in2: " << in2[i] << " + " << in2[i+1] << "j"; - std::cout << " tolerance was: " << tol << std::endl; - } - } - } return fail; } template -bool icompare(t *in1, t *in2, unsigned int vlen, unsigned int tol) { +bool icompare(t *in1, t *in2, unsigned int vlen, unsigned int tol) +{ bool fail = false; int print_max_errs = 10; - for(unsigned int i=0; i tol) { - fail=true; - if(print_max_errs-- > 0) { - std::cout << "offset " << i << " in1: " << static_cast(t(((t *)(in1))[i])) << " in2: " << static_cast(t(((t *)(in2))[i])); - std::cout << " tolerance was: " << tol << std::endl; - } + for (unsigned int i = 0; i < vlen; i++) + { + if (((unsigned int)abs(int(((t *)(in1))[i]) - int(((t *)(in2))[i]))) > tol) + { + fail = true; + if (print_max_errs-- > 0) + { + std::cout << "offset " << i << " in1: " << static_cast(t(((t *)(in1))[i])) << " in2: " << static_cast(t(((t *)(in2))[i])); + std::cout << " tolerance was: " << tol << std::endl; + } + } } - } return fail; } -class volk_gnsssdr_qa_aligned_mem_pool{ +class volk_gnsssdr_qa_aligned_mem_pool +{ public: - void *get_new(size_t size){ + void *get_new(size_t size) + { size_t alignment = volk_gnsssdr_get_alignment(); - void* ptr = volk_gnsssdr_malloc(size, alignment); + void *ptr = volk_gnsssdr_malloc(size, alignment); memset(ptr, 0x00, size); _mems.push_back(ptr); return ptr; } - ~volk_gnsssdr_qa_aligned_mem_pool() { - for(unsigned int ii = 0; ii < _mems.size(); ++ii) { - volk_gnsssdr_free(_mems[ii]); - } + ~volk_gnsssdr_qa_aligned_mem_pool() + { + for (unsigned int ii = 0; ii < _mems.size(); ++ii) + { + volk_gnsssdr_free(_mems[ii]); + } } -private: std::vector _mems; + +private: + std::vector _mems; }; bool run_volk_gnsssdr_tests(volk_gnsssdr_func_desc_t desc, - void (*manual_func)(), - std::string name, - volk_gnsssdr_test_params_t test_params, - std::vector *results, - std::string puppet_master_name -) + void (*manual_func)(), + std::string name, + volk_gnsssdr_test_params_t test_params, + std::vector *results, + std::string puppet_master_name) { return run_volk_gnsssdr_tests(desc, manual_func, name, test_params.tol(), test_params.scalar(), test_params.vlen(), test_params.iter(), results, puppet_master_name, @@ -410,15 +479,15 @@ bool run_volk_gnsssdr_tests(volk_gnsssdr_func_desc_t desc, } bool run_volk_gnsssdr_tests(volk_gnsssdr_func_desc_t desc, - void (*manual_func)(), - std::string name, - float tol, - lv_32fc_t scalar, - unsigned int vlen, - unsigned int iter, - std::vector *results, - std::string puppet_master_name, - bool benchmark_mode) + void (*manual_func)(), + std::string name, + float tol, + lv_32fc_t scalar, + unsigned int vlen, + unsigned int iter, + std::vector *results, + std::string 
puppet_master_name, + bool benchmark_mode) { // Initialize this entry in results vector results->push_back(volk_gnsssdr_test_results_t()); @@ -439,57 +508,67 @@ bool run_volk_gnsssdr_tests(volk_gnsssdr_func_desc_t desc, //first let's get a list of available architectures for the test std::vector arch_list = get_arch_list(desc); - if((!benchmark_mode) && (arch_list.size() < 2)) { - std::cout << "no architectures to test" << std::endl; - return false; - } + if ((!benchmark_mode) && (arch_list.size() < 2)) + { + std::cout << "no architectures to test" << std::endl; + return false; + } //something that can hang onto memory and cleanup when this function exits volk_gnsssdr_qa_aligned_mem_pool mem_pool; //now we have to get a function signature by parsing the name std::vector inputsig, outputsig; - try { - get_signatures_from_name(inputsig, outputsig, name); - } - catch (boost::bad_lexical_cast& error) { - std::cerr << "Error: unable to get function signature from kernel name" << std::endl; - std::cerr << " - " << name << std::endl; - return false; - } + try + { + get_signatures_from_name(inputsig, outputsig, name); + } + catch (boost::bad_lexical_cast &error) + { + std::cerr << "Error: unable to get function signature from kernel name" << std::endl; + std::cerr << " - " << name << std::endl; + return false; + } //pull the input scalars into their own vector std::vector inputsc; - for(size_t i=0; i inbuffs; - BOOST_FOREACH(volk_gnsssdr_type_t sig, inputsig) { - if(!sig.is_scalar) //we don't make buffers for scalars - inbuffs.push_back(mem_pool.get_new(vlen*sig.size*(sig.is_complex ? 2 : 1))); - } - for(size_t i=0; i > test_data; - for(size_t i=0; i arch_buffs; - for(size_t j=0; j arch_buffs; + for (size_t j = 0; j < outputsig.size(); j++) + { + arch_buffs.push_back(mem_pool.get_new(vlen * outputsig[j].size * (outputsig[j].is_complex ? 2 : 1))); + } + for (size_t j = 0; j < inputsig.size(); j++) + { + void *arch_inbuff = mem_pool.get_new(vlen * inputsig[j].size * (inputsig[j].is_complex ? 2 : 1)); + memcpy(arch_inbuff, inbuffs[j], vlen * inputsig[j].size * (inputsig[j].is_complex ? 2 : 1)); + arch_buffs.push_back(arch_inbuff); + } + test_data.push_back(arch_buffs); } - for(size_t j=0; j both_sigs; both_sigs.insert(both_sigs.end(), outputsig.begin(), outputsig.end()); @@ -499,270 +578,276 @@ bool run_volk_gnsssdr_tests(volk_gnsssdr_func_desc_t desc, vlen = vlen - vlen_twiddle; std::chrono::time_point start, end; std::vector profile_times; - for(size_t i = 0; i < arch_list.size(); i++) { - start = std::chrono::system_clock::now(); - - switch(both_sigs.size()) + for (size_t i = 0; i < arch_list.size(); i++) { - case 1: - if(inputsc.size() == 0) + start = std::chrono::system_clock::now(); + + switch (both_sigs.size()) { - run_cast_test1((volk_gnsssdr_fn_1arg)(manual_func), test_data[i], vlen, iter, arch_list[i]); - } - else if(inputsc.size() == 1 && inputsc[0].is_float) - { - if(inputsc[0].is_complex) + case 1: + if (inputsc.size() == 0) { - run_cast_test1_s32fc((volk_gnsssdr_fn_1arg_s32fc)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); + run_cast_test1((volk_gnsssdr_fn_1arg)(manual_func), test_data[i], vlen, iter, arch_list[i]); } - else + else if (inputsc.size() == 1 && inputsc[0].is_float) { - run_cast_test1_s32f((volk_gnsssdr_fn_1arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); - } - } - //ADDED BY GNSS-SDR. 
START - else if(inputsc.size() == 1 && !inputsc[0].is_float) - { - if(inputsc[0].is_complex) - { - if(inputsc[0].size == 2) + if (inputsc[0].is_complex) { - run_cast_test1_s16ic((volk_gnsssdr_fn_1arg_s16ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); + run_cast_test1_s32fc((volk_gnsssdr_fn_1arg_s32fc)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); } else { - run_cast_test1_s8ic((volk_gnsssdr_fn_1arg_s8ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); + run_cast_test1_s32f((volk_gnsssdr_fn_1arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); } } - else + //ADDED BY GNSS-SDR. START + else if (inputsc.size() == 1 && !inputsc[0].is_float) { - run_cast_test1_s8i((volk_gnsssdr_fn_1arg_s8i)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); + if (inputsc[0].is_complex) + { + if (inputsc[0].size == 2) + { + run_cast_test1_s16ic((volk_gnsssdr_fn_1arg_s16ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); + } + else + { + run_cast_test1_s8ic((volk_gnsssdr_fn_1arg_s8ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); + } + } + else + { + run_cast_test1_s8i((volk_gnsssdr_fn_1arg_s8i)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); + } } - } - //ADDED BY GNSS-SDR. END - else throw "unsupported 1 arg function >1 scalars"; - break; - case 2: - if(inputsc.size() == 0) - { + //ADDED BY GNSS-SDR. END + else + throw "unsupported 1 arg function >1 scalars"; + break; + case 2: + if (inputsc.size() == 0) + { run_cast_test2((volk_gnsssdr_fn_2arg)(manual_func), test_data[i], vlen, iter, arch_list[i]); - } - else if(inputsc.size() == 1 && inputsc[0].is_float) - { - if(inputsc[0].is_complex) - { - run_cast_test2_s32fc((volk_gnsssdr_fn_2arg_s32fc)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); } - else + else if (inputsc.size() == 1 && inputsc[0].is_float) { - run_cast_test2_s32f((volk_gnsssdr_fn_2arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); - } - } - //ADDED BY GNSS-SDR. START - else if(inputsc.size() == 1 && !inputsc[0].is_float) - { - if(inputsc[0].is_complex) - { - if(inputsc[0].size == 2) + if (inputsc[0].is_complex) { - run_cast_test2_s16ic((volk_gnsssdr_fn_2arg_s16ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); + run_cast_test2_s32fc((volk_gnsssdr_fn_2arg_s32fc)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); } else { - run_cast_test2_s8ic((volk_gnsssdr_fn_2arg_s8ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); + run_cast_test2_s32f((volk_gnsssdr_fn_2arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); } } + //ADDED BY GNSS-SDR. START + else if (inputsc.size() == 1 && !inputsc[0].is_float) + { + if (inputsc[0].is_complex) + { + if (inputsc[0].size == 2) + { + run_cast_test2_s16ic((volk_gnsssdr_fn_2arg_s16ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); + } + else + { + run_cast_test2_s8ic((volk_gnsssdr_fn_2arg_s8ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); + } + } + else + { + run_cast_test2_s8i((volk_gnsssdr_fn_2arg_s8i)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); + } + } + //ADDED BY GNSS-SDR. 
END else + throw "unsupported 2 arg function >1 scalars"; + break; + case 3: + if (inputsc.size() == 0) { - run_cast_test2_s8i((volk_gnsssdr_fn_2arg_s8i)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); + run_cast_test3((volk_gnsssdr_fn_3arg)(manual_func), test_data[i], vlen, iter, arch_list[i]); } - } - //ADDED BY GNSS-SDR. END - else throw "unsupported 2 arg function >1 scalars"; - break; - case 3: - if(inputsc.size() == 0) - { - run_cast_test3((volk_gnsssdr_fn_3arg)(manual_func), test_data[i], vlen, iter, arch_list[i]); - } - else if(inputsc.size() == 1 && inputsc[0].is_float) - { - if(inputsc[0].is_complex) + else if (inputsc.size() == 1 && inputsc[0].is_float) { - run_cast_test3_s32fc((volk_gnsssdr_fn_3arg_s32fc)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); + if (inputsc[0].is_complex) + { + run_cast_test3_s32fc((volk_gnsssdr_fn_3arg_s32fc)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); + } + else + { + run_cast_test3_s32f((volk_gnsssdr_fn_3arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); + } } + //ADDED BY GNSS-SDR. START + else if (inputsc.size() == 1 && !inputsc[0].is_float) + { + if (inputsc[0].is_complex) + { + { + if (inputsc[0].size == 4) + { + run_cast_test3_s16ic((volk_gnsssdr_fn_3arg_s16ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); + } + else + { + run_cast_test3_s8ic((volk_gnsssdr_fn_3arg_s8ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); + } + } + } + else + { + run_cast_test3_s8i((volk_gnsssdr_fn_3arg_s8i)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); + } + } + //ADDED BY GNSS-SDR. END else - { - run_cast_test3_s32f((volk_gnsssdr_fn_3arg_s32f)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); - } + throw "unsupported 3 arg function >1 scalars"; + break; + default: + throw "no function handler for this signature"; + break; } - //ADDED BY GNSS-SDR. START - else if(inputsc.size() == 1 && !inputsc[0].is_float) - { - if(inputsc[0].is_complex) - { - { - if(inputsc[0].size == 4) - { - run_cast_test3_s16ic((volk_gnsssdr_fn_3arg_s16ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); - } - else - { - run_cast_test3_s8ic((volk_gnsssdr_fn_3arg_s8ic)(manual_func), test_data[i], scalar, vlen, iter, arch_list[i]); - } - } - } - else - { - run_cast_test3_s8i((volk_gnsssdr_fn_3arg_s8i)(manual_func), test_data[i], scalar.real(), vlen, iter, arch_list[i]); - } - } - //ADDED BY GNSS-SDR. 
END - else throw "unsupported 3 arg function >1 scalars"; - break; - default: - throw "no function handler for this signature"; - break; + + end = std::chrono::system_clock::now(); + std::chrono::duration elapsed_seconds = end - start; + double arch_time = 1000.0 * elapsed_seconds.count(); + std::cout << arch_list[i] << " completed in " << arch_time << " ms" << std::endl; + volk_gnsssdr_test_time_t result; + result.name = arch_list[i]; + result.time = arch_time; + result.units = "ms"; + result.pass = true; + results->back().results[result.name] = result; + + profile_times.push_back(arch_time); } - end = std::chrono::system_clock::now(); - std::chrono::duration elapsed_seconds = end - start; - double arch_time = 1000.0 * elapsed_seconds.count(); - std::cout << arch_list[i] << " completed in " << arch_time << " ms" << std::endl; - volk_gnsssdr_test_time_t result; - result.name = arch_list[i]; - result.time = arch_time; - result.units = "ms"; - result.pass = true; - results->back().results[result.name] = result; - - profile_times.push_back(arch_time); - } - //and now compare each output to the generic output //first we have to know which output is the generic one, they aren't in order... - size_t generic_offset=0; - for(size_t i=0; i arch_results; - for(size_t i = 0; i < arch_list.size(); i++) + for (size_t i = 0; i < arch_list.size(); i++) { fail = false; - if(i != generic_offset) + if (i != generic_offset) { - for(size_t j=0; jback().results[arch_list[i]]; result->pass = !fail; @@ -778,14 +863,14 @@ bool run_volk_gnsssdr_tests(volk_gnsssdr_func_desc_t desc, double best_time_u = std::numeric_limits::max(); std::string best_arch_a = "generic"; std::string best_arch_u = "generic"; - for(size_t i=0; i < arch_list.size(); i++) + for (size_t i = 0; i < arch_list.size(); i++) { - if((profile_times[i] < best_time_u) && arch_results[i] && desc.impl_alignment[i] == 0) + if ((profile_times[i] < best_time_u) && arch_results[i] && desc.impl_alignment[i] == 0) { best_time_u = profile_times[i]; best_arch_u = arch_list[i]; } - if((profile_times[i] < best_time_a) && arch_results[i]) + if ((profile_times[i] < best_time_a) && arch_results[i]) { best_time_a = profile_times[i]; best_arch_a = arch_list[i]; @@ -795,11 +880,14 @@ bool run_volk_gnsssdr_tests(volk_gnsssdr_func_desc_t desc, std::cout << "Best aligned arch: " << best_arch_a << std::endl; std::cout << "Best unaligned arch: " << best_arch_u << std::endl; - if(puppet_master_name == "NULL") { - results->back().config_name = name; - } else { - results->back().config_name = puppet_master_name; - } + if (puppet_master_name == "NULL") + { + results->back().config_name = name; + } + else + { + results->back().config_name = puppet_master_name; + } results->back().best_arch_a = best_arch_a; results->back().best_arch_u = best_arch_u; diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/qa_utils.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/qa_utils.h index ad4d7e6b9..b2a66fb58 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/qa_utils.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/qa_utils.h @@ -25,17 +25,18 @@ #include "volk_gnsssdr/volk_gnsssdr_complex.h" // for lv_32fc_t #include "volk_gnsssdr/volk_gnsssdr.h" // for volk_gnsssdr_func_desc_t -#include // for bool, false -#include // for NULL -#include // for map -#include // for string, basic_string -#include // for vector +#include // for bool, false +#include // for NULL +#include // for map +#include // for string, basic_string +#include 
// for vector /************************************************ * VOLK QA type definitions * ************************************************/ -struct volk_gnsssdr_type_t { +struct volk_gnsssdr_type_t +{ bool is_float; bool is_scalar; bool is_signed; @@ -44,80 +45,78 @@ struct volk_gnsssdr_type_t { std::string str; }; -class volk_gnsssdr_test_time_t { - public: - std::string name; - double time; - std::string units; - bool pass; +class volk_gnsssdr_test_time_t +{ +public: + std::string name; + double time; + std::string units; + bool pass; }; -class volk_gnsssdr_test_results_t { - public: - std::string name; - std::string config_name; - unsigned int vlen; - unsigned int iter; - std::map results; - std::string best_arch_a; - std::string best_arch_u; +class volk_gnsssdr_test_results_t +{ +public: + std::string name; + std::string config_name; + unsigned int vlen; + unsigned int iter; + std::map results; + std::string best_arch_a; + std::string best_arch_u; }; -class volk_gnsssdr_test_params_t { - private: - float _tol; - lv_32fc_t _scalar; - unsigned int _vlen; - unsigned int _iter; - bool _benchmark_mode; - std::string _kernel_regex; - public: - // ctor - volk_gnsssdr_test_params_t(float tol, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, - bool benchmark_mode, std::string kernel_regex) : - _tol(tol), _scalar(scalar), _vlen(vlen), _iter(iter), - _benchmark_mode(benchmark_mode), _kernel_regex(kernel_regex) {}; - // setters - void set_tol(float tol) {_tol=tol;}; - void set_scalar(lv_32fc_t scalar) {_scalar=scalar;}; - void set_vlen(unsigned int vlen) {_vlen=vlen;}; - void set_iter(unsigned int iter) {_iter=iter;}; - void set_benchmark(bool benchmark) {_benchmark_mode=benchmark;}; - void set_regex(std::string regex) {_kernel_regex=regex;}; - // getters - float tol() {return _tol;}; - lv_32fc_t scalar() {return _scalar;}; - unsigned int vlen() {return _vlen;}; - unsigned int iter() {return _iter;}; - bool benchmark_mode() {return _benchmark_mode;}; - std::string kernel_regex() {return _kernel_regex;}; +class volk_gnsssdr_test_params_t +{ +private: + float _tol; + lv_32fc_t _scalar; + unsigned int _vlen; + unsigned int _iter; + bool _benchmark_mode; + std::string _kernel_regex; + +public: + // ctor + volk_gnsssdr_test_params_t(float tol, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, + bool benchmark_mode, std::string kernel_regex) : _tol(tol), _scalar(scalar), _vlen(vlen), _iter(iter), _benchmark_mode(benchmark_mode), _kernel_regex(kernel_regex){}; + // setters + void set_tol(float tol) { _tol = tol; }; + void set_scalar(lv_32fc_t scalar) { _scalar = scalar; }; + void set_vlen(unsigned int vlen) { _vlen = vlen; }; + void set_iter(unsigned int iter) { _iter = iter; }; + void set_benchmark(bool benchmark) { _benchmark_mode = benchmark; }; + void set_regex(std::string regex) { _kernel_regex = regex; }; + // getters + float tol() { return _tol; }; + lv_32fc_t scalar() { return _scalar; }; + unsigned int vlen() { return _vlen; }; + unsigned int iter() { return _iter; }; + bool benchmark_mode() { return _benchmark_mode; }; + std::string kernel_regex() { return _kernel_regex; }; }; -class volk_gnsssdr_test_case_t { - private: - volk_gnsssdr_func_desc_t _desc; - void(*_kernel_ptr)(); - std::string _name; - volk_gnsssdr_test_params_t _test_parameters; - std::string _puppet_master_name; - public: - volk_gnsssdr_func_desc_t desc() {return _desc;}; - void (*kernel_ptr()) () {return _kernel_ptr;}; - std::string name() {return _name;}; - std::string puppet_master_name() {return 
_puppet_master_name;}; - volk_gnsssdr_test_params_t test_parameters() {return _test_parameters;}; - // normal ctor - volk_gnsssdr_test_case_t(volk_gnsssdr_func_desc_t desc, void(*kernel_ptr)(), std::string name, - volk_gnsssdr_test_params_t test_parameters) : - _desc(desc), _kernel_ptr(kernel_ptr), _name(name), _test_parameters(test_parameters), - _puppet_master_name("NULL") - {}; - // ctor for puppets - volk_gnsssdr_test_case_t(volk_gnsssdr_func_desc_t desc, void(*kernel_ptr)(), std::string name, - std::string puppet_master_name, volk_gnsssdr_test_params_t test_parameters) : - _desc(desc), _kernel_ptr(kernel_ptr), _name(name), _test_parameters(test_parameters), - _puppet_master_name(puppet_master_name) - {}; +class volk_gnsssdr_test_case_t +{ +private: + volk_gnsssdr_func_desc_t _desc; + void (*_kernel_ptr)(); + std::string _name; + volk_gnsssdr_test_params_t _test_parameters; + std::string _puppet_master_name; + +public: + volk_gnsssdr_func_desc_t desc() { return _desc; }; + void (*kernel_ptr())() { return _kernel_ptr; }; + std::string name() { return _name; }; + std::string puppet_master_name() { return _puppet_master_name; }; + volk_gnsssdr_test_params_t test_parameters() { return _test_parameters; }; + // normal ctor + volk_gnsssdr_test_case_t(volk_gnsssdr_func_desc_t desc, void (*kernel_ptr)(), std::string name, + volk_gnsssdr_test_params_t test_parameters) : _desc(desc), _kernel_ptr(kernel_ptr), _name(name), _test_parameters(test_parameters), _puppet_master_name("NULL"){}; + // ctor for puppets + volk_gnsssdr_test_case_t(volk_gnsssdr_func_desc_t desc, void (*kernel_ptr)(), std::string name, + std::string puppet_master_name, volk_gnsssdr_test_params_t test_parameters) : _desc(desc), _kernel_ptr(kernel_ptr), _name(name), _test_parameters(test_parameters), _puppet_master_name(puppet_master_name){}; }; /************************************************ @@ -130,58 +129,57 @@ void random_floats(float *buf, unsigned n); bool run_volk_gnsssdr_tests( volk_gnsssdr_func_desc_t, - void(*)(), + void (*)(), std::string, volk_gnsssdr_test_params_t, std::vector *results = NULL, - std::string puppet_master_name = "NULL" - ); + std::string puppet_master_name = "NULL"); bool run_volk_gnsssdr_tests( - volk_gnsssdr_func_desc_t, - void(*)(), - std::string, - float, - lv_32fc_t, - unsigned int, - unsigned int, - std::vector *results = NULL, - std::string puppet_master_name = "NULL", - bool benchmark_mode = false -); + volk_gnsssdr_func_desc_t, + void (*)(), + std::string, + float, + lv_32fc_t, + unsigned int, + unsigned int, + std::vector *results = NULL, + std::string puppet_master_name = "NULL", + bool benchmark_mode = false); -#define VOLK_RUN_TESTS(func, tol, scalar, len, iter) \ - BOOST_AUTO_TEST_CASE(func##_test) { \ - BOOST_CHECK_EQUAL(run_volk_gnsssdr_tests( \ - func##_get_func_desc(), (void (*)())func##_manual, \ - std::string(#func), tol, scalar, len, iter, 0, "NULL"), \ - 0); \ +#define VOLK_RUN_TESTS(func, tol, scalar, len, iter) \ + BOOST_AUTO_TEST_CASE(func##_test) \ + { \ + BOOST_CHECK_EQUAL(run_volk_gnsssdr_tests( \ + func##_get_func_desc(), (void (*)())func##_manual, \ + std::string(#func), tol, scalar, len, iter, 0, "NULL"), \ + 0); \ } #define VOLK_PROFILE(func, test_params, results) run_volk_gnsssdr_tests(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), test_params, results, "NULL") #define VOLK_PUPPET_PROFILE(func, puppet_master_func, test_params, results) run_volk_gnsssdr_tests(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), test_params, 
results, std::string(#puppet_master_func))

-typedef void (*volk_gnsssdr_fn_1arg)(void *, unsigned int, const char*); //one input, operate in place
-typedef void (*volk_gnsssdr_fn_2arg)(void *, void *, unsigned int, const char*);
-typedef void (*volk_gnsssdr_fn_3arg)(void *, void *, void *, unsigned int, const char*);
-typedef void (*volk_gnsssdr_fn_4arg)(void *, void *, void *, void *, unsigned int, const char*);
-typedef void (*volk_gnsssdr_fn_1arg_s32f)(void *, float, unsigned int, const char*); //one input vector, one scalar float input
-typedef void (*volk_gnsssdr_fn_2arg_s32f)(void *, void *, float, unsigned int, const char*);
-typedef void (*volk_gnsssdr_fn_3arg_s32f)(void *, void *, void *, float, unsigned int, const char*);
-typedef void (*volk_gnsssdr_fn_1arg_s32fc)(void *, lv_32fc_t, unsigned int, const char*); //one input vector, one scalar float input
-typedef void (*volk_gnsssdr_fn_2arg_s32fc)(void *, void *, lv_32fc_t, unsigned int, const char*);
-typedef void (*volk_gnsssdr_fn_3arg_s32fc)(void *, void *, void *, lv_32fc_t, unsigned int, const char*);
+typedef void (*volk_gnsssdr_fn_1arg)(void *, unsigned int, const char *);  //one input, operate in place
+typedef void (*volk_gnsssdr_fn_2arg)(void *, void *, unsigned int, const char *);
+typedef void (*volk_gnsssdr_fn_3arg)(void *, void *, void *, unsigned int, const char *);
+typedef void (*volk_gnsssdr_fn_4arg)(void *, void *, void *, void *, unsigned int, const char *);
+typedef void (*volk_gnsssdr_fn_1arg_s32f)(void *, float, unsigned int, const char *);  //one input vector, one scalar float input
+typedef void (*volk_gnsssdr_fn_2arg_s32f)(void *, void *, float, unsigned int, const char *);
+typedef void (*volk_gnsssdr_fn_3arg_s32f)(void *, void *, void *, float, unsigned int, const char *);
+typedef void (*volk_gnsssdr_fn_1arg_s32fc)(void *, lv_32fc_t, unsigned int, const char *);  //one input vector, one scalar float input
+typedef void (*volk_gnsssdr_fn_2arg_s32fc)(void *, void *, lv_32fc_t, unsigned int, const char *);
+typedef void (*volk_gnsssdr_fn_3arg_s32fc)(void *, void *, void *, lv_32fc_t, unsigned int, const char *);
 //ADDED BY GNSS-SDR. START
-typedef void (*volk_gnsssdr_fn_1arg_s8i)(void *, char, unsigned int, const char*); //one input vector, one scalar char input
-typedef void (*volk_gnsssdr_fn_2arg_s8i)(void *, void *, char, unsigned int, const char*);
-typedef void (*volk_gnsssdr_fn_3arg_s8i)(void *, void *, void *, char, unsigned int, const char*);
-typedef void (*volk_gnsssdr_fn_1arg_s8ic)(void *, lv_8sc_t, unsigned int, const char*); //one input vector, one scalar lv_8sc_t vector input
-typedef void (*volk_gnsssdr_fn_2arg_s8ic)(void *, void *, lv_8sc_t, unsigned int, const char*);
-typedef void (*volk_gnsssdr_fn_3arg_s8ic)(void *, void *, void *, lv_8sc_t, unsigned int, const char*);
-typedef void (*volk_gnsssdr_fn_1arg_s16ic)(void *, lv_16sc_t, unsigned int, const char*); //one input vector, one scalar lv_16sc_t vector input
-typedef void (*volk_gnsssdr_fn_2arg_s16ic)(void *, void *, lv_16sc_t, unsigned int, const char*);
-typedef void (*volk_gnsssdr_fn_3arg_s16ic)(void *, void *, void *, lv_16sc_t, unsigned int, const char*);
+typedef void (*volk_gnsssdr_fn_1arg_s8i)(void *, char, unsigned int, const char *);  //one input vector, one scalar char input
+typedef void (*volk_gnsssdr_fn_2arg_s8i)(void *, void *, char, unsigned int, const char *);
+typedef void (*volk_gnsssdr_fn_3arg_s8i)(void *, void *, void *, char, unsigned int, const char *);
+typedef void (*volk_gnsssdr_fn_1arg_s8ic)(void *, lv_8sc_t, unsigned int, const char *);  //one input vector, one scalar lv_8sc_t vector input
+typedef void (*volk_gnsssdr_fn_2arg_s8ic)(void *, void *, lv_8sc_t, unsigned int, const char *);
+typedef void (*volk_gnsssdr_fn_3arg_s8ic)(void *, void *, void *, lv_8sc_t, unsigned int, const char *);
+typedef void (*volk_gnsssdr_fn_1arg_s16ic)(void *, lv_16sc_t, unsigned int, const char *);  //one input vector, one scalar lv_16sc_t vector input
+typedef void (*volk_gnsssdr_fn_2arg_s16ic)(void *, void *, lv_16sc_t, unsigned int, const char *);
+typedef void (*volk_gnsssdr_fn_3arg_s16ic)(void *, void *, void *, lv_16sc_t, unsigned int, const char *);
 //ADDED BY GNSS-SDR. END

-#endif // GNSS_SDR_VOLK_QA_UTILS_H
+#endif  // GNSS_SDR_VOLK_QA_UTILS_H
diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/testqa.cc b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/testqa.cc
index 6e1f0fb61..7e22442da 100644
--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/testqa.cc
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/testqa.cc
@@ -18,16 +18,16 @@
  */

-#include "kernel_tests.h"           // for init_test_list
-#include "qa_utils.h"               // for volk_gnsssdr_test_case_t, volk_gnsssdr_test_results_t
+#include "kernel_tests.h"  // for init_test_list
+#include "qa_utils.h"      // for volk_gnsssdr_test_case_t, volk_gnsssdr_test_results_t
 #include "volk_gnsssdr/volk_gnsssdr_complex.h"  // for lv_32fc_t
-#include <cstdbool>                 // for bool, false, true
-#include <iostream>                 // for operator<<, basic_ostream, endl, char...
-#include <fstream>                  // IWYU pragma: keep
-#include <map>                      // for map, map<>::iterator, _Rb_tree_iterator
-#include <string>                   // for string, operator<<
-#include <utility>                  // for pair
-#include <vector>                   // for vector
+#include <cstdbool>  // for bool, false, true
+#include <iostream>  // for operator<<, basic_ostream, endl, char...
+#include // IWYU pragma: keep +#include // for map, map<>::iterator, _Rb_tree_iterator +#include // for string, operator<< +#include // for pair +#include // for vector void print_qa_xml(std::vector results, unsigned int nfails); @@ -49,38 +49,44 @@ int main() std::vector qa_failures; std::vector results; // Test every kernel reporting failures when they occur - for(unsigned int ii = 0; ii < test_cases.size(); ++ii) { - bool qa_result = false; - volk_gnsssdr_test_case_t test_case = test_cases[ii]; - try { - qa_result = run_volk_gnsssdr_tests(test_case.desc(), test_case.kernel_ptr(), test_case.name(), - test_case.test_parameters(), &results, test_case.puppet_master_name()); - } - catch(...) { - // TODO: what exceptions might we need to catch and how do we handle them? - std::cerr << "Exception found on kernel: " << test_case.name() << std::endl; - qa_result = false; - } + for (unsigned int ii = 0; ii < test_cases.size(); ++ii) + { + bool qa_result = false; + volk_gnsssdr_test_case_t test_case = test_cases[ii]; + try + { + qa_result = run_volk_gnsssdr_tests(test_case.desc(), test_case.kernel_ptr(), test_case.name(), + test_case.test_parameters(), &results, test_case.puppet_master_name()); + } + catch (...) + { + // TODO: what exceptions might we need to catch and how do we handle them? + std::cerr << "Exception found on kernel: " << test_case.name() << std::endl; + qa_result = false; + } - if(qa_result) { - std::cerr << "Failure on " << test_case.name() << std::endl; - qa_failures.push_back(test_case.name()); + if (qa_result) + { + std::cerr << "Failure on " << test_case.name() << std::endl; + qa_failures.push_back(test_case.name()); + } } - } // Generate XML results print_qa_xml(results, qa_failures.size()); // Summarize QA results std::cerr << "Kernel QA finished: " << qa_failures.size() << " failures out of " - << test_cases.size() << " tests." << std::endl; - if(qa_failures.size() > 0) { - std::cerr << "The following kernels failed QA:" << std::endl; - for(unsigned int ii = 0; ii < qa_failures.size(); ++ii) { - std::cerr << " " << qa_failures[ii] << std::endl; + << test_cases.size() << " tests." << std::endl; + if (qa_failures.size() > 0) + { + std::cerr << "The following kernels failed QA:" << std::endl; + for (unsigned int ii = 0; ii < qa_failures.size(); ++ii) + { + std::cerr << " " << qa_failures[ii] << std::endl; + } + qa_ret_val = 1; } - qa_ret_val = 1; - } return qa_ret_val; } @@ -95,34 +101,34 @@ void print_qa_xml(std::vector results, unsigned int qa_file.open(".unittest/kernels.xml"); qa_file << "" << std::endl; - qa_file << "" << std::endl; + qa_file << "" << std::endl; // Results are in a vector by kernel. 
Each element has a result // map containing time and arch name with test result - for(unsigned int ii=0; ii < results.size(); ++ii) { - volk_gnsssdr_test_results_t result = results[ii]; - qa_file << " " << std::endl; + for (unsigned int ii = 0; ii < results.size(); ++ii) + { + volk_gnsssdr_test_results_t result = results[ii]; + qa_file << " " << std::endl; - std::map::iterator kernel_time_pair; - for(kernel_time_pair = result.results.begin(); kernel_time_pair != result.results.end(); ++kernel_time_pair) { - volk_gnsssdr_test_time_t test_time = kernel_time_pair->second; - qa_file << " " << std::endl; - if(!test_time.pass) - qa_file << " " << - "" << std::endl; - qa_file << " " << std::endl; + std::map::iterator kernel_time_pair; + for (kernel_time_pair = result.results.begin(); kernel_time_pair != result.results.end(); ++kernel_time_pair) + { + volk_gnsssdr_test_time_t test_time = kernel_time_pair->second; + qa_file << " " << std::endl; + if (!test_time.pass) + qa_file << " " + << "" << std::endl; + qa_file << " " << std::endl; + } + qa_file << " " << std::endl; } - qa_file << " " << std::endl; - } qa_file << "" << std::endl; qa_file.close(); - } - diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/volk_gnsssdr_malloc.c b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/volk_gnsssdr_malloc.c index 3f1bcdd44..d92325f48 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/volk_gnsssdr_malloc.c +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/volk_gnsssdr_malloc.c @@ -43,15 +43,16 @@ void *volk_gnsssdr_malloc(size_t size, size_t alignment) return malloc(size); int err = posix_memalign(&ptr, alignment, size); - if(err == 0) + if (err == 0) { return ptr; } else { fprintf(stderr, - "VOLK_GNSSSDR: Error allocating memory " - "(posix_memalign: error %d: %s)\n", err, strerror(err)); + "VOLK_GNSSSDR: Error allocating memory " + "(posix_memalign: error %d: %s)\n", + err, strerror(err)); return NULL; } } @@ -68,7 +69,7 @@ void volk_gnsssdr_free(void *ptr) void *volk_gnsssdr_malloc(size_t size, size_t alignment) { void *ptr = _aligned_malloc(size, alignment); - if(ptr == NULL) + if (ptr == NULL) { fprintf(stderr, "VOLK_GNSSSDR: Error allocating memory (_aligned_malloc)\n"); } @@ -81,7 +82,7 @@ void volk_gnsssdr_free(void *ptr) } // No standard handlers; we'll do it ourselves. 
-#else // _POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600 || HAVE_POSIX_MEMALIGN +#else // _POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600 || HAVE_POSIX_MEMALIGN struct block_info { @@ -102,7 +103,7 @@ volk_gnsssdr_malloc(size_t size, size_t alignment) real = malloc(size + (2 * alignment - 1)); /* Get pointer to the various zones */ - user = (void *)((((uintptr_t) real) + sizeof(struct block_info) + alignment - 1) & ~(alignment - 1)); + user = (void *)((((uintptr_t)real) + sizeof(struct block_info) + alignment - 1) & ~(alignment - 1)); info = (struct block_info *)(((uintptr_t)user) - sizeof(struct block_info)); /* Store the info for the free */ @@ -112,8 +113,7 @@ volk_gnsssdr_malloc(size_t size, size_t alignment) return user; } -void -volk_gnsssdr_free(void *ptr) +void volk_gnsssdr_free(void *ptr) { struct block_info *info; @@ -124,6 +124,6 @@ volk_gnsssdr_free(void *ptr) free(info->real); } -#endif // _POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600 || HAVE_POSIX_MEMALIGN +#endif // _POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600 || HAVE_POSIX_MEMALIGN //#endif // _ISOC11_SOURCE diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/volk_gnsssdr_prefs.c b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/volk_gnsssdr_prefs.c index b77aed467..b9a55a284 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/volk_gnsssdr_prefs.c +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/volk_gnsssdr_prefs.c @@ -26,16 +26,17 @@ void volk_gnsssdr_get_config_path(char *path) { if (!path) return; const char *suffix = "/.volk_gnsssdr/volk_gnsssdr_config"; - const char *suffix2 = "/volk_gnsssdr/volk_gnsssdr_config"; //non-hidden + const char *suffix2 = "/volk_gnsssdr/volk_gnsssdr_config"; // non-hidden char *home = NULL; //allows config redirection via env variable home = getenv("VOLK_CONFIGPATH"); - if(home!=NULL){ - strncpy(path,home,512); - strcat(path,suffix2); - return; - } + if (home != NULL) + { + strncpy(path, home, 512); + strcat(path, suffix2); + return; + } if (home == NULL) home = getenv("HOME"); if (home == NULL) home = getenv("APPDATA"); @@ -57,16 +58,16 @@ size_t volk_gnsssdr_load_preferences(volk_gnsssdr_arch_pref_t **prefs_res) //get the config path volk_gnsssdr_get_config_path(path); - if (!path[0]) return n_arch_prefs; //no prefs found + if (!path[0]) return n_arch_prefs; //no prefs found config_file = fopen(path, "r"); - if(!config_file) return n_arch_prefs; //no prefs found + if (!config_file) return n_arch_prefs; //no prefs found //reset the file pointer and write the prefs into volk_gnsssdr_arch_prefs - while(fgets(line, sizeof(line), config_file) != NULL) + while (fgets(line, sizeof(line), config_file) != NULL) { - prefs = (volk_gnsssdr_arch_pref_t *) realloc(prefs, (n_arch_prefs+1) * sizeof(*prefs)); + prefs = (volk_gnsssdr_arch_pref_t *)realloc(prefs, (n_arch_prefs + 1) * sizeof(*prefs)); volk_gnsssdr_arch_pref_t *p = prefs + n_arch_prefs; - if(sscanf(line, "%s %s %s", p->name, p->impl_a, p->impl_u) == 3 && !strncmp(p->name, "volk_gnsssdr_", 5)) + if (sscanf(line, "%s %s %s", p->name, p->impl_a, p->impl_u) == 3 && !strncmp(p->name, "volk_gnsssdr_", 5)) { n_arch_prefs++; } diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/volk_gnsssdr_rank_archs.c b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/volk_gnsssdr_rank_archs.c index d1871426d..96fa4e77e 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/volk_gnsssdr_rank_archs.c +++ 
b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/volk_gnsssdr_rank_archs.c @@ -29,7 +29,7 @@ inline unsigned __popcnt(unsigned num) { unsigned pop = 0; - while(num) + while (num) { if (num & 0x1) pop++; num >>= 1; @@ -39,15 +39,15 @@ inline unsigned __popcnt(unsigned num) #endif int volk_gnsssdr_get_index( - const char *impl_names[], //list of implementations by name - const size_t n_impls, //number of implementations available - const char *impl_name //the implementation name to find - ) + const char *impl_names[], //list of implementations by name + const size_t n_impls, //number of implementations available + const char *impl_name //the implementation name to find +) { unsigned int i; for (i = 0; i < n_impls; i++) { - if(!strncmp(impl_names[i], impl_name, 20)) + if (!strncmp(impl_names[i], impl_name, 20)) { return i; } @@ -55,24 +55,24 @@ int volk_gnsssdr_get_index( //TODO return -1; //something terrible should happen here fprintf(stderr, "VOLK_GNSSSDR warning: no arch found, returning generic impl\n"); - return volk_gnsssdr_get_index(impl_names, n_impls, "generic"); //but we'll fake it for now + return volk_gnsssdr_get_index(impl_names, n_impls, "generic"); //but we'll fake it for now } int volk_gnsssdr_rank_archs( - const char *kern_name, //name of the kernel to rank - const char *impl_names[], //list of implementations by name - const int* impl_deps, //requirement mask per implementation - const bool* alignment, //alignment status of each implementation - size_t n_impls, //number of implementations available - const bool align //if false, filter aligned implementations + const char *kern_name, //name of the kernel to rank + const char *impl_names[], //list of implementations by name + const int *impl_deps, //requirement mask per implementation + const bool *alignment, //alignment status of each implementation + size_t n_impls, //number of implementations available + const bool align //if false, filter aligned implementations ) { size_t i; static volk_gnsssdr_arch_pref_t *volk_gnsssdr_arch_prefs; static size_t n_arch_prefs = 0; static int prefs_loaded = 0; - if(!prefs_loaded) + if (!prefs_loaded) { n_arch_prefs = volk_gnsssdr_load_preferences(&volk_gnsssdr_arch_prefs); prefs_loaded = 1; @@ -81,17 +81,17 @@ int volk_gnsssdr_rank_archs( // If we've defined VOLK_GENERIC to be anything, always return the // 'generic' kernel. Used in GR's QA code. char *gen_env = getenv("VOLK_GENERIC"); - if(gen_env) + if (gen_env) { return volk_gnsssdr_get_index(impl_names, n_impls, "generic"); } //now look for the function name in the prefs list - for(i = 0; i < n_arch_prefs; i++) + for (i = 0; i < n_arch_prefs; i++) { - if(!strncmp(kern_name, volk_gnsssdr_arch_prefs[i].name, sizeof(volk_gnsssdr_arch_prefs[i].name))) //found it + if (!strncmp(kern_name, volk_gnsssdr_arch_prefs[i].name, sizeof(volk_gnsssdr_arch_prefs[i].name))) //found it { - const char *impl_name = align? volk_gnsssdr_arch_prefs[i].impl_a : volk_gnsssdr_arch_prefs[i].impl_u; + const char *impl_name = align ? 
volk_gnsssdr_arch_prefs[i].impl_a : volk_gnsssdr_arch_prefs[i].impl_u; return volk_gnsssdr_get_index(impl_names, n_impls, impl_name); } } @@ -101,7 +101,7 @@ int volk_gnsssdr_rank_archs( size_t best_index_u = 0; int best_value_a = -1; int best_value_u = -1; - for(i = 0; i < n_impls; i++) + for (i = 0; i < n_impls; i++) { const signed val = __popcnt(impl_deps[i]); if (alignment[i] && val > best_value_a) diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/volk_gnsssdr_rank_archs.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/volk_gnsssdr_rank_archs.h index 312fb9f47..ba0638a54 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/volk_gnsssdr_rank_archs.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/lib/volk_gnsssdr_rank_archs.h @@ -23,23 +23,24 @@ #include #ifdef __cplusplus -extern "C" { +extern "C" +{ #endif -int volk_gnsssdr_get_index( - const char *impl_names[], //list of implementations by name - const size_t n_impls, //number of implementations available - const char *impl_name //the implementation name to find -); + int volk_gnsssdr_get_index( + const char *impl_names[], //list of implementations by name + const size_t n_impls, //number of implementations available + const char *impl_name //the implementation name to find + ); -int volk_gnsssdr_rank_archs( - const char *kern_name, //name of the kernel to rank - const char *impl_names[], //list of implementations by name - const int* impl_deps, //requirement mask per implementation - const bool* alignment, //alignment status of each implementation - size_t n_impls, //number of implementations available - const bool align //if false, filter aligned implementations -); + int volk_gnsssdr_rank_archs( + const char *kern_name, //name of the kernel to rank + const char *impl_names[], //list of implementations by name + const int *impl_deps, //requirement mask per implementation + const bool *alignment, //alignment status of each implementation + size_t n_impls, //number of implementations available + const bool align //if false, filter aligned implementations + ); #ifdef __cplusplus } diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr.tmpl.c b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr.tmpl.c index 95f5f057c..482f0e461 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr.tmpl.c +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr.tmpl.c @@ -31,80 +31,90 @@ static intptr_t __alignment_mask = 0; struct volk_gnsssdr_machine *get_machine(void) { - extern struct volk_gnsssdr_machine *volk_gnsssdr_machines[]; - extern unsigned int n_volk_gnsssdr_machines; - static struct volk_gnsssdr_machine *machine = NULL; + extern struct volk_gnsssdr_machine *volk_gnsssdr_machines[]; + extern unsigned int n_volk_gnsssdr_machines; + static struct volk_gnsssdr_machine *machine = NULL; - if(machine != NULL) - return machine; - else { - unsigned int max_score = 0; - unsigned int i; - struct volk_gnsssdr_machine *max_machine = NULL; - for(i=0; icaps & (~volk_gnsssdr_get_lvarch()))) { - if(volk_gnsssdr_machines[i]->caps > max_score) { - max_score = volk_gnsssdr_machines[i]->caps; - max_machine = volk_gnsssdr_machines[i]; + if (machine != NULL) + return machine; + else + { + unsigned int max_score = 0; + unsigned int i; + struct volk_gnsssdr_machine *max_machine = NULL; + for (i = 0; i < n_volk_gnsssdr_machines; i++) + { + if (!(volk_gnsssdr_machines[i]->caps & (~volk_gnsssdr_get_lvarch()))) 
+ { + if (volk_gnsssdr_machines[i]->caps > max_score) + { + max_score = volk_gnsssdr_machines[i]->caps; + max_machine = volk_gnsssdr_machines[i]; + } + } + } + machine = max_machine; + //printf("Using Volk machine: %s\n", machine->name); + __alignment = machine->alignment; + __alignment_mask = (intptr_t)(__alignment - 1); + return machine; } - } - } - machine = max_machine; - //printf("Using Volk machine: %s\n", machine->name); - __alignment = machine->alignment; - __alignment_mask = (intptr_t)(__alignment-1); - return machine; - } } void volk_gnsssdr_list_machines(void) { - extern struct volk_gnsssdr_machine *volk_gnsssdr_machines[]; - extern unsigned int n_volk_gnsssdr_machines; + extern struct volk_gnsssdr_machine *volk_gnsssdr_machines[]; + extern unsigned int n_volk_gnsssdr_machines; - unsigned int i; - for(i=0; icaps & (~volk_gnsssdr_get_lvarch()))) { - printf("%s;", volk_gnsssdr_machines[i]->name); - } - } - printf("\n"); + unsigned int i; + for (i = 0; i < n_volk_gnsssdr_machines; i++) + { + if (!(volk_gnsssdr_machines[i]->caps & (~volk_gnsssdr_get_lvarch()))) + { + printf("%s;", volk_gnsssdr_machines[i]->name); + } + } + printf("\n"); } -const char* volk_gnsssdr_get_machine(void) +const char *volk_gnsssdr_get_machine(void) { - extern struct volk_gnsssdr_machine *volk_gnsssdr_machines[]; - extern unsigned int n_volk_gnsssdr_machines; - static struct volk_gnsssdr_machine *machine = NULL; + extern struct volk_gnsssdr_machine *volk_gnsssdr_machines[]; + extern unsigned int n_volk_gnsssdr_machines; + static struct volk_gnsssdr_machine *machine = NULL; - if(machine != NULL) - return machine->name; - else { - unsigned int max_score = 0; - unsigned int i; - struct volk_gnsssdr_machine *max_machine = NULL; - for(i=0; icaps & (~volk_gnsssdr_get_lvarch()))) { - if(volk_gnsssdr_machines[i]->caps > max_score) { - max_score = volk_gnsssdr_machines[i]->caps; - max_machine = volk_gnsssdr_machines[i]; + if (machine != NULL) + return machine->name; + else + { + unsigned int max_score = 0; + unsigned int i; + struct volk_gnsssdr_machine *max_machine = NULL; + for (i = 0; i < n_volk_gnsssdr_machines; i++) + { + if (!(volk_gnsssdr_machines[i]->caps & (~volk_gnsssdr_get_lvarch()))) + { + if (volk_gnsssdr_machines[i]->caps > max_score) + { + max_score = volk_gnsssdr_machines[i]->caps; + max_machine = volk_gnsssdr_machines[i]; + } + } + } + machine = max_machine; + return machine->name; } - } - } - machine = max_machine; - return machine->name; - } } size_t volk_gnsssdr_get_alignment(void) { - get_machine(); //ensures alignment is set + get_machine(); //ensures alignment is set return __alignment; } bool volk_gnsssdr_is_aligned(const void *ptr) { - return ((intptr_t)(ptr) & __alignment_mask) == 0; + return ((intptr_t)(ptr)&__alignment_mask) == 0; } #define LV_HAVE_GENERIC @@ -113,13 +123,12 @@ bool volk_gnsssdr_is_aligned(const void *ptr) %for kern in kernels: %if kern.has_dispatcher: -#include //pulls in the dispatcher +#include //pulls in the dispatcher %endif static inline void __${kern.name}_d(${kern.arglist_full}) { - %if kern.has_dispatcher: - ${kern.name}_dispatcher(${kern.arglist_names}); + % if kern.has_dispatcher : ${kern.name} _dispatcher(${kern.arglist_names}); return; %endif @@ -131,41 +140,41 @@ static inline void __${kern.name}_d(${kern.arglist_full}) %endfor 0<% end_open_parens = ')'*num_open_parens %>${end_open_parens} )){ - ${kern.name}_a(${kern.arglist_names}); + ${kern.name} _a(${kern.arglist_names}); } else{ - ${kern.name}_u(${kern.arglist_names}); + ${kern.name} 
_u(${kern.arglist_names}); } } static inline void __init_${kern.name}(void) { - const char *name = get_machine()->${kern.name}_name; - const char **impl_names = get_machine()->${kern.name}_impl_names; - const int *impl_deps = get_machine()->${kern.name}_impl_deps; - const bool *alignment = get_machine()->${kern.name}_impl_alignment; - const size_t n_impls = get_machine()->${kern.name}_n_impls; - const size_t index_a = volk_gnsssdr_rank_archs(name, impl_names, impl_deps, alignment, n_impls, true/*aligned*/); - const size_t index_u = volk_gnsssdr_rank_archs(name, impl_names, impl_deps, alignment, n_impls, false/*unaligned*/); - ${kern.name}_a = get_machine()->${kern.name}_impls[index_a]; - ${kern.name}_u = get_machine()->${kern.name}_impls[index_u]; + const char *name = get_machine()->${kern.name} _name; + const char **impl_names = get_machine()->${kern.name} _impl_names; + const int *impl_deps = get_machine()->${kern.name} _impl_deps; + const bool *alignment = get_machine()->${kern.name} _impl_alignment; + const size_t n_impls = get_machine()->${kern.name} _n_impls; + const size_t index_a = volk_gnsssdr_rank_archs(name, impl_names, impl_deps, alignment, n_impls, true /*aligned*/); + const size_t index_u = volk_gnsssdr_rank_archs(name, impl_names, impl_deps, alignment, n_impls, false /*unaligned*/); + ${kern.name} _a = get_machine()->${kern.name} _impls[index_a]; + ${kern.name} _u = get_machine()->${kern.name} _impls[index_u]; - assert(${kern.name}_a); - assert(${kern.name}_u); + assert(${kern.name} _a); + assert(${kern.name} _u); - ${kern.name} = &__${kern.name}_d; + ${kern.name} = &__${kern.name} _d; } -static inline void __${kern.name}_a(${kern.arglist_full}) +static inline void __${kern.name} _a(${kern.arglist_full}) { __init_${kern.name}(); - ${kern.name}_a(${kern.arglist_names}); + ${kern.name} _a(${kern.arglist_names}); } -static inline void __${kern.name}_u(${kern.arglist_full}) +static inline void __${kern.name} _u(${kern.arglist_full}) { __init_${kern.name}(); - ${kern.name}_u(${kern.arglist_names}); + ${kern.name} _u(${kern.arglist_names}); } static inline void __${kern.name}(${kern.arglist_full}) @@ -174,34 +183,32 @@ static inline void __${kern.name}(${kern.arglist_full}) ${kern.name}(${kern.arglist_names}); } -${kern.pname} ${kern.name}_a = &__${kern.name}_a; -${kern.pname} ${kern.name}_u = &__${kern.name}_u; -${kern.pname} ${kern.name} = &__${kern.name}; +${kern.pname} ${kern.name} _a = &__${kern.name} _a; +${kern.pname} ${kern.name} _u = &__${kern.name} _u; +${kern.pname} ${kern.name} = &__${kern.name}; -void ${kern.name}_manual(${kern.arglist_full}, const char* impl_name) +void ${kern.name} _manual(${kern.arglist_full}, const char *impl_name) { const int index = volk_gnsssdr_get_index( - get_machine()->${kern.name}_impl_names, - get_machine()->${kern.name}_n_impls, - impl_name - ); - get_machine()->${kern.name}_impls[index]( - ${kern.arglist_names} - ); + get_machine()->${kern.name} _impl_names, + get_machine()->${kern.name} _n_impls, + impl_name); + get_machine()->${kern.name} _impls[index]( + ${kern.arglist_names}); } -volk_gnsssdr_func_desc_t ${kern.name}_get_func_desc(void) { - const char **impl_names = get_machine()->${kern.name}_impl_names; - const int *impl_deps = get_machine()->${kern.name}_impl_deps; - const bool *alignment = get_machine()->${kern.name}_impl_alignment; - const size_t n_impls = get_machine()->${kern.name}_n_impls; +volk_gnsssdr_func_desc_t ${kern.name} _get_func_desc(void) +{ + const char **impl_names = get_machine()->${kern.name} _impl_names; + 
const int *impl_deps = get_machine()->${kern.name} _impl_deps; + const bool *alignment = get_machine()->${kern.name} _impl_alignment; + const size_t n_impls = get_machine()->${kern.name} _n_impls; volk_gnsssdr_func_desc_t desc = { impl_names, impl_deps, alignment, - n_impls - }; + n_impls}; return desc; } -%endfor +% endfor diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr.tmpl.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr.tmpl.h index 556d67f8e..133eef3c3 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr.tmpl.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr.tmpl.h @@ -42,7 +42,7 @@ typedef struct volk_gnsssdr_func_desc VOLK_API void volk_gnsssdr_list_machines(void); //! Returns the name of the machine this instance will use -VOLK_API const char* volk_gnsssdr_get_machine(void); +VOLK_API const char *volk_gnsssdr_get_machine(void); //! Get the machine alignment in bytes VOLK_API size_t volk_gnsssdr_get_alignment(void); @@ -74,19 +74,19 @@ VOLK_API bool volk_gnsssdr_is_aligned(const void *ptr); extern VOLK_API ${kern.pname} ${kern.name}; //! A function pointer to the fastest aligned implementation -extern VOLK_API ${kern.pname} ${kern.name}_a; +extern VOLK_API ${kern.pname} ${kern.name} _a; //! A function pointer to the fastest unaligned implementation -extern VOLK_API ${kern.pname} ${kern.name}_u; +extern VOLK_API ${kern.pname} ${kern.name} _u; //! Call into a specific implementation given by name -extern VOLK_API void ${kern.name}_manual(${kern.arglist_full}, const char* impl_name); +extern VOLK_API void ${kern.name} _manual(${kern.arglist_full}, const char *impl_name); //! Get description parameters for this kernel -extern VOLK_API volk_gnsssdr_func_desc_t ${kern.name}_get_func_desc(void); -%endfor +extern VOLK_API volk_gnsssdr_func_desc_t ${kern.name} _get_func_desc(void); +% endfor -__VOLK_DECL_END + __VOLK_DECL_END #endif /*INCLUDED_VOLK_GNSSSDR_RUNTIME*/ diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr_config_fixed.tmpl.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr_config_fixed.tmpl.h index ed55d0b58..c941407b9 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr_config_fixed.tmpl.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr_config_fixed.tmpl.h @@ -21,7 +21,8 @@ %for i, arch in enumerate(archs): //#ifndef LV_${arch.name.upper()} -#define LV_${arch.name.upper()} ${i} +#define LV_$ \ + {arch.name.upper()} $ { i } //#endif %endfor diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr_cpu.tmpl.c b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr_cpu.tmpl.c index 1d094a87a..b93781a70 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr_cpu.tmpl.c +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr_cpu.tmpl.c @@ -24,50 +24,54 @@ struct VOLK_CPU volk_gnsssdr_cpu; #if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64) - #define VOLK_CPU_x86 +#define VOLK_CPU_x86 #endif #if defined(VOLK_CPU_x86) //implement get cpuid for gcc compilers using a system or local copy of cpuid.h #if defined(__GNUC__) - #include - #define cpuid_x86(op, r) __get_cpuid(op, (unsigned int *)r+0, (unsigned int *)r+1, (unsigned int *)r+2, (unsigned int *)r+3) - #define cpuid_x86_count(op, count, regs) __cpuid_count(op, count, 
*((unsigned int*)regs), *((unsigned int*)regs+1), *((unsigned int*)regs+2), *((unsigned int*)regs+3)) +#include +#define cpuid_x86(op, r) __get_cpuid(op, (unsigned int *)r + 0, (unsigned int *)r + 1, (unsigned int *)r + 2, (unsigned int *)r + 3) +#define cpuid_x86_count(op, count, regs) __cpuid_count(op, count, *((unsigned int *)regs), *((unsigned int *)regs + 1), *((unsigned int *)regs + 2), *((unsigned int *)regs + 3)) - /* Return Intel AVX extended CPU capabilities register. +/* Return Intel AVX extended CPU capabilities register. * This function will bomb on non-AVX-capable machines, so * check for AVX capability before executing. */ - #if ((__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 2) || (__clang_major__ >= 3)) && defined(HAVE_XGETBV) - static inline unsigned long long _xgetbv(unsigned int index){ - unsigned int eax, edx; - __VOLK_ASM __VOLK_VOLATILE ("xgetbv" : "=a"(eax), "=d"(edx) : "c"(index)); - return ((unsigned long long)edx << 32) | eax; - } - #define __xgetbv() _xgetbv(0) - #else - #define __xgetbv() 0 - #endif +#if ((__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 2) || (__clang_major__ >= 3)) && defined(HAVE_XGETBV) +static inline unsigned long long _xgetbv(unsigned int index) +{ + unsigned int eax, edx; + __VOLK_ASM __VOLK_VOLATILE("xgetbv" + : "=a"(eax), "=d"(edx) + : "c"(index)); + return ((unsigned long long)edx << 32) | eax; +} +#define __xgetbv() _xgetbv(0) +#else +#define __xgetbv() 0 +#endif //implement get cpuid for MSVC compilers using __cpuid intrinsic #elif defined(_MSC_VER) && defined(HAVE_INTRIN_H) - #include - #define cpuid_x86(op, r) __cpuid(((int*)r), op) +#include +#define cpuid_x86(op, r) __cpuid(((int *)r), op) - #if defined(_XCR_XFEATURE_ENABLED_MASK) - #define __xgetbv() _xgetbv(_XCR_XFEATURE_ENABLED_MASK) - #else - #define __xgetbv() 0 - #endif +#if defined(_XCR_XFEATURE_ENABLED_MASK) +#define __xgetbv() _xgetbv(_XCR_XFEATURE_ENABLED_MASK) +#else +#define __xgetbv() 0 +#endif #else - #error "A get cpuid for volk_gnsssdr is not available on this compiler..." -#endif //defined(__GNUC__) +#error "A get cpuid for volk_gnsssdr is not available on this compiler..." 
+#endif //defined(__GNUC__) -#endif //defined(VOLK_CPU_x86) +#endif //defined(VOLK_CPU_x86) -static inline unsigned int cpuid_count_x86_bit(unsigned int level, unsigned int count, unsigned int reg, unsigned int bit) { +static inline unsigned int cpuid_count_x86_bit(unsigned int level, unsigned int count, unsigned int reg, unsigned int bit) +{ #if defined(VOLK_CPU_x86) unsigned int regs[4] = {0}; cpuid_x86_count(level, count, regs); @@ -77,10 +81,11 @@ static inline unsigned int cpuid_count_x86_bit(unsigned int level, unsigned int #endif } -static inline unsigned int cpuid_x86_bit(unsigned int reg, unsigned int op, unsigned int bit) { +static inline unsigned int cpuid_x86_bit(unsigned int reg, unsigned int op, unsigned int bit) +{ #if defined(VOLK_CPU_x86) unsigned int regs[4]; - memset(regs, 0, sizeof(unsigned int)*4); + memset(regs, 0, sizeof(unsigned int) * 4); cpuid_x86(op, regs); return regs[reg] >> bit & 0x01; #else @@ -88,10 +93,11 @@ static inline unsigned int cpuid_x86_bit(unsigned int reg, unsigned int op, unsi #endif } -static inline unsigned int check_extended_cpuid(unsigned int val) { +static inline unsigned int check_extended_cpuid(unsigned int val) +{ #if defined(VOLK_CPU_x86) unsigned int regs[4]; - memset(regs, 0, sizeof(unsigned int)*4); + memset(regs, 0, sizeof(unsigned int) * 4); cpuid_x86(0x80000000, regs); return regs[0] >= val; #else @@ -99,7 +105,8 @@ static inline unsigned int check_extended_cpuid(unsigned int val) { #endif } -static inline unsigned int get_avx_enabled(void) { +static inline unsigned int get_avx_enabled(void) +{ #if defined(VOLK_CPU_x86) return __xgetbv() & 0x6; #else @@ -107,7 +114,8 @@ static inline unsigned int get_avx_enabled(void) { #endif } -static inline unsigned int get_avx2_enabled(void) { +static inline unsigned int get_avx2_enabled(void) +{ #if defined(VOLK_CPU_x86) return __xgetbv() & 0x6; #else @@ -117,28 +125,30 @@ static inline unsigned int get_avx2_enabled(void) { //neon detection is linux specific #if defined(__arm__) && defined(__linux__) - #include - #include - #include - #define VOLK_CPU_ARM +#include +#include +#include +#define VOLK_CPU_ARM #endif -static int has_neon(void){ +static int has_neon(void) +{ #if defined(VOLK_CPU_ARM) FILE *auxvec_f; unsigned long auxvec[2]; unsigned int found_neon = 0; auxvec_f = fopen("/proc/self/auxv", "rb"); - if(!auxvec_f) return 0; + if (!auxvec_f) return 0; size_t r = 1; //so auxv is basically 32b of ID and 32b of value //so it goes like this - while(!found_neon && r) { - r = fread(auxvec, sizeof(unsigned long), 2, auxvec_f); - if((auxvec[0] == AT_HWCAP) && (auxvec[1] & HWCAP_NEON)) - found_neon = 1; - } + while (!found_neon && r) + { + r = fread(auxvec, sizeof(unsigned long), 2, auxvec_f); + if ((auxvec[0] == AT_HWCAP) && (auxvec[1] & HWCAP_NEON)) + found_neon = 1; + } fclose(auxvec_f); return found_neon; @@ -148,50 +158,59 @@ static int has_neon(void){ } %for arch in archs: -static int i_can_has_${arch.name} (void) { +static int i_can_has_${arch.name} (void) +{ %for check, params in arch.checks: if (${check}(<% joined_params = ', '.join(params)%>${joined_params}) == 0) return 0; - %endfor - return 1; + % endfor return 1; } -%endfor +% endfor #if defined(HAVE_FENV_H) - #if defined(FE_TONEAREST) - #include - static inline void set_float_rounding(void){ - fesetround(FE_TONEAREST); - } - #else - static inline void set_float_rounding(void){ - //do nothing - } - #endif -#elif defined(_MSC_VER) - #include - static inline void set_float_rounding(void){ - unsigned int cwrd; - _controlfp_s(&cwrd, 0, 
0); - _controlfp_s(&cwrd, _RC_NEAR, _MCW_RC); - } +#if defined(FE_TONEAREST) +#include + static inline void + set_float_rounding(void) +{ + fesetround(FE_TONEAREST); +} #else - static inline void set_float_rounding(void){ - //do nothing - } + static inline void + set_float_rounding(void) +{ + //do nothing +} +#endif +#elif defined(_MSC_VER) +#include + static inline void + set_float_rounding(void) +{ + unsigned int cwrd; + _controlfp_s(&cwrd, 0, 0); + _controlfp_s(&cwrd, _RC_NEAR, _MCW_RC); +} +#else + static inline void + set_float_rounding(void) +{ + //do nothing +} #endif -void volk_gnsssdr_cpu_init() { +void volk_gnsssdr_cpu_init() +{ %for arch in archs: volk_gnsssdr_cpu.has_${arch.name} = &i_can_has_${arch.name}; - %endfor - set_float_rounding(); + % endfor + set_float_rounding(); } -unsigned int volk_gnsssdr_get_lvarch() { +unsigned int volk_gnsssdr_get_lvarch() +{ unsigned int retval = 0; volk_gnsssdr_cpu_init(); %for arch in archs: retval += volk_gnsssdr_cpu.has_${arch.name}() << LV_${arch.name.upper()}; - %endfor - return retval; + % endfor return retval; } diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr_cpu.tmpl.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr_cpu.tmpl.h index 20dbac2cc..160274eba 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr_cpu.tmpl.h +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr_cpu.tmpl.h @@ -23,16 +23,17 @@ __VOLK_DECL_BEGIN -struct VOLK_CPU { +struct VOLK_CPU +{ %for arch in archs: int (*has_${arch.name}) (); - %endfor + % endfor }; extern struct VOLK_CPU volk_gnsssdr_cpu; -void volk_gnsssdr_cpu_init (); -unsigned int volk_gnsssdr_get_lvarch (); +void volk_gnsssdr_cpu_init(); +unsigned int volk_gnsssdr_get_lvarch(); __VOLK_DECL_END diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr_machine_xxx.tmpl.c b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr_machine_xxx.tmpl.c index c6182cb50..8e0e7ebd3 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr_machine_xxx.tmpl.c +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr_machine_xxx.tmpl.c @@ -20,7 +20,11 @@ <% arch_names = this_machine.arch_names %> %for arch in this_machine.archs: -#define LV_HAVE_${arch.name.upper()} 1 +#define LV_HAVE_$ \ + { \ + arch.name.upper() \ + } \ + 1 %endfor #include @@ -35,7 +39,9 @@ #include %endfor -struct volk_gnsssdr_machine volk_gnsssdr_machine_${this_machine.name} = { +struct volk_gnsssdr_machine volk_gnsssdr_machine_$ +{ + this_machine.name} = { <% make_arch_have_list = (' | '.join(['(1 << LV_%s)'%a.name.upper() for a in this_machine.archs])) %> ${make_arch_have_list}, <% this_machine_name = "\""+this_machine.name+"\"" %> ${this_machine_name}, ${this_machine.alignment}, diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr_machines.tmpl.c b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr_machines.tmpl.c index 1485a34e0..3e78b65e3 100644 --- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr_machines.tmpl.c +++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr_machines.tmpl.c @@ -22,10 +22,10 @@ struct volk_gnsssdr_machine *volk_gnsssdr_machines[] = { %for machine in machines: -#ifdef LV_MACHINE_${machine.name.upper()} +#ifdef LV_MACHINE_${machine.name.upper() } &volk_gnsssdr_machine_${machine.name}, #endif %endfor }; -unsigned int 
n_volk_gnsssdr_machines = sizeof(volk_gnsssdr_machines)/sizeof(*volk_gnsssdr_machines);
+unsigned int n_volk_gnsssdr_machines = sizeof(volk_gnsssdr_machines) / sizeof(*volk_gnsssdr_machines);
diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr_machines.tmpl.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr_machines.tmpl.h
index 10e955e25..3e2cf8d2b 100644
--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr_machines.tmpl.h
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr_machines.tmpl.h
@@ -27,26 +27,30 @@

 __VOLK_DECL_BEGIN

-struct volk_gnsssdr_machine {
-    const unsigned int caps; //capabilities (i.e., archs compiled into this machine, in the volk_gnsssdr_get_lvarch format)
+struct volk_gnsssdr_machine
+{
+    const unsigned int caps;  //capabilities (i.e., archs compiled into this machine, in the volk_gnsssdr_get_lvarch format)
     const char *name;
-    const size_t alignment; //the maximum byte alignment required for functions in this library
+    const size_t alignment;  //the maximum byte alignment required for functions in this library
     %for kern in kernels:
     const char *${kern.name}_name;
-    const char *${kern.name}_impl_names[<%len_archs=len(archs)%>${len_archs}];
-    const int ${kern.name}_impl_deps[${len_archs}];
-    const bool ${kern.name}_impl_alignment[${len_archs}];
-    const ${kern.pname} ${kern.name}_impls[${len_archs}];
-    const size_t ${kern.name}_n_impls;
-    %endfor
+    const char *${kern.name} _impl_names[<% len_archs = len(archs) %> ${len_archs}];
+    const int ${kern.name} _impl_deps[${len_archs}];
+    const bool ${kern.name} _impl_alignment[${len_archs}];
+    const ${kern.pname} ${kern.name} _impls[${len_archs}];
+    const size_t ${kern.name} _n_impls;
+    % endfor
 };

 %for machine in machines:
-#ifdef LV_MACHINE_${machine.name.upper()}
-extern struct volk_gnsssdr_machine volk_gnsssdr_machine_${machine.name};
+#ifdef LV_MACHINE_${machine.name.upper() }
+extern struct volk_gnsssdr_machine volk_gnsssdr_machine_$
+{
+    machine.name
+};
 #endif
-%endfor
+% endfor

-__VOLK_DECL_END
+    __VOLK_DECL_END

-#endif //INCLUDED_LIBVOLK_GNSSSDR_MACHINES_H
+#endif  //INCLUDED_LIBVOLK_GNSSSDR_MACHINES_H
diff --git a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr_typedefs.tmpl.h b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr_typedefs.tmpl.h
index def7e24c3..e28aa5392 100644
--- a/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr_typedefs.tmpl.h
+++ b/src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/tmpl/volk_gnsssdr_typedefs.tmpl.h
@@ -24,6 +24,6 @@

 %for kern in kernels:
 typedef void (*${kern.pname})(${kern.arglist_types});
-%endfor
+% endfor

 #endif /*INCLUDED_VOLK_GNSSSDR_TYPEDEFS*/
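
For readers tracing the QA changes in this patch: the volk_gnsssdr_fn_* typedefs in qa_utils.h give run_volk_gnsssdr_tests() a uniform calling shape for every kernel arity and scalar type; the switch on both_sigs.size() casts manual_func to the matching pointer type and invokes it once per architecture. The following is a minimal, self-contained sketch of that pattern only; toy_scale_manual and its main() driver are hypothetical stand-ins for a generated kernel and are not part of this patch.

// Illustrative sketch: how a kernel's "manual" entry point matches one of the
// QA harness's function-pointer typedefs (volk_gnsssdr_fn_2arg_s32f here).
#include <cstdio>

typedef void (*volk_gnsssdr_fn_2arg_s32f)(void *, void *, float, unsigned int, const char *);

// Hypothetical kernel with the same shape as a generated ${kern.name}_manual():
// buffers passed as void*, one scalar float, the vector length, and an
// implementation name (which the real dispatcher uses to pick an arch).
static void toy_scale_manual(void *out, void *in, float scalar, unsigned int num_points, const char *impl_name)
{
    (void)impl_name;  // the toy version has only one implementation
    float *o = (float *)out;
    const float *i = (const float *)in;
    for (unsigned int n = 0; n < num_points; n++) o[n] = scalar * i[n];
}

int main()
{
    float in[4] = {1.0f, 2.0f, 3.0f, 4.0f};
    float out[4];
    // The harness does essentially this cast-and-call per architecture
    // (cf. run_cast_test2_s32f in qa_utils.cc, which receives manual_func
    // as a generic void (*)() and casts it to the typedef):
    volk_gnsssdr_fn_2arg_s32f fn = toy_scale_manual;
    fn(out, in, 2.0f, 4, "generic");
    std::printf("%f %f %f %f\n", out[0], out[1], out[2], out[3]);
    return 0;
}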