mirror of
https://github.com/gnss-sdr/gnss-sdr
synced 2024-12-15 12:40:35 +00:00
Apply automated code formatting to volk-gnsssdr
See http://gnss-sdr.org/coding-style/#use-tools-for-automated-code-formatting
This commit is contained in:
parent
f924005733
commit
891478cf2c
@ -20,30 +20,30 @@
|
|||||||
#include <config.h>
|
#include <config.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#include "volk_gnsssdr/volk_gnsssdr.h" // for volk_gnsssdr_get_alignment, volk_gnsssdr_get_machine
|
#include "volk_gnsssdr/volk_gnsssdr.h" // for volk_gnsssdr_get_alignment, volk_gnsssdr_get_machine
|
||||||
#include "volk_gnsssdr_option_helpers.h" // for option_list, option_t
|
#include "volk_gnsssdr_option_helpers.h" // for option_list, option_t
|
||||||
#include <volk_gnsssdr/constants.h> // for volk_gnsssdr_available_machines, volk_gnsssdr_c_compiler ...
|
#include <volk_gnsssdr/constants.h> // for volk_gnsssdr_available_machines, volk_gnsssdr_c_compiler ...
|
||||||
#include <iostream> // for operator<<, endl, cout, ostream
|
#include <iostream> // for operator<<, endl, cout, ostream
|
||||||
#include <string> // for string
|
#include <string> // for string
|
||||||
|
|
||||||
void print_alignment()
|
void print_alignment()
|
||||||
{
|
{
|
||||||
std::cout << "Alignment in bytes: " << volk_gnsssdr_get_alignment() << std::endl;
|
std::cout << "Alignment in bytes: " << volk_gnsssdr_get_alignment() << std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void print_malloc()
|
void print_malloc()
|
||||||
{
|
{
|
||||||
// You don't want to change the volk_malloc code, so just copy the if/else
|
// You don't want to change the volk_malloc code, so just copy the if/else
|
||||||
// structure from there and give an explanation for the implementations
|
// structure from there and give an explanation for the implementations
|
||||||
std::cout << "Used malloc implementation: ";
|
std::cout << "Used malloc implementation: ";
|
||||||
#if _POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600 || HAVE_POSIX_MEMALIGN
|
#if _POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600 || HAVE_POSIX_MEMALIGN
|
||||||
std::cout << "posix_memalign" << std::endl;
|
std::cout << "posix_memalign" << std::endl;
|
||||||
#elif _MSC_VER >= 1400
|
#elif _MSC_VER >= 1400
|
||||||
std::cout << "aligned_malloc" << std::endl;
|
std::cout << "aligned_malloc" << std::endl;
|
||||||
#else
|
#else
|
||||||
std::cout << "No standard handler available, using own implementation." << std::endl;
|
std::cout << "No standard handler available, using own implementation." << std::endl;
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -54,22 +54,24 @@ int main(int argc, char **argv)
|
|||||||
our_options.add(option_t("cc", "", "print the VOLK_GNSSDR C compiler version", volk_gnsssdr_c_compiler()));
|
our_options.add(option_t("cc", "", "print the VOLK_GNSSDR C compiler version", volk_gnsssdr_c_compiler()));
|
||||||
our_options.add(option_t("cflags", "", "print the VOLK_GNSSSDR CFLAGS", volk_gnsssdr_compiler_flags()));
|
our_options.add(option_t("cflags", "", "print the VOLK_GNSSSDR CFLAGS", volk_gnsssdr_compiler_flags()));
|
||||||
our_options.add(option_t("all-machines", "", "print VOLK_GNSSSDR machines built", volk_gnsssdr_available_machines()));
|
our_options.add(option_t("all-machines", "", "print VOLK_GNSSSDR machines built", volk_gnsssdr_available_machines()));
|
||||||
our_options.add(option_t("avail-machines", "", "print VOLK_GNSSSDR machines on the current "
|
our_options.add(option_t("avail-machines", "",
|
||||||
"platform", volk_gnsssdr_list_machines));
|
"print VOLK_GNSSSDR machines on the current "
|
||||||
|
"platform",
|
||||||
|
volk_gnsssdr_list_machines));
|
||||||
our_options.add(option_t("machine", "", "print the current VOLK_GNSSSDR machine that will be used",
|
our_options.add(option_t("machine", "", "print the current VOLK_GNSSSDR machine that will be used",
|
||||||
volk_gnsssdr_get_machine()));
|
volk_gnsssdr_get_machine()));
|
||||||
our_options.add(option_t("alignment", "", "print the memory alignment", print_alignment));
|
our_options.add(option_t("alignment", "", "print the memory alignment", print_alignment));
|
||||||
our_options.add(option_t("malloc", "", "print the malloc implementation used in volk_gnsssdr_malloc",
|
our_options.add(option_t("malloc", "", "print the malloc implementation used in volk_gnsssdr_malloc",
|
||||||
print_malloc));
|
print_malloc));
|
||||||
our_options.add(option_t("version", "v", "print the VOLK_GNSSSDR version", volk_gnsssdr_version()));
|
our_options.add(option_t("version", "v", "print the VOLK_GNSSSDR version", volk_gnsssdr_version()));
|
||||||
|
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
our_options.parse(argc, argv);
|
our_options.parse(argc, argv);
|
||||||
}
|
}
|
||||||
catch(...)
|
catch (...)
|
||||||
{
|
{
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
@ -17,157 +17,182 @@
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
#include "volk_gnsssdr_option_helpers.h"
|
#include "volk_gnsssdr_option_helpers.h"
|
||||||
#include <climits> // IWYU pragma: keep
|
#include <climits> // IWYU pragma: keep
|
||||||
#include <cstdlib> // IWYU pragma: keep
|
#include <cstdlib> // IWYU pragma: keep
|
||||||
#include <cstring> // IWYU pragma: keep
|
#include <cstring> // IWYU pragma: keep
|
||||||
#include <exception> // for exception
|
#include <exception> // for exception
|
||||||
#include <iostream> // for operator<<, endl, basic_ostream, cout, ostream
|
#include <iostream> // for operator<<, endl, basic_ostream, cout, ostream
|
||||||
#include <utility> // for pair
|
#include <utility> // for pair
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Option type
|
* Option type
|
||||||
*/
|
*/
|
||||||
option_t::option_t(std::string longform, std::string shortform, std::string msg, void (*callback)())
|
option_t::option_t(std::string longform, std::string shortform, std::string msg, void (*callback)())
|
||||||
: longform("--" + longform),
|
: longform("--" + longform),
|
||||||
shortform("-" + shortform),
|
shortform("-" + shortform),
|
||||||
msg(msg),
|
msg(msg),
|
||||||
callback(callback) { option_type = VOID_CALLBACK; }
|
callback(callback) { option_type = VOID_CALLBACK; }
|
||||||
|
|
||||||
option_t::option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(int))
|
option_t::option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(int))
|
||||||
: longform("--" + longform),
|
: longform("--" + longform),
|
||||||
shortform("-" + shortform),
|
shortform("-" + shortform),
|
||||||
msg(msg),
|
msg(msg),
|
||||||
callback((void (*)()) callback) { option_type = INT_CALLBACK; }
|
callback((void (*)())callback) { option_type = INT_CALLBACK; }
|
||||||
|
|
||||||
option_t::option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(float))
|
option_t::option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(float))
|
||||||
: longform("--" + longform),
|
: longform("--" + longform),
|
||||||
shortform("-" + shortform),
|
shortform("-" + shortform),
|
||||||
msg(msg),
|
msg(msg),
|
||||||
callback((void (*)()) callback) { option_type = FLOAT_CALLBACK; }
|
callback((void (*)())callback) { option_type = FLOAT_CALLBACK; }
|
||||||
|
|
||||||
option_t::option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(bool))
|
option_t::option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(bool))
|
||||||
: longform("--" + longform),
|
: longform("--" + longform),
|
||||||
shortform("-" + shortform),
|
shortform("-" + shortform),
|
||||||
msg(msg),
|
msg(msg),
|
||||||
callback((void (*)()) callback) { option_type = BOOL_CALLBACK; }
|
callback((void (*)())callback) { option_type = BOOL_CALLBACK; }
|
||||||
|
|
||||||
option_t::option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(std::string))
|
option_t::option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(std::string))
|
||||||
: longform("--" + longform),
|
: longform("--" + longform),
|
||||||
shortform("-" + shortform),
|
shortform("-" + shortform),
|
||||||
msg(msg),
|
msg(msg),
|
||||||
callback((void (*)()) callback) { option_type = STRING_CALLBACK; }
|
callback((void (*)())callback) { option_type = STRING_CALLBACK; }
|
||||||
|
|
||||||
option_t::option_t(std::string longform, std::string shortform, std::string msg, std::string printval)
|
option_t::option_t(std::string longform, std::string shortform, std::string msg, std::string printval)
|
||||||
: longform("--" + longform),
|
: longform("--" + longform),
|
||||||
shortform("-" + shortform),
|
shortform("-" + shortform),
|
||||||
msg(msg),
|
msg(msg),
|
||||||
printval(printval) { option_type = STRING; }
|
printval(printval) { option_type = STRING; }
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Option List
|
* Option List
|
||||||
*/
|
*/
|
||||||
|
|
||||||
option_list::option_list(std::string program_name) :
|
option_list::option_list(std::string program_name) : program_name(program_name)
|
||||||
program_name(program_name) {
|
{
|
||||||
{ internal_list = std::vector<option_t>(); }
|
{
|
||||||
}
|
internal_list = std::vector<option_t>();
|
||||||
|
|
||||||
void option_list::add(const option_t & opt) { internal_list.push_back(opt); }
|
|
||||||
|
|
||||||
void option_list::parse(int argc, char **argv) {
|
|
||||||
for (int arg_number = 0; arg_number < argc; ++arg_number) {
|
|
||||||
for (std::vector<option_t>::iterator this_option = internal_list.begin();
|
|
||||||
this_option != internal_list.end();
|
|
||||||
this_option++) {
|
|
||||||
if (this_option->longform == std::string(argv[arg_number]) ||
|
|
||||||
this_option->shortform == std::string(argv[arg_number])) {
|
|
||||||
switch (this_option->option_type) {
|
|
||||||
case VOID_CALLBACK:
|
|
||||||
this_option->callback();
|
|
||||||
break;
|
|
||||||
case INT_CALLBACK:
|
|
||||||
try {
|
|
||||||
int int_val = std::stoi(argv[++arg_number]);
|
|
||||||
((void (*)(int)) this_option->callback)(int_val);
|
|
||||||
} catch (std::exception &exc) {
|
|
||||||
std::cout << "An int option can only receive a number" << std::endl;
|
|
||||||
throw std::exception();
|
|
||||||
};
|
|
||||||
break;
|
|
||||||
case FLOAT_CALLBACK:
|
|
||||||
try {
|
|
||||||
int int_val = std::stof(argv[++arg_number]);
|
|
||||||
((void (*)(float)) this_option->callback)(int_val);
|
|
||||||
} catch (std::exception &exc) {
|
|
||||||
std::cout << "A float option can only receive a number" << std::endl;
|
|
||||||
throw std::exception();
|
|
||||||
};
|
|
||||||
break;
|
|
||||||
case BOOL_CALLBACK:
|
|
||||||
try {
|
|
||||||
bool int_val = (bool) std::stoi(argv[++arg_number]);
|
|
||||||
((void (*)(bool)) this_option->callback)(int_val);
|
|
||||||
} catch (std::exception &exc) {
|
|
||||||
std::cout << "A bool option can only receive 0 or 1" << std::endl;
|
|
||||||
throw std::exception();
|
|
||||||
};
|
|
||||||
break;
|
|
||||||
case STRING_CALLBACK:
|
|
||||||
try {
|
|
||||||
((void (*)(std::string)) this_option->callback)(argv[++arg_number]);
|
|
||||||
} catch (std::exception &exc) {
|
|
||||||
throw std::exception();
|
|
||||||
};
|
|
||||||
break;
|
|
||||||
case STRING:
|
|
||||||
std::cout << this_option->printval << std::endl;
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
this_option->callback();
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (std::string("--help") == std::string(argv[arg_number]) ||
|
|
||||||
std::string("-h") == std::string(argv[arg_number])) {
|
|
||||||
help();
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void option_list::help() {
|
void option_list::add(const option_t &opt) { internal_list.push_back(opt); }
|
||||||
|
|
||||||
|
void option_list::parse(int argc, char **argv)
|
||||||
|
{
|
||||||
|
for (int arg_number = 0; arg_number < argc; ++arg_number)
|
||||||
|
{
|
||||||
|
for (std::vector<option_t>::iterator this_option = internal_list.begin();
|
||||||
|
this_option != internal_list.end();
|
||||||
|
this_option++)
|
||||||
|
{
|
||||||
|
if (this_option->longform == std::string(argv[arg_number]) ||
|
||||||
|
this_option->shortform == std::string(argv[arg_number]))
|
||||||
|
{
|
||||||
|
switch (this_option->option_type)
|
||||||
|
{
|
||||||
|
case VOID_CALLBACK:
|
||||||
|
this_option->callback();
|
||||||
|
break;
|
||||||
|
case INT_CALLBACK:
|
||||||
|
try
|
||||||
|
{
|
||||||
|
int int_val = std::stoi(argv[++arg_number]);
|
||||||
|
((void (*)(int))this_option->callback)(int_val);
|
||||||
|
}
|
||||||
|
catch (std::exception &exc)
|
||||||
|
{
|
||||||
|
std::cout << "An int option can only receive a number" << std::endl;
|
||||||
|
throw std::exception();
|
||||||
|
};
|
||||||
|
break;
|
||||||
|
case FLOAT_CALLBACK:
|
||||||
|
try
|
||||||
|
{
|
||||||
|
int int_val = std::stof(argv[++arg_number]);
|
||||||
|
((void (*)(float))this_option->callback)(int_val);
|
||||||
|
}
|
||||||
|
catch (std::exception &exc)
|
||||||
|
{
|
||||||
|
std::cout << "A float option can only receive a number" << std::endl;
|
||||||
|
throw std::exception();
|
||||||
|
};
|
||||||
|
break;
|
||||||
|
case BOOL_CALLBACK:
|
||||||
|
try
|
||||||
|
{
|
||||||
|
bool int_val = (bool)std::stoi(argv[++arg_number]);
|
||||||
|
((void (*)(bool))this_option->callback)(int_val);
|
||||||
|
}
|
||||||
|
catch (std::exception &exc)
|
||||||
|
{
|
||||||
|
std::cout << "A bool option can only receive 0 or 1" << std::endl;
|
||||||
|
throw std::exception();
|
||||||
|
};
|
||||||
|
break;
|
||||||
|
case STRING_CALLBACK:
|
||||||
|
try
|
||||||
|
{
|
||||||
|
((void (*)(std::string))this_option->callback)(argv[++arg_number]);
|
||||||
|
}
|
||||||
|
catch (std::exception &exc)
|
||||||
|
{
|
||||||
|
throw std::exception();
|
||||||
|
};
|
||||||
|
break;
|
||||||
|
case STRING:
|
||||||
|
std::cout << this_option->printval << std::endl;
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
this_option->callback();
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (std::string("--help") == std::string(argv[arg_number]) ||
|
||||||
|
std::string("-h") == std::string(argv[arg_number]))
|
||||||
|
{
|
||||||
|
help();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void option_list::help()
|
||||||
|
{
|
||||||
std::cout << program_name << std::endl;
|
std::cout << program_name << std::endl;
|
||||||
std::cout << " -h [ --help ] \t\tDisplay this help message" << std::endl;
|
std::cout << " -h [ --help ] \t\tDisplay this help message" << std::endl;
|
||||||
for (std::vector<option_t>::iterator this_option = internal_list.begin();
|
for (std::vector<option_t>::iterator this_option = internal_list.begin();
|
||||||
this_option != internal_list.end();
|
this_option != internal_list.end();
|
||||||
this_option++) {
|
this_option++)
|
||||||
std::string help_line(" ");
|
{
|
||||||
if (this_option->shortform == "-") {
|
std::string help_line(" ");
|
||||||
help_line += this_option->longform + " ";
|
if (this_option->shortform == "-")
|
||||||
} else {
|
{
|
||||||
help_line += this_option->shortform + " [ " + this_option->longform + " ]";
|
help_line += this_option->longform + " ";
|
||||||
}
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
help_line += this_option->shortform + " [ " + this_option->longform + " ]";
|
||||||
|
}
|
||||||
|
|
||||||
switch (help_line.size() / 8) {
|
switch (help_line.size() / 8)
|
||||||
case 0:
|
{
|
||||||
help_line += "\t\t\t\t";
|
case 0:
|
||||||
break;
|
help_line += "\t\t\t\t";
|
||||||
case 1:
|
break;
|
||||||
help_line += "\t\t\t";
|
case 1:
|
||||||
break;
|
help_line += "\t\t\t";
|
||||||
case 2:
|
break;
|
||||||
help_line += "\t\t";
|
case 2:
|
||||||
break;
|
help_line += "\t\t";
|
||||||
case 3:
|
break;
|
||||||
help_line += "\t";
|
case 3:
|
||||||
break;
|
help_line += "\t";
|
||||||
default:
|
break;
|
||||||
break;
|
default:
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
help_line += this_option->msg;
|
||||||
|
std::cout << help_line << std::endl;
|
||||||
}
|
}
|
||||||
help_line += this_option->msg;
|
|
||||||
std::cout << help_line << std::endl;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
@ -36,7 +36,8 @@ typedef enum
|
|||||||
STRING,
|
STRING,
|
||||||
} VOLK_OPTYPE;
|
} VOLK_OPTYPE;
|
||||||
|
|
||||||
class option_t {
|
class option_t
|
||||||
|
{
|
||||||
public:
|
public:
|
||||||
option_t(std::string longform, std::string shortform, std::string msg, void (*callback)());
|
option_t(std::string longform, std::string shortform, std::string msg, void (*callback)());
|
||||||
option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(int));
|
option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(int));
|
||||||
@ -51,7 +52,6 @@ public:
|
|||||||
VOLK_OPTYPE option_type;
|
VOLK_OPTYPE option_type;
|
||||||
std::string printval;
|
std::string printval;
|
||||||
void (*callback)();
|
void (*callback)();
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
class option_list
|
class option_list
|
||||||
@ -59,15 +59,16 @@ class option_list
|
|||||||
public:
|
public:
|
||||||
option_list(std::string program_name);
|
option_list(std::string program_name);
|
||||||
|
|
||||||
void add(const option_t & opt);
|
void add(const option_t &opt);
|
||||||
|
|
||||||
void parse(int argc, char **argv);
|
void parse(int argc, char **argv);
|
||||||
|
|
||||||
void help();
|
void help();
|
||||||
|
|
||||||
private:
|
private:
|
||||||
std::string program_name;
|
std::string program_name;
|
||||||
std::vector<option_t> internal_list;
|
std::vector<option_t> internal_list;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
#endif //VOLK_VOLK_OPTION_HELPERS_H
|
#endif //VOLK_VOLK_OPTION_HELPERS_H
|
||||||
|
@ -16,23 +16,22 @@
|
|||||||
* along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
|
* along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include "kernel_tests.h" // for init_test_list
|
#include "kernel_tests.h" // for init_test_list
|
||||||
#include "qa_utils.h" // for volk_gnsssdr_test_results_t
|
#include "qa_utils.h" // for volk_gnsssdr_test_results_t
|
||||||
#include "volk_gnsssdr/volk_gnsssdr_complex.h" // for lv_32fc_t
|
#include "volk_gnsssdr/volk_gnsssdr_complex.h" // for lv_32fc_t
|
||||||
#include "volk_gnsssdr_option_helpers.h" // for option_list, option_t
|
#include "volk_gnsssdr_option_helpers.h" // for option_list, option_t
|
||||||
#include "volk_gnsssdr_profile.h"
|
#include "volk_gnsssdr_profile.h"
|
||||||
#include "volk_gnsssdr/volk_gnsssdr_prefs.h" // for volk_gnsssdr_get_config_path
|
#include "volk_gnsssdr/volk_gnsssdr_prefs.h" // for volk_gnsssdr_get_config_path
|
||||||
#include <boost/filesystem/operations.hpp> // for create_directories, exists
|
#include <boost/filesystem/operations.hpp> // for create_directories, exists
|
||||||
#include <boost/filesystem/path.hpp> // for path, operator<<
|
#include <boost/filesystem/path.hpp> // for path, operator<<
|
||||||
#include <boost/filesystem/path_traits.hpp> // for filesystem
|
#include <boost/filesystem/path_traits.hpp> // for filesystem
|
||||||
#include <sys/stat.h> // for stat
|
#include <sys/stat.h> // for stat
|
||||||
#include <cstddef> // for size_t
|
#include <cstddef> // for size_t
|
||||||
#include <iostream> // for operator<<, basic_ostream
|
#include <iostream> // for operator<<, basic_ostream
|
||||||
#include <fstream> // IWYU pragma: keep
|
#include <fstream> // IWYU pragma: keep
|
||||||
#include <map> // for map, map<>::iterator
|
#include <map> // for map, map<>::iterator
|
||||||
#include <utility> // for pair
|
#include <utility> // for pair
|
||||||
#include <vector> // for vector, vector<>::const_..
|
#include <vector> // for vector, vector<>::const_..
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
namespace fs = boost::filesystem;
|
namespace fs = boost::filesystem;
|
||||||
@ -67,92 +66,112 @@ int main(int argc, char *argv[])
|
|||||||
profile_options.add((option_t("path", "p", "Specify the volk_config path", set_volk_config)));
|
profile_options.add((option_t("path", "p", "Specify the volk_config path", set_volk_config)));
|
||||||
|
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
profile_options.parse(argc, argv);
|
profile_options.parse(argc, argv);
|
||||||
}
|
}
|
||||||
catch(...)
|
catch (...)
|
||||||
{
|
{
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int arg_number = 0; arg_number < argc; ++arg_number) {
|
for (int arg_number = 0; arg_number < argc; ++arg_number)
|
||||||
|
{
|
||||||
if (std::string("--help") == std::string(argv[arg_number]) ||
|
if (std::string("--help") == std::string(argv[arg_number]) ||
|
||||||
std::string("-h") == std::string(argv[arg_number])) {
|
std::string("-h") == std::string(argv[arg_number]))
|
||||||
|
{
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// Adding program options
|
// Adding program options
|
||||||
std::ofstream json_file;
|
std::ofstream json_file;
|
||||||
std::string config_file;
|
std::string config_file;
|
||||||
|
|
||||||
if ( json_filename != "" ) {
|
if (json_filename != "")
|
||||||
json_file.open( json_filename.c_str() );
|
{
|
||||||
}
|
json_file.open(json_filename.c_str());
|
||||||
|
}
|
||||||
|
|
||||||
if ( volk_config_path != "" ) {
|
if (volk_config_path != "")
|
||||||
config_file = volk_config_path + "/volk_config";
|
{
|
||||||
}
|
config_file = volk_config_path + "/volk_config";
|
||||||
|
}
|
||||||
|
|
||||||
// Run tests
|
// Run tests
|
||||||
std::vector<volk_gnsssdr_test_results_t> results;
|
std::vector<volk_gnsssdr_test_results_t> results;
|
||||||
if(update_mode) {
|
if (update_mode)
|
||||||
if( config_file != "" ) read_results(&results, config_file);
|
{
|
||||||
else read_results(&results);
|
if (config_file != "")
|
||||||
}
|
read_results(&results, config_file);
|
||||||
|
else
|
||||||
|
read_results(&results);
|
||||||
|
}
|
||||||
|
|
||||||
// Initialize the list of tests
|
// Initialize the list of tests
|
||||||
std::vector<volk_gnsssdr_test_case_t> test_cases = init_test_list(test_params);
|
std::vector<volk_gnsssdr_test_case_t> test_cases = init_test_list(test_params);
|
||||||
|
|
||||||
// Iterate through list of tests running each one
|
// Iterate through list of tests running each one
|
||||||
std::string substr_to_match(test_params.kernel_regex());
|
std::string substr_to_match(test_params.kernel_regex());
|
||||||
for(unsigned int ii = 0; ii < test_cases.size(); ++ii) {
|
for (unsigned int ii = 0; ii < test_cases.size(); ++ii)
|
||||||
bool regex_match = true;
|
{
|
||||||
|
bool regex_match = true;
|
||||||
|
|
||||||
volk_gnsssdr_test_case_t test_case = test_cases[ii];
|
volk_gnsssdr_test_case_t test_case = test_cases[ii];
|
||||||
// if the kernel name matches regex then do the test
|
// if the kernel name matches regex then do the test
|
||||||
std::string test_case_name = test_case.name();
|
std::string test_case_name = test_case.name();
|
||||||
if(test_case_name.find(substr_to_match) == std::string::npos) {
|
if (test_case_name.find(substr_to_match) == std::string::npos)
|
||||||
regex_match = false;
|
{
|
||||||
}
|
regex_match = false;
|
||||||
|
|
||||||
// if we are in update mode check if we've already got results
|
|
||||||
// if we have any, then no need to test that kernel
|
|
||||||
bool update = true;
|
|
||||||
if(update_mode) {
|
|
||||||
for(unsigned int jj=0; jj < results.size(); ++jj) {
|
|
||||||
if(results[jj].name == test_case.name() ||
|
|
||||||
results[jj].name == test_case.puppet_master_name()) {
|
|
||||||
update = false;
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if( regex_match && update ) {
|
// if we are in update mode check if we've already got results
|
||||||
try {
|
// if we have any, then no need to test that kernel
|
||||||
run_volk_gnsssdr_tests(test_case.desc(), test_case.kernel_ptr(), test_case.name(),
|
bool update = true;
|
||||||
test_case.test_parameters(), &results, test_case.puppet_master_name());
|
if (update_mode)
|
||||||
}
|
{
|
||||||
catch (std::string &error) {
|
for (unsigned int jj = 0; jj < results.size(); ++jj)
|
||||||
std::cerr << "Caught Exception in 'run_volk_gnssdr_tests': " << error << std::endl;
|
{
|
||||||
}
|
if (results[jj].name == test_case.name() ||
|
||||||
|
results[jj].name == test_case.puppet_master_name())
|
||||||
|
{
|
||||||
|
update = false;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (regex_match && update)
|
||||||
|
{
|
||||||
|
try
|
||||||
|
{
|
||||||
|
run_volk_gnsssdr_tests(test_case.desc(), test_case.kernel_ptr(), test_case.name(),
|
||||||
|
test_case.test_parameters(), &results, test_case.puppet_master_name());
|
||||||
|
}
|
||||||
|
catch (std::string &error)
|
||||||
|
{
|
||||||
|
std::cerr << "Caught Exception in 'run_volk_gnssdr_tests': " << error << std::endl;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
// Output results according to provided options
|
// Output results according to provided options
|
||||||
if(json_filename != "") {
|
if (json_filename != "")
|
||||||
write_json(json_file, results);
|
{
|
||||||
json_file.close();
|
write_json(json_file, results);
|
||||||
}
|
json_file.close();
|
||||||
|
}
|
||||||
|
|
||||||
if(!dry_run) {
|
if (!dry_run)
|
||||||
if(config_file != "") write_results(&results, false, config_file);
|
{
|
||||||
else write_results(&results, false);
|
if (config_file != "")
|
||||||
}
|
write_results(&results, false, config_file);
|
||||||
else {
|
else
|
||||||
std::cout << "Warning: this was a dry-run. Config not generated" << std::endl;
|
write_results(&results, false);
|
||||||
}
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
std::cout << "Warning: this was a dry-run. Config not generated" << std::endl;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -167,51 +186,55 @@ void read_results(std::vector<volk_gnsssdr_test_results_t> *results)
|
|||||||
void read_results(std::vector<volk_gnsssdr_test_results_t> *results, std::string path)
|
void read_results(std::vector<volk_gnsssdr_test_results_t> *results, std::string path)
|
||||||
{
|
{
|
||||||
struct stat buffer;
|
struct stat buffer;
|
||||||
bool config_status = (stat (path.c_str(), &buffer) == 0);
|
bool config_status = (stat(path.c_str(), &buffer) == 0);
|
||||||
|
|
||||||
if( config_status ) {
|
if (config_status)
|
||||||
// a config exists and we are reading results from it
|
{
|
||||||
std::ifstream config(path.c_str());
|
// a config exists and we are reading results from it
|
||||||
char config_line[256];
|
std::ifstream config(path.c_str());
|
||||||
while(config.getline(config_line, 255)) {
|
char config_line[256];
|
||||||
// tokenize the input line by kernel_name unaligned aligned
|
while (config.getline(config_line, 255))
|
||||||
// then push back in the results vector with fields filled in
|
{
|
||||||
|
// tokenize the input line by kernel_name unaligned aligned
|
||||||
|
// then push back in the results vector with fields filled in
|
||||||
|
|
||||||
std::vector<std::string> single_kernel_result;
|
std::vector<std::string> single_kernel_result;
|
||||||
std::string config_str(config_line);
|
std::string config_str(config_line);
|
||||||
std::size_t str_size = config_str.size();
|
std::size_t str_size = config_str.size();
|
||||||
std::size_t found = 1;
|
std::size_t found = 1;
|
||||||
|
|
||||||
found = config_str.find(' ');
|
|
||||||
// Split line by spaces
|
|
||||||
while(found && found < str_size) {
|
|
||||||
found = config_str.find(' ');
|
found = config_str.find(' ');
|
||||||
// kernel names MUST be less than 128 chars, which is
|
// Split line by spaces
|
||||||
// a length restricted by volk/volk_prefs.c
|
while (found && found < str_size)
|
||||||
// on the last token in the parsed string we won't find a space
|
{
|
||||||
// so make sure we copy at most 128 chars.
|
found = config_str.find(' ');
|
||||||
if(found > 127) {
|
// kernel names MUST be less than 128 chars, which is
|
||||||
found = 127;
|
// a length restricted by volk/volk_prefs.c
|
||||||
}
|
// on the last token in the parsed string we won't find a space
|
||||||
str_size = config_str.size();
|
// so make sure we copy at most 128 chars.
|
||||||
char buffer[128] = {'\0'};
|
if (found > 127)
|
||||||
config_str.copy(buffer, found + 1, 0);
|
{
|
||||||
buffer[found] = '\0';
|
found = 127;
|
||||||
single_kernel_result.push_back(std::string(buffer));
|
}
|
||||||
config_str.erase(0, found+1);
|
str_size = config_str.size();
|
||||||
}
|
char buffer[128] = {'\0'};
|
||||||
|
config_str.copy(buffer, found + 1, 0);
|
||||||
|
buffer[found] = '\0';
|
||||||
|
single_kernel_result.push_back(std::string(buffer));
|
||||||
|
config_str.erase(0, found + 1);
|
||||||
|
}
|
||||||
|
|
||||||
if(single_kernel_result.size() == 3) {
|
if (single_kernel_result.size() == 3)
|
||||||
volk_gnsssdr_test_results_t kernel_result;
|
{
|
||||||
kernel_result.name = std::string(single_kernel_result[0]);
|
volk_gnsssdr_test_results_t kernel_result;
|
||||||
kernel_result.config_name = std::string(single_kernel_result[0]);
|
kernel_result.name = std::string(single_kernel_result[0]);
|
||||||
kernel_result.best_arch_u = std::string(single_kernel_result[1]);
|
kernel_result.config_name = std::string(single_kernel_result[0]);
|
||||||
kernel_result.best_arch_a = std::string(single_kernel_result[2]);
|
kernel_result.best_arch_u = std::string(single_kernel_result[1]);
|
||||||
results->push_back(kernel_result);
|
kernel_result.best_arch_a = std::string(single_kernel_result[2]);
|
||||||
}
|
results->push_back(kernel_result);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void write_results(const std::vector<volk_gnsssdr_test_results_t> *results, bool update_result)
|
void write_results(const std::vector<volk_gnsssdr_test_results_t> *results, bool update_result)
|
||||||
@ -219,7 +242,7 @@ void write_results(const std::vector<volk_gnsssdr_test_results_t> *results, bool
|
|||||||
char path[1024];
|
char path[1024];
|
||||||
volk_gnsssdr_get_config_path(path);
|
volk_gnsssdr_get_config_path(path);
|
||||||
|
|
||||||
write_results( results, update_result, std::string(path));
|
write_results(results, update_result, std::string(path));
|
||||||
}
|
}
|
||||||
|
|
||||||
void write_results(const std::vector<volk_gnsssdr_test_results_t> *results, bool update_result, const std::string path)
|
void write_results(const std::vector<volk_gnsssdr_test_results_t> *results, bool update_result, const std::string path)
|
||||||
@ -227,39 +250,44 @@ void write_results(const std::vector<volk_gnsssdr_test_results_t> *results, bool
|
|||||||
const fs::path config_path(path);
|
const fs::path config_path(path);
|
||||||
// Until we can update the config on a kernel by kernel basis
|
// Until we can update the config on a kernel by kernel basis
|
||||||
// do not overwrite volk_gnsssdr_config when using a regex.
|
// do not overwrite volk_gnsssdr_config when using a regex.
|
||||||
if (! fs::exists(config_path.branch_path()))
|
if (!fs::exists(config_path.branch_path()))
|
||||||
{
|
{
|
||||||
std::cout << "Creating " << config_path.branch_path() << " ..." << std::endl;
|
std::cout << "Creating " << config_path.branch_path() << " ..." << std::endl;
|
||||||
fs::create_directories(config_path.branch_path());
|
fs::create_directories(config_path.branch_path());
|
||||||
}
|
}
|
||||||
|
|
||||||
std::ofstream config;
|
std::ofstream config;
|
||||||
if(update_result) {
|
if (update_result)
|
||||||
std::cout << "Updating " << path << " ..." << std::endl;
|
{
|
||||||
config.open(path.c_str(), std::ofstream::app);
|
std::cout << "Updating " << path << " ..." << std::endl;
|
||||||
if (!config.is_open()) { //either we don't have write access or we don't have the dir yet
|
config.open(path.c_str(), std::ofstream::app);
|
||||||
std::cout << "Error opening file " << path << std::endl;
|
if (!config.is_open())
|
||||||
}
|
{ //either we don't have write access or we don't have the dir yet
|
||||||
}
|
std::cout << "Error opening file " << path << std::endl;
|
||||||
else {
|
}
|
||||||
std::cout << "Writing " << path << " ..." << std::endl;
|
|
||||||
config.open(path.c_str());
|
|
||||||
if (!config.is_open()) { //either we don't have write access or we don't have the dir yet
|
|
||||||
std::cout << "Error opening file " << path << std::endl;
|
|
||||||
}
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
std::cout << "Writing " << path << " ..." << std::endl;
|
||||||
|
config.open(path.c_str());
|
||||||
|
if (!config.is_open())
|
||||||
|
{ //either we don't have write access or we don't have the dir yet
|
||||||
|
std::cout << "Error opening file " << path << std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
config << "\
|
config << "\
|
||||||
#this file is generated by volk_gnsssdr_profile.\n\
|
#this file is generated by volk_gnsssdr_profile.\n\
|
||||||
#the function name is followed by the preferred architecture.\n\
|
#the function name is followed by the preferred architecture.\n\
|
||||||
";
|
";
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<volk_gnsssdr_test_results_t>::const_iterator profile_results;
|
std::vector<volk_gnsssdr_test_results_t>::const_iterator profile_results;
|
||||||
for(profile_results = results->begin(); profile_results != results->end(); ++profile_results) {
|
for (profile_results = results->begin(); profile_results != results->end(); ++profile_results)
|
||||||
config << profile_results->config_name << " "
|
{
|
||||||
<< profile_results->best_arch_a << " "
|
config << profile_results->config_name << " "
|
||||||
<< profile_results->best_arch_u << std::endl;
|
<< profile_results->best_arch_a << " "
|
||||||
}
|
<< profile_results->best_arch_u << std::endl;
|
||||||
|
}
|
||||||
config.close();
|
config.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -270,43 +298,45 @@ void write_json(std::ofstream &json_file, std::vector<volk_gnsssdr_test_results_
|
|||||||
size_t len = results.size();
|
size_t len = results.size();
|
||||||
size_t i = 0;
|
size_t i = 0;
|
||||||
std::vector<volk_gnsssdr_test_results_t>::iterator result;
|
std::vector<volk_gnsssdr_test_results_t>::iterator result;
|
||||||
for(result = results.begin(); result != results.end(); ++result) {
|
for (result = results.begin(); result != results.end(); ++result)
|
||||||
json_file << " {" << std::endl;
|
{
|
||||||
json_file << " \"name\": \"" << result->name << "\"," << std::endl;
|
json_file << " {" << std::endl;
|
||||||
json_file << " \"vlen\": " << (int)(result->vlen) << "," << std::endl;
|
json_file << " \"name\": \"" << result->name << "\"," << std::endl;
|
||||||
json_file << " \"iter\": " << result->iter << "," << std::endl;
|
json_file << " \"vlen\": " << (int)(result->vlen) << "," << std::endl;
|
||||||
json_file << " \"best_arch_a\": \"" << result->best_arch_a
|
json_file << " \"iter\": " << result->iter << "," << std::endl;
|
||||||
<< "\"," << std::endl;
|
json_file << " \"best_arch_a\": \"" << result->best_arch_a
|
||||||
json_file << " \"best_arch_u\": \"" << result->best_arch_u
|
<< "\"," << std::endl;
|
||||||
<< "\"," << std::endl;
|
json_file << " \"best_arch_u\": \"" << result->best_arch_u
|
||||||
json_file << " \"results\": {" << std::endl;
|
<< "\"," << std::endl;
|
||||||
size_t results_len = result->results.size();
|
json_file << " \"results\": {" << std::endl;
|
||||||
size_t ri = 0;
|
size_t results_len = result->results.size();
|
||||||
|
size_t ri = 0;
|
||||||
|
|
||||||
std::map<std::string, volk_gnsssdr_test_time_t>::iterator kernel_time_pair;
|
std::map<std::string, volk_gnsssdr_test_time_t>::iterator kernel_time_pair;
|
||||||
for(kernel_time_pair = result->results.begin(); kernel_time_pair != result->results.end(); ++kernel_time_pair) {
|
for (kernel_time_pair = result->results.begin(); kernel_time_pair != result->results.end(); ++kernel_time_pair)
|
||||||
volk_gnsssdr_test_time_t time = kernel_time_pair->second;
|
{
|
||||||
json_file << " \"" << time.name << "\": {" << std::endl;
|
volk_gnsssdr_test_time_t time = kernel_time_pair->second;
|
||||||
json_file << " \"name\": \"" << time.name << "\"," << std::endl;
|
json_file << " \"" << time.name << "\": {" << std::endl;
|
||||||
json_file << " \"time\": " << time.time << "," << std::endl;
|
json_file << " \"name\": \"" << time.name << "\"," << std::endl;
|
||||||
json_file << " \"units\": \"" << time.units << "\"" << std::endl;
|
json_file << " \"time\": " << time.time << "," << std::endl;
|
||||||
json_file << " }" ;
|
json_file << " \"units\": \"" << time.units << "\"" << std::endl;
|
||||||
if(ri+1 != results_len) {
|
json_file << " }";
|
||||||
json_file << ",";
|
if (ri + 1 != results_len)
|
||||||
}
|
{
|
||||||
|
json_file << ",";
|
||||||
|
}
|
||||||
|
json_file << std::endl;
|
||||||
|
ri++;
|
||||||
|
}
|
||||||
|
json_file << " }" << std::endl;
|
||||||
|
json_file << " }";
|
||||||
|
if (i + 1 != len)
|
||||||
|
{
|
||||||
|
json_file << ",";
|
||||||
|
}
|
||||||
json_file << std::endl;
|
json_file << std::endl;
|
||||||
ri++;
|
i++;
|
||||||
}
|
}
|
||||||
json_file << " }" << std::endl;
|
|
||||||
json_file << " }";
|
|
||||||
if(i+1 != len) {
|
|
||||||
json_file << ",";
|
|
||||||
}
|
|
||||||
json_file << std::endl;
|
|
||||||
i++;
|
|
||||||
}
|
|
||||||
json_file << " ]" << std::endl;
|
json_file << " ]" << std::endl;
|
||||||
json_file << "}" << std::endl;
|
json_file << "}" << std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -27,10 +27,10 @@
|
|||||||
* -------------------------------------------------------------------------
|
* -------------------------------------------------------------------------
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include <cstdbool> // for bool
|
#include <cstdbool> // for bool
|
||||||
#include <iosfwd> // for ofstream
|
#include <iosfwd> // for ofstream
|
||||||
#include <string> // for string
|
#include <string> // for string
|
||||||
#include <vector> // for vector
|
#include <vector> // for vector
|
||||||
|
|
||||||
class volk_test_results_t;
|
class volk_test_results_t;
|
||||||
|
|
||||||
|
@ -29,7 +29,7 @@
|
|||||||
|
|
||||||
static inline int16_t sat_adds16i(int16_t x, int16_t y)
|
static inline int16_t sat_adds16i(int16_t x, int16_t y)
|
||||||
{
|
{
|
||||||
int32_t res = (int32_t) x + (int32_t) y;
|
int32_t res = (int32_t)x + (int32_t)y;
|
||||||
|
|
||||||
if (res < SHRT_MIN) res = SHRT_MIN;
|
if (res < SHRT_MIN) res = SHRT_MIN;
|
||||||
if (res > SHRT_MAX) res = SHRT_MAX;
|
if (res > SHRT_MAX) res = SHRT_MAX;
|
||||||
@ -39,7 +39,7 @@ static inline int16_t sat_adds16i(int16_t x, int16_t y)
|
|||||||
|
|
||||||
static inline int16_t sat_muls16i(int16_t x, int16_t y)
|
static inline int16_t sat_muls16i(int16_t x, int16_t y)
|
||||||
{
|
{
|
||||||
int32_t res = (int32_t) x * (int32_t) y;
|
int32_t res = (int32_t)x * (int32_t)y;
|
||||||
|
|
||||||
if (res < SHRT_MIN) res = SHRT_MIN;
|
if (res < SHRT_MIN) res = SHRT_MIN;
|
||||||
if (res > SHRT_MAX) res = SHRT_MAX;
|
if (res > SHRT_MAX) res = SHRT_MAX;
|
||||||
|
@ -30,38 +30,42 @@
|
|||||||
static inline __m256
|
static inline __m256
|
||||||
_mm256_complexmul_ps(__m256 x, __m256 y)
|
_mm256_complexmul_ps(__m256 x, __m256 y)
|
||||||
{
|
{
|
||||||
__m256 yl, yh, tmp1, tmp2;
|
__m256 yl, yh, tmp1, tmp2;
|
||||||
yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr ...
|
yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr ...
|
||||||
yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di ...
|
yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di ...
|
||||||
tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ...
|
tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ...
|
||||||
x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br ...
|
x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br ...
|
||||||
tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
|
tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
|
||||||
return _mm256_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
return _mm256_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline __m256
|
static inline __m256
|
||||||
_mm256_conjugate_ps(__m256 x){
|
_mm256_conjugate_ps(__m256 x)
|
||||||
const __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
|
{
|
||||||
return _mm256_xor_ps(x, conjugator); // conjugate y
|
const __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
|
||||||
|
return _mm256_xor_ps(x, conjugator); // conjugate y
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline __m256
|
static inline __m256
|
||||||
_mm256_complexconjugatemul_ps(__m256 x, __m256 y){
|
_mm256_complexconjugatemul_ps(__m256 x, __m256 y)
|
||||||
y = _mm256_conjugate_ps(y);
|
{
|
||||||
return _mm256_complexmul_ps(x, y);
|
y = _mm256_conjugate_ps(y);
|
||||||
|
return _mm256_complexmul_ps(x, y);
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline __m256
|
static inline __m256
|
||||||
_mm256_magnitudesquared_ps(__m256 cplxValue1, __m256 cplxValue2){
|
_mm256_magnitudesquared_ps(__m256 cplxValue1, __m256 cplxValue2)
|
||||||
__m256 complex1, complex2;
|
{
|
||||||
cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
|
__m256 complex1, complex2;
|
||||||
cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
|
cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
|
||||||
complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
|
cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
|
||||||
complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
|
complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
|
||||||
return _mm256_hadd_ps(complex1, complex2); // Add the I2 and Q2 values
|
complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
|
||||||
|
return _mm256_hadd_ps(complex1, complex2); // Add the I2 and Q2 values
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline __m256 _mm256_complexnormalise_ps( __m256 z ){
|
static inline __m256 _mm256_complexnormalise_ps(__m256 z)
|
||||||
|
{
|
||||||
__m256 tmp1 = _mm256_mul_ps(z, z);
|
__m256 tmp1 = _mm256_mul_ps(z, z);
|
||||||
__m256 tmp2 = _mm256_hadd_ps(tmp1, tmp1);
|
__m256 tmp2 = _mm256_hadd_ps(tmp1, tmp1);
|
||||||
tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
|
tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
|
||||||
@ -70,8 +74,9 @@ static inline __m256 _mm256_complexnormalise_ps( __m256 z ){
|
|||||||
}
|
}
|
||||||
|
|
||||||
static inline __m256
|
static inline __m256
|
||||||
_mm256_magnitude_ps(__m256 cplxValue1, __m256 cplxValue2){
|
_mm256_magnitude_ps(__m256 cplxValue1, __m256 cplxValue2)
|
||||||
return _mm256_sqrt_ps(_mm256_magnitudesquared_ps(cplxValue1, cplxValue2));
|
{
|
||||||
|
return _mm256_sqrt_ps(_mm256_magnitudesquared_ps(cplxValue1, cplxValue2));
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif /* INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_ */
|
#endif /* INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_ */
|
||||||
|
@ -28,14 +28,14 @@
|
|||||||
// Cross-platform attribute macros not included in VOLK
|
// Cross-platform attribute macros not included in VOLK
|
||||||
////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////
|
||||||
#if defined __GNUC__
|
#if defined __GNUC__
|
||||||
# define __VOLK_GNSSSDR_PREFETCH(addr) __builtin_prefetch(addr)
|
#define __VOLK_GNSSSDR_PREFETCH(addr) __builtin_prefetch(addr)
|
||||||
# define __VOLK_GNSSSDR_PREFETCH_LOCALITY(addr, rw, locality) __builtin_prefetch(addr, rw, locality)
|
#define __VOLK_GNSSSDR_PREFETCH_LOCALITY(addr, rw, locality) __builtin_prefetch(addr, rw, locality)
|
||||||
#elif _MSC_VER
|
#elif _MSC_VER
|
||||||
# define __VOLK_GNSSSDR_PREFETCH(addr)
|
#define __VOLK_GNSSSDR_PREFETCH(addr)
|
||||||
# define __VOLK_GNSSSDR_PREFETCH_LOCALITY(addr, rw, locality)
|
#define __VOLK_GNSSSDR_PREFETCH_LOCALITY(addr, rw, locality)
|
||||||
#else
|
#else
|
||||||
# define __VOLK_GNSSSDR_PREFETCH(addr)
|
#define __VOLK_GNSSSDR_PREFETCH(addr)
|
||||||
# define __VOLK_GNSSSDR_PREFETCH_LOCALITY(addr, rw, locality)
|
#define __VOLK_GNSSSDR_PREFETCH_LOCALITY(addr, rw, locality)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifndef INCLUDED_LIBVOLK_COMMON_H
|
#ifndef INCLUDED_LIBVOLK_COMMON_H
|
||||||
@ -45,45 +45,45 @@
|
|||||||
// Cross-platform attribute macros
|
// Cross-platform attribute macros
|
||||||
////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////
|
||||||
#if defined __GNUC__
|
#if defined __GNUC__
|
||||||
# define __VOLK_ATTR_ALIGNED(x) __attribute__((aligned(x)))
|
#define __VOLK_ATTR_ALIGNED(x) __attribute__((aligned(x)))
|
||||||
# define __VOLK_ATTR_UNUSED __attribute__((unused))
|
#define __VOLK_ATTR_UNUSED __attribute__((unused))
|
||||||
# define __VOLK_ATTR_INLINE __attribute__((always_inline))
|
#define __VOLK_ATTR_INLINE __attribute__((always_inline))
|
||||||
# define __VOLK_ATTR_DEPRECATED __attribute__((deprecated))
|
#define __VOLK_ATTR_DEPRECATED __attribute__((deprecated))
|
||||||
# define __VOLK_ASM __asm__
|
#define __VOLK_ASM __asm__
|
||||||
# define __VOLK_VOLATILE __volatile__
|
#define __VOLK_VOLATILE __volatile__
|
||||||
# if __GNUC__ >= 4
|
#if __GNUC__ >= 4
|
||||||
# define __VOLK_ATTR_EXPORT __attribute__((visibility("default")))
|
#define __VOLK_ATTR_EXPORT __attribute__((visibility("default")))
|
||||||
# define __VOLK_ATTR_IMPORT __attribute__((visibility("default")))
|
#define __VOLK_ATTR_IMPORT __attribute__((visibility("default")))
|
||||||
# else
|
|
||||||
# define __VOLK_ATTR_EXPORT
|
|
||||||
# define __VOLK_ATTR_IMPORT
|
|
||||||
# endif
|
|
||||||
#elif _MSC_VER
|
|
||||||
# define __VOLK_ATTR_ALIGNED(x) __declspec(align(x))
|
|
||||||
# define __VOLK_ATTR_UNUSED
|
|
||||||
# define __VOLK_ATTR_INLINE __forceinline
|
|
||||||
# define __VOLK_ATTR_DEPRECATED __declspec(deprecated)
|
|
||||||
# define __VOLK_ATTR_EXPORT __declspec(dllexport)
|
|
||||||
# define __VOLK_ATTR_IMPORT __declspec(dllimport)
|
|
||||||
# define __VOLK_ASM __asm
|
|
||||||
# define __VOLK_VOLATILE
|
|
||||||
#else
|
#else
|
||||||
# define __VOLK_ATTR_ALIGNED(x)
|
#define __VOLK_ATTR_EXPORT
|
||||||
# define __VOLK_ATTR_UNUSED
|
#define __VOLK_ATTR_IMPORT
|
||||||
# define __VOLK_ATTR_INLINE
|
#endif
|
||||||
# define __VOLK_ATTR_DEPRECATED
|
#elif _MSC_VER
|
||||||
# define __VOLK_ATTR_EXPORT
|
#define __VOLK_ATTR_ALIGNED(x) __declspec(align(x))
|
||||||
# define __VOLK_ATTR_IMPORT
|
#define __VOLK_ATTR_UNUSED
|
||||||
# define __VOLK_ASM __asm__
|
#define __VOLK_ATTR_INLINE __forceinline
|
||||||
# define __VOLK_VOLATILE __volatile__
|
#define __VOLK_ATTR_DEPRECATED __declspec(deprecated)
|
||||||
|
#define __VOLK_ATTR_EXPORT __declspec(dllexport)
|
||||||
|
#define __VOLK_ATTR_IMPORT __declspec(dllimport)
|
||||||
|
#define __VOLK_ASM __asm
|
||||||
|
#define __VOLK_VOLATILE
|
||||||
|
#else
|
||||||
|
#define __VOLK_ATTR_ALIGNED(x)
|
||||||
|
#define __VOLK_ATTR_UNUSED
|
||||||
|
#define __VOLK_ATTR_INLINE
|
||||||
|
#define __VOLK_ATTR_DEPRECATED
|
||||||
|
#define __VOLK_ATTR_EXPORT
|
||||||
|
#define __VOLK_ATTR_IMPORT
|
||||||
|
#define __VOLK_ASM __asm__
|
||||||
|
#define __VOLK_VOLATILE __volatile__
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////
|
||||||
// Ignore annoying warnings in MSVC
|
// Ignore annoying warnings in MSVC
|
||||||
////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////
|
||||||
#if defined(_MSC_VER)
|
#if defined(_MSC_VER)
|
||||||
# pragma warning(disable: 4244) //'conversion' conversion from 'type1' to 'type2', possible loss of data
|
#pragma warning(disable : 4244) //'conversion' conversion from 'type1' to 'type2', possible loss of data
|
||||||
# pragma warning(disable: 4305) //'identifier' : truncation from 'type1' to 'type2'
|
#pragma warning(disable : 4305) //'identifier' : truncation from 'type1' to 'type2'
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////
|
||||||
@ -91,11 +91,13 @@
|
|||||||
// FIXME: due to the usage of complex.h, require gcc for c-linkage
|
// FIXME: due to the usage of complex.h, require gcc for c-linkage
|
||||||
////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////
|
||||||
#if defined(__cplusplus) && (__GNUC__)
|
#if defined(__cplusplus) && (__GNUC__)
|
||||||
# define __VOLK_DECL_BEGIN extern "C" {
|
#define __VOLK_DECL_BEGIN \
|
||||||
# define __VOLK_DECL_END }
|
extern "C" \
|
||||||
|
{
|
||||||
|
#define __VOLK_DECL_END }
|
||||||
#else
|
#else
|
||||||
# define __VOLK_DECL_BEGIN
|
#define __VOLK_DECL_BEGIN
|
||||||
# define __VOLK_DECL_END
|
#define __VOLK_DECL_END
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////
|
||||||
@ -103,9 +105,9 @@
|
|||||||
// http://gcc.gnu.org/wiki/Visibility
|
// http://gcc.gnu.org/wiki/Visibility
|
||||||
////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////
|
||||||
#ifdef volk_gnsssdr_EXPORTS
|
#ifdef volk_gnsssdr_EXPORTS
|
||||||
# define VOLK_API __VOLK_ATTR_EXPORT
|
#define VOLK_API __VOLK_ATTR_EXPORT
|
||||||
#else
|
#else
|
||||||
# define VOLK_API __VOLK_ATTR_IMPORT
|
#define VOLK_API __VOLK_ATTR_IMPORT
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////
|
||||||
@ -121,35 +123,37 @@
|
|||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
union bit128{
|
union bit128
|
||||||
uint8_t i8[16];
|
{
|
||||||
uint16_t i16[8];
|
uint8_t i8[16];
|
||||||
uint32_t i[4];
|
uint16_t i16[8];
|
||||||
float f[4];
|
uint32_t i[4];
|
||||||
double d[2];
|
float f[4];
|
||||||
|
double d[2];
|
||||||
|
|
||||||
#ifdef LV_HAVE_SSE
|
#ifdef LV_HAVE_SSE
|
||||||
__m128 float_vec;
|
__m128 float_vec;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef LV_HAVE_SSE2
|
#ifdef LV_HAVE_SSE2
|
||||||
__m128i int_vec;
|
__m128i int_vec;
|
||||||
__m128d double_vec;
|
__m128d double_vec;
|
||||||
#endif
|
#endif
|
||||||
};
|
};
|
||||||
|
|
||||||
union bit256{
|
union bit256
|
||||||
uint8_t i8[32];
|
{
|
||||||
uint16_t i16[16];
|
uint8_t i8[32];
|
||||||
uint32_t i[8];
|
uint16_t i16[16];
|
||||||
float f[8];
|
uint32_t i[8];
|
||||||
double d[4];
|
float f[8];
|
||||||
|
double d[4];
|
||||||
|
|
||||||
#ifdef LV_HAVE_AVX
|
#ifdef LV_HAVE_AVX
|
||||||
__m256 float_vec;
|
__m256 float_vec;
|
||||||
__m256i int_vec;
|
__m256i int_vec;
|
||||||
__m256d double_vec;
|
__m256d double_vec;
|
||||||
#endif
|
#endif
|
||||||
};
|
};
|
||||||
|
|
||||||
#define bit128_p(x) ((union bit128 *)(x))
|
#define bit128_p(x) ((union bit128 *)(x))
|
||||||
|
@ -48,26 +48,34 @@
|
|||||||
#include <complex>
|
#include <complex>
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
|
|
||||||
typedef std::complex<int8_t> lv_8sc_t;
|
typedef std::complex<int8_t> lv_8sc_t;
|
||||||
typedef std::complex<int16_t> lv_16sc_t;
|
typedef std::complex<int16_t> lv_16sc_t;
|
||||||
typedef std::complex<int32_t> lv_32sc_t;
|
typedef std::complex<int32_t> lv_32sc_t;
|
||||||
typedef std::complex<int64_t> lv_64sc_t;
|
typedef std::complex<int64_t> lv_64sc_t;
|
||||||
typedef std::complex<float> lv_32fc_t;
|
typedef std::complex<float> lv_32fc_t;
|
||||||
typedef std::complex<double> lv_64fc_t;
|
typedef std::complex<double> lv_64fc_t;
|
||||||
|
|
||||||
template <typename T> inline std::complex<T> lv_cmake(const T &r, const T &i){
|
template <typename T>
|
||||||
|
inline std::complex<T> lv_cmake(const T &r, const T &i)
|
||||||
|
{
|
||||||
return std::complex<T>(r, i);
|
return std::complex<T>(r, i);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T> inline typename T::value_type lv_creal(const T &x){
|
template <typename T>
|
||||||
|
inline typename T::value_type lv_creal(const T &x)
|
||||||
|
{
|
||||||
return x.real();
|
return x.real();
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T> inline typename T::value_type lv_cimag(const T &x){
|
template <typename T>
|
||||||
|
inline typename T::value_type lv_cimag(const T &x)
|
||||||
|
{
|
||||||
return x.imag();
|
return x.imag();
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T> inline T lv_conj(const T &x){
|
template <typename T>
|
||||||
|
inline T lv_conj(const T &x)
|
||||||
|
{
|
||||||
return std::conj(x);
|
return std::conj(x);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -80,14 +88,14 @@ template <typename T> inline T lv_conj(const T &x){
|
|||||||
|
|
||||||
#include <complex.h>
|
#include <complex.h>
|
||||||
|
|
||||||
typedef char complex lv_8sc_t;
|
typedef char complex lv_8sc_t;
|
||||||
typedef short complex lv_16sc_t;
|
typedef short complex lv_16sc_t;
|
||||||
typedef long complex lv_32sc_t;
|
typedef long complex lv_32sc_t;
|
||||||
typedef long long complex lv_64sc_t;
|
typedef long long complex lv_64sc_t;
|
||||||
typedef float complex lv_32fc_t;
|
typedef float complex lv_32fc_t;
|
||||||
typedef double complex lv_64fc_t;
|
typedef double complex lv_64fc_t;
|
||||||
|
|
||||||
#define lv_cmake(r, i) ((r) + _Complex_I*(i))
|
#define lv_cmake(r, i) ((r) + _Complex_I * (i))
|
||||||
|
|
||||||
// When GNUC is available, use the complex extensions.
|
// When GNUC is available, use the complex extensions.
|
||||||
// The extensions always return the correct value type.
|
// The extensions always return the correct value type.
|
||||||
|
@ -27,30 +27,30 @@
|
|||||||
|
|
||||||
#include <arm_neon.h>
|
#include <arm_neon.h>
|
||||||
|
|
||||||
static inline float32x4_t vdivq_f32( float32x4_t num, float32x4_t den )
|
static inline float32x4_t vdivq_f32(float32x4_t num, float32x4_t den)
|
||||||
{
|
{
|
||||||
const float32x4_t q_inv0 = vrecpeq_f32( den );
|
const float32x4_t q_inv0 = vrecpeq_f32(den);
|
||||||
const float32x4_t q_step0 = vrecpsq_f32( q_inv0, den );
|
const float32x4_t q_step0 = vrecpsq_f32(q_inv0, den);
|
||||||
|
|
||||||
const float32x4_t q_inv1 = vmulq_f32( q_step0, q_inv0 );
|
const float32x4_t q_inv1 = vmulq_f32(q_step0, q_inv0);
|
||||||
return vmulq_f32( num, q_inv1 );
|
return vmulq_f32(num, q_inv1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static inline float32x4_t vsqrtq_f32( float32x4_t q_x )
|
static inline float32x4_t vsqrtq_f32(float32x4_t q_x)
|
||||||
{
|
{
|
||||||
const float32x4_t q_step_0 = vrsqrteq_f32( q_x );
|
const float32x4_t q_step_0 = vrsqrteq_f32(q_x);
|
||||||
// step
|
// step
|
||||||
const float32x4_t q_step_parm0 = vmulq_f32( q_x, q_step_0 );
|
const float32x4_t q_step_parm0 = vmulq_f32(q_x, q_step_0);
|
||||||
const float32x4_t q_step_result0 = vrsqrtsq_f32( q_step_parm0, q_step_0 );
|
const float32x4_t q_step_result0 = vrsqrtsq_f32(q_step_parm0, q_step_0);
|
||||||
// step
|
// step
|
||||||
const float32x4_t q_step_1 = vmulq_f32( q_step_0, q_step_result0 );
|
const float32x4_t q_step_1 = vmulq_f32(q_step_0, q_step_result0);
|
||||||
const float32x4_t q_step_parm1 = vmulq_f32( q_x, q_step_1 );
|
const float32x4_t q_step_parm1 = vmulq_f32(q_x, q_step_1);
|
||||||
const float32x4_t q_step_result1 = vrsqrtsq_f32( q_step_parm1, q_step_1 );
|
const float32x4_t q_step_result1 = vrsqrtsq_f32(q_step_parm1, q_step_1);
|
||||||
// take the res
|
// take the res
|
||||||
const float32x4_t q_step_2 = vmulq_f32( q_step_1, q_step_result1 );
|
const float32x4_t q_step_2 = vmulq_f32(q_step_1, q_step_result1);
|
||||||
// mul by x to get sqrt, not rsqrt
|
// mul by x to get sqrt, not rsqrt
|
||||||
return vmulq_f32( q_x, q_step_2 );
|
return vmulq_f32(q_x, q_step_2);
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif /* INCLUDED_VOLK_GNSSSDR_NEON_INTRINSICS_H_ */
|
#endif /* INCLUDED_VOLK_GNSSSDR_NEON_INTRINSICS_H_ */
|
||||||
|
@ -32,9 +32,9 @@ __VOLK_DECL_BEGIN
|
|||||||
|
|
||||||
typedef struct volk_gnsssdr_arch_pref
|
typedef struct volk_gnsssdr_arch_pref
|
||||||
{
|
{
|
||||||
char name[128]; //name of the kernel
|
char name[128]; //name of the kernel
|
||||||
char impl_a[128]; //best aligned impl
|
char impl_a[128]; //best aligned impl
|
||||||
char impl_u[128]; //best unaligned impl
|
char impl_u[128]; //best unaligned impl
|
||||||
} volk_gnsssdr_arch_pref_t;
|
} volk_gnsssdr_arch_pref_t;
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////
|
||||||
|
File diff suppressed because it is too large
Load Diff
@ -30,33 +30,35 @@
|
|||||||
static inline __m128
|
static inline __m128
|
||||||
_mm_complexmul_ps(__m128 x, __m128 y)
|
_mm_complexmul_ps(__m128 x, __m128 y)
|
||||||
{
|
{
|
||||||
__m128 yl, yh, tmp1, tmp2;
|
__m128 yl, yh, tmp1, tmp2;
|
||||||
yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
|
yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
|
||||||
yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
|
yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
|
||||||
tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
||||||
x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br
|
x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br
|
||||||
tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
|
tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
|
||||||
return _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
return _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline __m128
|
static inline __m128
|
||||||
_mm_complexconjugatemul_ps(__m128 x, __m128 y)
|
_mm_complexconjugatemul_ps(__m128 x, __m128 y)
|
||||||
{
|
{
|
||||||
const __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f);
|
const __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f);
|
||||||
y = _mm_xor_ps(y, conjugator); // conjugate y
|
y = _mm_xor_ps(y, conjugator); // conjugate y
|
||||||
return _mm_complexmul_ps(x, y);
|
return _mm_complexmul_ps(x, y);
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline __m128
|
static inline __m128
|
||||||
_mm_magnitudesquared_ps_sse3(__m128 cplxValue1, __m128 cplxValue2){
|
_mm_magnitudesquared_ps_sse3(__m128 cplxValue1, __m128 cplxValue2)
|
||||||
cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
|
{
|
||||||
cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
|
cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
|
||||||
return _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
|
cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
|
||||||
|
return _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline __m128
|
static inline __m128
|
||||||
_mm_magnitude_ps_sse3(__m128 cplxValue1, __m128 cplxValue2){
|
_mm_magnitude_ps_sse3(__m128 cplxValue1, __m128 cplxValue2)
|
||||||
return _mm_sqrt_ps(_mm_magnitudesquared_ps_sse3(cplxValue1, cplxValue2));
|
{
|
||||||
|
return _mm_sqrt_ps(_mm_magnitudesquared_ps_sse3(cplxValue1, cplxValue2));
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif /* INCLUDE_VOLK_VOLK_SSE3_INTRINSICS_H_ */
|
#endif /* INCLUDE_VOLK_VOLK_SSE3_INTRINSICS_H_ */
|
||||||
|
@ -27,20 +27,22 @@
|
|||||||
#include <xmmintrin.h>
|
#include <xmmintrin.h>
|
||||||
|
|
||||||
static inline __m128
|
static inline __m128
|
||||||
_mm_magnitudesquared_ps(__m128 cplxValue1, __m128 cplxValue2){
|
_mm_magnitudesquared_ps(__m128 cplxValue1, __m128 cplxValue2)
|
||||||
__m128 iValue, qValue;
|
{
|
||||||
// Arrange in i1i2i3i4 format
|
__m128 iValue, qValue;
|
||||||
iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
|
// Arrange in i1i2i3i4 format
|
||||||
// Arrange in q1q2q3q4 format
|
iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
|
||||||
qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
|
// Arrange in q1q2q3q4 format
|
||||||
iValue = _mm_mul_ps(iValue, iValue); // Square the I values
|
qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
|
||||||
qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values
|
iValue = _mm_mul_ps(iValue, iValue); // Square the I values
|
||||||
return _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values
|
qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values
|
||||||
|
return _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline __m128
|
static inline __m128
|
||||||
_mm_magnitude_ps(__m128 cplxValue1, __m128 cplxValue2){
|
_mm_magnitude_ps(__m128 cplxValue1, __m128 cplxValue2)
|
||||||
return _mm_sqrt_ps(_mm_magnitudesquared_ps(cplxValue1, cplxValue2));
|
{
|
||||||
|
return _mm_sqrt_ps(_mm_magnitudesquared_ps(cplxValue1, cplxValue2));
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif /* INCLUDED_VOLK_VOLK_SSE_INTRINSICS_H_ */
|
#endif /* INCLUDED_VOLK_VOLK_SSE_INTRINSICS_H_ */
|
||||||
|
@ -45,55 +45,55 @@
|
|||||||
static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_generic(int16_t* result, const int16_t* local_code, unsigned int num_points)
|
static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_generic(int16_t* result, const int16_t* local_code, unsigned int num_points)
|
||||||
{
|
{
|
||||||
int code_length_chips = 2046;
|
int code_length_chips = 2046;
|
||||||
float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points );
|
float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
|
||||||
int num_out_vectors = 3;
|
int num_out_vectors = 3;
|
||||||
unsigned int n;
|
unsigned int n;
|
||||||
float rem_code_phase_chips = -0.234;
|
float rem_code_phase_chips = -0.234;
|
||||||
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
|
float shifts_chips[3] = {-0.1, 0.0, 0.1};
|
||||||
int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
|
int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
|
||||||
|
|
||||||
for(n = 0; n < num_out_vectors; n++)
|
for (n = 0; n < num_out_vectors; n++)
|
||||||
{
|
{
|
||||||
result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
|
result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
|
||||||
}
|
}
|
||||||
|
|
||||||
volk_gnsssdr_16i_xn_resampler_16i_xn_generic(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
|
volk_gnsssdr_16i_xn_resampler_16i_xn_generic(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
|
||||||
|
|
||||||
memcpy((int16_t*)result, (int16_t*)result_aux[0], sizeof(int16_t) * num_points);
|
memcpy((int16_t*)result, (int16_t*)result_aux[0], sizeof(int16_t) * num_points);
|
||||||
|
|
||||||
for(n = 0; n < num_out_vectors; n++)
|
for (n = 0; n < num_out_vectors; n++)
|
||||||
{
|
{
|
||||||
volk_gnsssdr_free(result_aux[n]);
|
volk_gnsssdr_free(result_aux[n]);
|
||||||
}
|
}
|
||||||
volk_gnsssdr_free(result_aux);
|
volk_gnsssdr_free(result_aux);
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif /* LV_HAVE_GENERIC */
|
#endif /* LV_HAVE_GENERIC */
|
||||||
|
|
||||||
#ifdef LV_HAVE_SSE3
|
#ifdef LV_HAVE_SSE3
|
||||||
static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_a_sse3(int16_t* result, const int16_t* local_code, unsigned int num_points)
|
static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_a_sse3(int16_t* result, const int16_t* local_code, unsigned int num_points)
|
||||||
{
|
{
|
||||||
int code_length_chips = 2046;
|
int code_length_chips = 2046;
|
||||||
float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points );
|
float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
|
||||||
int num_out_vectors = 3;
|
int num_out_vectors = 3;
|
||||||
float rem_code_phase_chips = -0.234;
|
float rem_code_phase_chips = -0.234;
|
||||||
unsigned int n;
|
unsigned int n;
|
||||||
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
|
float shifts_chips[3] = {-0.1, 0.0, 0.1};
|
||||||
int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
|
int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
|
||||||
|
|
||||||
for(n = 0; n < num_out_vectors; n++)
|
for (n = 0; n < num_out_vectors; n++)
|
||||||
{
|
{
|
||||||
result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
|
result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
|
||||||
}
|
}
|
||||||
|
|
||||||
volk_gnsssdr_16i_xn_resampler_16i_xn_a_sse3(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
|
volk_gnsssdr_16i_xn_resampler_16i_xn_a_sse3(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
|
||||||
|
|
||||||
memcpy((int16_t*)result, (int16_t*)result_aux[0], sizeof(int16_t) * num_points);
|
memcpy((int16_t*)result, (int16_t*)result_aux[0], sizeof(int16_t) * num_points);
|
||||||
|
|
||||||
for(n = 0; n < num_out_vectors; n++)
|
for (n = 0; n < num_out_vectors; n++)
|
||||||
{
|
{
|
||||||
volk_gnsssdr_free(result_aux[n]);
|
volk_gnsssdr_free(result_aux[n]);
|
||||||
}
|
}
|
||||||
volk_gnsssdr_free(result_aux);
|
volk_gnsssdr_free(result_aux);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -103,26 +103,26 @@ static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_a_sse3(int16_t* result
|
|||||||
static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_u_sse3(int16_t* result, const int16_t* local_code, unsigned int num_points)
|
static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_u_sse3(int16_t* result, const int16_t* local_code, unsigned int num_points)
|
||||||
{
|
{
|
||||||
int code_length_chips = 2046;
|
int code_length_chips = 2046;
|
||||||
float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points );
|
float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
|
||||||
int num_out_vectors = 3;
|
int num_out_vectors = 3;
|
||||||
float rem_code_phase_chips = -0.234;
|
float rem_code_phase_chips = -0.234;
|
||||||
unsigned int n;
|
unsigned int n;
|
||||||
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
|
float shifts_chips[3] = {-0.1, 0.0, 0.1};
|
||||||
int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
|
int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
|
||||||
|
|
||||||
for(n = 0; n < num_out_vectors; n++)
|
for (n = 0; n < num_out_vectors; n++)
|
||||||
{
|
{
|
||||||
result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
|
result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
|
||||||
}
|
}
|
||||||
|
|
||||||
volk_gnsssdr_16i_xn_resampler_16i_xn_u_sse3(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
|
volk_gnsssdr_16i_xn_resampler_16i_xn_u_sse3(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
|
||||||
|
|
||||||
memcpy((int16_t*)result, (int16_t*)result_aux[0], sizeof(int16_t) * num_points);
|
memcpy((int16_t*)result, (int16_t*)result_aux[0], sizeof(int16_t) * num_points);
|
||||||
|
|
||||||
for(n = 0; n < num_out_vectors; n++)
|
for (n = 0; n < num_out_vectors; n++)
|
||||||
{
|
{
|
||||||
volk_gnsssdr_free(result_aux[n]);
|
volk_gnsssdr_free(result_aux[n]);
|
||||||
}
|
}
|
||||||
volk_gnsssdr_free(result_aux);
|
volk_gnsssdr_free(result_aux);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -133,26 +133,26 @@ static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_u_sse3(int16_t* result
|
|||||||
static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_u_sse4_1(int16_t* result, const int16_t* local_code, unsigned int num_points)
|
static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_u_sse4_1(int16_t* result, const int16_t* local_code, unsigned int num_points)
|
||||||
{
|
{
|
||||||
int code_length_chips = 2046;
|
int code_length_chips = 2046;
|
||||||
float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points );
|
float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
|
||||||
int num_out_vectors = 3;
|
int num_out_vectors = 3;
|
||||||
float rem_code_phase_chips = -0.234;
|
float rem_code_phase_chips = -0.234;
|
||||||
unsigned int n;
|
unsigned int n;
|
||||||
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
|
float shifts_chips[3] = {-0.1, 0.0, 0.1};
|
||||||
int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
|
int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
|
||||||
|
|
||||||
for(n = 0; n < num_out_vectors; n++)
|
for (n = 0; n < num_out_vectors; n++)
|
||||||
{
|
{
|
||||||
result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
|
result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
|
||||||
}
|
}
|
||||||
|
|
||||||
volk_gnsssdr_16i_xn_resampler_16i_xn_u_sse4_1(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
|
volk_gnsssdr_16i_xn_resampler_16i_xn_u_sse4_1(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
|
||||||
|
|
||||||
memcpy((int16_t*)result, (int16_t*)result_aux[0], sizeof(int16_t) * num_points);
|
memcpy((int16_t*)result, (int16_t*)result_aux[0], sizeof(int16_t) * num_points);
|
||||||
|
|
||||||
for(n = 0; n < num_out_vectors; n++)
|
for (n = 0; n < num_out_vectors; n++)
|
||||||
{
|
{
|
||||||
volk_gnsssdr_free(result_aux[n]);
|
volk_gnsssdr_free(result_aux[n]);
|
||||||
}
|
}
|
||||||
volk_gnsssdr_free(result_aux);
|
volk_gnsssdr_free(result_aux);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -163,26 +163,26 @@ static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_u_sse4_1(int16_t* resu
|
|||||||
static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_a_sse4_1(int16_t* result, const int16_t* local_code, unsigned int num_points)
|
static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_a_sse4_1(int16_t* result, const int16_t* local_code, unsigned int num_points)
|
||||||
{
|
{
|
||||||
int code_length_chips = 2046;
|
int code_length_chips = 2046;
|
||||||
float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points );
|
float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
|
||||||
int num_out_vectors = 3;
|
int num_out_vectors = 3;
|
||||||
float rem_code_phase_chips = -0.234;
|
float rem_code_phase_chips = -0.234;
|
||||||
unsigned int n;
|
unsigned int n;
|
||||||
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
|
float shifts_chips[3] = {-0.1, 0.0, 0.1};
|
||||||
int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
|
int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
|
||||||
|
|
||||||
for(n = 0; n < num_out_vectors; n++)
|
for (n = 0; n < num_out_vectors; n++)
|
||||||
{
|
{
|
||||||
result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
|
result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
|
||||||
}
|
}
|
||||||
|
|
||||||
volk_gnsssdr_16i_xn_resampler_16i_xn_a_sse4_1(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
|
volk_gnsssdr_16i_xn_resampler_16i_xn_a_sse4_1(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
|
||||||
|
|
||||||
memcpy((int16_t*)result, (int16_t*)result_aux[0], sizeof(int16_t) * num_points);
|
memcpy((int16_t*)result, (int16_t*)result_aux[0], sizeof(int16_t) * num_points);
|
||||||
|
|
||||||
for(n = 0; n < num_out_vectors; n++)
|
for (n = 0; n < num_out_vectors; n++)
|
||||||
{
|
{
|
||||||
volk_gnsssdr_free(result_aux[n]);
|
volk_gnsssdr_free(result_aux[n]);
|
||||||
}
|
}
|
||||||
volk_gnsssdr_free(result_aux);
|
volk_gnsssdr_free(result_aux);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -193,26 +193,26 @@ static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_a_sse4_1(int16_t* resu
|
|||||||
static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_u_avx(int16_t* result, const int16_t* local_code, unsigned int num_points)
|
static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_u_avx(int16_t* result, const int16_t* local_code, unsigned int num_points)
|
||||||
{
|
{
|
||||||
int code_length_chips = 2046;
|
int code_length_chips = 2046;
|
||||||
float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points );
|
float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
|
||||||
int num_out_vectors = 3;
|
int num_out_vectors = 3;
|
||||||
float rem_code_phase_chips = -0.234;
|
float rem_code_phase_chips = -0.234;
|
||||||
unsigned int n;
|
unsigned int n;
|
||||||
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
|
float shifts_chips[3] = {-0.1, 0.0, 0.1};
|
||||||
int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
|
int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
|
||||||
|
|
||||||
for(n = 0; n < num_out_vectors; n++)
|
for (n = 0; n < num_out_vectors; n++)
|
||||||
{
|
{
|
||||||
result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
|
result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
|
||||||
}
|
}
|
||||||
|
|
||||||
volk_gnsssdr_16i_xn_resampler_16i_xn_u_avx(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
|
volk_gnsssdr_16i_xn_resampler_16i_xn_u_avx(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
|
||||||
|
|
||||||
memcpy((int16_t*)result, (int16_t*)result_aux[0], sizeof(int16_t) * num_points);
|
memcpy((int16_t*)result, (int16_t*)result_aux[0], sizeof(int16_t) * num_points);
|
||||||
|
|
||||||
for(n = 0; n < num_out_vectors; n++)
|
for (n = 0; n < num_out_vectors; n++)
|
||||||
{
|
{
|
||||||
volk_gnsssdr_free(result_aux[n]);
|
volk_gnsssdr_free(result_aux[n]);
|
||||||
}
|
}
|
||||||
volk_gnsssdr_free(result_aux);
|
volk_gnsssdr_free(result_aux);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -223,26 +223,26 @@ static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_u_avx(int16_t* result,
|
|||||||
static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_a_avx(int16_t* result, const int16_t* local_code, unsigned int num_points)
|
static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_a_avx(int16_t* result, const int16_t* local_code, unsigned int num_points)
|
||||||
{
|
{
|
||||||
int code_length_chips = 2046;
|
int code_length_chips = 2046;
|
||||||
float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points );
|
float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
|
||||||
int num_out_vectors = 3;
|
int num_out_vectors = 3;
|
||||||
float rem_code_phase_chips = -0.234;
|
float rem_code_phase_chips = -0.234;
|
||||||
unsigned int n;
|
unsigned int n;
|
||||||
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
|
float shifts_chips[3] = {-0.1, 0.0, 0.1};
|
||||||
int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
|
int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
|
||||||
|
|
||||||
for(n = 0; n < num_out_vectors; n++)
|
for (n = 0; n < num_out_vectors; n++)
|
||||||
{
|
{
|
||||||
result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
|
result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
|
||||||
}
|
}
|
||||||
|
|
||||||
volk_gnsssdr_16i_xn_resampler_16i_xn_a_avx(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
|
volk_gnsssdr_16i_xn_resampler_16i_xn_a_avx(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
|
||||||
|
|
||||||
memcpy((int16_t*)result, (int16_t*)result_aux[0], sizeof(int16_t) * num_points);
|
memcpy((int16_t*)result, (int16_t*)result_aux[0], sizeof(int16_t) * num_points);
|
||||||
|
|
||||||
for(n = 0; n < num_out_vectors; n++)
|
for (n = 0; n < num_out_vectors; n++)
|
||||||
{
|
{
|
||||||
volk_gnsssdr_free(result_aux[n]);
|
volk_gnsssdr_free(result_aux[n]);
|
||||||
}
|
}
|
||||||
volk_gnsssdr_free(result_aux);
|
volk_gnsssdr_free(result_aux);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -253,30 +253,29 @@ static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_a_avx(int16_t* result,
|
|||||||
static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_neon(int16_t* result, const int16_t* local_code, unsigned int num_points)
|
static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_neon(int16_t* result, const int16_t* local_code, unsigned int num_points)
|
||||||
{
|
{
|
||||||
int code_length_chips = 2046;
|
int code_length_chips = 2046;
|
||||||
float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points );
|
float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
|
||||||
int num_out_vectors = 3;
|
int num_out_vectors = 3;
|
||||||
float rem_code_phase_chips = -0.234;
|
float rem_code_phase_chips = -0.234;
|
||||||
unsigned int n;
|
unsigned int n;
|
||||||
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
|
float shifts_chips[3] = {-0.1, 0.0, 0.1};
|
||||||
int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
|
int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
|
||||||
|
|
||||||
for(n = 0; n < num_out_vectors; n++)
|
for (n = 0; n < num_out_vectors; n++)
|
||||||
{
|
{
|
||||||
result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
|
result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
|
||||||
}
|
}
|
||||||
|
|
||||||
volk_gnsssdr_16i_xn_resampler_16i_xn_neon(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
|
volk_gnsssdr_16i_xn_resampler_16i_xn_neon(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
|
||||||
|
|
||||||
memcpy((int16_t*)result, (int16_t*)result_aux[0], sizeof(int16_t) * num_points);
|
memcpy((int16_t*)result, (int16_t*)result_aux[0], sizeof(int16_t) * num_points);
|
||||||
|
|
||||||
for(n = 0; n < num_out_vectors; n++)
|
for (n = 0; n < num_out_vectors; n++)
|
||||||
{
|
{
|
||||||
volk_gnsssdr_free(result_aux[n]);
|
volk_gnsssdr_free(result_aux[n]);
|
||||||
}
|
}
|
||||||
volk_gnsssdr_free(result_aux);
|
volk_gnsssdr_free(result_aux);
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#endif // INCLUDED_volk_gnsssdr_16i_resamplerpuppet_16i_H
|
#endif // INCLUDED_volk_gnsssdr_16i_resamplerpuppet_16i_H
|
||||||
|
|
||||||
|
@ -107,7 +107,8 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_a_sse4_1(int16_t** resul
|
|||||||
const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
|
const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
|
||||||
const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
|
const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4];
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
int local_code_chip_index[4];
|
||||||
int local_code_chip_index_;
|
int local_code_chip_index_;
|
||||||
|
|
||||||
const __m128i zeros = _mm_setzero_si128();
|
const __m128i zeros = _mm_setzero_si128();
|
||||||
@ -121,7 +122,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_a_sse4_1(int16_t** resul
|
|||||||
shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]);
|
shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]);
|
||||||
aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
|
aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
|
||||||
__m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
|
__m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
|
||||||
for(n = 0; n < quarterPoints; n++)
|
for (n = 0; n < quarterPoints; n++)
|
||||||
{
|
{
|
||||||
aux = _mm_mul_ps(code_phase_step_chips_reg, indexn);
|
aux = _mm_mul_ps(code_phase_step_chips_reg, indexn);
|
||||||
aux = _mm_add_ps(aux, aux2);
|
aux = _mm_add_ps(aux, aux2);
|
||||||
@ -139,13 +140,13 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_a_sse4_1(int16_t** resul
|
|||||||
aux_i = _mm_and_si128(code_length_chips_reg_i, negatives);
|
aux_i = _mm_and_si128(code_length_chips_reg_i, negatives);
|
||||||
local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i);
|
local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i);
|
||||||
_mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg);
|
_mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg);
|
||||||
for(k = 0; k < 4; ++k)
|
for (k = 0; k < 4; ++k)
|
||||||
{
|
{
|
||||||
_result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
|
_result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
|
||||||
}
|
}
|
||||||
indexn = _mm_add_ps(indexn, fours);
|
indexn = _mm_add_ps(indexn, fours);
|
||||||
}
|
}
|
||||||
for(n = quarterPoints * 4; n < num_points; n++)
|
for (n = quarterPoints * 4; n < num_points; n++)
|
||||||
{
|
{
|
||||||
// resample code for current tap
|
// resample code for current tap
|
||||||
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
|
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
|
||||||
@ -157,7 +158,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_a_sse4_1(int16_t** resul
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
#ifdef LV_HAVE_SSE4_1
|
#ifdef LV_HAVE_SSE4_1
|
||||||
@ -173,7 +174,8 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_u_sse4_1(int16_t** resul
|
|||||||
const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
|
const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
|
||||||
const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
|
const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4];
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
int local_code_chip_index[4];
|
||||||
int local_code_chip_index_;
|
int local_code_chip_index_;
|
||||||
|
|
||||||
const __m128i zeros = _mm_setzero_si128();
|
const __m128i zeros = _mm_setzero_si128();
|
||||||
@ -187,7 +189,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_u_sse4_1(int16_t** resul
|
|||||||
shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]);
|
shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]);
|
||||||
aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
|
aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
|
||||||
__m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
|
__m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
|
||||||
for(n = 0; n < quarterPoints; n++)
|
for (n = 0; n < quarterPoints; n++)
|
||||||
{
|
{
|
||||||
aux = _mm_mul_ps(code_phase_step_chips_reg, indexn);
|
aux = _mm_mul_ps(code_phase_step_chips_reg, indexn);
|
||||||
aux = _mm_add_ps(aux, aux2);
|
aux = _mm_add_ps(aux, aux2);
|
||||||
@ -205,13 +207,13 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_u_sse4_1(int16_t** resul
|
|||||||
aux_i = _mm_and_si128(code_length_chips_reg_i, negatives);
|
aux_i = _mm_and_si128(code_length_chips_reg_i, negatives);
|
||||||
local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i);
|
local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i);
|
||||||
_mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg);
|
_mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg);
|
||||||
for(k = 0; k < 4; ++k)
|
for (k = 0; k < 4; ++k)
|
||||||
{
|
{
|
||||||
_result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
|
_result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
|
||||||
}
|
}
|
||||||
indexn = _mm_add_ps(indexn, fours);
|
indexn = _mm_add_ps(indexn, fours);
|
||||||
}
|
}
|
||||||
for(n = quarterPoints * 4; n < num_points; n++)
|
for (n = quarterPoints * 4; n < num_points; n++)
|
||||||
{
|
{
|
||||||
// resample code for current tap
|
// resample code for current tap
|
||||||
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
|
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
|
||||||
@ -240,7 +242,8 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_a_sse3(int16_t** result,
|
|||||||
const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
|
const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
|
||||||
const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
|
const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4];
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
int local_code_chip_index[4];
|
||||||
int local_code_chip_index_;
|
int local_code_chip_index_;
|
||||||
|
|
||||||
const __m128i zeros = _mm_setzero_si128();
|
const __m128i zeros = _mm_setzero_si128();
|
||||||
@ -254,7 +257,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_a_sse3(int16_t** result,
|
|||||||
shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]);
|
shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]);
|
||||||
aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
|
aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
|
||||||
__m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
|
__m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
|
||||||
for(n = 0; n < quarterPoints; n++)
|
for (n = 0; n < quarterPoints; n++)
|
||||||
{
|
{
|
||||||
aux = _mm_mul_ps(code_phase_step_chips_reg, indexn);
|
aux = _mm_mul_ps(code_phase_step_chips_reg, indexn);
|
||||||
aux = _mm_add_ps(aux, aux2);
|
aux = _mm_add_ps(aux, aux2);
|
||||||
@ -275,13 +278,13 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_a_sse3(int16_t** result,
|
|||||||
aux_i = _mm_and_si128(code_length_chips_reg_i, negatives);
|
aux_i = _mm_and_si128(code_length_chips_reg_i, negatives);
|
||||||
local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i);
|
local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i);
|
||||||
_mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg);
|
_mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg);
|
||||||
for(k = 0; k < 4; ++k)
|
for (k = 0; k < 4; ++k)
|
||||||
{
|
{
|
||||||
_result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
|
_result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
|
||||||
}
|
}
|
||||||
indexn = _mm_add_ps(indexn, fours);
|
indexn = _mm_add_ps(indexn, fours);
|
||||||
}
|
}
|
||||||
for(n = quarterPoints * 4; n < num_points; n++)
|
for (n = quarterPoints * 4; n < num_points; n++)
|
||||||
{
|
{
|
||||||
// resample code for current tap
|
// resample code for current tap
|
||||||
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
|
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
|
||||||
@ -310,7 +313,8 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_u_sse3(int16_t** result,
|
|||||||
const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
|
const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
|
||||||
const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
|
const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4];
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
int local_code_chip_index[4];
|
||||||
int local_code_chip_index_;
|
int local_code_chip_index_;
|
||||||
|
|
||||||
const __m128i zeros = _mm_setzero_si128();
|
const __m128i zeros = _mm_setzero_si128();
|
||||||
@ -324,7 +328,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_u_sse3(int16_t** result,
|
|||||||
shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]);
|
shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]);
|
||||||
aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
|
aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
|
||||||
__m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
|
__m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
|
||||||
for(n = 0; n < quarterPoints; n++)
|
for (n = 0; n < quarterPoints; n++)
|
||||||
{
|
{
|
||||||
aux = _mm_mul_ps(code_phase_step_chips_reg, indexn);
|
aux = _mm_mul_ps(code_phase_step_chips_reg, indexn);
|
||||||
aux = _mm_add_ps(aux, aux2);
|
aux = _mm_add_ps(aux, aux2);
|
||||||
@ -345,13 +349,13 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_u_sse3(int16_t** result,
|
|||||||
aux_i = _mm_and_si128(code_length_chips_reg_i, negatives);
|
aux_i = _mm_and_si128(code_length_chips_reg_i, negatives);
|
||||||
local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i);
|
local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i);
|
||||||
_mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg);
|
_mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg);
|
||||||
for(k = 0; k < 4; ++k)
|
for (k = 0; k < 4; ++k)
|
||||||
{
|
{
|
||||||
_result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
|
_result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
|
||||||
}
|
}
|
||||||
indexn = _mm_add_ps(indexn, fours);
|
indexn = _mm_add_ps(indexn, fours);
|
||||||
}
|
}
|
||||||
for(n = quarterPoints * 4; n < num_points; n++)
|
for (n = quarterPoints * 4; n < num_points; n++)
|
||||||
{
|
{
|
||||||
// resample code for current tap
|
// resample code for current tap
|
||||||
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
|
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
|
||||||
@ -379,7 +383,8 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_a_avx(int16_t** result,
|
|||||||
const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips);
|
const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips);
|
||||||
const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips);
|
const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips);
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(32) int local_code_chip_index[8];
|
__VOLK_ATTR_ALIGNED(32)
|
||||||
|
int local_code_chip_index[8];
|
||||||
int local_code_chip_index_;
|
int local_code_chip_index_;
|
||||||
|
|
||||||
const __m256 zeros = _mm256_setzero_ps();
|
const __m256 zeros = _mm256_setzero_ps();
|
||||||
@ -394,7 +399,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_a_avx(int16_t** result,
|
|||||||
shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]);
|
shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]);
|
||||||
aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
|
aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
|
||||||
indexn = n0;
|
indexn = n0;
|
||||||
for(n = 0; n < avx_iters; n++)
|
for (n = 0; n < avx_iters; n++)
|
||||||
{
|
{
|
||||||
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0);
|
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0);
|
||||||
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3);
|
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3);
|
||||||
@ -412,13 +417,13 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_a_avx(int16_t** result,
|
|||||||
|
|
||||||
// no negatives
|
// no negatives
|
||||||
c = _mm256_cvtepi32_ps(local_code_chip_index_reg);
|
c = _mm256_cvtepi32_ps(local_code_chip_index_reg);
|
||||||
negatives = _mm256_cmp_ps(c, zeros, 0x01 );
|
negatives = _mm256_cmp_ps(c, zeros, 0x01);
|
||||||
aux3 = _mm256_and_ps(code_length_chips_reg_f, negatives);
|
aux3 = _mm256_and_ps(code_length_chips_reg_f, negatives);
|
||||||
aux = _mm256_add_ps(c, aux3);
|
aux = _mm256_add_ps(c, aux3);
|
||||||
local_code_chip_index_reg = _mm256_cvttps_epi32(aux);
|
local_code_chip_index_reg = _mm256_cvttps_epi32(aux);
|
||||||
|
|
||||||
_mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg);
|
_mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg);
|
||||||
for(k = 0; k < 8; ++k)
|
for (k = 0; k < 8; ++k)
|
||||||
{
|
{
|
||||||
_result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]];
|
_result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]];
|
||||||
}
|
}
|
||||||
@ -428,7 +433,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_a_avx(int16_t** result,
|
|||||||
_mm256_zeroupper();
|
_mm256_zeroupper();
|
||||||
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
|
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
|
||||||
{
|
{
|
||||||
for(n = avx_iters * 8; n < num_points; n++)
|
for (n = avx_iters * 8; n < num_points; n++)
|
||||||
{
|
{
|
||||||
// resample code for current tap
|
// resample code for current tap
|
||||||
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
|
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
|
||||||
@ -456,7 +461,8 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_u_avx(int16_t** result,
|
|||||||
const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips);
|
const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips);
|
||||||
const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips);
|
const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips);
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(32) int local_code_chip_index[8];
|
__VOLK_ATTR_ALIGNED(32)
|
||||||
|
int local_code_chip_index[8];
|
||||||
int local_code_chip_index_;
|
int local_code_chip_index_;
|
||||||
|
|
||||||
const __m256 zeros = _mm256_setzero_ps();
|
const __m256 zeros = _mm256_setzero_ps();
|
||||||
@ -471,7 +477,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_u_avx(int16_t** result,
|
|||||||
shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]);
|
shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]);
|
||||||
aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
|
aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
|
||||||
indexn = n0;
|
indexn = n0;
|
||||||
for(n = 0; n < avx_iters; n++)
|
for (n = 0; n < avx_iters; n++)
|
||||||
{
|
{
|
||||||
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0);
|
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0);
|
||||||
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3);
|
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3);
|
||||||
@ -489,13 +495,13 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_u_avx(int16_t** result,
|
|||||||
|
|
||||||
// no negatives
|
// no negatives
|
||||||
c = _mm256_cvtepi32_ps(local_code_chip_index_reg);
|
c = _mm256_cvtepi32_ps(local_code_chip_index_reg);
|
||||||
negatives = _mm256_cmp_ps(c, zeros, 0x01 );
|
negatives = _mm256_cmp_ps(c, zeros, 0x01);
|
||||||
aux3 = _mm256_and_ps(code_length_chips_reg_f, negatives);
|
aux3 = _mm256_and_ps(code_length_chips_reg_f, negatives);
|
||||||
aux = _mm256_add_ps(c, aux3);
|
aux = _mm256_add_ps(c, aux3);
|
||||||
local_code_chip_index_reg = _mm256_cvttps_epi32(aux);
|
local_code_chip_index_reg = _mm256_cvttps_epi32(aux);
|
||||||
|
|
||||||
_mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg);
|
_mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg);
|
||||||
for(k = 0; k < 8; ++k)
|
for (k = 0; k < 8; ++k)
|
||||||
{
|
{
|
||||||
_result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]];
|
_result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]];
|
||||||
}
|
}
|
||||||
@ -505,7 +511,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_u_avx(int16_t** result,
|
|||||||
_mm256_zeroupper();
|
_mm256_zeroupper();
|
||||||
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
|
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
|
||||||
{
|
{
|
||||||
for(n = avx_iters * 8; n < num_points; n++)
|
for (n = avx_iters * 8; n < num_points; n++)
|
||||||
{
|
{
|
||||||
// resample code for current tap
|
// resample code for current tap
|
||||||
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
|
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
|
||||||
@ -531,7 +537,8 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_neon(int16_t** result, c
|
|||||||
const float32x4_t rem_code_phase_chips_reg = vdupq_n_f32(rem_code_phase_chips);
|
const float32x4_t rem_code_phase_chips_reg = vdupq_n_f32(rem_code_phase_chips);
|
||||||
const float32x4_t code_phase_step_chips_reg = vdupq_n_f32(code_phase_step_chips);
|
const float32x4_t code_phase_step_chips_reg = vdupq_n_f32(code_phase_step_chips);
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(16) int32_t local_code_chip_index[4];
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
int32_t local_code_chip_index[4];
|
||||||
int32_t local_code_chip_index_;
|
int32_t local_code_chip_index_;
|
||||||
|
|
||||||
const int32x4_t zeros = vdupq_n_s32(0);
|
const int32x4_t zeros = vdupq_n_s32(0);
|
||||||
@ -539,11 +546,12 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_neon(int16_t** result, c
|
|||||||
const int32x4_t code_length_chips_reg_i = vdupq_n_s32((int32_t)code_length_chips);
|
const int32x4_t code_length_chips_reg_i = vdupq_n_s32((int32_t)code_length_chips);
|
||||||
int32x4_t local_code_chip_index_reg, aux_i, negatives, i;
|
int32x4_t local_code_chip_index_reg, aux_i, negatives, i;
|
||||||
float32x4_t aux, aux2, shifts_chips_reg, fi, c, j, cTrunc, base, indexn, reciprocal;
|
float32x4_t aux, aux2, shifts_chips_reg, fi, c, j, cTrunc, base, indexn, reciprocal;
|
||||||
__VOLK_ATTR_ALIGNED(16) const float vec[4] = { 0.0f, 1.0f, 2.0f, 3.0f };
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
const float vec[4] = {0.0f, 1.0f, 2.0f, 3.0f};
|
||||||
uint32x4_t igx;
|
uint32x4_t igx;
|
||||||
reciprocal = vrecpeq_f32(code_length_chips_reg_f);
|
reciprocal = vrecpeq_f32(code_length_chips_reg_f);
|
||||||
reciprocal = vmulq_f32(vrecpsq_f32(code_length_chips_reg_f, reciprocal), reciprocal);
|
reciprocal = vmulq_f32(vrecpsq_f32(code_length_chips_reg_f, reciprocal), reciprocal);
|
||||||
reciprocal = vmulq_f32(vrecpsq_f32(code_length_chips_reg_f, reciprocal), reciprocal); // this refinement is required!
|
reciprocal = vmulq_f32(vrecpsq_f32(code_length_chips_reg_f, reciprocal), reciprocal); // this refinement is required!
|
||||||
float32x4_t n0 = vld1q_f32((float*)vec);
|
float32x4_t n0 = vld1q_f32((float*)vec);
|
||||||
int current_correlator_tap;
|
int current_correlator_tap;
|
||||||
unsigned int n;
|
unsigned int n;
|
||||||
@ -553,7 +561,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_neon(int16_t** result, c
|
|||||||
shifts_chips_reg = vdupq_n_f32((float)shifts_chips[current_correlator_tap]);
|
shifts_chips_reg = vdupq_n_f32((float)shifts_chips[current_correlator_tap]);
|
||||||
aux2 = vsubq_f32(shifts_chips_reg, rem_code_phase_chips_reg);
|
aux2 = vsubq_f32(shifts_chips_reg, rem_code_phase_chips_reg);
|
||||||
indexn = n0;
|
indexn = n0;
|
||||||
for(n = 0; n < neon_iters; n++)
|
for (n = 0; n < neon_iters; n++)
|
||||||
{
|
{
|
||||||
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][4 * n + 3], 1, 0);
|
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][4 * n + 3], 1, 0);
|
||||||
__VOLK_GNSSSDR_PREFETCH(&local_code_chip_index[4]);
|
__VOLK_GNSSSDR_PREFETCH(&local_code_chip_index[4]);
|
||||||
@ -569,7 +577,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_neon(int16_t** result, c
|
|||||||
|
|
||||||
// fmod
|
// fmod
|
||||||
c = vmulq_f32(aux, reciprocal);
|
c = vmulq_f32(aux, reciprocal);
|
||||||
i = vcvtq_s32_f32(c);
|
i = vcvtq_s32_f32(c);
|
||||||
cTrunc = vcvtq_f32_s32(i);
|
cTrunc = vcvtq_f32_s32(i);
|
||||||
base = vmulq_f32(cTrunc, code_length_chips_reg_f);
|
base = vmulq_f32(cTrunc, code_length_chips_reg_f);
|
||||||
aux = vsubq_f32(aux, base);
|
aux = vsubq_f32(aux, base);
|
||||||
@ -581,13 +589,13 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_neon(int16_t** result, c
|
|||||||
|
|
||||||
vst1q_s32((int32_t*)local_code_chip_index, local_code_chip_index_reg);
|
vst1q_s32((int32_t*)local_code_chip_index, local_code_chip_index_reg);
|
||||||
|
|
||||||
for(k = 0; k < 4; ++k)
|
for (k = 0; k < 4; ++k)
|
||||||
{
|
{
|
||||||
_result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
|
_result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
|
||||||
}
|
}
|
||||||
indexn = vaddq_f32(indexn, fours);
|
indexn = vaddq_f32(indexn, fours);
|
||||||
}
|
}
|
||||||
for(n = neon_iters * 4; n < num_points; n++)
|
for (n = neon_iters * 4; n < num_points; n++)
|
||||||
{
|
{
|
||||||
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][n], 1, 0);
|
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][n], 1, 0);
|
||||||
// resample code for current tap
|
// resample code for current tap
|
||||||
@ -605,4 +613,3 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_neon(int16_t** result, c
|
|||||||
|
|
||||||
|
|
||||||
#endif /*INCLUDED_volk_gnsssdr_16i_xn_resampler_16i_xn_H*/
|
#endif /*INCLUDED_volk_gnsssdr_16i_xn_resampler_16i_xn_H*/
|
||||||
|
|
||||||
|
File diff suppressed because it is too large
Load Diff
@ -41,7 +41,7 @@
|
|||||||
#include <string.h>
|
#include <string.h>
|
||||||
|
|
||||||
#ifdef LV_HAVE_GENERIC
|
#ifdef LV_HAVE_GENERIC
|
||||||
static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_generic(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points)
|
static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_generic(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points)
|
||||||
{
|
{
|
||||||
// phases must be normalized. Phase rotator expects a complex exponential input!
|
// phases must be normalized. Phase rotator expects a complex exponential input!
|
||||||
float rem_carrier_phase_in_rad = 0.345;
|
float rem_carrier_phase_in_rad = 0.345;
|
||||||
@ -53,14 +53,14 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_generic(lv
|
|||||||
unsigned int n;
|
unsigned int n;
|
||||||
int num_a_vectors = 3;
|
int num_a_vectors = 3;
|
||||||
int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
|
int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
|
||||||
for(n = 0; n < num_a_vectors; n++)
|
for (n = 0; n < num_a_vectors; n++)
|
||||||
{
|
{
|
||||||
in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
|
in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
|
||||||
memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points);
|
memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points);
|
||||||
}
|
}
|
||||||
volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_generic(result, local_code, phase_inc[0], phase,(const int16_t**) in_a, num_a_vectors, num_points);
|
volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_generic(result, local_code, phase_inc[0], phase, (const int16_t**)in_a, num_a_vectors, num_points);
|
||||||
|
|
||||||
for(n = 0; n < num_a_vectors; n++)
|
for (n = 0; n < num_a_vectors; n++)
|
||||||
{
|
{
|
||||||
volk_gnsssdr_free(in_a[n]);
|
volk_gnsssdr_free(in_a[n]);
|
||||||
}
|
}
|
||||||
@ -71,7 +71,7 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_generic(lv
|
|||||||
|
|
||||||
|
|
||||||
#ifdef LV_HAVE_GENERIC
|
#ifdef LV_HAVE_GENERIC
|
||||||
static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_generic_reload(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points)
|
static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_generic_reload(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points)
|
||||||
{
|
{
|
||||||
// phases must be normalized. Phase rotator expects a complex exponential input!
|
// phases must be normalized. Phase rotator expects a complex exponential input!
|
||||||
float rem_carrier_phase_in_rad = 0.345;
|
float rem_carrier_phase_in_rad = 0.345;
|
||||||
@ -83,14 +83,14 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_generic_re
|
|||||||
unsigned int n;
|
unsigned int n;
|
||||||
int num_a_vectors = 3;
|
int num_a_vectors = 3;
|
||||||
int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
|
int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
|
||||||
for(n = 0; n < num_a_vectors; n++)
|
for (n = 0; n < num_a_vectors; n++)
|
||||||
{
|
{
|
||||||
in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
|
in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
|
||||||
memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points);
|
memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points);
|
||||||
}
|
}
|
||||||
volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_generic_reload(result, local_code, phase_inc[0], phase,(const int16_t**) in_a, num_a_vectors, num_points);
|
volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_generic_reload(result, local_code, phase_inc[0], phase, (const int16_t**)in_a, num_a_vectors, num_points);
|
||||||
|
|
||||||
for(n = 0; n < num_a_vectors; n++)
|
for (n = 0; n < num_a_vectors; n++)
|
||||||
{
|
{
|
||||||
volk_gnsssdr_free(in_a[n]);
|
volk_gnsssdr_free(in_a[n]);
|
||||||
}
|
}
|
||||||
@ -113,50 +113,50 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_a_sse3(lv_
|
|||||||
unsigned int n;
|
unsigned int n;
|
||||||
int num_a_vectors = 3;
|
int num_a_vectors = 3;
|
||||||
int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
|
int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
|
||||||
for(n = 0; n < num_a_vectors; n++)
|
for (n = 0; n < num_a_vectors; n++)
|
||||||
{
|
{
|
||||||
in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
|
in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
|
||||||
memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points);
|
memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points);
|
||||||
}
|
}
|
||||||
|
|
||||||
volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_sse3(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points);
|
volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_sse3(result, local_code, phase_inc[0], phase, (const int16_t**)in_a, num_a_vectors, num_points);
|
||||||
|
|
||||||
for(n = 0; n < num_a_vectors; n++)
|
for (n = 0; n < num_a_vectors; n++)
|
||||||
{
|
{
|
||||||
volk_gnsssdr_free(in_a[n]);
|
volk_gnsssdr_free(in_a[n]);
|
||||||
}
|
}
|
||||||
volk_gnsssdr_free(in_a);
|
volk_gnsssdr_free(in_a);
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif // SSE3
|
#endif // SSE3
|
||||||
|
|
||||||
|
|
||||||
//#ifdef LV_HAVE_SSE3
|
//#ifdef LV_HAVE_SSE3
|
||||||
//static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_a_sse3_reload(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points)
|
//static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_a_sse3_reload(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points)
|
||||||
//{
|
//{
|
||||||
//// phases must be normalized. Phase rotator expects a complex exponential input!
|
//// phases must be normalized. Phase rotator expects a complex exponential input!
|
||||||
//float rem_carrier_phase_in_rad = 0.345;
|
//float rem_carrier_phase_in_rad = 0.345;
|
||||||
//float phase_step_rad = 0.1;
|
//float phase_step_rad = 0.1;
|
||||||
//lv_32fc_t phase[1];
|
//lv_32fc_t phase[1];
|
||||||
//phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad));
|
//phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad));
|
||||||
//lv_32fc_t phase_inc[1];
|
//lv_32fc_t phase_inc[1];
|
||||||
//phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad));
|
//phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad));
|
||||||
//unsigned int n;
|
//unsigned int n;
|
||||||
//int num_a_vectors = 3;
|
//int num_a_vectors = 3;
|
||||||
//int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
|
//int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
|
||||||
//for(n = 0; n < num_a_vectors; n++)
|
//for(n = 0; n < num_a_vectors; n++)
|
||||||
//{
|
//{
|
||||||
//in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
|
//in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
|
||||||
//memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points);
|
//memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points);
|
||||||
//}
|
//}
|
||||||
|
|
||||||
//volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_sse3_reload(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points);
|
//volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_sse3_reload(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points);
|
||||||
|
|
||||||
//for(n = 0; n < num_a_vectors; n++)
|
//for(n = 0; n < num_a_vectors; n++)
|
||||||
//{
|
//{
|
||||||
//volk_gnsssdr_free(in_a[n]);
|
//volk_gnsssdr_free(in_a[n]);
|
||||||
//}
|
//}
|
||||||
//volk_gnsssdr_free(in_a);
|
//volk_gnsssdr_free(in_a);
|
||||||
//}
|
//}
|
||||||
|
|
||||||
//#endif // SSE3
|
//#endif // SSE3
|
||||||
@ -175,22 +175,22 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_u_sse3(lv_
|
|||||||
unsigned int n;
|
unsigned int n;
|
||||||
int num_a_vectors = 3;
|
int num_a_vectors = 3;
|
||||||
int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
|
int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
|
||||||
for(n = 0; n < num_a_vectors; n++)
|
for (n = 0; n < num_a_vectors; n++)
|
||||||
{
|
{
|
||||||
in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
|
in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
|
||||||
memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points);
|
memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points);
|
||||||
}
|
}
|
||||||
|
|
||||||
volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_u_sse3(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points);
|
volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_u_sse3(result, local_code, phase_inc[0], phase, (const int16_t**)in_a, num_a_vectors, num_points);
|
||||||
|
|
||||||
for(n = 0; n < num_a_vectors; n++)
|
for (n = 0; n < num_a_vectors; n++)
|
||||||
{
|
{
|
||||||
volk_gnsssdr_free(in_a[n]);
|
volk_gnsssdr_free(in_a[n]);
|
||||||
}
|
}
|
||||||
volk_gnsssdr_free(in_a);
|
volk_gnsssdr_free(in_a);
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif // SSE3
|
#endif // SSE3
|
||||||
|
|
||||||
|
|
||||||
#ifdef LV_HAVE_AVX2
|
#ifdef LV_HAVE_AVX2
|
||||||
@ -206,50 +206,50 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_a_avx2(lv_
|
|||||||
unsigned int n;
|
unsigned int n;
|
||||||
int num_a_vectors = 3;
|
int num_a_vectors = 3;
|
||||||
int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
|
int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
|
||||||
for(n = 0; n < num_a_vectors; n++)
|
for (n = 0; n < num_a_vectors; n++)
|
||||||
{
|
{
|
||||||
in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
|
in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
|
||||||
memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points);
|
memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points);
|
||||||
}
|
}
|
||||||
|
|
||||||
volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_avx2(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points);
|
volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_avx2(result, local_code, phase_inc[0], phase, (const int16_t**)in_a, num_a_vectors, num_points);
|
||||||
|
|
||||||
for(n = 0; n < num_a_vectors; n++)
|
for (n = 0; n < num_a_vectors; n++)
|
||||||
{
|
{
|
||||||
volk_gnsssdr_free(in_a[n]);
|
volk_gnsssdr_free(in_a[n]);
|
||||||
}
|
}
|
||||||
volk_gnsssdr_free(in_a);
|
volk_gnsssdr_free(in_a);
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif // AVX2
|
#endif // AVX2
|
||||||
|
|
||||||
|
|
||||||
//#ifdef LV_HAVE_AVX2
|
//#ifdef LV_HAVE_AVX2
|
||||||
//static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_a_avx2_reload(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points)
|
//static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_a_avx2_reload(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points)
|
||||||
//{
|
//{
|
||||||
//// phases must be normalized. Phase rotator expects a complex exponential input!
|
//// phases must be normalized. Phase rotator expects a complex exponential input!
|
||||||
//float rem_carrier_phase_in_rad = 0.345;
|
//float rem_carrier_phase_in_rad = 0.345;
|
||||||
//float phase_step_rad = 0.1;
|
//float phase_step_rad = 0.1;
|
||||||
//lv_32fc_t phase[1];
|
//lv_32fc_t phase[1];
|
||||||
//phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad));
|
//phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad));
|
||||||
//lv_32fc_t phase_inc[1];
|
//lv_32fc_t phase_inc[1];
|
||||||
//phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad));
|
//phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad));
|
||||||
//unsigned int n;
|
//unsigned int n;
|
||||||
//int num_a_vectors = 3;
|
//int num_a_vectors = 3;
|
||||||
//int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
|
//int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
|
||||||
//for(n = 0; n < num_a_vectors; n++)
|
//for(n = 0; n < num_a_vectors; n++)
|
||||||
//{
|
//{
|
||||||
//in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
|
//in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
|
||||||
//memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points);
|
//memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points);
|
||||||
//}
|
//}
|
||||||
|
|
||||||
//volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_avx2_reload(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points);
|
//volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_avx2_reload(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points);
|
||||||
|
|
||||||
//for(n = 0; n < num_a_vectors; n++)
|
//for(n = 0; n < num_a_vectors; n++)
|
||||||
//{
|
//{
|
||||||
//volk_gnsssdr_free(in_a[n]);
|
//volk_gnsssdr_free(in_a[n]);
|
||||||
//}
|
//}
|
||||||
//volk_gnsssdr_free(in_a);
|
//volk_gnsssdr_free(in_a);
|
||||||
//}
|
//}
|
||||||
|
|
||||||
//#endif // AVX2
|
//#endif // AVX2
|
||||||
@ -268,50 +268,50 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_u_avx2(lv_
|
|||||||
unsigned int n;
|
unsigned int n;
|
||||||
int num_a_vectors = 3;
|
int num_a_vectors = 3;
|
||||||
int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
|
int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
|
||||||
for(n = 0; n < num_a_vectors; n++)
|
for (n = 0; n < num_a_vectors; n++)
|
||||||
{
|
{
|
||||||
in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
|
in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
|
||||||
memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points);
|
memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points);
|
||||||
}
|
}
|
||||||
|
|
||||||
volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_u_avx2(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points);
|
volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_u_avx2(result, local_code, phase_inc[0], phase, (const int16_t**)in_a, num_a_vectors, num_points);
|
||||||
|
|
||||||
for(n = 0; n < num_a_vectors; n++)
|
for (n = 0; n < num_a_vectors; n++)
|
||||||
{
|
{
|
||||||
volk_gnsssdr_free(in_a[n]);
|
volk_gnsssdr_free(in_a[n]);
|
||||||
}
|
}
|
||||||
volk_gnsssdr_free(in_a);
|
volk_gnsssdr_free(in_a);
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif // AVX2
|
#endif // AVX2
|
||||||
|
|
||||||
|
|
||||||
//#ifdef LV_HAVE_AVX2
|
//#ifdef LV_HAVE_AVX2
|
||||||
//static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_u_avx2_reload(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points)
|
//static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_u_avx2_reload(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points)
|
||||||
//{
|
//{
|
||||||
//// phases must be normalized. Phase rotator expects a complex exponential input!
|
//// phases must be normalized. Phase rotator expects a complex exponential input!
|
||||||
//float rem_carrier_phase_in_rad = 0.345;
|
//float rem_carrier_phase_in_rad = 0.345;
|
||||||
//float phase_step_rad = 0.1;
|
//float phase_step_rad = 0.1;
|
||||||
//lv_32fc_t phase[1];
|
//lv_32fc_t phase[1];
|
||||||
//phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad));
|
//phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad));
|
||||||
//lv_32fc_t phase_inc[1];
|
//lv_32fc_t phase_inc[1];
|
||||||
//phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad));
|
//phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad));
|
||||||
//unsigned int n;
|
//unsigned int n;
|
||||||
//int num_a_vectors = 3;
|
//int num_a_vectors = 3;
|
||||||
//int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
|
//int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
|
||||||
//for(n = 0; n < num_a_vectors; n++)
|
//for(n = 0; n < num_a_vectors; n++)
|
||||||
//{
|
//{
|
||||||
//in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
|
//in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
|
||||||
//memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points);
|
//memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points);
|
||||||
//}
|
//}
|
||||||
|
|
||||||
//volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_avx2_reload(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points);
|
//volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_avx2_reload(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points);
|
||||||
|
|
||||||
//for(n = 0; n < num_a_vectors; n++)
|
//for(n = 0; n < num_a_vectors; n++)
|
||||||
//{
|
//{
|
||||||
//volk_gnsssdr_free(in_a[n]);
|
//volk_gnsssdr_free(in_a[n]);
|
||||||
//}
|
//}
|
||||||
//volk_gnsssdr_free(in_a);
|
//volk_gnsssdr_free(in_a);
|
||||||
//}
|
//}
|
||||||
|
|
||||||
//#endif // AVX2
|
//#endif // AVX2
|
||||||
@ -320,29 +320,29 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_u_avx2(lv_
|
|||||||
//#ifdef LV_HAVE_NEON
|
//#ifdef LV_HAVE_NEON
|
||||||
//static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_neon(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points)
|
//static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_neon(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points)
|
||||||
//{
|
//{
|
||||||
//// phases must be normalized. Phase rotator expects a complex exponential input!
|
//// phases must be normalized. Phase rotator expects a complex exponential input!
|
||||||
//float rem_carrier_phase_in_rad = 0.345;
|
//float rem_carrier_phase_in_rad = 0.345;
|
||||||
//float phase_step_rad = 0.1;
|
//float phase_step_rad = 0.1;
|
||||||
//lv_32fc_t phase[1];
|
//lv_32fc_t phase[1];
|
||||||
//phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad));
|
//phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad));
|
||||||
//lv_32fc_t phase_inc[1];
|
//lv_32fc_t phase_inc[1];
|
||||||
//phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad));
|
//phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad));
|
||||||
//unsigned int n;
|
//unsigned int n;
|
||||||
//int num_a_vectors = 3;
|
//int num_a_vectors = 3;
|
||||||
//int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
|
//int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
|
||||||
//for(n = 0; n < num_a_vectors; n++)
|
//for(n = 0; n < num_a_vectors; n++)
|
||||||
//{
|
//{
|
||||||
//in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
|
//in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
|
||||||
//memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points);
|
//memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points);
|
||||||
//}
|
//}
|
||||||
|
|
||||||
//volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_neon(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points);
|
//volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_neon(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points);
|
||||||
|
|
||||||
//for(n = 0; n < num_a_vectors; n++)
|
//for(n = 0; n < num_a_vectors; n++)
|
||||||
//{
|
//{
|
||||||
//volk_gnsssdr_free(in_a[n]);
|
//volk_gnsssdr_free(in_a[n]);
|
||||||
//}
|
//}
|
||||||
//volk_gnsssdr_free(in_a);
|
//volk_gnsssdr_free(in_a);
|
||||||
//}
|
//}
|
||||||
|
|
||||||
//#endif // NEON
|
//#endif // NEON
|
||||||
@ -351,34 +351,31 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_u_avx2(lv_
|
|||||||
//#ifdef LV_HAVE_NEON
|
//#ifdef LV_HAVE_NEON
|
||||||
//static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_neon_vma(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points)
|
//static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_neon_vma(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points)
|
||||||
//{
|
//{
|
||||||
//// phases must be normalized. Phase rotator expects a complex exponential input!
|
//// phases must be normalized. Phase rotator expects a complex exponential input!
|
||||||
//float rem_carrier_phase_in_rad = 0.345;
|
//float rem_carrier_phase_in_rad = 0.345;
|
||||||
//float phase_step_rad = 0.1;
|
//float phase_step_rad = 0.1;
|
||||||
//lv_32fc_t phase[1];
|
//lv_32fc_t phase[1];
|
||||||
//phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad));
|
//phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad));
|
||||||
//lv_32fc_t phase_inc[1];
|
//lv_32fc_t phase_inc[1];
|
||||||
//phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad));
|
//phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad));
|
||||||
//unsigned int n;
|
//unsigned int n;
|
||||||
//int num_a_vectors = 3;
|
//int num_a_vectors = 3;
|
||||||
//int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
|
//int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
|
||||||
//for(n = 0; n < num_a_vectors; n++)
|
//for(n = 0; n < num_a_vectors; n++)
|
||||||
//{
|
//{
|
||||||
//in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
|
//in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
|
||||||
//memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points);
|
//memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points);
|
||||||
//}
|
//}
|
||||||
|
|
||||||
//volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_neon_vma(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points);
|
//volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_neon_vma(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points);
|
||||||
|
|
||||||
//for(n = 0; n < num_a_vectors; n++)
|
//for(n = 0; n < num_a_vectors; n++)
|
||||||
//{
|
//{
|
||||||
//volk_gnsssdr_free(in_a[n]);
|
//volk_gnsssdr_free(in_a[n]);
|
||||||
//}
|
//}
|
||||||
//volk_gnsssdr_free(in_a);
|
//volk_gnsssdr_free(in_a);
|
||||||
//}
|
//}
|
||||||
|
|
||||||
//#endif // NEON
|
//#endif // NEON
|
||||||
|
|
||||||
#endif // INCLUDED_volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_H
|
#endif // INCLUDED_volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_H
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -68,7 +68,7 @@ static inline void volk_gnsssdr_16ic_conjugate_16ic_generic(lv_16sc_t* cVector,
|
|||||||
const lv_16sc_t* aPtr = aVector;
|
const lv_16sc_t* aPtr = aVector;
|
||||||
unsigned int number;
|
unsigned int number;
|
||||||
|
|
||||||
for(number = 0; number < num_points; number++)
|
for (number = 0; number < num_points; number++)
|
||||||
{
|
{
|
||||||
*cPtr++ = lv_conj(*aPtr++);
|
*cPtr++ = lv_conj(*aPtr++);
|
||||||
}
|
}
|
||||||
@ -231,4 +231,3 @@ static inline void volk_gnsssdr_16ic_conjugate_16ic_u_avx2(lv_16sc_t* cVector, c
|
|||||||
//#endif /* LV_HAVE_NEON */
|
//#endif /* LV_HAVE_NEON */
|
||||||
|
|
||||||
#endif /* INCLUDED_volk_gnsssdr_16ic_conjugate_16ic_H */
|
#endif /* INCLUDED_volk_gnsssdr_16ic_conjugate_16ic_H */
|
||||||
|
|
||||||
|
@ -63,7 +63,7 @@
|
|||||||
static inline void volk_gnsssdr_16ic_convert_32fc_generic(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, unsigned int num_points)
|
static inline void volk_gnsssdr_16ic_convert_32fc_generic(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, unsigned int num_points)
|
||||||
{
|
{
|
||||||
unsigned int i;
|
unsigned int i;
|
||||||
for(i = 0; i < num_points; i++)
|
for (i = 0; i < num_points; i++)
|
||||||
{
|
{
|
||||||
outputVector[i] = lv_cmake((float)lv_creal(inputVector[i]), (float)lv_cimag(inputVector[i]));
|
outputVector[i] = lv_cmake((float)lv_creal(inputVector[i]), (float)lv_cimag(inputVector[i]));
|
||||||
}
|
}
|
||||||
@ -82,9 +82,9 @@ static inline void volk_gnsssdr_16ic_convert_32fc_a_sse2(lv_32fc_t* outputVector
|
|||||||
lv_32fc_t* _out = outputVector;
|
lv_32fc_t* _out = outputVector;
|
||||||
__m128 a;
|
__m128 a;
|
||||||
|
|
||||||
for(i = 0; i < sse_iters; i++)
|
for (i = 0; i < sse_iters; i++)
|
||||||
{
|
{
|
||||||
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
|
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
|
||||||
_mm_store_ps((float*)_out, a);
|
_mm_store_ps((float*)_out, a);
|
||||||
_in += 2;
|
_in += 2;
|
||||||
_out += 2;
|
_out += 2;
|
||||||
@ -109,9 +109,9 @@ static inline void volk_gnsssdr_16ic_convert_32fc_u_sse2(lv_32fc_t* outputVector
|
|||||||
lv_32fc_t* _out = outputVector;
|
lv_32fc_t* _out = outputVector;
|
||||||
__m128 a;
|
__m128 a;
|
||||||
|
|
||||||
for(i = 0; i < sse_iters; i++)
|
for (i = 0; i < sse_iters; i++)
|
||||||
{
|
{
|
||||||
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
|
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
|
||||||
_mm_storeu_ps((float*)_out, a);
|
_mm_storeu_ps((float*)_out, a);
|
||||||
_in += 2;
|
_in += 2;
|
||||||
_out += 2;
|
_out += 2;
|
||||||
@ -136,15 +136,15 @@ static inline void volk_gnsssdr_16ic_convert_32fc_u_axv(lv_32fc_t* outputVector,
|
|||||||
lv_32fc_t* _out = outputVector;
|
lv_32fc_t* _out = outputVector;
|
||||||
__m256 a;
|
__m256 a;
|
||||||
|
|
||||||
for(i = 0; i < sse_iters; i++)
|
for (i = 0; i < sse_iters; i++)
|
||||||
{
|
{
|
||||||
a = _mm256_set_ps((float)(lv_cimag(_in[3])), (float)(lv_creal(_in[3])), (float)(lv_cimag(_in[2])), (float)(lv_creal(_in[2])), (float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
|
a = _mm256_set_ps((float)(lv_cimag(_in[3])), (float)(lv_creal(_in[3])), (float)(lv_cimag(_in[2])), (float)(lv_creal(_in[2])), (float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
|
||||||
_mm256_storeu_ps((float*)_out, a);
|
_mm256_storeu_ps((float*)_out, a);
|
||||||
_in += 4;
|
_in += 4;
|
||||||
_out += 4;
|
_out += 4;
|
||||||
}
|
}
|
||||||
_mm256_zeroupper();
|
_mm256_zeroupper();
|
||||||
for(i = 0; i < (num_points % 4); ++i)
|
for (i = 0; i < (num_points % 4); ++i)
|
||||||
{
|
{
|
||||||
*_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in));
|
*_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in));
|
||||||
_in++;
|
_in++;
|
||||||
@ -163,15 +163,15 @@ static inline void volk_gnsssdr_16ic_convert_32fc_a_axv(lv_32fc_t* outputVector,
|
|||||||
lv_32fc_t* _out = outputVector;
|
lv_32fc_t* _out = outputVector;
|
||||||
__m256 a;
|
__m256 a;
|
||||||
|
|
||||||
for(i = 0; i < sse_iters; i++)
|
for (i = 0; i < sse_iters; i++)
|
||||||
{
|
{
|
||||||
a = _mm256_set_ps((float)(lv_cimag(_in[3])), (float)(lv_creal(_in[3])), (float)(lv_cimag(_in[2])), (float)(lv_creal(_in[2])), (float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
|
a = _mm256_set_ps((float)(lv_cimag(_in[3])), (float)(lv_creal(_in[3])), (float)(lv_cimag(_in[2])), (float)(lv_creal(_in[2])), (float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
|
||||||
_mm256_store_ps((float*)_out, a);
|
_mm256_store_ps((float*)_out, a);
|
||||||
_in += 4;
|
_in += 4;
|
||||||
_out += 4;
|
_out += 4;
|
||||||
}
|
}
|
||||||
_mm256_zeroupper();
|
_mm256_zeroupper();
|
||||||
for(i = 0; i < (num_points % 4); ++i)
|
for (i = 0; i < (num_points % 4); ++i)
|
||||||
{
|
{
|
||||||
*_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in));
|
*_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in));
|
||||||
_in++;
|
_in++;
|
||||||
@ -194,7 +194,7 @@ static inline void volk_gnsssdr_16ic_convert_32fc_neon(lv_32fc_t* outputVector,
|
|||||||
int32x4_t a32x4;
|
int32x4_t a32x4;
|
||||||
float32x4_t f32x4;
|
float32x4_t f32x4;
|
||||||
|
|
||||||
for(i = 0; i < sse_iters; i++)
|
for (i = 0; i < sse_iters; i++)
|
||||||
{
|
{
|
||||||
a16x4 = vld1_s16((const int16_t*)_in);
|
a16x4 = vld1_s16((const int16_t*)_in);
|
||||||
__VOLK_GNSSSDR_PREFETCH(_in + 4);
|
__VOLK_GNSSSDR_PREFETCH(_in + 4);
|
||||||
|
@ -78,7 +78,7 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_generic(lv_16sc_t* resu
|
|||||||
// resample code for current tap
|
// resample code for current tap
|
||||||
local_code_chip_index = round(code_phase_step_chips * (float)n + rem_code_phase_chips - 0.5f);
|
local_code_chip_index = round(code_phase_step_chips * (float)n + rem_code_phase_chips - 0.5f);
|
||||||
if (local_code_chip_index < 0.0) local_code_chip_index += code_length_chips;
|
if (local_code_chip_index < 0.0) local_code_chip_index += code_length_chips;
|
||||||
if (local_code_chip_index > (code_length_chips-1)) local_code_chip_index -= code_length_chips;
|
if (local_code_chip_index > (code_length_chips - 1)) local_code_chip_index -= code_length_chips;
|
||||||
result[n] = local_code[local_code_chip_index];
|
result[n] = local_code[local_code_chip_index];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -89,61 +89,66 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_generic(lv_16sc_t* resu
|
|||||||
#ifdef LV_HAVE_SSE2
|
#ifdef LV_HAVE_SSE2
|
||||||
#include <emmintrin.h>
|
#include <emmintrin.h>
|
||||||
|
|
||||||
static inline void volk_gnsssdr_16ic_resampler_fast_16ic_a_sse2(lv_16sc_t* result, const lv_16sc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, int code_length_chips, unsigned int num_output_samples)//, int* scratch_buffer, float* scratch_buffer_float)
|
static inline void volk_gnsssdr_16ic_resampler_fast_16ic_a_sse2(lv_16sc_t* result, const lv_16sc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, int code_length_chips, unsigned int num_output_samples) //, int* scratch_buffer, float* scratch_buffer_float)
|
||||||
{
|
{
|
||||||
_MM_SET_ROUNDING_MODE (_MM_ROUND_NEAREST);//_MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO
|
_MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); //_MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO
|
||||||
unsigned int number;
|
unsigned int number;
|
||||||
const unsigned int quarterPoints = num_output_samples / 4;
|
const unsigned int quarterPoints = num_output_samples / 4;
|
||||||
|
|
||||||
lv_16sc_t* _result = result;
|
lv_16sc_t* _result = result;
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4];
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
int local_code_chip_index[4];
|
||||||
__m128 _rem_code_phase, _code_phase_step_chips;
|
__m128 _rem_code_phase, _code_phase_step_chips;
|
||||||
__m128i _code_length_chips, _code_length_chips_minus1;
|
__m128i _code_length_chips, _code_length_chips_minus1;
|
||||||
__m128 _code_phase_out, _code_phase_out_with_offset;
|
__m128 _code_phase_out, _code_phase_out_with_offset;
|
||||||
rem_code_phase_chips = rem_code_phase_chips - 0.5f;
|
rem_code_phase_chips = rem_code_phase_chips - 0.5f;
|
||||||
|
|
||||||
_rem_code_phase = _mm_load1_ps(&rem_code_phase_chips); //load float to all four float values in m128 register
|
_rem_code_phase = _mm_load1_ps(&rem_code_phase_chips); //load float to all four float values in m128 register
|
||||||
_code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips); //load float to all four float values in m128 register
|
_code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips); //load float to all four float values in m128 register
|
||||||
__VOLK_ATTR_ALIGNED(16) int four_times_code_length_chips_minus1[4];
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
four_times_code_length_chips_minus1[0] = code_length_chips-1;
|
int four_times_code_length_chips_minus1[4];
|
||||||
four_times_code_length_chips_minus1[1] = code_length_chips-1;
|
four_times_code_length_chips_minus1[0] = code_length_chips - 1;
|
||||||
four_times_code_length_chips_minus1[2] = code_length_chips-1;
|
four_times_code_length_chips_minus1[1] = code_length_chips - 1;
|
||||||
four_times_code_length_chips_minus1[3] = code_length_chips-1;
|
four_times_code_length_chips_minus1[2] = code_length_chips - 1;
|
||||||
|
four_times_code_length_chips_minus1[3] = code_length_chips - 1;
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(16) int four_times_code_length_chips[4];
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
int four_times_code_length_chips[4];
|
||||||
four_times_code_length_chips[0] = code_length_chips;
|
four_times_code_length_chips[0] = code_length_chips;
|
||||||
four_times_code_length_chips[1] = code_length_chips;
|
four_times_code_length_chips[1] = code_length_chips;
|
||||||
four_times_code_length_chips[2] = code_length_chips;
|
four_times_code_length_chips[2] = code_length_chips;
|
||||||
four_times_code_length_chips[3] = code_length_chips;
|
four_times_code_length_chips[3] = code_length_chips;
|
||||||
|
|
||||||
_code_length_chips = _mm_load_si128((__m128i*)&four_times_code_length_chips); //load float to all four float values in m128 register
|
_code_length_chips = _mm_load_si128((__m128i*)&four_times_code_length_chips); //load float to all four float values in m128 register
|
||||||
_code_length_chips_minus1 = _mm_load_si128((__m128i*)&four_times_code_length_chips_minus1); //load float to all four float values in m128 register
|
_code_length_chips_minus1 = _mm_load_si128((__m128i*)&four_times_code_length_chips_minus1); //load float to all four float values in m128 register
|
||||||
|
|
||||||
__m128i negative_indexes, overflow_indexes, _code_phase_out_int, _code_phase_out_int_neg, _code_phase_out_int_over;
|
__m128i negative_indexes, overflow_indexes, _code_phase_out_int, _code_phase_out_int_neg, _code_phase_out_int_over;
|
||||||
|
|
||||||
__m128i zero = _mm_setzero_si128();
|
__m128i zero = _mm_setzero_si128();
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(16) float init_idx_float[4] = { 0.0f, 1.0f, 2.0f, 3.0f };
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
float init_idx_float[4] = {0.0f, 1.0f, 2.0f, 3.0f};
|
||||||
__m128 _4output_index = _mm_load_ps(init_idx_float);
|
__m128 _4output_index = _mm_load_ps(init_idx_float);
|
||||||
__VOLK_ATTR_ALIGNED(16) float init_4constant_float[4] = { 4.0f, 4.0f, 4.0f, 4.0f };
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
float init_4constant_float[4] = {4.0f, 4.0f, 4.0f, 4.0f};
|
||||||
__m128 _4constant_float = _mm_load_ps(init_4constant_float);
|
__m128 _4constant_float = _mm_load_ps(init_4constant_float);
|
||||||
|
|
||||||
for(number = 0; number < quarterPoints; number++)
|
for (number = 0; number < quarterPoints; number++)
|
||||||
{
|
{
|
||||||
_code_phase_out = _mm_mul_ps(_code_phase_step_chips, _4output_index); //compute the code phase point with the phase step
|
_code_phase_out = _mm_mul_ps(_code_phase_step_chips, _4output_index); //compute the code phase point with the phase step
|
||||||
_code_phase_out_with_offset = _mm_add_ps(_code_phase_out, _rem_code_phase); //add the phase offset
|
_code_phase_out_with_offset = _mm_add_ps(_code_phase_out, _rem_code_phase); //add the phase offset
|
||||||
_code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset); //convert to integer
|
_code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset); //convert to integer
|
||||||
|
|
||||||
negative_indexes = _mm_cmplt_epi32(_code_phase_out_int, zero); //test for negative values
|
negative_indexes = _mm_cmplt_epi32(_code_phase_out_int, zero); //test for negative values
|
||||||
_code_phase_out_int_neg = _mm_add_epi32(_code_phase_out_int, _code_length_chips); //the negative values branch
|
_code_phase_out_int_neg = _mm_add_epi32(_code_phase_out_int, _code_length_chips); //the negative values branch
|
||||||
_code_phase_out_int_neg = _mm_xor_si128(_code_phase_out_int, _mm_and_si128( negative_indexes, _mm_xor_si128( _code_phase_out_int_neg, _code_phase_out_int )));
|
_code_phase_out_int_neg = _mm_xor_si128(_code_phase_out_int, _mm_and_si128(negative_indexes, _mm_xor_si128(_code_phase_out_int_neg, _code_phase_out_int)));
|
||||||
|
|
||||||
overflow_indexes = _mm_cmpgt_epi32(_code_phase_out_int_neg, _code_length_chips_minus1); //test for overflow values
|
overflow_indexes = _mm_cmpgt_epi32(_code_phase_out_int_neg, _code_length_chips_minus1); //test for overflow values
|
||||||
_code_phase_out_int_over = _mm_sub_epi32(_code_phase_out_int_neg, _code_length_chips); //the negative values branch
|
_code_phase_out_int_over = _mm_sub_epi32(_code_phase_out_int_neg, _code_length_chips); //the negative values branch
|
||||||
_code_phase_out_int_over = _mm_xor_si128(_code_phase_out_int_neg, _mm_and_si128( overflow_indexes, _mm_xor_si128( _code_phase_out_int_over, _code_phase_out_int_neg )));
|
_code_phase_out_int_over = _mm_xor_si128(_code_phase_out_int_neg, _mm_and_si128(overflow_indexes, _mm_xor_si128(_code_phase_out_int_over, _code_phase_out_int_neg)));
|
||||||
|
|
||||||
_mm_store_si128((__m128i*)local_code_chip_index, _code_phase_out_int_over); // Store the results back
|
_mm_store_si128((__m128i*)local_code_chip_index, _code_phase_out_int_over); // Store the results back
|
||||||
|
|
||||||
//todo: optimize the local code lookup table with intrinsics, if possible
|
//todo: optimize the local code lookup table with intrinsics, if possible
|
||||||
*_result++ = local_code[local_code_chip_index[0]];
|
*_result++ = local_code[local_code_chip_index[0]];
|
||||||
@ -154,7 +159,7 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_a_sse2(lv_16sc_t* resul
|
|||||||
_4output_index = _mm_add_ps(_4output_index, _4constant_float);
|
_4output_index = _mm_add_ps(_4output_index, _4constant_float);
|
||||||
}
|
}
|
||||||
|
|
||||||
for(number = quarterPoints * 4; number < num_output_samples; number++)
|
for (number = quarterPoints * 4; number < num_output_samples; number++)
|
||||||
{
|
{
|
||||||
local_code_chip_index[0] = (int)(code_phase_step_chips * (float)number + rem_code_phase_chips + 0.5f);
|
local_code_chip_index[0] = (int)(code_phase_step_chips * (float)number + rem_code_phase_chips + 0.5f);
|
||||||
if (local_code_chip_index[0] < 0.0) local_code_chip_index[0] += code_length_chips - 1;
|
if (local_code_chip_index[0] < 0.0) local_code_chip_index[0] += code_length_chips - 1;
|
||||||
@ -169,61 +174,66 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_a_sse2(lv_16sc_t* resul
|
|||||||
#ifdef LV_HAVE_SSE2
|
#ifdef LV_HAVE_SSE2
|
||||||
#include <emmintrin.h>
|
#include <emmintrin.h>
|
||||||
|
|
||||||
static inline void volk_gnsssdr_16ic_resampler_fast_16ic_u_sse2(lv_16sc_t* result, const lv_16sc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, int code_length_chips, unsigned int num_output_samples)//, int* scratch_buffer, float* scratch_buffer_float)
|
static inline void volk_gnsssdr_16ic_resampler_fast_16ic_u_sse2(lv_16sc_t* result, const lv_16sc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, int code_length_chips, unsigned int num_output_samples) //, int* scratch_buffer, float* scratch_buffer_float)
|
||||||
{
|
{
|
||||||
_MM_SET_ROUNDING_MODE (_MM_ROUND_NEAREST);//_MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO
|
_MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); //_MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO
|
||||||
unsigned int number;
|
unsigned int number;
|
||||||
const unsigned int quarterPoints = num_output_samples / 4;
|
const unsigned int quarterPoints = num_output_samples / 4;
|
||||||
|
|
||||||
lv_16sc_t* _result = result;
|
lv_16sc_t* _result = result;
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4];
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
int local_code_chip_index[4];
|
||||||
__m128 _rem_code_phase, _code_phase_step_chips;
|
__m128 _rem_code_phase, _code_phase_step_chips;
|
||||||
__m128i _code_length_chips, _code_length_chips_minus1;
|
__m128i _code_length_chips, _code_length_chips_minus1;
|
||||||
__m128 _code_phase_out, _code_phase_out_with_offset;
|
__m128 _code_phase_out, _code_phase_out_with_offset;
|
||||||
rem_code_phase_chips = rem_code_phase_chips - 0.5f;
|
rem_code_phase_chips = rem_code_phase_chips - 0.5f;
|
||||||
|
|
||||||
_rem_code_phase = _mm_load1_ps(&rem_code_phase_chips); //load float to all four float values in m128 register
|
_rem_code_phase = _mm_load1_ps(&rem_code_phase_chips); //load float to all four float values in m128 register
|
||||||
_code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips); //load float to all four float values in m128 register
|
_code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips); //load float to all four float values in m128 register
|
||||||
__VOLK_ATTR_ALIGNED(16) int four_times_code_length_chips_minus1[4];
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
four_times_code_length_chips_minus1[0] = code_length_chips-1;
|
int four_times_code_length_chips_minus1[4];
|
||||||
four_times_code_length_chips_minus1[1] = code_length_chips-1;
|
four_times_code_length_chips_minus1[0] = code_length_chips - 1;
|
||||||
four_times_code_length_chips_minus1[2] = code_length_chips-1;
|
four_times_code_length_chips_minus1[1] = code_length_chips - 1;
|
||||||
four_times_code_length_chips_minus1[3] = code_length_chips-1;
|
four_times_code_length_chips_minus1[2] = code_length_chips - 1;
|
||||||
|
four_times_code_length_chips_minus1[3] = code_length_chips - 1;
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(16) int four_times_code_length_chips[4];
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
int four_times_code_length_chips[4];
|
||||||
four_times_code_length_chips[0] = code_length_chips;
|
four_times_code_length_chips[0] = code_length_chips;
|
||||||
four_times_code_length_chips[1] = code_length_chips;
|
four_times_code_length_chips[1] = code_length_chips;
|
||||||
four_times_code_length_chips[2] = code_length_chips;
|
four_times_code_length_chips[2] = code_length_chips;
|
||||||
four_times_code_length_chips[3] = code_length_chips;
|
four_times_code_length_chips[3] = code_length_chips;
|
||||||
|
|
||||||
_code_length_chips = _mm_loadu_si128((__m128i*)&four_times_code_length_chips); //load float to all four float values in m128 register
|
_code_length_chips = _mm_loadu_si128((__m128i*)&four_times_code_length_chips); //load float to all four float values in m128 register
|
||||||
_code_length_chips_minus1 = _mm_loadu_si128((__m128i*)&four_times_code_length_chips_minus1); //load float to all four float values in m128 register
|
_code_length_chips_minus1 = _mm_loadu_si128((__m128i*)&four_times_code_length_chips_minus1); //load float to all four float values in m128 register
|
||||||
|
|
||||||
__m128i negative_indexes, overflow_indexes, _code_phase_out_int, _code_phase_out_int_neg, _code_phase_out_int_over;
|
__m128i negative_indexes, overflow_indexes, _code_phase_out_int, _code_phase_out_int_neg, _code_phase_out_int_over;
|
||||||
|
|
||||||
__m128i zero = _mm_setzero_si128();
|
__m128i zero = _mm_setzero_si128();
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(16) float init_idx_float[4] = { 0.0f, 1.0f, 2.0f, 3.0f };
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
float init_idx_float[4] = {0.0f, 1.0f, 2.0f, 3.0f};
|
||||||
__m128 _4output_index = _mm_loadu_ps(init_idx_float);
|
__m128 _4output_index = _mm_loadu_ps(init_idx_float);
|
||||||
__VOLK_ATTR_ALIGNED(16) float init_4constant_float[4] = { 4.0f, 4.0f, 4.0f, 4.0f };
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
float init_4constant_float[4] = {4.0f, 4.0f, 4.0f, 4.0f};
|
||||||
__m128 _4constant_float = _mm_loadu_ps(init_4constant_float);
|
__m128 _4constant_float = _mm_loadu_ps(init_4constant_float);
|
||||||
|
|
||||||
for(number = 0; number < quarterPoints; number++)
|
for (number = 0; number < quarterPoints; number++)
|
||||||
{
|
{
|
||||||
_code_phase_out = _mm_mul_ps(_code_phase_step_chips, _4output_index); //compute the code phase point with the phase step
|
_code_phase_out = _mm_mul_ps(_code_phase_step_chips, _4output_index); //compute the code phase point with the phase step
|
||||||
_code_phase_out_with_offset = _mm_add_ps(_code_phase_out, _rem_code_phase); //add the phase offset
|
_code_phase_out_with_offset = _mm_add_ps(_code_phase_out, _rem_code_phase); //add the phase offset
|
||||||
_code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset); //convert to integer
|
_code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset); //convert to integer
|
||||||
|
|
||||||
negative_indexes = _mm_cmplt_epi32(_code_phase_out_int, zero); //test for negative values
|
negative_indexes = _mm_cmplt_epi32(_code_phase_out_int, zero); //test for negative values
|
||||||
_code_phase_out_int_neg = _mm_add_epi32(_code_phase_out_int, _code_length_chips); //the negative values branch
|
_code_phase_out_int_neg = _mm_add_epi32(_code_phase_out_int, _code_length_chips); //the negative values branch
|
||||||
_code_phase_out_int_neg = _mm_xor_si128(_code_phase_out_int, _mm_and_si128( negative_indexes, _mm_xor_si128( _code_phase_out_int_neg, _code_phase_out_int )));
|
_code_phase_out_int_neg = _mm_xor_si128(_code_phase_out_int, _mm_and_si128(negative_indexes, _mm_xor_si128(_code_phase_out_int_neg, _code_phase_out_int)));
|
||||||
|
|
||||||
overflow_indexes = _mm_cmpgt_epi32(_code_phase_out_int_neg, _code_length_chips_minus1); //test for overflow values
|
overflow_indexes = _mm_cmpgt_epi32(_code_phase_out_int_neg, _code_length_chips_minus1); //test for overflow values
|
||||||
_code_phase_out_int_over = _mm_sub_epi32(_code_phase_out_int_neg, _code_length_chips); //the negative values branch
|
_code_phase_out_int_over = _mm_sub_epi32(_code_phase_out_int_neg, _code_length_chips); //the negative values branch
|
||||||
_code_phase_out_int_over = _mm_xor_si128(_code_phase_out_int_neg, _mm_and_si128( overflow_indexes, _mm_xor_si128( _code_phase_out_int_over, _code_phase_out_int_neg )));
|
_code_phase_out_int_over = _mm_xor_si128(_code_phase_out_int_neg, _mm_and_si128(overflow_indexes, _mm_xor_si128(_code_phase_out_int_over, _code_phase_out_int_neg)));
|
||||||
|
|
||||||
_mm_storeu_si128((__m128i*)local_code_chip_index, _code_phase_out_int_over); // Store the results back
|
_mm_storeu_si128((__m128i*)local_code_chip_index, _code_phase_out_int_over); // Store the results back
|
||||||
|
|
||||||
//todo: optimize the local code lookup table with intrinsics, if possible
|
//todo: optimize the local code lookup table with intrinsics, if possible
|
||||||
*_result++ = local_code[local_code_chip_index[0]];
|
*_result++ = local_code[local_code_chip_index[0]];
|
||||||
@ -234,7 +244,7 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_u_sse2(lv_16sc_t* resul
|
|||||||
_4output_index = _mm_add_ps(_4output_index, _4constant_float);
|
_4output_index = _mm_add_ps(_4output_index, _4constant_float);
|
||||||
}
|
}
|
||||||
|
|
||||||
for(number = quarterPoints * 4; number < num_output_samples; number++)
|
for (number = quarterPoints * 4; number < num_output_samples; number++)
|
||||||
{
|
{
|
||||||
local_code_chip_index[0] = (int)(code_phase_step_chips * (float)number + rem_code_phase_chips + 0.5f);
|
local_code_chip_index[0] = (int)(code_phase_step_chips * (float)number + rem_code_phase_chips + 0.5f);
|
||||||
if (local_code_chip_index[0] < 0.0) local_code_chip_index[0] += code_length_chips - 1;
|
if (local_code_chip_index[0] < 0.0) local_code_chip_index[0] += code_length_chips - 1;
|
||||||
@ -249,7 +259,7 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_u_sse2(lv_16sc_t* resul
|
|||||||
#ifdef LV_HAVE_NEON
|
#ifdef LV_HAVE_NEON
|
||||||
#include <arm_neon.h>
|
#include <arm_neon.h>
|
||||||
|
|
||||||
static inline void volk_gnsssdr_16ic_resampler_fast_16ic_neon(lv_16sc_t* result, const lv_16sc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, int code_length_chips, unsigned int num_output_samples)//, int* scratch_buffer, float* scratch_buffer_float)
|
static inline void volk_gnsssdr_16ic_resampler_fast_16ic_neon(lv_16sc_t* result, const lv_16sc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, int code_length_chips, unsigned int num_output_samples) //, int* scratch_buffer, float* scratch_buffer_float)
|
||||||
{
|
{
|
||||||
unsigned int number;
|
unsigned int number;
|
||||||
const unsigned int quarterPoints = num_output_samples / 4;
|
const unsigned int quarterPoints = num_output_samples / 4;
|
||||||
@ -257,57 +267,62 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_neon(lv_16sc_t* result,
|
|||||||
|
|
||||||
lv_16sc_t* _result = result;
|
lv_16sc_t* _result = result;
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4];
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
int local_code_chip_index[4];
|
||||||
float32x4_t _rem_code_phase, _code_phase_step_chips;
|
float32x4_t _rem_code_phase, _code_phase_step_chips;
|
||||||
int32x4_t _code_length_chips, _code_length_chips_minus1;
|
int32x4_t _code_length_chips, _code_length_chips_minus1;
|
||||||
float32x4_t _code_phase_out, _code_phase_out_with_offset;
|
float32x4_t _code_phase_out, _code_phase_out_with_offset;
|
||||||
rem_code_phase_chips = rem_code_phase_chips - 0.5f;
|
rem_code_phase_chips = rem_code_phase_chips - 0.5f;
|
||||||
float32x4_t sign, PlusHalf, Round;
|
float32x4_t sign, PlusHalf, Round;
|
||||||
|
|
||||||
_rem_code_phase = vld1q_dup_f32(&rem_code_phase_chips); //load float to all four float values in m128 register
|
_rem_code_phase = vld1q_dup_f32(&rem_code_phase_chips); //load float to all four float values in m128 register
|
||||||
_code_phase_step_chips = vld1q_dup_f32(&code_phase_step_chips); //load float to all four float values in m128 register
|
_code_phase_step_chips = vld1q_dup_f32(&code_phase_step_chips); //load float to all four float values in m128 register
|
||||||
__VOLK_ATTR_ALIGNED(16) int four_times_code_length_chips_minus1[4];
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
int four_times_code_length_chips_minus1[4];
|
||||||
four_times_code_length_chips_minus1[0] = code_length_chips - 1;
|
four_times_code_length_chips_minus1[0] = code_length_chips - 1;
|
||||||
four_times_code_length_chips_minus1[1] = code_length_chips - 1;
|
four_times_code_length_chips_minus1[1] = code_length_chips - 1;
|
||||||
four_times_code_length_chips_minus1[2] = code_length_chips - 1;
|
four_times_code_length_chips_minus1[2] = code_length_chips - 1;
|
||||||
four_times_code_length_chips_minus1[3] = code_length_chips - 1;
|
four_times_code_length_chips_minus1[3] = code_length_chips - 1;
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(16) int four_times_code_length_chips[4];
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
int four_times_code_length_chips[4];
|
||||||
four_times_code_length_chips[0] = code_length_chips;
|
four_times_code_length_chips[0] = code_length_chips;
|
||||||
four_times_code_length_chips[1] = code_length_chips;
|
four_times_code_length_chips[1] = code_length_chips;
|
||||||
four_times_code_length_chips[2] = code_length_chips;
|
four_times_code_length_chips[2] = code_length_chips;
|
||||||
four_times_code_length_chips[3] = code_length_chips;
|
four_times_code_length_chips[3] = code_length_chips;
|
||||||
|
|
||||||
_code_length_chips = vld1q_s32((int32_t*)&four_times_code_length_chips); //load float to all four float values in m128 register
|
_code_length_chips = vld1q_s32((int32_t*)&four_times_code_length_chips); //load float to all four float values in m128 register
|
||||||
_code_length_chips_minus1 = vld1q_s32((int32_t*)&four_times_code_length_chips_minus1); //load float to all four float values in m128 register
|
_code_length_chips_minus1 = vld1q_s32((int32_t*)&four_times_code_length_chips_minus1); //load float to all four float values in m128 register
|
||||||
|
|
||||||
int32x4_t _code_phase_out_int, _code_phase_out_int_neg, _code_phase_out_int_over;
|
int32x4_t _code_phase_out_int, _code_phase_out_int_neg, _code_phase_out_int_over;
|
||||||
uint32x4_t negative_indexes, overflow_indexes;
|
uint32x4_t negative_indexes, overflow_indexes;
|
||||||
int32x4_t zero = vmovq_n_s32(0);
|
int32x4_t zero = vmovq_n_s32(0);
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(16) float init_idx_float[4] = { 0.0f, 1.0f, 2.0f, 3.0f };
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
float init_idx_float[4] = {0.0f, 1.0f, 2.0f, 3.0f};
|
||||||
float32x4_t _4output_index = vld1q_f32(init_idx_float);
|
float32x4_t _4output_index = vld1q_f32(init_idx_float);
|
||||||
__VOLK_ATTR_ALIGNED(16) float init_4constant_float[4] = { 4.0f, 4.0f, 4.0f, 4.0f };
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
float init_4constant_float[4] = {4.0f, 4.0f, 4.0f, 4.0f};
|
||||||
float32x4_t _4constant_float = vld1q_f32(init_4constant_float);
|
float32x4_t _4constant_float = vld1q_f32(init_4constant_float);
|
||||||
|
|
||||||
for(number = 0; number < quarterPoints; number++)
|
for (number = 0; number < quarterPoints; number++)
|
||||||
{
|
{
|
||||||
_code_phase_out = vmulq_f32(_code_phase_step_chips, _4output_index); //compute the code phase point with the phase step
|
_code_phase_out = vmulq_f32(_code_phase_step_chips, _4output_index); //compute the code phase point with the phase step
|
||||||
_code_phase_out_with_offset = vaddq_f32(_code_phase_out, _rem_code_phase); //add the phase offset
|
_code_phase_out_with_offset = vaddq_f32(_code_phase_out, _rem_code_phase); //add the phase offset
|
||||||
sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(_code_phase_out_with_offset), 31)));
|
sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(_code_phase_out_with_offset), 31)));
|
||||||
PlusHalf = vaddq_f32(_code_phase_out_with_offset, half);
|
PlusHalf = vaddq_f32(_code_phase_out_with_offset, half);
|
||||||
Round = vsubq_f32(PlusHalf, sign);
|
Round = vsubq_f32(PlusHalf, sign);
|
||||||
_code_phase_out_int = vcvtq_s32_f32(Round);
|
_code_phase_out_int = vcvtq_s32_f32(Round);
|
||||||
|
|
||||||
negative_indexes = vcltq_s32(_code_phase_out_int, zero); //test for negative values
|
negative_indexes = vcltq_s32(_code_phase_out_int, zero); //test for negative values
|
||||||
_code_phase_out_int_neg = vaddq_s32(_code_phase_out_int, _code_length_chips); //the negative values branch
|
_code_phase_out_int_neg = vaddq_s32(_code_phase_out_int, _code_length_chips); //the negative values branch
|
||||||
_code_phase_out_int_neg = veorq_s32(_code_phase_out_int, vandq_s32( (int32x4_t)negative_indexes, veorq_s32( _code_phase_out_int_neg, _code_phase_out_int )));
|
_code_phase_out_int_neg = veorq_s32(_code_phase_out_int, vandq_s32((int32x4_t)negative_indexes, veorq_s32(_code_phase_out_int_neg, _code_phase_out_int)));
|
||||||
|
|
||||||
overflow_indexes = vcgtq_s32(_code_phase_out_int_neg, _code_length_chips_minus1); //test for overflow values
|
overflow_indexes = vcgtq_s32(_code_phase_out_int_neg, _code_length_chips_minus1); //test for overflow values
|
||||||
_code_phase_out_int_over = vsubq_s32(_code_phase_out_int_neg, _code_length_chips); //the negative values branch
|
_code_phase_out_int_over = vsubq_s32(_code_phase_out_int_neg, _code_length_chips); //the negative values branch
|
||||||
_code_phase_out_int_over = veorq_s32(_code_phase_out_int_neg, vandq_s32( (int32x4_t)overflow_indexes, veorq_s32( _code_phase_out_int_over, _code_phase_out_int_neg )));
|
_code_phase_out_int_over = veorq_s32(_code_phase_out_int_neg, vandq_s32((int32x4_t)overflow_indexes, veorq_s32(_code_phase_out_int_over, _code_phase_out_int_neg)));
|
||||||
|
|
||||||
vst1q_s32((int32_t*)local_code_chip_index, _code_phase_out_int_over); // Store the results back
|
vst1q_s32((int32_t*)local_code_chip_index, _code_phase_out_int_over); // Store the results back
|
||||||
|
|
||||||
//todo: optimize the local code lookup table with intrinsics, if possible
|
//todo: optimize the local code lookup table with intrinsics, if possible
|
||||||
*_result++ = local_code[local_code_chip_index[0]];
|
*_result++ = local_code[local_code_chip_index[0]];
|
||||||
@ -318,7 +333,7 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_neon(lv_16sc_t* result,
|
|||||||
_4output_index = vaddq_f32(_4output_index, _4constant_float);
|
_4output_index = vaddq_f32(_4output_index, _4constant_float);
|
||||||
}
|
}
|
||||||
|
|
||||||
for(number = quarterPoints * 4; number < num_output_samples; number++)
|
for (number = quarterPoints * 4; number < num_output_samples; number++)
|
||||||
{
|
{
|
||||||
local_code_chip_index[0] = (int)(code_phase_step_chips * (float)number + rem_code_phase_chips + 0.5f);
|
local_code_chip_index[0] = (int)(code_phase_step_chips * (float)number + rem_code_phase_chips + 0.5f);
|
||||||
if (local_code_chip_index[0] < 0.0) local_code_chip_index[0] += code_length_chips - 1;
|
if (local_code_chip_index[0] < 0.0) local_code_chip_index[0] += code_length_chips - 1;
|
||||||
|
@ -44,7 +44,7 @@ static inline void volk_gnsssdr_16ic_resamplerfastpuppet_16ic_generic(lv_16sc_t*
|
|||||||
float rem_code_phase_chips = -0.123;
|
float rem_code_phase_chips = -0.123;
|
||||||
float code_phase_step_chips = 0.1;
|
float code_phase_step_chips = 0.1;
|
||||||
int code_length_chips = 1023;
|
int code_length_chips = 1023;
|
||||||
volk_gnsssdr_16ic_resampler_fast_16ic_generic(result, local_code, rem_code_phase_chips, code_phase_step_chips, code_length_chips, num_points);
|
volk_gnsssdr_16ic_resampler_fast_16ic_generic(result, local_code, rem_code_phase_chips, code_phase_step_chips, code_length_chips, num_points);
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif /* LV_HAVE_GENERIC */
|
#endif /* LV_HAVE_GENERIC */
|
||||||
@ -55,7 +55,7 @@ static inline void volk_gnsssdr_16ic_resamplerfastpuppet_16ic_a_sse2(lv_16sc_t*
|
|||||||
float rem_code_phase_chips = -0.123;
|
float rem_code_phase_chips = -0.123;
|
||||||
float code_phase_step_chips = 0.1;
|
float code_phase_step_chips = 0.1;
|
||||||
int code_length_chips = 1023;
|
int code_length_chips = 1023;
|
||||||
volk_gnsssdr_16ic_resampler_fast_16ic_a_sse2(result, local_code, rem_code_phase_chips, code_phase_step_chips, code_length_chips, num_points );
|
volk_gnsssdr_16ic_resampler_fast_16ic_a_sse2(result, local_code, rem_code_phase_chips, code_phase_step_chips, code_length_chips, num_points);
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif /* LV_HAVE_SSE2 */
|
#endif /* LV_HAVE_SSE2 */
|
||||||
@ -67,7 +67,7 @@ static inline void volk_gnsssdr_16ic_resamplerfastpuppet_16ic_u_sse2(lv_16sc_t*
|
|||||||
float rem_code_phase_chips = -0.123;
|
float rem_code_phase_chips = -0.123;
|
||||||
float code_phase_step_chips = 0.1;
|
float code_phase_step_chips = 0.1;
|
||||||
int code_length_chips = 1023;
|
int code_length_chips = 1023;
|
||||||
volk_gnsssdr_16ic_resampler_fast_16ic_u_sse2(result, local_code, rem_code_phase_chips, code_phase_step_chips, code_length_chips, num_points );
|
volk_gnsssdr_16ic_resampler_fast_16ic_u_sse2(result, local_code, rem_code_phase_chips, code_phase_step_chips, code_length_chips, num_points);
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif /* LV_HAVE_SSE2 */
|
#endif /* LV_HAVE_SSE2 */
|
||||||
@ -79,9 +79,9 @@ static inline void volk_gnsssdr_16ic_resamplerfastpuppet_16ic_neon(lv_16sc_t* re
|
|||||||
float rem_code_phase_chips = -0.123;
|
float rem_code_phase_chips = -0.123;
|
||||||
float code_phase_step_chips = 0.1;
|
float code_phase_step_chips = 0.1;
|
||||||
int code_length_chips = 1023;
|
int code_length_chips = 1023;
|
||||||
volk_gnsssdr_16ic_resampler_fast_16ic_neon(result, local_code, rem_code_phase_chips, code_phase_step_chips, code_length_chips, num_points );
|
volk_gnsssdr_16ic_resampler_fast_16ic_neon(result, local_code, rem_code_phase_chips, code_phase_step_chips, code_length_chips, num_points);
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif /* LV_HAVE_NEON */
|
#endif /* LV_HAVE_NEON */
|
||||||
|
|
||||||
#endif // INCLUDED_volk_gnsssdr_16ic_resamplerfastpuppet_16ic_H
|
#endif // INCLUDED_volk_gnsssdr_16ic_resamplerfastpuppet_16ic_H
|
||||||
|
@ -49,21 +49,21 @@ static inline void volk_gnsssdr_16ic_resamplerfastxnpuppet_16ic_generic(lv_16sc_
|
|||||||
int num_out_vectors = 3;
|
int num_out_vectors = 3;
|
||||||
unsigned int n;
|
unsigned int n;
|
||||||
float* rem_code_phase_chips = (float*)volk_gnsssdr_malloc(sizeof(float) * num_out_vectors, volk_gnsssdr_get_alignment());
|
float* rem_code_phase_chips = (float*)volk_gnsssdr_malloc(sizeof(float) * num_out_vectors, volk_gnsssdr_get_alignment());
|
||||||
lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
|
lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
|
||||||
|
|
||||||
for(n = 0; n < num_out_vectors; n++)
|
for (n = 0; n < num_out_vectors; n++)
|
||||||
{
|
{
|
||||||
rem_code_phase_chips[n] = -0.234;
|
rem_code_phase_chips[n] = -0.234;
|
||||||
result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
|
result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
|
||||||
}
|
}
|
||||||
volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_generic(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, code_length_chips, num_out_vectors, num_points);
|
volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_generic(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, code_length_chips, num_out_vectors, num_points);
|
||||||
|
|
||||||
memcpy((lv_16sc_t*)result, (lv_16sc_t*)result_aux[0], sizeof(lv_16sc_t) * num_points);
|
memcpy((lv_16sc_t*)result, (lv_16sc_t*)result_aux[0], sizeof(lv_16sc_t) * num_points);
|
||||||
volk_gnsssdr_free(rem_code_phase_chips);
|
volk_gnsssdr_free(rem_code_phase_chips);
|
||||||
for(n = 0; n < num_out_vectors; n++)
|
for (n = 0; n < num_out_vectors; n++)
|
||||||
{
|
{
|
||||||
volk_gnsssdr_free(result_aux[n]);
|
volk_gnsssdr_free(result_aux[n]);
|
||||||
}
|
}
|
||||||
volk_gnsssdr_free(result_aux);
|
volk_gnsssdr_free(result_aux);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -77,22 +77,22 @@ static inline void volk_gnsssdr_16ic_resamplerfastxnpuppet_16ic_a_sse2(lv_16sc_t
|
|||||||
int code_length_chips = 2046;
|
int code_length_chips = 2046;
|
||||||
int num_out_vectors = 3;
|
int num_out_vectors = 3;
|
||||||
unsigned int n;
|
unsigned int n;
|
||||||
float * rem_code_phase_chips = (float*)volk_gnsssdr_malloc(sizeof(float) * num_out_vectors, volk_gnsssdr_get_alignment());
|
float* rem_code_phase_chips = (float*)volk_gnsssdr_malloc(sizeof(float) * num_out_vectors, volk_gnsssdr_get_alignment());
|
||||||
lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
|
lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
|
||||||
|
|
||||||
for(n = 0; n < num_out_vectors; n++)
|
for (n = 0; n < num_out_vectors; n++)
|
||||||
{
|
{
|
||||||
rem_code_phase_chips[n] = -0.234;
|
rem_code_phase_chips[n] = -0.234;
|
||||||
result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
|
result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
|
||||||
}
|
}
|
||||||
volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_a_sse2(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, code_length_chips, num_out_vectors, num_points);
|
volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_a_sse2(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, code_length_chips, num_out_vectors, num_points);
|
||||||
|
|
||||||
memcpy(result, result_aux[0], sizeof(lv_16sc_t) * num_points);
|
memcpy(result, result_aux[0], sizeof(lv_16sc_t) * num_points);
|
||||||
volk_gnsssdr_free(rem_code_phase_chips);
|
volk_gnsssdr_free(rem_code_phase_chips);
|
||||||
for(n = 0; n < num_out_vectors; n++)
|
for (n = 0; n < num_out_vectors; n++)
|
||||||
{
|
{
|
||||||
volk_gnsssdr_free(result_aux[n]);
|
volk_gnsssdr_free(result_aux[n]);
|
||||||
}
|
}
|
||||||
volk_gnsssdr_free(result_aux);
|
volk_gnsssdr_free(result_aux);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -106,22 +106,22 @@ static inline void volk_gnsssdr_16ic_resamplerfastxnpuppet_16ic_u_sse2(lv_16sc_t
|
|||||||
int code_length_chips = 2046;
|
int code_length_chips = 2046;
|
||||||
int num_out_vectors = 3;
|
int num_out_vectors = 3;
|
||||||
unsigned int n;
|
unsigned int n;
|
||||||
float * rem_code_phase_chips = (float*)volk_gnsssdr_malloc(sizeof(float) * num_out_vectors, volk_gnsssdr_get_alignment());
|
float* rem_code_phase_chips = (float*)volk_gnsssdr_malloc(sizeof(float) * num_out_vectors, volk_gnsssdr_get_alignment());
|
||||||
lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
|
lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
|
||||||
|
|
||||||
for(n = 0; n < num_out_vectors; n++)
|
for (n = 0; n < num_out_vectors; n++)
|
||||||
{
|
{
|
||||||
rem_code_phase_chips[n] = -0.234;
|
rem_code_phase_chips[n] = -0.234;
|
||||||
result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
|
result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
|
||||||
}
|
}
|
||||||
volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_u_sse2(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, code_length_chips, num_out_vectors, num_points);
|
volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_u_sse2(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, code_length_chips, num_out_vectors, num_points);
|
||||||
|
|
||||||
memcpy(result, result_aux[0], sizeof(lv_16sc_t) * num_points);
|
memcpy(result, result_aux[0], sizeof(lv_16sc_t) * num_points);
|
||||||
volk_gnsssdr_free(rem_code_phase_chips);
|
volk_gnsssdr_free(rem_code_phase_chips);
|
||||||
for(n = 0; n < num_out_vectors; n++)
|
for (n = 0; n < num_out_vectors; n++)
|
||||||
{
|
{
|
||||||
volk_gnsssdr_free(result_aux[n]);
|
volk_gnsssdr_free(result_aux[n]);
|
||||||
}
|
}
|
||||||
volk_gnsssdr_free(result_aux);
|
volk_gnsssdr_free(result_aux);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -135,26 +135,26 @@ static inline void volk_gnsssdr_16ic_resamplerfastxnpuppet_16ic_neon(lv_16sc_t*
|
|||||||
int code_length_chips = 2046;
|
int code_length_chips = 2046;
|
||||||
int num_out_vectors = 3;
|
int num_out_vectors = 3;
|
||||||
unsigned int n;
|
unsigned int n;
|
||||||
float * rem_code_phase_chips = (float*)volk_gnsssdr_malloc(sizeof(float) * num_out_vectors, volk_gnsssdr_get_alignment());
|
float* rem_code_phase_chips = (float*)volk_gnsssdr_malloc(sizeof(float) * num_out_vectors, volk_gnsssdr_get_alignment());
|
||||||
lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
|
lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
|
||||||
|
|
||||||
for(n = 0; n < num_out_vectors; n++)
|
for (n = 0; n < num_out_vectors; n++)
|
||||||
{
|
{
|
||||||
rem_code_phase_chips[n] = -0.234;
|
rem_code_phase_chips[n] = -0.234;
|
||||||
result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
|
result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
|
||||||
}
|
}
|
||||||
volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_neon(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, code_length_chips, num_out_vectors, num_points);
|
volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_neon(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, code_length_chips, num_out_vectors, num_points);
|
||||||
|
|
||||||
memcpy(result, result_aux[0], sizeof(lv_16sc_t) * num_points);
|
memcpy(result, result_aux[0], sizeof(lv_16sc_t) * num_points);
|
||||||
volk_gnsssdr_free(rem_code_phase_chips);
|
volk_gnsssdr_free(rem_code_phase_chips);
|
||||||
for(n = 0; n < num_out_vectors; n++)
|
for (n = 0; n < num_out_vectors; n++)
|
||||||
{
|
{
|
||||||
volk_gnsssdr_free(result_aux[n]);
|
volk_gnsssdr_free(result_aux[n]);
|
||||||
}
|
}
|
||||||
volk_gnsssdr_free(result_aux);
|
volk_gnsssdr_free(result_aux);
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
#endif // INCLUDED_volk_gnsssdr_16ic_resamplerpuppet_16ic_H
|
#endif // INCLUDED_volk_gnsssdr_16ic_resamplerpuppet_16ic_H
|
||||||
|
@ -45,56 +45,56 @@
|
|||||||
static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_generic(lv_16sc_t* result, const lv_16sc_t* local_code, unsigned int num_points)
|
static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_generic(lv_16sc_t* result, const lv_16sc_t* local_code, unsigned int num_points)
|
||||||
{
|
{
|
||||||
int code_length_chips = 2046;
|
int code_length_chips = 2046;
|
||||||
float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points );
|
float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
|
||||||
int num_out_vectors = 3;
|
int num_out_vectors = 3;
|
||||||
unsigned int n;
|
unsigned int n;
|
||||||
float rem_code_phase_chips = -0.234;
|
float rem_code_phase_chips = -0.234;
|
||||||
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
|
float shifts_chips[3] = {-0.1, 0.0, 0.1};
|
||||||
lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
|
lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
|
||||||
|
|
||||||
for(n = 0; n < num_out_vectors; n++)
|
for (n = 0; n < num_out_vectors; n++)
|
||||||
{
|
{
|
||||||
result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
|
result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
|
||||||
}
|
}
|
||||||
|
|
||||||
volk_gnsssdr_16ic_xn_resampler_16ic_xn_generic(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
|
volk_gnsssdr_16ic_xn_resampler_16ic_xn_generic(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
|
||||||
|
|
||||||
memcpy((lv_16sc_t*)result, (lv_16sc_t*)result_aux[0], sizeof(lv_16sc_t) * num_points);
|
memcpy((lv_16sc_t*)result, (lv_16sc_t*)result_aux[0], sizeof(lv_16sc_t) * num_points);
|
||||||
|
|
||||||
for(n = 0; n < num_out_vectors; n++)
|
for (n = 0; n < num_out_vectors; n++)
|
||||||
{
|
{
|
||||||
volk_gnsssdr_free(result_aux[n]);
|
volk_gnsssdr_free(result_aux[n]);
|
||||||
}
|
}
|
||||||
volk_gnsssdr_free(result_aux);
|
volk_gnsssdr_free(result_aux);
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif /* LV_HAVE_GENERIC */
|
#endif /* LV_HAVE_GENERIC */
|
||||||
|
|
||||||
|
|
||||||
#ifdef LV_HAVE_SSE3
|
#ifdef LV_HAVE_SSE3
|
||||||
static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_a_sse3(lv_16sc_t* result, const lv_16sc_t* local_code, unsigned int num_points)
|
static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_a_sse3(lv_16sc_t* result, const lv_16sc_t* local_code, unsigned int num_points)
|
||||||
{
|
{
|
||||||
int code_length_chips = 2046;
|
int code_length_chips = 2046;
|
||||||
float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points );
|
float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
|
||||||
int num_out_vectors = 3;
|
int num_out_vectors = 3;
|
||||||
float rem_code_phase_chips = -0.234;
|
float rem_code_phase_chips = -0.234;
|
||||||
unsigned int n;
|
unsigned int n;
|
||||||
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
|
float shifts_chips[3] = {-0.1, 0.0, 0.1};
|
||||||
lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
|
lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
|
||||||
|
|
||||||
for(n = 0; n < num_out_vectors; n++)
|
for (n = 0; n < num_out_vectors; n++)
|
||||||
{
|
{
|
||||||
result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
|
result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
|
||||||
}
|
}
|
||||||
|
|
||||||
volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_sse3(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
|
volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_sse3(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
|
||||||
|
|
||||||
memcpy((lv_16sc_t*)result, (lv_16sc_t*)result_aux[0], sizeof(lv_16sc_t) * num_points);
|
memcpy((lv_16sc_t*)result, (lv_16sc_t*)result_aux[0], sizeof(lv_16sc_t) * num_points);
|
||||||
|
|
||||||
for(n = 0; n < num_out_vectors; n++)
|
for (n = 0; n < num_out_vectors; n++)
|
||||||
{
|
{
|
||||||
volk_gnsssdr_free(result_aux[n]);
|
volk_gnsssdr_free(result_aux[n]);
|
||||||
}
|
}
|
||||||
volk_gnsssdr_free(result_aux);
|
volk_gnsssdr_free(result_aux);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -104,26 +104,26 @@ static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_a_sse3(lv_16sc_t* re
|
|||||||
static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_u_sse3(lv_16sc_t* result, const lv_16sc_t* local_code, unsigned int num_points)
|
static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_u_sse3(lv_16sc_t* result, const lv_16sc_t* local_code, unsigned int num_points)
|
||||||
{
|
{
|
||||||
int code_length_chips = 2046;
|
int code_length_chips = 2046;
|
||||||
float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points );
|
float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
|
||||||
int num_out_vectors = 3;
|
int num_out_vectors = 3;
|
||||||
float rem_code_phase_chips = -0.234;
|
float rem_code_phase_chips = -0.234;
|
||||||
unsigned int n;
|
unsigned int n;
|
||||||
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
|
float shifts_chips[3] = {-0.1, 0.0, 0.1};
|
||||||
lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
|
lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
|
||||||
|
|
||||||
for(n = 0; n < num_out_vectors; n++)
|
for (n = 0; n < num_out_vectors; n++)
|
||||||
{
|
{
|
||||||
result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
|
result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
|
||||||
}
|
}
|
||||||
|
|
||||||
volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_sse3(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
|
volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_sse3(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
|
||||||
|
|
||||||
memcpy((lv_16sc_t*)result, (lv_16sc_t*)result_aux[0], sizeof(lv_16sc_t) * num_points);
|
memcpy((lv_16sc_t*)result, (lv_16sc_t*)result_aux[0], sizeof(lv_16sc_t) * num_points);
|
||||||
|
|
||||||
for(n = 0; n < num_out_vectors; n++)
|
for (n = 0; n < num_out_vectors; n++)
|
||||||
{
|
{
|
||||||
volk_gnsssdr_free(result_aux[n]);
|
volk_gnsssdr_free(result_aux[n]);
|
||||||
}
|
}
|
||||||
volk_gnsssdr_free(result_aux);
|
volk_gnsssdr_free(result_aux);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -134,26 +134,26 @@ static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_u_sse3(lv_16sc_t* re
|
|||||||
static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_u_sse4_1(lv_16sc_t* result, const lv_16sc_t* local_code, unsigned int num_points)
|
static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_u_sse4_1(lv_16sc_t* result, const lv_16sc_t* local_code, unsigned int num_points)
|
||||||
{
|
{
|
||||||
int code_length_chips = 2046;
|
int code_length_chips = 2046;
|
||||||
float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points );
|
float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
|
||||||
int num_out_vectors = 3;
|
int num_out_vectors = 3;
|
||||||
float rem_code_phase_chips = -0.234;
|
float rem_code_phase_chips = -0.234;
|
||||||
unsigned int n;
|
unsigned int n;
|
||||||
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
|
float shifts_chips[3] = {-0.1, 0.0, 0.1};
|
||||||
lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
|
lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
|
||||||
|
|
||||||
for(n = 0; n < num_out_vectors; n++)
|
for (n = 0; n < num_out_vectors; n++)
|
||||||
{
|
{
|
||||||
result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
|
result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
|
||||||
}
|
}
|
||||||
|
|
||||||
volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_sse4_1(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
|
volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_sse4_1(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
|
||||||
|
|
||||||
memcpy((lv_16sc_t*)result, (lv_16sc_t*)result_aux[0], sizeof(lv_16sc_t) * num_points);
|
memcpy((lv_16sc_t*)result, (lv_16sc_t*)result_aux[0], sizeof(lv_16sc_t) * num_points);
|
||||||
|
|
||||||
for(n = 0; n < num_out_vectors; n++)
|
for (n = 0; n < num_out_vectors; n++)
|
||||||
{
|
{
|
||||||
volk_gnsssdr_free(result_aux[n]);
|
volk_gnsssdr_free(result_aux[n]);
|
||||||
}
|
}
|
||||||
volk_gnsssdr_free(result_aux);
|
volk_gnsssdr_free(result_aux);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -164,26 +164,26 @@ static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_u_sse4_1(lv_16sc_t*
|
|||||||
static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_a_sse4_1(lv_16sc_t* result, const lv_16sc_t* local_code, unsigned int num_points)
|
static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_a_sse4_1(lv_16sc_t* result, const lv_16sc_t* local_code, unsigned int num_points)
|
||||||
{
|
{
|
||||||
int code_length_chips = 2046;
|
int code_length_chips = 2046;
|
||||||
float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points );
|
float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
|
||||||
int num_out_vectors = 3;
|
int num_out_vectors = 3;
|
||||||
float rem_code_phase_chips = -0.234;
|
float rem_code_phase_chips = -0.234;
|
||||||
unsigned int n;
|
unsigned int n;
|
||||||
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
|
float shifts_chips[3] = {-0.1, 0.0, 0.1};
|
||||||
lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
|
lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
|
||||||
|
|
||||||
for(n = 0; n < num_out_vectors; n++)
|
for (n = 0; n < num_out_vectors; n++)
|
||||||
{
|
{
|
||||||
result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
|
result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
|
||||||
}
|
}
|
||||||
|
|
||||||
volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_sse4_1(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
|
volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_sse4_1(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
|
||||||
|
|
||||||
memcpy((lv_16sc_t*)result, (lv_16sc_t*)result_aux[0], sizeof(lv_16sc_t) * num_points);
|
memcpy((lv_16sc_t*)result, (lv_16sc_t*)result_aux[0], sizeof(lv_16sc_t) * num_points);
|
||||||
|
|
||||||
for(n = 0; n < num_out_vectors; n++)
|
for (n = 0; n < num_out_vectors; n++)
|
||||||
{
|
{
|
||||||
volk_gnsssdr_free(result_aux[n]);
|
volk_gnsssdr_free(result_aux[n]);
|
||||||
}
|
}
|
||||||
volk_gnsssdr_free(result_aux);
|
volk_gnsssdr_free(result_aux);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -194,26 +194,26 @@ static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_a_sse4_1(lv_16sc_t*
|
|||||||
static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_u_avx(lv_16sc_t* result, const lv_16sc_t* local_code, unsigned int num_points)
|
static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_u_avx(lv_16sc_t* result, const lv_16sc_t* local_code, unsigned int num_points)
|
||||||
{
|
{
|
||||||
int code_length_chips = 2046;
|
int code_length_chips = 2046;
|
||||||
float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points );
|
float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
|
||||||
int num_out_vectors = 3;
|
int num_out_vectors = 3;
|
||||||
float rem_code_phase_chips = -0.234;
|
float rem_code_phase_chips = -0.234;
|
||||||
unsigned int n;
|
unsigned int n;
|
||||||
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
|
float shifts_chips[3] = {-0.1, 0.0, 0.1};
|
||||||
lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
|
lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
|
||||||
|
|
||||||
for(n = 0; n < num_out_vectors; n++)
|
for (n = 0; n < num_out_vectors; n++)
|
||||||
{
|
{
|
||||||
result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
|
result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
|
||||||
}
|
}
|
||||||
|
|
||||||
volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_avx(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
|
volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_avx(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
|
||||||
|
|
||||||
memcpy((lv_16sc_t*)result, (lv_16sc_t*)result_aux[0], sizeof(lv_16sc_t) * num_points);
|
memcpy((lv_16sc_t*)result, (lv_16sc_t*)result_aux[0], sizeof(lv_16sc_t) * num_points);
|
||||||
|
|
||||||
for(n = 0; n < num_out_vectors; n++)
|
for (n = 0; n < num_out_vectors; n++)
|
||||||
{
|
{
|
||||||
volk_gnsssdr_free(result_aux[n]);
|
volk_gnsssdr_free(result_aux[n]);
|
||||||
}
|
}
|
||||||
volk_gnsssdr_free(result_aux);
|
volk_gnsssdr_free(result_aux);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -224,26 +224,26 @@ static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_u_avx(lv_16sc_t* res
|
|||||||
static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_a_avx(lv_16sc_t* result, const lv_16sc_t* local_code, unsigned int num_points)
|
static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_a_avx(lv_16sc_t* result, const lv_16sc_t* local_code, unsigned int num_points)
|
||||||
{
|
{
|
||||||
int code_length_chips = 2046;
|
int code_length_chips = 2046;
|
||||||
float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points );
|
float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
|
||||||
int num_out_vectors = 3;
|
int num_out_vectors = 3;
|
||||||
float rem_code_phase_chips = -0.234;
|
float rem_code_phase_chips = -0.234;
|
||||||
unsigned int n;
|
unsigned int n;
|
||||||
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
|
float shifts_chips[3] = {-0.1, 0.0, 0.1};
|
||||||
lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
|
lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
|
||||||
|
|
||||||
for(n = 0; n < num_out_vectors; n++)
|
for (n = 0; n < num_out_vectors; n++)
|
||||||
{
|
{
|
||||||
result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
|
result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
|
||||||
}
|
}
|
||||||
|
|
||||||
volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_avx(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
|
volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_avx(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
|
||||||
|
|
||||||
memcpy((lv_16sc_t*)result, (lv_16sc_t*)result_aux[0], sizeof(lv_16sc_t) * num_points);
|
memcpy((lv_16sc_t*)result, (lv_16sc_t*)result_aux[0], sizeof(lv_16sc_t) * num_points);
|
||||||
|
|
||||||
for(n = 0; n < num_out_vectors; n++)
|
for (n = 0; n < num_out_vectors; n++)
|
||||||
{
|
{
|
||||||
volk_gnsssdr_free(result_aux[n]);
|
volk_gnsssdr_free(result_aux[n]);
|
||||||
}
|
}
|
||||||
volk_gnsssdr_free(result_aux);
|
volk_gnsssdr_free(result_aux);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -254,29 +254,29 @@ static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_a_avx(lv_16sc_t* res
|
|||||||
static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_neon(lv_16sc_t* result, const lv_16sc_t* local_code, unsigned int num_points)
|
static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_neon(lv_16sc_t* result, const lv_16sc_t* local_code, unsigned int num_points)
|
||||||
{
|
{
|
||||||
int code_length_chips = 2046;
|
int code_length_chips = 2046;
|
||||||
float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points );
|
float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
|
||||||
int num_out_vectors = 3;
|
int num_out_vectors = 3;
|
||||||
float rem_code_phase_chips = -0.234;
|
float rem_code_phase_chips = -0.234;
|
||||||
unsigned int n;
|
unsigned int n;
|
||||||
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
|
float shifts_chips[3] = {-0.1, 0.0, 0.1};
|
||||||
lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
|
lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
|
||||||
|
|
||||||
for(n = 0; n < num_out_vectors; n++)
|
for (n = 0; n < num_out_vectors; n++)
|
||||||
{
|
{
|
||||||
result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
|
result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
|
||||||
}
|
}
|
||||||
|
|
||||||
volk_gnsssdr_16ic_xn_resampler_16ic_xn_neon(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
|
volk_gnsssdr_16ic_xn_resampler_16ic_xn_neon(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
|
||||||
|
|
||||||
memcpy((lv_16sc_t*)result, (lv_16sc_t*)result_aux[0], sizeof(lv_16sc_t) * num_points);
|
memcpy((lv_16sc_t*)result, (lv_16sc_t*)result_aux[0], sizeof(lv_16sc_t) * num_points);
|
||||||
|
|
||||||
for(n = 0; n < num_out_vectors; n++)
|
for (n = 0; n < num_out_vectors; n++)
|
||||||
{
|
{
|
||||||
volk_gnsssdr_free(result_aux[n]);
|
volk_gnsssdr_free(result_aux[n]);
|
||||||
}
|
}
|
||||||
volk_gnsssdr_free(result_aux);
|
volk_gnsssdr_free(result_aux);
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#endif // INCLUDED_volk_gnsssdr_16ic_resamplerpuppet_16ic_H
|
#endif // INCLUDED_volk_gnsssdr_16ic_resamplerpuppet_16ic_H
|
||||||
|
@ -70,7 +70,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_generic(lv_16sc_t* ou
|
|||||||
unsigned int i = 0;
|
unsigned int i = 0;
|
||||||
lv_16sc_t tmp16;
|
lv_16sc_t tmp16;
|
||||||
lv_32fc_t tmp32;
|
lv_32fc_t tmp32;
|
||||||
for(i = 0; i < (unsigned int)(num_points); ++i)
|
for (i = 0; i < (unsigned int)(num_points); ++i)
|
||||||
{
|
{
|
||||||
tmp16 = *inVector++;
|
tmp16 = *inVector++;
|
||||||
tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase);
|
tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase);
|
||||||
@ -111,8 +111,8 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_generic_reload(lv_16s
|
|||||||
*outVector++ = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32)));
|
*outVector++ = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32)));
|
||||||
(*phase) *= phase_inc;
|
(*phase) *= phase_inc;
|
||||||
}
|
}
|
||||||
// Regenerate phase
|
// Regenerate phase
|
||||||
//printf("Phase before regeneration %i: %f,%f Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase));
|
//printf("Phase before regeneration %i: %f,%f Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase));
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
(*phase) /= std::abs((*phase));
|
(*phase) /= std::abs((*phase));
|
||||||
#else
|
#else
|
||||||
@ -141,11 +141,13 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3(lv_16sc_t* out
|
|||||||
unsigned int number;
|
unsigned int number;
|
||||||
__m128 a, b, two_phase_acc_reg, two_phase_inc_reg;
|
__m128 a, b, two_phase_acc_reg, two_phase_inc_reg;
|
||||||
__m128i c1, c2, result;
|
__m128i c1, c2, result;
|
||||||
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_inc[2];
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
lv_32fc_t two_phase_inc[2];
|
||||||
two_phase_inc[0] = phase_inc * phase_inc;
|
two_phase_inc[0] = phase_inc * phase_inc;
|
||||||
two_phase_inc[1] = phase_inc * phase_inc;
|
two_phase_inc[1] = phase_inc * phase_inc;
|
||||||
two_phase_inc_reg = _mm_load_ps((float*) two_phase_inc);
|
two_phase_inc_reg = _mm_load_ps((float*)two_phase_inc);
|
||||||
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_acc[2];
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
lv_32fc_t two_phase_acc[2];
|
||||||
two_phase_acc[0] = (*phase);
|
two_phase_acc[0] = (*phase);
|
||||||
two_phase_acc[1] = (*phase) * phase_inc;
|
two_phase_acc[1] = (*phase) * phase_inc;
|
||||||
two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc);
|
two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc);
|
||||||
@ -157,49 +159,49 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3(lv_16sc_t* out
|
|||||||
lv_16sc_t tmp16;
|
lv_16sc_t tmp16;
|
||||||
lv_32fc_t tmp32;
|
lv_32fc_t tmp32;
|
||||||
|
|
||||||
for(number = 0; number < sse_iters; number++)
|
for (number = 0; number < sse_iters; number++)
|
||||||
{
|
{
|
||||||
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
|
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
|
||||||
//complex 32fc multiplication b=a*two_phase_acc_reg
|
//complex 32fc multiplication b=a*two_phase_acc_reg
|
||||||
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
||||||
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
||||||
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
||||||
a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br
|
a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br
|
||||||
tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
|
tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
|
||||||
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
||||||
c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
|
c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
|
||||||
|
|
||||||
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
|
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
|
||||||
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
||||||
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
||||||
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
||||||
tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
|
tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
|
||||||
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
|
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
|
||||||
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
||||||
|
|
||||||
//next two samples
|
//next two samples
|
||||||
_in += 2;
|
_in += 2;
|
||||||
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
|
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
|
||||||
__VOLK_GNSSSDR_PREFETCH(_in + 8);
|
__VOLK_GNSSSDR_PREFETCH(_in + 8);
|
||||||
//complex 32fc multiplication b=a*two_phase_acc_reg
|
//complex 32fc multiplication b=a*two_phase_acc_reg
|
||||||
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
||||||
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
||||||
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
||||||
a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br
|
a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br
|
||||||
tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
|
tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
|
||||||
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
||||||
c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
|
c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
|
||||||
|
|
||||||
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
|
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
|
||||||
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
||||||
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
||||||
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
||||||
tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
|
tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
|
||||||
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
|
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
|
||||||
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
||||||
|
|
||||||
// store four output samples
|
// store four output samples
|
||||||
result = _mm_packs_epi32(c1, c2);// convert from 32ic to 16ic
|
result = _mm_packs_epi32(c1, c2); // convert from 32ic to 16ic
|
||||||
_mm_store_si128((__m128i*)_out, result);
|
_mm_store_si128((__m128i*)_out, result);
|
||||||
|
|
||||||
// Regenerate phase
|
// Regenerate phase
|
||||||
@ -232,7 +234,6 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3(lv_16sc_t* out
|
|||||||
#endif /* LV_HAVE_SSE3 */
|
#endif /* LV_HAVE_SSE3 */
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#ifdef LV_HAVE_SSE3
|
#ifdef LV_HAVE_SSE3
|
||||||
#include <pmmintrin.h>
|
#include <pmmintrin.h>
|
||||||
|
|
||||||
@ -244,11 +245,13 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3_reload(lv_16sc
|
|||||||
unsigned int j;
|
unsigned int j;
|
||||||
__m128 a, b, two_phase_acc_reg, two_phase_inc_reg;
|
__m128 a, b, two_phase_acc_reg, two_phase_inc_reg;
|
||||||
__m128i c1, c2, result;
|
__m128i c1, c2, result;
|
||||||
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_inc[2];
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
lv_32fc_t two_phase_inc[2];
|
||||||
two_phase_inc[0] = phase_inc * phase_inc;
|
two_phase_inc[0] = phase_inc * phase_inc;
|
||||||
two_phase_inc[1] = phase_inc * phase_inc;
|
two_phase_inc[1] = phase_inc * phase_inc;
|
||||||
two_phase_inc_reg = _mm_load_ps((float*) two_phase_inc);
|
two_phase_inc_reg = _mm_load_ps((float*)two_phase_inc);
|
||||||
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_acc[2];
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
lv_32fc_t two_phase_acc[2];
|
||||||
two_phase_acc[0] = (*phase);
|
two_phase_acc[0] = (*phase);
|
||||||
two_phase_acc[1] = (*phase) * phase_inc;
|
two_phase_acc[1] = (*phase) * phase_inc;
|
||||||
two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc);
|
two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc);
|
||||||
@ -265,47 +268,47 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3_reload(lv_16sc
|
|||||||
{
|
{
|
||||||
for (j = 0; j < ROTATOR_RELOAD; j++)
|
for (j = 0; j < ROTATOR_RELOAD; j++)
|
||||||
{
|
{
|
||||||
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
|
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
|
||||||
//complex 32fc multiplication b=a*two_phase_acc_reg
|
//complex 32fc multiplication b=a*two_phase_acc_reg
|
||||||
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
||||||
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
||||||
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
||||||
a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br
|
a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br
|
||||||
tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
|
tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
|
||||||
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
||||||
c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
|
c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
|
||||||
|
|
||||||
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
|
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
|
||||||
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
||||||
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
||||||
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
||||||
tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
|
tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
|
||||||
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
|
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
|
||||||
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
||||||
|
|
||||||
//next two samples
|
//next two samples
|
||||||
_in += 2;
|
_in += 2;
|
||||||
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
|
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
|
||||||
__VOLK_GNSSSDR_PREFETCH(_in + 8);
|
__VOLK_GNSSSDR_PREFETCH(_in + 8);
|
||||||
//complex 32fc multiplication b=a*two_phase_acc_reg
|
//complex 32fc multiplication b=a*two_phase_acc_reg
|
||||||
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
||||||
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
||||||
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
||||||
a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br
|
a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br
|
||||||
tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
|
tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
|
||||||
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
||||||
c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
|
c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
|
||||||
|
|
||||||
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
|
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
|
||||||
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
||||||
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
||||||
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
||||||
tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
|
tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
|
||||||
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
|
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
|
||||||
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
||||||
|
|
||||||
// store four output samples
|
// store four output samples
|
||||||
result = _mm_packs_epi32(c1, c2);// convert from 32ic to 16ic
|
result = _mm_packs_epi32(c1, c2); // convert from 32ic to 16ic
|
||||||
_mm_store_si128((__m128i*)_out, result);
|
_mm_store_si128((__m128i*)_out, result);
|
||||||
|
|
||||||
//next two samples
|
//next two samples
|
||||||
@ -322,47 +325,47 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3_reload(lv_16sc
|
|||||||
|
|
||||||
for (j = 0; j < sse_iters % ROTATOR_RELOAD; j++)
|
for (j = 0; j < sse_iters % ROTATOR_RELOAD; j++)
|
||||||
{
|
{
|
||||||
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
|
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
|
||||||
//complex 32fc multiplication b=a*two_phase_acc_reg
|
//complex 32fc multiplication b=a*two_phase_acc_reg
|
||||||
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
||||||
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
||||||
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
||||||
a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br
|
a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br
|
||||||
tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
|
tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
|
||||||
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
||||||
c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
|
c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
|
||||||
|
|
||||||
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
|
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
|
||||||
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
||||||
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
||||||
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
||||||
tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
|
tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
|
||||||
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
|
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
|
||||||
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
||||||
|
|
||||||
//next two samples
|
//next two samples
|
||||||
_in += 2;
|
_in += 2;
|
||||||
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
|
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
|
||||||
__VOLK_GNSSSDR_PREFETCH(_in + 8);
|
__VOLK_GNSSSDR_PREFETCH(_in + 8);
|
||||||
//complex 32fc multiplication b=a*two_phase_acc_reg
|
//complex 32fc multiplication b=a*two_phase_acc_reg
|
||||||
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
||||||
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
||||||
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
||||||
a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br
|
a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br
|
||||||
tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
|
tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
|
||||||
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
||||||
c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
|
c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
|
||||||
|
|
||||||
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
|
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
|
||||||
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
||||||
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
||||||
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
||||||
tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
|
tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
|
||||||
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
|
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
|
||||||
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
||||||
|
|
||||||
// store four output samples
|
// store four output samples
|
||||||
result = _mm_packs_epi32(c1, c2);// convert from 32ic to 16ic
|
result = _mm_packs_epi32(c1, c2); // convert from 32ic to 16ic
|
||||||
_mm_store_si128((__m128i*)_out, result);
|
_mm_store_si128((__m128i*)_out, result);
|
||||||
|
|
||||||
//next two samples
|
//next two samples
|
||||||
@ -385,7 +388,6 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3_reload(lv_16sc
|
|||||||
#endif /* LV_HAVE_SSE3 */
|
#endif /* LV_HAVE_SSE3 */
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#ifdef LV_HAVE_SSE3
|
#ifdef LV_HAVE_SSE3
|
||||||
#include <pmmintrin.h>
|
#include <pmmintrin.h>
|
||||||
|
|
||||||
@ -395,14 +397,16 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3(lv_16sc_t* out
|
|||||||
unsigned int number;
|
unsigned int number;
|
||||||
__m128 a, b, two_phase_acc_reg, two_phase_inc_reg;
|
__m128 a, b, two_phase_acc_reg, two_phase_inc_reg;
|
||||||
__m128i c1, c2, result;
|
__m128i c1, c2, result;
|
||||||
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_inc[2];
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
lv_32fc_t two_phase_inc[2];
|
||||||
two_phase_inc[0] = phase_inc * phase_inc;
|
two_phase_inc[0] = phase_inc * phase_inc;
|
||||||
two_phase_inc[1] = phase_inc * phase_inc;
|
two_phase_inc[1] = phase_inc * phase_inc;
|
||||||
two_phase_inc_reg = _mm_load_ps((float*) two_phase_inc);
|
two_phase_inc_reg = _mm_load_ps((float*)two_phase_inc);
|
||||||
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_acc[2];
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
lv_32fc_t two_phase_acc[2];
|
||||||
two_phase_acc[0] = (*phase);
|
two_phase_acc[0] = (*phase);
|
||||||
two_phase_acc[1] = (*phase) * phase_inc;
|
two_phase_acc[1] = (*phase) * phase_inc;
|
||||||
two_phase_acc_reg = _mm_load_ps((float*) two_phase_acc);
|
two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc);
|
||||||
|
|
||||||
const lv_16sc_t* _in = inVector;
|
const lv_16sc_t* _in = inVector;
|
||||||
|
|
||||||
@ -412,49 +416,49 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3(lv_16sc_t* out
|
|||||||
lv_16sc_t tmp16;
|
lv_16sc_t tmp16;
|
||||||
lv_32fc_t tmp32;
|
lv_32fc_t tmp32;
|
||||||
|
|
||||||
for(number = 0; number < sse_iters; number++)
|
for (number = 0; number < sse_iters; number++)
|
||||||
{
|
{
|
||||||
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
|
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
|
||||||
//complex 32fc multiplication b=a*two_phase_acc_reg
|
//complex 32fc multiplication b=a*two_phase_acc_reg
|
||||||
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
||||||
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
||||||
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
||||||
a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br
|
a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br
|
||||||
tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
|
tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
|
||||||
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
||||||
c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
|
c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
|
||||||
|
|
||||||
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
|
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
|
||||||
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
||||||
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
||||||
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
||||||
tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
|
tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
|
||||||
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
|
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
|
||||||
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
||||||
|
|
||||||
//next two samples
|
//next two samples
|
||||||
_in += 2;
|
_in += 2;
|
||||||
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
|
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
|
||||||
__VOLK_GNSSSDR_PREFETCH(_in + 8);
|
__VOLK_GNSSSDR_PREFETCH(_in + 8);
|
||||||
//complex 32fc multiplication b=a*two_phase_acc_reg
|
//complex 32fc multiplication b=a*two_phase_acc_reg
|
||||||
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
||||||
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
||||||
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
||||||
a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br
|
a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br
|
||||||
tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
|
tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
|
||||||
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
||||||
c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
|
c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
|
||||||
|
|
||||||
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
|
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
|
||||||
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
||||||
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
||||||
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
||||||
tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
|
tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
|
||||||
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
|
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
|
||||||
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
||||||
|
|
||||||
// store four output samples
|
// store four output samples
|
||||||
result = _mm_packs_epi32(c1, c2);// convert from 32ic to 16ic
|
result = _mm_packs_epi32(c1, c2); // convert from 32ic to 16ic
|
||||||
_mm_storeu_si128((__m128i*)_out, result);
|
_mm_storeu_si128((__m128i*)_out, result);
|
||||||
|
|
||||||
// Regenerate phase
|
// Regenerate phase
|
||||||
@ -493,147 +497,149 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3(lv_16sc_t* out
|
|||||||
static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3_reload(lv_16sc_t* outVector, const lv_16sc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points)
|
static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3_reload(lv_16sc_t* outVector, const lv_16sc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points)
|
||||||
{
|
{
|
||||||
const unsigned int sse_iters = num_points / 4;
|
const unsigned int sse_iters = num_points / 4;
|
||||||
unsigned int ROTATOR_RELOAD = 512;
|
unsigned int ROTATOR_RELOAD = 512;
|
||||||
unsigned int n;
|
unsigned int n;
|
||||||
unsigned int j;
|
unsigned int j;
|
||||||
__m128 a, b, two_phase_acc_reg, two_phase_inc_reg;
|
__m128 a, b, two_phase_acc_reg, two_phase_inc_reg;
|
||||||
__m128i c1, c2, result;
|
__m128i c1, c2, result;
|
||||||
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_inc[2];
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
two_phase_inc[0] = phase_inc * phase_inc;
|
lv_32fc_t two_phase_inc[2];
|
||||||
two_phase_inc[1] = phase_inc * phase_inc;
|
two_phase_inc[0] = phase_inc * phase_inc;
|
||||||
two_phase_inc_reg = _mm_load_ps((float*) two_phase_inc);
|
two_phase_inc[1] = phase_inc * phase_inc;
|
||||||
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_acc[2];
|
two_phase_inc_reg = _mm_load_ps((float*)two_phase_inc);
|
||||||
two_phase_acc[0] = (*phase);
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
two_phase_acc[1] = (*phase) * phase_inc;
|
lv_32fc_t two_phase_acc[2];
|
||||||
two_phase_acc_reg = _mm_load_ps((float*) two_phase_acc);
|
two_phase_acc[0] = (*phase);
|
||||||
|
two_phase_acc[1] = (*phase) * phase_inc;
|
||||||
|
two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc);
|
||||||
|
|
||||||
const lv_16sc_t* _in = inVector;
|
const lv_16sc_t* _in = inVector;
|
||||||
|
|
||||||
lv_16sc_t* _out = outVector;
|
lv_16sc_t* _out = outVector;
|
||||||
|
|
||||||
__m128 yl, yh, tmp1, tmp2, tmp3;
|
__m128 yl, yh, tmp1, tmp2, tmp3;
|
||||||
lv_16sc_t tmp16;
|
lv_16sc_t tmp16;
|
||||||
lv_32fc_t tmp32;
|
lv_32fc_t tmp32;
|
||||||
|
|
||||||
for (n = 0; n < sse_iters / ROTATOR_RELOAD; n++)
|
for (n = 0; n < sse_iters / ROTATOR_RELOAD; n++)
|
||||||
{
|
{
|
||||||
for (j = 0; j < ROTATOR_RELOAD; j++)
|
for (j = 0; j < ROTATOR_RELOAD; j++)
|
||||||
{
|
{
|
||||||
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
|
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
|
||||||
//complex 32fc multiplication b=a*two_phase_acc_reg
|
//complex 32fc multiplication b=a*two_phase_acc_reg
|
||||||
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
||||||
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
||||||
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
||||||
a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br
|
a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br
|
||||||
tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
|
tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
|
||||||
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
||||||
c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
|
c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
|
||||||
|
|
||||||
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
|
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
|
||||||
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
||||||
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
||||||
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
||||||
tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
|
tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
|
||||||
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
|
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
|
||||||
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
||||||
|
|
||||||
//next two samples
|
//next two samples
|
||||||
_in += 2;
|
_in += 2;
|
||||||
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
|
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
|
||||||
__VOLK_GNSSSDR_PREFETCH(_in + 8);
|
__VOLK_GNSSSDR_PREFETCH(_in + 8);
|
||||||
//complex 32fc multiplication b=a*two_phase_acc_reg
|
//complex 32fc multiplication b=a*two_phase_acc_reg
|
||||||
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
||||||
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
||||||
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
||||||
a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br
|
a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br
|
||||||
tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
|
tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
|
||||||
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
||||||
c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
|
c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
|
||||||
|
|
||||||
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
|
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
|
||||||
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
||||||
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
||||||
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
||||||
tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
|
tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
|
||||||
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
|
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
|
||||||
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
||||||
|
|
||||||
// store four output samples
|
// store four output samples
|
||||||
result = _mm_packs_epi32(c1, c2);// convert from 32ic to 16ic
|
result = _mm_packs_epi32(c1, c2); // convert from 32ic to 16ic
|
||||||
_mm_storeu_si128((__m128i*)_out, result);
|
_mm_storeu_si128((__m128i*)_out, result);
|
||||||
|
|
||||||
//next two samples
|
//next two samples
|
||||||
_in += 2;
|
_in += 2;
|
||||||
_out += 4;
|
_out += 4;
|
||||||
}
|
}
|
||||||
// Regenerate phase
|
// Regenerate phase
|
||||||
tmp1 = _mm_mul_ps(two_phase_acc_reg, two_phase_acc_reg);
|
tmp1 = _mm_mul_ps(two_phase_acc_reg, two_phase_acc_reg);
|
||||||
tmp2 = _mm_hadd_ps(tmp1, tmp1);
|
tmp2 = _mm_hadd_ps(tmp1, tmp1);
|
||||||
tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
|
tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
|
||||||
tmp2 = _mm_sqrt_ps(tmp1);
|
tmp2 = _mm_sqrt_ps(tmp1);
|
||||||
two_phase_acc_reg = _mm_div_ps(two_phase_acc_reg, tmp2);
|
two_phase_acc_reg = _mm_div_ps(two_phase_acc_reg, tmp2);
|
||||||
}
|
}
|
||||||
|
|
||||||
for (j = 0; j < sse_iters % ROTATOR_RELOAD; j++)
|
for (j = 0; j < sse_iters % ROTATOR_RELOAD; j++)
|
||||||
{
|
{
|
||||||
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
|
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
|
||||||
//complex 32fc multiplication b=a*two_phase_acc_reg
|
//complex 32fc multiplication b=a*two_phase_acc_reg
|
||||||
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
||||||
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
||||||
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
||||||
a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br
|
a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br
|
||||||
tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
|
tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
|
||||||
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
||||||
c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
|
c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
|
||||||
|
|
||||||
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
|
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
|
||||||
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
||||||
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
||||||
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
||||||
tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
|
tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
|
||||||
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
|
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
|
||||||
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
||||||
|
|
||||||
//next two samples
|
//next two samples
|
||||||
_in += 2;
|
_in += 2;
|
||||||
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
|
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
|
||||||
__VOLK_GNSSSDR_PREFETCH(_in + 8);
|
__VOLK_GNSSSDR_PREFETCH(_in + 8);
|
||||||
//complex 32fc multiplication b=a*two_phase_acc_reg
|
//complex 32fc multiplication b=a*two_phase_acc_reg
|
||||||
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
||||||
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
||||||
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
||||||
a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br
|
a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br
|
||||||
tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
|
tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
|
||||||
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
||||||
c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
|
c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
|
||||||
|
|
||||||
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
|
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
|
||||||
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
||||||
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
|
||||||
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
|
||||||
tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
|
tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
|
||||||
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
|
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
|
||||||
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
|
||||||
|
|
||||||
// store four output samples
|
// store four output samples
|
||||||
result = _mm_packs_epi32(c1, c2);// convert from 32ic to 16ic
|
result = _mm_packs_epi32(c1, c2); // convert from 32ic to 16ic
|
||||||
_mm_storeu_si128((__m128i*)_out, result);
|
_mm_storeu_si128((__m128i*)_out, result);
|
||||||
|
|
||||||
//next two samples
|
//next two samples
|
||||||
_in += 2;
|
_in += 2;
|
||||||
_out += 4;
|
_out += 4;
|
||||||
}
|
}
|
||||||
|
|
||||||
_mm_store_ps((float*)two_phase_acc, two_phase_acc_reg);
|
_mm_store_ps((float*)two_phase_acc, two_phase_acc_reg);
|
||||||
(*phase) = two_phase_acc[0];
|
(*phase) = two_phase_acc[0];
|
||||||
|
|
||||||
for (n = sse_iters * 4; n < num_points; ++n)
|
for (n = sse_iters * 4; n < num_points; ++n)
|
||||||
{
|
{
|
||||||
tmp16 = *_in++;
|
tmp16 = *_in++;
|
||||||
tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase);
|
tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase);
|
||||||
*_out++ = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32)));
|
*_out++ = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32)));
|
||||||
(*phase) *= phase_inc;
|
(*phase) *= phase_inc;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif /* LV_HAVE_SSE3 */
|
#endif /* LV_HAVE_SSE3 */
|
||||||
@ -657,8 +663,10 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_neon(lv_16sc_t* outVe
|
|||||||
lv_16sc_t* _out = outVector;
|
lv_16sc_t* _out = outVector;
|
||||||
|
|
||||||
lv_32fc_t ___phase4 = phase_inc * phase_inc * phase_inc * phase_inc;
|
lv_32fc_t ___phase4 = phase_inc * phase_inc * phase_inc * phase_inc;
|
||||||
__VOLK_ATTR_ALIGNED(16) float32_t __phase4_real[4] = { lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4) };
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
__VOLK_ATTR_ALIGNED(16) float32_t __phase4_imag[4] = { lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4) };
|
float32_t __phase4_real[4] = {lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4)};
|
||||||
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
float32_t __phase4_imag[4] = {lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4)};
|
||||||
|
|
||||||
float32x4_t _phase4_real = vld1q_f32(__phase4_real);
|
float32x4_t _phase4_real = vld1q_f32(__phase4_real);
|
||||||
float32x4_t _phase4_imag = vld1q_f32(__phase4_imag);
|
float32x4_t _phase4_imag = vld1q_f32(__phase4_imag);
|
||||||
@ -667,8 +675,10 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_neon(lv_16sc_t* outVe
|
|||||||
lv_32fc_t phase3 = phase2 * phase_inc;
|
lv_32fc_t phase3 = phase2 * phase_inc;
|
||||||
lv_32fc_t phase4 = phase3 * phase_inc;
|
lv_32fc_t phase4 = phase3 * phase_inc;
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(16) float32_t __phase_real[4] = { lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) };
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
__VOLK_ATTR_ALIGNED(16) float32_t __phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) };
|
float32_t __phase_real[4] = {lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4)};
|
||||||
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
float32_t __phase_imag[4] = {lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4)};
|
||||||
|
|
||||||
float32x4_t _phase_real = vld1q_f32(__phase_real);
|
float32x4_t _phase_real = vld1q_f32(__phase_real);
|
||||||
float32x4_t _phase_imag = vld1q_f32(__phase_imag);
|
float32x4_t _phase_imag = vld1q_f32(__phase_imag);
|
||||||
@ -681,7 +691,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_neon(lv_16sc_t* outVe
|
|||||||
|
|
||||||
if (neon_iters > 0)
|
if (neon_iters > 0)
|
||||||
{
|
{
|
||||||
for(; i < neon_iters; ++i)
|
for (; i < neon_iters; ++i)
|
||||||
{
|
{
|
||||||
/* load 4 complex numbers (int 16 bits each component) */
|
/* load 4 complex numbers (int 16 bits each component) */
|
||||||
tmp16 = vld2_s16((int16_t*)_in);
|
tmp16 = vld2_s16((int16_t*)_in);
|
||||||
@ -745,8 +755,10 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_neon(lv_16sc_t* outVe
|
|||||||
phase3 = phase2 * phase_inc;
|
phase3 = phase2 * phase_inc;
|
||||||
phase4 = phase3 * phase_inc;
|
phase4 = phase3 * phase_inc;
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(16) float32_t ____phase_real[4] = { lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) };
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
__VOLK_ATTR_ALIGNED(16) float32_t ____phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) };
|
float32_t ____phase_real[4] = {lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4)};
|
||||||
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
float32_t ____phase_imag[4] = {lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4)};
|
||||||
|
|
||||||
_phase_real = vld1q_f32(____phase_real);
|
_phase_real = vld1q_f32(____phase_real);
|
||||||
_phase_imag = vld1q_f32(____phase_imag);
|
_phase_imag = vld1q_f32(____phase_imag);
|
||||||
@ -757,7 +769,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_neon(lv_16sc_t* outVe
|
|||||||
|
|
||||||
(*phase) = lv_cmake((float32_t)__phase_real[0], (float32_t)__phase_imag[0]);
|
(*phase) = lv_cmake((float32_t)__phase_real[0], (float32_t)__phase_imag[0]);
|
||||||
}
|
}
|
||||||
for(i = 0; i < neon_iters % 4; ++i)
|
for (i = 0; i < neon_iters % 4; ++i)
|
||||||
{
|
{
|
||||||
tmp16_ = *_in++;
|
tmp16_ = *_in++;
|
||||||
tmp32_ = lv_cmake((float32_t)lv_creal(tmp16_), (float32_t)lv_cimag(tmp16_)) * (*phase);
|
tmp32_ = lv_cmake((float32_t)lv_creal(tmp16_), (float32_t)lv_cimag(tmp16_)) * (*phase);
|
||||||
@ -791,8 +803,10 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_neon_reload(lv_16sc_t
|
|||||||
lv_16sc_t* _out = outVector;
|
lv_16sc_t* _out = outVector;
|
||||||
|
|
||||||
lv_32fc_t ___phase4 = phase_inc * phase_inc * phase_inc * phase_inc;
|
lv_32fc_t ___phase4 = phase_inc * phase_inc * phase_inc * phase_inc;
|
||||||
__VOLK_ATTR_ALIGNED(16) float32_t __phase4_real[4] = { lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4) };
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
__VOLK_ATTR_ALIGNED(16) float32_t __phase4_imag[4] = { lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4) };
|
float32_t __phase4_real[4] = {lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4)};
|
||||||
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
float32_t __phase4_imag[4] = {lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4)};
|
||||||
|
|
||||||
float32x4_t _phase4_real = vld1q_f32(__phase4_real);
|
float32x4_t _phase4_real = vld1q_f32(__phase4_real);
|
||||||
float32x4_t _phase4_imag = vld1q_f32(__phase4_imag);
|
float32x4_t _phase4_imag = vld1q_f32(__phase4_imag);
|
||||||
@ -801,8 +815,10 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_neon_reload(lv_16sc_t
|
|||||||
lv_32fc_t phase3 = phase2 * phase_inc;
|
lv_32fc_t phase3 = phase2 * phase_inc;
|
||||||
lv_32fc_t phase4 = phase3 * phase_inc;
|
lv_32fc_t phase4 = phase3 * phase_inc;
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(16) float32_t __phase_real[4] = { lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) };
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
__VOLK_ATTR_ALIGNED(16) float32_t __phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) };
|
float32_t __phase_real[4] = {lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4)};
|
||||||
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
float32_t __phase_imag[4] = {lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4)};
|
||||||
|
|
||||||
float32x4_t _phase_real = vld1q_f32(__phase_real);
|
float32x4_t _phase_real = vld1q_f32(__phase_real);
|
||||||
float32x4_t _phase_imag = vld1q_f32(__phase_imag);
|
float32x4_t _phase_imag = vld1q_f32(__phase_imag);
|
||||||
@ -879,8 +895,10 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_neon_reload(lv_16sc_t
|
|||||||
phase3 = phase2 * phase_inc;
|
phase3 = phase2 * phase_inc;
|
||||||
phase4 = phase3 * phase_inc;
|
phase4 = phase3 * phase_inc;
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(16) float32_t ____phase_real[4] = { lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) };
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
__VOLK_ATTR_ALIGNED(16) float32_t ____phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) };
|
float32_t ____phase_real[4] = {lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4)};
|
||||||
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
float32_t ____phase_imag[4] = {lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4)};
|
||||||
|
|
||||||
_phase_real = vld1q_f32(____phase_real);
|
_phase_real = vld1q_f32(____phase_real);
|
||||||
_phase_imag = vld1q_f32(____phase_imag);
|
_phase_imag = vld1q_f32(____phase_imag);
|
||||||
@ -945,7 +963,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_neon_reload(lv_16sc_t
|
|||||||
|
|
||||||
(*phase) = lv_cmake((float32_t)__phase_real[0], (float32_t)__phase_imag[0]);
|
(*phase) = lv_cmake((float32_t)__phase_real[0], (float32_t)__phase_imag[0]);
|
||||||
}
|
}
|
||||||
for(i = 0; i < neon_iters % 4; ++i)
|
for (i = 0; i < neon_iters % 4; ++i)
|
||||||
{
|
{
|
||||||
tmp16_ = *_in++;
|
tmp16_ = *_in++;
|
||||||
tmp32_ = lv_cmake((float32_t)lv_creal(tmp16_), (float32_t)lv_cimag(tmp16_)) * (*phase);
|
tmp32_ = lv_cmake((float32_t)lv_creal(tmp16_), (float32_t)lv_cimag(tmp16_)) * (*phase);
|
||||||
|
@ -73,7 +73,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_generic(lv_16sc_t* result,
|
|||||||
for (n = 0; n < num_points; n++)
|
for (n = 0; n < num_points; n++)
|
||||||
{
|
{
|
||||||
lv_16sc_t tmp = in_a[n] * in_b[n];
|
lv_16sc_t tmp = in_a[n] * in_b[n];
|
||||||
result[0] = lv_cmake(sat_adds16i(lv_creal(result[0]), lv_creal(tmp)), sat_adds16i(lv_cimag(result[0]), lv_cimag(tmp) ));
|
result[0] = lv_cmake(sat_adds16i(lv_creal(result[0]), lv_creal(tmp)), sat_adds16i(lv_cimag(result[0]), lv_cimag(tmp)));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -96,7 +96,8 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_a_sse2(lv_16sc_t* out, con
|
|||||||
if (sse_iters > 0)
|
if (sse_iters > 0)
|
||||||
{
|
{
|
||||||
__m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, realcacc, imagcacc;
|
__m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, realcacc, imagcacc;
|
||||||
__VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4];
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
lv_16sc_t dotProductVector[4];
|
||||||
|
|
||||||
realcacc = _mm_setzero_si128();
|
realcacc = _mm_setzero_si128();
|
||||||
imagcacc = _mm_setzero_si128();
|
imagcacc = _mm_setzero_si128();
|
||||||
@ -104,25 +105,25 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_a_sse2(lv_16sc_t* out, con
|
|||||||
mask_imag = _mm_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
|
mask_imag = _mm_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
|
||||||
mask_real = _mm_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
|
mask_real = _mm_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
|
||||||
|
|
||||||
for(number = 0; number < sse_iters; number++)
|
for (number = 0; number < sse_iters; number++)
|
||||||
{
|
{
|
||||||
// a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
|
// a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
|
||||||
a = _mm_load_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
|
a = _mm_load_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
|
||||||
__VOLK_GNSSSDR_PREFETCH(_in_a + 8);
|
__VOLK_GNSSSDR_PREFETCH(_in_a + 8);
|
||||||
b = _mm_load_si128((__m128i*)_in_b);
|
b = _mm_load_si128((__m128i*)_in_b);
|
||||||
__VOLK_GNSSSDR_PREFETCH(_in_b + 8);
|
__VOLK_GNSSSDR_PREFETCH(_in_b + 8);
|
||||||
c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....
|
c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....
|
||||||
|
|
||||||
c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
|
c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
|
||||||
real = _mm_subs_epi16(c, c_sr);
|
real = _mm_subs_epi16(c, c_sr);
|
||||||
|
|
||||||
b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
|
b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
|
||||||
a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....
|
a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....
|
||||||
|
|
||||||
imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
|
imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
|
||||||
imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
|
imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
|
||||||
|
|
||||||
imag = _mm_adds_epi16(imag1, imag2); //with saturation arithmetic!
|
imag = _mm_adds_epi16(imag1, imag2); //with saturation arithmetic!
|
||||||
|
|
||||||
realcacc = _mm_adds_epi16(realcacc, real);
|
realcacc = _mm_adds_epi16(realcacc, real);
|
||||||
imagcacc = _mm_adds_epi16(imagcacc, imag);
|
imagcacc = _mm_adds_epi16(imagcacc, imag);
|
||||||
@ -136,7 +137,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_a_sse2(lv_16sc_t* out, con
|
|||||||
|
|
||||||
a = _mm_or_si128(realcacc, imagcacc);
|
a = _mm_or_si128(realcacc, imagcacc);
|
||||||
|
|
||||||
_mm_store_si128((__m128i*)dotProductVector, a); // Store the results back into the dot product vector
|
_mm_store_si128((__m128i*)dotProductVector, a); // Store the results back into the dot product vector
|
||||||
|
|
||||||
for (number = 0; number < 4; ++number)
|
for (number = 0; number < 4; ++number)
|
||||||
{
|
{
|
||||||
@ -174,7 +175,8 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_u_sse2(lv_16sc_t* out, con
|
|||||||
if (sse_iters > 0)
|
if (sse_iters > 0)
|
||||||
{
|
{
|
||||||
__m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, realcacc, imagcacc, result;
|
__m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, realcacc, imagcacc, result;
|
||||||
__VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4];
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
lv_16sc_t dotProductVector[4];
|
||||||
|
|
||||||
realcacc = _mm_setzero_si128();
|
realcacc = _mm_setzero_si128();
|
||||||
imagcacc = _mm_setzero_si128();
|
imagcacc = _mm_setzero_si128();
|
||||||
@ -182,27 +184,27 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_u_sse2(lv_16sc_t* out, con
|
|||||||
mask_imag = _mm_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
|
mask_imag = _mm_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
|
||||||
mask_real = _mm_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
|
mask_real = _mm_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
|
||||||
|
|
||||||
for(number = 0; number < sse_iters; number++)
|
for (number = 0; number < sse_iters; number++)
|
||||||
{
|
{
|
||||||
//std::complex<T> memory structure: real part -> reinterpret_cast<cv T*>(a)[2*i]
|
//std::complex<T> memory structure: real part -> reinterpret_cast<cv T*>(a)[2*i]
|
||||||
//imaginery part -> reinterpret_cast<cv T*>(a)[2*i + 1]
|
//imaginery part -> reinterpret_cast<cv T*>(a)[2*i + 1]
|
||||||
// a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
|
// a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
|
||||||
a = _mm_loadu_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
|
a = _mm_loadu_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
|
||||||
__VOLK_GNSSSDR_PREFETCH(_in_a + 8);
|
__VOLK_GNSSSDR_PREFETCH(_in_a + 8);
|
||||||
b = _mm_loadu_si128((__m128i*)_in_b);
|
b = _mm_loadu_si128((__m128i*)_in_b);
|
||||||
__VOLK_GNSSSDR_PREFETCH(_in_b + 8);
|
__VOLK_GNSSSDR_PREFETCH(_in_b + 8);
|
||||||
c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....
|
c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....
|
||||||
|
|
||||||
c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
|
c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
|
||||||
real = _mm_subs_epi16(c, c_sr);
|
real = _mm_subs_epi16(c, c_sr);
|
||||||
|
|
||||||
b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
|
b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
|
||||||
a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....
|
a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....
|
||||||
|
|
||||||
imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
|
imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
|
||||||
imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
|
imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
|
||||||
|
|
||||||
imag = _mm_adds_epi16(imag1, imag2); //with saturation arithmetic!
|
imag = _mm_adds_epi16(imag1, imag2); //with saturation arithmetic!
|
||||||
|
|
||||||
realcacc = _mm_adds_epi16(realcacc, real);
|
realcacc = _mm_adds_epi16(realcacc, real);
|
||||||
imagcacc = _mm_adds_epi16(imagcacc, imag);
|
imagcacc = _mm_adds_epi16(imagcacc, imag);
|
||||||
@ -216,7 +218,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_u_sse2(lv_16sc_t* out, con
|
|||||||
|
|
||||||
result = _mm_or_si128(realcacc, imagcacc);
|
result = _mm_or_si128(realcacc, imagcacc);
|
||||||
|
|
||||||
_mm_storeu_si128((__m128i*)dotProductVector, result); // Store the results back into the dot product vector
|
_mm_storeu_si128((__m128i*)dotProductVector, result); // Store the results back into the dot product vector
|
||||||
|
|
||||||
for (i = 0; i < 4; ++i)
|
for (i = 0; i < 4; ++i)
|
||||||
{
|
{
|
||||||
@ -253,7 +255,8 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_u_axv2(lv_16sc_t* out, con
|
|||||||
if (avx_iters > 0)
|
if (avx_iters > 0)
|
||||||
{
|
{
|
||||||
__m256i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, realcacc, imagcacc, result;
|
__m256i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, realcacc, imagcacc, result;
|
||||||
__VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8];
|
__VOLK_ATTR_ALIGNED(32)
|
||||||
|
lv_16sc_t dotProductVector[8];
|
||||||
|
|
||||||
realcacc = _mm256_setzero_si256();
|
realcacc = _mm256_setzero_si256();
|
||||||
imagcacc = _mm256_setzero_si256();
|
imagcacc = _mm256_setzero_si256();
|
||||||
@ -261,7 +264,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_u_axv2(lv_16sc_t* out, con
|
|||||||
mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
|
mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
|
||||||
mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
|
mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
|
||||||
|
|
||||||
for(number = 0; number < avx_iters; number++)
|
for (number = 0; number < avx_iters; number++)
|
||||||
{
|
{
|
||||||
a = _mm256_loadu_si256((__m256i*)_in_a);
|
a = _mm256_loadu_si256((__m256i*)_in_a);
|
||||||
__VOLK_GNSSSDR_PREFETCH(_in_a + 16);
|
__VOLK_GNSSSDR_PREFETCH(_in_a + 16);
|
||||||
@ -269,7 +272,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_u_axv2(lv_16sc_t* out, con
|
|||||||
__VOLK_GNSSSDR_PREFETCH(_in_b + 16);
|
__VOLK_GNSSSDR_PREFETCH(_in_b + 16);
|
||||||
c = _mm256_mullo_epi16(a, b);
|
c = _mm256_mullo_epi16(a, b);
|
||||||
|
|
||||||
c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
|
c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
|
||||||
real = _mm256_subs_epi16(c, c_sr);
|
real = _mm256_subs_epi16(c, c_sr);
|
||||||
|
|
||||||
b_sl = _mm256_slli_si256(b, 2);
|
b_sl = _mm256_slli_si256(b, 2);
|
||||||
@ -278,7 +281,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_u_axv2(lv_16sc_t* out, con
|
|||||||
imag1 = _mm256_mullo_epi16(a, b_sl);
|
imag1 = _mm256_mullo_epi16(a, b_sl);
|
||||||
imag2 = _mm256_mullo_epi16(b, a_sl);
|
imag2 = _mm256_mullo_epi16(b, a_sl);
|
||||||
|
|
||||||
imag = _mm256_adds_epi16(imag1, imag2); //with saturation arithmetic!
|
imag = _mm256_adds_epi16(imag1, imag2); //with saturation arithmetic!
|
||||||
|
|
||||||
realcacc = _mm256_adds_epi16(realcacc, real);
|
realcacc = _mm256_adds_epi16(realcacc, real);
|
||||||
imagcacc = _mm256_adds_epi16(imagcacc, imag);
|
imagcacc = _mm256_adds_epi16(imagcacc, imag);
|
||||||
@ -292,7 +295,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_u_axv2(lv_16sc_t* out, con
|
|||||||
|
|
||||||
result = _mm256_or_si256(realcacc, imagcacc);
|
result = _mm256_or_si256(realcacc, imagcacc);
|
||||||
|
|
||||||
_mm256_storeu_si256((__m256i*)dotProductVector, result); // Store the results back into the dot product vector
|
_mm256_storeu_si256((__m256i*)dotProductVector, result); // Store the results back into the dot product vector
|
||||||
_mm256_zeroupper();
|
_mm256_zeroupper();
|
||||||
|
|
||||||
for (i = 0; i < 8; ++i)
|
for (i = 0; i < 8; ++i)
|
||||||
@ -330,7 +333,8 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_a_axv2(lv_16sc_t* out, con
|
|||||||
if (avx_iters > 0)
|
if (avx_iters > 0)
|
||||||
{
|
{
|
||||||
__m256i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, realcacc, imagcacc, result;
|
__m256i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, realcacc, imagcacc, result;
|
||||||
__VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8];
|
__VOLK_ATTR_ALIGNED(32)
|
||||||
|
lv_16sc_t dotProductVector[8];
|
||||||
|
|
||||||
realcacc = _mm256_setzero_si256();
|
realcacc = _mm256_setzero_si256();
|
||||||
imagcacc = _mm256_setzero_si256();
|
imagcacc = _mm256_setzero_si256();
|
||||||
@ -338,7 +342,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_a_axv2(lv_16sc_t* out, con
|
|||||||
mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
|
mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
|
||||||
mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
|
mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
|
||||||
|
|
||||||
for(number = 0; number < avx_iters; number++)
|
for (number = 0; number < avx_iters; number++)
|
||||||
{
|
{
|
||||||
a = _mm256_load_si256((__m256i*)_in_a);
|
a = _mm256_load_si256((__m256i*)_in_a);
|
||||||
__VOLK_GNSSSDR_PREFETCH(_in_a + 16);
|
__VOLK_GNSSSDR_PREFETCH(_in_a + 16);
|
||||||
@ -346,7 +350,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_a_axv2(lv_16sc_t* out, con
|
|||||||
__VOLK_GNSSSDR_PREFETCH(_in_b + 16);
|
__VOLK_GNSSSDR_PREFETCH(_in_b + 16);
|
||||||
c = _mm256_mullo_epi16(a, b);
|
c = _mm256_mullo_epi16(a, b);
|
||||||
|
|
||||||
c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
|
c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
|
||||||
real = _mm256_subs_epi16(c, c_sr);
|
real = _mm256_subs_epi16(c, c_sr);
|
||||||
|
|
||||||
b_sl = _mm256_slli_si256(b, 2);
|
b_sl = _mm256_slli_si256(b, 2);
|
||||||
@ -355,7 +359,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_a_axv2(lv_16sc_t* out, con
|
|||||||
imag1 = _mm256_mullo_epi16(a, b_sl);
|
imag1 = _mm256_mullo_epi16(a, b_sl);
|
||||||
imag2 = _mm256_mullo_epi16(b, a_sl);
|
imag2 = _mm256_mullo_epi16(b, a_sl);
|
||||||
|
|
||||||
imag = _mm256_adds_epi16(imag1, imag2); //with saturation arithmetic!
|
imag = _mm256_adds_epi16(imag1, imag2); //with saturation arithmetic!
|
||||||
|
|
||||||
realcacc = _mm256_adds_epi16(realcacc, real);
|
realcacc = _mm256_adds_epi16(realcacc, real);
|
||||||
imagcacc = _mm256_adds_epi16(imagcacc, imag);
|
imagcacc = _mm256_adds_epi16(imagcacc, imag);
|
||||||
@ -369,7 +373,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_a_axv2(lv_16sc_t* out, con
|
|||||||
|
|
||||||
result = _mm256_or_si256(realcacc, imagcacc);
|
result = _mm256_or_si256(realcacc, imagcacc);
|
||||||
|
|
||||||
_mm256_store_si256((__m256i*)dotProductVector, result); // Store the results back into the dot product vector
|
_mm256_store_si256((__m256i*)dotProductVector, result); // Store the results back into the dot product vector
|
||||||
_mm256_zeroupper();
|
_mm256_zeroupper();
|
||||||
|
|
||||||
for (i = 0; i < 8; ++i)
|
for (i = 0; i < 8; ++i)
|
||||||
@ -397,8 +401,8 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_neon(lv_16sc_t* out, const
|
|||||||
unsigned int quarter_points = num_points / 4;
|
unsigned int quarter_points = num_points / 4;
|
||||||
unsigned int number;
|
unsigned int number;
|
||||||
|
|
||||||
lv_16sc_t* a_ptr = (lv_16sc_t*) in_a;
|
lv_16sc_t* a_ptr = (lv_16sc_t*)in_a;
|
||||||
lv_16sc_t* b_ptr = (lv_16sc_t*) in_b;
|
lv_16sc_t* b_ptr = (lv_16sc_t*)in_b;
|
||||||
*out = lv_cmake((int16_t)0, (int16_t)0);
|
*out = lv_cmake((int16_t)0, (int16_t)0);
|
||||||
|
|
||||||
if (quarter_points > 0)
|
if (quarter_points > 0)
|
||||||
@ -407,15 +411,16 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_neon(lv_16sc_t* out, const
|
|||||||
// 2nd lane holds the imaginary part
|
// 2nd lane holds the imaginary part
|
||||||
int16x4x2_t a_val, b_val, c_val, accumulator;
|
int16x4x2_t a_val, b_val, c_val, accumulator;
|
||||||
int16x4x2_t tmp_real, tmp_imag;
|
int16x4x2_t tmp_real, tmp_imag;
|
||||||
__VOLK_ATTR_ALIGNED(16) lv_16sc_t accum_result[4];
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
lv_16sc_t accum_result[4];
|
||||||
accumulator.val[0] = vdup_n_s16(0);
|
accumulator.val[0] = vdup_n_s16(0);
|
||||||
accumulator.val[1] = vdup_n_s16(0);
|
accumulator.val[1] = vdup_n_s16(0);
|
||||||
lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);
|
lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);
|
||||||
|
|
||||||
for(number = 0; number < quarter_points; ++number)
|
for (number = 0; number < quarter_points; ++number)
|
||||||
{
|
{
|
||||||
a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
|
a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
|
||||||
b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
|
b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
|
||||||
__VOLK_GNSSSDR_PREFETCH(a_ptr + 8);
|
__VOLK_GNSSSDR_PREFETCH(a_ptr + 8);
|
||||||
__VOLK_GNSSSDR_PREFETCH(b_ptr + 8);
|
__VOLK_GNSSSDR_PREFETCH(b_ptr + 8);
|
||||||
|
|
||||||
@ -451,7 +456,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_neon(lv_16sc_t* out, const
|
|||||||
}
|
}
|
||||||
|
|
||||||
// tail case
|
// tail case
|
||||||
for(number = quarter_points * 4; number < num_points; ++number)
|
for (number = quarter_points * 4; number < num_points; ++number)
|
||||||
{
|
{
|
||||||
*out += (*a_ptr++) * (*b_ptr++);
|
*out += (*a_ptr++) * (*b_ptr++);
|
||||||
}
|
}
|
||||||
@ -468,20 +473,21 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_neon_vma(lv_16sc_t* out, c
|
|||||||
unsigned int quarter_points = num_points / 4;
|
unsigned int quarter_points = num_points / 4;
|
||||||
unsigned int number;
|
unsigned int number;
|
||||||
|
|
||||||
lv_16sc_t* a_ptr = (lv_16sc_t*) in_a;
|
lv_16sc_t* a_ptr = (lv_16sc_t*)in_a;
|
||||||
lv_16sc_t* b_ptr = (lv_16sc_t*) in_b;
|
lv_16sc_t* b_ptr = (lv_16sc_t*)in_b;
|
||||||
// for 2-lane vectors, 1st lane holds the real part,
|
// for 2-lane vectors, 1st lane holds the real part,
|
||||||
// 2nd lane holds the imaginary part
|
// 2nd lane holds the imaginary part
|
||||||
int16x4x2_t a_val, b_val, accumulator;
|
int16x4x2_t a_val, b_val, accumulator;
|
||||||
int16x4x2_t tmp;
|
int16x4x2_t tmp;
|
||||||
__VOLK_ATTR_ALIGNED(16) lv_16sc_t accum_result[4];
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
lv_16sc_t accum_result[4];
|
||||||
accumulator.val[0] = vdup_n_s16(0);
|
accumulator.val[0] = vdup_n_s16(0);
|
||||||
accumulator.val[1] = vdup_n_s16(0);
|
accumulator.val[1] = vdup_n_s16(0);
|
||||||
|
|
||||||
for(number = 0; number < quarter_points; ++number)
|
for (number = 0; number < quarter_points; ++number)
|
||||||
{
|
{
|
||||||
a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
|
a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
|
||||||
b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
|
b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
|
||||||
__VOLK_GNSSSDR_PREFETCH(a_ptr + 8);
|
__VOLK_GNSSSDR_PREFETCH(a_ptr + 8);
|
||||||
__VOLK_GNSSSDR_PREFETCH(b_ptr + 8);
|
__VOLK_GNSSSDR_PREFETCH(b_ptr + 8);
|
||||||
|
|
||||||
@ -503,7 +509,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_neon_vma(lv_16sc_t* out, c
|
|||||||
*out = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];
|
*out = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];
|
||||||
|
|
||||||
// tail case
|
// tail case
|
||||||
for(number = quarter_points * 4; number < num_points; ++number)
|
for (number = quarter_points * 4; number < num_points; ++number)
|
||||||
{
|
{
|
||||||
*out += (*a_ptr++) * (*b_ptr++);
|
*out += (*a_ptr++) * (*b_ptr++);
|
||||||
}
|
}
|
||||||
@ -520,22 +526,23 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_neon_optvma(lv_16sc_t* out
|
|||||||
unsigned int quarter_points = num_points / 4;
|
unsigned int quarter_points = num_points / 4;
|
||||||
unsigned int number;
|
unsigned int number;
|
||||||
|
|
||||||
lv_16sc_t* a_ptr = (lv_16sc_t*) in_a;
|
lv_16sc_t* a_ptr = (lv_16sc_t*)in_a;
|
||||||
lv_16sc_t* b_ptr = (lv_16sc_t*) in_b;
|
lv_16sc_t* b_ptr = (lv_16sc_t*)in_b;
|
||||||
// for 2-lane vectors, 1st lane holds the real part,
|
// for 2-lane vectors, 1st lane holds the real part,
|
||||||
// 2nd lane holds the imaginary part
|
// 2nd lane holds the imaginary part
|
||||||
int16x4x2_t a_val, b_val, accumulator1, accumulator2;
|
int16x4x2_t a_val, b_val, accumulator1, accumulator2;
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(16) lv_16sc_t accum_result[4];
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
lv_16sc_t accum_result[4];
|
||||||
accumulator1.val[0] = vdup_n_s16(0);
|
accumulator1.val[0] = vdup_n_s16(0);
|
||||||
accumulator1.val[1] = vdup_n_s16(0);
|
accumulator1.val[1] = vdup_n_s16(0);
|
||||||
accumulator2.val[0] = vdup_n_s16(0);
|
accumulator2.val[0] = vdup_n_s16(0);
|
||||||
accumulator2.val[1] = vdup_n_s16(0);
|
accumulator2.val[1] = vdup_n_s16(0);
|
||||||
|
|
||||||
for(number = 0; number < quarter_points; ++number)
|
for (number = 0; number < quarter_points; ++number)
|
||||||
{
|
{
|
||||||
a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
|
a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
|
||||||
b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
|
b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
|
||||||
__VOLK_GNSSSDR_PREFETCH(a_ptr + 8);
|
__VOLK_GNSSSDR_PREFETCH(a_ptr + 8);
|
||||||
__VOLK_GNSSSDR_PREFETCH(b_ptr + 8);
|
__VOLK_GNSSSDR_PREFETCH(b_ptr + 8);
|
||||||
|
|
||||||
@ -556,7 +563,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_neon_optvma(lv_16sc_t* out
|
|||||||
*out = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];
|
*out = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];
|
||||||
|
|
||||||
// tail case
|
// tail case
|
||||||
for(number = quarter_points * 4; number < num_points; ++number)
|
for (number = quarter_points * 4; number < num_points; ++number)
|
||||||
{
|
{
|
||||||
*out += (*a_ptr++) * (*b_ptr++);
|
*out += (*a_ptr++) * (*b_ptr++);
|
||||||
}
|
}
|
||||||
|
@ -74,7 +74,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_generic(lv_16sc_t* resu
|
|||||||
unsigned int n;
|
unsigned int n;
|
||||||
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
||||||
{
|
{
|
||||||
result[n_vec] = lv_cmake(0,0);
|
result[n_vec] = lv_cmake(0, 0);
|
||||||
for (n = 0; n < num_points; n++)
|
for (n = 0; n < num_points; n++)
|
||||||
{
|
{
|
||||||
//r*a.r - i*a.i, i*a.r + r*a.i
|
//r*a.r - i*a.i, i*a.r + r*a.i
|
||||||
@ -96,11 +96,11 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_generic_sat(lv_16sc_t*
|
|||||||
unsigned int n;
|
unsigned int n;
|
||||||
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
||||||
{
|
{
|
||||||
result[n_vec] = lv_cmake(0,0);
|
result[n_vec] = lv_cmake(0, 0);
|
||||||
for (n = 0; n < num_points; n++)
|
for (n = 0; n < num_points; n++)
|
||||||
{
|
{
|
||||||
lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(in_common[n]), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(in_common[n]), lv_cimag(in_a[n_vec][n]))),
|
lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(in_common[n]), lv_creal(in_a[n_vec][n])), -sat_muls16i(lv_cimag(in_common[n]), lv_cimag(in_a[n_vec][n]))),
|
||||||
sat_adds16i(sat_muls16i(lv_creal(in_common[n]), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(in_common[n]), lv_creal(in_a[n_vec][n]))));
|
sat_adds16i(sat_muls16i(lv_creal(in_common[n]), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(in_common[n]), lv_creal(in_a[n_vec][n]))));
|
||||||
result[n_vec] = lv_cmake(sat_adds16i(lv_creal(result[n_vec]), lv_creal(tmp)), sat_adds16i(lv_cimag(result[n_vec]), lv_cimag(tmp)));
|
result[n_vec] = lv_cmake(sat_adds16i(lv_creal(result[n_vec]), lv_creal(tmp)), sat_adds16i(lv_cimag(result[n_vec]), lv_cimag(tmp)));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -112,9 +112,9 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_generic_sat(lv_16sc_t*
|
|||||||
#ifdef LV_HAVE_SSE2
|
#ifdef LV_HAVE_SSE2
|
||||||
#include <emmintrin.h>
|
#include <emmintrin.h>
|
||||||
|
|
||||||
static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_sse2(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points)
|
static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_sse2(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points)
|
||||||
{
|
{
|
||||||
lv_16sc_t dotProduct = lv_cmake(0,0);
|
lv_16sc_t dotProduct = lv_cmake(0, 0);
|
||||||
int n_vec;
|
int n_vec;
|
||||||
unsigned int index;
|
unsigned int index;
|
||||||
const unsigned int sse_iters = num_points / 4;
|
const unsigned int sse_iters = num_points / 4;
|
||||||
@ -125,7 +125,8 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_sse2(lv_16sc_t* resul
|
|||||||
|
|
||||||
if (sse_iters > 0)
|
if (sse_iters > 0)
|
||||||
{
|
{
|
||||||
__VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4];
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
lv_16sc_t dotProductVector[4];
|
||||||
|
|
||||||
__m128i* realcacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment());
|
__m128i* realcacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment());
|
||||||
__m128i* imagcacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment());
|
__m128i* imagcacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment());
|
||||||
@ -141,25 +142,25 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_sse2(lv_16sc_t* resul
|
|||||||
mask_imag = _mm_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
|
mask_imag = _mm_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
|
||||||
mask_real = _mm_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
|
mask_real = _mm_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
|
||||||
|
|
||||||
for(index = 0; index < sse_iters; index++)
|
for (index = 0; index < sse_iters; index++)
|
||||||
{
|
{
|
||||||
// b[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
|
// b[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
|
||||||
b = _mm_load_si128((__m128i*)_in_common); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
|
b = _mm_load_si128((__m128i*)_in_common); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
|
||||||
__VOLK_GNSSSDR_PREFETCH(_in_common + 8);
|
__VOLK_GNSSSDR_PREFETCH(_in_common + 8);
|
||||||
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
||||||
{
|
{
|
||||||
a = _mm_load_si128((__m128i*)&(_in_a[n_vec][index*4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
|
a = _mm_load_si128((__m128i*)&(_in_a[n_vec][index * 4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
|
||||||
|
|
||||||
c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....
|
c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....
|
||||||
|
|
||||||
c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
|
c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
|
||||||
real = _mm_subs_epi16(c, c_sr);
|
real = _mm_subs_epi16(c, c_sr);
|
||||||
|
|
||||||
c_sr = _mm_slli_si128(b, 2); // b3.r, b2.i ....
|
c_sr = _mm_slli_si128(b, 2); // b3.r, b2.i ....
|
||||||
c = _mm_mullo_epi16(a, c_sr); // a3.i*b3.r, ....
|
c = _mm_mullo_epi16(a, c_sr); // a3.i*b3.r, ....
|
||||||
|
|
||||||
c_sr = _mm_slli_si128(a, 2); // a3.r, a2.i ....
|
c_sr = _mm_slli_si128(a, 2); // a3.r, a2.i ....
|
||||||
imag = _mm_mullo_epi16(b, c_sr); // b3.i*a3.r, ....
|
imag = _mm_mullo_epi16(b, c_sr); // b3.i*a3.r, ....
|
||||||
|
|
||||||
imag = _mm_adds_epi16(c, imag);
|
imag = _mm_adds_epi16(c, imag);
|
||||||
|
|
||||||
@ -176,12 +177,12 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_sse2(lv_16sc_t* resul
|
|||||||
|
|
||||||
a = _mm_or_si128(realcacc[n_vec], imagcacc[n_vec]);
|
a = _mm_or_si128(realcacc[n_vec], imagcacc[n_vec]);
|
||||||
|
|
||||||
_mm_store_si128((__m128i*)dotProductVector, a); // Store the results back into the dot product vector
|
_mm_store_si128((__m128i*)dotProductVector, a); // Store the results back into the dot product vector
|
||||||
dotProduct = lv_cmake(0,0);
|
dotProduct = lv_cmake(0, 0);
|
||||||
for (index = 0; index < 4; ++index)
|
for (index = 0; index < 4; ++index)
|
||||||
{
|
{
|
||||||
dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[index])),
|
dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[index])),
|
||||||
sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[index])));
|
sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[index])));
|
||||||
}
|
}
|
||||||
_out[n_vec] = dotProduct;
|
_out[n_vec] = dotProduct;
|
||||||
}
|
}
|
||||||
@ -191,12 +192,12 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_sse2(lv_16sc_t* resul
|
|||||||
|
|
||||||
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
||||||
{
|
{
|
||||||
for(index = sse_iters * 4; index < num_points; index++)
|
for (index = sse_iters * 4; index < num_points; index++)
|
||||||
{
|
{
|
||||||
lv_16sc_t tmp = in_common[index] * in_a[n_vec][index];
|
lv_16sc_t tmp = in_common[index] * in_a[n_vec][index];
|
||||||
|
|
||||||
_out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)),
|
_out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)),
|
||||||
sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp)));
|
sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp)));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -206,9 +207,9 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_sse2(lv_16sc_t* resul
|
|||||||
#ifdef LV_HAVE_SSE2
|
#ifdef LV_HAVE_SSE2
|
||||||
#include <emmintrin.h>
|
#include <emmintrin.h>
|
||||||
|
|
||||||
static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_sse2(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points)
|
static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_sse2(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points)
|
||||||
{
|
{
|
||||||
lv_16sc_t dotProduct = lv_cmake(0,0);
|
lv_16sc_t dotProduct = lv_cmake(0, 0);
|
||||||
int n_vec;
|
int n_vec;
|
||||||
unsigned int index;
|
unsigned int index;
|
||||||
const unsigned int sse_iters = num_points / 4;
|
const unsigned int sse_iters = num_points / 4;
|
||||||
@ -219,7 +220,8 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_sse2(lv_16sc_t* resul
|
|||||||
|
|
||||||
if (sse_iters > 0)
|
if (sse_iters > 0)
|
||||||
{
|
{
|
||||||
__VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4];
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
lv_16sc_t dotProductVector[4];
|
||||||
|
|
||||||
__m128i* realcacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment());
|
__m128i* realcacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment());
|
||||||
__m128i* imagcacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment());
|
__m128i* imagcacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment());
|
||||||
@ -235,25 +237,25 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_sse2(lv_16sc_t* resul
|
|||||||
mask_imag = _mm_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
|
mask_imag = _mm_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
|
||||||
mask_real = _mm_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
|
mask_real = _mm_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
|
||||||
|
|
||||||
for(index = 0; index < sse_iters; index++)
|
for (index = 0; index < sse_iters; index++)
|
||||||
{
|
{
|
||||||
// b[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
|
// b[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
|
||||||
b = _mm_loadu_si128((__m128i*)_in_common); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
|
b = _mm_loadu_si128((__m128i*)_in_common); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
|
||||||
__VOLK_GNSSSDR_PREFETCH(_in_common + 8);
|
__VOLK_GNSSSDR_PREFETCH(_in_common + 8);
|
||||||
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
||||||
{
|
{
|
||||||
a = _mm_loadu_si128((__m128i*)&(_in_a[n_vec][index*4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
|
a = _mm_loadu_si128((__m128i*)&(_in_a[n_vec][index * 4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
|
||||||
|
|
||||||
c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....
|
c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....
|
||||||
|
|
||||||
c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
|
c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
|
||||||
real = _mm_subs_epi16(c, c_sr);
|
real = _mm_subs_epi16(c, c_sr);
|
||||||
|
|
||||||
c_sr = _mm_slli_si128(b, 2); // b3.r, b2.i ....
|
c_sr = _mm_slli_si128(b, 2); // b3.r, b2.i ....
|
||||||
c = _mm_mullo_epi16(a, c_sr); // a3.i*b3.r, ....
|
c = _mm_mullo_epi16(a, c_sr); // a3.i*b3.r, ....
|
||||||
|
|
||||||
c_sr = _mm_slli_si128(a, 2); // a3.r, a2.i ....
|
c_sr = _mm_slli_si128(a, 2); // a3.r, a2.i ....
|
||||||
imag = _mm_mullo_epi16(b, c_sr); // b3.i*a3.r, ....
|
imag = _mm_mullo_epi16(b, c_sr); // b3.i*a3.r, ....
|
||||||
|
|
||||||
imag = _mm_adds_epi16(c, imag);
|
imag = _mm_adds_epi16(c, imag);
|
||||||
|
|
||||||
@ -270,12 +272,12 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_sse2(lv_16sc_t* resul
|
|||||||
|
|
||||||
a = _mm_or_si128(realcacc[n_vec], imagcacc[n_vec]);
|
a = _mm_or_si128(realcacc[n_vec], imagcacc[n_vec]);
|
||||||
|
|
||||||
_mm_store_si128((__m128i*)dotProductVector, a); // Store the results back into the dot product vector
|
_mm_store_si128((__m128i*)dotProductVector, a); // Store the results back into the dot product vector
|
||||||
dotProduct = lv_cmake(0,0);
|
dotProduct = lv_cmake(0, 0);
|
||||||
for (index = 0; index < 4; ++index)
|
for (index = 0; index < 4; ++index)
|
||||||
{
|
{
|
||||||
dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[index])),
|
dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[index])),
|
||||||
sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[index])));
|
sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[index])));
|
||||||
}
|
}
|
||||||
_out[n_vec] = dotProduct;
|
_out[n_vec] = dotProduct;
|
||||||
}
|
}
|
||||||
@ -285,12 +287,12 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_sse2(lv_16sc_t* resul
|
|||||||
|
|
||||||
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
||||||
{
|
{
|
||||||
for(index = sse_iters * 4; index < num_points; index++)
|
for (index = sse_iters * 4; index < num_points; index++)
|
||||||
{
|
{
|
||||||
lv_16sc_t tmp = in_common[index] * in_a[n_vec][index];
|
lv_16sc_t tmp = in_common[index] * in_a[n_vec][index];
|
||||||
|
|
||||||
_out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)),
|
_out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)),
|
||||||
sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp)));
|
sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp)));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -300,9 +302,9 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_sse2(lv_16sc_t* resul
|
|||||||
#ifdef LV_HAVE_AVX2
|
#ifdef LV_HAVE_AVX2
|
||||||
#include <immintrin.h>
|
#include <immintrin.h>
|
||||||
|
|
||||||
static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_avx2(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points)
|
static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_avx2(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points)
|
||||||
{
|
{
|
||||||
lv_16sc_t dotProduct = lv_cmake(0,0);
|
lv_16sc_t dotProduct = lv_cmake(0, 0);
|
||||||
int n_vec;
|
int n_vec;
|
||||||
unsigned int index;
|
unsigned int index;
|
||||||
const unsigned int sse_iters = num_points / 8;
|
const unsigned int sse_iters = num_points / 8;
|
||||||
@ -313,7 +315,8 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_avx2(lv_16sc_t* resul
|
|||||||
|
|
||||||
if (sse_iters > 0)
|
if (sse_iters > 0)
|
||||||
{
|
{
|
||||||
__VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8];
|
__VOLK_ATTR_ALIGNED(32)
|
||||||
|
lv_16sc_t dotProductVector[8];
|
||||||
|
|
||||||
__m256i* realcacc = (__m256i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256i), volk_gnsssdr_get_alignment());
|
__m256i* realcacc = (__m256i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256i), volk_gnsssdr_get_alignment());
|
||||||
__m256i* imagcacc = (__m256i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256i), volk_gnsssdr_get_alignment());
|
__m256i* imagcacc = (__m256i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256i), volk_gnsssdr_get_alignment());
|
||||||
@ -329,24 +332,24 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_avx2(lv_16sc_t* resul
|
|||||||
mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
|
mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
|
||||||
mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
|
mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
|
||||||
|
|
||||||
for(index = 0; index < sse_iters; index++)
|
for (index = 0; index < sse_iters; index++)
|
||||||
{
|
{
|
||||||
b = _mm256_load_si256((__m256i*)_in_common);
|
b = _mm256_load_si256((__m256i*)_in_common);
|
||||||
__VOLK_GNSSSDR_PREFETCH(_in_common + 16);
|
__VOLK_GNSSSDR_PREFETCH(_in_common + 16);
|
||||||
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
||||||
{
|
{
|
||||||
a = _mm256_load_si256((__m256i*)&(_in_a[n_vec][index*8]));
|
a = _mm256_load_si256((__m256i*)&(_in_a[n_vec][index * 8]));
|
||||||
|
|
||||||
c = _mm256_mullo_epi16(a, b);
|
c = _mm256_mullo_epi16(a, b);
|
||||||
|
|
||||||
c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
|
c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
|
||||||
real = _mm256_subs_epi16(c, c_sr);
|
real = _mm256_subs_epi16(c, c_sr);
|
||||||
|
|
||||||
c_sr = _mm256_slli_si256(b, 2); // b3.r, b2.i ....
|
c_sr = _mm256_slli_si256(b, 2); // b3.r, b2.i ....
|
||||||
c = _mm256_mullo_epi16(a, c_sr); // a3.i*b3.r, ....
|
c = _mm256_mullo_epi16(a, c_sr); // a3.i*b3.r, ....
|
||||||
|
|
||||||
c_sr = _mm256_slli_si256(a, 2); // a3.r, a2.i ....
|
c_sr = _mm256_slli_si256(a, 2); // a3.r, a2.i ....
|
||||||
imag = _mm256_mullo_epi16(b, c_sr); // b3.i*a3.r, ....
|
imag = _mm256_mullo_epi16(b, c_sr); // b3.i*a3.r, ....
|
||||||
|
|
||||||
imag = _mm256_adds_epi16(c, imag);
|
imag = _mm256_adds_epi16(c, imag);
|
||||||
|
|
||||||
@ -363,12 +366,12 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_avx2(lv_16sc_t* resul
|
|||||||
|
|
||||||
a = _mm256_or_si256(realcacc[n_vec], imagcacc[n_vec]);
|
a = _mm256_or_si256(realcacc[n_vec], imagcacc[n_vec]);
|
||||||
|
|
||||||
_mm256_store_si256((__m256i*)dotProductVector, a); // Store the results back into the dot product vector
|
_mm256_store_si256((__m256i*)dotProductVector, a); // Store the results back into the dot product vector
|
||||||
dotProduct = lv_cmake(0,0);
|
dotProduct = lv_cmake(0, 0);
|
||||||
for (index = 0; index < 8; ++index)
|
for (index = 0; index < 8; ++index)
|
||||||
{
|
{
|
||||||
dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[index])),
|
dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[index])),
|
||||||
sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[index])));
|
sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[index])));
|
||||||
}
|
}
|
||||||
_out[n_vec] = dotProduct;
|
_out[n_vec] = dotProduct;
|
||||||
}
|
}
|
||||||
@ -379,12 +382,12 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_avx2(lv_16sc_t* resul
|
|||||||
|
|
||||||
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
||||||
{
|
{
|
||||||
for(index = sse_iters * 8; index < num_points; index++)
|
for (index = sse_iters * 8; index < num_points; index++)
|
||||||
{
|
{
|
||||||
lv_16sc_t tmp = in_common[index] * in_a[n_vec][index];
|
lv_16sc_t tmp = in_common[index] * in_a[n_vec][index];
|
||||||
|
|
||||||
_out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)),
|
_out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)),
|
||||||
sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp)));
|
sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp)));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -394,9 +397,9 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_avx2(lv_16sc_t* resul
|
|||||||
#ifdef LV_HAVE_AVX2
|
#ifdef LV_HAVE_AVX2
|
||||||
#include <immintrin.h>
|
#include <immintrin.h>
|
||||||
|
|
||||||
static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_avx2(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points)
|
static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_avx2(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points)
|
||||||
{
|
{
|
||||||
lv_16sc_t dotProduct = lv_cmake(0,0);
|
lv_16sc_t dotProduct = lv_cmake(0, 0);
|
||||||
|
|
||||||
const unsigned int sse_iters = num_points / 8;
|
const unsigned int sse_iters = num_points / 8;
|
||||||
int n_vec;
|
int n_vec;
|
||||||
@ -407,7 +410,8 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_avx2(lv_16sc_t* resul
|
|||||||
|
|
||||||
if (sse_iters > 0)
|
if (sse_iters > 0)
|
||||||
{
|
{
|
||||||
__VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8];
|
__VOLK_ATTR_ALIGNED(32)
|
||||||
|
lv_16sc_t dotProductVector[8];
|
||||||
|
|
||||||
__m256i* realcacc = (__m256i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256i), volk_gnsssdr_get_alignment());
|
__m256i* realcacc = (__m256i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256i), volk_gnsssdr_get_alignment());
|
||||||
__m256i* imagcacc = (__m256i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256i), volk_gnsssdr_get_alignment());
|
__m256i* imagcacc = (__m256i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256i), volk_gnsssdr_get_alignment());
|
||||||
@ -423,24 +427,24 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_avx2(lv_16sc_t* resul
|
|||||||
mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
|
mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
|
||||||
mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
|
mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
|
||||||
|
|
||||||
for(index = 0; index < sse_iters; index++)
|
for (index = 0; index < sse_iters; index++)
|
||||||
{
|
{
|
||||||
b = _mm256_loadu_si256((__m256i*)_in_common);
|
b = _mm256_loadu_si256((__m256i*)_in_common);
|
||||||
__VOLK_GNSSSDR_PREFETCH(_in_common + 16);
|
__VOLK_GNSSSDR_PREFETCH(_in_common + 16);
|
||||||
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
||||||
{
|
{
|
||||||
a = _mm256_loadu_si256((__m256i*)&(_in_a[n_vec][index*8]));
|
a = _mm256_loadu_si256((__m256i*)&(_in_a[n_vec][index * 8]));
|
||||||
|
|
||||||
c = _mm256_mullo_epi16(a, b);
|
c = _mm256_mullo_epi16(a, b);
|
||||||
|
|
||||||
c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
|
c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
|
||||||
real = _mm256_subs_epi16(c, c_sr);
|
real = _mm256_subs_epi16(c, c_sr);
|
||||||
|
|
||||||
c_sr = _mm256_slli_si256(b, 2); // b3.r, b2.i ....
|
c_sr = _mm256_slli_si256(b, 2); // b3.r, b2.i ....
|
||||||
c = _mm256_mullo_epi16(a, c_sr); // a3.i*b3.r, ....
|
c = _mm256_mullo_epi16(a, c_sr); // a3.i*b3.r, ....
|
||||||
|
|
||||||
c_sr = _mm256_slli_si256(a, 2); // a3.r, a2.i ....
|
c_sr = _mm256_slli_si256(a, 2); // a3.r, a2.i ....
|
||||||
imag = _mm256_mullo_epi16(b, c_sr); // b3.i*a3.r, ....
|
imag = _mm256_mullo_epi16(b, c_sr); // b3.i*a3.r, ....
|
||||||
|
|
||||||
imag = _mm256_adds_epi16(c, imag);
|
imag = _mm256_adds_epi16(c, imag);
|
||||||
|
|
||||||
@ -457,12 +461,12 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_avx2(lv_16sc_t* resul
|
|||||||
|
|
||||||
a = _mm256_or_si256(realcacc[n_vec], imagcacc[n_vec]);
|
a = _mm256_or_si256(realcacc[n_vec], imagcacc[n_vec]);
|
||||||
|
|
||||||
_mm256_store_si256((__m256i*)dotProductVector, a); // Store the results back into the dot product vector
|
_mm256_store_si256((__m256i*)dotProductVector, a); // Store the results back into the dot product vector
|
||||||
dotProduct = lv_cmake(0,0);
|
dotProduct = lv_cmake(0, 0);
|
||||||
for (index = 0; index < 8; ++index)
|
for (index = 0; index < 8; ++index)
|
||||||
{
|
{
|
||||||
dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[index])),
|
dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[index])),
|
||||||
sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[index])));
|
sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[index])));
|
||||||
}
|
}
|
||||||
_out[n_vec] = dotProduct;
|
_out[n_vec] = dotProduct;
|
||||||
}
|
}
|
||||||
@ -473,12 +477,12 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_avx2(lv_16sc_t* resul
|
|||||||
|
|
||||||
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
||||||
{
|
{
|
||||||
for(index = sse_iters * 8; index < num_points; index++)
|
for (index = sse_iters * 8; index < num_points; index++)
|
||||||
{
|
{
|
||||||
lv_16sc_t tmp = in_common[index] * in_a[n_vec][index];
|
lv_16sc_t tmp = in_common[index] * in_a[n_vec][index];
|
||||||
|
|
||||||
_out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)),
|
_out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)),
|
||||||
sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp)));
|
sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp)));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -488,9 +492,9 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_avx2(lv_16sc_t* resul
|
|||||||
#ifdef LV_HAVE_NEON
|
#ifdef LV_HAVE_NEON
|
||||||
#include <arm_neon.h>
|
#include <arm_neon.h>
|
||||||
|
|
||||||
static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points)
|
static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points)
|
||||||
{
|
{
|
||||||
lv_16sc_t dotProduct = lv_cmake(0,0);
|
lv_16sc_t dotProduct = lv_cmake(0, 0);
|
||||||
int n_vec;
|
int n_vec;
|
||||||
unsigned int index;
|
unsigned int index;
|
||||||
const unsigned int neon_iters = num_points / 4;
|
const unsigned int neon_iters = num_points / 4;
|
||||||
@ -501,7 +505,8 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon(lv_16sc_t* result,
|
|||||||
|
|
||||||
if (neon_iters > 0)
|
if (neon_iters > 0)
|
||||||
{
|
{
|
||||||
__VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4];
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
lv_16sc_t dotProductVector[4];
|
||||||
|
|
||||||
int16x4x2_t a_val, b_val, c_val;
|
int16x4x2_t a_val, b_val, c_val;
|
||||||
|
|
||||||
@ -509,19 +514,19 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon(lv_16sc_t* result,
|
|||||||
|
|
||||||
int16x4x2_t tmp_real, tmp_imag;
|
int16x4x2_t tmp_real, tmp_imag;
|
||||||
|
|
||||||
for(n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
||||||
{
|
{
|
||||||
accumulator[n_vec].val[0] = vdup_n_s16(0);
|
accumulator[n_vec].val[0] = vdup_n_s16(0);
|
||||||
accumulator[n_vec].val[1] = vdup_n_s16(0);
|
accumulator[n_vec].val[1] = vdup_n_s16(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
for(index = 0; index < neon_iters; index++)
|
for (index = 0; index < neon_iters; index++)
|
||||||
{
|
{
|
||||||
b_val = vld2_s16((int16_t*)_in_common); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
|
b_val = vld2_s16((int16_t*)_in_common); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
|
||||||
__VOLK_GNSSSDR_PREFETCH(_in_common + 8);
|
__VOLK_GNSSSDR_PREFETCH(_in_common + 8);
|
||||||
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
||||||
{
|
{
|
||||||
a_val = vld2_s16((int16_t*)&(_in_a[n_vec][index*4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
|
a_val = vld2_s16((int16_t*)&(_in_a[n_vec][index * 4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
|
||||||
//__VOLK_GNSSSDR_PREFETCH(&_in_a[n_vec][index*4] + 8);
|
//__VOLK_GNSSSDR_PREFETCH(&_in_a[n_vec][index*4] + 8);
|
||||||
|
|
||||||
// multiply the real*real and imag*imag to get real result
|
// multiply the real*real and imag*imag to get real result
|
||||||
@ -547,12 +552,12 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon(lv_16sc_t* result,
|
|||||||
|
|
||||||
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
||||||
{
|
{
|
||||||
vst2_s16((int16_t*)dotProductVector, accumulator[n_vec]); // Store the results back into the dot product vector
|
vst2_s16((int16_t*)dotProductVector, accumulator[n_vec]); // Store the results back into the dot product vector
|
||||||
dotProduct = lv_cmake(0,0);
|
dotProduct = lv_cmake(0, 0);
|
||||||
for (index = 0; index < 4; ++index)
|
for (index = 0; index < 4; ++index)
|
||||||
{
|
{
|
||||||
dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[index])),
|
dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[index])),
|
||||||
sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[index])));
|
sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[index])));
|
||||||
}
|
}
|
||||||
_out[n_vec] = dotProduct;
|
_out[n_vec] = dotProduct;
|
||||||
}
|
}
|
||||||
@ -561,12 +566,12 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon(lv_16sc_t* result,
|
|||||||
|
|
||||||
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
||||||
{
|
{
|
||||||
for(index = neon_iters * 4; index < num_points; index++)
|
for (index = neon_iters * 4; index < num_points; index++)
|
||||||
{
|
{
|
||||||
lv_16sc_t tmp = in_common[index] * in_a[n_vec][index];
|
lv_16sc_t tmp = in_common[index] * in_a[n_vec][index];
|
||||||
|
|
||||||
_out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)),
|
_out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)),
|
||||||
sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp)));
|
sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp)));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -576,9 +581,9 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon(lv_16sc_t* result,
|
|||||||
#ifdef LV_HAVE_NEON
|
#ifdef LV_HAVE_NEON
|
||||||
#include <arm_neon.h>
|
#include <arm_neon.h>
|
||||||
|
|
||||||
static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_vma(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points)
|
static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_vma(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points)
|
||||||
{
|
{
|
||||||
lv_16sc_t dotProduct = lv_cmake(0,0);
|
lv_16sc_t dotProduct = lv_cmake(0, 0);
|
||||||
|
|
||||||
const unsigned int neon_iters = num_points / 4;
|
const unsigned int neon_iters = num_points / 4;
|
||||||
int n_vec;
|
int n_vec;
|
||||||
@ -589,25 +594,26 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_vma(lv_16sc_t* res
|
|||||||
|
|
||||||
if (neon_iters > 0)
|
if (neon_iters > 0)
|
||||||
{
|
{
|
||||||
__VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4];
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
lv_16sc_t dotProductVector[4];
|
||||||
|
|
||||||
int16x4x2_t a_val, b_val, tmp;
|
int16x4x2_t a_val, b_val, tmp;
|
||||||
|
|
||||||
int16x4x2_t* accumulator = (int16x4x2_t*)volk_gnsssdr_malloc(num_a_vectors * sizeof(int16x4x2_t), volk_gnsssdr_get_alignment());
|
int16x4x2_t* accumulator = (int16x4x2_t*)volk_gnsssdr_malloc(num_a_vectors * sizeof(int16x4x2_t), volk_gnsssdr_get_alignment());
|
||||||
|
|
||||||
for(n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
||||||
{
|
{
|
||||||
accumulator[n_vec].val[0] = vdup_n_s16(0);
|
accumulator[n_vec].val[0] = vdup_n_s16(0);
|
||||||
accumulator[n_vec].val[1] = vdup_n_s16(0);
|
accumulator[n_vec].val[1] = vdup_n_s16(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
for(index = 0; index < neon_iters; index++)
|
for (index = 0; index < neon_iters; index++)
|
||||||
{
|
{
|
||||||
b_val = vld2_s16((int16_t*)_in_common); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
|
b_val = vld2_s16((int16_t*)_in_common); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
|
||||||
__VOLK_GNSSSDR_PREFETCH(_in_common + 8);
|
__VOLK_GNSSSDR_PREFETCH(_in_common + 8);
|
||||||
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
||||||
{
|
{
|
||||||
a_val = vld2_s16((int16_t*)&(_in_a[n_vec][index*4]));
|
a_val = vld2_s16((int16_t*)&(_in_a[n_vec][index * 4]));
|
||||||
|
|
||||||
tmp.val[0] = vmul_s16(a_val.val[0], b_val.val[0]);
|
tmp.val[0] = vmul_s16(a_val.val[0], b_val.val[0]);
|
||||||
tmp.val[1] = vmul_s16(a_val.val[1], b_val.val[0]);
|
tmp.val[1] = vmul_s16(a_val.val[1], b_val.val[0]);
|
||||||
@ -624,12 +630,12 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_vma(lv_16sc_t* res
|
|||||||
|
|
||||||
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
||||||
{
|
{
|
||||||
vst2_s16((int16_t*)dotProductVector, accumulator[n_vec]); // Store the results back into the dot product vector
|
vst2_s16((int16_t*)dotProductVector, accumulator[n_vec]); // Store the results back into the dot product vector
|
||||||
dotProduct = lv_cmake(0,0);
|
dotProduct = lv_cmake(0, 0);
|
||||||
for (index = 0; index < 4; ++index)
|
for (index = 0; index < 4; ++index)
|
||||||
{
|
{
|
||||||
dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[index])),
|
dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[index])),
|
||||||
sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[index])));
|
sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[index])));
|
||||||
}
|
}
|
||||||
_out[n_vec] = dotProduct;
|
_out[n_vec] = dotProduct;
|
||||||
}
|
}
|
||||||
@ -638,12 +644,12 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_vma(lv_16sc_t* res
|
|||||||
|
|
||||||
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
||||||
{
|
{
|
||||||
for(index = neon_iters * 4; index < num_points; index++)
|
for (index = neon_iters * 4; index < num_points; index++)
|
||||||
{
|
{
|
||||||
lv_16sc_t tmp = in_common[index] * in_a[n_vec][index];
|
lv_16sc_t tmp = in_common[index] * in_a[n_vec][index];
|
||||||
|
|
||||||
_out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)),
|
_out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)),
|
||||||
sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp)));
|
sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp)));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -653,9 +659,9 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_vma(lv_16sc_t* res
|
|||||||
#ifdef LV_HAVE_NEON
|
#ifdef LV_HAVE_NEON
|
||||||
#include <arm_neon.h>
|
#include <arm_neon.h>
|
||||||
|
|
||||||
static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_optvma(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points)
|
static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_optvma(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points)
|
||||||
{
|
{
|
||||||
lv_16sc_t dotProduct = lv_cmake(0,0);
|
lv_16sc_t dotProduct = lv_cmake(0, 0);
|
||||||
|
|
||||||
const unsigned int neon_iters = num_points / 4;
|
const unsigned int neon_iters = num_points / 4;
|
||||||
int n_vec;
|
int n_vec;
|
||||||
@ -666,14 +672,15 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_optvma(lv_16sc_t*
|
|||||||
|
|
||||||
if (neon_iters > 0)
|
if (neon_iters > 0)
|
||||||
{
|
{
|
||||||
__VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4];
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
lv_16sc_t dotProductVector[4];
|
||||||
|
|
||||||
int16x4x2_t a_val, b_val;
|
int16x4x2_t a_val, b_val;
|
||||||
|
|
||||||
int16x4x2_t* accumulator1 = (int16x4x2_t*)volk_gnsssdr_malloc(num_a_vectors * sizeof(int16x4x2_t), volk_gnsssdr_get_alignment());
|
int16x4x2_t* accumulator1 = (int16x4x2_t*)volk_gnsssdr_malloc(num_a_vectors * sizeof(int16x4x2_t), volk_gnsssdr_get_alignment());
|
||||||
int16x4x2_t* accumulator2 = (int16x4x2_t*)volk_gnsssdr_malloc(num_a_vectors * sizeof(int16x4x2_t), volk_gnsssdr_get_alignment());
|
int16x4x2_t* accumulator2 = (int16x4x2_t*)volk_gnsssdr_malloc(num_a_vectors * sizeof(int16x4x2_t), volk_gnsssdr_get_alignment());
|
||||||
|
|
||||||
for(n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
||||||
{
|
{
|
||||||
accumulator1[n_vec].val[0] = vdup_n_s16(0);
|
accumulator1[n_vec].val[0] = vdup_n_s16(0);
|
||||||
accumulator1[n_vec].val[1] = vdup_n_s16(0);
|
accumulator1[n_vec].val[1] = vdup_n_s16(0);
|
||||||
@ -681,13 +688,13 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_optvma(lv_16sc_t*
|
|||||||
accumulator2[n_vec].val[1] = vdup_n_s16(0);
|
accumulator2[n_vec].val[1] = vdup_n_s16(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
for(index = 0; index < neon_iters; index++)
|
for (index = 0; index < neon_iters; index++)
|
||||||
{
|
{
|
||||||
b_val = vld2_s16((int16_t*)_in_common); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
|
b_val = vld2_s16((int16_t*)_in_common); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
|
||||||
__VOLK_GNSSSDR_PREFETCH(_in_common + 8);
|
__VOLK_GNSSSDR_PREFETCH(_in_common + 8);
|
||||||
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
||||||
{
|
{
|
||||||
a_val = vld2_s16((int16_t*)&(_in_a[n_vec][index*4]));
|
a_val = vld2_s16((int16_t*)&(_in_a[n_vec][index * 4]));
|
||||||
|
|
||||||
accumulator1[n_vec].val[0] = vmla_s16(accumulator1[n_vec].val[0], a_val.val[0], b_val.val[0]);
|
accumulator1[n_vec].val[0] = vmla_s16(accumulator1[n_vec].val[0], a_val.val[0], b_val.val[0]);
|
||||||
accumulator1[n_vec].val[1] = vmla_s16(accumulator1[n_vec].val[1], a_val.val[0], b_val.val[1]);
|
accumulator1[n_vec].val[1] = vmla_s16(accumulator1[n_vec].val[1], a_val.val[0], b_val.val[1]);
|
||||||
@ -705,12 +712,12 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_optvma(lv_16sc_t*
|
|||||||
|
|
||||||
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
||||||
{
|
{
|
||||||
vst2_s16((int16_t*)dotProductVector, accumulator1[n_vec]); // Store the results back into the dot product vector
|
vst2_s16((int16_t*)dotProductVector, accumulator1[n_vec]); // Store the results back into the dot product vector
|
||||||
dotProduct = lv_cmake(0,0);
|
dotProduct = lv_cmake(0, 0);
|
||||||
for (index = 0; index < 4; ++index)
|
for (index = 0; index < 4; ++index)
|
||||||
{
|
{
|
||||||
dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[index])),
|
dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[index])),
|
||||||
sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[index])));
|
sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[index])));
|
||||||
}
|
}
|
||||||
_out[n_vec] = dotProduct;
|
_out[n_vec] = dotProduct;
|
||||||
}
|
}
|
||||||
@ -720,12 +727,12 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_optvma(lv_16sc_t*
|
|||||||
|
|
||||||
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
||||||
{
|
{
|
||||||
for(index = neon_iters * 4; index < num_points; index++)
|
for (index = neon_iters * 4; index < num_points; index++)
|
||||||
{
|
{
|
||||||
lv_16sc_t tmp = in_common[index] * in_a[n_vec][index];
|
lv_16sc_t tmp = in_common[index] * in_a[n_vec][index];
|
||||||
|
|
||||||
_out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)),
|
_out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)),
|
||||||
sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp)));
|
sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp)));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -47,22 +47,22 @@ static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_generic(lv_16sc_t*
|
|||||||
int num_a_vectors = 3;
|
int num_a_vectors = 3;
|
||||||
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
|
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
|
||||||
unsigned int n;
|
unsigned int n;
|
||||||
for(n = 0; n < num_a_vectors; n++)
|
for (n = 0; n < num_a_vectors; n++)
|
||||||
{
|
{
|
||||||
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
|
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
|
||||||
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
|
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
|
||||||
}
|
}
|
||||||
|
|
||||||
volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_generic(result, local_code, (const lv_16sc_t**) in_a, num_a_vectors, num_points);
|
volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_generic(result, local_code, (const lv_16sc_t**)in_a, num_a_vectors, num_points);
|
||||||
|
|
||||||
for(n = 0; n < num_a_vectors; n++)
|
for (n = 0; n < num_a_vectors; n++)
|
||||||
{
|
{
|
||||||
volk_gnsssdr_free(in_a[n]);
|
volk_gnsssdr_free(in_a[n]);
|
||||||
}
|
}
|
||||||
volk_gnsssdr_free(in_a);
|
volk_gnsssdr_free(in_a);
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif /* Generic */
|
#endif /* Generic */
|
||||||
|
|
||||||
|
|
||||||
#ifdef LV_HAVE_GENERIC
|
#ifdef LV_HAVE_GENERIC
|
||||||
@ -71,22 +71,22 @@ static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_generic_sat(lv_16sc
|
|||||||
int num_a_vectors = 3;
|
int num_a_vectors = 3;
|
||||||
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
|
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
|
||||||
unsigned int n;
|
unsigned int n;
|
||||||
for(n = 0; n < num_a_vectors; n++)
|
for (n = 0; n < num_a_vectors; n++)
|
||||||
{
|
{
|
||||||
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
|
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
|
||||||
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
|
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
|
||||||
}
|
}
|
||||||
|
|
||||||
volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_generic_sat(result, local_code, (const lv_16sc_t**) in_a, num_a_vectors, num_points);
|
volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_generic_sat(result, local_code, (const lv_16sc_t**)in_a, num_a_vectors, num_points);
|
||||||
|
|
||||||
for(n = 0; n < num_a_vectors; n++)
|
for (n = 0; n < num_a_vectors; n++)
|
||||||
{
|
{
|
||||||
volk_gnsssdr_free(in_a[n]);
|
volk_gnsssdr_free(in_a[n]);
|
||||||
}
|
}
|
||||||
volk_gnsssdr_free(in_a);
|
volk_gnsssdr_free(in_a);
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif /* Generic */
|
#endif /* Generic */
|
||||||
|
|
||||||
|
|
||||||
#ifdef LV_HAVE_SSE2
|
#ifdef LV_HAVE_SSE2
|
||||||
@ -95,18 +95,18 @@ static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_a_sse2(lv_16sc_t* r
|
|||||||
int num_a_vectors = 3;
|
int num_a_vectors = 3;
|
||||||
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
|
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
|
||||||
unsigned int n;
|
unsigned int n;
|
||||||
for(n = 0; n < num_a_vectors; n++)
|
for (n = 0; n < num_a_vectors; n++)
|
||||||
{
|
{
|
||||||
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
|
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
|
||||||
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
|
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
|
||||||
}
|
}
|
||||||
|
|
||||||
volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_sse2(result, local_code, (const lv_16sc_t**) in_a, num_a_vectors, num_points);
|
volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_sse2(result, local_code, (const lv_16sc_t**)in_a, num_a_vectors, num_points);
|
||||||
|
|
||||||
for(n = 0; n < num_a_vectors; n++)
|
for (n = 0; n < num_a_vectors; n++)
|
||||||
{
|
{
|
||||||
volk_gnsssdr_free(in_a[n]);
|
volk_gnsssdr_free(in_a[n]);
|
||||||
}
|
}
|
||||||
volk_gnsssdr_free(in_a);
|
volk_gnsssdr_free(in_a);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -120,18 +120,18 @@ static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_u_sse2(lv_16sc_t* r
|
|||||||
int num_a_vectors = 3;
|
int num_a_vectors = 3;
|
||||||
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
|
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
|
||||||
unsigned int n;
|
unsigned int n;
|
||||||
for(n = 0; n < num_a_vectors; n++)
|
for (n = 0; n < num_a_vectors; n++)
|
||||||
{
|
{
|
||||||
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t)*num_points, volk_gnsssdr_get_alignment());
|
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
|
||||||
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t)*num_points);
|
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
|
||||||
}
|
}
|
||||||
|
|
||||||
volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_sse2(result, local_code, (const lv_16sc_t**) in_a, num_a_vectors, num_points);
|
volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_sse2(result, local_code, (const lv_16sc_t**)in_a, num_a_vectors, num_points);
|
||||||
|
|
||||||
for(n = 0; n < num_a_vectors; n++)
|
for (n = 0; n < num_a_vectors; n++)
|
||||||
{
|
{
|
||||||
volk_gnsssdr_free(in_a[n]);
|
volk_gnsssdr_free(in_a[n]);
|
||||||
}
|
}
|
||||||
volk_gnsssdr_free(in_a);
|
volk_gnsssdr_free(in_a);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -145,18 +145,18 @@ static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_a_avx2(lv_16sc_t* r
|
|||||||
int num_a_vectors = 3;
|
int num_a_vectors = 3;
|
||||||
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
|
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
|
||||||
unsigned int n;
|
unsigned int n;
|
||||||
for(n = 0; n < num_a_vectors; n++)
|
for (n = 0; n < num_a_vectors; n++)
|
||||||
{
|
{
|
||||||
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t)*num_points, volk_gnsssdr_get_alignment());
|
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
|
||||||
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t)*num_points);
|
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
|
||||||
}
|
}
|
||||||
|
|
||||||
volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_avx2(result, local_code, (const lv_16sc_t**) in_a, num_a_vectors, num_points);
|
volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_avx2(result, local_code, (const lv_16sc_t**)in_a, num_a_vectors, num_points);
|
||||||
|
|
||||||
for(n = 0; n < num_a_vectors; n++)
|
for (n = 0; n < num_a_vectors; n++)
|
||||||
{
|
{
|
||||||
volk_gnsssdr_free(in_a[n]);
|
volk_gnsssdr_free(in_a[n]);
|
||||||
}
|
}
|
||||||
volk_gnsssdr_free(in_a);
|
volk_gnsssdr_free(in_a);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -170,18 +170,18 @@ static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_u_avx2(lv_16sc_t* r
|
|||||||
int num_a_vectors = 3;
|
int num_a_vectors = 3;
|
||||||
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
|
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
|
||||||
unsigned int n;
|
unsigned int n;
|
||||||
for(n = 0; n < num_a_vectors; n++)
|
for (n = 0; n < num_a_vectors; n++)
|
||||||
{
|
{
|
||||||
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t)*num_points, volk_gnsssdr_get_alignment());
|
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
|
||||||
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t)*num_points);
|
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
|
||||||
}
|
}
|
||||||
|
|
||||||
volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_avx2(result, local_code, (const lv_16sc_t**) in_a, num_a_vectors, num_points);
|
volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_avx2(result, local_code, (const lv_16sc_t**)in_a, num_a_vectors, num_points);
|
||||||
|
|
||||||
for(n = 0; n < num_a_vectors; n++)
|
for (n = 0; n < num_a_vectors; n++)
|
||||||
{
|
{
|
||||||
volk_gnsssdr_free(in_a[n]);
|
volk_gnsssdr_free(in_a[n]);
|
||||||
}
|
}
|
||||||
volk_gnsssdr_free(in_a);
|
volk_gnsssdr_free(in_a);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -195,22 +195,22 @@ static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_neon(lv_16sc_t* res
|
|||||||
int num_a_vectors = 3;
|
int num_a_vectors = 3;
|
||||||
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
|
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
|
||||||
unsigned int n;
|
unsigned int n;
|
||||||
for(n = 0; n < num_a_vectors; n++)
|
for (n = 0; n < num_a_vectors; n++)
|
||||||
{
|
{
|
||||||
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t)*num_points, volk_gnsssdr_get_alignment());
|
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
|
||||||
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t)*num_points);
|
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
|
||||||
}
|
}
|
||||||
|
|
||||||
volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon(result, local_code, (const lv_16sc_t**) in_a, num_a_vectors, num_points);
|
volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon(result, local_code, (const lv_16sc_t**)in_a, num_a_vectors, num_points);
|
||||||
|
|
||||||
for(n = 0; n < num_a_vectors; n++)
|
for (n = 0; n < num_a_vectors; n++)
|
||||||
{
|
{
|
||||||
volk_gnsssdr_free(in_a[n]);
|
volk_gnsssdr_free(in_a[n]);
|
||||||
}
|
}
|
||||||
volk_gnsssdr_free(in_a);
|
volk_gnsssdr_free(in_a);
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif // NEON
|
#endif // NEON
|
||||||
|
|
||||||
|
|
||||||
#ifdef LV_HAVE_NEON
|
#ifdef LV_HAVE_NEON
|
||||||
@ -220,22 +220,22 @@ static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_neon_vma(lv_16sc_t*
|
|||||||
int num_a_vectors = 3;
|
int num_a_vectors = 3;
|
||||||
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
|
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
|
||||||
unsigned int n;
|
unsigned int n;
|
||||||
for(n = 0; n < num_a_vectors; n++)
|
for (n = 0; n < num_a_vectors; n++)
|
||||||
{
|
{
|
||||||
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t)*num_points, volk_gnsssdr_get_alignment());
|
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
|
||||||
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t)*num_points);
|
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
|
||||||
}
|
}
|
||||||
|
|
||||||
volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_vma(result, local_code, (const lv_16sc_t**) in_a, num_a_vectors, num_points);
|
volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_vma(result, local_code, (const lv_16sc_t**)in_a, num_a_vectors, num_points);
|
||||||
|
|
||||||
for(n = 0; n < num_a_vectors; n++)
|
for (n = 0; n < num_a_vectors; n++)
|
||||||
{
|
{
|
||||||
volk_gnsssdr_free(in_a[n]);
|
volk_gnsssdr_free(in_a[n]);
|
||||||
}
|
}
|
||||||
volk_gnsssdr_free(in_a);
|
volk_gnsssdr_free(in_a);
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif // NEON
|
#endif // NEON
|
||||||
|
|
||||||
#ifdef LV_HAVE_NEON
|
#ifdef LV_HAVE_NEON
|
||||||
|
|
||||||
@ -244,23 +244,21 @@ static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_neon_optvma(lv_16sc
|
|||||||
int num_a_vectors = 3;
|
int num_a_vectors = 3;
|
||||||
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
|
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
|
||||||
unsigned int n;
|
unsigned int n;
|
||||||
for(n = 0; n < num_a_vectors; n++)
|
for (n = 0; n < num_a_vectors; n++)
|
||||||
{
|
{
|
||||||
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t)*num_points, volk_gnsssdr_get_alignment());
|
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
|
||||||
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t)*num_points);
|
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
|
||||||
}
|
}
|
||||||
|
|
||||||
volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_optvma(result, local_code, (const lv_16sc_t**) in_a, num_a_vectors, num_points);
|
volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_optvma(result, local_code, (const lv_16sc_t**)in_a, num_a_vectors, num_points);
|
||||||
|
|
||||||
for(n = 0; n < num_a_vectors; n++)
|
for (n = 0; n < num_a_vectors; n++)
|
||||||
{
|
{
|
||||||
volk_gnsssdr_free(in_a[n]);
|
volk_gnsssdr_free(in_a[n]);
|
||||||
}
|
}
|
||||||
volk_gnsssdr_free(in_a);
|
volk_gnsssdr_free(in_a);
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif // NEON
|
#endif // NEON
|
||||||
|
|
||||||
#endif // INCLUDED_volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_H
|
#endif // INCLUDED_volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_H
|
||||||
|
|
||||||
|
|
||||||
|
@ -91,29 +91,29 @@ static inline void volk_gnsssdr_16ic_x2_multiply_16ic_a_sse2(lv_16sc_t* out, con
|
|||||||
const lv_16sc_t* _in_a = in_a;
|
const lv_16sc_t* _in_a = in_a;
|
||||||
const lv_16sc_t* _in_b = in_b;
|
const lv_16sc_t* _in_b = in_b;
|
||||||
lv_16sc_t* _out = out;
|
lv_16sc_t* _out = out;
|
||||||
for(number = 0; number < sse_iters; number++)
|
for (number = 0; number < sse_iters; number++)
|
||||||
{
|
{
|
||||||
//std::complex<T> memory structure: real part -> reinterpret_cast<cv T*>(a)[2*i]
|
//std::complex<T> memory structure: real part -> reinterpret_cast<cv T*>(a)[2*i]
|
||||||
//imaginery part -> reinterpret_cast<cv T*>(a)[2*i + 1]
|
//imaginery part -> reinterpret_cast<cv T*>(a)[2*i + 1]
|
||||||
// a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
|
// a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
|
||||||
a = _mm_load_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
|
a = _mm_load_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
|
||||||
b = _mm_load_si128((__m128i*)_in_b);
|
b = _mm_load_si128((__m128i*)_in_b);
|
||||||
c = _mm_mullo_epi16 (a, b); // a3.i*b3.i, a3.r*b3.r, ....
|
c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....
|
||||||
|
|
||||||
c_sr = _mm_srli_si128 (c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
|
c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
|
||||||
real = _mm_subs_epi16 (c, c_sr);
|
real = _mm_subs_epi16(c, c_sr);
|
||||||
real = _mm_and_si128 (real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i
|
real = _mm_and_si128(real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i
|
||||||
|
|
||||||
b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
|
b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
|
||||||
a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....
|
a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....
|
||||||
|
|
||||||
imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
|
imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
|
||||||
imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
|
imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
|
||||||
|
|
||||||
imag = _mm_adds_epi16(imag1, imag2);
|
imag = _mm_adds_epi16(imag1, imag2);
|
||||||
imag = _mm_and_si128 (imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ...
|
imag = _mm_and_si128(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ...
|
||||||
|
|
||||||
result = _mm_or_si128 (real, imag);
|
result = _mm_or_si128(real, imag);
|
||||||
|
|
||||||
_mm_store_si128((__m128i*)_out, result);
|
_mm_store_si128((__m128i*)_out, result);
|
||||||
|
|
||||||
@ -137,7 +137,7 @@ static inline void volk_gnsssdr_16ic_x2_multiply_16ic_u_sse2(lv_16sc_t* out, con
|
|||||||
{
|
{
|
||||||
const unsigned int sse_iters = num_points / 4;
|
const unsigned int sse_iters = num_points / 4;
|
||||||
unsigned int number;
|
unsigned int number;
|
||||||
__m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1,imag2, b_sl, a_sl, result;
|
__m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, result;
|
||||||
|
|
||||||
mask_imag = _mm_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
|
mask_imag = _mm_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
|
||||||
mask_real = _mm_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
|
mask_real = _mm_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
|
||||||
@ -145,29 +145,29 @@ static inline void volk_gnsssdr_16ic_x2_multiply_16ic_u_sse2(lv_16sc_t* out, con
|
|||||||
const lv_16sc_t* _in_a = in_a;
|
const lv_16sc_t* _in_a = in_a;
|
||||||
const lv_16sc_t* _in_b = in_b;
|
const lv_16sc_t* _in_b = in_b;
|
||||||
lv_16sc_t* _out = out;
|
lv_16sc_t* _out = out;
|
||||||
for(number = 0; number < sse_iters; number++)
|
for (number = 0; number < sse_iters; number++)
|
||||||
{
|
{
|
||||||
//std::complex<T> memory structure: real part -> reinterpret_cast<cv T*>(a)[2*i]
|
//std::complex<T> memory structure: real part -> reinterpret_cast<cv T*>(a)[2*i]
|
||||||
//imaginery part -> reinterpret_cast<cv T*>(a)[2*i + 1]
|
//imaginery part -> reinterpret_cast<cv T*>(a)[2*i + 1]
|
||||||
// a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
|
// a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
|
||||||
a = _mm_loadu_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
|
a = _mm_loadu_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
|
||||||
b = _mm_loadu_si128((__m128i*)_in_b);
|
b = _mm_loadu_si128((__m128i*)_in_b);
|
||||||
c = _mm_mullo_epi16 (a, b); // a3.i*b3.i, a3.r*b3.r, ....
|
c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....
|
||||||
|
|
||||||
c_sr = _mm_srli_si128 (c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
|
c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
|
||||||
real = _mm_subs_epi16 (c, c_sr);
|
real = _mm_subs_epi16(c, c_sr);
|
||||||
real = _mm_and_si128 (real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i
|
real = _mm_and_si128(real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i
|
||||||
|
|
||||||
b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
|
b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
|
||||||
a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....
|
a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....
|
||||||
|
|
||||||
imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
|
imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
|
||||||
imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
|
imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
|
||||||
|
|
||||||
imag = _mm_adds_epi16(imag1, imag2);
|
imag = _mm_adds_epi16(imag1, imag2);
|
||||||
imag = _mm_and_si128 (imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ...
|
imag = _mm_and_si128(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ...
|
||||||
|
|
||||||
result = _mm_or_si128 (real, imag);
|
result = _mm_or_si128(real, imag);
|
||||||
|
|
||||||
_mm_storeu_si128((__m128i*)_out, result);
|
_mm_storeu_si128((__m128i*)_out, result);
|
||||||
|
|
||||||
@ -196,29 +196,29 @@ static inline void volk_gnsssdr_16ic_x2_multiply_16ic_u_avx2(lv_16sc_t* out, con
|
|||||||
const lv_16sc_t* _in_b = in_b;
|
const lv_16sc_t* _in_b = in_b;
|
||||||
lv_16sc_t* _out = out;
|
lv_16sc_t* _out = out;
|
||||||
|
|
||||||
__m256i a, b, c, c_sr, real, imag, imag1, imag2, b_sl, a_sl, result;
|
__m256i a, b, c, c_sr, real, imag, imag1, imag2, b_sl, a_sl, result;
|
||||||
|
|
||||||
const __m256i mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
|
const __m256i mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
|
||||||
const __m256i mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
|
const __m256i mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
|
||||||
|
|
||||||
for(;number < avx2_points; number++)
|
for (; number < avx2_points; number++)
|
||||||
{
|
{
|
||||||
a = _mm256_loadu_si256((__m256i*)_in_a); // Load the ar + ai, br + bi as ar,ai,br,bi
|
a = _mm256_loadu_si256((__m256i*)_in_a); // Load the ar + ai, br + bi as ar,ai,br,bi
|
||||||
b = _mm256_loadu_si256((__m256i*)_in_b); // Load the cr + ci, dr + di as cr,ci,dr,di
|
b = _mm256_loadu_si256((__m256i*)_in_b); // Load the cr + ci, dr + di as cr,ci,dr,di
|
||||||
c = _mm256_mullo_epi16(a, b);
|
c = _mm256_mullo_epi16(a, b);
|
||||||
|
|
||||||
c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
|
c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
|
||||||
real = _mm256_subs_epi16(c, c_sr);
|
real = _mm256_subs_epi16(c, c_sr);
|
||||||
real = _mm256_and_si256(real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i
|
real = _mm256_and_si256(real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i
|
||||||
|
|
||||||
b_sl = _mm256_slli_si256(b, 2); // b3.r, b2.i ....
|
b_sl = _mm256_slli_si256(b, 2); // b3.r, b2.i ....
|
||||||
a_sl = _mm256_slli_si256(a, 2); // a3.r, a2.i ....
|
a_sl = _mm256_slli_si256(a, 2); // a3.r, a2.i ....
|
||||||
|
|
||||||
imag1 = _mm256_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
|
imag1 = _mm256_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
|
||||||
imag2 = _mm256_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
|
imag2 = _mm256_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
|
||||||
|
|
||||||
imag = _mm256_adds_epi16(imag1, imag2);
|
imag = _mm256_adds_epi16(imag1, imag2);
|
||||||
imag = _mm256_and_si256(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ...
|
imag = _mm256_and_si256(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ...
|
||||||
|
|
||||||
result = _mm256_or_si256(real, imag);
|
result = _mm256_or_si256(real, imag);
|
||||||
|
|
||||||
@ -230,7 +230,7 @@ static inline void volk_gnsssdr_16ic_x2_multiply_16ic_u_avx2(lv_16sc_t* out, con
|
|||||||
}
|
}
|
||||||
_mm256_zeroupper();
|
_mm256_zeroupper();
|
||||||
number = avx2_points * 8;
|
number = avx2_points * 8;
|
||||||
for(;number < num_points; number++)
|
for (; number < num_points; number++)
|
||||||
{
|
{
|
||||||
*_out++ = (*_in_a++) * (*_in_b++);
|
*_out++ = (*_in_a++) * (*_in_b++);
|
||||||
}
|
}
|
||||||
@ -250,29 +250,29 @@ static inline void volk_gnsssdr_16ic_x2_multiply_16ic_a_avx2(lv_16sc_t* out, con
|
|||||||
const lv_16sc_t* _in_b = in_b;
|
const lv_16sc_t* _in_b = in_b;
|
||||||
lv_16sc_t* _out = out;
|
lv_16sc_t* _out = out;
|
||||||
|
|
||||||
__m256i a, b, c, c_sr, real, imag, imag1, imag2, b_sl, a_sl, result;
|
__m256i a, b, c, c_sr, real, imag, imag1, imag2, b_sl, a_sl, result;
|
||||||
|
|
||||||
const __m256i mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
|
const __m256i mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
|
||||||
const __m256i mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
|
const __m256i mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
|
||||||
|
|
||||||
for(;number < avx2_points; number++)
|
for (; number < avx2_points; number++)
|
||||||
{
|
{
|
||||||
a = _mm256_load_si256((__m256i*)_in_a); // Load the ar + ai, br + bi as ar,ai,br,bi
|
a = _mm256_load_si256((__m256i*)_in_a); // Load the ar + ai, br + bi as ar,ai,br,bi
|
||||||
b = _mm256_load_si256((__m256i*)_in_b); // Load the cr + ci, dr + di as cr,ci,dr,di
|
b = _mm256_load_si256((__m256i*)_in_b); // Load the cr + ci, dr + di as cr,ci,dr,di
|
||||||
c = _mm256_mullo_epi16(a, b);
|
c = _mm256_mullo_epi16(a, b);
|
||||||
|
|
||||||
c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
|
c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
|
||||||
real = _mm256_subs_epi16(c, c_sr);
|
real = _mm256_subs_epi16(c, c_sr);
|
||||||
real = _mm256_and_si256(real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i
|
real = _mm256_and_si256(real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i
|
||||||
|
|
||||||
b_sl = _mm256_slli_si256(b, 2); // b3.r, b2.i ....
|
b_sl = _mm256_slli_si256(b, 2); // b3.r, b2.i ....
|
||||||
a_sl = _mm256_slli_si256(a, 2); // a3.r, a2.i ....
|
a_sl = _mm256_slli_si256(a, 2); // a3.r, a2.i ....
|
||||||
|
|
||||||
imag1 = _mm256_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
|
imag1 = _mm256_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
|
||||||
imag2 = _mm256_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
|
imag2 = _mm256_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
|
||||||
|
|
||||||
imag = _mm256_adds_epi16(imag1, imag2);
|
imag = _mm256_adds_epi16(imag1, imag2);
|
||||||
imag = _mm256_and_si256(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ...
|
imag = _mm256_and_si256(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ...
|
||||||
|
|
||||||
result = _mm256_or_si256(real, imag);
|
result = _mm256_or_si256(real, imag);
|
||||||
|
|
||||||
@ -284,7 +284,7 @@ static inline void volk_gnsssdr_16ic_x2_multiply_16ic_a_avx2(lv_16sc_t* out, con
|
|||||||
}
|
}
|
||||||
_mm256_zeroupper();
|
_mm256_zeroupper();
|
||||||
number = avx2_points * 8;
|
number = avx2_points * 8;
|
||||||
for(;number < num_points; number++)
|
for (; number < num_points; number++)
|
||||||
{
|
{
|
||||||
*_out++ = (*_in_a++) * (*_in_b++);
|
*_out++ = (*_in_a++) * (*_in_b++);
|
||||||
}
|
}
|
||||||
@ -292,23 +292,22 @@ static inline void volk_gnsssdr_16ic_x2_multiply_16ic_a_avx2(lv_16sc_t* out, con
|
|||||||
#endif /* LV_HAVE_AVX2 */
|
#endif /* LV_HAVE_AVX2 */
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#ifdef LV_HAVE_NEON
|
#ifdef LV_HAVE_NEON
|
||||||
#include <arm_neon.h>
|
#include <arm_neon.h>
|
||||||
|
|
||||||
static inline void volk_gnsssdr_16ic_x2_multiply_16ic_neon(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points)
|
static inline void volk_gnsssdr_16ic_x2_multiply_16ic_neon(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points)
|
||||||
{
|
{
|
||||||
lv_16sc_t *a_ptr = (lv_16sc_t*) in_a;
|
lv_16sc_t* a_ptr = (lv_16sc_t*)in_a;
|
||||||
lv_16sc_t *b_ptr = (lv_16sc_t*) in_b;
|
lv_16sc_t* b_ptr = (lv_16sc_t*)in_b;
|
||||||
unsigned int quarter_points = num_points / 4;
|
unsigned int quarter_points = num_points / 4;
|
||||||
int16x4x2_t a_val, b_val, c_val;
|
int16x4x2_t a_val, b_val, c_val;
|
||||||
int16x4x2_t tmp_real, tmp_imag;
|
int16x4x2_t tmp_real, tmp_imag;
|
||||||
unsigned int number = 0;
|
unsigned int number = 0;
|
||||||
|
|
||||||
for(number = 0; number < quarter_points; ++number)
|
for (number = 0; number < quarter_points; ++number)
|
||||||
{
|
{
|
||||||
a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
|
a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
|
||||||
b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
|
b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
|
||||||
__VOLK_GNSSSDR_PREFETCH(a_ptr + 4);
|
__VOLK_GNSSSDR_PREFETCH(a_ptr + 4);
|
||||||
__VOLK_GNSSSDR_PREFETCH(b_ptr + 4);
|
__VOLK_GNSSSDR_PREFETCH(b_ptr + 4);
|
||||||
|
|
||||||
@ -334,7 +333,7 @@ static inline void volk_gnsssdr_16ic_x2_multiply_16ic_neon(lv_16sc_t* out, const
|
|||||||
out += 4;
|
out += 4;
|
||||||
}
|
}
|
||||||
|
|
||||||
for(number = quarter_points * 4; number < num_points; number++)
|
for (number = quarter_points * 4; number < num_points; number++)
|
||||||
{
|
{
|
||||||
*out++ = (*a_ptr++) * (*b_ptr++);
|
*out++ = (*a_ptr++) * (*b_ptr++);
|
||||||
}
|
}
|
||||||
|
File diff suppressed because it is too large
Load Diff
@ -41,7 +41,7 @@
|
|||||||
#include <string.h>
|
#include <string.h>
|
||||||
|
|
||||||
#ifdef LV_HAVE_GENERIC
|
#ifdef LV_HAVE_GENERIC
|
||||||
static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_generic(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points)
|
static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_generic(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points)
|
||||||
{
|
{
|
||||||
// phases must be normalized. Phase rotator expects a complex exponential input!
|
// phases must be normalized. Phase rotator expects a complex exponential input!
|
||||||
float rem_carrier_phase_in_rad = 0.345;
|
float rem_carrier_phase_in_rad = 0.345;
|
||||||
@ -53,14 +53,14 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_generic(lv_
|
|||||||
unsigned int n;
|
unsigned int n;
|
||||||
int num_a_vectors = 3;
|
int num_a_vectors = 3;
|
||||||
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
|
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
|
||||||
for(n = 0; n < num_a_vectors; n++)
|
for (n = 0; n < num_a_vectors; n++)
|
||||||
{
|
{
|
||||||
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
|
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
|
||||||
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
|
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
|
||||||
}
|
}
|
||||||
volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_generic(result, local_code, phase_inc[0], phase,(const lv_16sc_t**) in_a, num_a_vectors, num_points);
|
volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_generic(result, local_code, phase_inc[0], phase, (const lv_16sc_t**)in_a, num_a_vectors, num_points);
|
||||||
|
|
||||||
for(n = 0; n < num_a_vectors; n++)
|
for (n = 0; n < num_a_vectors; n++)
|
||||||
{
|
{
|
||||||
volk_gnsssdr_free(in_a[n]);
|
volk_gnsssdr_free(in_a[n]);
|
||||||
}
|
}
|
||||||
@ -71,7 +71,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_generic(lv_
|
|||||||
|
|
||||||
|
|
||||||
#ifdef LV_HAVE_GENERIC
|
#ifdef LV_HAVE_GENERIC
|
||||||
static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_generic_reload(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points)
|
static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_generic_reload(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points)
|
||||||
{
|
{
|
||||||
// phases must be normalized. Phase rotator expects a complex exponential input!
|
// phases must be normalized. Phase rotator expects a complex exponential input!
|
||||||
float rem_carrier_phase_in_rad = 0.345;
|
float rem_carrier_phase_in_rad = 0.345;
|
||||||
@ -83,14 +83,14 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_generic_rel
|
|||||||
unsigned int n;
|
unsigned int n;
|
||||||
int num_a_vectors = 3;
|
int num_a_vectors = 3;
|
||||||
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
|
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
|
||||||
for(n = 0; n < num_a_vectors; n++)
|
for (n = 0; n < num_a_vectors; n++)
|
||||||
{
|
{
|
||||||
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
|
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
|
||||||
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
|
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
|
||||||
}
|
}
|
||||||
volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_generic_reload(result, local_code, phase_inc[0], phase,(const lv_16sc_t**) in_a, num_a_vectors, num_points);
|
volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_generic_reload(result, local_code, phase_inc[0], phase, (const lv_16sc_t**)in_a, num_a_vectors, num_points);
|
||||||
|
|
||||||
for(n = 0; n < num_a_vectors; n++)
|
for (n = 0; n < num_a_vectors; n++)
|
||||||
{
|
{
|
||||||
volk_gnsssdr_free(in_a[n]);
|
volk_gnsssdr_free(in_a[n]);
|
||||||
}
|
}
|
||||||
@ -113,22 +113,22 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_a_sse3(lv_1
|
|||||||
unsigned int n;
|
unsigned int n;
|
||||||
int num_a_vectors = 3;
|
int num_a_vectors = 3;
|
||||||
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
|
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
|
||||||
for(n = 0; n < num_a_vectors; n++)
|
for (n = 0; n < num_a_vectors; n++)
|
||||||
{
|
{
|
||||||
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
|
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
|
||||||
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
|
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
|
||||||
}
|
}
|
||||||
|
|
||||||
volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3(result, local_code, phase_inc[0], phase, (const lv_16sc_t**) in_a, num_a_vectors, num_points);
|
volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3(result, local_code, phase_inc[0], phase, (const lv_16sc_t**)in_a, num_a_vectors, num_points);
|
||||||
|
|
||||||
for(n = 0; n < num_a_vectors; n++)
|
for (n = 0; n < num_a_vectors; n++)
|
||||||
{
|
{
|
||||||
volk_gnsssdr_free(in_a[n]);
|
volk_gnsssdr_free(in_a[n]);
|
||||||
}
|
}
|
||||||
volk_gnsssdr_free(in_a);
|
volk_gnsssdr_free(in_a);
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif // SSE3
|
#endif // SSE3
|
||||||
|
|
||||||
|
|
||||||
#ifdef LV_HAVE_SSE3
|
#ifdef LV_HAVE_SSE3
|
||||||
@ -144,22 +144,22 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_a_sse3_relo
|
|||||||
unsigned int n;
|
unsigned int n;
|
||||||
int num_a_vectors = 3;
|
int num_a_vectors = 3;
|
||||||
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
|
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
|
||||||
for(n = 0; n < num_a_vectors; n++)
|
for (n = 0; n < num_a_vectors; n++)
|
||||||
{
|
{
|
||||||
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
|
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
|
||||||
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
|
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
|
||||||
}
|
}
|
||||||
|
|
||||||
volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(result, local_code, phase_inc[0], phase, (const lv_16sc_t**) in_a, num_a_vectors, num_points);
|
volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(result, local_code, phase_inc[0], phase, (const lv_16sc_t**)in_a, num_a_vectors, num_points);
|
||||||
|
|
||||||
for(n = 0; n < num_a_vectors; n++)
|
for (n = 0; n < num_a_vectors; n++)
|
||||||
{
|
{
|
||||||
volk_gnsssdr_free(in_a[n]);
|
volk_gnsssdr_free(in_a[n]);
|
||||||
}
|
}
|
||||||
volk_gnsssdr_free(in_a);
|
volk_gnsssdr_free(in_a);
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif // SSE3
|
#endif // SSE3
|
||||||
|
|
||||||
|
|
||||||
#ifdef LV_HAVE_SSE3
|
#ifdef LV_HAVE_SSE3
|
||||||
@ -175,22 +175,22 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_u_sse3(lv_1
|
|||||||
unsigned int n;
|
unsigned int n;
|
||||||
int num_a_vectors = 3;
|
int num_a_vectors = 3;
|
||||||
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
|
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
|
||||||
for(n = 0; n < num_a_vectors; n++)
|
for (n = 0; n < num_a_vectors; n++)
|
||||||
{
|
{
|
||||||
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
|
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
|
||||||
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
|
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
|
||||||
}
|
}
|
||||||
|
|
||||||
volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_u_sse3(result, local_code, phase_inc[0], phase, (const lv_16sc_t**) in_a, num_a_vectors, num_points);
|
volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_u_sse3(result, local_code, phase_inc[0], phase, (const lv_16sc_t**)in_a, num_a_vectors, num_points);
|
||||||
|
|
||||||
for(n = 0; n < num_a_vectors; n++)
|
for (n = 0; n < num_a_vectors; n++)
|
||||||
{
|
{
|
||||||
volk_gnsssdr_free(in_a[n]);
|
volk_gnsssdr_free(in_a[n]);
|
||||||
}
|
}
|
||||||
volk_gnsssdr_free(in_a);
|
volk_gnsssdr_free(in_a);
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif // SSE3
|
#endif // SSE3
|
||||||
|
|
||||||
|
|
||||||
#ifdef LV_HAVE_AVX2
|
#ifdef LV_HAVE_AVX2
|
||||||
@ -206,22 +206,22 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_a_avx2(lv_1
|
|||||||
unsigned int n;
|
unsigned int n;
|
||||||
int num_a_vectors = 3;
|
int num_a_vectors = 3;
|
||||||
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
|
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
|
||||||
for(n = 0; n < num_a_vectors; n++)
|
for (n = 0; n < num_a_vectors; n++)
|
||||||
{
|
{
|
||||||
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
|
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
|
||||||
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
|
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
|
||||||
}
|
}
|
||||||
|
|
||||||
volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(result, local_code, phase_inc[0], phase, (const lv_16sc_t**) in_a, num_a_vectors, num_points);
|
volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(result, local_code, phase_inc[0], phase, (const lv_16sc_t**)in_a, num_a_vectors, num_points);
|
||||||
|
|
||||||
for(n = 0; n < num_a_vectors; n++)
|
for (n = 0; n < num_a_vectors; n++)
|
||||||
{
|
{
|
||||||
volk_gnsssdr_free(in_a[n]);
|
volk_gnsssdr_free(in_a[n]);
|
||||||
}
|
}
|
||||||
volk_gnsssdr_free(in_a);
|
volk_gnsssdr_free(in_a);
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif // AVX2
|
#endif // AVX2
|
||||||
|
|
||||||
|
|
||||||
#ifdef LV_HAVE_AVX2
|
#ifdef LV_HAVE_AVX2
|
||||||
@ -237,22 +237,22 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_a_avx2_relo
|
|||||||
unsigned int n;
|
unsigned int n;
|
||||||
int num_a_vectors = 3;
|
int num_a_vectors = 3;
|
||||||
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
|
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
|
||||||
for(n = 0; n < num_a_vectors; n++)
|
for (n = 0; n < num_a_vectors; n++)
|
||||||
{
|
{
|
||||||
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
|
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
|
||||||
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
|
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
|
||||||
}
|
}
|
||||||
|
|
||||||
volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(result, local_code, phase_inc[0], phase, (const lv_16sc_t**) in_a, num_a_vectors, num_points);
|
volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(result, local_code, phase_inc[0], phase, (const lv_16sc_t**)in_a, num_a_vectors, num_points);
|
||||||
|
|
||||||
for(n = 0; n < num_a_vectors; n++)
|
for (n = 0; n < num_a_vectors; n++)
|
||||||
{
|
{
|
||||||
volk_gnsssdr_free(in_a[n]);
|
volk_gnsssdr_free(in_a[n]);
|
||||||
}
|
}
|
||||||
volk_gnsssdr_free(in_a);
|
volk_gnsssdr_free(in_a);
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif // AVX2
|
#endif // AVX2
|
||||||
|
|
||||||
|
|
||||||
#ifdef LV_HAVE_AVX2
|
#ifdef LV_HAVE_AVX2
|
||||||
@ -268,22 +268,22 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_u_avx2(lv_1
|
|||||||
unsigned int n;
|
unsigned int n;
|
||||||
int num_a_vectors = 3;
|
int num_a_vectors = 3;
|
||||||
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
|
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
|
||||||
for(n = 0; n < num_a_vectors; n++)
|
for (n = 0; n < num_a_vectors; n++)
|
||||||
{
|
{
|
||||||
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
|
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
|
||||||
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
|
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
|
||||||
}
|
}
|
||||||
|
|
||||||
volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(result, local_code, phase_inc[0], phase, (const lv_16sc_t**) in_a, num_a_vectors, num_points);
|
volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(result, local_code, phase_inc[0], phase, (const lv_16sc_t**)in_a, num_a_vectors, num_points);
|
||||||
|
|
||||||
for(n = 0; n < num_a_vectors; n++)
|
for (n = 0; n < num_a_vectors; n++)
|
||||||
{
|
{
|
||||||
volk_gnsssdr_free(in_a[n]);
|
volk_gnsssdr_free(in_a[n]);
|
||||||
}
|
}
|
||||||
volk_gnsssdr_free(in_a);
|
volk_gnsssdr_free(in_a);
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif // AVX2
|
#endif // AVX2
|
||||||
|
|
||||||
|
|
||||||
#ifdef LV_HAVE_AVX2
|
#ifdef LV_HAVE_AVX2
|
||||||
@ -299,22 +299,22 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_u_avx2_relo
|
|||||||
unsigned int n;
|
unsigned int n;
|
||||||
int num_a_vectors = 3;
|
int num_a_vectors = 3;
|
||||||
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
|
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
|
||||||
for(n = 0; n < num_a_vectors; n++)
|
for (n = 0; n < num_a_vectors; n++)
|
||||||
{
|
{
|
||||||
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
|
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
|
||||||
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
|
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
|
||||||
}
|
}
|
||||||
|
|
||||||
volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(result, local_code, phase_inc[0], phase, (const lv_16sc_t**) in_a, num_a_vectors, num_points);
|
volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(result, local_code, phase_inc[0], phase, (const lv_16sc_t**)in_a, num_a_vectors, num_points);
|
||||||
|
|
||||||
for(n = 0; n < num_a_vectors; n++)
|
for (n = 0; n < num_a_vectors; n++)
|
||||||
{
|
{
|
||||||
volk_gnsssdr_free(in_a[n]);
|
volk_gnsssdr_free(in_a[n]);
|
||||||
}
|
}
|
||||||
volk_gnsssdr_free(in_a);
|
volk_gnsssdr_free(in_a);
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif // AVX2
|
#endif // AVX2
|
||||||
|
|
||||||
|
|
||||||
#ifdef LV_HAVE_NEON
|
#ifdef LV_HAVE_NEON
|
||||||
@ -330,22 +330,22 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_neon(lv_16s
|
|||||||
unsigned int n;
|
unsigned int n;
|
||||||
int num_a_vectors = 3;
|
int num_a_vectors = 3;
|
||||||
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
|
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
|
||||||
for(n = 0; n < num_a_vectors; n++)
|
for (n = 0; n < num_a_vectors; n++)
|
||||||
{
|
{
|
||||||
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
|
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
|
||||||
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
|
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
|
||||||
}
|
}
|
||||||
|
|
||||||
volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon(result, local_code, phase_inc[0], phase, (const lv_16sc_t**) in_a, num_a_vectors, num_points);
|
volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon(result, local_code, phase_inc[0], phase, (const lv_16sc_t**)in_a, num_a_vectors, num_points);
|
||||||
|
|
||||||
for(n = 0; n < num_a_vectors; n++)
|
for (n = 0; n < num_a_vectors; n++)
|
||||||
{
|
{
|
||||||
volk_gnsssdr_free(in_a[n]);
|
volk_gnsssdr_free(in_a[n]);
|
||||||
}
|
}
|
||||||
volk_gnsssdr_free(in_a);
|
volk_gnsssdr_free(in_a);
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif // NEON
|
#endif // NEON
|
||||||
|
|
||||||
|
|
||||||
#ifdef LV_HAVE_NEON
|
#ifdef LV_HAVE_NEON
|
||||||
@ -361,23 +361,21 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_neon_vma(lv
|
|||||||
unsigned int n;
|
unsigned int n;
|
||||||
int num_a_vectors = 3;
|
int num_a_vectors = 3;
|
||||||
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
|
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
|
||||||
for(n = 0; n < num_a_vectors; n++)
|
for (n = 0; n < num_a_vectors; n++)
|
||||||
{
|
{
|
||||||
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
|
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
|
||||||
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
|
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
|
||||||
}
|
}
|
||||||
|
|
||||||
volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_vma(result, local_code, phase_inc[0], phase, (const lv_16sc_t**) in_a, num_a_vectors, num_points);
|
volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_vma(result, local_code, phase_inc[0], phase, (const lv_16sc_t**)in_a, num_a_vectors, num_points);
|
||||||
|
|
||||||
for(n = 0; n < num_a_vectors; n++)
|
for (n = 0; n < num_a_vectors; n++)
|
||||||
{
|
{
|
||||||
volk_gnsssdr_free(in_a[n]);
|
volk_gnsssdr_free(in_a[n]);
|
||||||
}
|
}
|
||||||
volk_gnsssdr_free(in_a);
|
volk_gnsssdr_free(in_a);
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif // NEON
|
#endif // NEON
|
||||||
|
|
||||||
#endif // INCLUDED_volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_H
|
#endif // INCLUDED_volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_H
|
||||||
|
|
||||||
|
|
||||||
|
@ -106,7 +106,8 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_sse4_1(lv_16sc_t** r
|
|||||||
const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
|
const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
|
||||||
const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
|
const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4];
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
int local_code_chip_index[4];
|
||||||
int local_code_chip_index_;
|
int local_code_chip_index_;
|
||||||
|
|
||||||
const __m128i zeros = _mm_setzero_si128();
|
const __m128i zeros = _mm_setzero_si128();
|
||||||
@ -120,7 +121,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_sse4_1(lv_16sc_t** r
|
|||||||
shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]);
|
shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]);
|
||||||
aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
|
aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
|
||||||
__m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
|
__m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
|
||||||
for(n = 0; n < quarterPoints; n++)
|
for (n = 0; n < quarterPoints; n++)
|
||||||
{
|
{
|
||||||
aux = _mm_mul_ps(code_phase_step_chips_reg, indexn);
|
aux = _mm_mul_ps(code_phase_step_chips_reg, indexn);
|
||||||
aux = _mm_add_ps(aux, aux2);
|
aux = _mm_add_ps(aux, aux2);
|
||||||
@ -138,13 +139,13 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_sse4_1(lv_16sc_t** r
|
|||||||
aux_i = _mm_and_si128(code_length_chips_reg_i, negatives);
|
aux_i = _mm_and_si128(code_length_chips_reg_i, negatives);
|
||||||
local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i);
|
local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i);
|
||||||
_mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg);
|
_mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg);
|
||||||
for(k = 0; k < 4; ++k)
|
for (k = 0; k < 4; ++k)
|
||||||
{
|
{
|
||||||
_result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
|
_result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
|
||||||
}
|
}
|
||||||
indexn = _mm_add_ps(indexn, fours);
|
indexn = _mm_add_ps(indexn, fours);
|
||||||
}
|
}
|
||||||
for(n = quarterPoints * 4; n < num_points; n++)
|
for (n = quarterPoints * 4; n < num_points; n++)
|
||||||
{
|
{
|
||||||
// resample code for current tap
|
// resample code for current tap
|
||||||
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
|
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
|
||||||
@ -156,7 +157,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_sse4_1(lv_16sc_t** r
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
#ifdef LV_HAVE_SSE4_1
|
#ifdef LV_HAVE_SSE4_1
|
||||||
@ -172,7 +173,8 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_sse4_1(lv_16sc_t** r
|
|||||||
const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
|
const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
|
||||||
const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
|
const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4];
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
int local_code_chip_index[4];
|
||||||
int local_code_chip_index_;
|
int local_code_chip_index_;
|
||||||
|
|
||||||
const __m128i zeros = _mm_setzero_si128();
|
const __m128i zeros = _mm_setzero_si128();
|
||||||
@ -186,7 +188,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_sse4_1(lv_16sc_t** r
|
|||||||
shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]);
|
shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]);
|
||||||
aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
|
aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
|
||||||
__m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
|
__m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
|
||||||
for(n = 0; n < quarterPoints; n++)
|
for (n = 0; n < quarterPoints; n++)
|
||||||
{
|
{
|
||||||
aux = _mm_mul_ps(code_phase_step_chips_reg, indexn);
|
aux = _mm_mul_ps(code_phase_step_chips_reg, indexn);
|
||||||
aux = _mm_add_ps(aux, aux2);
|
aux = _mm_add_ps(aux, aux2);
|
||||||
@ -204,13 +206,13 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_sse4_1(lv_16sc_t** r
|
|||||||
aux_i = _mm_and_si128(code_length_chips_reg_i, negatives);
|
aux_i = _mm_and_si128(code_length_chips_reg_i, negatives);
|
||||||
local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i);
|
local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i);
|
||||||
_mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg);
|
_mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg);
|
||||||
for(k = 0; k < 4; ++k)
|
for (k = 0; k < 4; ++k)
|
||||||
{
|
{
|
||||||
_result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
|
_result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
|
||||||
}
|
}
|
||||||
indexn = _mm_add_ps(indexn, fours);
|
indexn = _mm_add_ps(indexn, fours);
|
||||||
}
|
}
|
||||||
for(n = quarterPoints * 4; n < num_points; n++)
|
for (n = quarterPoints * 4; n < num_points; n++)
|
||||||
{
|
{
|
||||||
// resample code for current tap
|
// resample code for current tap
|
||||||
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
|
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
|
||||||
@ -239,7 +241,8 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_sse3(lv_16sc_t** res
|
|||||||
const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
|
const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
|
||||||
const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
|
const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4];
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
int local_code_chip_index[4];
|
||||||
int local_code_chip_index_;
|
int local_code_chip_index_;
|
||||||
|
|
||||||
const __m128i zeros = _mm_setzero_si128();
|
const __m128i zeros = _mm_setzero_si128();
|
||||||
@ -253,7 +256,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_sse3(lv_16sc_t** res
|
|||||||
shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]);
|
shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]);
|
||||||
aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
|
aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
|
||||||
__m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
|
__m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
|
||||||
for(n = 0; n < quarterPoints; n++)
|
for (n = 0; n < quarterPoints; n++)
|
||||||
{
|
{
|
||||||
aux = _mm_mul_ps(code_phase_step_chips_reg, indexn);
|
aux = _mm_mul_ps(code_phase_step_chips_reg, indexn);
|
||||||
aux = _mm_add_ps(aux, aux2);
|
aux = _mm_add_ps(aux, aux2);
|
||||||
@ -274,13 +277,13 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_sse3(lv_16sc_t** res
|
|||||||
aux_i = _mm_and_si128(code_length_chips_reg_i, negatives);
|
aux_i = _mm_and_si128(code_length_chips_reg_i, negatives);
|
||||||
local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i);
|
local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i);
|
||||||
_mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg);
|
_mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg);
|
||||||
for(k = 0; k < 4; ++k)
|
for (k = 0; k < 4; ++k)
|
||||||
{
|
{
|
||||||
_result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
|
_result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
|
||||||
}
|
}
|
||||||
indexn = _mm_add_ps(indexn, fours);
|
indexn = _mm_add_ps(indexn, fours);
|
||||||
}
|
}
|
||||||
for(n = quarterPoints * 4; n < num_points; n++)
|
for (n = quarterPoints * 4; n < num_points; n++)
|
||||||
{
|
{
|
||||||
// resample code for current tap
|
// resample code for current tap
|
||||||
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
|
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
|
||||||
@ -309,7 +312,8 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_sse3(lv_16sc_t** res
|
|||||||
const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
|
const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
|
||||||
const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
|
const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4];
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
int local_code_chip_index[4];
|
||||||
int local_code_chip_index_;
|
int local_code_chip_index_;
|
||||||
|
|
||||||
const __m128i zeros = _mm_setzero_si128();
|
const __m128i zeros = _mm_setzero_si128();
|
||||||
@ -323,7 +327,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_sse3(lv_16sc_t** res
|
|||||||
shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]);
|
shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]);
|
||||||
aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
|
aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
|
||||||
__m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
|
__m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
|
||||||
for(n = 0; n < quarterPoints; n++)
|
for (n = 0; n < quarterPoints; n++)
|
||||||
{
|
{
|
||||||
aux = _mm_mul_ps(code_phase_step_chips_reg, indexn);
|
aux = _mm_mul_ps(code_phase_step_chips_reg, indexn);
|
||||||
aux = _mm_add_ps(aux, aux2);
|
aux = _mm_add_ps(aux, aux2);
|
||||||
@ -344,13 +348,13 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_sse3(lv_16sc_t** res
|
|||||||
aux_i = _mm_and_si128(code_length_chips_reg_i, negatives);
|
aux_i = _mm_and_si128(code_length_chips_reg_i, negatives);
|
||||||
local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i);
|
local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i);
|
||||||
_mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg);
|
_mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg);
|
||||||
for(k = 0; k < 4; ++k)
|
for (k = 0; k < 4; ++k)
|
||||||
{
|
{
|
||||||
_result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
|
_result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
|
||||||
}
|
}
|
||||||
indexn = _mm_add_ps(indexn, fours);
|
indexn = _mm_add_ps(indexn, fours);
|
||||||
}
|
}
|
||||||
for(n = quarterPoints * 4; n < num_points; n++)
|
for (n = quarterPoints * 4; n < num_points; n++)
|
||||||
{
|
{
|
||||||
// resample code for current tap
|
// resample code for current tap
|
||||||
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
|
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
|
||||||
@ -378,7 +382,8 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_avx(lv_16sc_t** resu
|
|||||||
const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips);
|
const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips);
|
||||||
const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips);
|
const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips);
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(32) int local_code_chip_index[8];
|
__VOLK_ATTR_ALIGNED(32)
|
||||||
|
int local_code_chip_index[8];
|
||||||
int local_code_chip_index_;
|
int local_code_chip_index_;
|
||||||
|
|
||||||
const __m256 zeros = _mm256_setzero_ps();
|
const __m256 zeros = _mm256_setzero_ps();
|
||||||
@ -393,7 +398,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_avx(lv_16sc_t** resu
|
|||||||
shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]);
|
shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]);
|
||||||
aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
|
aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
|
||||||
indexn = n0;
|
indexn = n0;
|
||||||
for(n = 0; n < avx_iters; n++)
|
for (n = 0; n < avx_iters; n++)
|
||||||
{
|
{
|
||||||
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0);
|
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0);
|
||||||
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3);
|
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3);
|
||||||
@ -411,13 +416,13 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_avx(lv_16sc_t** resu
|
|||||||
|
|
||||||
// no negatives
|
// no negatives
|
||||||
c = _mm256_cvtepi32_ps(local_code_chip_index_reg);
|
c = _mm256_cvtepi32_ps(local_code_chip_index_reg);
|
||||||
negatives = _mm256_cmp_ps(c, zeros, 0x01 );
|
negatives = _mm256_cmp_ps(c, zeros, 0x01);
|
||||||
aux3 = _mm256_and_ps(code_length_chips_reg_f, negatives);
|
aux3 = _mm256_and_ps(code_length_chips_reg_f, negatives);
|
||||||
aux = _mm256_add_ps(c, aux3);
|
aux = _mm256_add_ps(c, aux3);
|
||||||
local_code_chip_index_reg = _mm256_cvttps_epi32(aux);
|
local_code_chip_index_reg = _mm256_cvttps_epi32(aux);
|
||||||
|
|
||||||
_mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg);
|
_mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg);
|
||||||
for(k = 0; k < 8; ++k)
|
for (k = 0; k < 8; ++k)
|
||||||
{
|
{
|
||||||
_result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]];
|
_result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]];
|
||||||
}
|
}
|
||||||
@ -427,7 +432,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_avx(lv_16sc_t** resu
|
|||||||
_mm256_zeroupper();
|
_mm256_zeroupper();
|
||||||
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
|
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
|
||||||
{
|
{
|
||||||
for(n = avx_iters * 8; n < num_points; n++)
|
for (n = avx_iters * 8; n < num_points; n++)
|
||||||
{
|
{
|
||||||
// resample code for current tap
|
// resample code for current tap
|
||||||
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
|
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
|
||||||
@ -455,7 +460,8 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_avx(lv_16sc_t** resu
|
|||||||
const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips);
|
const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips);
|
||||||
const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips);
|
const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips);
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(32) int local_code_chip_index[8];
|
__VOLK_ATTR_ALIGNED(32)
|
||||||
|
int local_code_chip_index[8];
|
||||||
int local_code_chip_index_;
|
int local_code_chip_index_;
|
||||||
|
|
||||||
const __m256 zeros = _mm256_setzero_ps();
|
const __m256 zeros = _mm256_setzero_ps();
|
||||||
@ -470,7 +476,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_avx(lv_16sc_t** resu
|
|||||||
shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]);
|
shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]);
|
||||||
aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
|
aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
|
||||||
indexn = n0;
|
indexn = n0;
|
||||||
for(n = 0; n < avx_iters; n++)
|
for (n = 0; n < avx_iters; n++)
|
||||||
{
|
{
|
||||||
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0);
|
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0);
|
||||||
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3);
|
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3);
|
||||||
@ -488,13 +494,13 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_avx(lv_16sc_t** resu
|
|||||||
|
|
||||||
// no negatives
|
// no negatives
|
||||||
c = _mm256_cvtepi32_ps(local_code_chip_index_reg);
|
c = _mm256_cvtepi32_ps(local_code_chip_index_reg);
|
||||||
negatives = _mm256_cmp_ps(c, zeros, 0x01 );
|
negatives = _mm256_cmp_ps(c, zeros, 0x01);
|
||||||
aux3 = _mm256_and_ps(code_length_chips_reg_f, negatives);
|
aux3 = _mm256_and_ps(code_length_chips_reg_f, negatives);
|
||||||
aux = _mm256_add_ps(c, aux3);
|
aux = _mm256_add_ps(c, aux3);
|
||||||
local_code_chip_index_reg = _mm256_cvttps_epi32(aux);
|
local_code_chip_index_reg = _mm256_cvttps_epi32(aux);
|
||||||
|
|
||||||
_mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg);
|
_mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg);
|
||||||
for(k = 0; k < 8; ++k)
|
for (k = 0; k < 8; ++k)
|
||||||
{
|
{
|
||||||
_result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]];
|
_result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]];
|
||||||
}
|
}
|
||||||
@ -504,7 +510,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_avx(lv_16sc_t** resu
|
|||||||
_mm256_zeroupper();
|
_mm256_zeroupper();
|
||||||
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
|
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
|
||||||
{
|
{
|
||||||
for(n = avx_iters * 8; n < num_points; n++)
|
for (n = avx_iters * 8; n < num_points; n++)
|
||||||
{
|
{
|
||||||
// resample code for current tap
|
// resample code for current tap
|
||||||
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
|
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
|
||||||
@ -530,7 +536,8 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_neon(lv_16sc_t** resul
|
|||||||
const float32x4_t rem_code_phase_chips_reg = vdupq_n_f32(rem_code_phase_chips);
|
const float32x4_t rem_code_phase_chips_reg = vdupq_n_f32(rem_code_phase_chips);
|
||||||
const float32x4_t code_phase_step_chips_reg = vdupq_n_f32(code_phase_step_chips);
|
const float32x4_t code_phase_step_chips_reg = vdupq_n_f32(code_phase_step_chips);
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(16) int32_t local_code_chip_index[4];
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
int32_t local_code_chip_index[4];
|
||||||
int32_t local_code_chip_index_;
|
int32_t local_code_chip_index_;
|
||||||
|
|
||||||
const int32x4_t zeros = vdupq_n_s32(0);
|
const int32x4_t zeros = vdupq_n_s32(0);
|
||||||
@ -538,11 +545,12 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_neon(lv_16sc_t** resul
|
|||||||
const int32x4_t code_length_chips_reg_i = vdupq_n_s32((int32_t)code_length_chips);
|
const int32x4_t code_length_chips_reg_i = vdupq_n_s32((int32_t)code_length_chips);
|
||||||
int32x4_t local_code_chip_index_reg, aux_i, negatives, i;
|
int32x4_t local_code_chip_index_reg, aux_i, negatives, i;
|
||||||
float32x4_t aux, aux2, shifts_chips_reg, fi, c, j, cTrunc, base, indexn, reciprocal;
|
float32x4_t aux, aux2, shifts_chips_reg, fi, c, j, cTrunc, base, indexn, reciprocal;
|
||||||
__VOLK_ATTR_ALIGNED(16) const float vec[4] = { 0.0f, 1.0f, 2.0f, 3.0f };
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
const float vec[4] = {0.0f, 1.0f, 2.0f, 3.0f};
|
||||||
uint32x4_t igx;
|
uint32x4_t igx;
|
||||||
reciprocal = vrecpeq_f32(code_length_chips_reg_f);
|
reciprocal = vrecpeq_f32(code_length_chips_reg_f);
|
||||||
reciprocal = vmulq_f32(vrecpsq_f32(code_length_chips_reg_f, reciprocal), reciprocal);
|
reciprocal = vmulq_f32(vrecpsq_f32(code_length_chips_reg_f, reciprocal), reciprocal);
|
||||||
reciprocal = vmulq_f32(vrecpsq_f32(code_length_chips_reg_f, reciprocal), reciprocal); // this refinement is required!
|
reciprocal = vmulq_f32(vrecpsq_f32(code_length_chips_reg_f, reciprocal), reciprocal); // this refinement is required!
|
||||||
float32x4_t n0 = vld1q_f32((float*)vec);
|
float32x4_t n0 = vld1q_f32((float*)vec);
|
||||||
int current_correlator_tap;
|
int current_correlator_tap;
|
||||||
unsigned int n;
|
unsigned int n;
|
||||||
@ -552,7 +560,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_neon(lv_16sc_t** resul
|
|||||||
shifts_chips_reg = vdupq_n_f32((float)shifts_chips[current_correlator_tap]);
|
shifts_chips_reg = vdupq_n_f32((float)shifts_chips[current_correlator_tap]);
|
||||||
aux2 = vsubq_f32(shifts_chips_reg, rem_code_phase_chips_reg);
|
aux2 = vsubq_f32(shifts_chips_reg, rem_code_phase_chips_reg);
|
||||||
indexn = n0;
|
indexn = n0;
|
||||||
for(n = 0; n < neon_iters; n++)
|
for (n = 0; n < neon_iters; n++)
|
||||||
{
|
{
|
||||||
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][4 * n + 3], 1, 0);
|
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][4 * n + 3], 1, 0);
|
||||||
__VOLK_GNSSSDR_PREFETCH(&local_code_chip_index[4]);
|
__VOLK_GNSSSDR_PREFETCH(&local_code_chip_index[4]);
|
||||||
@ -568,7 +576,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_neon(lv_16sc_t** resul
|
|||||||
|
|
||||||
// fmod
|
// fmod
|
||||||
c = vmulq_f32(aux, reciprocal);
|
c = vmulq_f32(aux, reciprocal);
|
||||||
i = vcvtq_s32_f32(c);
|
i = vcvtq_s32_f32(c);
|
||||||
cTrunc = vcvtq_f32_s32(i);
|
cTrunc = vcvtq_f32_s32(i);
|
||||||
base = vmulq_f32(cTrunc, code_length_chips_reg_f);
|
base = vmulq_f32(cTrunc, code_length_chips_reg_f);
|
||||||
aux = vsubq_f32(aux, base);
|
aux = vsubq_f32(aux, base);
|
||||||
@ -580,13 +588,13 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_neon(lv_16sc_t** resul
|
|||||||
|
|
||||||
vst1q_s32((int32_t*)local_code_chip_index, local_code_chip_index_reg);
|
vst1q_s32((int32_t*)local_code_chip_index, local_code_chip_index_reg);
|
||||||
|
|
||||||
for(k = 0; k < 4; ++k)
|
for (k = 0; k < 4; ++k)
|
||||||
{
|
{
|
||||||
_result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
|
_result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
|
||||||
}
|
}
|
||||||
indexn = vaddq_f32(indexn, fours);
|
indexn = vaddq_f32(indexn, fours);
|
||||||
}
|
}
|
||||||
for(n = neon_iters * 4; n < num_points; n++)
|
for (n = neon_iters * 4; n < num_points; n++)
|
||||||
{
|
{
|
||||||
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][n], 1, 0);
|
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][n], 1, 0);
|
||||||
// resample code for current tap
|
// resample code for current tap
|
||||||
@ -604,4 +612,3 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_neon(lv_16sc_t** resul
|
|||||||
|
|
||||||
|
|
||||||
#endif /*INCLUDED_volk_gnsssdr_16ic_xn_resampler_16ic_xn_H*/
|
#endif /*INCLUDED_volk_gnsssdr_16ic_xn_resampler_16ic_xn_H*/
|
||||||
|
|
||||||
|
@ -95,69 +95,74 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_generic(lv_16sc_t
|
|||||||
#ifdef LV_HAVE_SSE2
|
#ifdef LV_HAVE_SSE2
|
||||||
#include <emmintrin.h>
|
#include <emmintrin.h>
|
||||||
|
|
||||||
static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_a_sse2(lv_16sc_t** result, const lv_16sc_t* local_code, float* rem_code_phase_chips ,float code_phase_step_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_output_samples)
|
static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_a_sse2(lv_16sc_t** result, const lv_16sc_t* local_code, float* rem_code_phase_chips, float code_phase_step_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_output_samples)
|
||||||
{
|
{
|
||||||
_MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);//_MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO
|
_MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); //_MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO
|
||||||
unsigned int number;
|
unsigned int number;
|
||||||
const unsigned int quarterPoints = num_output_samples / 4;
|
const unsigned int quarterPoints = num_output_samples / 4;
|
||||||
|
|
||||||
lv_16sc_t** _result = result;
|
lv_16sc_t** _result = result;
|
||||||
__VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4];
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
int local_code_chip_index[4];
|
||||||
float tmp_rem_code_phase_chips;
|
float tmp_rem_code_phase_chips;
|
||||||
__m128 _rem_code_phase,_code_phase_step_chips;
|
__m128 _rem_code_phase, _code_phase_step_chips;
|
||||||
__m128i _code_length_chips,_code_length_chips_minus1;
|
__m128i _code_length_chips, _code_length_chips_minus1;
|
||||||
__m128 _code_phase_out,_code_phase_out_with_offset;
|
__m128 _code_phase_out, _code_phase_out_with_offset;
|
||||||
|
|
||||||
_code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips); //load float to all four float values in m128 register
|
_code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips); //load float to all four float values in m128 register
|
||||||
__VOLK_ATTR_ALIGNED(16) int four_times_code_length_chips_minus1[4];
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
int four_times_code_length_chips_minus1[4];
|
||||||
four_times_code_length_chips_minus1[0] = code_length_chips - 1;
|
four_times_code_length_chips_minus1[0] = code_length_chips - 1;
|
||||||
four_times_code_length_chips_minus1[1] = code_length_chips - 1;
|
four_times_code_length_chips_minus1[1] = code_length_chips - 1;
|
||||||
four_times_code_length_chips_minus1[2] = code_length_chips - 1;
|
four_times_code_length_chips_minus1[2] = code_length_chips - 1;
|
||||||
four_times_code_length_chips_minus1[3] = code_length_chips - 1;
|
four_times_code_length_chips_minus1[3] = code_length_chips - 1;
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(16) int four_times_code_length_chips[4];
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
int four_times_code_length_chips[4];
|
||||||
four_times_code_length_chips[0] = code_length_chips;
|
four_times_code_length_chips[0] = code_length_chips;
|
||||||
four_times_code_length_chips[1] = code_length_chips;
|
four_times_code_length_chips[1] = code_length_chips;
|
||||||
four_times_code_length_chips[2] = code_length_chips;
|
four_times_code_length_chips[2] = code_length_chips;
|
||||||
four_times_code_length_chips[3] = code_length_chips;
|
four_times_code_length_chips[3] = code_length_chips;
|
||||||
|
|
||||||
_code_length_chips = _mm_load_si128((__m128i*)&four_times_code_length_chips); //load float to all four float values in m128 register
|
_code_length_chips = _mm_load_si128((__m128i*)&four_times_code_length_chips); //load float to all four float values in m128 register
|
||||||
_code_length_chips_minus1 = _mm_load_si128((__m128i*)&four_times_code_length_chips_minus1); //load float to all four float values in m128 register
|
_code_length_chips_minus1 = _mm_load_si128((__m128i*)&four_times_code_length_chips_minus1); //load float to all four float values in m128 register
|
||||||
|
|
||||||
__m128i negative_indexes, overflow_indexes,_code_phase_out_int, _code_phase_out_int_neg,_code_phase_out_int_over;
|
__m128i negative_indexes, overflow_indexes, _code_phase_out_int, _code_phase_out_int_neg, _code_phase_out_int_over;
|
||||||
|
|
||||||
__m128i zero = _mm_setzero_si128();
|
__m128i zero = _mm_setzero_si128();
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(16) float init_idx_float[4] = { 0.0f, 1.0f, 2.0f, 3.0f };
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
float init_idx_float[4] = {0.0f, 1.0f, 2.0f, 3.0f};
|
||||||
__m128 _4output_index = _mm_load_ps(init_idx_float);
|
__m128 _4output_index = _mm_load_ps(init_idx_float);
|
||||||
__VOLK_ATTR_ALIGNED(16) float init_4constant_float[4] = { 4.0f, 4.0f, 4.0f, 4.0f };
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
float init_4constant_float[4] = {4.0f, 4.0f, 4.0f, 4.0f};
|
||||||
__m128 _4constant_float = _mm_load_ps(init_4constant_float);
|
__m128 _4constant_float = _mm_load_ps(init_4constant_float);
|
||||||
|
|
||||||
int current_vector = 0;
|
int current_vector = 0;
|
||||||
int sample_idx = 0;
|
int sample_idx = 0;
|
||||||
for(number = 0; number < quarterPoints; number++)
|
for (number = 0; number < quarterPoints; number++)
|
||||||
{
|
{
|
||||||
//common to all outputs
|
//common to all outputs
|
||||||
_code_phase_out = _mm_mul_ps(_code_phase_step_chips, _4output_index); //compute the code phase point with the phase step
|
_code_phase_out = _mm_mul_ps(_code_phase_step_chips, _4output_index); //compute the code phase point with the phase step
|
||||||
|
|
||||||
//output vector dependant (different code phase offset)
|
//output vector dependant (different code phase offset)
|
||||||
for(current_vector = 0; current_vector < num_out_vectors; current_vector++)
|
for (current_vector = 0; current_vector < num_out_vectors; current_vector++)
|
||||||
{
|
{
|
||||||
tmp_rem_code_phase_chips = rem_code_phase_chips[current_vector] - 0.5f; // adjust offset to perform correct rounding (chip transition at 0)
|
tmp_rem_code_phase_chips = rem_code_phase_chips[current_vector] - 0.5f; // adjust offset to perform correct rounding (chip transition at 0)
|
||||||
_rem_code_phase = _mm_load1_ps(&tmp_rem_code_phase_chips); //load float to all four float values in m128 register
|
_rem_code_phase = _mm_load1_ps(&tmp_rem_code_phase_chips); //load float to all four float values in m128 register
|
||||||
|
|
||||||
_code_phase_out_with_offset = _mm_add_ps(_code_phase_out, _rem_code_phase); //add the phase offset
|
_code_phase_out_with_offset = _mm_add_ps(_code_phase_out, _rem_code_phase); //add the phase offset
|
||||||
_code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset); //convert to integer
|
_code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset); //convert to integer
|
||||||
|
|
||||||
negative_indexes = _mm_cmplt_epi32(_code_phase_out_int, zero); //test for negative values
|
negative_indexes = _mm_cmplt_epi32(_code_phase_out_int, zero); //test for negative values
|
||||||
_code_phase_out_int_neg = _mm_add_epi32(_code_phase_out_int, _code_length_chips); //the negative values branch
|
_code_phase_out_int_neg = _mm_add_epi32(_code_phase_out_int, _code_length_chips); //the negative values branch
|
||||||
_code_phase_out_int_neg = _mm_xor_si128(_code_phase_out_int, _mm_and_si128( negative_indexes, _mm_xor_si128( _code_phase_out_int_neg, _code_phase_out_int )));
|
_code_phase_out_int_neg = _mm_xor_si128(_code_phase_out_int, _mm_and_si128(negative_indexes, _mm_xor_si128(_code_phase_out_int_neg, _code_phase_out_int)));
|
||||||
|
|
||||||
overflow_indexes = _mm_cmpgt_epi32(_code_phase_out_int_neg, _code_length_chips_minus1); //test for overflow values
|
overflow_indexes = _mm_cmpgt_epi32(_code_phase_out_int_neg, _code_length_chips_minus1); //test for overflow values
|
||||||
_code_phase_out_int_over = _mm_sub_epi32(_code_phase_out_int_neg, _code_length_chips); //the negative values branch
|
_code_phase_out_int_over = _mm_sub_epi32(_code_phase_out_int_neg, _code_length_chips); //the negative values branch
|
||||||
_code_phase_out_int_over = _mm_xor_si128(_code_phase_out_int_neg, _mm_and_si128( overflow_indexes, _mm_xor_si128( _code_phase_out_int_over, _code_phase_out_int_neg )));
|
_code_phase_out_int_over = _mm_xor_si128(_code_phase_out_int_neg, _mm_and_si128(overflow_indexes, _mm_xor_si128(_code_phase_out_int_over, _code_phase_out_int_neg)));
|
||||||
|
|
||||||
_mm_store_si128((__m128i*)local_code_chip_index, _code_phase_out_int_over); // Store the results back
|
_mm_store_si128((__m128i*)local_code_chip_index, _code_phase_out_int_over); // Store the results back
|
||||||
|
|
||||||
//todo: optimize the local code lookup table with intrinsics, if possible
|
//todo: optimize the local code lookup table with intrinsics, if possible
|
||||||
_result[current_vector][sample_idx] = local_code[local_code_chip_index[0]];
|
_result[current_vector][sample_idx] = local_code[local_code_chip_index[0]];
|
||||||
@ -169,9 +174,9 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_a_sse2(lv_16sc_t*
|
|||||||
sample_idx += 4;
|
sample_idx += 4;
|
||||||
}
|
}
|
||||||
|
|
||||||
for(number = quarterPoints * 4; number < num_output_samples; number++)
|
for (number = quarterPoints * 4; number < num_output_samples; number++)
|
||||||
{
|
{
|
||||||
for(current_vector = 0; current_vector < num_out_vectors; current_vector++)
|
for (current_vector = 0; current_vector < num_out_vectors; current_vector++)
|
||||||
{
|
{
|
||||||
local_code_chip_index[0] = (int)(code_phase_step_chips * (float)(number) + rem_code_phase_chips[current_vector]);
|
local_code_chip_index[0] = (int)(code_phase_step_chips * (float)(number) + rem_code_phase_chips[current_vector]);
|
||||||
if (local_code_chip_index[0] < 0.0) local_code_chip_index[0] += code_length_chips - 1;
|
if (local_code_chip_index[0] < 0.0) local_code_chip_index[0] += code_length_chips - 1;
|
||||||
@ -186,69 +191,74 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_a_sse2(lv_16sc_t*
|
|||||||
#ifdef LV_HAVE_SSE2
|
#ifdef LV_HAVE_SSE2
|
||||||
#include <emmintrin.h>
|
#include <emmintrin.h>
|
||||||
|
|
||||||
static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_u_sse2(lv_16sc_t** result, const lv_16sc_t* local_code, float* rem_code_phase_chips ,float code_phase_step_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_output_samples)
|
static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_u_sse2(lv_16sc_t** result, const lv_16sc_t* local_code, float* rem_code_phase_chips, float code_phase_step_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_output_samples)
|
||||||
{
|
{
|
||||||
_MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);//_MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO
|
_MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); //_MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO
|
||||||
unsigned int number;
|
unsigned int number;
|
||||||
const unsigned int quarterPoints = num_output_samples / 4;
|
const unsigned int quarterPoints = num_output_samples / 4;
|
||||||
|
|
||||||
lv_16sc_t** _result = result;
|
lv_16sc_t** _result = result;
|
||||||
__VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4];
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
int local_code_chip_index[4];
|
||||||
float tmp_rem_code_phase_chips;
|
float tmp_rem_code_phase_chips;
|
||||||
__m128 _rem_code_phase,_code_phase_step_chips;
|
__m128 _rem_code_phase, _code_phase_step_chips;
|
||||||
__m128i _code_length_chips,_code_length_chips_minus1;
|
__m128i _code_length_chips, _code_length_chips_minus1;
|
||||||
__m128 _code_phase_out,_code_phase_out_with_offset;
|
__m128 _code_phase_out, _code_phase_out_with_offset;
|
||||||
|
|
||||||
_code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips); //load float to all four float values in m128 register
|
_code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips); //load float to all four float values in m128 register
|
||||||
__VOLK_ATTR_ALIGNED(16) int four_times_code_length_chips_minus1[4];
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
int four_times_code_length_chips_minus1[4];
|
||||||
four_times_code_length_chips_minus1[0] = code_length_chips - 1;
|
four_times_code_length_chips_minus1[0] = code_length_chips - 1;
|
||||||
four_times_code_length_chips_minus1[1] = code_length_chips - 1;
|
four_times_code_length_chips_minus1[1] = code_length_chips - 1;
|
||||||
four_times_code_length_chips_minus1[2] = code_length_chips - 1;
|
four_times_code_length_chips_minus1[2] = code_length_chips - 1;
|
||||||
four_times_code_length_chips_minus1[3] = code_length_chips - 1;
|
four_times_code_length_chips_minus1[3] = code_length_chips - 1;
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(16) int four_times_code_length_chips[4];
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
int four_times_code_length_chips[4];
|
||||||
four_times_code_length_chips[0] = code_length_chips;
|
four_times_code_length_chips[0] = code_length_chips;
|
||||||
four_times_code_length_chips[1] = code_length_chips;
|
four_times_code_length_chips[1] = code_length_chips;
|
||||||
four_times_code_length_chips[2] = code_length_chips;
|
four_times_code_length_chips[2] = code_length_chips;
|
||||||
four_times_code_length_chips[3] = code_length_chips;
|
four_times_code_length_chips[3] = code_length_chips;
|
||||||
|
|
||||||
_code_length_chips = _mm_loadu_si128((__m128i*)&four_times_code_length_chips); //load float to all four float values in m128 register
|
_code_length_chips = _mm_loadu_si128((__m128i*)&four_times_code_length_chips); //load float to all four float values in m128 register
|
||||||
_code_length_chips_minus1 = _mm_loadu_si128((__m128i*)&four_times_code_length_chips_minus1); //load float to all four float values in m128 register
|
_code_length_chips_minus1 = _mm_loadu_si128((__m128i*)&four_times_code_length_chips_minus1); //load float to all four float values in m128 register
|
||||||
|
|
||||||
__m128i negative_indexes, overflow_indexes,_code_phase_out_int, _code_phase_out_int_neg,_code_phase_out_int_over;
|
__m128i negative_indexes, overflow_indexes, _code_phase_out_int, _code_phase_out_int_neg, _code_phase_out_int_over;
|
||||||
|
|
||||||
__m128i zero = _mm_setzero_si128();
|
__m128i zero = _mm_setzero_si128();
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(16) float init_idx_float[4] = { 0.0f, 1.0f, 2.0f, 3.0f };
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
float init_idx_float[4] = {0.0f, 1.0f, 2.0f, 3.0f};
|
||||||
__m128 _4output_index = _mm_loadu_ps(init_idx_float);
|
__m128 _4output_index = _mm_loadu_ps(init_idx_float);
|
||||||
__VOLK_ATTR_ALIGNED(16) float init_4constant_float[4] = { 4.0f, 4.0f, 4.0f, 4.0f };
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
float init_4constant_float[4] = {4.0f, 4.0f, 4.0f, 4.0f};
|
||||||
__m128 _4constant_float = _mm_loadu_ps(init_4constant_float);
|
__m128 _4constant_float = _mm_loadu_ps(init_4constant_float);
|
||||||
|
|
||||||
int current_vector = 0;
|
int current_vector = 0;
|
||||||
int sample_idx = 0;
|
int sample_idx = 0;
|
||||||
for(number = 0; number < quarterPoints; number++)
|
for (number = 0; number < quarterPoints; number++)
|
||||||
{
|
{
|
||||||
//common to all outputs
|
//common to all outputs
|
||||||
_code_phase_out = _mm_mul_ps(_code_phase_step_chips, _4output_index); //compute the code phase point with the phase step
|
_code_phase_out = _mm_mul_ps(_code_phase_step_chips, _4output_index); //compute the code phase point with the phase step
|
||||||
|
|
||||||
//output vector dependant (different code phase offset)
|
//output vector dependant (different code phase offset)
|
||||||
for(current_vector = 0; current_vector < num_out_vectors; current_vector++)
|
for (current_vector = 0; current_vector < num_out_vectors; current_vector++)
|
||||||
{
|
{
|
||||||
tmp_rem_code_phase_chips = rem_code_phase_chips[current_vector] - 0.5f; // adjust offset to perform correct rounding (chip transition at 0)
|
tmp_rem_code_phase_chips = rem_code_phase_chips[current_vector] - 0.5f; // adjust offset to perform correct rounding (chip transition at 0)
|
||||||
_rem_code_phase = _mm_load1_ps(&tmp_rem_code_phase_chips); //load float to all four float values in m128 register
|
_rem_code_phase = _mm_load1_ps(&tmp_rem_code_phase_chips); //load float to all four float values in m128 register
|
||||||
|
|
||||||
_code_phase_out_with_offset = _mm_add_ps(_code_phase_out, _rem_code_phase); //add the phase offset
|
_code_phase_out_with_offset = _mm_add_ps(_code_phase_out, _rem_code_phase); //add the phase offset
|
||||||
_code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset); //convert to integer
|
_code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset); //convert to integer
|
||||||
|
|
||||||
negative_indexes = _mm_cmplt_epi32(_code_phase_out_int, zero); //test for negative values
|
negative_indexes = _mm_cmplt_epi32(_code_phase_out_int, zero); //test for negative values
|
||||||
_code_phase_out_int_neg = _mm_add_epi32(_code_phase_out_int, _code_length_chips); //the negative values branch
|
_code_phase_out_int_neg = _mm_add_epi32(_code_phase_out_int, _code_length_chips); //the negative values branch
|
||||||
_code_phase_out_int_neg = _mm_xor_si128(_code_phase_out_int, _mm_and_si128( negative_indexes, _mm_xor_si128( _code_phase_out_int_neg, _code_phase_out_int )));
|
_code_phase_out_int_neg = _mm_xor_si128(_code_phase_out_int, _mm_and_si128(negative_indexes, _mm_xor_si128(_code_phase_out_int_neg, _code_phase_out_int)));
|
||||||
|
|
||||||
overflow_indexes = _mm_cmpgt_epi32(_code_phase_out_int_neg, _code_length_chips_minus1); //test for overflow values
|
overflow_indexes = _mm_cmpgt_epi32(_code_phase_out_int_neg, _code_length_chips_minus1); //test for overflow values
|
||||||
_code_phase_out_int_over = _mm_sub_epi32(_code_phase_out_int_neg, _code_length_chips); //the negative values branch
|
_code_phase_out_int_over = _mm_sub_epi32(_code_phase_out_int_neg, _code_length_chips); //the negative values branch
|
||||||
_code_phase_out_int_over = _mm_xor_si128(_code_phase_out_int_neg, _mm_and_si128( overflow_indexes, _mm_xor_si128( _code_phase_out_int_over, _code_phase_out_int_neg )));
|
_code_phase_out_int_over = _mm_xor_si128(_code_phase_out_int_neg, _mm_and_si128(overflow_indexes, _mm_xor_si128(_code_phase_out_int_over, _code_phase_out_int_neg)));
|
||||||
|
|
||||||
_mm_storeu_si128((__m128i*)local_code_chip_index, _code_phase_out_int_over); // Store the results back
|
_mm_storeu_si128((__m128i*)local_code_chip_index, _code_phase_out_int_over); // Store the results back
|
||||||
|
|
||||||
//todo: optimize the local code lookup table with intrinsics, if possible
|
//todo: optimize the local code lookup table with intrinsics, if possible
|
||||||
_result[current_vector][sample_idx] = local_code[local_code_chip_index[0]];
|
_result[current_vector][sample_idx] = local_code[local_code_chip_index[0]];
|
||||||
@ -260,9 +270,9 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_u_sse2(lv_16sc_t*
|
|||||||
sample_idx += 4;
|
sample_idx += 4;
|
||||||
}
|
}
|
||||||
|
|
||||||
for(number = quarterPoints * 4; number < num_output_samples; number++)
|
for (number = quarterPoints * 4; number < num_output_samples; number++)
|
||||||
{
|
{
|
||||||
for(current_vector = 0; current_vector < num_out_vectors; current_vector++)
|
for (current_vector = 0; current_vector < num_out_vectors; current_vector++)
|
||||||
{
|
{
|
||||||
local_code_chip_index[0] = (int)(code_phase_step_chips * (float)(number) + rem_code_phase_chips[current_vector]);
|
local_code_chip_index[0] = (int)(code_phase_step_chips * (float)(number) + rem_code_phase_chips[current_vector]);
|
||||||
if (local_code_chip_index[0] < 0.0) local_code_chip_index[0] += code_length_chips - 1;
|
if (local_code_chip_index[0] < 0.0) local_code_chip_index[0] += code_length_chips - 1;
|
||||||
@ -278,74 +288,79 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_u_sse2(lv_16sc_t*
|
|||||||
#ifdef LV_HAVE_NEON
|
#ifdef LV_HAVE_NEON
|
||||||
#include <arm_neon.h>
|
#include <arm_neon.h>
|
||||||
|
|
||||||
static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_neon(lv_16sc_t** result, const lv_16sc_t* local_code, float* rem_code_phase_chips ,float code_phase_step_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_output_samples)
|
static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_neon(lv_16sc_t** result, const lv_16sc_t* local_code, float* rem_code_phase_chips, float code_phase_step_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_output_samples)
|
||||||
{
|
{
|
||||||
unsigned int number;
|
unsigned int number;
|
||||||
const unsigned int quarterPoints = num_output_samples / 4;
|
const unsigned int quarterPoints = num_output_samples / 4;
|
||||||
float32x4_t half = vdupq_n_f32(0.5f);
|
float32x4_t half = vdupq_n_f32(0.5f);
|
||||||
|
|
||||||
lv_16sc_t** _result = result;
|
lv_16sc_t** _result = result;
|
||||||
__VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4];
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
int local_code_chip_index[4];
|
||||||
float tmp_rem_code_phase_chips;
|
float tmp_rem_code_phase_chips;
|
||||||
float32x4_t _rem_code_phase, _code_phase_step_chips;
|
float32x4_t _rem_code_phase, _code_phase_step_chips;
|
||||||
int32x4_t _code_length_chips, _code_length_chips_minus1;
|
int32x4_t _code_length_chips, _code_length_chips_minus1;
|
||||||
float32x4_t _code_phase_out, _code_phase_out_with_offset;
|
float32x4_t _code_phase_out, _code_phase_out_with_offset;
|
||||||
float32x4_t sign, PlusHalf, Round;
|
float32x4_t sign, PlusHalf, Round;
|
||||||
|
|
||||||
_code_phase_step_chips = vld1q_dup_f32(&code_phase_step_chips); //load float to all four float values in float32x4_t register
|
_code_phase_step_chips = vld1q_dup_f32(&code_phase_step_chips); //load float to all four float values in float32x4_t register
|
||||||
__VOLK_ATTR_ALIGNED(16) int four_times_code_length_chips_minus1[4];
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
int four_times_code_length_chips_minus1[4];
|
||||||
four_times_code_length_chips_minus1[0] = code_length_chips - 1;
|
four_times_code_length_chips_minus1[0] = code_length_chips - 1;
|
||||||
four_times_code_length_chips_minus1[1] = code_length_chips - 1;
|
four_times_code_length_chips_minus1[1] = code_length_chips - 1;
|
||||||
four_times_code_length_chips_minus1[2] = code_length_chips - 1;
|
four_times_code_length_chips_minus1[2] = code_length_chips - 1;
|
||||||
four_times_code_length_chips_minus1[3] = code_length_chips - 1;
|
four_times_code_length_chips_minus1[3] = code_length_chips - 1;
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(16) int four_times_code_length_chips[4];
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
int four_times_code_length_chips[4];
|
||||||
four_times_code_length_chips[0] = code_length_chips;
|
four_times_code_length_chips[0] = code_length_chips;
|
||||||
four_times_code_length_chips[1] = code_length_chips;
|
four_times_code_length_chips[1] = code_length_chips;
|
||||||
four_times_code_length_chips[2] = code_length_chips;
|
four_times_code_length_chips[2] = code_length_chips;
|
||||||
four_times_code_length_chips[3] = code_length_chips;
|
four_times_code_length_chips[3] = code_length_chips;
|
||||||
|
|
||||||
_code_length_chips = vld1q_s32((int32_t*)&four_times_code_length_chips); //load the four int32 values of the array into an int32x4_t register
|
_code_length_chips = vld1q_s32((int32_t*)&four_times_code_length_chips); //load the four int32 values of the array into an int32x4_t register
|
||||||
_code_length_chips_minus1 = vld1q_s32((int32_t*)&four_times_code_length_chips_minus1); //load the four int32 values of the array into an int32x4_t register
|
_code_length_chips_minus1 = vld1q_s32((int32_t*)&four_times_code_length_chips_minus1); //load the four int32 values of the array into an int32x4_t register
|
||||||
|
|
||||||
int32x4_t _code_phase_out_int, _code_phase_out_int_neg, _code_phase_out_int_over;
|
int32x4_t _code_phase_out_int, _code_phase_out_int_neg, _code_phase_out_int_over;
|
||||||
uint32x4_t negative_indexes, overflow_indexes;
|
uint32x4_t negative_indexes, overflow_indexes;
|
||||||
int32x4_t zero = vmovq_n_s32(0);
|
int32x4_t zero = vmovq_n_s32(0);
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(16) float init_idx_float[4] = { 0.0f, 1.0f, 2.0f, 3.0f };
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
float init_idx_float[4] = {0.0f, 1.0f, 2.0f, 3.0f};
|
||||||
float32x4_t _4output_index = vld1q_f32(init_idx_float);
|
float32x4_t _4output_index = vld1q_f32(init_idx_float);
|
||||||
__VOLK_ATTR_ALIGNED(16) float init_4constant_float[4] = { 4.0f, 4.0f, 4.0f, 4.0f };
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
float init_4constant_float[4] = {4.0f, 4.0f, 4.0f, 4.0f};
|
||||||
float32x4_t _4constant_float = vld1q_f32(init_4constant_float);
|
float32x4_t _4constant_float = vld1q_f32(init_4constant_float);
|
||||||
|
|
||||||
int current_vector = 0;
|
int current_vector = 0;
|
||||||
int sample_idx = 0;
|
int sample_idx = 0;
|
||||||
for(number = 0; number < quarterPoints; number++)
|
for (number = 0; number < quarterPoints; number++)
|
||||||
{
|
{
|
||||||
//common to all outputs
|
//common to all outputs
|
||||||
_code_phase_out = vmulq_f32(_code_phase_step_chips, _4output_index); //compute the code phase point with the phase step
|
_code_phase_out = vmulq_f32(_code_phase_step_chips, _4output_index); //compute the code phase point with the phase step
|
||||||
|
|
||||||
//output vector dependent (different code phase offset)
|
//output vector dependent (different code phase offset)
|
||||||
for(current_vector = 0; current_vector < num_out_vectors; current_vector++)
|
for (current_vector = 0; current_vector < num_out_vectors; current_vector++)
|
||||||
{
|
{
|
||||||
tmp_rem_code_phase_chips = rem_code_phase_chips[current_vector] - 0.5f; // adjust offset to perform correct rounding (chip transition at 0)
|
tmp_rem_code_phase_chips = rem_code_phase_chips[current_vector] - 0.5f; // adjust offset to perform correct rounding (chip transition at 0)
|
||||||
_rem_code_phase = vld1q_dup_f32(&tmp_rem_code_phase_chips); //load float to all four float values in float32x4_t register
|
_rem_code_phase = vld1q_dup_f32(&tmp_rem_code_phase_chips); //load float to all four float values in float32x4_t register
|
||||||
|
|
||||||
_code_phase_out_with_offset = vaddq_f32(_code_phase_out, _rem_code_phase); //add the phase offset
|
_code_phase_out_with_offset = vaddq_f32(_code_phase_out, _rem_code_phase); //add the phase offset
|
||||||
//_code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset); //convert to integer
|
//_code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset); //convert to integer
|
||||||
sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(_code_phase_out_with_offset), 31)));
|
sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(_code_phase_out_with_offset), 31)));
|
||||||
PlusHalf = vaddq_f32(_code_phase_out_with_offset, half);
|
PlusHalf = vaddq_f32(_code_phase_out_with_offset, half);
|
||||||
Round = vsubq_f32(PlusHalf, sign);
|
Round = vsubq_f32(PlusHalf, sign);
|
||||||
_code_phase_out_int = vcvtq_s32_f32(Round);
|
_code_phase_out_int = vcvtq_s32_f32(Round);
|
||||||
|
|
||||||
negative_indexes = vcltq_s32(_code_phase_out_int, zero); //test for negative values
|
negative_indexes = vcltq_s32(_code_phase_out_int, zero); //test for negative values
|
||||||
_code_phase_out_int_neg = vaddq_s32(_code_phase_out_int, _code_length_chips); //the negative values branch
|
_code_phase_out_int_neg = vaddq_s32(_code_phase_out_int, _code_length_chips); //the negative values branch
|
||||||
_code_phase_out_int_neg = veorq_s32(_code_phase_out_int, vandq_s32( (int32x4_t)negative_indexes, veorq_s32( _code_phase_out_int_neg, _code_phase_out_int )));
|
_code_phase_out_int_neg = veorq_s32(_code_phase_out_int, vandq_s32((int32x4_t)negative_indexes, veorq_s32(_code_phase_out_int_neg, _code_phase_out_int)));
|
||||||
|
|
||||||
overflow_indexes = vcgtq_s32(_code_phase_out_int_neg, _code_length_chips_minus1); //test for overflow values
|
overflow_indexes = vcgtq_s32(_code_phase_out_int_neg, _code_length_chips_minus1); //test for overflow values
|
||||||
_code_phase_out_int_over = vsubq_s32(_code_phase_out_int_neg, _code_length_chips); //the overflow values branch
|
_code_phase_out_int_over = vsubq_s32(_code_phase_out_int_neg, _code_length_chips); //the overflow values branch
|
||||||
_code_phase_out_int_over = veorq_s32(_code_phase_out_int_neg, vandq_s32( (int32x4_t)overflow_indexes, veorq_s32( _code_phase_out_int_over, _code_phase_out_int_neg )));
|
_code_phase_out_int_over = veorq_s32(_code_phase_out_int_neg, vandq_s32((int32x4_t)overflow_indexes, veorq_s32(_code_phase_out_int_over, _code_phase_out_int_neg)));
|
||||||
|
|
||||||
vst1q_s32((int32_t*)local_code_chip_index, _code_phase_out_int_over); // Store the results back
|
vst1q_s32((int32_t*)local_code_chip_index, _code_phase_out_int_over); // Store the results back
|
||||||
|
|
||||||
//todo: optimize the local code lookup table with intrinsics, if possible
|
//todo: optimize the local code lookup table with intrinsics, if possible
|
||||||
_result[current_vector][sample_idx] = local_code[local_code_chip_index[0]];
|
_result[current_vector][sample_idx] = local_code[local_code_chip_index[0]];
|
||||||
@ -357,9 +372,9 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_neon(lv_16sc_t**
|
|||||||
sample_idx += 4;
|
sample_idx += 4;
|
||||||
}
|
}
|
||||||
|
|
||||||
for(number = quarterPoints * 4; number < num_output_samples; number++)
|
for (number = quarterPoints * 4; number < num_output_samples; number++)
|
||||||
{
|
{
|
||||||
for(current_vector = 0; current_vector < num_out_vectors; current_vector++)
|
for (current_vector = 0; current_vector < num_out_vectors; current_vector++)
|
||||||
{
|
{
|
||||||
local_code_chip_index[0] = (int)(code_phase_step_chips * (float)(number) + rem_code_phase_chips[current_vector]);
|
local_code_chip_index[0] = (int)(code_phase_step_chips * (float)(number) + rem_code_phase_chips[current_vector]);
|
||||||
if (local_code_chip_index[0] < 0.0) local_code_chip_index[0] += code_length_chips - 1;
|
if (local_code_chip_index[0] < 0.0) local_code_chip_index[0] += code_length_chips - 1;
|
||||||
|
@ -29,7 +29,6 @@
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/*!
|
/*!
|
||||||
* \page volk_gnsssdr_32f_index_max_32u.h
|
* \page volk_gnsssdr_32f_index_max_32u.h
|
||||||
*
|
*
|
||||||
@ -63,7 +62,7 @@
|
|||||||
|
|
||||||
static inline void volk_gnsssdr_32f_index_max_32u_a_avx(uint32_t* target, const float* src0, uint32_t num_points)
|
static inline void volk_gnsssdr_32f_index_max_32u_a_avx(uint32_t* target, const float* src0, uint32_t num_points)
|
||||||
{
|
{
|
||||||
if(num_points > 0)
|
if (num_points > 0)
|
||||||
{
|
{
|
||||||
uint32_t number = 0;
|
uint32_t number = 0;
|
||||||
const uint32_t quarterPoints = num_points / 8;
|
const uint32_t quarterPoints = num_points / 8;
|
||||||
@ -71,7 +70,7 @@ static inline void volk_gnsssdr_32f_index_max_32u_a_avx(uint32_t* target, const
|
|||||||
float* inputPtr = (float*)src0;
|
float* inputPtr = (float*)src0;
|
||||||
|
|
||||||
__m256 indexIncrementValues = _mm256_set1_ps(8);
|
__m256 indexIncrementValues = _mm256_set1_ps(8);
|
||||||
__m256 currentIndexes = _mm256_set_ps(-1,-2,-3,-4,-5,-6,-7,-8);
|
__m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
|
||||||
|
|
||||||
float max = src0[0];
|
float max = src0[0];
|
||||||
float index = 0;
|
float index = 0;
|
||||||
@ -80,25 +79,28 @@ static inline void volk_gnsssdr_32f_index_max_32u_a_avx(uint32_t* target, const
|
|||||||
__m256 compareResults;
|
__m256 compareResults;
|
||||||
__m256 currentValues;
|
__m256 currentValues;
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8];
|
__VOLK_ATTR_ALIGNED(32)
|
||||||
__VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8];
|
float maxValuesBuffer[8];
|
||||||
|
__VOLK_ATTR_ALIGNED(32)
|
||||||
|
float maxIndexesBuffer[8];
|
||||||
|
|
||||||
for(;number < quarterPoints; number++)
|
for (; number < quarterPoints; number++)
|
||||||
{
|
{
|
||||||
currentValues = _mm256_load_ps(inputPtr); inputPtr += 8;
|
currentValues = _mm256_load_ps(inputPtr);
|
||||||
|
inputPtr += 8;
|
||||||
currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
|
currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
|
||||||
compareResults = _mm256_cmp_ps(maxValues, currentValues, 0x1e);
|
compareResults = _mm256_cmp_ps(maxValues, currentValues, 0x1e);
|
||||||
maxValuesIndex = _mm256_blendv_ps(currentIndexes, maxValuesIndex, compareResults);
|
maxValuesIndex = _mm256_blendv_ps(currentIndexes, maxValuesIndex, compareResults);
|
||||||
maxValues = _mm256_blendv_ps(currentValues, maxValues, compareResults);
|
maxValues = _mm256_blendv_ps(currentValues, maxValues, compareResults);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Reduce the 8 per-lane maxima to a single maximum value and its index
|
// Reduce the 8 per-lane maxima to a single maximum value and its index
|
||||||
_mm256_store_ps(maxValuesBuffer, maxValues);
|
_mm256_store_ps(maxValuesBuffer, maxValues);
|
||||||
_mm256_store_ps(maxIndexesBuffer, maxValuesIndex);
|
_mm256_store_ps(maxIndexesBuffer, maxValuesIndex);
|
||||||
|
|
||||||
for(number = 0; number < 8; number++)
|
for (number = 0; number < 8; number++)
|
||||||
{
|
{
|
||||||
if(maxValuesBuffer[number] > max)
|
if (maxValuesBuffer[number] > max)
|
||||||
{
|
{
|
||||||
index = maxIndexesBuffer[number];
|
index = maxIndexesBuffer[number];
|
||||||
max = maxValuesBuffer[number];
|
max = maxValuesBuffer[number];
|
||||||
@ -106,9 +108,9 @@ static inline void volk_gnsssdr_32f_index_max_32u_a_avx(uint32_t* target, const
|
|||||||
}
|
}
|
||||||
|
|
||||||
number = quarterPoints * 8;
|
number = quarterPoints * 8;
|
||||||
for(;number < num_points; number++)
|
for (; number < num_points; number++)
|
||||||
{
|
{
|
||||||
if(src0[number] > max)
|
if (src0[number] > max)
|
||||||
{
|
{
|
||||||
index = number;
|
index = number;
|
||||||
max = src0[number];
|
max = src0[number];
|
||||||
@ -126,7 +128,7 @@ static inline void volk_gnsssdr_32f_index_max_32u_a_avx(uint32_t* target, const
|
|||||||
|
|
||||||
static inline void volk_gnsssdr_32f_index_max_32u_u_avx(uint32_t* target, const float* src0, uint32_t num_points)
|
static inline void volk_gnsssdr_32f_index_max_32u_u_avx(uint32_t* target, const float* src0, uint32_t num_points)
|
||||||
{
|
{
|
||||||
if(num_points > 0)
|
if (num_points > 0)
|
||||||
{
|
{
|
||||||
uint32_t number = 0;
|
uint32_t number = 0;
|
||||||
const uint32_t quarterPoints = num_points / 8;
|
const uint32_t quarterPoints = num_points / 8;
|
||||||
@ -134,7 +136,7 @@ static inline void volk_gnsssdr_32f_index_max_32u_u_avx(uint32_t* target, const
|
|||||||
float* inputPtr = (float*)src0;
|
float* inputPtr = (float*)src0;
|
||||||
|
|
||||||
__m256 indexIncrementValues = _mm256_set1_ps(8);
|
__m256 indexIncrementValues = _mm256_set1_ps(8);
|
||||||
__m256 currentIndexes = _mm256_set_ps(-1,-2,-3,-4,-5,-6,-7,-8);
|
__m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
|
||||||
|
|
||||||
float max = src0[0];
|
float max = src0[0];
|
||||||
float index = 0;
|
float index = 0;
|
||||||
@ -143,25 +145,28 @@ static inline void volk_gnsssdr_32f_index_max_32u_u_avx(uint32_t* target, const
|
|||||||
__m256 compareResults;
|
__m256 compareResults;
|
||||||
__m256 currentValues;
|
__m256 currentValues;
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8];
|
__VOLK_ATTR_ALIGNED(32)
|
||||||
__VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8];
|
float maxValuesBuffer[8];
|
||||||
|
__VOLK_ATTR_ALIGNED(32)
|
||||||
|
float maxIndexesBuffer[8];
|
||||||
|
|
||||||
for(;number < quarterPoints; number++)
|
for (; number < quarterPoints; number++)
|
||||||
{
|
{
|
||||||
currentValues = _mm256_loadu_ps(inputPtr); inputPtr += 8;
|
currentValues = _mm256_loadu_ps(inputPtr);
|
||||||
|
inputPtr += 8;
|
||||||
currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
|
currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
|
||||||
compareResults = _mm256_cmp_ps(maxValues, currentValues, 0x1e);
|
compareResults = _mm256_cmp_ps(maxValues, currentValues, 0x1e);
|
||||||
maxValuesIndex = _mm256_blendv_ps(currentIndexes, maxValuesIndex, compareResults);
|
maxValuesIndex = _mm256_blendv_ps(currentIndexes, maxValuesIndex, compareResults);
|
||||||
maxValues = _mm256_blendv_ps(currentValues, maxValues, compareResults);
|
maxValues = _mm256_blendv_ps(currentValues, maxValues, compareResults);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Reduce the 8 per-lane maxima to a single maximum value and its index
|
// Reduce the 8 per-lane maxima to a single maximum value and its index
|
||||||
_mm256_store_ps(maxValuesBuffer, maxValues);
|
_mm256_store_ps(maxValuesBuffer, maxValues);
|
||||||
_mm256_store_ps(maxIndexesBuffer, maxValuesIndex);
|
_mm256_store_ps(maxIndexesBuffer, maxValuesIndex);
|
||||||
|
|
||||||
for(number = 0; number < 8; number++)
|
for (number = 0; number < 8; number++)
|
||||||
{
|
{
|
||||||
if(maxValuesBuffer[number] > max)
|
if (maxValuesBuffer[number] > max)
|
||||||
{
|
{
|
||||||
index = maxIndexesBuffer[number];
|
index = maxIndexesBuffer[number];
|
||||||
max = maxValuesBuffer[number];
|
max = maxValuesBuffer[number];
|
||||||
@ -169,9 +174,9 @@ static inline void volk_gnsssdr_32f_index_max_32u_u_avx(uint32_t* target, const
|
|||||||
}
|
}
|
||||||
|
|
||||||
number = quarterPoints * 8;
|
number = quarterPoints * 8;
|
||||||
for(;number < num_points; number++)
|
for (; number < num_points; number++)
|
||||||
{
|
{
|
||||||
if(src0[number] > max)
|
if (src0[number] > max)
|
||||||
{
|
{
|
||||||
index = number;
|
index = number;
|
||||||
max = src0[number];
|
max = src0[number];
|
||||||
@ -185,11 +190,11 @@ static inline void volk_gnsssdr_32f_index_max_32u_u_avx(uint32_t* target, const
|
|||||||
|
|
||||||
|
|
||||||
#ifdef LV_HAVE_SSE4_1
|
#ifdef LV_HAVE_SSE4_1
|
||||||
#include<smmintrin.h>
|
#include <smmintrin.h>
|
||||||
|
|
||||||
static inline void volk_gnsssdr_32f_index_max_32u_a_sse4_1(uint32_t* target, const float* src0, uint32_t num_points)
|
static inline void volk_gnsssdr_32f_index_max_32u_a_sse4_1(uint32_t* target, const float* src0, uint32_t num_points)
|
||||||
{
|
{
|
||||||
if(num_points > 0)
|
if (num_points > 0)
|
||||||
{
|
{
|
||||||
uint32_t number = 0;
|
uint32_t number = 0;
|
||||||
const uint32_t quarterPoints = num_points / 4;
|
const uint32_t quarterPoints = num_points / 4;
|
||||||
@ -197,7 +202,7 @@ static inline void volk_gnsssdr_32f_index_max_32u_a_sse4_1(uint32_t* target, con
|
|||||||
float* inputPtr = (float*)src0;
|
float* inputPtr = (float*)src0;
|
||||||
|
|
||||||
__m128 indexIncrementValues = _mm_set1_ps(4);
|
__m128 indexIncrementValues = _mm_set1_ps(4);
|
||||||
__m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
|
__m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
|
||||||
|
|
||||||
float max = src0[0];
|
float max = src0[0];
|
||||||
float index = 0;
|
float index = 0;
|
||||||
@ -206,25 +211,28 @@ static inline void volk_gnsssdr_32f_index_max_32u_a_sse4_1(uint32_t* target, con
|
|||||||
__m128 compareResults;
|
__m128 compareResults;
|
||||||
__m128 currentValues;
|
__m128 currentValues;
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
__VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
|
float maxValuesBuffer[4];
|
||||||
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
float maxIndexesBuffer[4];
|
||||||
|
|
||||||
for(;number < quarterPoints; number++)
|
for (; number < quarterPoints; number++)
|
||||||
{
|
{
|
||||||
currentValues = _mm_load_ps(inputPtr); inputPtr += 4;
|
currentValues = _mm_load_ps(inputPtr);
|
||||||
|
inputPtr += 4;
|
||||||
currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
|
currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
|
||||||
compareResults = _mm_cmpgt_ps(maxValues, currentValues);
|
compareResults = _mm_cmpgt_ps(maxValues, currentValues);
|
||||||
maxValuesIndex = _mm_blendv_ps(currentIndexes, maxValuesIndex, compareResults);
|
maxValuesIndex = _mm_blendv_ps(currentIndexes, maxValuesIndex, compareResults);
|
||||||
maxValues = _mm_blendv_ps(currentValues, maxValues, compareResults);
|
maxValues = _mm_blendv_ps(currentValues, maxValues, compareResults);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Reduce the 4 per-lane maxima to a single maximum value and its index
|
// Reduce the 4 per-lane maxima to a single maximum value and its index
|
||||||
_mm_store_ps(maxValuesBuffer, maxValues);
|
_mm_store_ps(maxValuesBuffer, maxValues);
|
||||||
_mm_store_ps(maxIndexesBuffer, maxValuesIndex);
|
_mm_store_ps(maxIndexesBuffer, maxValuesIndex);
|
||||||
|
|
||||||
for(number = 0; number < 4; number++)
|
for (number = 0; number < 4; number++)
|
||||||
{
|
{
|
||||||
if(maxValuesBuffer[number] > max)
|
if (maxValuesBuffer[number] > max)
|
||||||
{
|
{
|
||||||
index = maxIndexesBuffer[number];
|
index = maxIndexesBuffer[number];
|
||||||
max = maxValuesBuffer[number];
|
max = maxValuesBuffer[number];
|
||||||
@ -232,9 +240,9 @@ static inline void volk_gnsssdr_32f_index_max_32u_a_sse4_1(uint32_t* target, con
|
|||||||
}
|
}
|
||||||
|
|
||||||
number = quarterPoints * 4;
|
number = quarterPoints * 4;
|
||||||
for(;number < num_points; number++)
|
for (; number < num_points; number++)
|
||||||
{
|
{
|
||||||
if(src0[number] > max)
|
if (src0[number] > max)
|
||||||
{
|
{
|
||||||
index = number;
|
index = number;
|
||||||
max = src0[number];
|
max = src0[number];
|
||||||
@ -248,11 +256,11 @@ static inline void volk_gnsssdr_32f_index_max_32u_a_sse4_1(uint32_t* target, con
|
|||||||
|
|
||||||
|
|
||||||
#ifdef LV_HAVE_SSE4_1
|
#ifdef LV_HAVE_SSE4_1
|
||||||
#include<smmintrin.h>
|
#include <smmintrin.h>
|
||||||
|
|
||||||
static inline void volk_gnsssdr_32f_index_max_32u_u_sse4_1(uint32_t* target, const float* src0, uint32_t num_points)
|
static inline void volk_gnsssdr_32f_index_max_32u_u_sse4_1(uint32_t* target, const float* src0, uint32_t num_points)
|
||||||
{
|
{
|
||||||
if(num_points > 0)
|
if (num_points > 0)
|
||||||
{
|
{
|
||||||
uint32_t number = 0;
|
uint32_t number = 0;
|
||||||
const uint32_t quarterPoints = num_points / 4;
|
const uint32_t quarterPoints = num_points / 4;
|
||||||
@ -260,7 +268,7 @@ static inline void volk_gnsssdr_32f_index_max_32u_u_sse4_1(uint32_t* target, con
|
|||||||
float* inputPtr = (float*)src0;
|
float* inputPtr = (float*)src0;
|
||||||
|
|
||||||
__m128 indexIncrementValues = _mm_set1_ps(4);
|
__m128 indexIncrementValues = _mm_set1_ps(4);
|
||||||
__m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
|
__m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
|
||||||
|
|
||||||
float max = src0[0];
|
float max = src0[0];
|
||||||
float index = 0;
|
float index = 0;
|
||||||
@ -269,25 +277,28 @@ static inline void volk_gnsssdr_32f_index_max_32u_u_sse4_1(uint32_t* target, con
|
|||||||
__m128 compareResults;
|
__m128 compareResults;
|
||||||
__m128 currentValues;
|
__m128 currentValues;
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
__VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
|
float maxValuesBuffer[4];
|
||||||
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
float maxIndexesBuffer[4];
|
||||||
|
|
||||||
for(;number < quarterPoints; number++)
|
for (; number < quarterPoints; number++)
|
||||||
{
|
{
|
||||||
currentValues = _mm_loadu_ps(inputPtr); inputPtr += 4;
|
currentValues = _mm_loadu_ps(inputPtr);
|
||||||
|
inputPtr += 4;
|
||||||
currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
|
currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
|
||||||
compareResults = _mm_cmpgt_ps(maxValues, currentValues);
|
compareResults = _mm_cmpgt_ps(maxValues, currentValues);
|
||||||
maxValuesIndex = _mm_blendv_ps(currentIndexes, maxValuesIndex, compareResults);
|
maxValuesIndex = _mm_blendv_ps(currentIndexes, maxValuesIndex, compareResults);
|
||||||
maxValues = _mm_blendv_ps(currentValues, maxValues, compareResults);
|
maxValues = _mm_blendv_ps(currentValues, maxValues, compareResults);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Reduce the 4 per-lane maxima to a single maximum value and its index
|
// Reduce the 4 per-lane maxima to a single maximum value and its index
|
||||||
_mm_store_ps(maxValuesBuffer, maxValues);
|
_mm_store_ps(maxValuesBuffer, maxValues);
|
||||||
_mm_store_ps(maxIndexesBuffer, maxValuesIndex);
|
_mm_store_ps(maxIndexesBuffer, maxValuesIndex);
|
||||||
|
|
||||||
for(number = 0; number < 4; number++)
|
for (number = 0; number < 4; number++)
|
||||||
{
|
{
|
||||||
if(maxValuesBuffer[number] > max)
|
if (maxValuesBuffer[number] > max)
|
||||||
{
|
{
|
||||||
index = maxIndexesBuffer[number];
|
index = maxIndexesBuffer[number];
|
||||||
max = maxValuesBuffer[number];
|
max = maxValuesBuffer[number];
|
||||||
@ -295,9 +306,9 @@ static inline void volk_gnsssdr_32f_index_max_32u_u_sse4_1(uint32_t* target, con
|
|||||||
}
|
}
|
||||||
|
|
||||||
number = quarterPoints * 4;
|
number = quarterPoints * 4;
|
||||||
for(;number < num_points; number++)
|
for (; number < num_points; number++)
|
||||||
{
|
{
|
||||||
if(src0[number] > max)
|
if (src0[number] > max)
|
||||||
{
|
{
|
||||||
index = number;
|
index = number;
|
||||||
max = src0[number];
|
max = src0[number];
|
||||||
@ -312,11 +323,11 @@ static inline void volk_gnsssdr_32f_index_max_32u_u_sse4_1(uint32_t* target, con
|
|||||||
|
|
||||||
#ifdef LV_HAVE_SSE
|
#ifdef LV_HAVE_SSE
|
||||||
|
|
||||||
#include<xmmintrin.h>
|
#include <xmmintrin.h>
|
||||||
|
|
||||||
static inline void volk_gnsssdr_32f_index_max_32u_a_sse(uint32_t* target, const float* src0, uint32_t num_points)
|
static inline void volk_gnsssdr_32f_index_max_32u_a_sse(uint32_t* target, const float* src0, uint32_t num_points)
|
||||||
{
|
{
|
||||||
if(num_points > 0)
|
if (num_points > 0)
|
||||||
{
|
{
|
||||||
uint32_t number = 0;
|
uint32_t number = 0;
|
||||||
const uint32_t quarterPoints = num_points / 4;
|
const uint32_t quarterPoints = num_points / 4;
|
||||||
@ -324,7 +335,7 @@ static inline void volk_gnsssdr_32f_index_max_32u_a_sse(uint32_t* target, const
|
|||||||
float* inputPtr = (float*)src0;
|
float* inputPtr = (float*)src0;
|
||||||
|
|
||||||
__m128 indexIncrementValues = _mm_set1_ps(4);
|
__m128 indexIncrementValues = _mm_set1_ps(4);
|
||||||
__m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
|
__m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
|
||||||
|
|
||||||
float max = src0[0];
|
float max = src0[0];
|
||||||
float index = 0;
|
float index = 0;
|
||||||
@ -333,25 +344,28 @@ static inline void volk_gnsssdr_32f_index_max_32u_a_sse(uint32_t* target, const
|
|||||||
__m128 compareResults;
|
__m128 compareResults;
|
||||||
__m128 currentValues;
|
__m128 currentValues;
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
__VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
|
float maxValuesBuffer[4];
|
||||||
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
float maxIndexesBuffer[4];
|
||||||
|
|
||||||
for(;number < quarterPoints; number++)
|
for (; number < quarterPoints; number++)
|
||||||
{
|
{
|
||||||
currentValues = _mm_load_ps(inputPtr); inputPtr += 4;
|
currentValues = _mm_load_ps(inputPtr);
|
||||||
|
inputPtr += 4;
|
||||||
currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
|
currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
|
||||||
compareResults = _mm_cmpgt_ps(maxValues, currentValues);
|
compareResults = _mm_cmpgt_ps(maxValues, currentValues);
|
||||||
maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, maxValuesIndex) , _mm_andnot_ps(compareResults, currentIndexes));
|
maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, maxValuesIndex), _mm_andnot_ps(compareResults, currentIndexes));
|
||||||
maxValues = _mm_or_ps(_mm_and_ps(compareResults, maxValues) , _mm_andnot_ps(compareResults, currentValues));
|
maxValues = _mm_or_ps(_mm_and_ps(compareResults, maxValues), _mm_andnot_ps(compareResults, currentValues));
|
||||||
}
|
}
|
||||||
|
|
||||||
// Reduce the 4 per-lane maxima to a single maximum value and its index
|
// Reduce the 4 per-lane maxima to a single maximum value and its index
|
||||||
_mm_store_ps(maxValuesBuffer, maxValues);
|
_mm_store_ps(maxValuesBuffer, maxValues);
|
||||||
_mm_store_ps(maxIndexesBuffer, maxValuesIndex);
|
_mm_store_ps(maxIndexesBuffer, maxValuesIndex);
|
||||||
|
|
||||||
for(number = 0; number < 4; number++)
|
for (number = 0; number < 4; number++)
|
||||||
{
|
{
|
||||||
if(maxValuesBuffer[number] > max)
|
if (maxValuesBuffer[number] > max)
|
||||||
{
|
{
|
||||||
index = maxIndexesBuffer[number];
|
index = maxIndexesBuffer[number];
|
||||||
max = maxValuesBuffer[number];
|
max = maxValuesBuffer[number];
|
||||||
@ -359,9 +373,9 @@ static inline void volk_gnsssdr_32f_index_max_32u_a_sse(uint32_t* target, const
|
|||||||
}
|
}
|
||||||
|
|
||||||
number = quarterPoints * 4;
|
number = quarterPoints * 4;
|
||||||
for(;number < num_points; number++)
|
for (; number < num_points; number++)
|
||||||
{
|
{
|
||||||
if(src0[number] > max)
|
if (src0[number] > max)
|
||||||
{
|
{
|
||||||
index = number;
|
index = number;
|
||||||
max = src0[number];
|
max = src0[number];
|
||||||
@ -376,11 +390,11 @@ static inline void volk_gnsssdr_32f_index_max_32u_a_sse(uint32_t* target, const
|
|||||||
|
|
||||||
#ifdef LV_HAVE_SSE
|
#ifdef LV_HAVE_SSE
|
||||||
|
|
||||||
#include<xmmintrin.h>
|
#include <xmmintrin.h>
|
||||||
|
|
||||||
static inline void volk_gnsssdr_32f_index_max_32u_u_sse(uint32_t* target, const float* src0, uint32_t num_points)
|
static inline void volk_gnsssdr_32f_index_max_32u_u_sse(uint32_t* target, const float* src0, uint32_t num_points)
|
||||||
{
|
{
|
||||||
if(num_points > 0)
|
if (num_points > 0)
|
||||||
{
|
{
|
||||||
uint32_t number = 0;
|
uint32_t number = 0;
|
||||||
const uint32_t quarterPoints = num_points / 4;
|
const uint32_t quarterPoints = num_points / 4;
|
||||||
@ -388,7 +402,7 @@ static inline void volk_gnsssdr_32f_index_max_32u_u_sse(uint32_t* target, const
|
|||||||
float* inputPtr = (float*)src0;
|
float* inputPtr = (float*)src0;
|
||||||
|
|
||||||
__m128 indexIncrementValues = _mm_set1_ps(4);
|
__m128 indexIncrementValues = _mm_set1_ps(4);
|
||||||
__m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
|
__m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
|
||||||
|
|
||||||
float max = src0[0];
|
float max = src0[0];
|
||||||
float index = 0;
|
float index = 0;
|
||||||
@ -397,25 +411,28 @@ static inline void volk_gnsssdr_32f_index_max_32u_u_sse(uint32_t* target, const
|
|||||||
__m128 compareResults;
|
__m128 compareResults;
|
||||||
__m128 currentValues;
|
__m128 currentValues;
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
__VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
|
float maxValuesBuffer[4];
|
||||||
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
float maxIndexesBuffer[4];
|
||||||
|
|
||||||
for(;number < quarterPoints; number++)
|
for (; number < quarterPoints; number++)
|
||||||
{
|
{
|
||||||
currentValues = _mm_loadu_ps(inputPtr); inputPtr += 4;
|
currentValues = _mm_loadu_ps(inputPtr);
|
||||||
|
inputPtr += 4;
|
||||||
currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
|
currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
|
||||||
compareResults = _mm_cmpgt_ps(maxValues, currentValues);
|
compareResults = _mm_cmpgt_ps(maxValues, currentValues);
|
||||||
maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, maxValuesIndex) , _mm_andnot_ps(compareResults, currentIndexes));
|
maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, maxValuesIndex), _mm_andnot_ps(compareResults, currentIndexes));
|
||||||
maxValues = _mm_or_ps(_mm_and_ps(compareResults, maxValues) , _mm_andnot_ps(compareResults, currentValues));
|
maxValues = _mm_or_ps(_mm_and_ps(compareResults, maxValues), _mm_andnot_ps(compareResults, currentValues));
|
||||||
}
|
}
|
||||||
|
|
||||||
// Reduce the 4 per-lane maxima to a single maximum value and its index
|
// Reduce the 4 per-lane maxima to a single maximum value and its index
|
||||||
_mm_store_ps(maxValuesBuffer, maxValues);
|
_mm_store_ps(maxValuesBuffer, maxValues);
|
||||||
_mm_store_ps(maxIndexesBuffer, maxValuesIndex);
|
_mm_store_ps(maxIndexesBuffer, maxValuesIndex);
|
||||||
|
|
||||||
for(number = 0; number < 4; number++)
|
for (number = 0; number < 4; number++)
|
||||||
{
|
{
|
||||||
if(maxValuesBuffer[number] > max)
|
if (maxValuesBuffer[number] > max)
|
||||||
{
|
{
|
||||||
index = maxIndexesBuffer[number];
|
index = maxIndexesBuffer[number];
|
||||||
max = maxValuesBuffer[number];
|
max = maxValuesBuffer[number];
|
||||||
@ -423,9 +440,9 @@ static inline void volk_gnsssdr_32f_index_max_32u_u_sse(uint32_t* target, const
|
|||||||
}
|
}
|
||||||
|
|
||||||
number = quarterPoints * 4;
|
number = quarterPoints * 4;
|
||||||
for(;number < num_points; number++)
|
for (; number < num_points; number++)
|
||||||
{
|
{
|
||||||
if(src0[number] > max)
|
if (src0[number] > max)
|
||||||
{
|
{
|
||||||
index = number;
|
index = number;
|
||||||
max = src0[number];
|
max = src0[number];
|
||||||
@ -442,16 +459,16 @@ static inline void volk_gnsssdr_32f_index_max_32u_u_sse(uint32_t* target, const
|
|||||||
|
|
||||||
static inline void volk_gnsssdr_32f_index_max_32u_generic(uint32_t* target, const float* src0, uint32_t num_points)
|
static inline void volk_gnsssdr_32f_index_max_32u_generic(uint32_t* target, const float* src0, uint32_t num_points)
|
||||||
{
|
{
|
||||||
if(num_points > 0)
|
if (num_points > 0)
|
||||||
{
|
{
|
||||||
float max = src0[0];
|
float max = src0[0];
|
||||||
uint32_t index = 0;
|
uint32_t index = 0;
|
||||||
|
|
||||||
uint32_t i = 1;
|
uint32_t i = 1;
|
||||||
|
|
||||||
for(; i < num_points; ++i)
|
for (; i < num_points; ++i)
|
||||||
{
|
{
|
||||||
if(src0[i] > max)
|
if (src0[i] > max)
|
||||||
{
|
{
|
||||||
index = i;
|
index = i;
|
||||||
max = src0[i];
|
max = src0[i];
|
||||||
@ -469,14 +486,15 @@ static inline void volk_gnsssdr_32f_index_max_32u_generic(uint32_t* target, cons
|
|||||||
|
|
||||||
static inline void volk_gnsssdr_32f_index_max_32u_neon(uint32_t* target, const float* src0, uint32_t num_points)
|
static inline void volk_gnsssdr_32f_index_max_32u_neon(uint32_t* target, const float* src0, uint32_t num_points)
|
||||||
{
|
{
|
||||||
if(num_points > 0)
|
if (num_points > 0)
|
||||||
{
|
{
|
||||||
uint32_t number = 0;
|
uint32_t number = 0;
|
||||||
const uint32_t quarterPoints = num_points / 4;
|
const uint32_t quarterPoints = num_points / 4;
|
||||||
|
|
||||||
float* inputPtr = (float*)src0;
|
float* inputPtr = (float*)src0;
|
||||||
float32x4_t indexIncrementValues = vdupq_n_f32(4);
|
float32x4_t indexIncrementValues = vdupq_n_f32(4);
|
||||||
__VOLK_ATTR_ALIGNED(16) float currentIndexes_float[4] = { -4.0f, -3.0f, -2.0f, -1.0f };
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
float currentIndexes_float[4] = {-4.0f, -3.0f, -2.0f, -1.0f};
|
||||||
float32x4_t currentIndexes = vld1q_f32(currentIndexes_float);
|
float32x4_t currentIndexes = vld1q_f32(currentIndexes_float);
|
||||||
|
|
||||||
float max = src0[0];
|
float max = src0[0];
|
||||||
@ -487,25 +505,28 @@ static inline void volk_gnsssdr_32f_index_max_32u_neon(uint32_t* target, const f
|
|||||||
uint32x4_t currentIndexes_u;
|
uint32x4_t currentIndexes_u;
|
||||||
float32x4_t currentValues;
|
float32x4_t currentValues;
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
__VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
|
float maxValuesBuffer[4];
|
||||||
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
float maxIndexesBuffer[4];
|
||||||
|
|
||||||
for(;number < quarterPoints; number++)
|
for (; number < quarterPoints; number++)
|
||||||
{
|
{
|
||||||
currentValues = vld1q_f32(inputPtr); inputPtr += 4;
|
currentValues = vld1q_f32(inputPtr);
|
||||||
currentIndexes = vaddq_f32(currentIndexes, indexIncrementValues);
|
inputPtr += 4;
|
||||||
|
currentIndexes = vaddq_f32(currentIndexes, indexIncrementValues);
|
||||||
currentIndexes_u = vcvtq_u32_f32(currentIndexes);
|
currentIndexes_u = vcvtq_u32_f32(currentIndexes);
|
||||||
compareResults = vcgtq_f32( maxValues, currentValues);
|
compareResults = vcgtq_f32(maxValues, currentValues);
|
||||||
maxValuesIndex = vorrq_u32( vandq_u32( compareResults, maxValuesIndex ), vbicq_u32(currentIndexes_u, compareResults) );
|
maxValuesIndex = vorrq_u32(vandq_u32(compareResults, maxValuesIndex), vbicq_u32(currentIndexes_u, compareResults));
|
||||||
maxValues = vmaxq_f32(currentValues, maxValues);
|
maxValues = vmaxq_f32(currentValues, maxValues);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Calculate the largest value from the remaining 4 points
|
// Calculate the largest value from the remaining 4 points
|
||||||
vst1q_f32(maxValuesBuffer, maxValues);
|
vst1q_f32(maxValuesBuffer, maxValues);
|
||||||
vst1q_f32(maxIndexesBuffer, vcvtq_f32_u32(maxValuesIndex));
|
vst1q_f32(maxIndexesBuffer, vcvtq_f32_u32(maxValuesIndex));
|
||||||
for(number = 0; number < 4; number++)
|
for (number = 0; number < 4; number++)
|
||||||
{
|
{
|
||||||
if(maxValuesBuffer[number] > max)
|
if (maxValuesBuffer[number] > max)
|
||||||
{
|
{
|
||||||
index = maxIndexesBuffer[number];
|
index = maxIndexesBuffer[number];
|
||||||
max = maxValuesBuffer[number];
|
max = maxValuesBuffer[number];
|
||||||
@ -513,9 +534,9 @@ static inline void volk_gnsssdr_32f_index_max_32u_neon(uint32_t* target, const f
|
|||||||
}
|
}
|
||||||
|
|
||||||
number = quarterPoints * 4;
|
number = quarterPoints * 4;
|
||||||
for(;number < num_points; number++)
|
for (; number < num_points; number++)
|
||||||
{
|
{
|
||||||
if(src0[number] > max)
|
if (src0[number] > max)
|
||||||
{
|
{
|
||||||
index = number;
|
index = number;
|
||||||
max = src0[number];
|
max = src0[number];
|
||||||
@ -528,4 +549,3 @@ static inline void volk_gnsssdr_32f_index_max_32u_neon(uint32_t* target, const f
|
|||||||
#endif /*LV_HAVE_NEON*/
|
#endif /*LV_HAVE_NEON*/
|
||||||
|
|
||||||
#endif /*INCLUDED_volk_gnsssdr_32f_index_max_32u_H*/
|
#endif /*INCLUDED_volk_gnsssdr_32f_index_max_32u_H*/
|
||||||
|
|
||||||
|
@ -42,31 +42,30 @@
|
|||||||
#include <string.h>
|
#include <string.h>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#ifdef LV_HAVE_GENERIC
|
#ifdef LV_HAVE_GENERIC
|
||||||
static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_generic(float* result, const float* local_code, unsigned int num_points)
|
static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_generic(float* result, const float* local_code, unsigned int num_points)
|
||||||
{
|
{
|
||||||
int code_length_chips = 2046;
|
int code_length_chips = 2046;
|
||||||
float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points );
|
float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
|
||||||
int num_out_vectors = 3;
|
int num_out_vectors = 3;
|
||||||
float rem_code_phase_chips = -0.234;
|
float rem_code_phase_chips = -0.234;
|
||||||
unsigned int n;
|
unsigned int n;
|
||||||
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
|
float shifts_chips[3] = {-0.1, 0.0, 0.1};
|
||||||
|
|
||||||
float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment());
|
float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment());
|
||||||
for(n = 0; n < num_out_vectors; n++)
|
for (n = 0; n < num_out_vectors; n++)
|
||||||
{
|
{
|
||||||
result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment());
|
result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment());
|
||||||
}
|
}
|
||||||
|
|
||||||
volk_gnsssdr_32f_xn_resampler_32f_xn_generic(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
|
volk_gnsssdr_32f_xn_resampler_32f_xn_generic(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
|
||||||
|
|
||||||
memcpy((float*)result, (float*)result_aux[0], sizeof(float) * num_points);
|
memcpy((float*)result, (float*)result_aux[0], sizeof(float) * num_points);
|
||||||
|
|
||||||
for(n = 0; n < num_out_vectors; n++)
|
for (n = 0; n < num_out_vectors; n++)
|
||||||
{
|
{
|
||||||
volk_gnsssdr_free(result_aux[n]);
|
volk_gnsssdr_free(result_aux[n]);
|
||||||
}
|
}
|
||||||
volk_gnsssdr_free(result_aux);
|
volk_gnsssdr_free(result_aux);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -77,26 +76,26 @@ static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_generic(float* result,
|
|||||||
static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_a_sse3(float* result, const float* local_code, unsigned int num_points)
|
static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_a_sse3(float* result, const float* local_code, unsigned int num_points)
|
||||||
{
|
{
|
||||||
int code_length_chips = 2046;
|
int code_length_chips = 2046;
|
||||||
float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points );
|
float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
|
||||||
int num_out_vectors = 3;
|
int num_out_vectors = 3;
|
||||||
float rem_code_phase_chips = -0.234;
|
float rem_code_phase_chips = -0.234;
|
||||||
unsigned int n;
|
unsigned int n;
|
||||||
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
|
float shifts_chips[3] = {-0.1, 0.0, 0.1};
|
||||||
|
|
||||||
float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment());
|
float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment());
|
||||||
for(n = 0; n < num_out_vectors; n++)
|
for (n = 0; n < num_out_vectors; n++)
|
||||||
{
|
{
|
||||||
result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment());
|
result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment());
|
||||||
}
|
}
|
||||||
|
|
||||||
volk_gnsssdr_32f_xn_resampler_32f_xn_a_sse3(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
|
volk_gnsssdr_32f_xn_resampler_32f_xn_a_sse3(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
|
||||||
|
|
||||||
memcpy((float*)result, (float*)result_aux[0], sizeof(float) * num_points);
|
memcpy((float*)result, (float*)result_aux[0], sizeof(float) * num_points);
|
||||||
|
|
||||||
for(n = 0; n < num_out_vectors; n++)
|
for (n = 0; n < num_out_vectors; n++)
|
||||||
{
|
{
|
||||||
volk_gnsssdr_free(result_aux[n]);
|
volk_gnsssdr_free(result_aux[n]);
|
||||||
}
|
}
|
||||||
volk_gnsssdr_free(result_aux);
|
volk_gnsssdr_free(result_aux);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -106,26 +105,26 @@ static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_a_sse3(float* result,
|
|||||||
static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_u_sse3(float* result, const float* local_code, unsigned int num_points)
|
static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_u_sse3(float* result, const float* local_code, unsigned int num_points)
|
||||||
{
|
{
|
||||||
int code_length_chips = 2046;
|
int code_length_chips = 2046;
|
||||||
float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points );
|
float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
|
||||||
int num_out_vectors = 3;
|
int num_out_vectors = 3;
|
||||||
float rem_code_phase_chips = -0.234;
|
float rem_code_phase_chips = -0.234;
|
||||||
unsigned int n;
|
unsigned int n;
|
||||||
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
|
float shifts_chips[3] = {-0.1, 0.0, 0.1};
|
||||||
|
|
||||||
float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment());
|
float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment());
|
||||||
for(n = 0; n < num_out_vectors; n++)
|
for (n = 0; n < num_out_vectors; n++)
|
||||||
{
|
{
|
||||||
result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment());
|
result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment());
|
||||||
}
|
}
|
||||||
|
|
||||||
volk_gnsssdr_32f_xn_resampler_32f_xn_u_sse3(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
|
volk_gnsssdr_32f_xn_resampler_32f_xn_u_sse3(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
|
||||||
|
|
||||||
memcpy((float*)result, (float*)result_aux[0], sizeof(float) * num_points);
|
memcpy((float*)result, (float*)result_aux[0], sizeof(float) * num_points);
|
||||||
|
|
||||||
for(n = 0; n < num_out_vectors; n++)
|
for (n = 0; n < num_out_vectors; n++)
|
||||||
{
|
{
|
||||||
volk_gnsssdr_free(result_aux[n]);
|
volk_gnsssdr_free(result_aux[n]);
|
||||||
}
|
}
|
||||||
volk_gnsssdr_free(result_aux);
|
volk_gnsssdr_free(result_aux);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -136,26 +135,26 @@ static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_u_sse3(float* result,
|
|||||||
static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_u_sse4_1(float* result, const float* local_code, unsigned int num_points)
|
static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_u_sse4_1(float* result, const float* local_code, unsigned int num_points)
|
||||||
{
|
{
|
||||||
int code_length_chips = 2046;
|
int code_length_chips = 2046;
|
||||||
float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points );
|
float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
|
||||||
int num_out_vectors = 3;
|
int num_out_vectors = 3;
|
||||||
float rem_code_phase_chips = -0.234;
|
float rem_code_phase_chips = -0.234;
|
||||||
unsigned int n;
|
unsigned int n;
|
||||||
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
|
float shifts_chips[3] = {-0.1, 0.0, 0.1};
|
||||||
|
|
||||||
float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment());
|
float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment());
|
||||||
for(n = 0; n < num_out_vectors; n++)
|
for (n = 0; n < num_out_vectors; n++)
|
||||||
{
|
{
|
||||||
result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment());
|
result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment());
|
||||||
}
|
}
|
||||||
|
|
||||||
volk_gnsssdr_32f_xn_resampler_32f_xn_u_sse4_1(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
|
volk_gnsssdr_32f_xn_resampler_32f_xn_u_sse4_1(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
|
||||||
|
|
||||||
memcpy((float*)result, (float*)result_aux[0], sizeof(float) * num_points);
|
memcpy((float*)result, (float*)result_aux[0], sizeof(float) * num_points);
|
||||||
|
|
||||||
for(n = 0; n < num_out_vectors; n++)
|
for (n = 0; n < num_out_vectors; n++)
|
||||||
{
|
{
|
||||||
volk_gnsssdr_free(result_aux[n]);
|
volk_gnsssdr_free(result_aux[n]);
|
||||||
}
|
}
|
||||||
volk_gnsssdr_free(result_aux);
|
volk_gnsssdr_free(result_aux);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -165,26 +164,26 @@ static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_u_sse4_1(float* result
|
|||||||
static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_a_sse4_1(float* result, const float* local_code, unsigned int num_points)
|
static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_a_sse4_1(float* result, const float* local_code, unsigned int num_points)
|
||||||
{
|
{
|
||||||
int code_length_chips = 2046;
|
int code_length_chips = 2046;
|
||||||
float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points );
|
float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
|
||||||
int num_out_vectors = 3;
|
int num_out_vectors = 3;
|
||||||
float rem_code_phase_chips = -0.234;
|
float rem_code_phase_chips = -0.234;
|
||||||
unsigned int n;
|
unsigned int n;
|
||||||
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
|
float shifts_chips[3] = {-0.1, 0.0, 0.1};
|
||||||
|
|
||||||
float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment());
|
float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment());
|
||||||
for(n = 0; n < num_out_vectors; n++)
|
for (n = 0; n < num_out_vectors; n++)
|
||||||
{
|
{
|
||||||
result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment());
|
result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment());
|
||||||
}
|
}
|
||||||
|
|
||||||
volk_gnsssdr_32f_xn_resampler_32f_xn_a_sse4_1(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
|
volk_gnsssdr_32f_xn_resampler_32f_xn_a_sse4_1(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
|
||||||
|
|
||||||
memcpy((float*)result, (float*)result_aux[0], sizeof(float) * num_points);
|
memcpy((float*)result, (float*)result_aux[0], sizeof(float) * num_points);
|
||||||
|
|
||||||
for(n = 0; n < num_out_vectors; n++)
|
for (n = 0; n < num_out_vectors; n++)
|
||||||
{
|
{
|
||||||
volk_gnsssdr_free(result_aux[n]);
|
volk_gnsssdr_free(result_aux[n]);
|
||||||
}
|
}
|
||||||
volk_gnsssdr_free(result_aux);
|
volk_gnsssdr_free(result_aux);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -194,26 +193,26 @@ static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_a_sse4_1(float* result
|
|||||||
static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_a_avx(float* result, const float* local_code, unsigned int num_points)
|
static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_a_avx(float* result, const float* local_code, unsigned int num_points)
|
||||||
{
|
{
|
||||||
int code_length_chips = 2046;
|
int code_length_chips = 2046;
|
||||||
float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points );
|
float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
|
||||||
int num_out_vectors = 3;
|
int num_out_vectors = 3;
|
||||||
float rem_code_phase_chips = -0.234;
|
float rem_code_phase_chips = -0.234;
|
||||||
unsigned int n;
|
unsigned int n;
|
||||||
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
|
float shifts_chips[3] = {-0.1, 0.0, 0.1};
|
||||||
|
|
||||||
float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment());
|
float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment());
|
||||||
for(n = 0; n < num_out_vectors; n++)
|
for (n = 0; n < num_out_vectors; n++)
|
||||||
{
|
{
|
||||||
result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment());
|
result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment());
|
||||||
}
|
}
|
||||||
|
|
||||||
volk_gnsssdr_32f_xn_resampler_32f_xn_a_avx(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
|
volk_gnsssdr_32f_xn_resampler_32f_xn_a_avx(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
|
||||||
|
|
||||||
memcpy((float*)result, (float*)result_aux[0], sizeof(float) * num_points);
|
memcpy((float*)result, (float*)result_aux[0], sizeof(float) * num_points);
|
||||||
|
|
||||||
for(n = 0; n < num_out_vectors; n++)
|
for (n = 0; n < num_out_vectors; n++)
|
||||||
{
|
{
|
||||||
volk_gnsssdr_free(result_aux[n]);
|
volk_gnsssdr_free(result_aux[n]);
|
||||||
}
|
}
|
||||||
volk_gnsssdr_free(result_aux);
|
volk_gnsssdr_free(result_aux);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
@ -223,26 +222,26 @@ static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_a_avx(float* result, c
|
|||||||
static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_u_avx(float* result, const float* local_code, unsigned int num_points)
|
static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_u_avx(float* result, const float* local_code, unsigned int num_points)
|
||||||
{
|
{
|
||||||
int code_length_chips = 2046;
|
int code_length_chips = 2046;
|
||||||
float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points );
|
float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
|
||||||
int num_out_vectors = 3;
|
int num_out_vectors = 3;
|
||||||
float rem_code_phase_chips = -0.234;
|
float rem_code_phase_chips = -0.234;
|
||||||
unsigned int n;
|
unsigned int n;
|
||||||
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
|
float shifts_chips[3] = {-0.1, 0.0, 0.1};
|
||||||
|
|
||||||
float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment());
|
float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment());
|
||||||
for(n = 0; n < num_out_vectors; n++)
|
for (n = 0; n < num_out_vectors; n++)
|
||||||
{
|
{
|
||||||
result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment());
|
result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment());
|
||||||
}
|
}
|
||||||
|
|
||||||
volk_gnsssdr_32f_xn_resampler_32f_xn_u_avx(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
|
volk_gnsssdr_32f_xn_resampler_32f_xn_u_avx(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
|
||||||
|
|
||||||
memcpy((float*)result, (float*)result_aux[0], sizeof(float) * num_points);
|
memcpy((float*)result, (float*)result_aux[0], sizeof(float) * num_points);
|
||||||
|
|
||||||
for(n = 0; n < num_out_vectors; n++)
|
for (n = 0; n < num_out_vectors; n++)
|
||||||
{
|
{
|
||||||
volk_gnsssdr_free(result_aux[n]);
|
volk_gnsssdr_free(result_aux[n]);
|
||||||
}
|
}
|
||||||
volk_gnsssdr_free(result_aux);
|
volk_gnsssdr_free(result_aux);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
@ -251,29 +250,28 @@ static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_u_avx(float* result, c
|
|||||||
static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_neon(float* result, const float* local_code, unsigned int num_points)
|
static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_neon(float* result, const float* local_code, unsigned int num_points)
|
||||||
{
|
{
|
||||||
int code_length_chips = 2046;
|
int code_length_chips = 2046;
|
||||||
float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points );
|
float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
|
||||||
int num_out_vectors = 3;
|
int num_out_vectors = 3;
|
||||||
float rem_code_phase_chips = -0.234;
|
float rem_code_phase_chips = -0.234;
|
||||||
unsigned int n;
|
unsigned int n;
|
||||||
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
|
float shifts_chips[3] = {-0.1, 0.0, 0.1};
|
||||||
|
|
||||||
float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment());
|
float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment());
|
||||||
for(n = 0; n < num_out_vectors; n++)
|
for (n = 0; n < num_out_vectors; n++)
|
||||||
{
|
{
|
||||||
result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment());
|
result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment());
|
||||||
}
|
}
|
||||||
|
|
||||||
volk_gnsssdr_32f_xn_resampler_32f_xn_neon(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
|
volk_gnsssdr_32f_xn_resampler_32f_xn_neon(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
|
||||||
|
|
||||||
memcpy((float*)result, (float*)result_aux[0], sizeof(float) * num_points);
|
memcpy((float*)result, (float*)result_aux[0], sizeof(float) * num_points);
|
||||||
|
|
||||||
for(n = 0; n < num_out_vectors; n++)
|
for (n = 0; n < num_out_vectors; n++)
|
||||||
{
|
{
|
||||||
volk_gnsssdr_free(result_aux[n]);
|
volk_gnsssdr_free(result_aux[n]);
|
||||||
}
|
}
|
||||||
volk_gnsssdr_free(result_aux);
|
volk_gnsssdr_free(result_aux);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#endif // INCLUDED_volk_gnsssdr_32f_resamplerpuppet_32f_H
|
#endif // INCLUDED_volk_gnsssdr_32f_resamplerpuppet_32f_H
|
||||||
|
|
||||||
|
@ -97,7 +97,7 @@ static inline void volk_gnsssdr_32f_sincos_32fc_u_sse4_1(lv_32fc_t* out, const f
|
|||||||
cp4 = _mm_set1_ps(0.49603e-4);
|
cp4 = _mm_set1_ps(0.49603e-4);
|
||||||
cp5 = _mm_set1_ps(0.551e-6);
|
cp5 = _mm_set1_ps(0.551e-6);
|
||||||
|
|
||||||
for(;number < quarterPoints; number++)
|
for (; number < quarterPoints; number++)
|
||||||
{
|
{
|
||||||
aVal = _mm_loadu_ps(aPtr);
|
aVal = _mm_loadu_ps(aPtr);
|
||||||
__VOLK_GNSSSDR_PREFETCH(aPtr + 8);
|
__VOLK_GNSSSDR_PREFETCH(aPtr + 8);
|
||||||
@ -108,12 +108,12 @@ static inline void volk_gnsssdr_32f_sincos_32fc_u_sse4_1(lv_32fc_t* out, const f
|
|||||||
s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
|
s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
|
||||||
s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
|
s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
|
||||||
|
|
||||||
s = _mm_div_ps(s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
|
s = _mm_div_ps(s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
|
||||||
s = _mm_mul_ps(s, s);
|
s = _mm_mul_ps(s, s);
|
||||||
// Evaluate Taylor series
|
// Evaluate Taylor series
|
||||||
s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s);
|
s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s);
|
||||||
|
|
||||||
for(i = 0; i < 3; i++)
|
for (i = 0; i < 3; i++)
|
||||||
{
|
{
|
||||||
s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
|
s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
|
||||||
}
|
}
|
||||||
@ -145,7 +145,7 @@ static inline void volk_gnsssdr_32f_sincos_32fc_u_sse4_1(lv_32fc_t* out, const f
|
|||||||
}
|
}
|
||||||
|
|
||||||
number = quarterPoints * 4;
|
number = quarterPoints * 4;
|
||||||
for(;number < num_points; number++)
|
for (; number < num_points; number++)
|
||||||
{
|
{
|
||||||
float _in = *aPtr++;
|
float _in = *aPtr++;
|
||||||
*bPtr++ = lv_cmake(cosf(_in), sinf(_in));
|
*bPtr++ = lv_cmake(cosf(_in), sinf(_in));
|
||||||
@ -191,7 +191,7 @@ static inline void volk_gnsssdr_32f_sincos_32fc_a_sse4_1(lv_32fc_t* out, const f
|
|||||||
cp4 = _mm_set1_ps(0.49603e-4);
|
cp4 = _mm_set1_ps(0.49603e-4);
|
||||||
cp5 = _mm_set1_ps(0.551e-6);
|
cp5 = _mm_set1_ps(0.551e-6);
|
||||||
|
|
||||||
for(;number < quarterPoints; number++)
|
for (; number < quarterPoints; number++)
|
||||||
{
|
{
|
||||||
aVal = _mm_load_ps(aPtr);
|
aVal = _mm_load_ps(aPtr);
|
||||||
__VOLK_GNSSSDR_PREFETCH(aPtr + 8);
|
__VOLK_GNSSSDR_PREFETCH(aPtr + 8);
|
||||||
@ -202,12 +202,12 @@ static inline void volk_gnsssdr_32f_sincos_32fc_a_sse4_1(lv_32fc_t* out, const f
|
|||||||
s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
|
s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
|
||||||
s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
|
s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
|
||||||
|
|
||||||
s = _mm_div_ps(s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
|
s = _mm_div_ps(s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
|
||||||
s = _mm_mul_ps(s, s);
|
s = _mm_mul_ps(s, s);
|
||||||
// Evaluate Taylor series
|
// Evaluate Taylor series
|
||||||
s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s);
|
s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s);
|
||||||
|
|
||||||
for(i = 0; i < 3; i++)
|
for (i = 0; i < 3; i++)
|
||||||
{
|
{
|
||||||
s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
|
s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
|
||||||
}
|
}
|
||||||
@ -239,7 +239,7 @@ static inline void volk_gnsssdr_32f_sincos_32fc_a_sse4_1(lv_32fc_t* out, const f
|
|||||||
}
|
}
|
||||||
|
|
||||||
number = quarterPoints * 4;
|
number = quarterPoints * 4;
|
||||||
for(;number < num_points; number++)
|
for (; number < num_points; number++)
|
||||||
{
|
{
|
||||||
float _in = *aPtr++;
|
float _in = *aPtr++;
|
||||||
*bPtr++ = lv_cmake(cosf(_in), sinf(_in));
|
*bPtr++ = lv_cmake(cosf(_in), sinf(_in));
|
||||||
@ -265,31 +265,49 @@ static inline void volk_gnsssdr_32f_sincos_32fc_a_sse2(lv_32fc_t* out, const flo
|
|||||||
__m128 sine, cosine, aux, x;
|
__m128 sine, cosine, aux, x;
|
||||||
__m128 xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
|
__m128 xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
|
||||||
|
|
||||||
__m128i emm0, emm2, emm4;
|
__m128i emm0, emm2, emm4;
|
||||||
|
|
||||||
/* declare some SSE constants */
|
/* declare some SSE constants */
|
||||||
__VOLK_ATTR_ALIGNED(16) static const int _ps_inv_sign_mask[4] = { ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000 };
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
__VOLK_ATTR_ALIGNED(16) static const int _ps_sign_mask[4] = { (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000 };
|
static const int _ps_inv_sign_mask[4] = {~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000};
|
||||||
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
static const int _ps_sign_mask[4] = {(int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000};
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(16) static const float _ps_cephes_FOPI[4] = { 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516 };
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
__VOLK_ATTR_ALIGNED(16) static const int _pi32_1[4] = { 1, 1, 1, 1 };
|
static const float _ps_cephes_FOPI[4] = {1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516};
|
||||||
__VOLK_ATTR_ALIGNED(16) static const int _pi32_inv1[4] = { ~1, ~1, ~1, ~1 };
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
__VOLK_ATTR_ALIGNED(16) static const int _pi32_2[4] = { 2, 2, 2, 2};
|
static const int _pi32_1[4] = {1, 1, 1, 1};
|
||||||
__VOLK_ATTR_ALIGNED(16) static const int _pi32_4[4] = { 4, 4, 4, 4};
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
static const int _pi32_inv1[4] = {~1, ~1, ~1, ~1};
|
||||||
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
static const int _pi32_2[4] = {2, 2, 2, 2};
|
||||||
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
static const int _pi32_4[4] = {4, 4, 4, 4};
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(16) static const float _ps_minus_cephes_DP1[4] = { -0.78515625, -0.78515625, -0.78515625, -0.78515625 };
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
__VOLK_ATTR_ALIGNED(16) static const float _ps_minus_cephes_DP2[4] = { -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4 };
|
static const float _ps_minus_cephes_DP1[4] = {-0.78515625, -0.78515625, -0.78515625, -0.78515625};
|
||||||
__VOLK_ATTR_ALIGNED(16) static const float _ps_minus_cephes_DP3[4] = { -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8 };
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
__VOLK_ATTR_ALIGNED(16) static const float _ps_coscof_p0[4] = { 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005 };
|
static const float _ps_minus_cephes_DP2[4] = {-2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4};
|
||||||
__VOLK_ATTR_ALIGNED(16) static const float _ps_coscof_p1[4] = { -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003 };
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
__VOLK_ATTR_ALIGNED(16) static const float _ps_coscof_p2[4] = { 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002 };
|
static const float _ps_minus_cephes_DP3[4] = {-3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8};
|
||||||
__VOLK_ATTR_ALIGNED(16) static const float _ps_sincof_p0[4] = { -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4 };
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
__VOLK_ATTR_ALIGNED(16) static const float _ps_sincof_p1[4] = { 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3 };
|
static const float _ps_coscof_p0[4] = {2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005};
|
||||||
__VOLK_ATTR_ALIGNED(16) static const float _ps_sincof_p2[4] = { -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1 };
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
__VOLK_ATTR_ALIGNED(16) static const float _ps_0p5[4] = { 0.5f, 0.5f, 0.5f, 0.5f };
|
static const float _ps_coscof_p1[4] = {-1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003};
|
||||||
__VOLK_ATTR_ALIGNED(16) static const float _ps_1[4] = { 1.0f, 1.0f, 1.0f, 1.0f };
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
static const float _ps_coscof_p2[4] = {4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002};
|
||||||
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
static const float _ps_sincof_p0[4] = {-1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4};
|
||||||
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
static const float _ps_sincof_p1[4] = {8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3};
|
||||||
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
static const float _ps_sincof_p2[4] = {-1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1};
|
||||||
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
static const float _ps_0p5[4] = {0.5f, 0.5f, 0.5f, 0.5f};
|
||||||
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
static const float _ps_1[4] = {1.0f, 1.0f, 1.0f, 1.0f};
|
||||||
|
|
||||||
for(;number < sse_iters; number++)
|
for (; number < sse_iters; number++)
|
||||||
{
|
{
|
||||||
x = _mm_load_ps(aPtr);
|
x = _mm_load_ps(aPtr);
|
||||||
__VOLK_GNSSSDR_PREFETCH(aPtr + 8);
|
__VOLK_GNSSSDR_PREFETCH(aPtr + 8);
|
||||||
@ -307,19 +325,19 @@ static inline void volk_gnsssdr_32f_sincos_32fc_a_sse2(lv_32fc_t* out, const flo
|
|||||||
emm2 = _mm_cvttps_epi32(y);
|
emm2 = _mm_cvttps_epi32(y);
|
||||||
|
|
||||||
/* j=(j+1) & (~1) (see the cephes sources) */
|
/* j=(j+1) & (~1) (see the cephes sources) */
|
||||||
emm2 = _mm_add_epi32(emm2, *(__m128i *)_pi32_1);
|
emm2 = _mm_add_epi32(emm2, *(__m128i*)_pi32_1);
|
||||||
emm2 = _mm_and_si128(emm2, *(__m128i *)_pi32_inv1);
|
emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_inv1);
|
||||||
y = _mm_cvtepi32_ps(emm2);
|
y = _mm_cvtepi32_ps(emm2);
|
||||||
|
|
||||||
emm4 = emm2;
|
emm4 = emm2;
|
||||||
|
|
||||||
/* get the swap sign flag for the sine */
|
/* get the swap sign flag for the sine */
|
||||||
emm0 = _mm_and_si128(emm2, *(__m128i *)_pi32_4);
|
emm0 = _mm_and_si128(emm2, *(__m128i*)_pi32_4);
|
||||||
emm0 = _mm_slli_epi32(emm0, 29);
|
emm0 = _mm_slli_epi32(emm0, 29);
|
||||||
__m128 swap_sign_bit_sin = _mm_castsi128_ps(emm0);
|
__m128 swap_sign_bit_sin = _mm_castsi128_ps(emm0);
|
||||||
|
|
||||||
/* get the polynom selection mask for the sine*/
|
/* get the polynom selection mask for the sine*/
|
||||||
emm2 = _mm_and_si128(emm2, *(__m128i *)_pi32_2);
|
emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_2);
|
||||||
emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
|
emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
|
||||||
__m128 poly_mask = _mm_castsi128_ps(emm2);
|
__m128 poly_mask = _mm_castsi128_ps(emm2);
|
||||||
|
|
||||||
@ -335,15 +353,15 @@ static inline void volk_gnsssdr_32f_sincos_32fc_a_sse2(lv_32fc_t* out, const flo
|
|||||||
x = _mm_add_ps(x, xmm2);
|
x = _mm_add_ps(x, xmm2);
|
||||||
x = _mm_add_ps(x, xmm3);
|
x = _mm_add_ps(x, xmm3);
|
||||||
|
|
||||||
emm4 = _mm_sub_epi32(emm4, *(__m128i *)_pi32_2);
|
emm4 = _mm_sub_epi32(emm4, *(__m128i*)_pi32_2);
|
||||||
emm4 = _mm_andnot_si128(emm4, *(__m128i *)_pi32_4);
|
emm4 = _mm_andnot_si128(emm4, *(__m128i*)_pi32_4);
|
||||||
emm4 = _mm_slli_epi32(emm4, 29);
|
emm4 = _mm_slli_epi32(emm4, 29);
|
||||||
__m128 sign_bit_cos = _mm_castsi128_ps(emm4);
|
__m128 sign_bit_cos = _mm_castsi128_ps(emm4);
|
||||||
|
|
||||||
sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);
|
sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);
|
||||||
|
|
||||||
/* Evaluate the first polynom (0 <= x <= Pi/4) */
|
/* Evaluate the first polynom (0 <= x <= Pi/4) */
|
||||||
__m128 z = _mm_mul_ps(x,x);
|
__m128 z = _mm_mul_ps(x, x);
|
||||||
y = *(__m128*)_ps_coscof_p0;
|
y = *(__m128*)_ps_coscof_p0;
|
||||||
|
|
||||||
y = _mm_mul_ps(y, z);
|
y = _mm_mul_ps(y, z);
|
||||||
@ -371,11 +389,11 @@ static inline void volk_gnsssdr_32f_sincos_32fc_a_sse2(lv_32fc_t* out, const flo
|
|||||||
xmm3 = poly_mask;
|
xmm3 = poly_mask;
|
||||||
__m128 ysin2 = _mm_and_ps(xmm3, y2);
|
__m128 ysin2 = _mm_and_ps(xmm3, y2);
|
||||||
__m128 ysin1 = _mm_andnot_ps(xmm3, y);
|
__m128 ysin1 = _mm_andnot_ps(xmm3, y);
|
||||||
y2 = _mm_sub_ps(y2,ysin2);
|
y2 = _mm_sub_ps(y2, ysin2);
|
||||||
y = _mm_sub_ps(y, ysin1);
|
y = _mm_sub_ps(y, ysin1);
|
||||||
|
|
||||||
xmm1 = _mm_add_ps(ysin1,ysin2);
|
xmm1 = _mm_add_ps(ysin1, ysin2);
|
||||||
xmm2 = _mm_add_ps(y,y2);
|
xmm2 = _mm_add_ps(y, y2);
|
||||||
|
|
||||||
/* update the sign */
|
/* update the sign */
|
||||||
sine = _mm_xor_ps(xmm1, sign_bit_sin);
|
sine = _mm_xor_ps(xmm1, sign_bit_sin);
|
||||||
@ -392,12 +410,11 @@ static inline void volk_gnsssdr_32f_sincos_32fc_a_sse2(lv_32fc_t* out, const flo
|
|||||||
aPtr += 4;
|
aPtr += 4;
|
||||||
}
|
}
|
||||||
|
|
||||||
for(number = sse_iters * 4; number < num_points; number++)
|
for (number = sse_iters * 4; number < num_points; number++)
|
||||||
{
|
{
|
||||||
_in = *aPtr++;
|
_in = *aPtr++;
|
||||||
*bPtr++ = lv_cmake((float)cosf(_in), (float)sinf(_in) );
|
*bPtr++ = lv_cmake((float)cosf(_in), (float)sinf(_in));
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
#endif /* LV_HAVE_SSE2 */
|
#endif /* LV_HAVE_SSE2 */
|
||||||
|
|
||||||
@ -418,31 +435,49 @@ static inline void volk_gnsssdr_32f_sincos_32fc_u_sse2(lv_32fc_t* out, const flo
|
|||||||
__m128 sine, cosine, aux, x;
|
__m128 sine, cosine, aux, x;
|
||||||
__m128 xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
|
__m128 xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
|
||||||
|
|
||||||
__m128i emm0, emm2, emm4;
|
__m128i emm0, emm2, emm4;
|
||||||
|
|
||||||
/* declare some SSE constants */
|
/* declare some SSE constants */
|
||||||
__VOLK_ATTR_ALIGNED(16) static const int _ps_inv_sign_mask[4] = { ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000 };
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
__VOLK_ATTR_ALIGNED(16) static const int _ps_sign_mask[4] = { (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000 };
|
static const int _ps_inv_sign_mask[4] = {~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000};
|
||||||
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
static const int _ps_sign_mask[4] = {(int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000};
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(16) static const float _ps_cephes_FOPI[4] = { 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516 };
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
__VOLK_ATTR_ALIGNED(16) static const int _pi32_1[4] = { 1, 1, 1, 1 };
|
static const float _ps_cephes_FOPI[4] = {1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516};
|
||||||
__VOLK_ATTR_ALIGNED(16) static const int _pi32_inv1[4] = { ~1, ~1, ~1, ~1 };
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
__VOLK_ATTR_ALIGNED(16) static const int _pi32_2[4] = { 2, 2, 2, 2};
|
static const int _pi32_1[4] = {1, 1, 1, 1};
|
||||||
__VOLK_ATTR_ALIGNED(16) static const int _pi32_4[4] = { 4, 4, 4, 4};
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
static const int _pi32_inv1[4] = {~1, ~1, ~1, ~1};
|
||||||
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
static const int _pi32_2[4] = {2, 2, 2, 2};
|
||||||
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
static const int _pi32_4[4] = {4, 4, 4, 4};
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(16) static const float _ps_minus_cephes_DP1[4] = { -0.78515625, -0.78515625, -0.78515625, -0.78515625 };
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
__VOLK_ATTR_ALIGNED(16) static const float _ps_minus_cephes_DP2[4] = { -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4 };
|
static const float _ps_minus_cephes_DP1[4] = {-0.78515625, -0.78515625, -0.78515625, -0.78515625};
|
||||||
__VOLK_ATTR_ALIGNED(16) static const float _ps_minus_cephes_DP3[4] = { -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8 };
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
__VOLK_ATTR_ALIGNED(16) static const float _ps_coscof_p0[4] = { 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005 };
|
static const float _ps_minus_cephes_DP2[4] = {-2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4};
|
||||||
__VOLK_ATTR_ALIGNED(16) static const float _ps_coscof_p1[4] = { -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003 };
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
__VOLK_ATTR_ALIGNED(16) static const float _ps_coscof_p2[4] = { 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002 };
|
static const float _ps_minus_cephes_DP3[4] = {-3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8};
|
||||||
__VOLK_ATTR_ALIGNED(16) static const float _ps_sincof_p0[4] = { -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4 };
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
__VOLK_ATTR_ALIGNED(16) static const float _ps_sincof_p1[4] = { 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3 };
|
static const float _ps_coscof_p0[4] = {2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005};
|
||||||
__VOLK_ATTR_ALIGNED(16) static const float _ps_sincof_p2[4] = { -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1 };
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
__VOLK_ATTR_ALIGNED(16) static const float _ps_0p5[4] = { 0.5f, 0.5f, 0.5f, 0.5f };
|
static const float _ps_coscof_p1[4] = {-1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003};
|
||||||
__VOLK_ATTR_ALIGNED(16) static const float _ps_1[4] = { 1.0f, 1.0f, 1.0f, 1.0f };
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
static const float _ps_coscof_p2[4] = {4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002};
|
||||||
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
static const float _ps_sincof_p0[4] = {-1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4};
|
||||||
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
static const float _ps_sincof_p1[4] = {8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3};
|
||||||
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
static const float _ps_sincof_p2[4] = {-1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1};
|
||||||
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
static const float _ps_0p5[4] = {0.5f, 0.5f, 0.5f, 0.5f};
|
||||||
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
static const float _ps_1[4] = {1.0f, 1.0f, 1.0f, 1.0f};
|
||||||
|
|
||||||
for(;number < sse_iters; number++)
|
for (; number < sse_iters; number++)
|
||||||
{
|
{
|
||||||
x = _mm_loadu_ps(aPtr);
|
x = _mm_loadu_ps(aPtr);
|
||||||
__VOLK_GNSSSDR_PREFETCH(aPtr + 8);
|
__VOLK_GNSSSDR_PREFETCH(aPtr + 8);
|
||||||
@ -460,19 +495,19 @@ static inline void volk_gnsssdr_32f_sincos_32fc_u_sse2(lv_32fc_t* out, const flo
|
|||||||
emm2 = _mm_cvttps_epi32(y);
|
emm2 = _mm_cvttps_epi32(y);
|
||||||
|
|
||||||
/* j=(j+1) & (~1) (see the cephes sources) */
|
/* j=(j+1) & (~1) (see the cephes sources) */
|
||||||
emm2 = _mm_add_epi32(emm2, *(__m128i *)_pi32_1);
|
emm2 = _mm_add_epi32(emm2, *(__m128i*)_pi32_1);
|
||||||
emm2 = _mm_and_si128(emm2, *(__m128i *)_pi32_inv1);
|
emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_inv1);
|
||||||
y = _mm_cvtepi32_ps(emm2);
|
y = _mm_cvtepi32_ps(emm2);
|
||||||
|
|
||||||
emm4 = emm2;
|
emm4 = emm2;
|
||||||
|
|
||||||
/* get the swap sign flag for the sine */
|
/* get the swap sign flag for the sine */
|
||||||
emm0 = _mm_and_si128(emm2, *(__m128i *)_pi32_4);
|
emm0 = _mm_and_si128(emm2, *(__m128i*)_pi32_4);
|
||||||
emm0 = _mm_slli_epi32(emm0, 29);
|
emm0 = _mm_slli_epi32(emm0, 29);
|
||||||
__m128 swap_sign_bit_sin = _mm_castsi128_ps(emm0);
|
__m128 swap_sign_bit_sin = _mm_castsi128_ps(emm0);
|
||||||
|
|
||||||
/* get the polynom selection mask for the sine*/
|
/* get the polynom selection mask for the sine*/
|
||||||
emm2 = _mm_and_si128(emm2, *(__m128i *)_pi32_2);
|
emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_2);
|
||||||
emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
|
emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
|
||||||
__m128 poly_mask = _mm_castsi128_ps(emm2);
|
__m128 poly_mask = _mm_castsi128_ps(emm2);
|
||||||
|
|
||||||
@ -488,15 +523,15 @@ static inline void volk_gnsssdr_32f_sincos_32fc_u_sse2(lv_32fc_t* out, const flo
|
|||||||
x = _mm_add_ps(x, xmm2);
|
x = _mm_add_ps(x, xmm2);
|
||||||
x = _mm_add_ps(x, xmm3);
|
x = _mm_add_ps(x, xmm3);
|
||||||
|
|
||||||
emm4 = _mm_sub_epi32(emm4, *(__m128i *)_pi32_2);
|
emm4 = _mm_sub_epi32(emm4, *(__m128i*)_pi32_2);
|
||||||
emm4 = _mm_andnot_si128(emm4, *(__m128i *)_pi32_4);
|
emm4 = _mm_andnot_si128(emm4, *(__m128i*)_pi32_4);
|
||||||
emm4 = _mm_slli_epi32(emm4, 29);
|
emm4 = _mm_slli_epi32(emm4, 29);
|
||||||
__m128 sign_bit_cos = _mm_castsi128_ps(emm4);
|
__m128 sign_bit_cos = _mm_castsi128_ps(emm4);
|
||||||
|
|
||||||
sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);
|
sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);
|
||||||
|
|
||||||
/* Evaluate the first polynom (0 <= x <= Pi/4) */
|
/* Evaluate the first polynom (0 <= x <= Pi/4) */
|
||||||
__m128 z = _mm_mul_ps(x,x);
|
__m128 z = _mm_mul_ps(x, x);
|
||||||
y = *(__m128*)_ps_coscof_p0;
|
y = *(__m128*)_ps_coscof_p0;
|
||||||
|
|
||||||
y = _mm_mul_ps(y, z);
|
y = _mm_mul_ps(y, z);
|
||||||
@ -524,11 +559,11 @@ static inline void volk_gnsssdr_32f_sincos_32fc_u_sse2(lv_32fc_t* out, const flo
|
|||||||
xmm3 = poly_mask;
|
xmm3 = poly_mask;
|
||||||
__m128 ysin2 = _mm_and_ps(xmm3, y2);
|
__m128 ysin2 = _mm_and_ps(xmm3, y2);
|
||||||
__m128 ysin1 = _mm_andnot_ps(xmm3, y);
|
__m128 ysin1 = _mm_andnot_ps(xmm3, y);
|
||||||
y2 = _mm_sub_ps(y2,ysin2);
|
y2 = _mm_sub_ps(y2, ysin2);
|
||||||
y = _mm_sub_ps(y, ysin1);
|
y = _mm_sub_ps(y, ysin1);
|
||||||
|
|
||||||
xmm1 = _mm_add_ps(ysin1,ysin2);
|
xmm1 = _mm_add_ps(ysin1, ysin2);
|
||||||
xmm2 = _mm_add_ps(y,y2);
|
xmm2 = _mm_add_ps(y, y2);
|
||||||
|
|
||||||
/* update the sign */
|
/* update the sign */
|
||||||
sine = _mm_xor_ps(xmm1, sign_bit_sin);
|
sine = _mm_xor_ps(xmm1, sign_bit_sin);
|
||||||
@ -545,12 +580,11 @@ static inline void volk_gnsssdr_32f_sincos_32fc_u_sse2(lv_32fc_t* out, const flo
|
|||||||
aPtr += 4;
|
aPtr += 4;
|
||||||
}
|
}
|
||||||
|
|
||||||
for(number = sse_iters * 4; number < num_points; number++)
|
for (number = sse_iters * 4; number < num_points; number++)
|
||||||
{
|
{
|
||||||
_in = *aPtr++;
|
_in = *aPtr++;
|
||||||
*bPtr++ = lv_cmake((float)cosf(_in), (float)sinf(_in) );
|
*bPtr++ = lv_cmake((float)cosf(_in), (float)sinf(_in));
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
#endif /* LV_HAVE_SSE2 */
|
#endif /* LV_HAVE_SSE2 */
|
||||||
|
|
||||||
@ -561,10 +595,10 @@ static inline void volk_gnsssdr_32f_sincos_32fc_generic(lv_32fc_t* out, const fl
|
|||||||
{
|
{
|
||||||
float _in;
|
float _in;
|
||||||
unsigned int i;
|
unsigned int i;
|
||||||
for(i = 0; i < num_points; i++)
|
for (i = 0; i < num_points; i++)
|
||||||
{
|
{
|
||||||
_in = *in++;
|
_in = *in++;
|
||||||
*out++ = lv_cmake((float)cosf(_in), (float)sinf(_in) );
|
*out++ = lv_cmake((float)cosf(_in), (float)sinf(_in));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -586,12 +620,12 @@ static inline void volk_gnsssdr_32f_sincos_32fc_generic_fxpt(lv_32fc_t* out, con
|
|||||||
const int32_t diffbits = bitlength - Nbits;
|
const int32_t diffbits = bitlength - Nbits;
|
||||||
uint32_t ux;
|
uint32_t ux;
|
||||||
unsigned int i;
|
unsigned int i;
|
||||||
for(i = 0; i < num_points; i++)
|
for (i = 0; i < num_points; i++)
|
||||||
{
|
{
|
||||||
_in = *in++;
|
_in = *in++;
|
||||||
d = (int32_t)floor(_in / TWO_PI + 0.5);
|
d = (int32_t)floor(_in / TWO_PI + 0.5);
|
||||||
_in -= d * TWO_PI;
|
_in -= d * TWO_PI;
|
||||||
x = (int32_t) ((float)_in * TWO_TO_THE_31_DIV_PI);
|
x = (int32_t)((float)_in * TWO_TO_THE_31_DIV_PI);
|
||||||
|
|
||||||
ux = x;
|
ux = x;
|
||||||
sin_index = ux >> diffbits;
|
sin_index = ux >> diffbits;
|
||||||
@ -601,7 +635,7 @@ static inline void volk_gnsssdr_32f_sincos_32fc_generic_fxpt(lv_32fc_t* out, con
|
|||||||
cos_index = ux >> diffbits;
|
cos_index = ux >> diffbits;
|
||||||
c = sine_table_10bits[cos_index][0] * (ux >> 1) + sine_table_10bits[cos_index][1];
|
c = sine_table_10bits[cos_index][0] * (ux >> 1) + sine_table_10bits[cos_index][1];
|
||||||
|
|
||||||
*out++ = lv_cmake((float)c, (float)s );
|
*out++ = lv_cmake((float)c, (float)s);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -637,7 +671,7 @@ static inline void volk_gnsssdr_32f_sincos_32fc_neon(lv_32fc_t* out, const float
|
|||||||
|
|
||||||
uint32x4_t emm2, poly_mask, sign_mask_sin, sign_mask_cos;
|
uint32x4_t emm2, poly_mask, sign_mask_sin, sign_mask_cos;
|
||||||
|
|
||||||
for(;number < neon_iters; number++)
|
for (; number < neon_iters; number++)
|
||||||
{
|
{
|
||||||
x = vld1q_f32(aPtr);
|
x = vld1q_f32(aPtr);
|
||||||
__VOLK_GNSSSDR_PREFETCH(aPtr + 8);
|
__VOLK_GNSSSDR_PREFETCH(aPtr + 8);
|
||||||
@ -677,7 +711,7 @@ static inline void volk_gnsssdr_32f_sincos_32fc_neon(lv_32fc_t* out, const float
|
|||||||
|
|
||||||
/* Evaluate the first polynom (0 <= x <= Pi/4) in y1,
|
/* Evaluate the first polynom (0 <= x <= Pi/4) in y1,
|
||||||
and the second polynom (Pi/4 <= x <= 0) in y2 */
|
and the second polynom (Pi/4 <= x <= 0) in y2 */
|
||||||
z = vmulq_f32(x,x);
|
z = vmulq_f32(x, x);
|
||||||
|
|
||||||
y1 = vmulq_n_f32(z, c_coscof_p0);
|
y1 = vmulq_n_f32(z, c_coscof_p0);
|
||||||
y2 = vmulq_n_f32(z, c_sincof_p0);
|
y2 = vmulq_n_f32(z, c_sincof_p0);
|
||||||
@ -706,10 +740,10 @@ static inline void volk_gnsssdr_32f_sincos_32fc_neon(lv_32fc_t* out, const float
|
|||||||
aPtr += 4;
|
aPtr += 4;
|
||||||
}
|
}
|
||||||
|
|
||||||
for(number = neon_iters * 4; number < num_points; number++)
|
for (number = neon_iters * 4; number < num_points; number++)
|
||||||
{
|
{
|
||||||
_in = *aPtr++;
|
_in = *aPtr++;
|
||||||
*bPtr++ = lv_cmake((float)cosf(_in), (float)sinf(_in) );
|
*bPtr++ = lv_cmake((float)cosf(_in), (float)sinf(_in));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -110,7 +110,8 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_a_sse3(float** result, c
|
|||||||
const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
|
const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
|
||||||
const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
|
const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4];
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
int local_code_chip_index[4];
|
||||||
int local_code_chip_index_;
|
int local_code_chip_index_;
|
||||||
|
|
||||||
const __m128i zeros = _mm_setzero_si128();
|
const __m128i zeros = _mm_setzero_si128();
|
||||||
@ -124,7 +125,7 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_a_sse3(float** result, c
|
|||||||
shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]);
|
shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]);
|
||||||
aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
|
aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
|
||||||
__m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
|
__m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
|
||||||
for(n = 0; n < quarterPoints; n++)
|
for (n = 0; n < quarterPoints; n++)
|
||||||
{
|
{
|
||||||
aux = _mm_mul_ps(code_phase_step_chips_reg, indexn);
|
aux = _mm_mul_ps(code_phase_step_chips_reg, indexn);
|
||||||
aux = _mm_add_ps(aux, aux2);
|
aux = _mm_add_ps(aux, aux2);
|
||||||
@ -145,25 +146,25 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_a_sse3(float** result, c
|
|||||||
aux_i = _mm_and_si128(code_length_chips_reg_i, negatives);
|
aux_i = _mm_and_si128(code_length_chips_reg_i, negatives);
|
||||||
local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i);
|
local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i);
|
||||||
_mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg);
|
_mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg);
|
||||||
for(k = 0; k < 4; ++k)
|
for (k = 0; k < 4; ++k)
|
||||||
{
|
{
|
||||||
_result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
|
_result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
|
||||||
}
|
}
|
||||||
indexn = _mm_add_ps(indexn, fours);
|
indexn = _mm_add_ps(indexn, fours);
|
||||||
}
|
}
|
||||||
for(n = quarterPoints * 4; n < num_points; n++)
|
for (n = quarterPoints * 4; n < num_points; n++)
|
||||||
{
|
{
|
||||||
// resample code for current tap
|
// resample code for current tap
|
||||||
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
|
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
|
||||||
//Take into account that in multitap correlators, the shifts can be negative!
|
//Take into account that in multitap correlators, the shifts can be negative!
|
||||||
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ;
|
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
|
||||||
local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
|
local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
|
||||||
_result[current_correlator_tap][n] = local_code[local_code_chip_index_];
|
_result[current_correlator_tap][n] = local_code[local_code_chip_index_];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
#ifdef LV_HAVE_SSE3
|
#ifdef LV_HAVE_SSE3
|
||||||
@ -180,7 +181,8 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_u_sse3(float** result, c
|
|||||||
const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
|
const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
|
||||||
const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
|
const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4];
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
int local_code_chip_index[4];
|
||||||
int local_code_chip_index_;
|
int local_code_chip_index_;
|
||||||
|
|
||||||
const __m128i zeros = _mm_setzero_si128();
|
const __m128i zeros = _mm_setzero_si128();
|
||||||
@ -194,7 +196,7 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_u_sse3(float** result, c
|
|||||||
shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]);
|
shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]);
|
||||||
aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
|
aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
|
||||||
__m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
|
__m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
|
||||||
for(n = 0; n < quarterPoints; n++)
|
for (n = 0; n < quarterPoints; n++)
|
||||||
{
|
{
|
||||||
aux = _mm_mul_ps(code_phase_step_chips_reg, indexn);
|
aux = _mm_mul_ps(code_phase_step_chips_reg, indexn);
|
||||||
aux = _mm_add_ps(aux, aux2);
|
aux = _mm_add_ps(aux, aux2);
|
||||||
@ -215,18 +217,18 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_u_sse3(float** result, c
|
|||||||
aux_i = _mm_and_si128(code_length_chips_reg_i, negatives);
|
aux_i = _mm_and_si128(code_length_chips_reg_i, negatives);
|
||||||
local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i);
|
local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i);
|
||||||
_mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg);
|
_mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg);
|
||||||
for(k = 0; k < 4; ++k)
|
for (k = 0; k < 4; ++k)
|
||||||
{
|
{
|
||||||
_result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
|
_result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
|
||||||
}
|
}
|
||||||
indexn = _mm_add_ps(indexn, fours);
|
indexn = _mm_add_ps(indexn, fours);
|
||||||
}
|
}
|
||||||
for(n = quarterPoints * 4; n < num_points; n++)
|
for (n = quarterPoints * 4; n < num_points; n++)
|
||||||
{
|
{
|
||||||
// resample code for current tap
|
// resample code for current tap
|
||||||
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
|
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
|
||||||
//Take into account that in multitap correlators, the shifts can be negative!
|
//Take into account that in multitap correlators, the shifts can be negative!
|
||||||
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ;
|
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
|
||||||
local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
|
local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
|
||||||
_result[current_correlator_tap][n] = local_code[local_code_chip_index_];
|
_result[current_correlator_tap][n] = local_code[local_code_chip_index_];
|
||||||
}
|
}
|
||||||
@ -248,7 +250,8 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_a_sse4_1(float** result,
|
|||||||
const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
|
const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
|
||||||
const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
|
const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4];
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
int local_code_chip_index[4];
|
||||||
int local_code_chip_index_;
|
int local_code_chip_index_;
|
||||||
|
|
||||||
const __m128i zeros = _mm_setzero_si128();
|
const __m128i zeros = _mm_setzero_si128();
|
||||||
@ -262,7 +265,7 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_a_sse4_1(float** result,
|
|||||||
shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]);
|
shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]);
|
||||||
aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
|
aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
|
||||||
__m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
|
__m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
|
||||||
for(n = 0; n < quarterPoints; n++)
|
for (n = 0; n < quarterPoints; n++)
|
||||||
{
|
{
|
||||||
aux = _mm_mul_ps(code_phase_step_chips_reg, indexn);
|
aux = _mm_mul_ps(code_phase_step_chips_reg, indexn);
|
||||||
aux = _mm_add_ps(aux, aux2);
|
aux = _mm_add_ps(aux, aux2);
|
||||||
@ -280,25 +283,25 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_a_sse4_1(float** result,
|
|||||||
aux_i = _mm_and_si128(code_length_chips_reg_i, negatives);
|
aux_i = _mm_and_si128(code_length_chips_reg_i, negatives);
|
||||||
local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i);
|
local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i);
|
||||||
_mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg);
|
_mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg);
|
||||||
for(k = 0; k < 4; ++k)
|
for (k = 0; k < 4; ++k)
|
||||||
{
|
{
|
||||||
_result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
|
_result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
|
||||||
}
|
}
|
||||||
indexn = _mm_add_ps(indexn, fours);
|
indexn = _mm_add_ps(indexn, fours);
|
||||||
}
|
}
|
||||||
for(n = quarterPoints * 4; n < num_points; n++)
|
for (n = quarterPoints * 4; n < num_points; n++)
|
||||||
{
|
{
|
||||||
// resample code for current tap
|
// resample code for current tap
|
||||||
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
|
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
|
||||||
//Take into account that in multitap correlators, the shifts can be negative!
|
//Take into account that in multitap correlators, the shifts can be negative!
|
||||||
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ;
|
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
|
||||||
local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
|
local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
|
||||||
_result[current_correlator_tap][n] = local_code[local_code_chip_index_];
|
_result[current_correlator_tap][n] = local_code[local_code_chip_index_];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
#ifdef LV_HAVE_SSE4_1
|
#ifdef LV_HAVE_SSE4_1
|
||||||
@ -314,7 +317,8 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_u_sse4_1(float** result,
|
|||||||
const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
|
const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
|
||||||
const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
|
const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4];
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
int local_code_chip_index[4];
|
||||||
int local_code_chip_index_;
|
int local_code_chip_index_;
|
||||||
|
|
||||||
const __m128i zeros = _mm_setzero_si128();
|
const __m128i zeros = _mm_setzero_si128();
|
||||||
@ -328,7 +332,7 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_u_sse4_1(float** result,
|
|||||||
shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]);
|
shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]);
|
||||||
aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
|
aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
|
||||||
__m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
|
__m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
|
||||||
for(n = 0; n < quarterPoints; n++)
|
for (n = 0; n < quarterPoints; n++)
|
||||||
{
|
{
|
||||||
aux = _mm_mul_ps(code_phase_step_chips_reg, indexn);
|
aux = _mm_mul_ps(code_phase_step_chips_reg, indexn);
|
||||||
aux = _mm_add_ps(aux, aux2);
|
aux = _mm_add_ps(aux, aux2);
|
||||||
@ -346,18 +350,18 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_u_sse4_1(float** result,
|
|||||||
aux_i = _mm_and_si128(code_length_chips_reg_i, negatives);
|
aux_i = _mm_and_si128(code_length_chips_reg_i, negatives);
|
||||||
local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i);
|
local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i);
|
||||||
_mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg);
|
_mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg);
|
||||||
for(k = 0; k < 4; ++k)
|
for (k = 0; k < 4; ++k)
|
||||||
{
|
{
|
||||||
_result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
|
_result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
|
||||||
}
|
}
|
||||||
indexn = _mm_add_ps(indexn, fours);
|
indexn = _mm_add_ps(indexn, fours);
|
||||||
}
|
}
|
||||||
for(n = quarterPoints * 4; n < num_points; n++)
|
for (n = quarterPoints * 4; n < num_points; n++)
|
||||||
{
|
{
|
||||||
// resample code for current tap
|
// resample code for current tap
|
||||||
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
|
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
|
||||||
//Take into account that in multitap correlators, the shifts can be negative!
|
//Take into account that in multitap correlators, the shifts can be negative!
|
||||||
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ;
|
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
|
||||||
local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
|
local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
|
||||||
_result[current_correlator_tap][n] = local_code[local_code_chip_index_];
|
_result[current_correlator_tap][n] = local_code[local_code_chip_index_];
|
||||||
}
|
}
|
||||||
@ -380,7 +384,8 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_a_avx(float** result, co
|
|||||||
const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips);
|
const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips);
|
||||||
const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips);
|
const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips);
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(32) int local_code_chip_index[8];
|
__VOLK_ATTR_ALIGNED(32)
|
||||||
|
int local_code_chip_index[8];
|
||||||
int local_code_chip_index_;
|
int local_code_chip_index_;
|
||||||
|
|
||||||
const __m256 zeros = _mm256_setzero_ps();
|
const __m256 zeros = _mm256_setzero_ps();
|
||||||
@ -395,7 +400,7 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_a_avx(float** result, co
|
|||||||
shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]);
|
shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]);
|
||||||
aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
|
aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
|
||||||
indexn = n0;
|
indexn = n0;
|
||||||
for(n = 0; n < avx_iters; n++)
|
for (n = 0; n < avx_iters; n++)
|
||||||
{
|
{
|
||||||
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0);
|
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0);
|
||||||
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3);
|
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3);
|
||||||
@ -413,13 +418,13 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_a_avx(float** result, co
|
|||||||
|
|
||||||
// no negatives
|
// no negatives
|
||||||
c = _mm256_cvtepi32_ps(local_code_chip_index_reg);
|
c = _mm256_cvtepi32_ps(local_code_chip_index_reg);
|
||||||
negatives = _mm256_cmp_ps(c, zeros, 0x01 );
|
negatives = _mm256_cmp_ps(c, zeros, 0x01);
|
||||||
aux3 = _mm256_and_ps(code_length_chips_reg_f, negatives);
|
aux3 = _mm256_and_ps(code_length_chips_reg_f, negatives);
|
||||||
aux = _mm256_add_ps(c, aux3);
|
aux = _mm256_add_ps(c, aux3);
|
||||||
local_code_chip_index_reg = _mm256_cvttps_epi32(aux);
|
local_code_chip_index_reg = _mm256_cvttps_epi32(aux);
|
||||||
|
|
||||||
_mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg);
|
_mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg);
|
||||||
for(k = 0; k < 8; ++k)
|
for (k = 0; k < 8; ++k)
|
||||||
{
|
{
|
||||||
_result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]];
|
_result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]];
|
||||||
}
|
}
|
||||||
@ -429,12 +434,12 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_a_avx(float** result, co
|
|||||||
_mm256_zeroupper();
|
_mm256_zeroupper();
|
||||||
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
|
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
|
||||||
{
|
{
|
||||||
for(n = avx_iters * 8; n < num_points; n++)
|
for (n = avx_iters * 8; n < num_points; n++)
|
||||||
{
|
{
|
||||||
// resample code for current tap
|
// resample code for current tap
|
||||||
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
|
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
|
||||||
//Take into account that in multitap correlators, the shifts can be negative!
|
//Take into account that in multitap correlators, the shifts can be negative!
|
||||||
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ;
|
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
|
||||||
local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
|
local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
|
||||||
_result[current_correlator_tap][n] = local_code[local_code_chip_index_];
|
_result[current_correlator_tap][n] = local_code[local_code_chip_index_];
|
||||||
}
|
}
|
||||||
@ -457,7 +462,8 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_u_avx(float** result, co
|
|||||||
const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips);
|
const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips);
|
||||||
const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips);
|
const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips);
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(32) int local_code_chip_index[8];
|
__VOLK_ATTR_ALIGNED(32)
|
||||||
|
int local_code_chip_index[8];
|
||||||
int local_code_chip_index_;
|
int local_code_chip_index_;
|
||||||
|
|
||||||
const __m256 zeros = _mm256_setzero_ps();
|
const __m256 zeros = _mm256_setzero_ps();
|
||||||
@ -472,7 +478,7 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_u_avx(float** result, co
|
|||||||
shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]);
|
shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]);
|
||||||
aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
|
aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
|
||||||
indexn = n0;
|
indexn = n0;
|
||||||
for(n = 0; n < avx_iters; n++)
|
for (n = 0; n < avx_iters; n++)
|
||||||
{
|
{
|
||||||
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0);
|
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0);
|
||||||
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3);
|
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3);
|
||||||
@ -490,13 +496,13 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_u_avx(float** result, co
|
|||||||
|
|
||||||
// no negatives
|
// no negatives
|
||||||
c = _mm256_cvtepi32_ps(local_code_chip_index_reg);
|
c = _mm256_cvtepi32_ps(local_code_chip_index_reg);
|
||||||
negatives = _mm256_cmp_ps(c, zeros, 0x01 );
|
negatives = _mm256_cmp_ps(c, zeros, 0x01);
|
||||||
aux3 = _mm256_and_ps(code_length_chips_reg_f, negatives);
|
aux3 = _mm256_and_ps(code_length_chips_reg_f, negatives);
|
||||||
aux = _mm256_add_ps(c, aux3);
|
aux = _mm256_add_ps(c, aux3);
|
||||||
local_code_chip_index_reg = _mm256_cvttps_epi32(aux);
|
local_code_chip_index_reg = _mm256_cvttps_epi32(aux);
|
||||||
|
|
||||||
_mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg);
|
_mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg);
|
||||||
for(k = 0; k < 8; ++k)
|
for (k = 0; k < 8; ++k)
|
||||||
{
|
{
|
||||||
_result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]];
|
_result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]];
|
||||||
}
|
}
|
||||||
@ -506,12 +512,12 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_u_avx(float** result, co
|
|||||||
_mm256_zeroupper();
|
_mm256_zeroupper();
|
||||||
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
|
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
|
||||||
{
|
{
|
||||||
for(n = avx_iters * 8; n < num_points; n++)
|
for (n = avx_iters * 8; n < num_points; n++)
|
||||||
{
|
{
|
||||||
// resample code for current tap
|
// resample code for current tap
|
||||||
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
|
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
|
||||||
//Take into account that in multitap correlators, the shifts can be negative!
|
//Take into account that in multitap correlators, the shifts can be negative!
|
||||||
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ;
|
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
|
||||||
local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
|
local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
|
||||||
_result[current_correlator_tap][n] = local_code[local_code_chip_index_];
|
_result[current_correlator_tap][n] = local_code[local_code_chip_index_];
|
||||||
}
|
}
|
||||||
@ -536,19 +542,21 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_neon(float** result, con
|
|||||||
const float32x4_t rem_code_phase_chips_reg = vdupq_n_f32(rem_code_phase_chips);
|
const float32x4_t rem_code_phase_chips_reg = vdupq_n_f32(rem_code_phase_chips);
|
||||||
const float32x4_t code_phase_step_chips_reg = vdupq_n_f32(code_phase_step_chips);
|
const float32x4_t code_phase_step_chips_reg = vdupq_n_f32(code_phase_step_chips);
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(16) int32_t local_code_chip_index[4];
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
int32_t local_code_chip_index[4];
|
||||||
int32_t local_code_chip_index_;
|
int32_t local_code_chip_index_;
|
||||||
|
|
||||||
const int32x4_t zeros = vdupq_n_s32(0);
|
const int32x4_t zeros = vdupq_n_s32(0);
|
||||||
const float32x4_t code_length_chips_reg_f = vdupq_n_f32((float)code_length_chips);
|
const float32x4_t code_length_chips_reg_f = vdupq_n_f32((float)code_length_chips);
|
||||||
const int32x4_t code_length_chips_reg_i = vdupq_n_s32((int32_t)code_length_chips);
|
const int32x4_t code_length_chips_reg_i = vdupq_n_s32((int32_t)code_length_chips);
|
||||||
int32x4_t local_code_chip_index_reg, aux_i, negatives, i;
|
int32x4_t local_code_chip_index_reg, aux_i, negatives, i;
|
||||||
float32x4_t aux, aux2, shifts_chips_reg, fi, c, j, cTrunc, base, indexn, reciprocal;
|
float32x4_t aux, aux2, shifts_chips_reg, fi, c, j, cTrunc, base, indexn, reciprocal;
|
||||||
__VOLK_ATTR_ALIGNED(16) const float vec[4] = { 0.0f, 1.0f, 2.0f, 3.0f };
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
const float vec[4] = {0.0f, 1.0f, 2.0f, 3.0f};
|
||||||
uint32x4_t igx;
|
uint32x4_t igx;
|
||||||
reciprocal = vrecpeq_f32(code_length_chips_reg_f);
|
reciprocal = vrecpeq_f32(code_length_chips_reg_f);
|
||||||
reciprocal = vmulq_f32(vrecpsq_f32(code_length_chips_reg_f, reciprocal), reciprocal);
|
reciprocal = vmulq_f32(vrecpsq_f32(code_length_chips_reg_f, reciprocal), reciprocal);
|
||||||
reciprocal = vmulq_f32(vrecpsq_f32(code_length_chips_reg_f, reciprocal), reciprocal); // this refinement is required!
|
reciprocal = vmulq_f32(vrecpsq_f32(code_length_chips_reg_f, reciprocal), reciprocal); // this refinement is required!
|
||||||
float32x4_t n0 = vld1q_f32((float*)vec);
|
float32x4_t n0 = vld1q_f32((float*)vec);
|
||||||
|
|
||||||
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
|
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
|
||||||
@ -556,7 +564,7 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_neon(float** result, con
|
|||||||
shifts_chips_reg = vdupq_n_f32((float)shifts_chips[current_correlator_tap]);
|
shifts_chips_reg = vdupq_n_f32((float)shifts_chips[current_correlator_tap]);
|
||||||
aux2 = vsubq_f32(shifts_chips_reg, rem_code_phase_chips_reg);
|
aux2 = vsubq_f32(shifts_chips_reg, rem_code_phase_chips_reg);
|
||||||
indexn = n0;
|
indexn = n0;
|
||||||
for(n = 0; n < neon_iters; n++)
|
for (n = 0; n < neon_iters; n++)
|
||||||
{
|
{
|
||||||
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][4 * n + 3], 1, 0);
|
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][4 * n + 3], 1, 0);
|
||||||
__VOLK_GNSSSDR_PREFETCH(&local_code_chip_index[4]);
|
__VOLK_GNSSSDR_PREFETCH(&local_code_chip_index[4]);
|
||||||
@ -572,7 +580,7 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_neon(float** result, con
|
|||||||
|
|
||||||
// fmod
|
// fmod
|
||||||
c = vmulq_f32(aux, reciprocal);
|
c = vmulq_f32(aux, reciprocal);
|
||||||
i = vcvtq_s32_f32(c);
|
i = vcvtq_s32_f32(c);
|
||||||
cTrunc = vcvtq_f32_s32(i);
|
cTrunc = vcvtq_f32_s32(i);
|
||||||
base = vmulq_f32(cTrunc, code_length_chips_reg_f);
|
base = vmulq_f32(cTrunc, code_length_chips_reg_f);
|
||||||
aux = vsubq_f32(aux, base);
|
aux = vsubq_f32(aux, base);
|
||||||
@ -584,13 +592,13 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_neon(float** result, con
|
|||||||
|
|
||||||
vst1q_s32((int32_t*)local_code_chip_index, local_code_chip_index_reg);
|
vst1q_s32((int32_t*)local_code_chip_index, local_code_chip_index_reg);
|
||||||
|
|
||||||
for(k = 0; k < 4; ++k)
|
for (k = 0; k < 4; ++k)
|
||||||
{
|
{
|
||||||
_result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
|
_result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
|
||||||
}
|
}
|
||||||
indexn = vaddq_f32(indexn, fours);
|
indexn = vaddq_f32(indexn, fours);
|
||||||
}
|
}
|
||||||
for(n = neon_iters * 4; n < num_points; n++)
|
for (n = neon_iters * 4; n < num_points; n++)
|
||||||
{
|
{
|
||||||
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][n], 1, 0);
|
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][n], 1, 0);
|
||||||
// resample code for current tap
|
// resample code for current tap
|
||||||
@ -606,5 +614,3 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_neon(float** result, con
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
#endif /*INCLUDED_volk_gnsssdr_32f_xn_resampler_32f_xn_H*/
|
#endif /*INCLUDED_volk_gnsssdr_32f_xn_resampler_32f_xn_H*/
|
||||||
|
|
||||||
|
|
||||||
|
@ -85,11 +85,11 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic(lv_32f
|
|||||||
unsigned int n;
|
unsigned int n;
|
||||||
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
||||||
{
|
{
|
||||||
result[n_vec] = lv_cmake(0,0);
|
result[n_vec] = lv_cmake(0, 0);
|
||||||
}
|
}
|
||||||
for (n = 0; n < num_points; n++)
|
for (n = 0; n < num_points; n++)
|
||||||
{
|
{
|
||||||
tmp32_1 = *in_common++ * (*phase);//if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase));
|
tmp32_1 = *in_common++ * (*phase); //if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase));
|
||||||
|
|
||||||
// Regenerate phase
|
// Regenerate phase
|
||||||
if (n % 256 == 0)
|
if (n % 256 == 0)
|
||||||
@ -126,7 +126,7 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic_reload
|
|||||||
unsigned int j;
|
unsigned int j;
|
||||||
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
||||||
{
|
{
|
||||||
result[n_vec] = lv_cmake(0,0);
|
result[n_vec] = lv_cmake(0, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
for (n = 0; n < num_points / ROTATOR_RELOAD; n++)
|
for (n = 0; n < num_points / ROTATOR_RELOAD; n++)
|
||||||
@ -141,7 +141,7 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic_reload
|
|||||||
result[n_vec] += tmp32_2;
|
result[n_vec] += tmp32_2;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
/* Regenerate phase */
|
/* Regenerate phase */
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
(*phase) /= std::abs((*phase));
|
(*phase) /= std::abs((*phase));
|
||||||
#else
|
#else
|
||||||
@ -175,8 +175,8 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_
|
|||||||
const unsigned int sixteenthPoints = num_points / 16;
|
const unsigned int sixteenthPoints = num_points / 16;
|
||||||
|
|
||||||
const float* aPtr = (float*)in_common;
|
const float* aPtr = (float*)in_common;
|
||||||
const float* bPtr[ num_a_vectors];
|
const float* bPtr[num_a_vectors];
|
||||||
for( vec_ind = 0; vec_ind < num_a_vectors; ++vec_ind )
|
for (vec_ind = 0; vec_ind < num_a_vectors; ++vec_ind)
|
||||||
{
|
{
|
||||||
bPtr[vec_ind] = in_a[vec_ind];
|
bPtr[vec_ind] = in_a[vec_ind];
|
||||||
}
|
}
|
||||||
@ -194,7 +194,7 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_
|
|||||||
__m256 dotProdVal2[num_a_vectors];
|
__m256 dotProdVal2[num_a_vectors];
|
||||||
__m256 dotProdVal3[num_a_vectors];
|
__m256 dotProdVal3[num_a_vectors];
|
||||||
|
|
||||||
for( vec_ind = 0; vec_ind < num_a_vectors; vec_ind++ )
|
for (vec_ind = 0; vec_ind < num_a_vectors; vec_ind++)
|
||||||
{
|
{
|
||||||
dotProdVal0[vec_ind] = _mm256_setzero_ps();
|
dotProdVal0[vec_ind] = _mm256_setzero_ps();
|
||||||
dotProdVal1[vec_ind] = _mm256_setzero_ps();
|
dotProdVal1[vec_ind] = _mm256_setzero_ps();
|
||||||
@ -204,57 +204,62 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_
|
|||||||
|
|
||||||
// Set up the complex rotator
|
// Set up the complex rotator
|
||||||
__m256 z0, z1, z2, z3;
|
__m256 z0, z1, z2, z3;
|
||||||
__VOLK_ATTR_ALIGNED(32) lv_32fc_t phase_vec[16];
|
__VOLK_ATTR_ALIGNED(32)
|
||||||
for( vec_ind = 0; vec_ind < 16; ++vec_ind )
|
lv_32fc_t phase_vec[16];
|
||||||
|
for (vec_ind = 0; vec_ind < 16; ++vec_ind)
|
||||||
{
|
{
|
||||||
phase_vec[vec_ind] = _phase;
|
phase_vec[vec_ind] = _phase;
|
||||||
_phase *= phase_inc;
|
_phase *= phase_inc;
|
||||||
}
|
}
|
||||||
|
|
||||||
z0 = _mm256_load_ps( (float *)phase_vec );
|
z0 = _mm256_load_ps((float*)phase_vec);
|
||||||
z1 = _mm256_load_ps( (float *)(phase_vec + 4) );
|
z1 = _mm256_load_ps((float*)(phase_vec + 4));
|
||||||
z2 = _mm256_load_ps( (float *)(phase_vec + 8) );
|
z2 = _mm256_load_ps((float*)(phase_vec + 8));
|
||||||
z3 = _mm256_load_ps( (float *)(phase_vec + 12) );
|
z3 = _mm256_load_ps((float*)(phase_vec + 12));
|
||||||
|
|
||||||
lv_32fc_t dz = phase_inc; dz *= dz; dz *= dz; dz *= dz; dz *= dz; // dz = phase_inc^16;
|
lv_32fc_t dz = phase_inc;
|
||||||
|
dz *= dz;
|
||||||
|
dz *= dz;
|
||||||
|
dz *= dz;
|
||||||
|
dz *= dz; // dz = phase_inc^16;
|
||||||
|
|
||||||
for( vec_ind = 0; vec_ind < 4; ++vec_ind )
|
for (vec_ind = 0; vec_ind < 4; ++vec_ind)
|
||||||
{
|
{
|
||||||
phase_vec[vec_ind] = dz;
|
phase_vec[vec_ind] = dz;
|
||||||
}
|
}
|
||||||
|
|
||||||
__m256 dz_reg = _mm256_load_ps( (float *)phase_vec );
|
__m256 dz_reg = _mm256_load_ps((float*)phase_vec);
|
||||||
dz_reg = _mm256_complexnormalise_ps( dz_reg );
|
dz_reg = _mm256_complexnormalise_ps(dz_reg);
|
||||||
|
|
||||||
for(;number < sixteenthPoints; number++)
|
for (; number < sixteenthPoints; number++)
|
||||||
{
|
{
|
||||||
a0Val = _mm256_loadu_ps(aPtr);
|
a0Val = _mm256_loadu_ps(aPtr);
|
||||||
a1Val = _mm256_loadu_ps(aPtr+8);
|
a1Val = _mm256_loadu_ps(aPtr + 8);
|
||||||
a2Val = _mm256_loadu_ps(aPtr+16);
|
a2Val = _mm256_loadu_ps(aPtr + 16);
|
||||||
a3Val = _mm256_loadu_ps(aPtr+24);
|
a3Val = _mm256_loadu_ps(aPtr + 24);
|
||||||
|
|
||||||
a0Val = _mm256_complexmul_ps( a0Val, z0 );
|
a0Val = _mm256_complexmul_ps(a0Val, z0);
|
||||||
a1Val = _mm256_complexmul_ps( a1Val, z1 );
|
a1Val = _mm256_complexmul_ps(a1Val, z1);
|
||||||
a2Val = _mm256_complexmul_ps( a2Val, z2 );
|
a2Val = _mm256_complexmul_ps(a2Val, z2);
|
||||||
a3Val = _mm256_complexmul_ps( a3Val, z3 );
|
a3Val = _mm256_complexmul_ps(a3Val, z3);
|
||||||
|
|
||||||
z0 = _mm256_complexmul_ps( z0, dz_reg );
|
z0 = _mm256_complexmul_ps(z0, dz_reg);
|
||||||
z1 = _mm256_complexmul_ps( z1, dz_reg );
|
z1 = _mm256_complexmul_ps(z1, dz_reg);
|
||||||
z2 = _mm256_complexmul_ps( z2, dz_reg );
|
z2 = _mm256_complexmul_ps(z2, dz_reg);
|
||||||
z3 = _mm256_complexmul_ps( z3, dz_reg );
|
z3 = _mm256_complexmul_ps(z3, dz_reg);
|
||||||
|
|
||||||
for( vec_ind = 0; vec_ind < num_a_vectors; ++vec_ind )
|
for (vec_ind = 0; vec_ind < num_a_vectors; ++vec_ind)
|
||||||
{
|
{
|
||||||
x0Val[vec_ind] = _mm256_loadu_ps(bPtr[vec_ind]); // t0|t1|t2|t3|t4|t5|t6|t7
|
x0Val[vec_ind] = _mm256_loadu_ps(bPtr[vec_ind]); // t0|t1|t2|t3|t4|t5|t6|t7
|
||||||
x1Val[vec_ind] = _mm256_loadu_ps(bPtr[vec_ind]+8);
|
x1Val[vec_ind] = _mm256_loadu_ps(bPtr[vec_ind] + 8);
|
||||||
x0loVal[vec_ind] = _mm256_unpacklo_ps(x0Val[vec_ind], x0Val[vec_ind]); // t0|t0|t1|t1|t4|t4|t5|t5
|
x0loVal[vec_ind] = _mm256_unpacklo_ps(x0Val[vec_ind], x0Val[vec_ind]); // t0|t0|t1|t1|t4|t4|t5|t5
|
||||||
x0hiVal[vec_ind] = _mm256_unpackhi_ps(x0Val[vec_ind], x0Val[vec_ind]); // t2|t2|t3|t3|t6|t6|t7|t7
|
x0hiVal[vec_ind] = _mm256_unpackhi_ps(x0Val[vec_ind], x0Val[vec_ind]); // t2|t2|t3|t3|t6|t6|t7|t7
|
||||||
x1loVal[vec_ind] = _mm256_unpacklo_ps(x1Val[vec_ind], x1Val[vec_ind]);
|
x1loVal[vec_ind] = _mm256_unpacklo_ps(x1Val[vec_ind], x1Val[vec_ind]);
|
||||||
x1hiVal[vec_ind] = _mm256_unpackhi_ps(x1Val[vec_ind], x1Val[vec_ind]);
|
x1hiVal[vec_ind] = _mm256_unpackhi_ps(x1Val[vec_ind], x1Val[vec_ind]);
|
||||||
|
|
||||||
// TODO: it may be possible to rearrange swizzling to better pipeline data
|
// TODO: it may be possible to rearrange swizzling to better pipeline data
|
||||||
b0Val[vec_ind] = _mm256_permute2f128_ps(x0loVal[vec_ind], x0hiVal[vec_ind], 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
|
b0Val[vec_ind] = _mm256_permute2f128_ps(x0loVal[vec_ind], x0hiVal[vec_ind], 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
|
||||||
b1Val[vec_ind] = _mm256_permute2f128_ps(x0loVal[vec_ind], x0hiVal[vec_ind], 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
|
b1Val[vec_ind] = _mm256_permute2f128_ps(x0loVal[vec_ind], x0hiVal[vec_ind], 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
|
||||||
b2Val[vec_ind] = _mm256_permute2f128_ps(x1loVal[vec_ind], x1hiVal[vec_ind], 0x20);
|
b2Val[vec_ind] = _mm256_permute2f128_ps(x1loVal[vec_ind], x1hiVal[vec_ind], 0x20);
|
||||||
b3Val[vec_ind] = _mm256_permute2f128_ps(x1loVal[vec_ind], x1hiVal[vec_ind], 0x31);
|
b3Val[vec_ind] = _mm256_permute2f128_ps(x1loVal[vec_ind], x1hiVal[vec_ind], 0x31);
|
||||||
|
|
||||||
@ -274,43 +279,44 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_
|
|||||||
// Force the rotators back onto the unit circle
|
// Force the rotators back onto the unit circle
|
||||||
if ((number % 64) == 0)
|
if ((number % 64) == 0)
|
||||||
{
|
{
|
||||||
z0 = _mm256_complexnormalise_ps( z0 );
|
z0 = _mm256_complexnormalise_ps(z0);
|
||||||
z1 = _mm256_complexnormalise_ps( z1 );
|
z1 = _mm256_complexnormalise_ps(z1);
|
||||||
z2 = _mm256_complexnormalise_ps( z2 );
|
z2 = _mm256_complexnormalise_ps(z2);
|
||||||
z3 = _mm256_complexnormalise_ps( z3 );
|
z3 = _mm256_complexnormalise_ps(z3);
|
||||||
}
|
}
|
||||||
|
|
||||||
aPtr += 32;
|
aPtr += 32;
|
||||||
}
|
}
|
||||||
__VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4];
|
__VOLK_ATTR_ALIGNED(32)
|
||||||
|
lv_32fc_t dotProductVector[4];
|
||||||
|
|
||||||
for( vec_ind = 0; vec_ind < num_a_vectors; ++vec_ind )
|
for (vec_ind = 0; vec_ind < num_a_vectors; ++vec_ind)
|
||||||
{
|
{
|
||||||
dotProdVal0[vec_ind] = _mm256_add_ps(dotProdVal0[vec_ind], dotProdVal1[vec_ind]);
|
dotProdVal0[vec_ind] = _mm256_add_ps(dotProdVal0[vec_ind], dotProdVal1[vec_ind]);
|
||||||
dotProdVal0[vec_ind] = _mm256_add_ps(dotProdVal0[vec_ind], dotProdVal2[vec_ind]);
|
dotProdVal0[vec_ind] = _mm256_add_ps(dotProdVal0[vec_ind], dotProdVal2[vec_ind]);
|
||||||
dotProdVal0[vec_ind] = _mm256_add_ps(dotProdVal0[vec_ind], dotProdVal3[vec_ind]);
|
dotProdVal0[vec_ind] = _mm256_add_ps(dotProdVal0[vec_ind], dotProdVal3[vec_ind]);
|
||||||
|
|
||||||
_mm256_store_ps((float *)dotProductVector, dotProdVal0[vec_ind]); // Store the results back into the dot product vector
|
_mm256_store_ps((float*)dotProductVector, dotProdVal0[vec_ind]); // Store the results back into the dot product vector
|
||||||
|
|
||||||
result[ vec_ind ] = lv_cmake( 0, 0 );
|
result[vec_ind] = lv_cmake(0, 0);
|
||||||
for( i = 0; i < 4; ++i )
|
for (i = 0; i < 4; ++i)
|
||||||
{
|
{
|
||||||
result[vec_ind] += dotProductVector[i];
|
result[vec_ind] += dotProductVector[i];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
z0 = _mm256_complexnormalise_ps( z0 );
|
z0 = _mm256_complexnormalise_ps(z0);
|
||||||
_mm256_store_ps((float*)phase_vec, z0);
|
_mm256_store_ps((float*)phase_vec, z0);
|
||||||
_phase = phase_vec[0];
|
_phase = phase_vec[0];
|
||||||
_mm256_zeroupper();
|
_mm256_zeroupper();
|
||||||
|
|
||||||
number = sixteenthPoints*16;
|
number = sixteenthPoints * 16;
|
||||||
for(;number < num_points; number++)
|
for (; number < num_points; number++)
|
||||||
{
|
{
|
||||||
wo = (*aPtr++)*_phase;
|
wo = (*aPtr++) * _phase;
|
||||||
_phase *= phase_inc;
|
_phase *= phase_inc;
|
||||||
|
|
||||||
for( vec_ind = 0; vec_ind < num_a_vectors; ++vec_ind )
|
for (vec_ind = 0; vec_ind < num_a_vectors; ++vec_ind)
|
||||||
{
|
{
|
||||||
result[vec_ind] += wo * in_a[vec_ind][number];
|
result[vec_ind] += wo * in_a[vec_ind][number];
|
||||||
}
|
}
|
||||||
@ -333,8 +339,8 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_
|
|||||||
const unsigned int sixteenthPoints = num_points / 16;
|
const unsigned int sixteenthPoints = num_points / 16;
|
||||||
|
|
||||||
const float* aPtr = (float*)in_common;
|
const float* aPtr = (float*)in_common;
|
||||||
const float* bPtr[ num_a_vectors];
|
const float* bPtr[num_a_vectors];
|
||||||
for( vec_ind = 0; vec_ind < num_a_vectors; ++vec_ind )
|
for (vec_ind = 0; vec_ind < num_a_vectors; ++vec_ind)
|
||||||
{
|
{
|
||||||
bPtr[vec_ind] = in_a[vec_ind];
|
bPtr[vec_ind] = in_a[vec_ind];
|
||||||
}
|
}
|
||||||
@ -352,7 +358,7 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_
|
|||||||
__m256 dotProdVal2[num_a_vectors];
|
__m256 dotProdVal2[num_a_vectors];
|
||||||
__m256 dotProdVal3[num_a_vectors];
|
__m256 dotProdVal3[num_a_vectors];
|
||||||
|
|
||||||
for( vec_ind = 0; vec_ind < num_a_vectors; vec_ind++ )
|
for (vec_ind = 0; vec_ind < num_a_vectors; vec_ind++)
|
||||||
{
|
{
|
||||||
dotProdVal0[vec_ind] = _mm256_setzero_ps();
|
dotProdVal0[vec_ind] = _mm256_setzero_ps();
|
||||||
dotProdVal1[vec_ind] = _mm256_setzero_ps();
|
dotProdVal1[vec_ind] = _mm256_setzero_ps();
|
||||||
@ -362,58 +368,62 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_
|
|||||||
|
|
||||||
// Set up the complex rotator
|
// Set up the complex rotator
|
||||||
__m256 z0, z1, z2, z3;
|
__m256 z0, z1, z2, z3;
|
||||||
__VOLK_ATTR_ALIGNED(32) lv_32fc_t phase_vec[16];
|
__VOLK_ATTR_ALIGNED(32)
|
||||||
for( vec_ind = 0; vec_ind < 16; ++vec_ind )
|
lv_32fc_t phase_vec[16];
|
||||||
|
for (vec_ind = 0; vec_ind < 16; ++vec_ind)
|
||||||
{
|
{
|
||||||
phase_vec[vec_ind] = _phase;
|
phase_vec[vec_ind] = _phase;
|
||||||
_phase *= phase_inc;
|
_phase *= phase_inc;
|
||||||
}
|
}
|
||||||
|
|
||||||
z0 = _mm256_load_ps( (float *)phase_vec );
|
z0 = _mm256_load_ps((float*)phase_vec);
|
||||||
z1 = _mm256_load_ps( (float *)(phase_vec + 4) );
|
z1 = _mm256_load_ps((float*)(phase_vec + 4));
|
||||||
z2 = _mm256_load_ps( (float *)(phase_vec + 8) );
|
z2 = _mm256_load_ps((float*)(phase_vec + 8));
|
||||||
z3 = _mm256_load_ps( (float *)(phase_vec + 12) );
|
z3 = _mm256_load_ps((float*)(phase_vec + 12));
|
||||||
|
|
||||||
lv_32fc_t dz = phase_inc; dz *= dz; dz *= dz; dz *= dz; dz *= dz; // dz = phase_inc^16;
|
lv_32fc_t dz = phase_inc;
|
||||||
|
dz *= dz;
|
||||||
|
dz *= dz;
|
||||||
|
dz *= dz;
|
||||||
|
dz *= dz; // dz = phase_inc^16;
|
||||||
|
|
||||||
for( vec_ind = 0; vec_ind < 4; ++vec_ind )
|
for (vec_ind = 0; vec_ind < 4; ++vec_ind)
|
||||||
{
|
{
|
||||||
phase_vec[vec_ind] = dz;
|
phase_vec[vec_ind] = dz;
|
||||||
}
|
}
|
||||||
|
|
||||||
__m256 dz_reg = _mm256_load_ps( (float *)phase_vec );
|
__m256 dz_reg = _mm256_load_ps((float*)phase_vec);
|
||||||
dz_reg = _mm256_complexnormalise_ps( dz_reg );
|
dz_reg = _mm256_complexnormalise_ps(dz_reg);
|
||||||
|
|
||||||
for(;number < sixteenthPoints; number++)
|
for (; number < sixteenthPoints; number++)
|
||||||
{
|
{
|
||||||
|
|
||||||
a0Val = _mm256_load_ps(aPtr);
|
a0Val = _mm256_load_ps(aPtr);
|
||||||
a1Val = _mm256_load_ps(aPtr+8);
|
a1Val = _mm256_load_ps(aPtr + 8);
|
||||||
a2Val = _mm256_load_ps(aPtr+16);
|
a2Val = _mm256_load_ps(aPtr + 16);
|
||||||
a3Val = _mm256_load_ps(aPtr+24);
|
a3Val = _mm256_load_ps(aPtr + 24);
|
||||||
|
|
||||||
a0Val = _mm256_complexmul_ps( a0Val, z0 );
|
a0Val = _mm256_complexmul_ps(a0Val, z0);
|
||||||
a1Val = _mm256_complexmul_ps( a1Val, z1 );
|
a1Val = _mm256_complexmul_ps(a1Val, z1);
|
||||||
a2Val = _mm256_complexmul_ps( a2Val, z2 );
|
a2Val = _mm256_complexmul_ps(a2Val, z2);
|
||||||
a3Val = _mm256_complexmul_ps( a3Val, z3 );
|
a3Val = _mm256_complexmul_ps(a3Val, z3);
|
||||||
|
|
||||||
z0 = _mm256_complexmul_ps( z0, dz_reg );
|
z0 = _mm256_complexmul_ps(z0, dz_reg);
|
||||||
z1 = _mm256_complexmul_ps( z1, dz_reg );
|
z1 = _mm256_complexmul_ps(z1, dz_reg);
|
||||||
z2 = _mm256_complexmul_ps( z2, dz_reg );
|
z2 = _mm256_complexmul_ps(z2, dz_reg);
|
||||||
z3 = _mm256_complexmul_ps( z3, dz_reg );
|
z3 = _mm256_complexmul_ps(z3, dz_reg);
|
||||||
|
|
||||||
for( vec_ind = 0; vec_ind < num_a_vectors; ++vec_ind )
|
for (vec_ind = 0; vec_ind < num_a_vectors; ++vec_ind)
|
||||||
{
|
{
|
||||||
x0Val[vec_ind] = _mm256_loadu_ps(bPtr[vec_ind]); // t0|t1|t2|t3|t4|t5|t6|t7
|
x0Val[vec_ind] = _mm256_loadu_ps(bPtr[vec_ind]); // t0|t1|t2|t3|t4|t5|t6|t7
|
||||||
x1Val[vec_ind] = _mm256_loadu_ps(bPtr[vec_ind]+8);
|
x1Val[vec_ind] = _mm256_loadu_ps(bPtr[vec_ind] + 8);
|
||||||
x0loVal[vec_ind] = _mm256_unpacklo_ps(x0Val[vec_ind], x0Val[vec_ind]); // t0|t0|t1|t1|t4|t4|t5|t5
|
x0loVal[vec_ind] = _mm256_unpacklo_ps(x0Val[vec_ind], x0Val[vec_ind]); // t0|t0|t1|t1|t4|t4|t5|t5
|
||||||
x0hiVal[vec_ind] = _mm256_unpackhi_ps(x0Val[vec_ind], x0Val[vec_ind]); // t2|t2|t3|t3|t6|t6|t7|t7
|
x0hiVal[vec_ind] = _mm256_unpackhi_ps(x0Val[vec_ind], x0Val[vec_ind]); // t2|t2|t3|t3|t6|t6|t7|t7
|
||||||
x1loVal[vec_ind] = _mm256_unpacklo_ps(x1Val[vec_ind], x1Val[vec_ind]);
|
x1loVal[vec_ind] = _mm256_unpacklo_ps(x1Val[vec_ind], x1Val[vec_ind]);
|
||||||
x1hiVal[vec_ind] = _mm256_unpackhi_ps(x1Val[vec_ind], x1Val[vec_ind]);
|
x1hiVal[vec_ind] = _mm256_unpackhi_ps(x1Val[vec_ind], x1Val[vec_ind]);
|
||||||
|
|
||||||
// TODO: it may be possible to rearrange swizzling to better pipeline data
|
// TODO: it may be possible to rearrange swizzling to better pipeline data
|
||||||
b0Val[vec_ind] = _mm256_permute2f128_ps(x0loVal[vec_ind], x0hiVal[vec_ind], 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
|
b0Val[vec_ind] = _mm256_permute2f128_ps(x0loVal[vec_ind], x0hiVal[vec_ind], 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
|
||||||
b1Val[vec_ind] = _mm256_permute2f128_ps(x0loVal[vec_ind], x0hiVal[vec_ind], 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
|
b1Val[vec_ind] = _mm256_permute2f128_ps(x0loVal[vec_ind], x0hiVal[vec_ind], 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
|
||||||
b2Val[vec_ind] = _mm256_permute2f128_ps(x1loVal[vec_ind], x1hiVal[vec_ind], 0x20);
|
b2Val[vec_ind] = _mm256_permute2f128_ps(x1loVal[vec_ind], x1hiVal[vec_ind], 0x20);
|
||||||
b3Val[vec_ind] = _mm256_permute2f128_ps(x1loVal[vec_ind], x1hiVal[vec_ind], 0x31);
|
b3Val[vec_ind] = _mm256_permute2f128_ps(x1loVal[vec_ind], x1hiVal[vec_ind], 0x31);
|
||||||
|
|
||||||
@ -433,43 +443,44 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_
|
|||||||
// Force the rotators back onto the unit circle
|
// Force the rotators back onto the unit circle
|
||||||
if ((number % 64) == 0)
|
if ((number % 64) == 0)
|
||||||
{
|
{
|
||||||
z0 = _mm256_complexnormalise_ps( z0 );
|
z0 = _mm256_complexnormalise_ps(z0);
|
||||||
z1 = _mm256_complexnormalise_ps( z1 );
|
z1 = _mm256_complexnormalise_ps(z1);
|
||||||
z2 = _mm256_complexnormalise_ps( z2 );
|
z2 = _mm256_complexnormalise_ps(z2);
|
||||||
z3 = _mm256_complexnormalise_ps( z3 );
|
z3 = _mm256_complexnormalise_ps(z3);
|
||||||
}
|
}
|
||||||
|
|
||||||
aPtr += 32;
|
aPtr += 32;
|
||||||
}
|
}
|
||||||
__VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4];
|
__VOLK_ATTR_ALIGNED(32)
|
||||||
|
lv_32fc_t dotProductVector[4];
|
||||||
|
|
||||||
for( vec_ind = 0; vec_ind < num_a_vectors; ++vec_ind )
|
for (vec_ind = 0; vec_ind < num_a_vectors; ++vec_ind)
|
||||||
{
|
{
|
||||||
dotProdVal0[vec_ind] = _mm256_add_ps(dotProdVal0[vec_ind], dotProdVal1[vec_ind]);
|
dotProdVal0[vec_ind] = _mm256_add_ps(dotProdVal0[vec_ind], dotProdVal1[vec_ind]);
|
||||||
dotProdVal0[vec_ind] = _mm256_add_ps(dotProdVal0[vec_ind], dotProdVal2[vec_ind]);
|
dotProdVal0[vec_ind] = _mm256_add_ps(dotProdVal0[vec_ind], dotProdVal2[vec_ind]);
|
||||||
dotProdVal0[vec_ind] = _mm256_add_ps(dotProdVal0[vec_ind], dotProdVal3[vec_ind]);
|
dotProdVal0[vec_ind] = _mm256_add_ps(dotProdVal0[vec_ind], dotProdVal3[vec_ind]);
|
||||||
|
|
||||||
_mm256_store_ps((float *)dotProductVector, dotProdVal0[vec_ind]); // Store the results back into the dot product vector
|
_mm256_store_ps((float*)dotProductVector, dotProdVal0[vec_ind]); // Store the results back into the dot product vector
|
||||||
|
|
||||||
result[ vec_ind ] = lv_cmake( 0, 0 );
|
result[vec_ind] = lv_cmake(0, 0);
|
||||||
for( i = 0; i < 4; ++i )
|
for (i = 0; i < 4; ++i)
|
||||||
{
|
{
|
||||||
result[vec_ind] += dotProductVector[i];
|
result[vec_ind] += dotProductVector[i];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
z0 = _mm256_complexnormalise_ps( z0 );
|
z0 = _mm256_complexnormalise_ps(z0);
|
||||||
_mm256_store_ps((float*)phase_vec, z0);
|
_mm256_store_ps((float*)phase_vec, z0);
|
||||||
_phase = phase_vec[0];
|
_phase = phase_vec[0];
|
||||||
_mm256_zeroupper();
|
_mm256_zeroupper();
|
||||||
|
|
||||||
number = sixteenthPoints*16;
|
number = sixteenthPoints * 16;
|
||||||
for(;number < num_points; number++)
|
for (; number < num_points; number++)
|
||||||
{
|
{
|
||||||
wo = (*aPtr++)*_phase;
|
wo = (*aPtr++) * _phase;
|
||||||
_phase *= phase_inc;
|
_phase *= phase_inc;
|
||||||
|
|
||||||
for( vec_ind = 0; vec_ind < num_a_vectors; ++vec_ind )
|
for (vec_ind = 0; vec_ind < num_a_vectors; ++vec_ind)
|
||||||
{
|
{
|
||||||
result[vec_ind] += wo * in_a[vec_ind][number];
|
result[vec_ind] += wo * in_a[vec_ind][number];
|
||||||
}
|
}
|
||||||
@ -482,5 +493,3 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_
|
|||||||
#endif /* LV_HAVE_AVX */
|
#endif /* LV_HAVE_AVX */
|
||||||
|
|
||||||
#endif /* INCLUDED_volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_H */
|
#endif /* INCLUDED_volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_H */
|
||||||
|
|
||||||
|
|
||||||
|
@ -42,7 +42,7 @@
|
|||||||
|
|
||||||
#ifdef LV_HAVE_GENERIC
|
#ifdef LV_HAVE_GENERIC
|
||||||
|
|
||||||
static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_generic(lv_32fc_t* result, const lv_32fc_t* local_code, const float* in, unsigned int num_points)
|
static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_generic(lv_32fc_t* result, const lv_32fc_t* local_code, const float* in, unsigned int num_points)
|
||||||
{
|
{
|
||||||
// phases must be normalized. Phase rotator expects a complex exponential input!
|
// phases must be normalized. Phase rotator expects a complex exponential input!
|
||||||
float rem_carrier_phase_in_rad = 0.25;
|
float rem_carrier_phase_in_rad = 0.25;
|
||||||
@ -53,15 +53,15 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_generic(lv
|
|||||||
phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad));
|
phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad));
|
||||||
unsigned int n;
|
unsigned int n;
|
||||||
int num_a_vectors = 3;
|
int num_a_vectors = 3;
|
||||||
float ** in_a = (float **)volk_gnsssdr_malloc(sizeof(float *) * num_a_vectors, volk_gnsssdr_get_alignment());
|
float** in_a = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_a_vectors, volk_gnsssdr_get_alignment());
|
||||||
for(n = 0; n < num_a_vectors; n++)
|
for (n = 0; n < num_a_vectors; n++)
|
||||||
{
|
{
|
||||||
in_a[n] = (float *)volk_gnsssdr_malloc(sizeof(float ) * num_points, volk_gnsssdr_get_alignment());
|
in_a[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment());
|
||||||
memcpy((float*)in_a[n], (float*)in, sizeof(float) * num_points);
|
memcpy((float*)in_a[n], (float*)in, sizeof(float) * num_points);
|
||||||
}
|
}
|
||||||
volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic(result, local_code, phase_inc[0], phase, (const float**) in_a, num_a_vectors, num_points);
|
volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic(result, local_code, phase_inc[0], phase, (const float**)in_a, num_a_vectors, num_points);
|
||||||
|
|
||||||
for(n = 0; n < num_a_vectors; n++)
|
for (n = 0; n < num_a_vectors; n++)
|
||||||
{
|
{
|
||||||
volk_gnsssdr_free(in_a[n]);
|
volk_gnsssdr_free(in_a[n]);
|
||||||
}
|
}
|
||||||
@ -71,7 +71,7 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_generic(lv
|
|||||||
|
|
||||||
|
|
||||||
#ifdef LV_HAVE_GENERIC
|
#ifdef LV_HAVE_GENERIC
|
||||||
static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_generic_reload(lv_32fc_t* result, const lv_32fc_t* local_code, const float* in, unsigned int num_points)
|
static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_generic_reload(lv_32fc_t* result, const lv_32fc_t* local_code, const float* in, unsigned int num_points)
|
||||||
{
|
{
|
||||||
// phases must be normalized. Phase rotator expects a complex exponential input!
|
// phases must be normalized. Phase rotator expects a complex exponential input!
|
||||||
float rem_carrier_phase_in_rad = 0.25;
|
float rem_carrier_phase_in_rad = 0.25;
|
||||||
@ -82,15 +82,15 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_generic_re
|
|||||||
phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad));
|
phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad));
|
||||||
unsigned int n;
|
unsigned int n;
|
||||||
int num_a_vectors = 3;
|
int num_a_vectors = 3;
|
||||||
float ** in_a = (float **)volk_gnsssdr_malloc(sizeof(float *) * num_a_vectors, volk_gnsssdr_get_alignment());
|
float** in_a = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_a_vectors, volk_gnsssdr_get_alignment());
|
||||||
for(n = 0; n < num_a_vectors; n++)
|
for (n = 0; n < num_a_vectors; n++)
|
||||||
{
|
{
|
||||||
in_a[n] = (float *)volk_gnsssdr_malloc(sizeof(float ) * num_points, volk_gnsssdr_get_alignment());
|
in_a[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment());
|
||||||
memcpy((float*)in_a[n], (float*)in, sizeof(float) * num_points);
|
memcpy((float*)in_a[n], (float*)in, sizeof(float) * num_points);
|
||||||
}
|
}
|
||||||
volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic_reload(result, local_code, phase_inc[0], phase, (const float**) in_a, num_a_vectors, num_points);
|
volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic_reload(result, local_code, phase_inc[0], phase, (const float**)in_a, num_a_vectors, num_points);
|
||||||
|
|
||||||
for(n = 0; n < num_a_vectors; n++)
|
for (n = 0; n < num_a_vectors; n++)
|
||||||
{
|
{
|
||||||
volk_gnsssdr_free(in_a[n]);
|
volk_gnsssdr_free(in_a[n]);
|
||||||
}
|
}
|
||||||
@ -100,7 +100,7 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_generic_re
|
|||||||
#endif // Generic
|
#endif // Generic
|
||||||
|
|
||||||
#ifdef LV_HAVE_AVX
|
#ifdef LV_HAVE_AVX
|
||||||
static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_u_avx(lv_32fc_t* result, const lv_32fc_t* local_code, const float* in, unsigned int num_points)
|
static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_u_avx(lv_32fc_t* result, const lv_32fc_t* local_code, const float* in, unsigned int num_points)
|
||||||
{
|
{
|
||||||
// phases must be normalized. Phase rotator expects a complex exponential input!
|
// phases must be normalized. Phase rotator expects a complex exponential input!
|
||||||
float rem_carrier_phase_in_rad = 0.25;
|
float rem_carrier_phase_in_rad = 0.25;
|
||||||
@ -111,15 +111,15 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_u_avx(lv_3
|
|||||||
phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad));
|
phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad));
|
||||||
unsigned int n;
|
unsigned int n;
|
||||||
int num_a_vectors = 3;
|
int num_a_vectors = 3;
|
||||||
float ** in_a = (float **)volk_gnsssdr_malloc(sizeof(float *) * num_a_vectors, volk_gnsssdr_get_alignment());
|
float** in_a = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_a_vectors, volk_gnsssdr_get_alignment());
|
||||||
for(n = 0; n < num_a_vectors; n++)
|
for (n = 0; n < num_a_vectors; n++)
|
||||||
{
|
{
|
||||||
in_a[n] = (float *)volk_gnsssdr_malloc(sizeof(float ) * num_points, volk_gnsssdr_get_alignment());
|
in_a[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment());
|
||||||
memcpy((float*)in_a[n], (float*)in, sizeof(float) * num_points);
|
memcpy((float*)in_a[n], (float*)in, sizeof(float) * num_points);
|
||||||
}
|
}
|
||||||
volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_u_avx(result, local_code, phase_inc[0], phase, (const float**) in_a, num_a_vectors, num_points);
|
volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_u_avx(result, local_code, phase_inc[0], phase, (const float**)in_a, num_a_vectors, num_points);
|
||||||
|
|
||||||
for(n = 0; n < num_a_vectors; n++)
|
for (n = 0; n < num_a_vectors; n++)
|
||||||
{
|
{
|
||||||
volk_gnsssdr_free(in_a[n]);
|
volk_gnsssdr_free(in_a[n]);
|
||||||
}
|
}
|
||||||
@ -130,7 +130,7 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_u_avx(lv_3
|
|||||||
|
|
||||||
|
|
||||||
#ifdef LV_HAVE_AVX
|
#ifdef LV_HAVE_AVX
|
||||||
static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_a_avx(lv_32fc_t* result, const lv_32fc_t* local_code, const float* in, unsigned int num_points)
|
static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_a_avx(lv_32fc_t* result, const lv_32fc_t* local_code, const float* in, unsigned int num_points)
|
||||||
{
|
{
|
||||||
// phases must be normalized. Phase rotator expects a complex exponential input!
|
// phases must be normalized. Phase rotator expects a complex exponential input!
|
||||||
float rem_carrier_phase_in_rad = 0.25;
|
float rem_carrier_phase_in_rad = 0.25;
|
||||||
@ -141,15 +141,15 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_a_avx(lv_3
|
|||||||
phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad));
|
phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad));
|
||||||
unsigned int n;
|
unsigned int n;
|
||||||
int num_a_vectors = 3;
|
int num_a_vectors = 3;
|
||||||
float ** in_a = (float **)volk_gnsssdr_malloc(sizeof(float *) * num_a_vectors, volk_gnsssdr_get_alignment());
|
float** in_a = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_a_vectors, volk_gnsssdr_get_alignment());
|
||||||
for(n = 0; n < num_a_vectors; n++)
|
for (n = 0; n < num_a_vectors; n++)
|
||||||
{
|
{
|
||||||
in_a[n] = (float *)volk_gnsssdr_malloc(sizeof(float ) * num_points, volk_gnsssdr_get_alignment());
|
in_a[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment());
|
||||||
memcpy((float*)in_a[n], (float*)in, sizeof(float) * num_points);
|
memcpy((float*)in_a[n], (float*)in, sizeof(float) * num_points);
|
||||||
}
|
}
|
||||||
volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_a_avx(result, local_code, phase_inc[0], phase, (const float**) in_a, num_a_vectors, num_points);
|
volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_a_avx(result, local_code, phase_inc[0], phase, (const float**)in_a, num_a_vectors, num_points);
|
||||||
|
|
||||||
for(n = 0; n < num_a_vectors; n++)
|
for (n = 0; n < num_a_vectors; n++)
|
||||||
{
|
{
|
||||||
volk_gnsssdr_free(in_a[n]);
|
volk_gnsssdr_free(in_a[n]);
|
||||||
}
|
}
|
||||||
@ -159,4 +159,3 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_a_avx(lv_3
|
|||||||
#endif // AVX
|
#endif // AVX
|
||||||
|
|
||||||
#endif // INCLUDED_volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_H
|
#endif // INCLUDED_volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_H
|
||||||
|
|
||||||
|
@ -80,10 +80,12 @@ static inline void volk_gnsssdr_32fc_convert_16ic_u_sse2(lv_16sc_t* outputVector
|
|||||||
const __m128 vmin_val = _mm_set_ps1(min_val);
|
const __m128 vmin_val = _mm_set_ps1(min_val);
|
||||||
const __m128 vmax_val = _mm_set_ps1(max_val);
|
const __m128 vmax_val = _mm_set_ps1(max_val);
|
||||||
|
|
||||||
for(i = 0; i < sse_iters; i++)
|
for (i = 0; i < sse_iters; i++)
|
||||||
{
|
{
|
||||||
inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
|
inputVal1 = _mm_loadu_ps((float*)inputVectorPtr);
|
||||||
inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
|
inputVectorPtr += 4;
|
||||||
|
inputVal2 = _mm_loadu_ps((float*)inputVectorPtr);
|
||||||
|
inputVectorPtr += 4;
|
||||||
__VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 8);
|
__VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 8);
|
||||||
|
|
||||||
// Clip
|
// Clip
|
||||||
@ -99,12 +101,12 @@ static inline void volk_gnsssdr_32fc_convert_16ic_u_sse2(lv_16sc_t* outputVector
|
|||||||
outputVectorPtr += 8;
|
outputVectorPtr += 8;
|
||||||
}
|
}
|
||||||
|
|
||||||
for(i = sse_iters * 8; i < num_points * 2; i++)
|
for (i = sse_iters * 8; i < num_points * 2; i++)
|
||||||
{
|
{
|
||||||
aux = *inputVectorPtr++;
|
aux = *inputVectorPtr++;
|
||||||
if(aux > max_val)
|
if (aux > max_val)
|
||||||
aux = max_val;
|
aux = max_val;
|
||||||
else if(aux < min_val)
|
else if (aux < min_val)
|
||||||
aux = min_val;
|
aux = min_val;
|
||||||
*outputVectorPtr++ = (int16_t)rintf(aux);
|
*outputVectorPtr++ = (int16_t)rintf(aux);
|
||||||
}
|
}
|
||||||
@ -128,15 +130,17 @@ static inline void volk_gnsssdr_32fc_convert_16ic_u_sse(lv_16sc_t* outputVector,
|
|||||||
const float max_val = (float)SHRT_MAX;
|
const float max_val = (float)SHRT_MAX;
|
||||||
|
|
||||||
__m128 inputVal1, inputVal2;
|
__m128 inputVal1, inputVal2;
|
||||||
__m128i intInputVal1, intInputVal2; // is __m128i defined in xmmintrin.h?
|
__m128i intInputVal1, intInputVal2; // is __m128i defined in xmmintrin.h?
|
||||||
__m128 ret1, ret2;
|
__m128 ret1, ret2;
|
||||||
const __m128 vmin_val = _mm_set_ps1(min_val);
|
const __m128 vmin_val = _mm_set_ps1(min_val);
|
||||||
const __m128 vmax_val = _mm_set_ps1(max_val);
|
const __m128 vmax_val = _mm_set_ps1(max_val);
|
||||||
|
|
||||||
for(i = 0;i < sse_iters; i++)
|
for (i = 0; i < sse_iters; i++)
|
||||||
{
|
{
|
||||||
inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
|
inputVal1 = _mm_loadu_ps((float*)inputVectorPtr);
|
||||||
inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
|
inputVectorPtr += 4;
|
||||||
|
inputVal2 = _mm_loadu_ps((float*)inputVectorPtr);
|
||||||
|
inputVectorPtr += 4;
|
||||||
__VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 8);
|
__VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 8);
|
||||||
|
|
||||||
// Clip
|
// Clip
|
||||||
@ -152,12 +156,12 @@ static inline void volk_gnsssdr_32fc_convert_16ic_u_sse(lv_16sc_t* outputVector,
|
|||||||
outputVectorPtr += 8;
|
outputVectorPtr += 8;
|
||||||
}
|
}
|
||||||
|
|
||||||
for(i = sse_iters * 8; i < num_points*2; i++)
|
for (i = sse_iters * 8; i < num_points * 2; i++)
|
||||||
{
|
{
|
||||||
aux = *inputVectorPtr++;
|
aux = *inputVectorPtr++;
|
||||||
if(aux > max_val)
|
if (aux > max_val)
|
||||||
aux = max_val;
|
aux = max_val;
|
||||||
else if(aux < min_val)
|
else if (aux < min_val)
|
||||||
aux = min_val;
|
aux = min_val;
|
||||||
*outputVectorPtr++ = (int16_t)rintf(aux);
|
*outputVectorPtr++ = (int16_t)rintf(aux);
|
||||||
}
|
}
|
||||||
@ -175,7 +179,7 @@ static inline void volk_gnsssdr_32fc_convert_16ic_u_avx2(lv_16sc_t* outputVector
|
|||||||
int16_t* outputVectorPtr = (int16_t*)outputVector;
|
int16_t* outputVectorPtr = (int16_t*)outputVector;
|
||||||
float aux;
|
float aux;
|
||||||
unsigned int i;
|
unsigned int i;
|
||||||
const float min_val = (float)SHRT_MIN; ///todo Something off here, compiler does not perform right cast
|
const float min_val = (float)SHRT_MIN; ///todo Something off here, compiler does not perform right cast
|
||||||
const float max_val = (float)SHRT_MAX;
|
const float max_val = (float)SHRT_MAX;
|
||||||
|
|
||||||
__m256 inputVal1, inputVal2;
|
__m256 inputVal1, inputVal2;
|
||||||
@ -184,10 +188,12 @@ static inline void volk_gnsssdr_32fc_convert_16ic_u_avx2(lv_16sc_t* outputVector
|
|||||||
const __m256 vmin_val = _mm256_set1_ps(min_val);
|
const __m256 vmin_val = _mm256_set1_ps(min_val);
|
||||||
const __m256 vmax_val = _mm256_set1_ps(max_val);
|
const __m256 vmax_val = _mm256_set1_ps(max_val);
|
||||||
|
|
||||||
for(i = 0; i < avx2_iters; i++)
|
for (i = 0; i < avx2_iters; i++)
|
||||||
{
|
{
|
||||||
inputVal1 = _mm256_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 8;
|
inputVal1 = _mm256_loadu_ps((float*)inputVectorPtr);
|
||||||
inputVal2 = _mm256_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 8;
|
inputVectorPtr += 8;
|
||||||
|
inputVal2 = _mm256_loadu_ps((float*)inputVectorPtr);
|
||||||
|
inputVectorPtr += 8;
|
||||||
__VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 16);
|
__VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 16);
|
||||||
|
|
||||||
// Clip
|
// Clip
|
||||||
@ -204,12 +210,12 @@ static inline void volk_gnsssdr_32fc_convert_16ic_u_avx2(lv_16sc_t* outputVector
|
|||||||
outputVectorPtr += 16;
|
outputVectorPtr += 16;
|
||||||
}
|
}
|
||||||
|
|
||||||
for(i = avx2_iters * 16; i < num_points * 2; i++)
|
for (i = avx2_iters * 16; i < num_points * 2; i++)
|
||||||
{
|
{
|
||||||
aux = *inputVectorPtr++;
|
aux = *inputVectorPtr++;
|
||||||
if(aux > max_val)
|
if (aux > max_val)
|
||||||
aux = max_val;
|
aux = max_val;
|
||||||
else if(aux < min_val)
|
else if (aux < min_val)
|
||||||
aux = min_val;
|
aux = min_val;
|
||||||
*outputVectorPtr++ = (int16_t)rintf(aux);
|
*outputVectorPtr++ = (int16_t)rintf(aux);
|
||||||
}
|
}
|
||||||
@ -238,10 +244,12 @@ static inline void volk_gnsssdr_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector
|
|||||||
const __m128 vmin_val = _mm_set_ps1(min_val);
|
const __m128 vmin_val = _mm_set_ps1(min_val);
|
||||||
const __m128 vmax_val = _mm_set_ps1(max_val);
|
const __m128 vmax_val = _mm_set_ps1(max_val);
|
||||||
|
|
||||||
for(i = 0; i < sse_iters; i++)
|
for (i = 0; i < sse_iters; i++)
|
||||||
{
|
{
|
||||||
inputVal1 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
|
inputVal1 = _mm_load_ps((float*)inputVectorPtr);
|
||||||
inputVal2 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
|
inputVectorPtr += 4;
|
||||||
|
inputVal2 = _mm_load_ps((float*)inputVectorPtr);
|
||||||
|
inputVectorPtr += 4;
|
||||||
__VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 8);
|
__VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 8);
|
||||||
|
|
||||||
// Clip
|
// Clip
|
||||||
@ -257,12 +265,12 @@ static inline void volk_gnsssdr_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector
|
|||||||
outputVectorPtr += 8;
|
outputVectorPtr += 8;
|
||||||
}
|
}
|
||||||
|
|
||||||
for(i = sse_iters * 8; i < num_points * 2; i++)
|
for (i = sse_iters * 8; i < num_points * 2; i++)
|
||||||
{
|
{
|
||||||
aux = *inputVectorPtr++;
|
aux = *inputVectorPtr++;
|
||||||
if(aux > max_val)
|
if (aux > max_val)
|
||||||
aux = max_val;
|
aux = max_val;
|
||||||
else if(aux < min_val)
|
else if (aux < min_val)
|
||||||
aux = min_val;
|
aux = min_val;
|
||||||
*outputVectorPtr++ = (int16_t)rintf(aux);
|
*outputVectorPtr++ = (int16_t)rintf(aux);
|
||||||
}
|
}
|
||||||
@ -289,10 +297,12 @@ static inline void volk_gnsssdr_32fc_convert_16ic_a_sse(lv_16sc_t* outputVector,
|
|||||||
const __m128 vmin_val = _mm_set_ps1(min_val);
|
const __m128 vmin_val = _mm_set_ps1(min_val);
|
||||||
const __m128 vmax_val = _mm_set_ps1(max_val);
|
const __m128 vmax_val = _mm_set_ps1(max_val);
|
||||||
|
|
||||||
for(i = 0; i < sse_iters; i++)
|
for (i = 0; i < sse_iters; i++)
|
||||||
{
|
{
|
||||||
inputVal1 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
|
inputVal1 = _mm_load_ps((float*)inputVectorPtr);
|
||||||
inputVal2 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
|
inputVectorPtr += 4;
|
||||||
|
inputVal2 = _mm_load_ps((float*)inputVectorPtr);
|
||||||
|
inputVectorPtr += 4;
|
||||||
__VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 8);
|
__VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 8);
|
||||||
|
|
||||||
// Clip
|
// Clip
|
||||||
@ -308,12 +318,12 @@ static inline void volk_gnsssdr_32fc_convert_16ic_a_sse(lv_16sc_t* outputVector,
|
|||||||
outputVectorPtr += 8;
|
outputVectorPtr += 8;
|
||||||
}
|
}
|
||||||
|
|
||||||
for(i = sse_iters * 8; i < num_points * 2; i++)
|
for (i = sse_iters * 8; i < num_points * 2; i++)
|
||||||
{
|
{
|
||||||
aux = *inputVectorPtr++;
|
aux = *inputVectorPtr++;
|
||||||
if(aux > max_val)
|
if (aux > max_val)
|
||||||
aux = max_val;
|
aux = max_val;
|
||||||
else if(aux < min_val)
|
else if (aux < min_val)
|
||||||
aux = min_val;
|
aux = min_val;
|
||||||
*outputVectorPtr++ = (int16_t)rintf(aux);
|
*outputVectorPtr++ = (int16_t)rintf(aux);
|
||||||
}
|
}
|
||||||
@ -332,7 +342,7 @@ static inline void volk_gnsssdr_32fc_convert_16ic_a_avx2(lv_16sc_t* outputVector
|
|||||||
int16_t* outputVectorPtr = (int16_t*)outputVector;
|
int16_t* outputVectorPtr = (int16_t*)outputVector;
|
||||||
float aux;
|
float aux;
|
||||||
unsigned int i;
|
unsigned int i;
|
||||||
const float min_val = (float)SHRT_MIN; ///todo Something off here, compiler does not perform right cast
|
const float min_val = (float)SHRT_MIN; ///todo Something off here, compiler does not perform right cast
|
||||||
const float max_val = (float)SHRT_MAX;
|
const float max_val = (float)SHRT_MAX;
|
||||||
|
|
||||||
__m256 inputVal1, inputVal2;
|
__m256 inputVal1, inputVal2;
|
||||||
@ -341,10 +351,12 @@ static inline void volk_gnsssdr_32fc_convert_16ic_a_avx2(lv_16sc_t* outputVector
|
|||||||
const __m256 vmin_val = _mm256_set1_ps(min_val);
|
const __m256 vmin_val = _mm256_set1_ps(min_val);
|
||||||
const __m256 vmax_val = _mm256_set1_ps(max_val);
|
const __m256 vmax_val = _mm256_set1_ps(max_val);
|
||||||
|
|
||||||
for(i = 0; i < avx2_iters; i++)
|
for (i = 0; i < avx2_iters; i++)
|
||||||
{
|
{
|
||||||
inputVal1 = _mm256_load_ps((float*)inputVectorPtr); inputVectorPtr += 8;
|
inputVal1 = _mm256_load_ps((float*)inputVectorPtr);
|
||||||
inputVal2 = _mm256_load_ps((float*)inputVectorPtr); inputVectorPtr += 8;
|
inputVectorPtr += 8;
|
||||||
|
inputVal2 = _mm256_load_ps((float*)inputVectorPtr);
|
||||||
|
inputVectorPtr += 8;
|
||||||
__VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 16);
|
__VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 16);
|
||||||
|
|
||||||
// Clip
|
// Clip
|
||||||
@ -361,12 +373,12 @@ static inline void volk_gnsssdr_32fc_convert_16ic_a_avx2(lv_16sc_t* outputVector
|
|||||||
outputVectorPtr += 16;
|
outputVectorPtr += 16;
|
||||||
}
|
}
|
||||||
|
|
||||||
for(i = avx2_iters * 16; i < num_points * 2; i++)
|
for (i = avx2_iters * 16; i < num_points * 2; i++)
|
||||||
{
|
{
|
||||||
aux = *inputVectorPtr++;
|
aux = *inputVectorPtr++;
|
||||||
if(aux > max_val)
|
if (aux > max_val)
|
||||||
aux = max_val;
|
aux = max_val;
|
||||||
else if(aux < min_val)
|
else if (aux < min_val)
|
||||||
aux = min_val;
|
aux = min_val;
|
||||||
*outputVectorPtr++ = (int16_t)rintf(aux);
|
*outputVectorPtr++ = (int16_t)rintf(aux);
|
||||||
}
|
}
|
||||||
@ -397,10 +409,12 @@ static inline void volk_gnsssdr_32fc_convert_16ic_neon(lv_16sc_t* outputVector,
|
|||||||
int16x4_t intInputVal1, intInputVal2;
|
int16x4_t intInputVal1, intInputVal2;
|
||||||
int16x8_t res;
|
int16x8_t res;
|
||||||
|
|
||||||
for(i = 0; i < neon_iters; i++)
|
for (i = 0; i < neon_iters; i++)
|
||||||
{
|
{
|
||||||
a = vld1q_f32((const float32_t*)(inputVectorPtr)); inputVectorPtr += 4;
|
a = vld1q_f32((const float32_t*)(inputVectorPtr));
|
||||||
b = vld1q_f32((const float32_t*)(inputVectorPtr)); inputVectorPtr += 4;
|
inputVectorPtr += 4;
|
||||||
|
b = vld1q_f32((const float32_t*)(inputVectorPtr));
|
||||||
|
inputVectorPtr += 4;
|
||||||
__VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 8);
|
__VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 8);
|
||||||
|
|
||||||
ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val);
|
ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val);
|
||||||
@ -425,12 +439,12 @@ static inline void volk_gnsssdr_32fc_convert_16ic_neon(lv_16sc_t* outputVector,
|
|||||||
outputVectorPtr += 8;
|
outputVectorPtr += 8;
|
||||||
}
|
}
|
||||||
|
|
||||||
for(i = neon_iters * 8; i < num_points * 2; i++)
|
for (i = neon_iters * 8; i < num_points * 2; i++)
|
||||||
{
|
{
|
||||||
aux = *inputVectorPtr++;
|
aux = *inputVectorPtr++;
|
||||||
if(aux > max_val_f)
|
if (aux > max_val_f)
|
||||||
aux = max_val_f;
|
aux = max_val_f;
|
||||||
else if(aux < min_val_f)
|
else if (aux < min_val_f)
|
||||||
aux = min_val_f;
|
aux = min_val_f;
|
||||||
*outputVectorPtr++ = (int16_t)rintf(aux);
|
*outputVectorPtr++ = (int16_t)rintf(aux);
|
||||||
}
|
}
|
||||||
@ -449,14 +463,14 @@ static inline void volk_gnsssdr_32fc_convert_16ic_generic(lv_16sc_t* outputVecto
|
|||||||
const float max_val = (float)SHRT_MAX;
|
const float max_val = (float)SHRT_MAX;
|
||||||
float aux;
|
float aux;
|
||||||
unsigned int i;
|
unsigned int i;
|
||||||
for(i = 0; i < num_points * 2; i++)
|
for (i = 0; i < num_points * 2; i++)
|
||||||
{
|
{
|
||||||
aux = *inputVectorPtr++;
|
aux = *inputVectorPtr++;
|
||||||
if(aux > max_val)
|
if (aux > max_val)
|
||||||
aux = max_val;
|
aux = max_val;
|
||||||
else if(aux < min_val)
|
else if (aux < min_val)
|
||||||
aux = min_val;
|
aux = min_val;
|
||||||
*outputVectorPtr++ = (int16_t)rintf(aux);
|
*outputVectorPtr++ = (int16_t)rintf(aux);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif /* LV_HAVE_GENERIC */
|
#endif /* LV_HAVE_GENERIC */
|
||||||
|
106
src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_8ic.h
Executable file → Normal file
106
src/algorithms/libs/volk_gnsssdr_module/volk_gnsssdr/kernels/volk_gnsssdr/volk_gnsssdr_32fc_convert_8ic.h
Executable file → Normal file
@ -72,12 +72,12 @@ static inline void volk_gnsssdr_32fc_convert_8ic_generic(lv_8sc_t* outputVector,
|
|||||||
const float max_val = (float)SCHAR_MAX;
|
const float max_val = (float)SCHAR_MAX;
|
||||||
float aux;
|
float aux;
|
||||||
unsigned int i;
|
unsigned int i;
|
||||||
for(i = 0; i < num_points * 2; i++)
|
for (i = 0; i < num_points * 2; i++)
|
||||||
{
|
{
|
||||||
aux = *inputVectorPtr++ * max_val;
|
aux = *inputVectorPtr++ * max_val;
|
||||||
if(aux > max_val)
|
if (aux > max_val)
|
||||||
aux = max_val;
|
aux = max_val;
|
||||||
else if(aux < min_val)
|
else if (aux < min_val)
|
||||||
aux = min_val;
|
aux = min_val;
|
||||||
*outputVectorPtr++ = (int8_t)rintf(aux);
|
*outputVectorPtr++ = (int8_t)rintf(aux);
|
||||||
}
|
}
|
||||||
@ -107,12 +107,16 @@ static inline void volk_gnsssdr_32fc_convert_8ic_u_avx2(lv_8sc_t* outputVector,
|
|||||||
const __m256 vmin_val = _mm256_set1_ps(min_val);
|
const __m256 vmin_val = _mm256_set1_ps(min_val);
|
||||||
const __m256 vmax_val = _mm256_set1_ps(max_val);
|
const __m256 vmax_val = _mm256_set1_ps(max_val);
|
||||||
|
|
||||||
for(i = 0; i < avx2_iters; i++)
|
for (i = 0; i < avx2_iters; i++)
|
||||||
{
|
{
|
||||||
inputVal1 = _mm256_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 8;
|
inputVal1 = _mm256_loadu_ps((float*)inputVectorPtr);
|
||||||
inputVal2 = _mm256_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 8;
|
inputVectorPtr += 8;
|
||||||
inputVal3 = _mm256_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 8;
|
inputVal2 = _mm256_loadu_ps((float*)inputVectorPtr);
|
||||||
inputVal4 = _mm256_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 8;
|
inputVectorPtr += 8;
|
||||||
|
inputVal3 = _mm256_loadu_ps((float*)inputVectorPtr);
|
||||||
|
inputVectorPtr += 8;
|
||||||
|
inputVal4 = _mm256_loadu_ps((float*)inputVectorPtr);
|
||||||
|
inputVectorPtr += 8;
|
||||||
__VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 32);
|
__VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 32);
|
||||||
|
|
||||||
inputVal1 = _mm256_mul_ps(inputVal1, vmax_val);
|
inputVal1 = _mm256_mul_ps(inputVal1, vmax_val);
|
||||||
@ -142,12 +146,12 @@ static inline void volk_gnsssdr_32fc_convert_8ic_u_avx2(lv_8sc_t* outputVector,
|
|||||||
outputVectorPtr += 32;
|
outputVectorPtr += 32;
|
||||||
}
|
}
|
||||||
|
|
||||||
for(i = avx2_iters * 32; i < num_points * 2; i++)
|
for (i = avx2_iters * 32; i < num_points * 2; i++)
|
||||||
{
|
{
|
||||||
aux = *inputVectorPtr++ * max_val;
|
aux = *inputVectorPtr++ * max_val;
|
||||||
if(aux > max_val)
|
if (aux > max_val)
|
||||||
aux = max_val;
|
aux = max_val;
|
||||||
else if(aux < min_val)
|
else if (aux < min_val)
|
||||||
aux = min_val;
|
aux = min_val;
|
||||||
*outputVectorPtr++ = (int8_t)rintf(aux);
|
*outputVectorPtr++ = (int8_t)rintf(aux);
|
||||||
}
|
}
|
||||||
@ -177,12 +181,16 @@ static inline void volk_gnsssdr_32fc_convert_8ic_a_avx2(lv_8sc_t* outputVector,
|
|||||||
const __m256 vmin_val = _mm256_set1_ps(min_val);
|
const __m256 vmin_val = _mm256_set1_ps(min_val);
|
||||||
const __m256 vmax_val = _mm256_set1_ps(max_val);
|
const __m256 vmax_val = _mm256_set1_ps(max_val);
|
||||||
|
|
||||||
for(i = 0; i < avx2_iters; i++)
|
for (i = 0; i < avx2_iters; i++)
|
||||||
{
|
{
|
||||||
inputVal1 = _mm256_load_ps((float*)inputVectorPtr); inputVectorPtr += 8;
|
inputVal1 = _mm256_load_ps((float*)inputVectorPtr);
|
||||||
inputVal2 = _mm256_load_ps((float*)inputVectorPtr); inputVectorPtr += 8;
|
inputVectorPtr += 8;
|
||||||
inputVal3 = _mm256_load_ps((float*)inputVectorPtr); inputVectorPtr += 8;
|
inputVal2 = _mm256_load_ps((float*)inputVectorPtr);
|
||||||
inputVal4 = _mm256_load_ps((float*)inputVectorPtr); inputVectorPtr += 8;
|
inputVectorPtr += 8;
|
||||||
|
inputVal3 = _mm256_load_ps((float*)inputVectorPtr);
|
||||||
|
inputVectorPtr += 8;
|
||||||
|
inputVal4 = _mm256_load_ps((float*)inputVectorPtr);
|
||||||
|
inputVectorPtr += 8;
|
||||||
__VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 32);
|
__VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 32);
|
||||||
|
|
||||||
inputVal1 = _mm256_mul_ps(inputVal1, vmax_val);
|
inputVal1 = _mm256_mul_ps(inputVal1, vmax_val);
|
||||||
@ -212,12 +220,12 @@ static inline void volk_gnsssdr_32fc_convert_8ic_a_avx2(lv_8sc_t* outputVector,
|
|||||||
outputVectorPtr += 32;
|
outputVectorPtr += 32;
|
||||||
}
|
}
|
||||||
|
|
||||||
for(i = avx2_iters * 32; i < num_points * 2; i++)
|
for (i = avx2_iters * 32; i < num_points * 2; i++)
|
||||||
{
|
{
|
||||||
aux = *inputVectorPtr++ * max_val;
|
aux = *inputVectorPtr++ * max_val;
|
||||||
if(aux > max_val)
|
if (aux > max_val)
|
||||||
aux = max_val;
|
aux = max_val;
|
||||||
else if(aux < min_val)
|
else if (aux < min_val)
|
||||||
aux = min_val;
|
aux = min_val;
|
||||||
*outputVectorPtr++ = (int8_t)rintf(aux);
|
*outputVectorPtr++ = (int8_t)rintf(aux);
|
||||||
}
|
}
|
||||||
@ -247,12 +255,16 @@ static inline void volk_gnsssdr_32fc_convert_8ic_u_sse2(lv_8sc_t* outputVector,
|
|||||||
const __m128 vmin_val = _mm_set_ps1(min_val);
|
const __m128 vmin_val = _mm_set_ps1(min_val);
|
||||||
const __m128 vmax_val = _mm_set_ps1(max_val);
|
const __m128 vmax_val = _mm_set_ps1(max_val);
|
||||||
|
|
||||||
for(i = 0; i < sse_iters; i++)
|
for (i = 0; i < sse_iters; i++)
|
||||||
{
|
{
|
||||||
inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
|
inputVal1 = _mm_loadu_ps((float*)inputVectorPtr);
|
||||||
inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
|
inputVectorPtr += 4;
|
||||||
inputVal3 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
|
inputVal2 = _mm_loadu_ps((float*)inputVectorPtr);
|
||||||
inputVal4 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
|
inputVectorPtr += 4;
|
||||||
|
inputVal3 = _mm_loadu_ps((float*)inputVectorPtr);
|
||||||
|
inputVectorPtr += 4;
|
||||||
|
inputVal4 = _mm_loadu_ps((float*)inputVectorPtr);
|
||||||
|
inputVectorPtr += 4;
|
||||||
|
|
||||||
inputVal1 = _mm_mul_ps(inputVal1, vmax_val);
|
inputVal1 = _mm_mul_ps(inputVal1, vmax_val);
|
||||||
inputVal2 = _mm_mul_ps(inputVal2, vmax_val);
|
inputVal2 = _mm_mul_ps(inputVal2, vmax_val);
|
||||||
@ -278,12 +290,12 @@ static inline void volk_gnsssdr_32fc_convert_8ic_u_sse2(lv_8sc_t* outputVector,
|
|||||||
outputVectorPtr += 16;
|
outputVectorPtr += 16;
|
||||||
}
|
}
|
||||||
|
|
||||||
for(i = sse_iters * 16; i < num_points * 2; i++)
|
for (i = sse_iters * 16; i < num_points * 2; i++)
|
||||||
{
|
{
|
||||||
aux = *inputVectorPtr++ * max_val;
|
aux = *inputVectorPtr++ * max_val;
|
||||||
if(aux > max_val)
|
if (aux > max_val)
|
||||||
aux = max_val;
|
aux = max_val;
|
||||||
else if(aux < min_val)
|
else if (aux < min_val)
|
||||||
aux = min_val;
|
aux = min_val;
|
||||||
*outputVectorPtr++ = (int8_t)rintf(aux);
|
*outputVectorPtr++ = (int8_t)rintf(aux);
|
||||||
}
|
}
|
||||||
@ -313,12 +325,16 @@ static inline void volk_gnsssdr_32fc_convert_8ic_a_sse2(lv_8sc_t* outputVector,
|
|||||||
const __m128 vmin_val = _mm_set_ps1(min_val);
|
const __m128 vmin_val = _mm_set_ps1(min_val);
|
||||||
const __m128 vmax_val = _mm_set_ps1(max_val);
|
const __m128 vmax_val = _mm_set_ps1(max_val);
|
||||||
|
|
||||||
for(i = 0; i < sse_iters; i++)
|
for (i = 0; i < sse_iters; i++)
|
||||||
{
|
{
|
||||||
inputVal1 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
|
inputVal1 = _mm_load_ps((float*)inputVectorPtr);
|
||||||
inputVal2 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
|
inputVectorPtr += 4;
|
||||||
inputVal3 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
|
inputVal2 = _mm_load_ps((float*)inputVectorPtr);
|
||||||
inputVal4 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
|
inputVectorPtr += 4;
|
||||||
|
inputVal3 = _mm_load_ps((float*)inputVectorPtr);
|
||||||
|
inputVectorPtr += 4;
|
||||||
|
inputVal4 = _mm_load_ps((float*)inputVectorPtr);
|
||||||
|
inputVectorPtr += 4;
|
||||||
|
|
||||||
inputVal1 = _mm_mul_ps(inputVal1, vmax_val);
|
inputVal1 = _mm_mul_ps(inputVal1, vmax_val);
|
||||||
inputVal2 = _mm_mul_ps(inputVal2, vmax_val);
|
inputVal2 = _mm_mul_ps(inputVal2, vmax_val);
|
||||||
@ -344,12 +360,12 @@ static inline void volk_gnsssdr_32fc_convert_8ic_a_sse2(lv_8sc_t* outputVector,
|
|||||||
outputVectorPtr += 16;
|
outputVectorPtr += 16;
|
||||||
}
|
}
|
||||||
|
|
||||||
for(i = sse_iters * 16; i < num_points * 2; i++)
|
for (i = sse_iters * 16; i < num_points * 2; i++)
|
||||||
{
|
{
|
||||||
aux = *inputVectorPtr++ * max_val;
|
aux = *inputVectorPtr++ * max_val;
|
||||||
if(aux > max_val)
|
if (aux > max_val)
|
||||||
aux = max_val;
|
aux = max_val;
|
||||||
else if(aux < min_val)
|
else if (aux < min_val)
|
||||||
aux = min_val;
|
aux = min_val;
|
||||||
*outputVectorPtr++ = (int8_t)rintf(aux);
|
*outputVectorPtr++ = (int8_t)rintf(aux);
|
||||||
}
|
}
|
||||||
@ -383,9 +399,10 @@ static inline void volk_gnsssdr_32fc_convert_8ic_neon(lv_8sc_t* outputVector, co
|
|||||||
int8x8_t res8_1, res8_2;
|
int8x8_t res8_1, res8_2;
|
||||||
int8x16_t outputVal;
|
int8x16_t outputVal;
|
||||||
|
|
||||||
for(i = 0; i < neon_iters; i++)
|
for (i = 0; i < neon_iters; i++)
|
||||||
{
|
{
|
||||||
a = vld1q_f32((const float32_t*)inputVectorPtr); inputVectorPtr += 4;
|
a = vld1q_f32((const float32_t*)inputVectorPtr);
|
||||||
|
inputVectorPtr += 4;
|
||||||
a = vmulq_f32(a, max_val);
|
a = vmulq_f32(a, max_val);
|
||||||
ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val);
|
ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val);
|
||||||
sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(ret1), 31)));
|
sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(ret1), 31)));
|
||||||
@ -394,7 +411,8 @@ static inline void volk_gnsssdr_32fc_convert_8ic_neon(lv_8sc_t* outputVector, co
|
|||||||
toint_a = vcvtq_s32_f32(Round);
|
toint_a = vcvtq_s32_f32(Round);
|
||||||
intInputVal1 = vqmovn_s32(toint_a);
|
intInputVal1 = vqmovn_s32(toint_a);
|
||||||
|
|
||||||
a = vld1q_f32((const float32_t*)inputVectorPtr); inputVectorPtr += 4;
|
a = vld1q_f32((const float32_t*)inputVectorPtr);
|
||||||
|
inputVectorPtr += 4;
|
||||||
a = vmulq_f32(a, max_val);
|
a = vmulq_f32(a, max_val);
|
||||||
ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val);
|
ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val);
|
||||||
sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(ret1), 31)));
|
sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(ret1), 31)));
|
||||||
@ -406,7 +424,8 @@ static inline void volk_gnsssdr_32fc_convert_8ic_neon(lv_8sc_t* outputVector, co
|
|||||||
pack16_8_1 = vcombine_s16(intInputVal1, intInputVal2);
|
pack16_8_1 = vcombine_s16(intInputVal1, intInputVal2);
|
||||||
res8_1 = vqmovn_s16(pack16_8_1);
|
res8_1 = vqmovn_s16(pack16_8_1);
|
||||||
|
|
||||||
a = vld1q_f32((const float32_t*)inputVectorPtr); inputVectorPtr += 4;
|
a = vld1q_f32((const float32_t*)inputVectorPtr);
|
||||||
|
inputVectorPtr += 4;
|
||||||
a = vmulq_f32(a, max_val);
|
a = vmulq_f32(a, max_val);
|
||||||
ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val);
|
ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val);
|
||||||
sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(ret1), 31)));
|
sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(ret1), 31)));
|
||||||
@ -415,7 +434,8 @@ static inline void volk_gnsssdr_32fc_convert_8ic_neon(lv_8sc_t* outputVector, co
|
|||||||
toint_a = vcvtq_s32_f32(Round);
|
toint_a = vcvtq_s32_f32(Round);
|
||||||
intInputVal1 = vqmovn_s32(toint_a);
|
intInputVal1 = vqmovn_s32(toint_a);
|
||||||
|
|
||||||
a = vld1q_f32((const float32_t*)inputVectorPtr); inputVectorPtr += 4;
|
a = vld1q_f32((const float32_t*)inputVectorPtr);
|
||||||
|
inputVectorPtr += 4;
|
||||||
a = vmulq_f32(a, max_val);
|
a = vmulq_f32(a, max_val);
|
||||||
ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val);
|
ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val);
|
||||||
sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(ret1), 31)));
|
sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(ret1), 31)));
|
||||||
@ -433,12 +453,12 @@ static inline void volk_gnsssdr_32fc_convert_8ic_neon(lv_8sc_t* outputVector, co
|
|||||||
outputVectorPtr += 16;
|
outputVectorPtr += 16;
|
||||||
}
|
}
|
||||||
|
|
||||||
for(i = neon_iters * 16; i < num_points * 2; i++)
|
for (i = neon_iters * 16; i < num_points * 2; i++)
|
||||||
{
|
{
|
||||||
aux = *inputVectorPtr++ * max_val_f;
|
aux = *inputVectorPtr++ * max_val_f;
|
||||||
if(aux > max_val_f)
|
if (aux > max_val_f)
|
||||||
aux = max_val_f;
|
aux = max_val_f;
|
||||||
else if(aux < min_val_f)
|
else if (aux < min_val_f)
|
||||||
aux = min_val_f;
|
aux = min_val_f;
|
||||||
*outputVectorPtr++ = (int8_t)rintf(aux);
|
*outputVectorPtr++ = (int8_t)rintf(aux);
|
||||||
}
|
}
|
||||||
|
@ -42,31 +42,30 @@
|
|||||||
#include <string.h>
|
#include <string.h>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#ifdef LV_HAVE_GENERIC
|
#ifdef LV_HAVE_GENERIC
|
||||||
static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_generic(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points)
|
static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_generic(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points)
|
||||||
{
|
{
|
||||||
int code_length_chips = 2046;
|
int code_length_chips = 2046;
|
||||||
float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points );
|
float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
|
||||||
int num_out_vectors = 3;
|
int num_out_vectors = 3;
|
||||||
float rem_code_phase_chips = -0.234;
|
float rem_code_phase_chips = -0.234;
|
||||||
unsigned int n;
|
unsigned int n;
|
||||||
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
|
float shifts_chips[3] = {-0.1, 0.0, 0.1};
|
||||||
|
|
||||||
lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
|
lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
|
||||||
for(n = 0; n < num_out_vectors; n++)
|
for (n = 0; n < num_out_vectors; n++)
|
||||||
{
|
{
|
||||||
result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
|
result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
|
||||||
}
|
}
|
||||||
|
|
||||||
volk_gnsssdr_32fc_xn_resampler_32fc_xn_generic(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
|
volk_gnsssdr_32fc_xn_resampler_32fc_xn_generic(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
|
||||||
|
|
||||||
memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points);
|
memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points);
|
||||||
|
|
||||||
for(n = 0; n < num_out_vectors; n++)
|
for (n = 0; n < num_out_vectors; n++)
|
||||||
{
|
{
|
||||||
volk_gnsssdr_free(result_aux[n]);
|
volk_gnsssdr_free(result_aux[n]);
|
||||||
}
|
}
|
||||||
volk_gnsssdr_free(result_aux);
|
volk_gnsssdr_free(result_aux);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -78,26 +77,26 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_generic(lv_32fc_t* r
|
|||||||
static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_a_sse3(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points)
|
static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_a_sse3(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points)
|
||||||
{
|
{
|
||||||
int code_length_chips = 2046;
|
int code_length_chips = 2046;
|
||||||
float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points );
|
float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
|
||||||
int num_out_vectors = 3;
|
int num_out_vectors = 3;
|
||||||
float rem_code_phase_chips = -0.234;
|
float rem_code_phase_chips = -0.234;
|
||||||
unsigned int n;
|
unsigned int n;
|
||||||
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
|
float shifts_chips[3] = {-0.1, 0.0, 0.1};
|
||||||
|
|
||||||
lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
|
lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
|
||||||
for(n = 0; n < num_out_vectors; n++)
|
for (n = 0; n < num_out_vectors; n++)
|
||||||
{
|
{
|
||||||
result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
|
result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
|
||||||
}
|
}
|
||||||
|
|
||||||
volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_sse3(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
|
volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_sse3(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
|
||||||
|
|
||||||
memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points);
|
memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points);
|
||||||
|
|
||||||
for(n = 0; n < num_out_vectors; n++)
|
for (n = 0; n < num_out_vectors; n++)
|
||||||
{
|
{
|
||||||
volk_gnsssdr_free(result_aux[n]);
|
volk_gnsssdr_free(result_aux[n]);
|
||||||
}
|
}
|
||||||
volk_gnsssdr_free(result_aux);
|
volk_gnsssdr_free(result_aux);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -107,26 +106,26 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_a_sse3(lv_32fc_t* re
|
|||||||
static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_u_sse3(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points)
|
static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_u_sse3(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points)
|
||||||
{
|
{
|
||||||
int code_length_chips = 2046;
|
int code_length_chips = 2046;
|
||||||
float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points );
|
float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
|
||||||
int num_out_vectors = 3;
|
int num_out_vectors = 3;
|
||||||
float rem_code_phase_chips = -0.234;
|
float rem_code_phase_chips = -0.234;
|
||||||
unsigned int n;
|
unsigned int n;
|
||||||
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
|
float shifts_chips[3] = {-0.1, 0.0, 0.1};
|
||||||
|
|
||||||
lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
|
lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
|
||||||
for(n = 0; n < num_out_vectors; n++)
|
for (n = 0; n < num_out_vectors; n++)
|
||||||
{
|
{
|
||||||
result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
|
result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
|
||||||
}
|
}
|
||||||
|
|
||||||
volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_sse3(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
|
volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_sse3(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
|
||||||
|
|
||||||
memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points);
|
memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points);
|
||||||
|
|
||||||
for(n = 0; n < num_out_vectors; n++)
|
for (n = 0; n < num_out_vectors; n++)
|
||||||
{
|
{
|
||||||
volk_gnsssdr_free(result_aux[n]);
|
volk_gnsssdr_free(result_aux[n]);
|
||||||
}
|
}
|
||||||
volk_gnsssdr_free(result_aux);
|
volk_gnsssdr_free(result_aux);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -137,26 +136,26 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_u_sse3(lv_32fc_t* re
|
|||||||
static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_u_sse4_1(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points)
|
static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_u_sse4_1(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points)
|
||||||
{
|
{
|
||||||
int code_length_chips = 2046;
|
int code_length_chips = 2046;
|
||||||
float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points );
|
float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
|
||||||
int num_out_vectors = 3;
|
int num_out_vectors = 3;
|
||||||
float rem_code_phase_chips = -0.234;
|
float rem_code_phase_chips = -0.234;
|
||||||
unsigned int n;
|
unsigned int n;
|
||||||
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
|
float shifts_chips[3] = {-0.1, 0.0, 0.1};
|
||||||
|
|
||||||
lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
|
lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
|
||||||
for(n = 0; n < num_out_vectors; n++)
|
for (n = 0; n < num_out_vectors; n++)
|
||||||
{
|
{
|
||||||
result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
|
result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
|
||||||
}
|
}
|
||||||
|
|
||||||
volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_sse4_1(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
|
volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_sse4_1(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
|
||||||
|
|
||||||
memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points);
|
memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points);
|
||||||
|
|
||||||
for(n = 0; n < num_out_vectors; n++)
|
for (n = 0; n < num_out_vectors; n++)
|
||||||
{
|
{
|
||||||
volk_gnsssdr_free(result_aux[n]);
|
volk_gnsssdr_free(result_aux[n]);
|
||||||
}
|
}
|
||||||
volk_gnsssdr_free(result_aux);
|
volk_gnsssdr_free(result_aux);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -166,26 +165,26 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_u_sse4_1(lv_32fc_t*
|
|||||||
static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_a_sse4_1(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points)
|
static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_a_sse4_1(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points)
|
||||||
{
|
{
|
||||||
int code_length_chips = 2046;
|
int code_length_chips = 2046;
|
||||||
float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points );
|
float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
|
||||||
int num_out_vectors = 3;
|
int num_out_vectors = 3;
|
||||||
float rem_code_phase_chips = -0.234;
|
float rem_code_phase_chips = -0.234;
|
||||||
unsigned int n;
|
unsigned int n;
|
||||||
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
|
float shifts_chips[3] = {-0.1, 0.0, 0.1};
|
||||||
|
|
||||||
lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
|
lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
|
||||||
for(n = 0; n < num_out_vectors; n++)
|
for (n = 0; n < num_out_vectors; n++)
|
||||||
{
|
{
|
||||||
result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
|
result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
|
||||||
}
|
}
|
||||||
|
|
||||||
volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_sse4_1(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
|
volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_sse4_1(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
|
||||||
|
|
||||||
memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points);
|
memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points);
|
||||||
|
|
||||||
for(n = 0; n < num_out_vectors; n++)
|
for (n = 0; n < num_out_vectors; n++)
|
||||||
{
|
{
|
||||||
volk_gnsssdr_free(result_aux[n]);
|
volk_gnsssdr_free(result_aux[n]);
|
||||||
}
|
}
|
||||||
volk_gnsssdr_free(result_aux);
|
volk_gnsssdr_free(result_aux);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -195,26 +194,26 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_a_sse4_1(lv_32fc_t*
|
|||||||
static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_a_avx(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points)
|
static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_a_avx(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points)
|
||||||
{
|
{
|
||||||
int code_length_chips = 2046;
|
int code_length_chips = 2046;
|
||||||
float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points );
|
float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
|
||||||
int num_out_vectors = 3;
|
int num_out_vectors = 3;
|
||||||
float rem_code_phase_chips = -0.234;
|
float rem_code_phase_chips = -0.234;
|
||||||
unsigned int n;
|
unsigned int n;
|
||||||
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
|
float shifts_chips[3] = {-0.1, 0.0, 0.1};
|
||||||
|
|
||||||
lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
|
lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
|
||||||
for(n = 0; n < num_out_vectors; n++)
|
for (n = 0; n < num_out_vectors; n++)
|
||||||
{
|
{
|
||||||
result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
|
result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
|
||||||
}
|
}
|
||||||
|
|
||||||
volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
|
volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
|
||||||
|
|
||||||
memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points);
|
memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points);
|
||||||
|
|
||||||
for(n = 0; n < num_out_vectors; n++)
|
for (n = 0; n < num_out_vectors; n++)
|
||||||
{
|
{
|
||||||
volk_gnsssdr_free(result_aux[n]);
|
volk_gnsssdr_free(result_aux[n]);
|
||||||
}
|
}
|
||||||
volk_gnsssdr_free(result_aux);
|
volk_gnsssdr_free(result_aux);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
@ -224,26 +223,26 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_a_avx(lv_32fc_t* res
|
|||||||
static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_u_avx(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points)
|
static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_u_avx(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points)
|
||||||
{
|
{
|
||||||
int code_length_chips = 2046;
|
int code_length_chips = 2046;
|
||||||
float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points );
|
float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
|
||||||
int num_out_vectors = 3;
|
int num_out_vectors = 3;
|
||||||
float rem_code_phase_chips = -0.234;
|
float rem_code_phase_chips = -0.234;
|
||||||
unsigned int n;
|
unsigned int n;
|
||||||
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
|
float shifts_chips[3] = {-0.1, 0.0, 0.1};
|
||||||
|
|
||||||
lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
|
lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
|
||||||
for(n = 0; n < num_out_vectors; n++)
|
for (n = 0; n < num_out_vectors; n++)
|
||||||
{
|
{
|
||||||
result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
|
result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
|
||||||
}
|
}
|
||||||
|
|
||||||
volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
|
volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
|
||||||
|
|
||||||
memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points);
|
memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points);
|
||||||
|
|
||||||
for(n = 0; n < num_out_vectors; n++)
|
for (n = 0; n < num_out_vectors; n++)
|
||||||
{
|
{
|
||||||
volk_gnsssdr_free(result_aux[n]);
|
volk_gnsssdr_free(result_aux[n]);
|
||||||
}
|
}
|
||||||
volk_gnsssdr_free(result_aux);
|
volk_gnsssdr_free(result_aux);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
@ -253,26 +252,26 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_u_avx(lv_32fc_t* res
|
|||||||
static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_a_avx2(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points)
|
static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_a_avx2(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points)
|
||||||
{
|
{
|
||||||
int code_length_chips = 2046;
|
int code_length_chips = 2046;
|
||||||
float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points );
|
float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
|
||||||
int num_out_vectors = 3;
|
int num_out_vectors = 3;
|
||||||
float rem_code_phase_chips = -0.234;
|
float rem_code_phase_chips = -0.234;
|
||||||
unsigned int n;
|
unsigned int n;
|
||||||
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
|
float shifts_chips[3] = {-0.1, 0.0, 0.1};
|
||||||
|
|
||||||
lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
|
lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
|
||||||
for(n = 0; n < num_out_vectors; n++)
|
for (n = 0; n < num_out_vectors; n++)
|
||||||
{
|
{
|
||||||
result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
|
result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
|
||||||
}
|
}
|
||||||
|
|
||||||
volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx2(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
|
volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx2(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
|
||||||
|
|
||||||
memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points);
|
memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points);
|
||||||
|
|
||||||
for(n = 0; n < num_out_vectors; n++)
|
for (n = 0; n < num_out_vectors; n++)
|
||||||
{
|
{
|
||||||
volk_gnsssdr_free(result_aux[n]);
|
volk_gnsssdr_free(result_aux[n]);
|
||||||
}
|
}
|
||||||
volk_gnsssdr_free(result_aux);
|
volk_gnsssdr_free(result_aux);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
@ -282,26 +281,26 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_a_avx2(lv_32fc_t* re
|
|||||||
static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_u_avx2(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points)
|
static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_u_avx2(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points)
|
||||||
{
|
{
|
||||||
int code_length_chips = 2046;
|
int code_length_chips = 2046;
|
||||||
float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points );
|
float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
|
||||||
int num_out_vectors = 3;
|
int num_out_vectors = 3;
|
||||||
float rem_code_phase_chips = -0.234;
|
float rem_code_phase_chips = -0.234;
|
||||||
unsigned int n;
|
unsigned int n;
|
||||||
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
|
float shifts_chips[3] = {-0.1, 0.0, 0.1};
|
||||||
|
|
||||||
lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
|
lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
|
||||||
for(n = 0; n < num_out_vectors; n++)
|
for (n = 0; n < num_out_vectors; n++)
|
||||||
{
|
{
|
||||||
result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
|
result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
|
||||||
}
|
}
|
||||||
|
|
||||||
volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx2(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
|
volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx2(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
|
||||||
|
|
||||||
memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points);
|
memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points);
|
||||||
|
|
||||||
for(n = 0; n < num_out_vectors; n++)
|
for (n = 0; n < num_out_vectors; n++)
|
||||||
{
|
{
|
||||||
volk_gnsssdr_free(result_aux[n]);
|
volk_gnsssdr_free(result_aux[n]);
|
||||||
}
|
}
|
||||||
volk_gnsssdr_free(result_aux);
|
volk_gnsssdr_free(result_aux);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
@ -311,28 +310,28 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_u_avx2(lv_32fc_t* re
|
|||||||
static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_neon(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points)
|
static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_neon(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points)
|
||||||
{
|
{
|
||||||
int code_length_chips = 2046;
|
int code_length_chips = 2046;
|
||||||
float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points );
|
float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
|
||||||
int num_out_vectors = 3;
|
int num_out_vectors = 3;
|
||||||
float rem_code_phase_chips = -0.234;
|
float rem_code_phase_chips = -0.234;
|
||||||
unsigned int n;
|
unsigned int n;
|
||||||
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
|
float shifts_chips[3] = {-0.1, 0.0, 0.1};
|
||||||
|
|
||||||
lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
|
lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
|
||||||
for(n = 0; n < num_out_vectors; n++)
|
for (n = 0; n < num_out_vectors; n++)
|
||||||
{
|
{
|
||||||
result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
|
result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
|
||||||
}
|
}
|
||||||
|
|
||||||
volk_gnsssdr_32fc_xn_resampler_32fc_xn_neon(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
|
volk_gnsssdr_32fc_xn_resampler_32fc_xn_neon(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
|
||||||
|
|
||||||
memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points);
|
memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points);
|
||||||
|
|
||||||
for(n = 0; n < num_out_vectors; n++)
|
for (n = 0; n < num_out_vectors; n++)
|
||||||
{
|
{
|
||||||
volk_gnsssdr_free(result_aux[n]);
|
volk_gnsssdr_free(result_aux[n]);
|
||||||
}
|
}
|
||||||
volk_gnsssdr_free(result_aux);
|
volk_gnsssdr_free(result_aux);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#endif // INCLUDED_volk_gnsssdr_32fc_resamplerpuppet_32fc_H
|
#endif // INCLUDED_volk_gnsssdr_32fc_resamplerpuppet_32fc_H
|
||||||
|
@ -85,11 +85,11 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_generic(lv_32fc
|
|||||||
unsigned int n;
|
unsigned int n;
|
||||||
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
||||||
{
|
{
|
||||||
result[n_vec] = lv_cmake(0,0);
|
result[n_vec] = lv_cmake(0, 0);
|
||||||
}
|
}
|
||||||
for (n = 0; n < num_points; n++)
|
for (n = 0; n < num_points; n++)
|
||||||
{
|
{
|
||||||
tmp32_1 = *in_common++ * (*phase);//if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase));
|
tmp32_1 = *in_common++ * (*phase); //if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase));
|
||||||
|
|
||||||
// Regenerate phase
|
// Regenerate phase
|
||||||
if (n % 256 == 0)
|
if (n % 256 == 0)
|
||||||
@ -126,7 +126,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_generic_reload(
|
|||||||
unsigned int j;
|
unsigned int j;
|
||||||
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
||||||
{
|
{
|
||||||
result[n_vec] = lv_cmake(0,0);
|
result[n_vec] = lv_cmake(0, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
for (n = 0; n < num_points / ROTATOR_RELOAD; n++)
|
for (n = 0; n < num_points / ROTATOR_RELOAD; n++)
|
||||||
@ -141,7 +141,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_generic_reload(
|
|||||||
result[n_vec] += tmp32_2;
|
result[n_vec] += tmp32_2;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
/* Regenerate phase */
|
/* Regenerate phase */
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
(*phase) /= std::abs((*phase));
|
(*phase) /= std::abs((*phase));
|
||||||
#else
|
#else
|
||||||
@ -169,7 +169,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_generic_reload(
|
|||||||
#include <pmmintrin.h>
|
#include <pmmintrin.h>
|
||||||
static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_32fc_t** in_a, int num_a_vectors, unsigned int num_points)
|
static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_32fc_t** in_a, int num_a_vectors, unsigned int num_points)
|
||||||
{
|
{
|
||||||
lv_32fc_t dotProduct = lv_cmake(0,0);
|
lv_32fc_t dotProduct = lv_cmake(0, 0);
|
||||||
lv_32fc_t tmp32_1, tmp32_2;
|
lv_32fc_t tmp32_1, tmp32_2;
|
||||||
const unsigned int sse_iters = num_points / 2;
|
const unsigned int sse_iters = num_points / 2;
|
||||||
int n_vec;
|
int n_vec;
|
||||||
@ -179,7 +179,8 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(lv_32fc_
|
|||||||
const lv_32fc_t** _in_a = in_a;
|
const lv_32fc_t** _in_a = in_a;
|
||||||
const lv_32fc_t* _in_common = in_common;
|
const lv_32fc_t* _in_common = in_common;
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2];
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
lv_32fc_t dotProductVector[2];
|
||||||
|
|
||||||
__m128* acc = (__m128*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128), volk_gnsssdr_get_alignment());
|
__m128* acc = (__m128*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128), volk_gnsssdr_get_alignment());
|
||||||
|
|
||||||
@ -191,11 +192,13 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(lv_32fc_
|
|||||||
// phase rotation registers
|
// phase rotation registers
|
||||||
__m128 a, two_phase_acc_reg, two_phase_inc_reg, yl, yh, tmp1, tmp1p, tmp2, tmp2p, z1;
|
__m128 a, two_phase_acc_reg, two_phase_inc_reg, yl, yh, tmp1, tmp1p, tmp2, tmp2p, z1;
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_inc[2];
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
lv_32fc_t two_phase_inc[2];
|
||||||
two_phase_inc[0] = phase_inc * phase_inc;
|
two_phase_inc[0] = phase_inc * phase_inc;
|
||||||
two_phase_inc[1] = phase_inc * phase_inc;
|
two_phase_inc[1] = phase_inc * phase_inc;
|
||||||
two_phase_inc_reg = _mm_load_ps((float*) two_phase_inc);
|
two_phase_inc_reg = _mm_load_ps((float*)two_phase_inc);
|
||||||
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_acc[2];
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
lv_32fc_t two_phase_acc[2];
|
||||||
two_phase_acc[0] = (*phase);
|
two_phase_acc[0] = (*phase);
|
||||||
two_phase_acc[1] = (*phase) * phase_inc;
|
two_phase_acc[1] = (*phase) * phase_inc;
|
||||||
two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc);
|
two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc);
|
||||||
@ -203,12 +206,12 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(lv_32fc_
|
|||||||
const __m128 ylp = _mm_moveldup_ps(two_phase_inc_reg);
|
const __m128 ylp = _mm_moveldup_ps(two_phase_inc_reg);
|
||||||
const __m128 yhp = _mm_movehdup_ps(two_phase_inc_reg);
|
const __m128 yhp = _mm_movehdup_ps(two_phase_inc_reg);
|
||||||
|
|
||||||
for(number = 0; number < sse_iters; number++)
|
for (number = 0; number < sse_iters; number++)
|
||||||
{
|
{
|
||||||
// Phase rotation on operand in_common starts here:
|
// Phase rotation on operand in_common starts here:
|
||||||
a = _mm_loadu_ps((float*)_in_common);
|
a = _mm_loadu_ps((float*)_in_common);
|
||||||
// __VOLK_GNSSSDR_PREFETCH(_in_common + 4);
|
// __VOLK_GNSSSDR_PREFETCH(_in_common + 4);
|
||||||
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
||||||
yh = _mm_movehdup_ps(two_phase_acc_reg);
|
yh = _mm_movehdup_ps(two_phase_acc_reg);
|
||||||
tmp1 = _mm_mul_ps(a, yl);
|
tmp1 = _mm_mul_ps(a, yl);
|
||||||
tmp1p = _mm_mul_ps(two_phase_acc_reg, ylp);
|
tmp1p = _mm_mul_ps(two_phase_acc_reg, ylp);
|
||||||
@ -219,7 +222,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(lv_32fc_
|
|||||||
z1 = _mm_addsub_ps(tmp1, tmp2);
|
z1 = _mm_addsub_ps(tmp1, tmp2);
|
||||||
two_phase_acc_reg = _mm_addsub_ps(tmp1p, tmp2p);
|
two_phase_acc_reg = _mm_addsub_ps(tmp1p, tmp2p);
|
||||||
|
|
||||||
yl = _mm_moveldup_ps(z1); // Load yl with cr,cr,dr,dr
|
yl = _mm_moveldup_ps(z1); // Load yl with cr,cr,dr,dr
|
||||||
yh = _mm_movehdup_ps(z1);
|
yh = _mm_movehdup_ps(z1);
|
||||||
|
|
||||||
//next two samples
|
//next two samples
|
||||||
@ -227,7 +230,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(lv_32fc_
|
|||||||
|
|
||||||
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
||||||
{
|
{
|
||||||
a = _mm_loadu_ps((float*)&(_in_a[n_vec][number*2]));
|
a = _mm_loadu_ps((float*)&(_in_a[n_vec][number * 2]));
|
||||||
tmp1 = _mm_mul_ps(a, yl);
|
tmp1 = _mm_mul_ps(a, yl);
|
||||||
a = _mm_shuffle_ps(a, a, 0xB1);
|
a = _mm_shuffle_ps(a, a, 0xB1);
|
||||||
tmp2 = _mm_mul_ps(a, yh);
|
tmp2 = _mm_mul_ps(a, yh);
|
||||||
@ -247,8 +250,8 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(lv_32fc_
|
|||||||
|
|
||||||
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
||||||
{
|
{
|
||||||
_mm_store_ps((float*)dotProductVector, acc[n_vec]); // Store the results back into the dot product vector
|
_mm_store_ps((float*)dotProductVector, acc[n_vec]); // Store the results back into the dot product vector
|
||||||
dotProduct = lv_cmake(0,0);
|
dotProduct = lv_cmake(0, 0);
|
||||||
for (i = 0; i < 2; ++i)
|
for (i = 0; i < 2; ++i)
|
||||||
{
|
{
|
||||||
dotProduct = dotProduct + dotProductVector[i];
|
dotProduct = dotProduct + dotProductVector[i];
|
||||||
@ -260,7 +263,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(lv_32fc_
|
|||||||
_mm_store_ps((float*)two_phase_acc, two_phase_acc_reg);
|
_mm_store_ps((float*)two_phase_acc, two_phase_acc_reg);
|
||||||
(*phase) = two_phase_acc[0];
|
(*phase) = two_phase_acc[0];
|
||||||
|
|
||||||
for(n = sse_iters * 2; n < num_points; n++)
|
for (n = sse_iters * 2; n < num_points; n++)
|
||||||
{
|
{
|
||||||
tmp32_1 = in_common[n] * (*phase);
|
tmp32_1 = in_common[n] * (*phase);
|
||||||
(*phase) *= phase_inc;
|
(*phase) *= phase_inc;
|
||||||
@ -278,7 +281,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(lv_32fc_
|
|||||||
#include <pmmintrin.h>
|
#include <pmmintrin.h>
|
||||||
static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_32fc_t** in_a, int num_a_vectors, unsigned int num_points)
|
static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_32fc_t** in_a, int num_a_vectors, unsigned int num_points)
|
||||||
{
|
{
|
||||||
lv_32fc_t dotProduct = lv_cmake(0,0);
|
lv_32fc_t dotProduct = lv_cmake(0, 0);
|
||||||
lv_32fc_t tmp32_1, tmp32_2;
|
lv_32fc_t tmp32_1, tmp32_2;
|
||||||
const unsigned int sse_iters = num_points / 2;
|
const unsigned int sse_iters = num_points / 2;
|
||||||
int n_vec;
|
int n_vec;
|
||||||
@ -288,7 +291,8 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(lv_32fc_
|
|||||||
const lv_32fc_t** _in_a = in_a;
|
const lv_32fc_t** _in_a = in_a;
|
||||||
const lv_32fc_t* _in_common = in_common;
|
const lv_32fc_t* _in_common = in_common;
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2];
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
lv_32fc_t dotProductVector[2];
|
||||||
|
|
||||||
__m128* acc = (__m128*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128), volk_gnsssdr_get_alignment());
|
__m128* acc = (__m128*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128), volk_gnsssdr_get_alignment());
|
||||||
|
|
||||||
@ -300,11 +304,13 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(lv_32fc_
|
|||||||
// phase rotation registers
|
// phase rotation registers
|
||||||
__m128 a, two_phase_acc_reg, two_phase_inc_reg, yl, yh, tmp1, tmp1p, tmp2, tmp2p, z1;
|
__m128 a, two_phase_acc_reg, two_phase_inc_reg, yl, yh, tmp1, tmp1p, tmp2, tmp2p, z1;
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_inc[2];
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
lv_32fc_t two_phase_inc[2];
|
||||||
two_phase_inc[0] = phase_inc * phase_inc;
|
two_phase_inc[0] = phase_inc * phase_inc;
|
||||||
two_phase_inc[1] = phase_inc * phase_inc;
|
two_phase_inc[1] = phase_inc * phase_inc;
|
||||||
two_phase_inc_reg = _mm_load_ps((float*) two_phase_inc);
|
two_phase_inc_reg = _mm_load_ps((float*)two_phase_inc);
|
||||||
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_acc[2];
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
lv_32fc_t two_phase_acc[2];
|
||||||
two_phase_acc[0] = (*phase);
|
two_phase_acc[0] = (*phase);
|
||||||
two_phase_acc[1] = (*phase) * phase_inc;
|
two_phase_acc[1] = (*phase) * phase_inc;
|
||||||
two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc);
|
two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc);
|
||||||
@ -312,12 +318,12 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(lv_32fc_
|
|||||||
const __m128 ylp = _mm_moveldup_ps(two_phase_inc_reg);
|
const __m128 ylp = _mm_moveldup_ps(two_phase_inc_reg);
|
||||||
const __m128 yhp = _mm_movehdup_ps(two_phase_inc_reg);
|
const __m128 yhp = _mm_movehdup_ps(two_phase_inc_reg);
|
||||||
|
|
||||||
for(number = 0; number < sse_iters; number++)
|
for (number = 0; number < sse_iters; number++)
|
||||||
{
|
{
|
||||||
// Phase rotation on operand in_common starts here:
|
// Phase rotation on operand in_common starts here:
|
||||||
a = _mm_load_ps((float*)_in_common);
|
a = _mm_load_ps((float*)_in_common);
|
||||||
// __VOLK_GNSSSDR_PREFETCH(_in_common + 4);
|
// __VOLK_GNSSSDR_PREFETCH(_in_common + 4);
|
||||||
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
||||||
yh = _mm_movehdup_ps(two_phase_acc_reg);
|
yh = _mm_movehdup_ps(two_phase_acc_reg);
|
||||||
tmp1 = _mm_mul_ps(a, yl);
|
tmp1 = _mm_mul_ps(a, yl);
|
||||||
tmp1p = _mm_mul_ps(two_phase_acc_reg, ylp);
|
tmp1p = _mm_mul_ps(two_phase_acc_reg, ylp);
|
||||||
@ -328,7 +334,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(lv_32fc_
|
|||||||
z1 = _mm_addsub_ps(tmp1, tmp2);
|
z1 = _mm_addsub_ps(tmp1, tmp2);
|
||||||
two_phase_acc_reg = _mm_addsub_ps(tmp1p, tmp2p);
|
two_phase_acc_reg = _mm_addsub_ps(tmp1p, tmp2p);
|
||||||
|
|
||||||
yl = _mm_moveldup_ps(z1); // Load yl with cr,cr,dr,dr
|
yl = _mm_moveldup_ps(z1); // Load yl with cr,cr,dr,dr
|
||||||
yh = _mm_movehdup_ps(z1);
|
yh = _mm_movehdup_ps(z1);
|
||||||
|
|
||||||
//next two samples
|
//next two samples
|
||||||
@ -336,7 +342,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(lv_32fc_
|
|||||||
|
|
||||||
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
||||||
{
|
{
|
||||||
a = _mm_load_ps((float*)&(_in_a[n_vec][number*2]));
|
a = _mm_load_ps((float*)&(_in_a[n_vec][number * 2]));
|
||||||
tmp1 = _mm_mul_ps(a, yl);
|
tmp1 = _mm_mul_ps(a, yl);
|
||||||
a = _mm_shuffle_ps(a, a, 0xB1);
|
a = _mm_shuffle_ps(a, a, 0xB1);
|
||||||
tmp2 = _mm_mul_ps(a, yh);
|
tmp2 = _mm_mul_ps(a, yh);
|
||||||
@ -356,8 +362,8 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(lv_32fc_
|
|||||||
|
|
||||||
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
||||||
{
|
{
|
||||||
_mm_store_ps((float*)dotProductVector, acc[n_vec]); // Store the results back into the dot product vector
|
_mm_store_ps((float*)dotProductVector, acc[n_vec]); // Store the results back into the dot product vector
|
||||||
dotProduct = lv_cmake(0,0);
|
dotProduct = lv_cmake(0, 0);
|
||||||
for (i = 0; i < 2; ++i)
|
for (i = 0; i < 2; ++i)
|
||||||
{
|
{
|
||||||
dotProduct = dotProduct + dotProductVector[i];
|
dotProduct = dotProduct + dotProductVector[i];
|
||||||
@ -369,7 +375,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(lv_32fc_
|
|||||||
_mm_store_ps((float*)two_phase_acc, two_phase_acc_reg);
|
_mm_store_ps((float*)two_phase_acc, two_phase_acc_reg);
|
||||||
(*phase) = two_phase_acc[0];
|
(*phase) = two_phase_acc[0];
|
||||||
|
|
||||||
for(n = sse_iters * 2; n < num_points; n++)
|
for (n = sse_iters * 2; n < num_points; n++)
|
||||||
{
|
{
|
||||||
tmp32_1 = in_common[n] * (*phase);
|
tmp32_1 = in_common[n] * (*phase);
|
||||||
(*phase) *= phase_inc;
|
(*phase) *= phase_inc;
|
||||||
@ -387,7 +393,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(lv_32fc_
|
|||||||
#include <immintrin.h>
|
#include <immintrin.h>
|
||||||
static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_32fc_t** in_a, int num_a_vectors, unsigned int num_points)
|
static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_32fc_t** in_a, int num_a_vectors, unsigned int num_points)
|
||||||
{
|
{
|
||||||
lv_32fc_t dotProduct = lv_cmake(0,0);
|
lv_32fc_t dotProduct = lv_cmake(0, 0);
|
||||||
lv_32fc_t tmp32_1, tmp32_2;
|
lv_32fc_t tmp32_1, tmp32_2;
|
||||||
const unsigned int avx_iters = num_points / 4;
|
const unsigned int avx_iters = num_points / 4;
|
||||||
int n_vec;
|
int n_vec;
|
||||||
@ -398,7 +404,8 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t
|
|||||||
const lv_32fc_t* _in_common = in_common;
|
const lv_32fc_t* _in_common = in_common;
|
||||||
lv_32fc_t _phase = (*phase);
|
lv_32fc_t _phase = (*phase);
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4];
|
__VOLK_ATTR_ALIGNED(32)
|
||||||
|
lv_32fc_t dotProductVector[4];
|
||||||
|
|
||||||
__m256* acc = (__m256*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256), volk_gnsssdr_get_alignment());
|
__m256* acc = (__m256*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256), volk_gnsssdr_get_alignment());
|
||||||
|
|
||||||
@ -431,12 +438,12 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t
|
|||||||
const __m256 ylp = _mm256_moveldup_ps(four_phase_inc_reg);
|
const __m256 ylp = _mm256_moveldup_ps(four_phase_inc_reg);
|
||||||
const __m256 yhp = _mm256_movehdup_ps(four_phase_inc_reg);
|
const __m256 yhp = _mm256_movehdup_ps(four_phase_inc_reg);
|
||||||
|
|
||||||
for(number = 0; number < avx_iters; number++)
|
for (number = 0; number < avx_iters; number++)
|
||||||
{
|
{
|
||||||
// Phase rotation on operand in_common starts here:
|
// Phase rotation on operand in_common starts here:
|
||||||
a = _mm256_loadu_ps((float*)_in_common);
|
a = _mm256_loadu_ps((float*)_in_common);
|
||||||
__VOLK_GNSSSDR_PREFETCH(_in_common + 16);
|
__VOLK_GNSSSDR_PREFETCH(_in_common + 16);
|
||||||
yl = _mm256_moveldup_ps(four_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
yl = _mm256_moveldup_ps(four_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
||||||
yh = _mm256_movehdup_ps(four_phase_acc_reg);
|
yh = _mm256_movehdup_ps(four_phase_acc_reg);
|
||||||
tmp1 = _mm256_mul_ps(a, yl);
|
tmp1 = _mm256_mul_ps(a, yl);
|
||||||
tmp1p = _mm256_mul_ps(four_phase_acc_reg, ylp);
|
tmp1p = _mm256_mul_ps(four_phase_acc_reg, ylp);
|
||||||
@ -447,7 +454,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t
|
|||||||
z = _mm256_addsub_ps(tmp1, tmp2);
|
z = _mm256_addsub_ps(tmp1, tmp2);
|
||||||
four_phase_acc_reg = _mm256_addsub_ps(tmp1p, tmp2p);
|
four_phase_acc_reg = _mm256_addsub_ps(tmp1p, tmp2p);
|
||||||
|
|
||||||
yl = _mm256_moveldup_ps(z); // Load yl with cr,cr,dr,dr
|
yl = _mm256_moveldup_ps(z); // Load yl with cr,cr,dr,dr
|
||||||
yh = _mm256_movehdup_ps(z);
|
yh = _mm256_movehdup_ps(z);
|
||||||
|
|
||||||
//next two samples
|
//next two samples
|
||||||
@ -475,8 +482,8 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t
|
|||||||
|
|
||||||
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
||||||
{
|
{
|
||||||
_mm256_store_ps((float*)dotProductVector, acc[n_vec]); // Store the results back into the dot product vector
|
_mm256_store_ps((float*)dotProductVector, acc[n_vec]); // Store the results back into the dot product vector
|
||||||
dotProduct = lv_cmake(0,0);
|
dotProduct = lv_cmake(0, 0);
|
||||||
for (i = 0; i < 4; ++i)
|
for (i = 0; i < 4; ++i)
|
||||||
{
|
{
|
||||||
dotProduct = dotProduct + dotProductVector[i];
|
dotProduct = dotProduct + dotProductVector[i];
|
||||||
@ -492,10 +499,10 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t
|
|||||||
four_phase_acc_reg = _mm256_div_ps(four_phase_acc_reg, tmp2);
|
four_phase_acc_reg = _mm256_div_ps(four_phase_acc_reg, tmp2);
|
||||||
|
|
||||||
_mm256_store_ps((float*)four_phase_acc, four_phase_acc_reg);
|
_mm256_store_ps((float*)four_phase_acc, four_phase_acc_reg);
|
||||||
_phase = four_phase_acc[0];
|
_phase = four_phase_acc[0];
|
||||||
_mm256_zeroupper();
|
_mm256_zeroupper();
|
||||||
|
|
||||||
for(n = avx_iters * 4; n < num_points; n++)
|
for (n = avx_iters * 4; n < num_points; n++)
|
||||||
{
|
{
|
||||||
tmp32_1 = *_in_common++ * _phase;
|
tmp32_1 = *_in_common++ * _phase;
|
||||||
_phase *= phase_inc;
|
_phase *= phase_inc;
|
||||||
@ -514,7 +521,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t
|
|||||||
#include <immintrin.h>
|
#include <immintrin.h>
|
||||||
static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_32fc_t** in_a, int num_a_vectors, unsigned int num_points)
|
static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_32fc_t** in_a, int num_a_vectors, unsigned int num_points)
|
||||||
{
|
{
|
||||||
lv_32fc_t dotProduct = lv_cmake(0,0);
|
lv_32fc_t dotProduct = lv_cmake(0, 0);
|
||||||
lv_32fc_t tmp32_1, tmp32_2;
|
lv_32fc_t tmp32_1, tmp32_2;
|
||||||
const unsigned int avx_iters = num_points / 4;
|
const unsigned int avx_iters = num_points / 4;
|
||||||
int n_vec;
|
int n_vec;
|
||||||
@ -525,7 +532,8 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t
|
|||||||
const lv_32fc_t* _in_common = in_common;
|
const lv_32fc_t* _in_common = in_common;
|
||||||
lv_32fc_t _phase = (*phase);
|
lv_32fc_t _phase = (*phase);
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4];
|
__VOLK_ATTR_ALIGNED(32)
|
||||||
|
lv_32fc_t dotProductVector[4];
|
||||||
|
|
||||||
__m256* acc = (__m256*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256), volk_gnsssdr_get_alignment());
|
__m256* acc = (__m256*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256), volk_gnsssdr_get_alignment());
|
||||||
|
|
||||||
@ -538,7 +546,8 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t
|
|||||||
// phase rotation registers
|
// phase rotation registers
|
||||||
__m256 a, four_phase_acc_reg, yl, yh, tmp1, tmp1p, tmp2, tmp2p, z;
|
__m256 a, four_phase_acc_reg, yl, yh, tmp1, tmp1p, tmp2, tmp2p, z;
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(32) lv_32fc_t four_phase_inc[4];
|
__VOLK_ATTR_ALIGNED(32)
|
||||||
|
lv_32fc_t four_phase_inc[4];
|
||||||
const lv_32fc_t phase_inc2 = phase_inc * phase_inc;
|
const lv_32fc_t phase_inc2 = phase_inc * phase_inc;
|
||||||
const lv_32fc_t phase_inc3 = phase_inc2 * phase_inc;
|
const lv_32fc_t phase_inc3 = phase_inc2 * phase_inc;
|
||||||
const lv_32fc_t phase_inc4 = phase_inc3 * phase_inc;
|
const lv_32fc_t phase_inc4 = phase_inc3 * phase_inc;
|
||||||
@ -548,7 +557,8 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t
|
|||||||
four_phase_inc[3] = phase_inc4;
|
four_phase_inc[3] = phase_inc4;
|
||||||
const __m256 four_phase_inc_reg = _mm256_load_ps((float*)four_phase_inc);
|
const __m256 four_phase_inc_reg = _mm256_load_ps((float*)four_phase_inc);
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(32) lv_32fc_t four_phase_acc[4];
|
__VOLK_ATTR_ALIGNED(32)
|
||||||
|
lv_32fc_t four_phase_acc[4];
|
||||||
four_phase_acc[0] = _phase;
|
four_phase_acc[0] = _phase;
|
||||||
four_phase_acc[1] = _phase * phase_inc;
|
four_phase_acc[1] = _phase * phase_inc;
|
||||||
four_phase_acc[2] = _phase * phase_inc2;
|
four_phase_acc[2] = _phase * phase_inc2;
|
||||||
@ -558,12 +568,12 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t
|
|||||||
const __m256 ylp = _mm256_moveldup_ps(four_phase_inc_reg);
|
const __m256 ylp = _mm256_moveldup_ps(four_phase_inc_reg);
|
||||||
const __m256 yhp = _mm256_movehdup_ps(four_phase_inc_reg);
|
const __m256 yhp = _mm256_movehdup_ps(four_phase_inc_reg);
|
||||||
|
|
||||||
for(number = 0; number < avx_iters; number++)
|
for (number = 0; number < avx_iters; number++)
|
||||||
{
|
{
|
||||||
// Phase rotation on operand in_common starts here:
|
// Phase rotation on operand in_common starts here:
|
||||||
a = _mm256_load_ps((float*)_in_common);
|
a = _mm256_load_ps((float*)_in_common);
|
||||||
__VOLK_GNSSSDR_PREFETCH(_in_common + 16);
|
__VOLK_GNSSSDR_PREFETCH(_in_common + 16);
|
||||||
yl = _mm256_moveldup_ps(four_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
yl = _mm256_moveldup_ps(four_phase_acc_reg); // Load yl with cr,cr,dr,dr
|
||||||
yh = _mm256_movehdup_ps(four_phase_acc_reg);
|
yh = _mm256_movehdup_ps(four_phase_acc_reg);
|
||||||
tmp1 = _mm256_mul_ps(a, yl);
|
tmp1 = _mm256_mul_ps(a, yl);
|
||||||
tmp1p = _mm256_mul_ps(four_phase_acc_reg, ylp);
|
tmp1p = _mm256_mul_ps(four_phase_acc_reg, ylp);
|
||||||
@ -574,7 +584,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t
|
|||||||
z = _mm256_addsub_ps(tmp1, tmp2);
|
z = _mm256_addsub_ps(tmp1, tmp2);
|
||||||
four_phase_acc_reg = _mm256_addsub_ps(tmp1p, tmp2p);
|
four_phase_acc_reg = _mm256_addsub_ps(tmp1p, tmp2p);
|
||||||
|
|
||||||
yl = _mm256_moveldup_ps(z); // Load yl with cr,cr,dr,dr
|
yl = _mm256_moveldup_ps(z); // Load yl with cr,cr,dr,dr
|
||||||
yh = _mm256_movehdup_ps(z);
|
yh = _mm256_movehdup_ps(z);
|
||||||
|
|
||||||
//next two samples
|
//next two samples
|
||||||
@ -602,8 +612,8 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t
|
|||||||
|
|
||||||
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
||||||
{
|
{
|
||||||
_mm256_store_ps((float*)dotProductVector, acc[n_vec]); // Store the results back into the dot product vector
|
_mm256_store_ps((float*)dotProductVector, acc[n_vec]); // Store the results back into the dot product vector
|
||||||
dotProduct = lv_cmake(0,0);
|
dotProduct = lv_cmake(0, 0);
|
||||||
for (i = 0; i < 4; ++i)
|
for (i = 0; i < 4; ++i)
|
||||||
{
|
{
|
||||||
dotProduct = dotProduct + dotProductVector[i];
|
dotProduct = dotProduct + dotProductVector[i];
|
||||||
@ -619,10 +629,10 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t
|
|||||||
four_phase_acc_reg = _mm256_div_ps(four_phase_acc_reg, tmp2);
|
four_phase_acc_reg = _mm256_div_ps(four_phase_acc_reg, tmp2);
|
||||||
|
|
||||||
_mm256_store_ps((float*)four_phase_acc, four_phase_acc_reg);
|
_mm256_store_ps((float*)four_phase_acc, four_phase_acc_reg);
|
||||||
_phase = four_phase_acc[0];
|
_phase = four_phase_acc[0];
|
||||||
_mm256_zeroupper();
|
_mm256_zeroupper();
|
||||||
|
|
||||||
for(n = avx_iters * 4; n < num_points; n++)
|
for (n = avx_iters * 4; n < num_points; n++)
|
||||||
{
|
{
|
||||||
tmp32_1 = *_in_common++ * _phase;
|
tmp32_1 = *_in_common++ * _phase;
|
||||||
_phase *= phase_inc;
|
_phase *= phase_inc;
|
||||||
@ -646,7 +656,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_neon(lv_32fc_t*
|
|||||||
int n_vec;
|
int n_vec;
|
||||||
int i;
|
int i;
|
||||||
unsigned int number;
|
unsigned int number;
|
||||||
unsigned int n ;
|
unsigned int n;
|
||||||
const lv_32fc_t** _in_a = in_a;
|
const lv_32fc_t** _in_a = in_a;
|
||||||
const lv_32fc_t* _in_common = in_common;
|
const lv_32fc_t* _in_common = in_common;
|
||||||
lv_32fc_t* _out = result;
|
lv_32fc_t* _out = result;
|
||||||
@ -656,36 +666,41 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_neon(lv_32fc_t*
|
|||||||
|
|
||||||
if (neon_iters > 0)
|
if (neon_iters > 0)
|
||||||
{
|
{
|
||||||
lv_32fc_t dotProduct = lv_cmake(0,0);
|
lv_32fc_t dotProduct = lv_cmake(0, 0);
|
||||||
float32_t arg_phase0 = cargf(_phase);
|
float32_t arg_phase0 = cargf(_phase);
|
||||||
float32_t arg_phase_inc = cargf(phase_inc);
|
float32_t arg_phase_inc = cargf(phase_inc);
|
||||||
float32_t phase_est;
|
float32_t phase_est;
|
||||||
|
|
||||||
lv_32fc_t ___phase4 = phase_inc * phase_inc * phase_inc * phase_inc;
|
lv_32fc_t ___phase4 = phase_inc * phase_inc * phase_inc * phase_inc;
|
||||||
__VOLK_ATTR_ALIGNED(16) float32_t __phase4_real[4] = { lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4) };
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
__VOLK_ATTR_ALIGNED(16) float32_t __phase4_imag[4] = { lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4) };
|
float32_t __phase4_real[4] = {lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4)};
|
||||||
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
float32_t __phase4_imag[4] = {lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4)};
|
||||||
|
|
||||||
float32x4_t _phase4_real = vld1q_f32(__phase4_real);
|
float32x4_t _phase4_real = vld1q_f32(__phase4_real);
|
||||||
float32x4_t _phase4_imag = vld1q_f32(__phase4_imag);
|
float32x4_t _phase4_imag = vld1q_f32(__phase4_imag);
|
||||||
|
|
||||||
lv_32fc_t phase2 = (lv_32fc_t)(_phase) * phase_inc;
|
lv_32fc_t phase2 = (lv_32fc_t)(_phase)*phase_inc;
|
||||||
lv_32fc_t phase3 = phase2 * phase_inc;
|
lv_32fc_t phase3 = phase2 * phase_inc;
|
||||||
lv_32fc_t phase4 = phase3 * phase_inc;
|
lv_32fc_t phase4 = phase3 * phase_inc;
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(16) float32_t __phase_real[4] = { lv_creal((_phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) };
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
__VOLK_ATTR_ALIGNED(16) float32_t __phase_imag[4] = { lv_cimag((_phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) };
|
float32_t __phase_real[4] = {lv_creal((_phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4)};
|
||||||
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
float32_t __phase_imag[4] = {lv_cimag((_phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4)};
|
||||||
|
|
||||||
float32x4_t _phase_real = vld1q_f32(__phase_real);
|
float32x4_t _phase_real = vld1q_f32(__phase_real);
|
||||||
float32x4_t _phase_imag = vld1q_f32(__phase_imag);
|
float32x4_t _phase_imag = vld1q_f32(__phase_imag);
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4];
|
__VOLK_ATTR_ALIGNED(32)
|
||||||
|
lv_32fc_t dotProductVector[4];
|
||||||
|
|
||||||
float32x4x2_t a_val, b_val, tmp32_real, tmp32_imag;
|
float32x4x2_t a_val, b_val, tmp32_real, tmp32_imag;
|
||||||
|
|
||||||
float32x4x2_t* accumulator1 = (float32x4x2_t*)volk_gnsssdr_malloc(num_a_vectors * sizeof(float32x4x2_t), volk_gnsssdr_get_alignment());
|
float32x4x2_t* accumulator1 = (float32x4x2_t*)volk_gnsssdr_malloc(num_a_vectors * sizeof(float32x4x2_t), volk_gnsssdr_get_alignment());
|
||||||
float32x4x2_t* accumulator2 = (float32x4x2_t*)volk_gnsssdr_malloc(num_a_vectors * sizeof(float32x4x2_t), volk_gnsssdr_get_alignment());
|
float32x4x2_t* accumulator2 = (float32x4x2_t*)volk_gnsssdr_malloc(num_a_vectors * sizeof(float32x4x2_t), volk_gnsssdr_get_alignment());
|
||||||
|
|
||||||
for(n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
||||||
{
|
{
|
||||||
accumulator1[n_vec].val[0] = vdupq_n_f32(0.0f);
|
accumulator1[n_vec].val[0] = vdupq_n_f32(0.0f);
|
||||||
accumulator1[n_vec].val[1] = vdupq_n_f32(0.0f);
|
accumulator1[n_vec].val[1] = vdupq_n_f32(0.0f);
|
||||||
@ -693,7 +708,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_neon(lv_32fc_t*
|
|||||||
accumulator2[n_vec].val[1] = vdupq_n_f32(0.0f);
|
accumulator2[n_vec].val[1] = vdupq_n_f32(0.0f);
|
||||||
}
|
}
|
||||||
|
|
||||||
for(number = 0; number < neon_iters; number++)
|
for (number = 0; number < neon_iters; number++)
|
||||||
{
|
{
|
||||||
/* load 4 complex numbers (float 32 bits each component) */
|
/* load 4 complex numbers (float 32 bits each component) */
|
||||||
b_val = vld2q_f32((float32_t*)_in_common);
|
b_val = vld2q_f32((float32_t*)_in_common);
|
||||||
@ -728,8 +743,10 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_neon(lv_32fc_t*
|
|||||||
phase3 = phase2 * phase_inc;
|
phase3 = phase2 * phase_inc;
|
||||||
phase4 = phase3 * phase_inc;
|
phase4 = phase3 * phase_inc;
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(16) float32_t ____phase_real[4] = { lv_creal((_phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) };
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
__VOLK_ATTR_ALIGNED(16) float32_t ____phase_imag[4] = { lv_cimag((_phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) };
|
float32_t ____phase_real[4] = {lv_creal((_phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4)};
|
||||||
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
float32_t ____phase_imag[4] = {lv_cimag((_phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4)};
|
||||||
|
|
||||||
_phase_real = vld1q_f32(____phase_real);
|
_phase_real = vld1q_f32(____phase_real);
|
||||||
_phase_imag = vld1q_f32(____phase_imag);
|
_phase_imag = vld1q_f32(____phase_imag);
|
||||||
@ -753,8 +770,8 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_neon(lv_32fc_t*
|
|||||||
}
|
}
|
||||||
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
|
||||||
{
|
{
|
||||||
vst2q_f32((float32_t*)dotProductVector, accumulator1[n_vec]); // Store the results back into the dot product vector
|
vst2q_f32((float32_t*)dotProductVector, accumulator1[n_vec]); // Store the results back into the dot product vector
|
||||||
dotProduct = lv_cmake(0,0);
|
dotProduct = lv_cmake(0, 0);
|
||||||
for (i = 0; i < 4; ++i)
|
for (i = 0; i < 4; ++i)
|
||||||
{
|
{
|
||||||
dotProduct = dotProduct + dotProductVector[i];
|
dotProduct = dotProduct + dotProductVector[i];
|
||||||
@ -770,7 +787,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_neon(lv_32fc_t*
|
|||||||
_phase = lv_cmake((float32_t)__phase_real[0], (float32_t)__phase_imag[0]);
|
_phase = lv_cmake((float32_t)__phase_real[0], (float32_t)__phase_imag[0]);
|
||||||
}
|
}
|
||||||
|
|
||||||
for(n = neon_iters * 4; n < num_points; n++)
|
for (n = neon_iters * 4; n < num_points; n++)
|
||||||
{
|
{
|
||||||
tmp32_1 = in_common[n] * _phase;
|
tmp32_1 = in_common[n] * _phase;
|
||||||
_phase *= phase_inc;
|
_phase *= phase_inc;
|
||||||
@ -786,4 +803,3 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_neon(lv_32fc_t*
|
|||||||
#endif /* LV_HAVE_NEON */
|
#endif /* LV_HAVE_NEON */
|
||||||
|
|
||||||
#endif /* INCLUDED_volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_H */
|
#endif /* INCLUDED_volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_H */
|
||||||
|
|
||||||
|
@ -41,7 +41,7 @@
|
|||||||
#include <string.h>
|
#include <string.h>
|
||||||
|
|
||||||
#ifdef LV_HAVE_GENERIC
|
#ifdef LV_HAVE_GENERIC
|
||||||
static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_generic(lv_32fc_t* result, const lv_32fc_t* local_code, const lv_32fc_t* in, unsigned int num_points)
|
static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_generic(lv_32fc_t* result, const lv_32fc_t* local_code, const lv_32fc_t* in, unsigned int num_points)
|
||||||
{
|
{
|
||||||
// phases must be normalized. Phase rotator expects a complex exponential input!
|
// phases must be normalized. Phase rotator expects a complex exponential input!
|
||||||
float rem_carrier_phase_in_rad = 0.25;
|
float rem_carrier_phase_in_rad = 0.25;
|
||||||
@ -53,14 +53,14 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_generic(lv_
|
|||||||
unsigned int n;
|
unsigned int n;
|
||||||
int num_a_vectors = 3;
|
int num_a_vectors = 3;
|
||||||
lv_32fc_t** in_a = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
|
lv_32fc_t** in_a = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
|
||||||
for(n = 0; n < num_a_vectors; n++)
|
for (n = 0; n < num_a_vectors; n++)
|
||||||
{
|
{
|
||||||
in_a[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
|
in_a[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
|
||||||
memcpy((lv_32fc_t*)in_a[n], (lv_32fc_t*)in, sizeof(lv_32fc_t) * num_points);
|
memcpy((lv_32fc_t*)in_a[n], (lv_32fc_t*)in, sizeof(lv_32fc_t) * num_points);
|
||||||
}
|
}
|
||||||
volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_generic_reload(result, local_code, phase_inc[0], phase, (const lv_32fc_t**) in_a, num_a_vectors, num_points);
|
volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_generic_reload(result, local_code, phase_inc[0], phase, (const lv_32fc_t**)in_a, num_a_vectors, num_points);
|
||||||
|
|
||||||
for(n = 0; n < num_a_vectors; n++)
|
for (n = 0; n < num_a_vectors; n++)
|
||||||
{
|
{
|
||||||
volk_gnsssdr_free(in_a[n]);
|
volk_gnsssdr_free(in_a[n]);
|
||||||
}
|
}
|
||||||
@ -71,7 +71,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_generic(lv_
|
|||||||
|
|
||||||
|
|
||||||
#ifdef LV_HAVE_GENERIC
|
#ifdef LV_HAVE_GENERIC
|
||||||
static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_generic_reload(lv_32fc_t* result, const lv_32fc_t* local_code, const lv_32fc_t* in, unsigned int num_points)
|
static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_generic_reload(lv_32fc_t* result, const lv_32fc_t* local_code, const lv_32fc_t* in, unsigned int num_points)
|
||||||
{
|
{
|
||||||
// phases must be normalized. Phase rotator expects a complex exponential input!
|
// phases must be normalized. Phase rotator expects a complex exponential input!
|
||||||
float rem_carrier_phase_in_rad = 0.25;
|
float rem_carrier_phase_in_rad = 0.25;
|
||||||
@ -83,14 +83,14 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_generic_rel
|
|||||||
unsigned int n;
|
unsigned int n;
|
||||||
int num_a_vectors = 3;
|
int num_a_vectors = 3;
|
||||||
lv_32fc_t** in_a = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
|
lv_32fc_t** in_a = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
|
||||||
for(n = 0; n < num_a_vectors; n++)
|
for (n = 0; n < num_a_vectors; n++)
|
||||||
{
|
{
|
||||||
in_a[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
|
in_a[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
|
||||||
memcpy((lv_32fc_t*)in_a[n], (lv_32fc_t*)in, sizeof(lv_32fc_t) * num_points);
|
memcpy((lv_32fc_t*)in_a[n], (lv_32fc_t*)in, sizeof(lv_32fc_t) * num_points);
|
||||||
}
|
}
|
||||||
volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_generic_reload(result, local_code, phase_inc[0], phase, (const lv_32fc_t**) in_a, num_a_vectors, num_points);
|
volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_generic_reload(result, local_code, phase_inc[0], phase, (const lv_32fc_t**)in_a, num_a_vectors, num_points);
|
||||||
|
|
||||||
for(n = 0; n < num_a_vectors; n++)
|
for (n = 0; n < num_a_vectors; n++)
|
||||||
{
|
{
|
||||||
volk_gnsssdr_free(in_a[n]);
|
volk_gnsssdr_free(in_a[n]);
|
||||||
}
|
}
|
||||||
@ -101,7 +101,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_generic_rel
|
|||||||
|
|
||||||
|
|
||||||
#ifdef LV_HAVE_SSE3
|
#ifdef LV_HAVE_SSE3
|
||||||
static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_u_sse3(lv_32fc_t* result, const lv_32fc_t* local_code, const lv_32fc_t* in, unsigned int num_points)
|
static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_u_sse3(lv_32fc_t* result, const lv_32fc_t* local_code, const lv_32fc_t* in, unsigned int num_points)
|
||||||
{
|
{
|
||||||
// phases must be normalized. Phase rotator expects a complex exponential input!
|
// phases must be normalized. Phase rotator expects a complex exponential input!
|
||||||
float rem_carrier_phase_in_rad = 0.25;
|
float rem_carrier_phase_in_rad = 0.25;
|
||||||
@ -113,14 +113,14 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_u_sse3(lv_3
|
|||||||
unsigned int n;
|
unsigned int n;
|
||||||
int num_a_vectors = 3;
|
int num_a_vectors = 3;
|
||||||
lv_32fc_t** in_a = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
|
lv_32fc_t** in_a = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
|
||||||
for(n = 0; n < num_a_vectors; n++)
|
for (n = 0; n < num_a_vectors; n++)
|
||||||
{
|
{
|
||||||
in_a[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
|
in_a[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
|
||||||
memcpy((lv_32fc_t*)in_a[n], (lv_32fc_t*)in, sizeof(lv_32fc_t) * num_points);
|
memcpy((lv_32fc_t*)in_a[n], (lv_32fc_t*)in, sizeof(lv_32fc_t) * num_points);
|
||||||
}
|
}
|
||||||
volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(result, local_code, phase_inc[0], phase, (const lv_32fc_t**) in_a, num_a_vectors, num_points);
|
volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(result, local_code, phase_inc[0], phase, (const lv_32fc_t**)in_a, num_a_vectors, num_points);
|
||||||
|
|
||||||
for(n = 0; n < num_a_vectors; n++)
|
for (n = 0; n < num_a_vectors; n++)
|
||||||
{
|
{
|
||||||
volk_gnsssdr_free(in_a[n]);
|
volk_gnsssdr_free(in_a[n]);
|
||||||
}
|
}
|
||||||
@ -131,7 +131,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_u_sse3(lv_3
|
|||||||
|
|
||||||
|
|
||||||
#ifdef LV_HAVE_SSE3
|
#ifdef LV_HAVE_SSE3
|
||||||
static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_a_sse3(lv_32fc_t* result, const lv_32fc_t* local_code, const lv_32fc_t* in, unsigned int num_points)
|
static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_a_sse3(lv_32fc_t* result, const lv_32fc_t* local_code, const lv_32fc_t* in, unsigned int num_points)
|
||||||
{
|
{
|
||||||
// phases must be normalized. Phase rotator expects a complex exponential input!
|
// phases must be normalized. Phase rotator expects a complex exponential input!
|
||||||
float rem_carrier_phase_in_rad = 0.25;
|
float rem_carrier_phase_in_rad = 0.25;
|
||||||
@ -143,14 +143,14 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_a_sse3(lv_3
|
|||||||
unsigned int n;
|
unsigned int n;
|
||||||
int num_a_vectors = 3;
|
int num_a_vectors = 3;
|
||||||
lv_32fc_t** in_a = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
|
lv_32fc_t** in_a = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
|
||||||
for(n = 0; n < num_a_vectors; n++)
|
for (n = 0; n < num_a_vectors; n++)
|
||||||
{
|
{
|
||||||
in_a[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
|
in_a[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
|
||||||
memcpy((lv_32fc_t*)in_a[n], (lv_32fc_t*)in, sizeof(lv_32fc_t) * num_points);
|
memcpy((lv_32fc_t*)in_a[n], (lv_32fc_t*)in, sizeof(lv_32fc_t) * num_points);
|
||||||
}
|
}
|
||||||
volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(result, local_code, phase_inc[0], phase, (const lv_32fc_t**) in_a, num_a_vectors, num_points);
|
volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(result, local_code, phase_inc[0], phase, (const lv_32fc_t**)in_a, num_a_vectors, num_points);
|
||||||
|
|
||||||
for(n = 0; n < num_a_vectors; n++)
|
for (n = 0; n < num_a_vectors; n++)
|
||||||
{
|
{
|
||||||
volk_gnsssdr_free(in_a[n]);
|
volk_gnsssdr_free(in_a[n]);
|
||||||
}
|
}
|
||||||
@ -161,7 +161,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_a_sse3(lv_3
|
|||||||
|
|
||||||
|
|
||||||
#ifdef LV_HAVE_AVX
|
#ifdef LV_HAVE_AVX
|
||||||
static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_u_avx(lv_32fc_t* result, const lv_32fc_t* local_code, const lv_32fc_t* in, unsigned int num_points)
|
static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_u_avx(lv_32fc_t* result, const lv_32fc_t* local_code, const lv_32fc_t* in, unsigned int num_points)
|
||||||
{
|
{
|
||||||
// phases must be normalized. Phase rotator expects a complex exponential input!
|
// phases must be normalized. Phase rotator expects a complex exponential input!
|
||||||
float rem_carrier_phase_in_rad = 0.25;
|
float rem_carrier_phase_in_rad = 0.25;
|
||||||
@ -173,14 +173,14 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_u_avx(lv_32
|
|||||||
unsigned int n;
|
unsigned int n;
|
||||||
int num_a_vectors = 3;
|
int num_a_vectors = 3;
|
||||||
lv_32fc_t** in_a = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
|
lv_32fc_t** in_a = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
|
||||||
for(n = 0; n < num_a_vectors; n++)
|
for (n = 0; n < num_a_vectors; n++)
|
||||||
{
|
{
|
||||||
in_a[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
|
in_a[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
|
||||||
memcpy((lv_32fc_t*)in_a[n], (lv_32fc_t*)in, sizeof(lv_32fc_t) * num_points);
|
memcpy((lv_32fc_t*)in_a[n], (lv_32fc_t*)in, sizeof(lv_32fc_t) * num_points);
|
||||||
}
|
}
|
||||||
volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(result, local_code, phase_inc[0], phase, (const lv_32fc_t**) in_a, num_a_vectors, num_points);
|
volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(result, local_code, phase_inc[0], phase, (const lv_32fc_t**)in_a, num_a_vectors, num_points);
|
||||||
|
|
||||||
for(n = 0; n < num_a_vectors; n++)
|
for (n = 0; n < num_a_vectors; n++)
|
||||||
{
|
{
|
||||||
volk_gnsssdr_free(in_a[n]);
|
volk_gnsssdr_free(in_a[n]);
|
||||||
}
|
}
|
||||||
@ -191,7 +191,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_u_avx(lv_32
|
|||||||
|
|
||||||
|
|
||||||
#ifdef LV_HAVE_AVX
|
#ifdef LV_HAVE_AVX
|
||||||
static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_a_avx(lv_32fc_t* result, const lv_32fc_t* local_code, const lv_32fc_t* in, unsigned int num_points)
|
static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_a_avx(lv_32fc_t* result, const lv_32fc_t* local_code, const lv_32fc_t* in, unsigned int num_points)
|
||||||
{
|
{
|
||||||
// phases must be normalized. Phase rotator expects a complex exponential input!
|
// phases must be normalized. Phase rotator expects a complex exponential input!
|
||||||
float rem_carrier_phase_in_rad = 0.25;
|
float rem_carrier_phase_in_rad = 0.25;
|
||||||
@ -203,14 +203,14 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_a_avx(lv_32
|
|||||||
unsigned int n;
|
unsigned int n;
|
||||||
int num_a_vectors = 3;
|
int num_a_vectors = 3;
|
||||||
lv_32fc_t** in_a = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
|
lv_32fc_t** in_a = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
|
||||||
for(n = 0; n < num_a_vectors; n++)
|
for (n = 0; n < num_a_vectors; n++)
|
||||||
{
|
{
|
||||||
in_a[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
|
in_a[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
|
||||||
memcpy((lv_32fc_t*)in_a[n], (lv_32fc_t*)in, sizeof(lv_32fc_t) * num_points);
|
memcpy((lv_32fc_t*)in_a[n], (lv_32fc_t*)in, sizeof(lv_32fc_t) * num_points);
|
||||||
}
|
}
|
||||||
volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(result, local_code, phase_inc[0], phase, (const lv_32fc_t**) in_a, num_a_vectors, num_points);
|
volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(result, local_code, phase_inc[0], phase, (const lv_32fc_t**)in_a, num_a_vectors, num_points);
|
||||||
|
|
||||||
for(n = 0; n < num_a_vectors; n++)
|
for (n = 0; n < num_a_vectors; n++)
|
||||||
{
|
{
|
||||||
volk_gnsssdr_free(in_a[n]);
|
volk_gnsssdr_free(in_a[n]);
|
||||||
}
|
}
|
||||||
@ -221,7 +221,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_a_avx(lv_32
|
|||||||
|
|
||||||
|
|
||||||
#ifdef LV_HAVE_NEON
|
#ifdef LV_HAVE_NEON
|
||||||
static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_neon(lv_32fc_t* result, const lv_32fc_t* local_code, const lv_32fc_t* in, unsigned int num_points)
|
static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_neon(lv_32fc_t* result, const lv_32fc_t* local_code, const lv_32fc_t* in, unsigned int num_points)
|
||||||
{
|
{
|
||||||
// phases must be normalized. Phase rotator expects a complex exponential input!
|
// phases must be normalized. Phase rotator expects a complex exponential input!
|
||||||
float rem_carrier_phase_in_rad = 0.25;
|
float rem_carrier_phase_in_rad = 0.25;
|
||||||
@ -233,14 +233,14 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_neon(lv_32f
|
|||||||
unsigned int n;
|
unsigned int n;
|
||||||
int num_a_vectors = 3;
|
int num_a_vectors = 3;
|
||||||
lv_32fc_t** in_a = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
|
lv_32fc_t** in_a = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
|
||||||
for(n = 0; n < num_a_vectors; n++)
|
for (n = 0; n < num_a_vectors; n++)
|
||||||
{
|
{
|
||||||
in_a[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
|
in_a[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
|
||||||
memcpy((lv_32fc_t*)in_a[n], (lv_32fc_t*)in, sizeof(lv_32fc_t) * num_points);
|
memcpy((lv_32fc_t*)in_a[n], (lv_32fc_t*)in, sizeof(lv_32fc_t) * num_points);
|
||||||
}
|
}
|
||||||
volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_neon(result, local_code, phase_inc[0], phase, (const lv_32fc_t**) in_a, num_a_vectors, num_points);
|
volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_neon(result, local_code, phase_inc[0], phase, (const lv_32fc_t**)in_a, num_a_vectors, num_points);
|
||||||
|
|
||||||
for(n = 0; n < num_a_vectors; n++)
|
for (n = 0; n < num_a_vectors; n++)
|
||||||
{
|
{
|
||||||
volk_gnsssdr_free(in_a[n]);
|
volk_gnsssdr_free(in_a[n]);
|
||||||
}
|
}
|
||||||
|
@ -107,7 +107,8 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_sse3(lv_32fc_t** res
|
|||||||
const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
|
const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
|
||||||
const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
|
const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4];
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
int local_code_chip_index[4];
|
||||||
int local_code_chip_index_;
|
int local_code_chip_index_;
|
||||||
|
|
||||||
const __m128i zeros = _mm_setzero_si128();
|
const __m128i zeros = _mm_setzero_si128();
|
||||||
@ -121,7 +122,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_sse3(lv_32fc_t** res
|
|||||||
shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]);
|
shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]);
|
||||||
aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
|
aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
|
||||||
__m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
|
__m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
|
||||||
for(n = 0; n < quarterPoints; n++)
|
for (n = 0; n < quarterPoints; n++)
|
||||||
{
|
{
|
||||||
aux = _mm_mul_ps(code_phase_step_chips_reg, indexn);
|
aux = _mm_mul_ps(code_phase_step_chips_reg, indexn);
|
||||||
aux = _mm_add_ps(aux, aux2);
|
aux = _mm_add_ps(aux, aux2);
|
||||||
@ -142,18 +143,18 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_sse3(lv_32fc_t** res
|
|||||||
aux_i = _mm_and_si128(code_length_chips_reg_i, negatives);
|
aux_i = _mm_and_si128(code_length_chips_reg_i, negatives);
|
||||||
local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i);
|
local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i);
|
||||||
_mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg);
|
_mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg);
|
||||||
for(k = 0; k < 4; ++k)
|
for (k = 0; k < 4; ++k)
|
||||||
{
|
{
|
||||||
_result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
|
_result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
|
||||||
}
|
}
|
||||||
indexn = _mm_add_ps(indexn, fours);
|
indexn = _mm_add_ps(indexn, fours);
|
||||||
}
|
}
|
||||||
for(n = quarterPoints * 4; n < num_points; n++)
|
for (n = quarterPoints * 4; n < num_points; n++)
|
||||||
{
|
{
|
||||||
// resample code for current tap
|
// resample code for current tap
|
||||||
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
|
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
|
||||||
//Take into account that in multitap correlators, the shifts can be negative!
|
//Take into account that in multitap correlators, the shifts can be negative!
|
||||||
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ;
|
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
|
||||||
local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
|
local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
|
||||||
_result[current_correlator_tap][n] = local_code[local_code_chip_index_];
|
_result[current_correlator_tap][n] = local_code[local_code_chip_index_];
|
||||||
}
|
}
|
||||||
@ -177,7 +178,8 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_sse3(lv_32fc_t** res
|
|||||||
const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
|
const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
|
||||||
const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
|
const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4];
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
int local_code_chip_index[4];
|
||||||
int local_code_chip_index_;
|
int local_code_chip_index_;
|
||||||
|
|
||||||
const __m128i zeros = _mm_setzero_si128();
|
const __m128i zeros = _mm_setzero_si128();
|
||||||
@ -191,7 +193,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_sse3(lv_32fc_t** res
|
|||||||
shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]);
|
shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]);
|
||||||
aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
|
aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
|
||||||
__m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
|
__m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
|
||||||
for(n = 0; n < quarterPoints; n++)
|
for (n = 0; n < quarterPoints; n++)
|
||||||
{
|
{
|
||||||
aux = _mm_mul_ps(code_phase_step_chips_reg, indexn);
|
aux = _mm_mul_ps(code_phase_step_chips_reg, indexn);
|
||||||
aux = _mm_add_ps(aux, aux2);
|
aux = _mm_add_ps(aux, aux2);
|
||||||
@ -212,18 +214,18 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_sse3(lv_32fc_t** res
|
|||||||
aux_i = _mm_and_si128(code_length_chips_reg_i, negatives);
|
aux_i = _mm_and_si128(code_length_chips_reg_i, negatives);
|
||||||
local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i);
|
local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i);
|
||||||
_mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg);
|
_mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg);
|
||||||
for(k = 0; k < 4; ++k)
|
for (k = 0; k < 4; ++k)
|
||||||
{
|
{
|
||||||
_result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
|
_result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
|
||||||
}
|
}
|
||||||
indexn = _mm_add_ps(indexn, fours);
|
indexn = _mm_add_ps(indexn, fours);
|
||||||
}
|
}
|
||||||
for(n = quarterPoints * 4; n < num_points; n++)
|
for (n = quarterPoints * 4; n < num_points; n++)
|
||||||
{
|
{
|
||||||
// resample code for current tap
|
// resample code for current tap
|
||||||
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
|
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
|
||||||
//Take into account that in multitap correlators, the shifts can be negative!
|
//Take into account that in multitap correlators, the shifts can be negative!
|
||||||
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ;
|
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
|
||||||
local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
|
local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
|
||||||
_result[current_correlator_tap][n] = local_code[local_code_chip_index_];
|
_result[current_correlator_tap][n] = local_code[local_code_chip_index_];
|
||||||
}
|
}
|
||||||
@ -245,7 +247,8 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_sse4_1(lv_32fc_t** r
|
|||||||
const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
|
const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
|
||||||
const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
|
const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4];
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
int local_code_chip_index[4];
|
||||||
int local_code_chip_index_;
|
int local_code_chip_index_;
|
||||||
|
|
||||||
const __m128i zeros = _mm_setzero_si128();
|
const __m128i zeros = _mm_setzero_si128();
|
||||||
@ -259,7 +262,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_sse4_1(lv_32fc_t** r
|
|||||||
shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]);
|
shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]);
|
||||||
aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
|
aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
|
||||||
__m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
|
__m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
|
||||||
for(n = 0; n < quarterPoints; n++)
|
for (n = 0; n < quarterPoints; n++)
|
||||||
{
|
{
|
||||||
aux = _mm_mul_ps(code_phase_step_chips_reg, indexn);
|
aux = _mm_mul_ps(code_phase_step_chips_reg, indexn);
|
||||||
aux = _mm_add_ps(aux, aux2);
|
aux = _mm_add_ps(aux, aux2);
|
||||||
@ -277,18 +280,18 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_sse4_1(lv_32fc_t** r
|
|||||||
aux_i = _mm_and_si128(code_length_chips_reg_i, negatives);
|
aux_i = _mm_and_si128(code_length_chips_reg_i, negatives);
|
||||||
local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i);
|
local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i);
|
||||||
_mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg);
|
_mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg);
|
||||||
for(k = 0; k < 4; ++k)
|
for (k = 0; k < 4; ++k)
|
||||||
{
|
{
|
||||||
_result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
|
_result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
|
||||||
}
|
}
|
||||||
indexn = _mm_add_ps(indexn, fours);
|
indexn = _mm_add_ps(indexn, fours);
|
||||||
}
|
}
|
||||||
for(n = quarterPoints * 4; n < num_points; n++)
|
for (n = quarterPoints * 4; n < num_points; n++)
|
||||||
{
|
{
|
||||||
// resample code for current tap
|
// resample code for current tap
|
||||||
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
|
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
|
||||||
//Take into account that in multitap correlators, the shifts can be negative!
|
//Take into account that in multitap correlators, the shifts can be negative!
|
||||||
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ;
|
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
|
||||||
local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
|
local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
|
||||||
_result[current_correlator_tap][n] = local_code[local_code_chip_index_];
|
_result[current_correlator_tap][n] = local_code[local_code_chip_index_];
|
||||||
}
|
}
|
||||||
@ -311,7 +314,8 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_sse4_1(lv_32fc_t** r
|
|||||||
const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
|
const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
|
||||||
const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
|
const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4];
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
int local_code_chip_index[4];
|
||||||
int local_code_chip_index_;
|
int local_code_chip_index_;
|
||||||
|
|
||||||
const __m128i zeros = _mm_setzero_si128();
|
const __m128i zeros = _mm_setzero_si128();
|
||||||
@ -325,7 +329,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_sse4_1(lv_32fc_t** r
|
|||||||
shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]);
|
shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]);
|
||||||
aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
|
aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
|
||||||
__m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
|
__m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
|
||||||
for(n = 0; n < quarterPoints; n++)
|
for (n = 0; n < quarterPoints; n++)
|
||||||
{
|
{
|
||||||
aux = _mm_mul_ps(code_phase_step_chips_reg, indexn);
|
aux = _mm_mul_ps(code_phase_step_chips_reg, indexn);
|
||||||
aux = _mm_add_ps(aux, aux2);
|
aux = _mm_add_ps(aux, aux2);
|
||||||
@ -343,18 +347,18 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_sse4_1(lv_32fc_t** r
|
|||||||
aux_i = _mm_and_si128(code_length_chips_reg_i, negatives);
|
aux_i = _mm_and_si128(code_length_chips_reg_i, negatives);
|
||||||
local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i);
|
local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i);
|
||||||
_mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg);
|
_mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg);
|
||||||
for(k = 0; k < 4; ++k)
|
for (k = 0; k < 4; ++k)
|
||||||
{
|
{
|
||||||
_result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
|
_result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
|
||||||
}
|
}
|
||||||
indexn = _mm_add_ps(indexn, fours);
|
indexn = _mm_add_ps(indexn, fours);
|
||||||
}
|
}
|
||||||
for(n = quarterPoints * 4; n < num_points; n++)
|
for (n = quarterPoints * 4; n < num_points; n++)
|
||||||
{
|
{
|
||||||
// resample code for current tap
|
// resample code for current tap
|
||||||
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
|
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
|
||||||
//Take into account that in multitap correlators, the shifts can be negative!
|
//Take into account that in multitap correlators, the shifts can be negative!
|
||||||
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ;
|
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
|
||||||
local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
|
local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
|
||||||
_result[current_correlator_tap][n] = local_code[local_code_chip_index_];
|
_result[current_correlator_tap][n] = local_code[local_code_chip_index_];
|
||||||
}
|
}
|
||||||
@ -377,7 +381,8 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx(lv_32fc_t** resu
|
|||||||
const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips);
|
const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips);
|
||||||
const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips);
|
const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips);
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(32) int local_code_chip_index[8];
|
__VOLK_ATTR_ALIGNED(32)
|
||||||
|
int local_code_chip_index[8];
|
||||||
int local_code_chip_index_;
|
int local_code_chip_index_;
|
||||||
|
|
||||||
const __m256 zeros = _mm256_setzero_ps();
|
const __m256 zeros = _mm256_setzero_ps();
|
||||||
@ -392,7 +397,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx(lv_32fc_t** resu
|
|||||||
shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]);
|
shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]);
|
||||||
aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
|
aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
|
||||||
indexn = n0;
|
indexn = n0;
|
||||||
for(n = 0; n < avx_iters; n++)
|
for (n = 0; n < avx_iters; n++)
|
||||||
{
|
{
|
||||||
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0);
|
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0);
|
||||||
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3);
|
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3);
|
||||||
@ -410,13 +415,13 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx(lv_32fc_t** resu
|
|||||||
|
|
||||||
// no negatives
|
// no negatives
|
||||||
c = _mm256_cvtepi32_ps(local_code_chip_index_reg);
|
c = _mm256_cvtepi32_ps(local_code_chip_index_reg);
|
||||||
negatives = _mm256_cmp_ps(c, zeros, 0x01 );
|
negatives = _mm256_cmp_ps(c, zeros, 0x01);
|
||||||
aux3 = _mm256_and_ps(code_length_chips_reg_f, negatives);
|
aux3 = _mm256_and_ps(code_length_chips_reg_f, negatives);
|
||||||
aux = _mm256_add_ps(c, aux3);
|
aux = _mm256_add_ps(c, aux3);
|
||||||
local_code_chip_index_reg = _mm256_cvttps_epi32(aux);
|
local_code_chip_index_reg = _mm256_cvttps_epi32(aux);
|
||||||
|
|
||||||
_mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg);
|
_mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg);
|
||||||
for(k = 0; k < 8; ++k)
|
for (k = 0; k < 8; ++k)
|
||||||
{
|
{
|
||||||
_result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]];
|
_result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]];
|
||||||
}
|
}
|
||||||
@ -426,12 +431,12 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx(lv_32fc_t** resu
|
|||||||
_mm256_zeroupper();
|
_mm256_zeroupper();
|
||||||
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
|
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
|
||||||
{
|
{
|
||||||
for(n = avx_iters * 8; n < num_points; n++)
|
for (n = avx_iters * 8; n < num_points; n++)
|
||||||
{
|
{
|
||||||
// resample code for current tap
|
// resample code for current tap
|
||||||
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
|
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
|
||||||
//Take into account that in multitap correlators, the shifts can be negative!
|
//Take into account that in multitap correlators, the shifts can be negative!
|
||||||
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ;
|
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
|
||||||
local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
|
local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
|
||||||
_result[current_correlator_tap][n] = local_code[local_code_chip_index_];
|
_result[current_correlator_tap][n] = local_code[local_code_chip_index_];
|
||||||
}
|
}
|
||||||
@ -454,7 +459,8 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx(lv_32fc_t** resu
|
|||||||
const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips);
|
const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips);
|
||||||
const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips);
|
const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips);
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(32) int local_code_chip_index[8];
|
__VOLK_ATTR_ALIGNED(32)
|
||||||
|
int local_code_chip_index[8];
|
||||||
int local_code_chip_index_;
|
int local_code_chip_index_;
|
||||||
|
|
||||||
const __m256 zeros = _mm256_setzero_ps();
|
const __m256 zeros = _mm256_setzero_ps();
|
||||||
@ -469,7 +475,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx(lv_32fc_t** resu
|
|||||||
shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]);
|
shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]);
|
||||||
aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
|
aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
|
||||||
indexn = n0;
|
indexn = n0;
|
||||||
for(n = 0; n < avx_iters; n++)
|
for (n = 0; n < avx_iters; n++)
|
||||||
{
|
{
|
||||||
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0);
|
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0);
|
||||||
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3);
|
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3);
|
||||||
@ -487,13 +493,13 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx(lv_32fc_t** resu
|
|||||||
|
|
||||||
// no negatives
|
// no negatives
|
||||||
c = _mm256_cvtepi32_ps(local_code_chip_index_reg);
|
c = _mm256_cvtepi32_ps(local_code_chip_index_reg);
|
||||||
negatives = _mm256_cmp_ps(c, zeros, 0x01 );
|
negatives = _mm256_cmp_ps(c, zeros, 0x01);
|
||||||
aux3 = _mm256_and_ps(code_length_chips_reg_f, negatives);
|
aux3 = _mm256_and_ps(code_length_chips_reg_f, negatives);
|
||||||
aux = _mm256_add_ps(c, aux3);
|
aux = _mm256_add_ps(c, aux3);
|
||||||
local_code_chip_index_reg = _mm256_cvttps_epi32(aux);
|
local_code_chip_index_reg = _mm256_cvttps_epi32(aux);
|
||||||
|
|
||||||
_mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg);
|
_mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg);
|
||||||
for(k = 0; k < 8; ++k)
|
for (k = 0; k < 8; ++k)
|
||||||
{
|
{
|
||||||
_result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]];
|
_result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]];
|
||||||
}
|
}
|
||||||
@ -503,12 +509,12 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx(lv_32fc_t** resu
|
|||||||
_mm256_zeroupper();
|
_mm256_zeroupper();
|
||||||
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
|
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
|
||||||
{
|
{
|
||||||
for(n = avx_iters * 8; n < num_points; n++)
|
for (n = avx_iters * 8; n < num_points; n++)
|
||||||
{
|
{
|
||||||
// resample code for current tap
|
// resample code for current tap
|
||||||
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
|
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
|
||||||
//Take into account that in multitap correlators, the shifts can be negative!
|
//Take into account that in multitap correlators, the shifts can be negative!
|
||||||
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ;
|
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
|
||||||
local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
|
local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
|
||||||
_result[current_correlator_tap][n] = local_code[local_code_chip_index_];
|
_result[current_correlator_tap][n] = local_code[local_code_chip_index_];
|
||||||
}
|
}
|
||||||
@ -531,7 +537,8 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx2(lv_32fc_t** res
|
|||||||
const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips);
|
const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips);
|
||||||
const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips);
|
const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips);
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(32) int local_code_chip_index[8];
|
__VOLK_ATTR_ALIGNED(32)
|
||||||
|
int local_code_chip_index[8];
|
||||||
int local_code_chip_index_;
|
int local_code_chip_index_;
|
||||||
|
|
||||||
const __m256 zeros = _mm256_setzero_ps();
|
const __m256 zeros = _mm256_setzero_ps();
|
||||||
@ -546,7 +553,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx2(lv_32fc_t** res
|
|||||||
shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]);
|
shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]);
|
||||||
aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
|
aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
|
||||||
indexn = n0;
|
indexn = n0;
|
||||||
for(n = 0; n < avx_iters; n++)
|
for (n = 0; n < avx_iters; n++)
|
||||||
{
|
{
|
||||||
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0);
|
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0);
|
||||||
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3);
|
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3);
|
||||||
@ -565,13 +572,13 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx2(lv_32fc_t** res
|
|||||||
|
|
||||||
// no negatives
|
// no negatives
|
||||||
c = _mm256_cvtepi32_ps(local_code_chip_index_reg);
|
c = _mm256_cvtepi32_ps(local_code_chip_index_reg);
|
||||||
negatives = _mm256_cmp_ps(c, zeros, 0x01 );
|
negatives = _mm256_cmp_ps(c, zeros, 0x01);
|
||||||
aux3 = _mm256_and_ps(code_length_chips_reg_f, negatives);
|
aux3 = _mm256_and_ps(code_length_chips_reg_f, negatives);
|
||||||
aux = _mm256_add_ps(c, aux3);
|
aux = _mm256_add_ps(c, aux3);
|
||||||
local_code_chip_index_reg = _mm256_cvttps_epi32(aux);
|
local_code_chip_index_reg = _mm256_cvttps_epi32(aux);
|
||||||
|
|
||||||
_mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg);
|
_mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg);
|
||||||
for(k = 0; k < 8; ++k)
|
for (k = 0; k < 8; ++k)
|
||||||
{
|
{
|
||||||
_result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]];
|
_result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]];
|
||||||
}
|
}
|
||||||
@ -581,12 +588,12 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx2(lv_32fc_t** res
|
|||||||
_mm256_zeroupper();
|
_mm256_zeroupper();
|
||||||
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
|
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
|
||||||
{
|
{
|
||||||
for(n = avx_iters * 8; n < num_points; n++)
|
for (n = avx_iters * 8; n < num_points; n++)
|
||||||
{
|
{
|
||||||
// resample code for current tap
|
// resample code for current tap
|
||||||
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
|
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
|
||||||
//Take into account that in multitap correlators, the shifts can be negative!
|
//Take into account that in multitap correlators, the shifts can be negative!
|
||||||
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ;
|
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
|
||||||
local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
|
local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
|
||||||
_result[current_correlator_tap][n] = local_code[local_code_chip_index_];
|
_result[current_correlator_tap][n] = local_code[local_code_chip_index_];
|
||||||
}
|
}
|
||||||
@ -609,7 +616,8 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx2(lv_32fc_t** res
|
|||||||
const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips);
|
const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips);
|
||||||
const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips);
|
const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips);
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(32) int local_code_chip_index[8];
|
__VOLK_ATTR_ALIGNED(32)
|
||||||
|
int local_code_chip_index[8];
|
||||||
int local_code_chip_index_;
|
int local_code_chip_index_;
|
||||||
|
|
||||||
const __m256 zeros = _mm256_setzero_ps();
|
const __m256 zeros = _mm256_setzero_ps();
|
||||||
@ -624,7 +632,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx2(lv_32fc_t** res
|
|||||||
shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]);
|
shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]);
|
||||||
aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
|
aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
|
||||||
indexn = n0;
|
indexn = n0;
|
||||||
for(n = 0; n < avx_iters; n++)
|
for (n = 0; n < avx_iters; n++)
|
||||||
{
|
{
|
||||||
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0);
|
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0);
|
||||||
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3);
|
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3);
|
||||||
@ -643,13 +651,13 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx2(lv_32fc_t** res
|
|||||||
|
|
||||||
// no negatives
|
// no negatives
|
||||||
c = _mm256_cvtepi32_ps(local_code_chip_index_reg);
|
c = _mm256_cvtepi32_ps(local_code_chip_index_reg);
|
||||||
negatives = _mm256_cmp_ps(c, zeros, 0x01 );
|
negatives = _mm256_cmp_ps(c, zeros, 0x01);
|
||||||
aux3 = _mm256_and_ps(code_length_chips_reg_f, negatives);
|
aux3 = _mm256_and_ps(code_length_chips_reg_f, negatives);
|
||||||
aux = _mm256_add_ps(c, aux3);
|
aux = _mm256_add_ps(c, aux3);
|
||||||
local_code_chip_index_reg = _mm256_cvttps_epi32(aux);
|
local_code_chip_index_reg = _mm256_cvttps_epi32(aux);
|
||||||
|
|
||||||
_mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg);
|
_mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg);
|
||||||
for(k = 0; k < 8; ++k)
|
for (k = 0; k < 8; ++k)
|
||||||
{
|
{
|
||||||
_result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]];
|
_result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]];
|
||||||
}
|
}
|
||||||
@ -659,12 +667,12 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx2(lv_32fc_t** res
|
|||||||
_mm256_zeroupper();
|
_mm256_zeroupper();
|
||||||
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
|
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
|
||||||
{
|
{
|
||||||
for(n = avx_iters * 8; n < num_points; n++)
|
for (n = avx_iters * 8; n < num_points; n++)
|
||||||
{
|
{
|
||||||
// resample code for current tap
|
// resample code for current tap
|
||||||
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
|
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
|
||||||
//Take into account that in multitap correlators, the shifts can be negative!
|
//Take into account that in multitap correlators, the shifts can be negative!
|
||||||
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ;
|
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
|
||||||
local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
|
local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
|
||||||
_result[current_correlator_tap][n] = local_code[local_code_chip_index_];
|
_result[current_correlator_tap][n] = local_code[local_code_chip_index_];
|
||||||
}
|
}
|
||||||
@ -689,19 +697,21 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_neon(lv_32fc_t** resul
|
|||||||
const float32x4_t rem_code_phase_chips_reg = vdupq_n_f32(rem_code_phase_chips);
|
const float32x4_t rem_code_phase_chips_reg = vdupq_n_f32(rem_code_phase_chips);
|
||||||
const float32x4_t code_phase_step_chips_reg = vdupq_n_f32(code_phase_step_chips);
|
const float32x4_t code_phase_step_chips_reg = vdupq_n_f32(code_phase_step_chips);
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(16) int32_t local_code_chip_index[4];
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
int32_t local_code_chip_index[4];
|
||||||
int32_t local_code_chip_index_;
|
int32_t local_code_chip_index_;
|
||||||
|
|
||||||
const int32x4_t zeros = vdupq_n_s32(0);
|
const int32x4_t zeros = vdupq_n_s32(0);
|
||||||
const float32x4_t code_length_chips_reg_f = vdupq_n_f32((float)code_length_chips);
|
const float32x4_t code_length_chips_reg_f = vdupq_n_f32((float)code_length_chips);
|
||||||
const int32x4_t code_length_chips_reg_i = vdupq_n_s32((int32_t)code_length_chips);
|
const int32x4_t code_length_chips_reg_i = vdupq_n_s32((int32_t)code_length_chips);
|
||||||
int32x4_t local_code_chip_index_reg, aux_i, negatives, i;
|
int32x4_t local_code_chip_index_reg, aux_i, negatives, i;
|
||||||
float32x4_t aux, aux2, shifts_chips_reg, fi, c, j, cTrunc, base, indexn, reciprocal;
|
float32x4_t aux, aux2, shifts_chips_reg, fi, c, j, cTrunc, base, indexn, reciprocal;
|
||||||
__VOLK_ATTR_ALIGNED(16) const float vec[4] = { 0.0f, 1.0f, 2.0f, 3.0f };
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
const float vec[4] = {0.0f, 1.0f, 2.0f, 3.0f};
|
||||||
uint32x4_t igx;
|
uint32x4_t igx;
|
||||||
reciprocal = vrecpeq_f32(code_length_chips_reg_f);
|
reciprocal = vrecpeq_f32(code_length_chips_reg_f);
|
||||||
reciprocal = vmulq_f32(vrecpsq_f32(code_length_chips_reg_f, reciprocal), reciprocal);
|
reciprocal = vmulq_f32(vrecpsq_f32(code_length_chips_reg_f, reciprocal), reciprocal);
|
||||||
reciprocal = vmulq_f32(vrecpsq_f32(code_length_chips_reg_f, reciprocal), reciprocal); // this refinement is required!
|
reciprocal = vmulq_f32(vrecpsq_f32(code_length_chips_reg_f, reciprocal), reciprocal); // this refinement is required!
|
||||||
float32x4_t n0 = vld1q_f32((float*)vec);
|
float32x4_t n0 = vld1q_f32((float*)vec);
|
||||||
|
|
||||||
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
|
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
|
||||||
@ -709,7 +719,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_neon(lv_32fc_t** resul
|
|||||||
shifts_chips_reg = vdupq_n_f32((float)shifts_chips[current_correlator_tap]);
|
shifts_chips_reg = vdupq_n_f32((float)shifts_chips[current_correlator_tap]);
|
||||||
aux2 = vsubq_f32(shifts_chips_reg, rem_code_phase_chips_reg);
|
aux2 = vsubq_f32(shifts_chips_reg, rem_code_phase_chips_reg);
|
||||||
indexn = n0;
|
indexn = n0;
|
||||||
for(n = 0; n < neon_iters; n++)
|
for (n = 0; n < neon_iters; n++)
|
||||||
{
|
{
|
||||||
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][4 * n + 3], 1, 0);
|
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][4 * n + 3], 1, 0);
|
||||||
__VOLK_GNSSSDR_PREFETCH(&local_code_chip_index[4]);
|
__VOLK_GNSSSDR_PREFETCH(&local_code_chip_index[4]);
|
||||||
@ -725,7 +735,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_neon(lv_32fc_t** resul
|
|||||||
|
|
||||||
// fmod
|
// fmod
|
||||||
c = vmulq_f32(aux, reciprocal);
|
c = vmulq_f32(aux, reciprocal);
|
||||||
i = vcvtq_s32_f32(c);
|
i = vcvtq_s32_f32(c);
|
||||||
cTrunc = vcvtq_f32_s32(i);
|
cTrunc = vcvtq_f32_s32(i);
|
||||||
base = vmulq_f32(cTrunc, code_length_chips_reg_f);
|
base = vmulq_f32(cTrunc, code_length_chips_reg_f);
|
||||||
aux = vsubq_f32(aux, base);
|
aux = vsubq_f32(aux, base);
|
||||||
@ -737,13 +747,13 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_neon(lv_32fc_t** resul
|
|||||||
|
|
||||||
vst1q_s32((int32_t*)local_code_chip_index, local_code_chip_index_reg);
|
vst1q_s32((int32_t*)local_code_chip_index, local_code_chip_index_reg);
|
||||||
|
|
||||||
for(k = 0; k < 4; ++k)
|
for (k = 0; k < 4; ++k)
|
||||||
{
|
{
|
||||||
_result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
|
_result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
|
||||||
}
|
}
|
||||||
indexn = vaddq_f32(indexn, fours);
|
indexn = vaddq_f32(indexn, fours);
|
||||||
}
|
}
|
||||||
for(n = neon_iters * 4; n < num_points; n++)
|
for (n = neon_iters * 4; n < num_points; n++)
|
||||||
{
|
{
|
||||||
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][n], 1, 0);
|
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][n], 1, 0);
|
||||||
// resample code for current tap
|
// resample code for current tap
|
||||||
|
@ -69,11 +69,12 @@ static inline void volk_gnsssdr_64f_accumulator_64f_u_avx(double* result, const
|
|||||||
unsigned int i;
|
unsigned int i;
|
||||||
const double* aPtr = inputBuffer;
|
const double* aPtr = inputBuffer;
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(32) double tempBuffer[4];
|
__VOLK_ATTR_ALIGNED(32)
|
||||||
|
double tempBuffer[4];
|
||||||
__m256d accumulator = _mm256_setzero_pd();
|
__m256d accumulator = _mm256_setzero_pd();
|
||||||
__m256d aVal = _mm256_setzero_pd();
|
__m256d aVal = _mm256_setzero_pd();
|
||||||
|
|
||||||
for(number = 0; number < sse_iters; number++)
|
for (number = 0; number < sse_iters; number++)
|
||||||
{
|
{
|
||||||
aVal = _mm256_loadu_pd(aPtr);
|
aVal = _mm256_loadu_pd(aPtr);
|
||||||
accumulator = _mm256_add_pd(accumulator, aVal);
|
accumulator = _mm256_add_pd(accumulator, aVal);
|
||||||
@ -82,12 +83,12 @@ static inline void volk_gnsssdr_64f_accumulator_64f_u_avx(double* result, const
|
|||||||
|
|
||||||
_mm256_storeu_pd((double*)tempBuffer, accumulator);
|
_mm256_storeu_pd((double*)tempBuffer, accumulator);
|
||||||
|
|
||||||
for(i = 0; i < 4; ++i)
|
for (i = 0; i < 4; ++i)
|
||||||
{
|
{
|
||||||
returnValue += tempBuffer[i];
|
returnValue += tempBuffer[i];
|
||||||
}
|
}
|
||||||
|
|
||||||
for(i = 0; i < (num_points % 4); ++i)
|
for (i = 0; i < (num_points % 4); ++i)
|
||||||
{
|
{
|
||||||
returnValue += (*aPtr++);
|
returnValue += (*aPtr++);
|
||||||
}
|
}
|
||||||
@ -100,7 +101,7 @@ static inline void volk_gnsssdr_64f_accumulator_64f_u_avx(double* result, const
|
|||||||
#ifdef LV_HAVE_SSE3
|
#ifdef LV_HAVE_SSE3
|
||||||
#include <pmmintrin.h>
|
#include <pmmintrin.h>
|
||||||
|
|
||||||
static inline void volk_gnsssdr_64f_accumulator_64f_u_sse3(double* result,const double* inputBuffer, unsigned int num_points)
|
static inline void volk_gnsssdr_64f_accumulator_64f_u_sse3(double* result, const double* inputBuffer, unsigned int num_points)
|
||||||
{
|
{
|
||||||
double returnValue = 0;
|
double returnValue = 0;
|
||||||
const unsigned int sse_iters = num_points / 2;
|
const unsigned int sse_iters = num_points / 2;
|
||||||
@ -108,11 +109,12 @@ static inline void volk_gnsssdr_64f_accumulator_64f_u_sse3(double* result,const
|
|||||||
unsigned int i;
|
unsigned int i;
|
||||||
const double* aPtr = inputBuffer;
|
const double* aPtr = inputBuffer;
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(16) double tempBuffer[2];
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
double tempBuffer[2];
|
||||||
__m128d accumulator = _mm_setzero_pd();
|
__m128d accumulator = _mm_setzero_pd();
|
||||||
__m128d aVal = _mm_setzero_pd();
|
__m128d aVal = _mm_setzero_pd();
|
||||||
|
|
||||||
for(number = 0; number < sse_iters; number++)
|
for (number = 0; number < sse_iters; number++)
|
||||||
{
|
{
|
||||||
aVal = _mm_loadu_pd(aPtr);
|
aVal = _mm_loadu_pd(aPtr);
|
||||||
accumulator = _mm_add_pd(accumulator, aVal);
|
accumulator = _mm_add_pd(accumulator, aVal);
|
||||||
@ -121,12 +123,12 @@ static inline void volk_gnsssdr_64f_accumulator_64f_u_sse3(double* result,const
|
|||||||
|
|
||||||
_mm_storeu_pd((double*)tempBuffer, accumulator);
|
_mm_storeu_pd((double*)tempBuffer, accumulator);
|
||||||
|
|
||||||
for(i = 0; i < 2; ++i)
|
for (i = 0; i < 2; ++i)
|
||||||
{
|
{
|
||||||
returnValue += tempBuffer[i];
|
returnValue += tempBuffer[i];
|
||||||
}
|
}
|
||||||
|
|
||||||
for(i = 0; i < (num_points % 2); ++i)
|
for (i = 0; i < (num_points % 2); ++i)
|
||||||
{
|
{
|
||||||
returnValue += (*aPtr++);
|
returnValue += (*aPtr++);
|
||||||
}
|
}
|
||||||
@ -138,13 +140,13 @@ static inline void volk_gnsssdr_64f_accumulator_64f_u_sse3(double* result,const
|
|||||||
|
|
||||||
#ifdef LV_HAVE_GENERIC
|
#ifdef LV_HAVE_GENERIC
|
||||||
|
|
||||||
static inline void volk_gnsssdr_64f_accumulator_64f_generic(double* result,const double* inputBuffer, unsigned int num_points)
|
static inline void volk_gnsssdr_64f_accumulator_64f_generic(double* result, const double* inputBuffer, unsigned int num_points)
|
||||||
{
|
{
|
||||||
const double* aPtr = inputBuffer;
|
const double* aPtr = inputBuffer;
|
||||||
double returnValue = 0;
|
double returnValue = 0;
|
||||||
unsigned int number;
|
unsigned int number;
|
||||||
|
|
||||||
for(number = 0; number < num_points; number++)
|
for (number = 0; number < num_points; number++)
|
||||||
{
|
{
|
||||||
returnValue += (*aPtr++);
|
returnValue += (*aPtr++);
|
||||||
}
|
}
|
||||||
@ -156,7 +158,7 @@ static inline void volk_gnsssdr_64f_accumulator_64f_generic(double* result,const
|
|||||||
#ifdef LV_HAVE_AVX
|
#ifdef LV_HAVE_AVX
|
||||||
#include <immintrin.h>
|
#include <immintrin.h>
|
||||||
|
|
||||||
static inline void volk_gnsssdr_64f_accumulator_64f_a_avx(double* result,const double* inputBuffer, unsigned int num_points)
|
static inline void volk_gnsssdr_64f_accumulator_64f_a_avx(double* result, const double* inputBuffer, unsigned int num_points)
|
||||||
{
|
{
|
||||||
double returnValue = 0;
|
double returnValue = 0;
|
||||||
const unsigned int sse_iters = num_points / 4;
|
const unsigned int sse_iters = num_points / 4;
|
||||||
@ -164,11 +166,12 @@ static inline void volk_gnsssdr_64f_accumulator_64f_a_avx(double* result,const d
|
|||||||
unsigned int i;
|
unsigned int i;
|
||||||
const double* aPtr = inputBuffer;
|
const double* aPtr = inputBuffer;
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(32) double tempBuffer[4];
|
__VOLK_ATTR_ALIGNED(32)
|
||||||
|
double tempBuffer[4];
|
||||||
__m256d accumulator = _mm256_setzero_pd();
|
__m256d accumulator = _mm256_setzero_pd();
|
||||||
__m256d aVal = _mm256_setzero_pd();
|
__m256d aVal = _mm256_setzero_pd();
|
||||||
|
|
||||||
for(number = 0; number < sse_iters; number++)
|
for (number = 0; number < sse_iters; number++)
|
||||||
{
|
{
|
||||||
aVal = _mm256_load_pd(aPtr);
|
aVal = _mm256_load_pd(aPtr);
|
||||||
accumulator = _mm256_add_pd(accumulator, aVal);
|
accumulator = _mm256_add_pd(accumulator, aVal);
|
||||||
@ -177,12 +180,12 @@ static inline void volk_gnsssdr_64f_accumulator_64f_a_avx(double* result,const d
|
|||||||
|
|
||||||
_mm256_store_pd((double*)tempBuffer, accumulator);
|
_mm256_store_pd((double*)tempBuffer, accumulator);
|
||||||
|
|
||||||
for(i = 0; i < 4; ++i)
|
for (i = 0; i < 4; ++i)
|
||||||
{
|
{
|
||||||
returnValue += tempBuffer[i];
|
returnValue += tempBuffer[i];
|
||||||
}
|
}
|
||||||
|
|
||||||
for(i = 0; i < (num_points % 4); ++i)
|
for (i = 0; i < (num_points % 4); ++i)
|
||||||
{
|
{
|
||||||
returnValue += (*aPtr++);
|
returnValue += (*aPtr++);
|
||||||
}
|
}
|
||||||
@ -195,7 +198,7 @@ static inline void volk_gnsssdr_64f_accumulator_64f_a_avx(double* result,const d
|
|||||||
#ifdef LV_HAVE_SSE3
|
#ifdef LV_HAVE_SSE3
|
||||||
#include <pmmintrin.h>
|
#include <pmmintrin.h>
|
||||||
|
|
||||||
static inline void volk_gnsssdr_64f_accumulator_64f_a_sse3(double* result,const double* inputBuffer, unsigned int num_points)
|
static inline void volk_gnsssdr_64f_accumulator_64f_a_sse3(double* result, const double* inputBuffer, unsigned int num_points)
|
||||||
{
|
{
|
||||||
double returnValue = 0;
|
double returnValue = 0;
|
||||||
const unsigned int sse_iters = num_points / 2;
|
const unsigned int sse_iters = num_points / 2;
|
||||||
@ -203,11 +206,12 @@ static inline void volk_gnsssdr_64f_accumulator_64f_a_sse3(double* result,const
|
|||||||
unsigned int i;
|
unsigned int i;
|
||||||
const double* aPtr = inputBuffer;
|
const double* aPtr = inputBuffer;
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(16) double tempBuffer[2];
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
double tempBuffer[2];
|
||||||
__m128d accumulator = _mm_setzero_pd();
|
__m128d accumulator = _mm_setzero_pd();
|
||||||
__m128d aVal = _mm_setzero_pd();
|
__m128d aVal = _mm_setzero_pd();
|
||||||
|
|
||||||
for(number = 0; number < sse_iters; number++)
|
for (number = 0; number < sse_iters; number++)
|
||||||
{
|
{
|
||||||
aVal = _mm_load_pd(aPtr);
|
aVal = _mm_load_pd(aPtr);
|
||||||
accumulator = _mm_add_pd(accumulator, aVal);
|
accumulator = _mm_add_pd(accumulator, aVal);
|
||||||
@ -216,12 +220,12 @@ static inline void volk_gnsssdr_64f_accumulator_64f_a_sse3(double* result,const
|
|||||||
|
|
||||||
_mm_store_pd((double*)tempBuffer, accumulator);
|
_mm_store_pd((double*)tempBuffer, accumulator);
|
||||||
|
|
||||||
for(i = 0; i < 2; ++i)
|
for (i = 0; i < 2; ++i)
|
||||||
{
|
{
|
||||||
returnValue += tempBuffer[i];
|
returnValue += tempBuffer[i];
|
||||||
}
|
}
|
||||||
|
|
||||||
for(i = 0; i < (num_points % 2); ++i)
|
for (i = 0; i < (num_points % 2); ++i)
|
||||||
{
|
{
|
||||||
returnValue += (*aPtr++);
|
returnValue += (*aPtr++);
|
||||||
}
|
}
|
||||||
|
@ -70,11 +70,12 @@ static inline void volk_gnsssdr_8i_accumulator_s8i_u_sse3(char* result, const ch
|
|||||||
unsigned int i;
|
unsigned int i;
|
||||||
const char* aPtr = inputBuffer;
|
const char* aPtr = inputBuffer;
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(16) char tempBuffer[16];
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
char tempBuffer[16];
|
||||||
__m128i accumulator = _mm_setzero_si128();
|
__m128i accumulator = _mm_setzero_si128();
|
||||||
__m128i aVal = _mm_setzero_si128();
|
__m128i aVal = _mm_setzero_si128();
|
||||||
|
|
||||||
for(number = 0; number < sse_iters; number++)
|
for (number = 0; number < sse_iters; number++)
|
||||||
{
|
{
|
||||||
aVal = _mm_lddqu_si128((__m128i*)aPtr);
|
aVal = _mm_lddqu_si128((__m128i*)aPtr);
|
||||||
accumulator = _mm_add_epi8(accumulator, aVal);
|
accumulator = _mm_add_epi8(accumulator, aVal);
|
||||||
@ -82,12 +83,12 @@ static inline void volk_gnsssdr_8i_accumulator_s8i_u_sse3(char* result, const ch
|
|||||||
}
|
}
|
||||||
_mm_storeu_si128((__m128i*)tempBuffer, accumulator);
|
_mm_storeu_si128((__m128i*)tempBuffer, accumulator);
|
||||||
|
|
||||||
for(i = 0; i < 16; ++i)
|
for (i = 0; i < 16; ++i)
|
||||||
{
|
{
|
||||||
returnValue += tempBuffer[i];
|
returnValue += tempBuffer[i];
|
||||||
}
|
}
|
||||||
|
|
||||||
for(i = 0; i < (num_points % 16); ++i)
|
for (i = 0; i < (num_points % 16); ++i)
|
||||||
{
|
{
|
||||||
returnValue += (*aPtr++);
|
returnValue += (*aPtr++);
|
||||||
}
|
}
|
||||||
@ -104,7 +105,7 @@ static inline void volk_gnsssdr_8i_accumulator_s8i_generic(char* result, const c
|
|||||||
const char* aPtr = inputBuffer;
|
const char* aPtr = inputBuffer;
|
||||||
char returnValue = 0;
|
char returnValue = 0;
|
||||||
unsigned int number;
|
unsigned int number;
|
||||||
for(number = 0;number < num_points; number++)
|
for (number = 0; number < num_points; number++)
|
||||||
{
|
{
|
||||||
returnValue += (*aPtr++);
|
returnValue += (*aPtr++);
|
||||||
}
|
}
|
||||||
@ -125,24 +126,25 @@ static inline void volk_gnsssdr_8i_accumulator_s8i_a_sse3(char* result, const ch
|
|||||||
|
|
||||||
const char* aPtr = inputBuffer;
|
const char* aPtr = inputBuffer;
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(16) char tempBuffer[16];
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
char tempBuffer[16];
|
||||||
__m128i accumulator = _mm_setzero_si128();
|
__m128i accumulator = _mm_setzero_si128();
|
||||||
__m128i aVal = _mm_setzero_si128();
|
__m128i aVal = _mm_setzero_si128();
|
||||||
|
|
||||||
for(number = 0; number < sse_iters; number++)
|
for (number = 0; number < sse_iters; number++)
|
||||||
{
|
{
|
||||||
aVal = _mm_load_si128((__m128i*)aPtr);
|
aVal = _mm_load_si128((__m128i*)aPtr);
|
||||||
accumulator = _mm_add_epi8(accumulator, aVal);
|
accumulator = _mm_add_epi8(accumulator, aVal);
|
||||||
aPtr += 16;
|
aPtr += 16;
|
||||||
}
|
}
|
||||||
_mm_store_si128((__m128i*)tempBuffer,accumulator);
|
_mm_store_si128((__m128i*)tempBuffer, accumulator);
|
||||||
|
|
||||||
for(i = 0; i < 16; ++i)
|
for (i = 0; i < 16; ++i)
|
||||||
{
|
{
|
||||||
returnValue += tempBuffer[i];
|
returnValue += tempBuffer[i];
|
||||||
}
|
}
|
||||||
|
|
||||||
for(i = 0; i < (num_points % 16); ++i)
|
for (i = 0; i < (num_points % 16); ++i)
|
||||||
{
|
{
|
||||||
returnValue += (*aPtr++);
|
returnValue += (*aPtr++);
|
||||||
}
|
}
|
||||||
@ -164,24 +166,25 @@ static inline void volk_gnsssdr_8i_accumulator_s8i_a_avx2(char* result, const ch
|
|||||||
|
|
||||||
const char* aPtr = inputBuffer;
|
const char* aPtr = inputBuffer;
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(32) char tempBuffer[32];
|
__VOLK_ATTR_ALIGNED(32)
|
||||||
|
char tempBuffer[32];
|
||||||
__m256i accumulator = _mm256_setzero_si256();
|
__m256i accumulator = _mm256_setzero_si256();
|
||||||
__m256i aVal = _mm256_setzero_si256();
|
__m256i aVal = _mm256_setzero_si256();
|
||||||
|
|
||||||
for(number = 0; number < sse_iters; number++)
|
for (number = 0; number < sse_iters; number++)
|
||||||
{
|
{
|
||||||
aVal = _mm256_load_si256((__m256i*)aPtr);
|
aVal = _mm256_load_si256((__m256i*)aPtr);
|
||||||
accumulator = _mm256_add_epi8(accumulator, aVal);
|
accumulator = _mm256_add_epi8(accumulator, aVal);
|
||||||
aPtr += 32;
|
aPtr += 32;
|
||||||
}
|
}
|
||||||
_mm256_store_si256((__m256i*)tempBuffer,accumulator);
|
_mm256_store_si256((__m256i*)tempBuffer, accumulator);
|
||||||
|
|
||||||
for(i = 0; i < 32; ++i)
|
for (i = 0; i < 32; ++i)
|
||||||
{
|
{
|
||||||
returnValue += tempBuffer[i];
|
returnValue += tempBuffer[i];
|
||||||
}
|
}
|
||||||
|
|
||||||
for(i = 0; i < (num_points % 32); ++i)
|
for (i = 0; i < (num_points % 32); ++i)
|
||||||
{
|
{
|
||||||
returnValue += (*aPtr++);
|
returnValue += (*aPtr++);
|
||||||
}
|
}
|
||||||
@ -202,11 +205,12 @@ static inline void volk_gnsssdr_8i_accumulator_s8i_u_avx2(char* result, const ch
|
|||||||
unsigned int i;
|
unsigned int i;
|
||||||
const char* aPtr = inputBuffer;
|
const char* aPtr = inputBuffer;
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(32) char tempBuffer[32];
|
__VOLK_ATTR_ALIGNED(32)
|
||||||
|
char tempBuffer[32];
|
||||||
__m256i accumulator = _mm256_setzero_si256();
|
__m256i accumulator = _mm256_setzero_si256();
|
||||||
__m256i aVal = _mm256_setzero_si256();
|
__m256i aVal = _mm256_setzero_si256();
|
||||||
|
|
||||||
for(number = 0; number < sse_iters; number++)
|
for (number = 0; number < sse_iters; number++)
|
||||||
{
|
{
|
||||||
aVal = _mm256_lddqu_si256((__m256i*)aPtr);
|
aVal = _mm256_lddqu_si256((__m256i*)aPtr);
|
||||||
accumulator = _mm256_add_epi8(accumulator, aVal);
|
accumulator = _mm256_add_epi8(accumulator, aVal);
|
||||||
@ -214,12 +218,12 @@ static inline void volk_gnsssdr_8i_accumulator_s8i_u_avx2(char* result, const ch
|
|||||||
}
|
}
|
||||||
_mm256_storeu_si256((__m256i*)tempBuffer, accumulator);
|
_mm256_storeu_si256((__m256i*)tempBuffer, accumulator);
|
||||||
|
|
||||||
for(i = 0; i < 32; ++i)
|
for (i = 0; i < 32; ++i)
|
||||||
{
|
{
|
||||||
returnValue += tempBuffer[i];
|
returnValue += tempBuffer[i];
|
||||||
}
|
}
|
||||||
|
|
||||||
for(i = 0; i < (num_points % 32); ++i)
|
for (i = 0; i < (num_points % 32); ++i)
|
||||||
{
|
{
|
||||||
returnValue += (*aPtr++);
|
returnValue += (*aPtr++);
|
||||||
}
|
}
|
||||||
|
@ -60,11 +60,11 @@
|
|||||||
|
|
||||||
|
|
||||||
#ifdef LV_HAVE_AVX2
|
#ifdef LV_HAVE_AVX2
|
||||||
#include<immintrin.h>
|
#include <immintrin.h>
|
||||||
|
|
||||||
static inline void volk_gnsssdr_8i_index_max_16u_u_avx2(unsigned int* target, const char* src0, unsigned int num_points)
|
static inline void volk_gnsssdr_8i_index_max_16u_u_avx2(unsigned int* target, const char* src0, unsigned int num_points)
|
||||||
{
|
{
|
||||||
if(num_points > 0)
|
if (num_points > 0)
|
||||||
{
|
{
|
||||||
const unsigned int avx2_iters = num_points / 32;
|
const unsigned int avx2_iters = num_points / 32;
|
||||||
unsigned int number;
|
unsigned int number;
|
||||||
@ -74,14 +74,15 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_avx2(unsigned int* target, co
|
|||||||
char max = src0[0];
|
char max = src0[0];
|
||||||
unsigned int index = 0;
|
unsigned int index = 0;
|
||||||
unsigned int mask;
|
unsigned int mask;
|
||||||
__VOLK_ATTR_ALIGNED(32) char currentValuesBuffer[32];
|
__VOLK_ATTR_ALIGNED(32)
|
||||||
|
char currentValuesBuffer[32];
|
||||||
__m256i maxValues, compareResults, currentValues;
|
__m256i maxValues, compareResults, currentValues;
|
||||||
|
|
||||||
maxValues = _mm256_set1_epi8(max);
|
maxValues = _mm256_set1_epi8(max);
|
||||||
|
|
||||||
for(number = 0; number < avx2_iters; number++)
|
for (number = 0; number < avx2_iters; number++)
|
||||||
{
|
{
|
||||||
currentValues = _mm256_loadu_si256((__m256i*)inputPtr);
|
currentValues = _mm256_loadu_si256((__m256i*)inputPtr);
|
||||||
compareResults = _mm256_cmpgt_epi8(maxValues, currentValues);
|
compareResults = _mm256_cmpgt_epi8(maxValues, currentValues);
|
||||||
mask = _mm256_movemask_epi8(compareResults);
|
mask = _mm256_movemask_epi8(compareResults);
|
||||||
|
|
||||||
@ -94,7 +95,7 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_avx2(unsigned int* target, co
|
|||||||
{
|
{
|
||||||
if ((mask & 1) == 1)
|
if ((mask & 1) == 1)
|
||||||
{
|
{
|
||||||
if(currentValuesBuffer[i] > max)
|
if (currentValuesBuffer[i] > max)
|
||||||
{
|
{
|
||||||
index = inputPtr - basePtr + i;
|
index = inputPtr - basePtr + i;
|
||||||
max = currentValuesBuffer[i];
|
max = currentValuesBuffer[i];
|
||||||
@ -108,9 +109,9 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_avx2(unsigned int* target, co
|
|||||||
inputPtr += 32;
|
inputPtr += 32;
|
||||||
}
|
}
|
||||||
|
|
||||||
for(i = 0; i<(num_points % 32); ++i)
|
for (i = 0; i < (num_points % 32); ++i)
|
||||||
{
|
{
|
||||||
if(src0[i] > max)
|
if (src0[i] > max)
|
||||||
{
|
{
|
||||||
index = i;
|
index = i;
|
||||||
max = src0[i];
|
max = src0[i];
|
||||||
@ -128,7 +129,7 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_avx2(unsigned int* target, co
|
|||||||
|
|
||||||
static inline void volk_gnsssdr_8i_index_max_16u_u_avx(unsigned int* target, const char* src0, unsigned int num_points)
|
static inline void volk_gnsssdr_8i_index_max_16u_u_avx(unsigned int* target, const char* src0, unsigned int num_points)
|
||||||
{
|
{
|
||||||
if(num_points > 0)
|
if (num_points > 0)
|
||||||
{
|
{
|
||||||
const unsigned int sse_iters = num_points / 32;
|
const unsigned int sse_iters = num_points / 32;
|
||||||
unsigned int number;
|
unsigned int number;
|
||||||
@ -137,33 +138,34 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_avx(unsigned int* target, con
|
|||||||
char* inputPtr = (char*)src0;
|
char* inputPtr = (char*)src0;
|
||||||
char max = src0[0];
|
char max = src0[0];
|
||||||
unsigned int index = 0;
|
unsigned int index = 0;
|
||||||
__VOLK_ATTR_ALIGNED(32) char currentValuesBuffer[32];
|
__VOLK_ATTR_ALIGNED(32)
|
||||||
|
char currentValuesBuffer[32];
|
||||||
__m256i ones, compareResults, currentValues;
|
__m256i ones, compareResults, currentValues;
|
||||||
__m128i compareResultslo, compareResultshi, maxValues, lo, hi;
|
__m128i compareResultslo, compareResultshi, maxValues, lo, hi;
|
||||||
|
|
||||||
ones = _mm256_set1_epi8(0xFF);
|
ones = _mm256_set1_epi8(0xFF);
|
||||||
maxValues = _mm_set1_epi8(max);
|
maxValues = _mm_set1_epi8(max);
|
||||||
|
|
||||||
for(number = 0; number < sse_iters; number++)
|
for (number = 0; number < sse_iters; number++)
|
||||||
{
|
{
|
||||||
currentValues = _mm256_lddqu_si256((__m256i*)inputPtr);
|
currentValues = _mm256_lddqu_si256((__m256i*)inputPtr);
|
||||||
|
|
||||||
lo = _mm256_castsi256_si128(currentValues);
|
lo = _mm256_castsi256_si128(currentValues);
|
||||||
hi = _mm256_extractf128_si256(currentValues,1);
|
hi = _mm256_extractf128_si256(currentValues, 1);
|
||||||
|
|
||||||
compareResultslo = _mm_cmpgt_epi8(maxValues, lo);
|
compareResultslo = _mm_cmpgt_epi8(maxValues, lo);
|
||||||
compareResultshi = _mm_cmpgt_epi8(maxValues, hi);
|
compareResultshi = _mm_cmpgt_epi8(maxValues, hi);
|
||||||
|
|
||||||
//compareResults = _mm256_set_m128i(compareResultshi , compareResultslo); //not defined in some versions of immintrin.h
|
//compareResults = _mm256_set_m128i(compareResultshi , compareResultslo); //not defined in some versions of immintrin.h
|
||||||
compareResults = _mm256_insertf128_si256(_mm256_castsi128_si256(compareResultslo),(compareResultshi),1);
|
compareResults = _mm256_insertf128_si256(_mm256_castsi128_si256(compareResultslo), (compareResultshi), 1);
|
||||||
|
|
||||||
if (!_mm256_testc_si256(compareResults, ones))
|
if (!_mm256_testc_si256(compareResults, ones))
|
||||||
{
|
{
|
||||||
_mm256_storeu_si256((__m256i*)&currentValuesBuffer, currentValues);
|
_mm256_storeu_si256((__m256i*)&currentValuesBuffer, currentValues);
|
||||||
|
|
||||||
for(i = 0; i < 32; i++)
|
for (i = 0; i < 32; i++)
|
||||||
{
|
{
|
||||||
if(currentValuesBuffer[i] > max)
|
if (currentValuesBuffer[i] > max)
|
||||||
{
|
{
|
||||||
index = inputPtr - basePtr + i;
|
index = inputPtr - basePtr + i;
|
||||||
max = currentValuesBuffer[i];
|
max = currentValuesBuffer[i];
|
||||||
@ -175,9 +177,9 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_avx(unsigned int* target, con
|
|||||||
inputPtr += 32;
|
inputPtr += 32;
|
||||||
}
|
}
|
||||||
|
|
||||||
for(i = 0; i<(num_points % 32); ++i)
|
for (i = 0; i < (num_points % 32); ++i)
|
||||||
{
|
{
|
||||||
if(src0[i] > max)
|
if (src0[i] > max)
|
||||||
{
|
{
|
||||||
index = i;
|
index = i;
|
||||||
max = src0[i];
|
max = src0[i];
|
||||||
@ -195,7 +197,7 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_avx(unsigned int* target, con
|
|||||||
|
|
||||||
static inline void volk_gnsssdr_8i_index_max_16u_u_sse4_1(unsigned int* target, const char* src0, unsigned int num_points)
|
static inline void volk_gnsssdr_8i_index_max_16u_u_sse4_1(unsigned int* target, const char* src0, unsigned int num_points)
|
||||||
{
|
{
|
||||||
if(num_points > 0)
|
if (num_points > 0)
|
||||||
{
|
{
|
||||||
const unsigned int sse_iters = num_points / 16;
|
const unsigned int sse_iters = num_points / 16;
|
||||||
unsigned int number;
|
unsigned int number;
|
||||||
@ -204,14 +206,15 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_sse4_1(unsigned int* target,
|
|||||||
char* inputPtr = (char*)src0;
|
char* inputPtr = (char*)src0;
|
||||||
char max = src0[0];
|
char max = src0[0];
|
||||||
unsigned int index = 0;
|
unsigned int index = 0;
|
||||||
__VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16];
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
char currentValuesBuffer[16];
|
||||||
__m128i maxValues, compareResults, currentValues;
|
__m128i maxValues, compareResults, currentValues;
|
||||||
|
|
||||||
maxValues = _mm_set1_epi8(max);
|
maxValues = _mm_set1_epi8(max);
|
||||||
|
|
||||||
for(number = 0; number < sse_iters; number++)
|
for (number = 0; number < sse_iters; number++)
|
||||||
{
|
{
|
||||||
currentValues = _mm_lddqu_si128((__m128i*)inputPtr);
|
currentValues = _mm_lddqu_si128((__m128i*)inputPtr);
|
||||||
|
|
||||||
compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
|
compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
|
||||||
|
|
||||||
@ -219,9 +222,9 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_sse4_1(unsigned int* target,
|
|||||||
{
|
{
|
||||||
_mm_storeu_si128((__m128i*)&currentValuesBuffer, currentValues);
|
_mm_storeu_si128((__m128i*)&currentValuesBuffer, currentValues);
|
||||||
|
|
||||||
for(i = 0; i < 16; i++)
|
for (i = 0; i < 16; i++)
|
||||||
{
|
{
|
||||||
if(currentValuesBuffer[i] > max)
|
if (currentValuesBuffer[i] > max)
|
||||||
{
|
{
|
||||||
index = inputPtr - basePtr + i;
|
index = inputPtr - basePtr + i;
|
||||||
max = currentValuesBuffer[i];
|
max = currentValuesBuffer[i];
|
||||||
@ -233,9 +236,9 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_sse4_1(unsigned int* target,
|
|||||||
inputPtr += 16;
|
inputPtr += 16;
|
||||||
}
|
}
|
||||||
|
|
||||||
for(i = 0; i<(num_points % 16); ++i)
|
for (i = 0; i < (num_points % 16); ++i)
|
||||||
{
|
{
|
||||||
if(src0[i] > max)
|
if (src0[i] > max)
|
||||||
{
|
{
|
||||||
index = i;
|
index = i;
|
||||||
max = src0[i];
|
max = src0[i];
|
||||||
@ -249,11 +252,11 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_sse4_1(unsigned int* target,
|
|||||||
|
|
||||||
|
|
||||||
#ifdef LV_HAVE_SSE2
|
#ifdef LV_HAVE_SSE2
|
||||||
#include<emmintrin.h>
|
#include <emmintrin.h>
|
||||||
|
|
||||||
static inline void volk_gnsssdr_8i_index_max_16u_u_sse2(unsigned int* target, const char* src0, unsigned int num_points)
|
static inline void volk_gnsssdr_8i_index_max_16u_u_sse2(unsigned int* target, const char* src0, unsigned int num_points)
|
||||||
{
|
{
|
||||||
if(num_points > 0)
|
if (num_points > 0)
|
||||||
{
|
{
|
||||||
const unsigned int sse_iters = num_points / 16;
|
const unsigned int sse_iters = num_points / 16;
|
||||||
unsigned int number;
|
unsigned int number;
|
||||||
@ -263,14 +266,15 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_sse2(unsigned int* target, co
|
|||||||
char max = src0[0];
|
char max = src0[0];
|
||||||
unsigned int index = 0;
|
unsigned int index = 0;
|
||||||
unsigned short mask;
|
unsigned short mask;
|
||||||
__VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16];
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
char currentValuesBuffer[16];
|
||||||
__m128i maxValues, compareResults, currentValues;
|
__m128i maxValues, compareResults, currentValues;
|
||||||
|
|
||||||
maxValues = _mm_set1_epi8(max);
|
maxValues = _mm_set1_epi8(max);
|
||||||
|
|
||||||
for(number = 0; number < sse_iters; number++)
|
for (number = 0; number < sse_iters; number++)
|
||||||
{
|
{
|
||||||
currentValues = _mm_loadu_si128((__m128i*)inputPtr);
|
currentValues = _mm_loadu_si128((__m128i*)inputPtr);
|
||||||
compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
|
compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
|
||||||
mask = _mm_movemask_epi8(compareResults);
|
mask = _mm_movemask_epi8(compareResults);
|
||||||
|
|
||||||
@ -283,7 +287,7 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_sse2(unsigned int* target, co
|
|||||||
{
|
{
|
||||||
if ((mask & 1) == 1)
|
if ((mask & 1) == 1)
|
||||||
{
|
{
|
||||||
if(currentValuesBuffer[i] > max)
|
if (currentValuesBuffer[i] > max)
|
||||||
{
|
{
|
||||||
index = inputPtr - basePtr + i;
|
index = inputPtr - basePtr + i;
|
||||||
max = currentValuesBuffer[i];
|
max = currentValuesBuffer[i];
|
||||||
@ -297,9 +301,9 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_sse2(unsigned int* target, co
|
|||||||
inputPtr += 16;
|
inputPtr += 16;
|
||||||
}
|
}
|
||||||
|
|
||||||
for(i = 0; i<(num_points % 16); ++i)
|
for (i = 0; i < (num_points % 16); ++i)
|
||||||
{
|
{
|
||||||
if(src0[i] > max)
|
if (src0[i] > max)
|
||||||
{
|
{
|
||||||
index = i;
|
index = i;
|
||||||
max = src0[i];
|
max = src0[i];
|
||||||
@ -316,14 +320,14 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_sse2(unsigned int* target, co
|
|||||||
|
|
||||||
static inline void volk_gnsssdr_8i_index_max_16u_generic(unsigned int* target, const char* src0, unsigned int num_points)
|
static inline void volk_gnsssdr_8i_index_max_16u_generic(unsigned int* target, const char* src0, unsigned int num_points)
|
||||||
{
|
{
|
||||||
if(num_points > 0)
|
if (num_points > 0)
|
||||||
{
|
{
|
||||||
char max = src0[0];
|
char max = src0[0];
|
||||||
unsigned int index = 0;
|
unsigned int index = 0;
|
||||||
unsigned int i;
|
unsigned int i;
|
||||||
for(i = 1; i < num_points; ++i)
|
for (i = 1; i < num_points; ++i)
|
||||||
{
|
{
|
||||||
if(src0[i] > max)
|
if (src0[i] > max)
|
||||||
{
|
{
|
||||||
index = i;
|
index = i;
|
||||||
max = src0[i];
|
max = src0[i];
|
||||||
@ -337,11 +341,11 @@ static inline void volk_gnsssdr_8i_index_max_16u_generic(unsigned int* target, c
|
|||||||
|
|
||||||
|
|
||||||
#ifdef LV_HAVE_AVX2
|
#ifdef LV_HAVE_AVX2
|
||||||
#include<immintrin.h>
|
#include <immintrin.h>
|
||||||
|
|
||||||
static inline void volk_gnsssdr_8i_index_max_16u_a_avx2(unsigned int* target, const char* src0, unsigned int num_points)
|
static inline void volk_gnsssdr_8i_index_max_16u_a_avx2(unsigned int* target, const char* src0, unsigned int num_points)
|
||||||
{
|
{
|
||||||
if(num_points > 0)
|
if (num_points > 0)
|
||||||
{
|
{
|
||||||
const unsigned int avx2_iters = num_points / 32;
|
const unsigned int avx2_iters = num_points / 32;
|
||||||
unsigned int number;
|
unsigned int number;
|
||||||
@ -351,14 +355,15 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_avx2(unsigned int* target, co
|
|||||||
char max = src0[0];
|
char max = src0[0];
|
||||||
unsigned int index = 0;
|
unsigned int index = 0;
|
||||||
unsigned int mask;
|
unsigned int mask;
|
||||||
__VOLK_ATTR_ALIGNED(32) char currentValuesBuffer[32];
|
__VOLK_ATTR_ALIGNED(32)
|
||||||
|
char currentValuesBuffer[32];
|
||||||
__m256i maxValues, compareResults, currentValues;
|
__m256i maxValues, compareResults, currentValues;
|
||||||
|
|
||||||
maxValues = _mm256_set1_epi8(max);
|
maxValues = _mm256_set1_epi8(max);
|
||||||
|
|
||||||
for(number = 0; number < avx2_iters; number++)
|
for (number = 0; number < avx2_iters; number++)
|
||||||
{
|
{
|
||||||
currentValues = _mm256_load_si256((__m256i*)inputPtr);
|
currentValues = _mm256_load_si256((__m256i*)inputPtr);
|
||||||
compareResults = _mm256_cmpgt_epi8(maxValues, currentValues);
|
compareResults = _mm256_cmpgt_epi8(maxValues, currentValues);
|
||||||
mask = _mm256_movemask_epi8(compareResults);
|
mask = _mm256_movemask_epi8(compareResults);
|
||||||
|
|
||||||
@ -371,7 +376,7 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_avx2(unsigned int* target, co
|
|||||||
{
|
{
|
||||||
if ((mask & 1) == 1)
|
if ((mask & 1) == 1)
|
||||||
{
|
{
|
||||||
if(currentValuesBuffer[i] > max)
|
if (currentValuesBuffer[i] > max)
|
||||||
{
|
{
|
||||||
index = inputPtr - basePtr + i;
|
index = inputPtr - basePtr + i;
|
||||||
max = currentValuesBuffer[i];
|
max = currentValuesBuffer[i];
|
||||||
@ -385,9 +390,9 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_avx2(unsigned int* target, co
|
|||||||
inputPtr += 32;
|
inputPtr += 32;
|
||||||
}
|
}
|
||||||
|
|
||||||
for(i = 0; i<(num_points % 32); ++i)
|
for (i = 0; i < (num_points % 32); ++i)
|
||||||
{
|
{
|
||||||
if(src0[i] > max)
|
if (src0[i] > max)
|
||||||
{
|
{
|
||||||
index = i;
|
index = i;
|
||||||
max = src0[i];
|
max = src0[i];
|
||||||
@ -405,7 +410,7 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_avx2(unsigned int* target, co
|
|||||||
|
|
||||||
static inline void volk_gnsssdr_8i_index_max_16u_a_avx(unsigned int* target, const char* src0, unsigned int num_points)
|
static inline void volk_gnsssdr_8i_index_max_16u_a_avx(unsigned int* target, const char* src0, unsigned int num_points)
|
||||||
{
|
{
|
||||||
if(num_points > 0)
|
if (num_points > 0)
|
||||||
{
|
{
|
||||||
const unsigned int sse_iters = num_points / 32;
|
const unsigned int sse_iters = num_points / 32;
|
||||||
unsigned int number;
|
unsigned int number;
|
||||||
@ -414,19 +419,20 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_avx(unsigned int* target, con
|
|||||||
char* inputPtr = (char*)src0;
|
char* inputPtr = (char*)src0;
|
||||||
char max = src0[0];
|
char max = src0[0];
|
||||||
unsigned int index = 0;
|
unsigned int index = 0;
|
||||||
__VOLK_ATTR_ALIGNED(32) char currentValuesBuffer[32];
|
__VOLK_ATTR_ALIGNED(32)
|
||||||
|
char currentValuesBuffer[32];
|
||||||
__m256i ones, compareResults, currentValues;
|
__m256i ones, compareResults, currentValues;
|
||||||
__m128i compareResultslo, compareResultshi, maxValues, lo, hi;
|
__m128i compareResultslo, compareResultshi, maxValues, lo, hi;
|
||||||
|
|
||||||
ones = _mm256_set1_epi8(0xFF);
|
ones = _mm256_set1_epi8(0xFF);
|
||||||
maxValues = _mm_set1_epi8(max);
|
maxValues = _mm_set1_epi8(max);
|
||||||
|
|
||||||
for(number = 0; number < sse_iters; number++)
|
for (number = 0; number < sse_iters; number++)
|
||||||
{
|
{
|
||||||
currentValues = _mm256_load_si256((__m256i*)inputPtr);
|
currentValues = _mm256_load_si256((__m256i*)inputPtr);
|
||||||
|
|
||||||
lo = _mm256_castsi256_si128(currentValues);
|
lo = _mm256_castsi256_si128(currentValues);
|
||||||
hi = _mm256_extractf128_si256(currentValues,1);
|
hi = _mm256_extractf128_si256(currentValues, 1);
|
||||||
|
|
||||||
compareResultslo = _mm_cmpgt_epi8(maxValues, lo);
|
compareResultslo = _mm_cmpgt_epi8(maxValues, lo);
|
||||||
compareResultshi = _mm_cmpgt_epi8(maxValues, hi);
|
compareResultshi = _mm_cmpgt_epi8(maxValues, hi);
|
||||||
@ -438,9 +444,9 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_avx(unsigned int* target, con
|
|||||||
{
|
{
|
||||||
_mm256_store_si256((__m256i*)&currentValuesBuffer, currentValues);
|
_mm256_store_si256((__m256i*)&currentValuesBuffer, currentValues);
|
||||||
|
|
||||||
for(i = 0; i < 32; i++)
|
for (i = 0; i < 32; i++)
|
||||||
{
|
{
|
||||||
if(currentValuesBuffer[i] > max)
|
if (currentValuesBuffer[i] > max)
|
||||||
{
|
{
|
||||||
index = inputPtr - basePtr + i;
|
index = inputPtr - basePtr + i;
|
||||||
max = currentValuesBuffer[i];
|
max = currentValuesBuffer[i];
|
||||||
@ -452,9 +458,9 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_avx(unsigned int* target, con
|
|||||||
inputPtr += 32;
|
inputPtr += 32;
|
||||||
}
|
}
|
||||||
|
|
||||||
for(i = 0; i<(num_points % 32); ++i)
|
for (i = 0; i < (num_points % 32); ++i)
|
||||||
{
|
{
|
||||||
if(src0[i] > max)
|
if (src0[i] > max)
|
||||||
{
|
{
|
||||||
index = i;
|
index = i;
|
||||||
max = src0[i];
|
max = src0[i];
|
||||||
@ -472,7 +478,7 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_avx(unsigned int* target, con
|
|||||||
|
|
||||||
static inline void volk_gnsssdr_8i_index_max_16u_a_sse4_1(unsigned int* target, const char* src0, unsigned int num_points)
|
static inline void volk_gnsssdr_8i_index_max_16u_a_sse4_1(unsigned int* target, const char* src0, unsigned int num_points)
|
||||||
{
|
{
|
||||||
if(num_points > 0)
|
if (num_points > 0)
|
||||||
{
|
{
|
||||||
const unsigned int sse_iters = num_points / 16;
|
const unsigned int sse_iters = num_points / 16;
|
||||||
unsigned int number;
|
unsigned int number;
|
||||||
@ -481,14 +487,15 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_sse4_1(unsigned int* target,
|
|||||||
char* inputPtr = (char*)src0;
|
char* inputPtr = (char*)src0;
|
||||||
char max = src0[0];
|
char max = src0[0];
|
||||||
unsigned int index = 0;
|
unsigned int index = 0;
|
||||||
__VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16];
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
char currentValuesBuffer[16];
|
||||||
__m128i maxValues, compareResults, currentValues;
|
__m128i maxValues, compareResults, currentValues;
|
||||||
|
|
||||||
maxValues = _mm_set1_epi8(max);
|
maxValues = _mm_set1_epi8(max);
|
||||||
|
|
||||||
for(number = 0; number < sse_iters; number++)
|
for (number = 0; number < sse_iters; number++)
|
||||||
{
|
{
|
||||||
currentValues = _mm_load_si128((__m128i*)inputPtr);
|
currentValues = _mm_load_si128((__m128i*)inputPtr);
|
||||||
|
|
||||||
compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
|
compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
|
||||||
|
|
||||||
@ -496,9 +503,9 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_sse4_1(unsigned int* target,
|
|||||||
{
|
{
|
||||||
_mm_store_si128((__m128i*)&currentValuesBuffer, currentValues);
|
_mm_store_si128((__m128i*)&currentValuesBuffer, currentValues);
|
||||||
|
|
||||||
for(i = 0; i < 16; i++)
|
for (i = 0; i < 16; i++)
|
||||||
{
|
{
|
||||||
if(currentValuesBuffer[i] > max)
|
if (currentValuesBuffer[i] > max)
|
||||||
{
|
{
|
||||||
index = inputPtr - basePtr + i;
|
index = inputPtr - basePtr + i;
|
||||||
max = currentValuesBuffer[i];
|
max = currentValuesBuffer[i];
|
||||||
@ -510,9 +517,9 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_sse4_1(unsigned int* target,
|
|||||||
inputPtr += 16;
|
inputPtr += 16;
|
||||||
}
|
}
|
||||||
|
|
||||||
for(i = 0; i<(num_points % 16); ++i)
|
for (i = 0; i < (num_points % 16); ++i)
|
||||||
{
|
{
|
||||||
if(src0[i] > max)
|
if (src0[i] > max)
|
||||||
{
|
{
|
||||||
index = i;
|
index = i;
|
||||||
max = src0[i];
|
max = src0[i];
|
||||||
@ -530,7 +537,7 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_sse4_1(unsigned int* target,
|
|||||||
|
|
||||||
static inline void volk_gnsssdr_8i_index_max_16u_a_sse2(unsigned int* target, const char* src0, unsigned int num_points)
|
static inline void volk_gnsssdr_8i_index_max_16u_a_sse2(unsigned int* target, const char* src0, unsigned int num_points)
|
||||||
{
|
{
|
||||||
if(num_points > 0)
|
if (num_points > 0)
|
||||||
{
|
{
|
||||||
const unsigned int sse_iters = num_points / 16;
|
const unsigned int sse_iters = num_points / 16;
|
||||||
unsigned int number;
|
unsigned int number;
|
||||||
@ -540,14 +547,15 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_sse2(unsigned int* target, co
|
|||||||
char max = src0[0];
|
char max = src0[0];
|
||||||
unsigned int index = 0;
|
unsigned int index = 0;
|
||||||
unsigned short mask;
|
unsigned short mask;
|
||||||
__VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16];
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
char currentValuesBuffer[16];
|
||||||
__m128i maxValues, compareResults, currentValues;
|
__m128i maxValues, compareResults, currentValues;
|
||||||
|
|
||||||
maxValues = _mm_set1_epi8(max);
|
maxValues = _mm_set1_epi8(max);
|
||||||
|
|
||||||
for(number = 0; number < sse_iters; number++)
|
for (number = 0; number < sse_iters; number++)
|
||||||
{
|
{
|
||||||
currentValues = _mm_load_si128((__m128i*)inputPtr);
|
currentValues = _mm_load_si128((__m128i*)inputPtr);
|
||||||
compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
|
compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
|
||||||
mask = _mm_movemask_epi8(compareResults);
|
mask = _mm_movemask_epi8(compareResults);
|
||||||
|
|
||||||
@ -560,7 +568,7 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_sse2(unsigned int* target, co
|
|||||||
{
|
{
|
||||||
if ((mask & 1) == 1)
|
if ((mask & 1) == 1)
|
||||||
{
|
{
|
||||||
if(currentValuesBuffer[i] > max)
|
if (currentValuesBuffer[i] > max)
|
||||||
{
|
{
|
||||||
index = inputPtr - basePtr + i;
|
index = inputPtr - basePtr + i;
|
||||||
max = currentValuesBuffer[i];
|
max = currentValuesBuffer[i];
|
||||||
@ -574,9 +582,9 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_sse2(unsigned int* target, co
|
|||||||
inputPtr += 16;
|
inputPtr += 16;
|
||||||
}
|
}
|
||||||
|
|
||||||
for(i = 0; i<(num_points % 16); ++i)
|
for (i = 0; i < (num_points % 16); ++i)
|
||||||
{
|
{
|
||||||
if(src0[i] > max)
|
if (src0[i] > max)
|
||||||
{
|
{
|
||||||
index = i;
|
index = i;
|
||||||
max = src0[i];
|
max = src0[i];
|
||||||
|
@ -63,21 +63,22 @@
|
|||||||
|
|
||||||
static inline void volk_gnsssdr_8i_max_s8i_u_avx2(char* target, const char* src0, unsigned int num_points)
|
static inline void volk_gnsssdr_8i_max_s8i_u_avx2(char* target, const char* src0, unsigned int num_points)
|
||||||
{
|
{
|
||||||
if(num_points > 0)
|
if (num_points > 0)
|
||||||
{
|
{
|
||||||
const unsigned int avx_iters = num_points / 32;
|
const unsigned int avx_iters = num_points / 32;
|
||||||
unsigned int number;
|
unsigned int number;
|
||||||
unsigned int i;
|
unsigned int i;
|
||||||
char* inputPtr = (char*)src0;
|
char* inputPtr = (char*)src0;
|
||||||
char max = src0[0];
|
char max = src0[0];
|
||||||
__VOLK_ATTR_ALIGNED(32) char maxValuesBuffer[32];
|
__VOLK_ATTR_ALIGNED(32)
|
||||||
|
char maxValuesBuffer[32];
|
||||||
__m256i maxValues, compareResults, currentValues;
|
__m256i maxValues, compareResults, currentValues;
|
||||||
|
|
||||||
maxValues = _mm256_set1_epi8(max);
|
maxValues = _mm256_set1_epi8(max);
|
||||||
|
|
||||||
for(number = 0; number < avx_iters; number++)
|
for (number = 0; number < avx_iters; number++)
|
||||||
{
|
{
|
||||||
currentValues = _mm256_loadu_si256((__m256i*)inputPtr);
|
currentValues = _mm256_loadu_si256((__m256i*)inputPtr);
|
||||||
compareResults = _mm256_max_epi8(maxValues, currentValues);
|
compareResults = _mm256_max_epi8(maxValues, currentValues);
|
||||||
maxValues = compareResults;
|
maxValues = compareResults;
|
||||||
inputPtr += 32;
|
inputPtr += 32;
|
||||||
@ -85,17 +86,17 @@ static inline void volk_gnsssdr_8i_max_s8i_u_avx2(char* target, const char* src0
|
|||||||
|
|
||||||
_mm256_storeu_si256((__m256i*)maxValuesBuffer, maxValues);
|
_mm256_storeu_si256((__m256i*)maxValuesBuffer, maxValues);
|
||||||
|
|
||||||
for(i = 0; i < 32; ++i)
|
for (i = 0; i < 32; ++i)
|
||||||
{
|
{
|
||||||
if(maxValuesBuffer[i] > max)
|
if (maxValuesBuffer[i] > max)
|
||||||
{
|
{
|
||||||
max = maxValuesBuffer[i];
|
max = maxValuesBuffer[i];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for(i = avx_iters * 32; i < num_points; ++i)
|
for (i = avx_iters * 32; i < num_points; ++i)
|
||||||
{
|
{
|
||||||
if(src0[i] > max)
|
if (src0[i] > max)
|
||||||
{
|
{
|
||||||
max = src0[i];
|
max = src0[i];
|
||||||
}
|
}
|
||||||
@ -112,21 +113,22 @@ static inline void volk_gnsssdr_8i_max_s8i_u_avx2(char* target, const char* src0
|
|||||||
|
|
||||||
static inline void volk_gnsssdr_8i_max_s8i_u_sse4_1(char* target, const char* src0, unsigned int num_points)
|
static inline void volk_gnsssdr_8i_max_s8i_u_sse4_1(char* target, const char* src0, unsigned int num_points)
|
||||||
{
|
{
|
||||||
if(num_points > 0)
|
if (num_points > 0)
|
||||||
{
|
{
|
||||||
const unsigned int sse_iters = num_points / 16;
|
const unsigned int sse_iters = num_points / 16;
|
||||||
unsigned int number;
|
unsigned int number;
|
||||||
unsigned int i;
|
unsigned int i;
|
||||||
char* inputPtr = (char*)src0;
|
char* inputPtr = (char*)src0;
|
||||||
char max = src0[0];
|
char max = src0[0];
|
||||||
__VOLK_ATTR_ALIGNED(16) char maxValuesBuffer[16];
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
char maxValuesBuffer[16];
|
||||||
__m128i maxValues, compareResults, currentValues;
|
__m128i maxValues, compareResults, currentValues;
|
||||||
|
|
||||||
maxValues = _mm_set1_epi8(max);
|
maxValues = _mm_set1_epi8(max);
|
||||||
|
|
||||||
for(number = 0; number < sse_iters; number++)
|
for (number = 0; number < sse_iters; number++)
|
||||||
{
|
{
|
||||||
currentValues = _mm_loadu_si128((__m128i*)inputPtr);
|
currentValues = _mm_loadu_si128((__m128i*)inputPtr);
|
||||||
compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
|
compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
|
||||||
maxValues = _mm_blendv_epi8(currentValues, maxValues, compareResults);
|
maxValues = _mm_blendv_epi8(currentValues, maxValues, compareResults);
|
||||||
inputPtr += 16;
|
inputPtr += 16;
|
||||||
@ -134,17 +136,17 @@ static inline void volk_gnsssdr_8i_max_s8i_u_sse4_1(char* target, const char* sr
|
|||||||
|
|
||||||
_mm_storeu_si128((__m128i*)maxValuesBuffer, maxValues);
|
_mm_storeu_si128((__m128i*)maxValuesBuffer, maxValues);
|
||||||
|
|
||||||
for(i = 0; i < 16; ++i)
|
for (i = 0; i < 16; ++i)
|
||||||
{
|
{
|
||||||
if(maxValuesBuffer[i] > max)
|
if (maxValuesBuffer[i] > max)
|
||||||
{
|
{
|
||||||
max = maxValuesBuffer[i];
|
max = maxValuesBuffer[i];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for(i = sse_iters * 16; i < num_points; ++i)
|
for (i = sse_iters * 16; i < num_points; ++i)
|
||||||
{
|
{
|
||||||
if(src0[i] > max)
|
if (src0[i] > max)
|
||||||
{
|
{
|
||||||
max = src0[i];
|
max = src0[i];
|
||||||
}
|
}
|
||||||
@ -157,11 +159,11 @@ static inline void volk_gnsssdr_8i_max_s8i_u_sse4_1(char* target, const char* sr
|
|||||||
|
|
||||||
|
|
||||||
#ifdef LV_HAVE_SSE2
|
#ifdef LV_HAVE_SSE2
|
||||||
#include<emmintrin.h>
|
#include <emmintrin.h>
|
||||||
|
|
||||||
static inline void volk_gnsssdr_8i_max_s8i_u_sse2(char* target, const char* src0, unsigned int num_points)
|
static inline void volk_gnsssdr_8i_max_s8i_u_sse2(char* target, const char* src0, unsigned int num_points)
|
||||||
{
|
{
|
||||||
if(num_points > 0)
|
if (num_points > 0)
|
||||||
{
|
{
|
||||||
const unsigned int sse_iters = num_points / 16;
|
const unsigned int sse_iters = num_points / 16;
|
||||||
unsigned int number;
|
unsigned int number;
|
||||||
@ -169,14 +171,15 @@ static inline void volk_gnsssdr_8i_max_s8i_u_sse2(char* target, const char* src0
|
|||||||
char* inputPtr = (char*)src0;
|
char* inputPtr = (char*)src0;
|
||||||
char max = src0[0];
|
char max = src0[0];
|
||||||
unsigned short mask;
|
unsigned short mask;
|
||||||
__VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16];
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
char currentValuesBuffer[16];
|
||||||
__m128i maxValues, compareResults, currentValues;
|
__m128i maxValues, compareResults, currentValues;
|
||||||
|
|
||||||
maxValues = _mm_set1_epi8(max);
|
maxValues = _mm_set1_epi8(max);
|
||||||
|
|
||||||
for(number = 0; number < sse_iters; number++)
|
for (number = 0; number < sse_iters; number++)
|
||||||
{
|
{
|
||||||
currentValues = _mm_loadu_si128((__m128i*)inputPtr);
|
currentValues = _mm_loadu_si128((__m128i*)inputPtr);
|
||||||
compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
|
compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
|
||||||
mask = _mm_movemask_epi8(compareResults);
|
mask = _mm_movemask_epi8(compareResults);
|
||||||
|
|
||||||
@ -189,7 +192,7 @@ static inline void volk_gnsssdr_8i_max_s8i_u_sse2(char* target, const char* src0
|
|||||||
{
|
{
|
||||||
if ((mask & 1) == 1)
|
if ((mask & 1) == 1)
|
||||||
{
|
{
|
||||||
if(currentValuesBuffer[i] > max)
|
if (currentValuesBuffer[i] > max)
|
||||||
{
|
{
|
||||||
max = currentValuesBuffer[i];
|
max = currentValuesBuffer[i];
|
||||||
}
|
}
|
||||||
@ -202,9 +205,9 @@ static inline void volk_gnsssdr_8i_max_s8i_u_sse2(char* target, const char* src0
|
|||||||
inputPtr += 16;
|
inputPtr += 16;
|
||||||
}
|
}
|
||||||
|
|
||||||
for(i = sse_iters * 16; i < num_points; ++i)
|
for (i = sse_iters * 16; i < num_points; ++i)
|
||||||
{
|
{
|
||||||
if(src0[i] > max)
|
if (src0[i] > max)
|
||||||
{
|
{
|
||||||
max = src0[i];
|
max = src0[i];
|
||||||
}
|
}
|
||||||
@ -220,13 +223,13 @@ static inline void volk_gnsssdr_8i_max_s8i_u_sse2(char* target, const char* src0
|
|||||||
|
|
||||||
static inline void volk_gnsssdr_8i_max_s8i_generic(char* target, const char* src0, unsigned int num_points)
|
static inline void volk_gnsssdr_8i_max_s8i_generic(char* target, const char* src0, unsigned int num_points)
|
||||||
{
|
{
|
||||||
if(num_points > 0)
|
if (num_points > 0)
|
||||||
{
|
{
|
||||||
char max = src0[0];
|
char max = src0[0];
|
||||||
unsigned int i;
|
unsigned int i;
|
||||||
for(i = 1; i < num_points; ++i)
|
for (i = 1; i < num_points; ++i)
|
||||||
{
|
{
|
||||||
if(src0[i] > max)
|
if (src0[i] > max)
|
||||||
{
|
{
|
||||||
max = src0[i];
|
max = src0[i];
|
||||||
}
|
}
|
||||||
@ -243,21 +246,22 @@ static inline void volk_gnsssdr_8i_max_s8i_generic(char* target, const char* src
|
|||||||
|
|
||||||
static inline void volk_gnsssdr_8i_max_s8i_a_sse4_1(char* target, const char* src0, unsigned int num_points)
|
static inline void volk_gnsssdr_8i_max_s8i_a_sse4_1(char* target, const char* src0, unsigned int num_points)
|
||||||
{
|
{
|
||||||
if(num_points > 0)
|
if (num_points > 0)
|
||||||
{
|
{
|
||||||
const unsigned int sse_iters = num_points / 16;
|
const unsigned int sse_iters = num_points / 16;
|
||||||
unsigned int number;
|
unsigned int number;
|
||||||
unsigned int i;
|
unsigned int i;
|
||||||
char* inputPtr = (char*)src0;
|
char* inputPtr = (char*)src0;
|
||||||
char max = src0[0];
|
char max = src0[0];
|
||||||
__VOLK_ATTR_ALIGNED(16) char maxValuesBuffer[16];
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
char maxValuesBuffer[16];
|
||||||
__m128i maxValues, compareResults, currentValues;
|
__m128i maxValues, compareResults, currentValues;
|
||||||
|
|
||||||
maxValues = _mm_set1_epi8(max);
|
maxValues = _mm_set1_epi8(max);
|
||||||
|
|
||||||
for(number = 0; number < sse_iters; number++)
|
for (number = 0; number < sse_iters; number++)
|
||||||
{
|
{
|
||||||
currentValues = _mm_load_si128((__m128i*)inputPtr);
|
currentValues = _mm_load_si128((__m128i*)inputPtr);
|
||||||
compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
|
compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
|
||||||
maxValues = _mm_blendv_epi8(currentValues, maxValues, compareResults);
|
maxValues = _mm_blendv_epi8(currentValues, maxValues, compareResults);
|
||||||
inputPtr += 16;
|
inputPtr += 16;
|
||||||
@ -265,17 +269,17 @@ static inline void volk_gnsssdr_8i_max_s8i_a_sse4_1(char* target, const char* sr
|
|||||||
|
|
||||||
_mm_store_si128((__m128i*)maxValuesBuffer, maxValues);
|
_mm_store_si128((__m128i*)maxValuesBuffer, maxValues);
|
||||||
|
|
||||||
for(i = 0; i < 16; ++i)
|
for (i = 0; i < 16; ++i)
|
||||||
{
|
{
|
||||||
if(maxValuesBuffer[i] > max)
|
if (maxValuesBuffer[i] > max)
|
||||||
{
|
{
|
||||||
max = maxValuesBuffer[i];
|
max = maxValuesBuffer[i];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for(i = sse_iters * 16; i < num_points; ++i)
|
for (i = sse_iters * 16; i < num_points; ++i)
|
||||||
{
|
{
|
||||||
if(src0[i] > max)
|
if (src0[i] > max)
|
||||||
{
|
{
|
||||||
max = src0[i];
|
max = src0[i];
|
||||||
}
|
}
|
||||||
@ -292,39 +296,40 @@ static inline void volk_gnsssdr_8i_max_s8i_a_sse4_1(char* target, const char* sr
|
|||||||
|
|
||||||
static inline void volk_gnsssdr_8i_max_s8i_a_avx2(char* target, const char* src0, unsigned int num_points)
|
static inline void volk_gnsssdr_8i_max_s8i_a_avx2(char* target, const char* src0, unsigned int num_points)
|
||||||
{
|
{
|
||||||
if(num_points > 0)
|
if (num_points > 0)
|
||||||
{
|
{
|
||||||
const unsigned int avx_iters = num_points / 32;
|
const unsigned int avx_iters = num_points / 32;
|
||||||
unsigned int number;
|
unsigned int number;
|
||||||
unsigned int i;
|
unsigned int i;
|
||||||
char* inputPtr = (char*)src0;
|
char* inputPtr = (char*)src0;
|
||||||
char max = src0[0];
|
char max = src0[0];
|
||||||
__VOLK_ATTR_ALIGNED(32) char maxValuesBuffer[32];
|
__VOLK_ATTR_ALIGNED(32)
|
||||||
|
char maxValuesBuffer[32];
|
||||||
__m256i maxValues, compareResults, currentValues;
|
__m256i maxValues, compareResults, currentValues;
|
||||||
|
|
||||||
maxValues = _mm256_set1_epi8(max);
|
maxValues = _mm256_set1_epi8(max);
|
||||||
|
|
||||||
for(number = 0; number < avx_iters; number++)
|
for (number = 0; number < avx_iters; number++)
|
||||||
{
|
{
|
||||||
currentValues = _mm256_load_si256((__m256i*)inputPtr);
|
currentValues = _mm256_load_si256((__m256i*)inputPtr);
|
||||||
compareResults = _mm256_max_epi8(maxValues, currentValues);
|
compareResults = _mm256_max_epi8(maxValues, currentValues);
|
||||||
maxValues = compareResults; //_mm256_blendv_epi8(currentValues, maxValues, compareResults);
|
maxValues = compareResults; //_mm256_blendv_epi8(currentValues, maxValues, compareResults);
|
||||||
inputPtr += 32;
|
inputPtr += 32;
|
||||||
}
|
}
|
||||||
|
|
||||||
_mm256_store_si256((__m256i*)maxValuesBuffer, maxValues);
|
_mm256_store_si256((__m256i*)maxValuesBuffer, maxValues);
|
||||||
|
|
||||||
for(i = 0; i < 32; ++i)
|
for (i = 0; i < 32; ++i)
|
||||||
{
|
{
|
||||||
if(maxValuesBuffer[i] > max)
|
if (maxValuesBuffer[i] > max)
|
||||||
{
|
{
|
||||||
max = maxValuesBuffer[i];
|
max = maxValuesBuffer[i];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for(i = avx_iters * 32; i < num_points; ++i)
|
for (i = avx_iters * 32; i < num_points; ++i)
|
||||||
{
|
{
|
||||||
if(src0[i] > max)
|
if (src0[i] > max)
|
||||||
{
|
{
|
||||||
max = src0[i];
|
max = src0[i];
|
||||||
}
|
}
|
||||||
@ -341,7 +346,7 @@ static inline void volk_gnsssdr_8i_max_s8i_a_avx2(char* target, const char* src0
|
|||||||
|
|
||||||
static inline void volk_gnsssdr_8i_max_s8i_a_sse2(char* target, const char* src0, unsigned int num_points)
|
static inline void volk_gnsssdr_8i_max_s8i_a_sse2(char* target, const char* src0, unsigned int num_points)
|
||||||
{
|
{
|
||||||
if(num_points > 0)
|
if (num_points > 0)
|
||||||
{
|
{
|
||||||
const unsigned int sse_iters = num_points / 16;
|
const unsigned int sse_iters = num_points / 16;
|
||||||
unsigned int number;
|
unsigned int number;
|
||||||
@ -349,14 +354,15 @@ static inline void volk_gnsssdr_8i_max_s8i_a_sse2(char* target, const char* src0
|
|||||||
char* inputPtr = (char*)src0;
|
char* inputPtr = (char*)src0;
|
||||||
char max = src0[0];
|
char max = src0[0];
|
||||||
unsigned short mask;
|
unsigned short mask;
|
||||||
__VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16];
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
char currentValuesBuffer[16];
|
||||||
__m128i maxValues, compareResults, currentValues;
|
__m128i maxValues, compareResults, currentValues;
|
||||||
|
|
||||||
maxValues = _mm_set1_epi8(max);
|
maxValues = _mm_set1_epi8(max);
|
||||||
|
|
||||||
for(number = 0; number < sse_iters; number++)
|
for (number = 0; number < sse_iters; number++)
|
||||||
{
|
{
|
||||||
currentValues = _mm_load_si128((__m128i*)inputPtr);
|
currentValues = _mm_load_si128((__m128i*)inputPtr);
|
||||||
compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
|
compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
|
||||||
mask = _mm_movemask_epi8(compareResults);
|
mask = _mm_movemask_epi8(compareResults);
|
||||||
|
|
||||||
@ -369,7 +375,7 @@ static inline void volk_gnsssdr_8i_max_s8i_a_sse2(char* target, const char* src0
|
|||||||
{
|
{
|
||||||
if ((mask & 1) == 1)
|
if ((mask & 1) == 1)
|
||||||
{
|
{
|
||||||
if(currentValuesBuffer[i] > max)
|
if (currentValuesBuffer[i] > max)
|
||||||
{
|
{
|
||||||
max = currentValuesBuffer[i];
|
max = currentValuesBuffer[i];
|
||||||
}
|
}
|
||||||
@ -382,9 +388,9 @@ static inline void volk_gnsssdr_8i_max_s8i_a_sse2(char* target, const char* src0
|
|||||||
inputPtr += 16;
|
inputPtr += 16;
|
||||||
}
|
}
|
||||||
|
|
||||||
for(i = sse_iters * 16; i < num_points; ++i)
|
for (i = sse_iters * 16; i < num_points; ++i)
|
||||||
{
|
{
|
||||||
if(src0[i] > max)
|
if (src0[i] > max)
|
||||||
{
|
{
|
||||||
max = src0[i];
|
max = src0[i];
|
||||||
}
|
}
|
||||||
|
@ -72,21 +72,21 @@ static inline void volk_gnsssdr_8i_x2_add_8i_u_sse2(char* cVector, const char* a
|
|||||||
|
|
||||||
__m128i aVal, bVal, cVal;
|
__m128i aVal, bVal, cVal;
|
||||||
|
|
||||||
for(number = 0; number < sse_iters; number++)
|
for (number = 0; number < sse_iters; number++)
|
||||||
{
|
{
|
||||||
aVal = _mm_loadu_si128((__m128i*)aPtr);
|
aVal = _mm_loadu_si128((__m128i*)aPtr);
|
||||||
bVal = _mm_loadu_si128((__m128i*)bPtr);
|
bVal = _mm_loadu_si128((__m128i*)bPtr);
|
||||||
|
|
||||||
cVal = _mm_add_epi8(aVal, bVal);
|
cVal = _mm_add_epi8(aVal, bVal);
|
||||||
|
|
||||||
_mm_storeu_si128((__m128i*)cPtr, cVal); // Store the results back into the C container
|
_mm_storeu_si128((__m128i*)cPtr, cVal); // Store the results back into the C container
|
||||||
|
|
||||||
aPtr += 16;
|
aPtr += 16;
|
||||||
bPtr += 16;
|
bPtr += 16;
|
||||||
cPtr += 16;
|
cPtr += 16;
|
||||||
}
|
}
|
||||||
|
|
||||||
for(i = sse_iters * 16; i < num_points; ++i)
|
for (i = sse_iters * 16; i < num_points; ++i)
|
||||||
{
|
{
|
||||||
*cPtr++ = (*aPtr++) + (*bPtr++);
|
*cPtr++ = (*aPtr++) + (*bPtr++);
|
||||||
}
|
}
|
||||||
@ -108,21 +108,21 @@ static inline void volk_gnsssdr_8i_x2_add_8i_u_avx2(char* cVector, const char* a
|
|||||||
|
|
||||||
__m256i aVal, bVal, cVal;
|
__m256i aVal, bVal, cVal;
|
||||||
|
|
||||||
for(number = 0; number < avx_iters; number++)
|
for (number = 0; number < avx_iters; number++)
|
||||||
{
|
{
|
||||||
aVal = _mm256_loadu_si256((__m256i*)aPtr);
|
aVal = _mm256_loadu_si256((__m256i*)aPtr);
|
||||||
bVal = _mm256_loadu_si256((__m256i*)bPtr);
|
bVal = _mm256_loadu_si256((__m256i*)bPtr);
|
||||||
|
|
||||||
cVal = _mm256_add_epi8(aVal, bVal);
|
cVal = _mm256_add_epi8(aVal, bVal);
|
||||||
|
|
||||||
_mm256_storeu_si256((__m256i*)cPtr, cVal); // Store the results back into the C container
|
_mm256_storeu_si256((__m256i*)cPtr, cVal); // Store the results back into the C container
|
||||||
|
|
||||||
aPtr += 32;
|
aPtr += 32;
|
||||||
bPtr += 32;
|
bPtr += 32;
|
||||||
cPtr += 32;
|
cPtr += 32;
|
||||||
}
|
}
|
||||||
|
|
||||||
for(i = avx_iters * 32; i < num_points; ++i)
|
for (i = avx_iters * 32; i < num_points; ++i)
|
||||||
{
|
{
|
||||||
*cPtr++ = (*aPtr++) + (*bPtr++);
|
*cPtr++ = (*aPtr++) + (*bPtr++);
|
||||||
}
|
}
|
||||||
@ -139,7 +139,7 @@ static inline void volk_gnsssdr_8i_x2_add_8i_generic(char* cVector, const char*
|
|||||||
const char* bPtr = bVector;
|
const char* bPtr = bVector;
|
||||||
unsigned int number;
|
unsigned int number;
|
||||||
|
|
||||||
for(number = 0; number < num_points; number++)
|
for (number = 0; number < num_points; number++)
|
||||||
{
|
{
|
||||||
*cPtr++ = (*aPtr++) + (*bPtr++);
|
*cPtr++ = (*aPtr++) + (*bPtr++);
|
||||||
}
|
}
|
||||||
@ -161,21 +161,21 @@ static inline void volk_gnsssdr_8i_x2_add_8i_a_sse2(char* cVector, const char* a
|
|||||||
|
|
||||||
__m128i aVal, bVal, cVal;
|
__m128i aVal, bVal, cVal;
|
||||||
|
|
||||||
for(number = 0; number < sse_iters; number++)
|
for (number = 0; number < sse_iters; number++)
|
||||||
{
|
{
|
||||||
aVal = _mm_load_si128((__m128i*)aPtr);
|
aVal = _mm_load_si128((__m128i*)aPtr);
|
||||||
bVal = _mm_load_si128((__m128i*)bPtr);
|
bVal = _mm_load_si128((__m128i*)bPtr);
|
||||||
|
|
||||||
cVal = _mm_add_epi8(aVal, bVal);
|
cVal = _mm_add_epi8(aVal, bVal);
|
||||||
|
|
||||||
_mm_store_si128((__m128i*)cPtr, cVal); // Store the results back into the C container
|
_mm_store_si128((__m128i*)cPtr, cVal); // Store the results back into the C container
|
||||||
|
|
||||||
aPtr += 16;
|
aPtr += 16;
|
||||||
bPtr += 16;
|
bPtr += 16;
|
||||||
cPtr += 16;
|
cPtr += 16;
|
||||||
}
|
}
|
||||||
|
|
||||||
for(i = sse_iters * 16; i < num_points; ++i)
|
for (i = sse_iters * 16; i < num_points; ++i)
|
||||||
{
|
{
|
||||||
*cPtr++ = (*aPtr++) + (*bPtr++);
|
*cPtr++ = (*aPtr++) + (*bPtr++);
|
||||||
}
|
}
|
||||||
@ -197,21 +197,21 @@ static inline void volk_gnsssdr_8i_x2_add_8i_a_avx2(char* cVector, const char* a
|
|||||||
|
|
||||||
__m256i aVal, bVal, cVal;
|
__m256i aVal, bVal, cVal;
|
||||||
|
|
||||||
for(number = 0; number < avx_iters; number++)
|
for (number = 0; number < avx_iters; number++)
|
||||||
{
|
{
|
||||||
aVal = _mm256_load_si256((__m256i*)aPtr);
|
aVal = _mm256_load_si256((__m256i*)aPtr);
|
||||||
bVal = _mm256_load_si256((__m256i*)bPtr);
|
bVal = _mm256_load_si256((__m256i*)bPtr);
|
||||||
|
|
||||||
cVal = _mm256_add_epi8(aVal, bVal);
|
cVal = _mm256_add_epi8(aVal, bVal);
|
||||||
|
|
||||||
_mm256_store_si256((__m256i*)cPtr, cVal); // Store the results back into the C container
|
_mm256_store_si256((__m256i*)cPtr, cVal); // Store the results back into the C container
|
||||||
|
|
||||||
aPtr += 32;
|
aPtr += 32;
|
||||||
bPtr += 32;
|
bPtr += 32;
|
||||||
cPtr += 32;
|
cPtr += 32;
|
||||||
}
|
}
|
||||||
|
|
||||||
for(i = avx_iters * 32; i < num_points; ++i)
|
for (i = avx_iters * 32; i < num_points; ++i)
|
||||||
{
|
{
|
||||||
*cPtr++ = (*aPtr++) + (*bPtr++);
|
*cPtr++ = (*aPtr++) + (*bPtr++);
|
||||||
}
|
}
|
||||||
|
@ -111,10 +111,10 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_u_avx(lv_8sc_t* cVector, const
|
|||||||
tmp = _mm256_xor_ps(tmp, conjugator1);
|
tmp = _mm256_xor_ps(tmp, conjugator1);
|
||||||
tmp128lo = _mm256_castsi256_si128(_mm256_castps_si256(tmp));
|
tmp128lo = _mm256_castsi256_si128(_mm256_castps_si256(tmp));
|
||||||
tmp128lo = _mm_add_epi8(tmp128lo, conjugator2);
|
tmp128lo = _mm_add_epi8(tmp128lo, conjugator2);
|
||||||
tmp128hi = _mm256_extractf128_si256(_mm256_castps_si256(tmp),1);
|
tmp128hi = _mm256_extractf128_si256(_mm256_castps_si256(tmp), 1);
|
||||||
tmp128hi = _mm_add_epi8(tmp128hi, conjugator2);
|
tmp128hi = _mm_add_epi8(tmp128hi, conjugator2);
|
||||||
//tmp = _mm256_set_m128i(tmp128hi , tmp128lo); //not defined in some versions of immintrin.h
|
//tmp = _mm256_set_m128i(tmp128hi , tmp128lo); //not defined in some versions of immintrin.h
|
||||||
tmp = _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(tmp128lo),(tmp128hi),1));
|
tmp = _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(tmp128lo), (tmp128hi), 1));
|
||||||
_mm256_storeu_ps((float*)c, tmp);
|
_mm256_storeu_ps((float*)c, tmp);
|
||||||
|
|
||||||
a += 16;
|
a += 16;
|
||||||
@ -155,7 +155,6 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_u_ssse3(lv_8sc_t* cVector, con
|
|||||||
{
|
{
|
||||||
*c++ = lv_conj(*a++);
|
*c++ = lv_conj(*a++);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
#endif /* LV_HAVE_SSSE3 */
|
#endif /* LV_HAVE_SSSE3 */
|
||||||
|
|
||||||
@ -188,7 +187,6 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_u_sse3(lv_8sc_t* cVector, cons
|
|||||||
{
|
{
|
||||||
*c++ = lv_conj(*a++);
|
*c++ = lv_conj(*a++);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
#endif /* LV_HAVE_SSE3 */
|
#endif /* LV_HAVE_SSE3 */
|
||||||
|
|
||||||
@ -201,7 +199,7 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_generic(lv_8sc_t* cVector, con
|
|||||||
const lv_8sc_t* aPtr = aVector;
|
const lv_8sc_t* aPtr = aVector;
|
||||||
unsigned int number;
|
unsigned int number;
|
||||||
|
|
||||||
for(number = 0; number < num_points; number++)
|
for (number = 0; number < num_points; number++)
|
||||||
{
|
{
|
||||||
*cPtr++ = lv_conj(*aPtr++);
|
*cPtr++ = lv_conj(*aPtr++);
|
||||||
}
|
}
|
||||||
@ -230,10 +228,10 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_a_avx(lv_8sc_t* cVector, const
|
|||||||
tmp = _mm256_xor_ps(tmp, conjugator1);
|
tmp = _mm256_xor_ps(tmp, conjugator1);
|
||||||
tmp128lo = _mm256_castsi256_si128(_mm256_castps_si256(tmp));
|
tmp128lo = _mm256_castsi256_si128(_mm256_castps_si256(tmp));
|
||||||
tmp128lo = _mm_add_epi8(tmp128lo, conjugator2);
|
tmp128lo = _mm_add_epi8(tmp128lo, conjugator2);
|
||||||
tmp128hi = _mm256_extractf128_si256(_mm256_castps_si256(tmp),1);
|
tmp128hi = _mm256_extractf128_si256(_mm256_castps_si256(tmp), 1);
|
||||||
tmp128hi = _mm_add_epi8(tmp128hi, conjugator2);
|
tmp128hi = _mm_add_epi8(tmp128hi, conjugator2);
|
||||||
//tmp = _mm256_set_m128i(tmp128hi , tmp128lo); //not defined in some versions of immintrin.h
|
//tmp = _mm256_set_m128i(tmp128hi , tmp128lo); //not defined in some versions of immintrin.h
|
||||||
tmp = _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(tmp128lo),(tmp128hi),1));
|
tmp = _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(tmp128lo), (tmp128hi), 1));
|
||||||
_mm256_store_ps((float*)c, tmp);
|
_mm256_store_ps((float*)c, tmp);
|
||||||
|
|
||||||
a += 16;
|
a += 16;
|
||||||
@ -336,7 +334,6 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_a_sse3(lv_8sc_t* cVector, cons
|
|||||||
{
|
{
|
||||||
*c++ = lv_conj(*a++);
|
*c++ = lv_conj(*a++);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
#endif /* LV_HAVE_SSE3 */
|
#endif /* LV_HAVE_SSE3 */
|
||||||
|
|
||||||
|
@ -78,23 +78,23 @@ static inline void volk_gnsssdr_8ic_magnitude_squared_8i_u_sse3(char* magnitudeV
|
|||||||
maska = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
|
maska = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
|
||||||
maskb = _mm_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
|
maskb = _mm_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
|
||||||
|
|
||||||
for(number = 0; number < sse_iters; number++)
|
for (number = 0; number < sse_iters; number++)
|
||||||
{
|
{
|
||||||
avector = _mm_lddqu_si128((__m128i*)complexVectorPtr);
|
avector = _mm_lddqu_si128((__m128i*)complexVectorPtr);
|
||||||
avectorlo = _mm_unpacklo_epi8 (avector, zero);
|
avectorlo = _mm_unpacklo_epi8(avector, zero);
|
||||||
avectorhi = _mm_unpackhi_epi8 (avector, zero);
|
avectorhi = _mm_unpackhi_epi8(avector, zero);
|
||||||
avectorlomult = _mm_mullo_epi16 (avectorlo, avectorlo);
|
avectorlomult = _mm_mullo_epi16(avectorlo, avectorlo);
|
||||||
avectorhimult = _mm_mullo_epi16 (avectorhi, avectorhi);
|
avectorhimult = _mm_mullo_epi16(avectorhi, avectorhi);
|
||||||
aadded = _mm_hadd_epi16 (avectorlomult, avectorhimult);
|
aadded = _mm_hadd_epi16(avectorlomult, avectorhimult);
|
||||||
|
|
||||||
complexVectorPtr += 16;
|
complexVectorPtr += 16;
|
||||||
|
|
||||||
bvector = _mm_lddqu_si128((__m128i*)complexVectorPtr);
|
bvector = _mm_lddqu_si128((__m128i*)complexVectorPtr);
|
||||||
bvectorlo = _mm_unpacklo_epi8 (bvector, zero);
|
bvectorlo = _mm_unpacklo_epi8(bvector, zero);
|
||||||
bvectorhi = _mm_unpackhi_epi8 (bvector, zero);
|
bvectorhi = _mm_unpackhi_epi8(bvector, zero);
|
||||||
bvectorlomult = _mm_mullo_epi16 (bvectorlo, bvectorlo);
|
bvectorlomult = _mm_mullo_epi16(bvectorlo, bvectorlo);
|
||||||
bvectorhimult = _mm_mullo_epi16 (bvectorhi, bvectorhi);
|
bvectorhimult = _mm_mullo_epi16(bvectorhi, bvectorhi);
|
||||||
badded = _mm_hadd_epi16 (bvectorlomult, bvectorhimult);
|
badded = _mm_hadd_epi16(bvectorlomult, bvectorhimult);
|
||||||
|
|
||||||
complexVectorPtr += 16;
|
complexVectorPtr += 16;
|
||||||
|
|
||||||
@ -162,11 +162,11 @@ static inline void volk_gnsssdr_8ic_magnitude_squared_8i_generic(char* magnitude
|
|||||||
const char* complexVectorPtr = (char*)complexVector;
|
const char* complexVectorPtr = (char*)complexVector;
|
||||||
char* magnitudeVectorPtr = magnitudeVector;
|
char* magnitudeVectorPtr = magnitudeVector;
|
||||||
unsigned int number;
|
unsigned int number;
|
||||||
for(number = 0; number < num_points; number++)
|
for (number = 0; number < num_points; number++)
|
||||||
{
|
{
|
||||||
const char real = *complexVectorPtr++;
|
const char real = *complexVectorPtr++;
|
||||||
const char imag = *complexVectorPtr++;
|
const char imag = *complexVectorPtr++;
|
||||||
*magnitudeVectorPtr++ = (real*real) + (imag*imag);
|
*magnitudeVectorPtr++ = (real * real) + (imag * imag);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif /* LV_HAVE_GENERIC */
|
#endif /* LV_HAVE_GENERIC */
|
||||||
@ -192,23 +192,23 @@ static inline void volk_gnsssdr_8ic_magnitude_squared_8i_a_sse3(char* magnitudeV
|
|||||||
maska = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
|
maska = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
|
||||||
maskb = _mm_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
|
maskb = _mm_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
|
||||||
|
|
||||||
for(number = 0; number < sse_iters; number++)
|
for (number = 0; number < sse_iters; number++)
|
||||||
{
|
{
|
||||||
avector = _mm_load_si128((__m128i*)complexVectorPtr);
|
avector = _mm_load_si128((__m128i*)complexVectorPtr);
|
||||||
avectorlo = _mm_unpacklo_epi8 (avector, zero);
|
avectorlo = _mm_unpacklo_epi8(avector, zero);
|
||||||
avectorhi = _mm_unpackhi_epi8 (avector, zero);
|
avectorhi = _mm_unpackhi_epi8(avector, zero);
|
||||||
avectorlomult = _mm_mullo_epi16 (avectorlo, avectorlo);
|
avectorlomult = _mm_mullo_epi16(avectorlo, avectorlo);
|
||||||
avectorhimult = _mm_mullo_epi16 (avectorhi, avectorhi);
|
avectorhimult = _mm_mullo_epi16(avectorhi, avectorhi);
|
||||||
aadded = _mm_hadd_epi16 (avectorlomult, avectorhimult);
|
aadded = _mm_hadd_epi16(avectorlomult, avectorhimult);
|
||||||
|
|
||||||
complexVectorPtr += 16;
|
complexVectorPtr += 16;
|
||||||
|
|
||||||
bvector = _mm_load_si128((__m128i*)complexVectorPtr);
|
bvector = _mm_load_si128((__m128i*)complexVectorPtr);
|
||||||
bvectorlo = _mm_unpacklo_epi8 (bvector, zero);
|
bvectorlo = _mm_unpacklo_epi8(bvector, zero);
|
||||||
bvectorhi = _mm_unpackhi_epi8 (bvector, zero);
|
bvectorhi = _mm_unpackhi_epi8(bvector, zero);
|
||||||
bvectorlomult = _mm_mullo_epi16 (bvectorlo, bvectorlo);
|
bvectorlomult = _mm_mullo_epi16(bvectorlo, bvectorlo);
|
||||||
bvectorhimult = _mm_mullo_epi16 (bvectorhi, bvectorhi);
|
bvectorhimult = _mm_mullo_epi16(bvectorhi, bvectorhi);
|
||||||
badded = _mm_hadd_epi16 (bvectorlomult, bvectorhimult);
|
badded = _mm_hadd_epi16(bvectorlomult, bvectorhimult);
|
||||||
|
|
||||||
complexVectorPtr += 16;
|
complexVectorPtr += 16;
|
||||||
|
|
||||||
|
@ -80,7 +80,7 @@ static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_u_sse3(lv_8sc_t* cVector,
|
|||||||
imagy = _mm_and_si128(imagy, mult1);
|
imagy = _mm_and_si128(imagy, mult1);
|
||||||
realy = _mm_and_si128(y, mult1);
|
realy = _mm_and_si128(y, mult1);
|
||||||
|
|
||||||
for(; number < sse_iters; number++)
|
for (; number < sse_iters; number++)
|
||||||
{
|
{
|
||||||
x = _mm_lddqu_si128((__m128i*)a);
|
x = _mm_lddqu_si128((__m128i*)a);
|
||||||
|
|
||||||
@ -111,7 +111,6 @@ static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_u_sse3(lv_8sc_t* cVector,
|
|||||||
{
|
{
|
||||||
*c++ = (*a++) * scalar;
|
*c++ = (*a++) * scalar;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
#endif /* LV_HAVE_SSE3 */
|
#endif /* LV_HAVE_SSE3 */
|
||||||
|
|
||||||
@ -173,7 +172,7 @@ static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_a_sse3(lv_8sc_t* cVector,
|
|||||||
imagy = _mm_and_si128(imagy, mult1);
|
imagy = _mm_and_si128(imagy, mult1);
|
||||||
realy = _mm_and_si128(y, mult1);
|
realy = _mm_and_si128(y, mult1);
|
||||||
|
|
||||||
for(; number < sse_iters; number++)
|
for (; number < sse_iters; number++)
|
||||||
{
|
{
|
||||||
x = _mm_load_si128((__m128i*)a);
|
x = _mm_load_si128((__m128i*)a);
|
||||||
|
|
||||||
@ -204,7 +203,6 @@ static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_a_sse3(lv_8sc_t* cVector,
|
|||||||
{
|
{
|
||||||
*c++ = (*a++) * scalar;
|
*c++ = (*a++) * scalar;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
#endif /* LV_HAVE_SSE3 */
|
#endif /* LV_HAVE_SSE3 */
|
||||||
|
|
||||||
|
@ -75,17 +75,17 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_generic(lv_8sc_t* result, co
|
|||||||
*cPtr += (*aPtr++) * (*bPtr++);
|
*cPtr += (*aPtr++) * (*bPtr++);
|
||||||
}*/
|
}*/
|
||||||
|
|
||||||
char * res = (char*) result;
|
char* res = (char*)result;
|
||||||
char * in = (char*) in_a;
|
char* in = (char*)in_a;
|
||||||
char * tp = (char*) in_b;
|
char* tp = (char*)in_b;
|
||||||
unsigned int n_2_ccomplex_blocks = num_points/2;
|
unsigned int n_2_ccomplex_blocks = num_points / 2;
|
||||||
unsigned int isodd = num_points & 1;
|
unsigned int isodd = num_points & 1;
|
||||||
|
|
||||||
char sum0[2] = {0,0};
|
char sum0[2] = {0, 0};
|
||||||
char sum1[2] = {0,0};
|
char sum1[2] = {0, 0};
|
||||||
unsigned int i = 0;
|
unsigned int i = 0;
|
||||||
|
|
||||||
for(i = 0; i < n_2_ccomplex_blocks; ++i)
|
for (i = 0; i < n_2_ccomplex_blocks; ++i)
|
||||||
{
|
{
|
||||||
sum0[0] += in[0] * tp[0] - in[1] * tp[1];
|
sum0[0] += in[0] * tp[0] - in[1] * tp[1];
|
||||||
sum0[1] += in[0] * tp[1] + in[1] * tp[0];
|
sum0[1] += in[0] * tp[1] + in[1] * tp[0];
|
||||||
@ -100,7 +100,7 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_generic(lv_8sc_t* result, co
|
|||||||
res[1] = sum0[1] + sum1[1];
|
res[1] = sum0[1] + sum1[1];
|
||||||
|
|
||||||
// Cleanup if we had an odd number of points
|
// Cleanup if we had an odd number of points
|
||||||
for(i = 0; i < isodd; ++i)
|
for (i = 0; i < isodd; ++i)
|
||||||
{
|
{
|
||||||
*result += in_a[num_points - 1] * in_b[num_points - 1];
|
*result += in_a[num_points - 1] * in_b[num_points - 1];
|
||||||
}
|
}
|
||||||
@ -115,13 +115,13 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_generic(lv_8sc_t* result, co
|
|||||||
static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse2(lv_8sc_t* result, const lv_8sc_t* in_a, const lv_8sc_t* in_b, unsigned int num_points)
|
static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse2(lv_8sc_t* result, const lv_8sc_t* in_a, const lv_8sc_t* in_b, unsigned int num_points)
|
||||||
{
|
{
|
||||||
lv_8sc_t dotProduct;
|
lv_8sc_t dotProduct;
|
||||||
memset(&dotProduct, 0x0, 2*sizeof(char));
|
memset(&dotProduct, 0x0, 2 * sizeof(char));
|
||||||
unsigned int number;
|
unsigned int number;
|
||||||
unsigned int i;
|
unsigned int i;
|
||||||
const lv_8sc_t* a = in_a;
|
const lv_8sc_t* a = in_a;
|
||||||
const lv_8sc_t* b = in_b;
|
const lv_8sc_t* b = in_b;
|
||||||
|
|
||||||
const unsigned int sse_iters = num_points/8;
|
const unsigned int sse_iters = num_points / 8;
|
||||||
|
|
||||||
if (sse_iters > 0)
|
if (sse_iters > 0)
|
||||||
{
|
{
|
||||||
@ -131,7 +131,7 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse2(lv_8sc_t* result, con
|
|||||||
realcacc = _mm_setzero_si128();
|
realcacc = _mm_setzero_si128();
|
||||||
imagcacc = _mm_setzero_si128();
|
imagcacc = _mm_setzero_si128();
|
||||||
|
|
||||||
for(number = 0; number < sse_iters; number++)
|
for (number = 0; number < sse_iters; number++)
|
||||||
{
|
{
|
||||||
x = _mm_loadu_si128((__m128i*)a);
|
x = _mm_loadu_si128((__m128i*)a);
|
||||||
y = _mm_loadu_si128((__m128i*)b);
|
y = _mm_loadu_si128((__m128i*)b);
|
||||||
@ -165,9 +165,10 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse2(lv_8sc_t* result, con
|
|||||||
|
|
||||||
totalc = _mm_or_si128(realcacc, imagcacc);
|
totalc = _mm_or_si128(realcacc, imagcacc);
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(16) lv_8sc_t dotProductVector[8];
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
lv_8sc_t dotProductVector[8];
|
||||||
|
|
||||||
_mm_storeu_si128((__m128i*)dotProductVector, totalc); // Store the results back into the dot product vector
|
_mm_storeu_si128((__m128i*)dotProductVector, totalc); // Store the results back into the dot product vector
|
||||||
|
|
||||||
for (i = 0; i < 8; ++i)
|
for (i = 0; i < 8; ++i)
|
||||||
{
|
{
|
||||||
@ -192,13 +193,13 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse2(lv_8sc_t* result, con
|
|||||||
static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse4_1(lv_8sc_t* result, const lv_8sc_t* in_a, const lv_8sc_t* in_b, unsigned int num_points)
|
static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse4_1(lv_8sc_t* result, const lv_8sc_t* in_a, const lv_8sc_t* in_b, unsigned int num_points)
|
||||||
{
|
{
|
||||||
lv_8sc_t dotProduct;
|
lv_8sc_t dotProduct;
|
||||||
memset(&dotProduct, 0x0, 2*sizeof(char));
|
memset(&dotProduct, 0x0, 2 * sizeof(char));
|
||||||
unsigned int number;
|
unsigned int number;
|
||||||
unsigned int i;
|
unsigned int i;
|
||||||
const lv_8sc_t* a = in_a;
|
const lv_8sc_t* a = in_a;
|
||||||
const lv_8sc_t* b = in_b;
|
const lv_8sc_t* b = in_b;
|
||||||
|
|
||||||
const unsigned int sse_iters = num_points/8;
|
const unsigned int sse_iters = num_points / 8;
|
||||||
|
|
||||||
if (sse_iters > 0)
|
if (sse_iters > 0)
|
||||||
{
|
{
|
||||||
@ -208,7 +209,7 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse4_1(lv_8sc_t* result, c
|
|||||||
realcacc = _mm_setzero_si128();
|
realcacc = _mm_setzero_si128();
|
||||||
imagcacc = _mm_setzero_si128();
|
imagcacc = _mm_setzero_si128();
|
||||||
|
|
||||||
for(number = 0; number < sse_iters; number++)
|
for (number = 0; number < sse_iters; number++)
|
||||||
{
|
{
|
||||||
x = _mm_lddqu_si128((__m128i*)a);
|
x = _mm_lddqu_si128((__m128i*)a);
|
||||||
y = _mm_lddqu_si128((__m128i*)b);
|
y = _mm_lddqu_si128((__m128i*)b);
|
||||||
@ -236,13 +237,14 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse4_1(lv_8sc_t* result, c
|
|||||||
b += 8;
|
b += 8;
|
||||||
}
|
}
|
||||||
|
|
||||||
imagcacc = _mm_slli_si128 (imagcacc, 1);
|
imagcacc = _mm_slli_si128(imagcacc, 1);
|
||||||
|
|
||||||
totalc = _mm_blendv_epi8 (imagcacc, realcacc, mult1);
|
totalc = _mm_blendv_epi8(imagcacc, realcacc, mult1);
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(16) lv_8sc_t dotProductVector[8];
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
lv_8sc_t dotProductVector[8];
|
||||||
|
|
||||||
_mm_storeu_si128((__m128i*)dotProductVector, totalc); // Store the results back into the dot product vector
|
_mm_storeu_si128((__m128i*)dotProductVector, totalc); // Store the results back into the dot product vector
|
||||||
|
|
||||||
for (i = 0; i < 8; ++i)
|
for (i = 0; i < 8; ++i)
|
||||||
{
|
{
|
||||||
@ -267,13 +269,13 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse4_1(lv_8sc_t* result, c
|
|||||||
static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_sse2(lv_8sc_t* result, const lv_8sc_t* in_a, const lv_8sc_t* in_b, unsigned int num_points)
|
static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_sse2(lv_8sc_t* result, const lv_8sc_t* in_a, const lv_8sc_t* in_b, unsigned int num_points)
|
||||||
{
|
{
|
||||||
lv_8sc_t dotProduct;
|
lv_8sc_t dotProduct;
|
||||||
memset(&dotProduct, 0x0, 2*sizeof(char));
|
memset(&dotProduct, 0x0, 2 * sizeof(char));
|
||||||
unsigned int number;
|
unsigned int number;
|
||||||
unsigned int i;
|
unsigned int i;
|
||||||
const lv_8sc_t* a = in_a;
|
const lv_8sc_t* a = in_a;
|
||||||
const lv_8sc_t* b = in_b;
|
const lv_8sc_t* b = in_b;
|
||||||
|
|
||||||
const unsigned int sse_iters = num_points/8;
|
const unsigned int sse_iters = num_points / 8;
|
||||||
|
|
||||||
if (sse_iters > 0)
|
if (sse_iters > 0)
|
||||||
{
|
{
|
||||||
@ -283,7 +285,7 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_sse2(lv_8sc_t* result, con
|
|||||||
realcacc = _mm_setzero_si128();
|
realcacc = _mm_setzero_si128();
|
||||||
imagcacc = _mm_setzero_si128();
|
imagcacc = _mm_setzero_si128();
|
||||||
|
|
||||||
for(number = 0; number < sse_iters; number++)
|
for (number = 0; number < sse_iters; number++)
|
||||||
{
|
{
|
||||||
x = _mm_load_si128((__m128i*)a);
|
x = _mm_load_si128((__m128i*)a);
|
||||||
y = _mm_load_si128((__m128i*)b);
|
y = _mm_load_si128((__m128i*)b);
|
||||||
@ -317,9 +319,10 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_sse2(lv_8sc_t* result, con
|
|||||||
|
|
||||||
totalc = _mm_or_si128(realcacc, imagcacc);
|
totalc = _mm_or_si128(realcacc, imagcacc);
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(16) lv_8sc_t dotProductVector[8];
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
lv_8sc_t dotProductVector[8];
|
||||||
|
|
||||||
_mm_store_si128((__m128i*)dotProductVector, totalc); // Store the results back into the dot product vector
|
_mm_store_si128((__m128i*)dotProductVector, totalc); // Store the results back into the dot product vector
|
||||||
|
|
||||||
for (i = 0; i < 8; ++i)
|
for (i = 0; i < 8; ++i)
|
||||||
{
|
{
|
||||||
@ -343,7 +346,7 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_sse2(lv_8sc_t* result, con
|
|||||||
static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_sse4_1(lv_8sc_t* result, const lv_8sc_t* in_a, const lv_8sc_t* in_b, unsigned int num_points)
|
static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_sse4_1(lv_8sc_t* result, const lv_8sc_t* in_a, const lv_8sc_t* in_b, unsigned int num_points)
|
||||||
{
|
{
|
||||||
lv_8sc_t dotProduct;
|
lv_8sc_t dotProduct;
|
||||||
memset(&dotProduct, 0x0, 2*sizeof(char));
|
memset(&dotProduct, 0x0, 2 * sizeof(char));
|
||||||
unsigned int number;
|
unsigned int number;
|
||||||
unsigned int i;
|
unsigned int i;
|
||||||
const lv_8sc_t* a = in_a;
|
const lv_8sc_t* a = in_a;
|
||||||
@ -359,7 +362,7 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_sse4_1(lv_8sc_t* result, c
|
|||||||
realcacc = _mm_setzero_si128();
|
realcacc = _mm_setzero_si128();
|
||||||
imagcacc = _mm_setzero_si128();
|
imagcacc = _mm_setzero_si128();
|
||||||
|
|
||||||
for(number = 0; number < sse_iters; number++)
|
for (number = 0; number < sse_iters; number++)
|
||||||
{
|
{
|
||||||
x = _mm_load_si128((__m128i*)a);
|
x = _mm_load_si128((__m128i*)a);
|
||||||
y = _mm_load_si128((__m128i*)b);
|
y = _mm_load_si128((__m128i*)b);
|
||||||
@ -387,13 +390,14 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_sse4_1(lv_8sc_t* result, c
|
|||||||
b += 8;
|
b += 8;
|
||||||
}
|
}
|
||||||
|
|
||||||
imagcacc = _mm_slli_si128 (imagcacc, 1);
|
imagcacc = _mm_slli_si128(imagcacc, 1);
|
||||||
|
|
||||||
totalc = _mm_blendv_epi8 (imagcacc, realcacc, mult1);
|
totalc = _mm_blendv_epi8(imagcacc, realcacc, mult1);
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(16) lv_8sc_t dotProductVector[8];
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
lv_8sc_t dotProductVector[8];
|
||||||
|
|
||||||
_mm_store_si128((__m128i*)dotProductVector, totalc); // Store the results back into the dot product vector
|
_mm_store_si128((__m128i*)dotProductVector, totalc); // Store the results back into the dot product vector
|
||||||
|
|
||||||
for (i = 0; i < 8; ++i)
|
for (i = 0; i < 8; ++i)
|
||||||
{
|
{
|
||||||
@ -438,22 +442,23 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_orc(lv_8sc_t* result, cons
|
|||||||
static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_neon(lv_8sc_t* result, const lv_8sc_t* in_a, const lv_8sc_t* in_b, unsigned int num_points)
|
static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_neon(lv_8sc_t* result, const lv_8sc_t* in_a, const lv_8sc_t* in_b, unsigned int num_points)
|
||||||
{
|
{
|
||||||
lv_8sc_t dotProduct;
|
lv_8sc_t dotProduct;
|
||||||
dotProduct = lv_cmake(0,0);
|
dotProduct = lv_cmake(0, 0);
|
||||||
*result = lv_cmake(0,0);
|
*result = lv_cmake(0, 0);
|
||||||
|
|
||||||
const lv_8sc_t* a = in_a;
|
const lv_8sc_t* a = in_a;
|
||||||
const lv_8sc_t* b = in_b;
|
const lv_8sc_t* b = in_b;
|
||||||
// for 2-lane vectors, 1st lane holds the real part,
|
// for 2-lane vectors, 1st lane holds the real part,
|
||||||
// 2nd lane holds the imaginary part
|
// 2nd lane holds the imaginary part
|
||||||
int8x8x2_t a_val, b_val, c_val, accumulator, tmp_real, tmp_imag;
|
int8x8x2_t a_val, b_val, c_val, accumulator, tmp_real, tmp_imag;
|
||||||
__VOLK_ATTR_ALIGNED(16) lv_8sc_t accum_result[8] = { lv_cmake(0,0) };
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
lv_8sc_t accum_result[8] = {lv_cmake(0, 0)};
|
||||||
accumulator.val[0] = vdup_n_s8(0);
|
accumulator.val[0] = vdup_n_s8(0);
|
||||||
accumulator.val[1] = vdup_n_s8(0);
|
accumulator.val[1] = vdup_n_s8(0);
|
||||||
unsigned int number;
|
unsigned int number;
|
||||||
|
|
||||||
const unsigned int neon_iters = num_points / 8;
|
const unsigned int neon_iters = num_points / 8;
|
||||||
|
|
||||||
for(number = 0; number < neon_iters; ++number)
|
for (number = 0; number < neon_iters; ++number)
|
||||||
{
|
{
|
||||||
a_val = vld2_s8((const int8_t*)a);
|
a_val = vld2_s8((const int8_t*)a);
|
||||||
b_val = vld2_s8((const int8_t*)b);
|
b_val = vld2_s8((const int8_t*)b);
|
||||||
@ -478,7 +483,7 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_neon(lv_8sc_t* result, const
|
|||||||
b += 8;
|
b += 8;
|
||||||
}
|
}
|
||||||
vst2_s8((int8_t*)accum_result, accumulator);
|
vst2_s8((int8_t*)accum_result, accumulator);
|
||||||
for(number = 0; number < 8; ++number)
|
for (number = 0; number < 8; ++number)
|
||||||
{
|
{
|
||||||
*result += accum_result[number];
|
*result += accum_result[number];
|
||||||
}
|
}
|
||||||
@ -490,6 +495,6 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_neon(lv_8sc_t* result, const
|
|||||||
|
|
||||||
*result += dotProduct;
|
*result += dotProduct;
|
||||||
}
|
}
|
||||||
#endif /* LV_HAVE_NEON */
|
#endif /* LV_HAVE_NEON */
|
||||||
|
|
||||||
#endif /*INCLUDED_volk_gnsssdr_8ic_x2_dot_prod_8ic_H*/
|
#endif /*INCLUDED_volk_gnsssdr_8ic_x2_dot_prod_8ic_H*/
|
||||||
|
@ -75,7 +75,7 @@ static inline void volk_gnsssdr_8ic_x2_multiply_8ic_u_sse2(lv_8sc_t* cVector, co
|
|||||||
|
|
||||||
mult1 = _mm_set_epi8(0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF);
|
mult1 = _mm_set_epi8(0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF);
|
||||||
|
|
||||||
for(number = 0; number < sse_iters; number++)
|
for (number = 0; number < sse_iters; number++)
|
||||||
{
|
{
|
||||||
x = _mm_loadu_si128((__m128i*)a);
|
x = _mm_loadu_si128((__m128i*)a);
|
||||||
y = _mm_loadu_si128((__m128i*)b);
|
y = _mm_loadu_si128((__m128i*)b);
|
||||||
@ -133,7 +133,7 @@ static inline void volk_gnsssdr_8ic_x2_multiply_8ic_u_sse4_1(lv_8sc_t* cVector,
|
|||||||
_mm_setzero_si128();
|
_mm_setzero_si128();
|
||||||
mult1 = _mm_set_epi8(0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF);
|
mult1 = _mm_set_epi8(0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF);
|
||||||
|
|
||||||
for(number = 0; number < sse_iters; number++)
|
for (number = 0; number < sse_iters; number++)
|
||||||
{
|
{
|
||||||
x = _mm_lddqu_si128((__m128i*)a);
|
x = _mm_lddqu_si128((__m128i*)a);
|
||||||
y = _mm_lddqu_si128((__m128i*)b);
|
y = _mm_lddqu_si128((__m128i*)b);
|
||||||
@ -181,7 +181,7 @@ static inline void volk_gnsssdr_8ic_x2_multiply_8ic_generic(lv_8sc_t* cVector, c
|
|||||||
const lv_8sc_t* bPtr = bVector;
|
const lv_8sc_t* bPtr = bVector;
|
||||||
unsigned int number;
|
unsigned int number;
|
||||||
|
|
||||||
for(number = 0; number < num_points; number++)
|
for (number = 0; number < num_points; number++)
|
||||||
{
|
{
|
||||||
*cPtr++ = (*aPtr++) * (*bPtr++);
|
*cPtr++ = (*aPtr++) * (*bPtr++);
|
||||||
}
|
}
|
||||||
@ -204,7 +204,7 @@ static inline void volk_gnsssdr_8ic_x2_multiply_8ic_a_sse2(lv_8sc_t* cVector, co
|
|||||||
|
|
||||||
mult1 = _mm_set_epi8(0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF);
|
mult1 = _mm_set_epi8(0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF);
|
||||||
|
|
||||||
for(number = 0; number < sse_iters; number++)
|
for (number = 0; number < sse_iters; number++)
|
||||||
{
|
{
|
||||||
x = _mm_load_si128((__m128i*)a);
|
x = _mm_load_si128((__m128i*)a);
|
||||||
y = _mm_load_si128((__m128i*)b);
|
y = _mm_load_si128((__m128i*)b);
|
||||||
@ -228,7 +228,7 @@ static inline void volk_gnsssdr_8ic_x2_multiply_8ic_a_sse2(lv_8sc_t* cVector, co
|
|||||||
imagc = _mm_and_si128(imagc, mult1);
|
imagc = _mm_and_si128(imagc, mult1);
|
||||||
imagc = _mm_slli_si128(imagc, 1);
|
imagc = _mm_slli_si128(imagc, 1);
|
||||||
|
|
||||||
totalc = _mm_or_si128 (realc, imagc);
|
totalc = _mm_or_si128(realc, imagc);
|
||||||
|
|
||||||
_mm_store_si128((__m128i*)c, totalc);
|
_mm_store_si128((__m128i*)c, totalc);
|
||||||
|
|
||||||
@ -262,7 +262,7 @@ static inline void volk_gnsssdr_8ic_x2_multiply_8ic_a_sse4_1(lv_8sc_t* cVector,
|
|||||||
_mm_setzero_si128();
|
_mm_setzero_si128();
|
||||||
mult1 = _mm_set_epi8(0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF);
|
mult1 = _mm_set_epi8(0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF);
|
||||||
|
|
||||||
for(number = 0; number < sse_iters; number++)
|
for (number = 0; number < sse_iters; number++)
|
||||||
{
|
{
|
||||||
x = _mm_load_si128((__m128i*)a);
|
x = _mm_load_si128((__m128i*)a);
|
||||||
y = _mm_load_si128((__m128i*)b);
|
y = _mm_load_si128((__m128i*)b);
|
||||||
|
@ -72,7 +72,7 @@ static inline void volk_gnsssdr_8u_x2_multiply_8u_u_avx2(unsigned char* cChar, c
|
|||||||
const unsigned char* a = aChar;
|
const unsigned char* a = aChar;
|
||||||
const unsigned char* b = bChar;
|
const unsigned char* b = bChar;
|
||||||
|
|
||||||
for(number = 0; number < avx2_iters; number++)
|
for (number = 0; number < avx2_iters; number++)
|
||||||
{
|
{
|
||||||
x = _mm256_loadu_si256((__m256i*)a);
|
x = _mm256_loadu_si256((__m256i*)a);
|
||||||
y = _mm256_loadu_si256((__m256i*)b);
|
y = _mm256_loadu_si256((__m256i*)b);
|
||||||
@ -101,7 +101,7 @@ static inline void volk_gnsssdr_8u_x2_multiply_8u_u_avx2(unsigned char* cChar, c
|
|||||||
c += 32;
|
c += 32;
|
||||||
}
|
}
|
||||||
|
|
||||||
for (i = avx2_iters * 32; i < num_points ; ++i)
|
for (i = avx2_iters * 32; i < num_points; ++i)
|
||||||
{
|
{
|
||||||
*c++ = (*a++) * (*b++);
|
*c++ = (*a++) * (*b++);
|
||||||
}
|
}
|
||||||
@ -123,7 +123,7 @@ static inline void volk_gnsssdr_8u_x2_multiply_8u_u_sse3(unsigned char* cChar, c
|
|||||||
const unsigned char* a = aChar;
|
const unsigned char* a = aChar;
|
||||||
const unsigned char* b = bChar;
|
const unsigned char* b = bChar;
|
||||||
|
|
||||||
for(number = 0; number < sse_iters; number++)
|
for (number = 0; number < sse_iters; number++)
|
||||||
{
|
{
|
||||||
x = _mm_lddqu_si128((__m128i*)a);
|
x = _mm_lddqu_si128((__m128i*)a);
|
||||||
y = _mm_lddqu_si128((__m128i*)b);
|
y = _mm_lddqu_si128((__m128i*)b);
|
||||||
@ -152,7 +152,7 @@ static inline void volk_gnsssdr_8u_x2_multiply_8u_u_sse3(unsigned char* cChar, c
|
|||||||
c += 16;
|
c += 16;
|
||||||
}
|
}
|
||||||
|
|
||||||
for (i = sse_iters * 16; i < num_points ; ++i)
|
for (i = sse_iters * 16; i < num_points; ++i)
|
||||||
{
|
{
|
||||||
*c++ = (*a++) * (*b++);
|
*c++ = (*a++) * (*b++);
|
||||||
}
|
}
|
||||||
@ -168,7 +168,7 @@ static inline void volk_gnsssdr_8u_x2_multiply_8u_generic(unsigned char* cChar,
|
|||||||
const unsigned char* bPtr = bChar;
|
const unsigned char* bPtr = bChar;
|
||||||
unsigned int number;
|
unsigned int number;
|
||||||
|
|
||||||
for(number = 0; number < num_points; number++)
|
for (number = 0; number < num_points; number++)
|
||||||
{
|
{
|
||||||
*cPtr++ = (*aPtr++) * (*bPtr++);
|
*cPtr++ = (*aPtr++) * (*bPtr++);
|
||||||
}
|
}
|
||||||
@ -189,7 +189,7 @@ static inline void volk_gnsssdr_8u_x2_multiply_8u_a_sse3(unsigned char* cChar, c
|
|||||||
const unsigned char* a = aChar;
|
const unsigned char* a = aChar;
|
||||||
const unsigned char* b = bChar;
|
const unsigned char* b = bChar;
|
||||||
|
|
||||||
for(number = 0; number < sse_iters; number++)
|
for (number = 0; number < sse_iters; number++)
|
||||||
{
|
{
|
||||||
x = _mm_load_si128((__m128i*)a);
|
x = _mm_load_si128((__m128i*)a);
|
||||||
y = _mm_load_si128((__m128i*)b);
|
y = _mm_load_si128((__m128i*)b);
|
||||||
@ -240,7 +240,7 @@ static inline void volk_gnsssdr_8u_x2_multiply_8u_a_avx2(unsigned char* cChar, c
|
|||||||
const unsigned char* a = aChar;
|
const unsigned char* a = aChar;
|
||||||
const unsigned char* b = bChar;
|
const unsigned char* b = bChar;
|
||||||
|
|
||||||
for(number = 0; number < avx2_iters; number++)
|
for (number = 0; number < avx2_iters; number++)
|
||||||
{
|
{
|
||||||
x = _mm256_load_si256((__m256i*)a);
|
x = _mm256_load_si256((__m256i*)a);
|
||||||
y = _mm256_load_si256((__m256i*)b);
|
y = _mm256_load_si256((__m256i*)b);
|
||||||
@ -269,7 +269,7 @@ static inline void volk_gnsssdr_8u_x2_multiply_8u_a_avx2(unsigned char* cChar, c
|
|||||||
c += 32;
|
c += 32;
|
||||||
}
|
}
|
||||||
|
|
||||||
for (i = avx2_iters * 32; i < num_points ; ++i)
|
for (i = avx2_iters * 32; i < num_points; ++i)
|
||||||
{
|
{
|
||||||
*c++ = (*a++) * (*b++);
|
*c++ = (*a++) * (*b++);
|
||||||
}
|
}
|
||||||
|
@ -71,9 +71,9 @@
|
|||||||
#include <emmintrin.h>
|
#include <emmintrin.h>
|
||||||
/* Adapted from http://gruntthepeon.free.fr/ssemath/sse_mathfun.h, original code from Julien Pommier */
|
/* Adapted from http://gruntthepeon.free.fr/ssemath/sse_mathfun.h, original code from Julien Pommier */
|
||||||
/* Based on algorithms from the cephes library http://www.netlib.org/cephes/ */
|
/* Based on algorithms from the cephes library http://www.netlib.org/cephes/ */
|
||||||
static inline void volk_gnsssdr_s32f_sincos_32fc_a_sse2(lv_32fc_t* out, const float phase_inc, float* phase, unsigned int num_points)
|
static inline void volk_gnsssdr_s32f_sincos_32fc_a_sse2(lv_32fc_t *out, const float phase_inc, float *phase, unsigned int num_points)
|
||||||
{
|
{
|
||||||
lv_32fc_t* bPtr = out;
|
lv_32fc_t *bPtr = out;
|
||||||
|
|
||||||
const unsigned int sse_iters = num_points / 4;
|
const unsigned int sse_iters = num_points / 4;
|
||||||
unsigned int number = 0;
|
unsigned int number = 0;
|
||||||
@ -84,44 +84,44 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_a_sse2(lv_32fc_t* out, const fl
|
|||||||
__m128i emm0, emm2, emm4;
|
__m128i emm0, emm2, emm4;
|
||||||
|
|
||||||
/* declare some SSE constants */
|
/* declare some SSE constants */
|
||||||
static const int _ps_inv_sign_mask[4] = { ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000 };
|
static const int _ps_inv_sign_mask[4] = {~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000};
|
||||||
static const int _ps_sign_mask[4] = { (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000 };
|
static const int _ps_sign_mask[4] = {(int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000};
|
||||||
|
|
||||||
static const float _ps_cephes_FOPI[4] = { 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516 };
|
static const float _ps_cephes_FOPI[4] = {1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516};
|
||||||
static const int _pi32_1[4] = { 1, 1, 1, 1 };
|
static const int _pi32_1[4] = {1, 1, 1, 1};
|
||||||
static const int _pi32_inv1[4] = { ~1, ~1, ~1, ~1 };
|
static const int _pi32_inv1[4] = {~1, ~1, ~1, ~1};
|
||||||
static const int _pi32_2[4] = { 2, 2, 2, 2};
|
static const int _pi32_2[4] = {2, 2, 2, 2};
|
||||||
static const int _pi32_4[4] = { 4, 4, 4, 4};
|
static const int _pi32_4[4] = {4, 4, 4, 4};
|
||||||
|
|
||||||
static const float _ps_minus_cephes_DP1[4] = { -0.78515625, -0.78515625, -0.78515625, -0.78515625 };
|
static const float _ps_minus_cephes_DP1[4] = {-0.78515625, -0.78515625, -0.78515625, -0.78515625};
|
||||||
static const float _ps_minus_cephes_DP2[4] = { -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4 };
|
static const float _ps_minus_cephes_DP2[4] = {-2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4};
|
||||||
static const float _ps_minus_cephes_DP3[4] = { -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8 };
|
static const float _ps_minus_cephes_DP3[4] = {-3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8};
|
||||||
static const float _ps_coscof_p0[4] = { 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005 };
|
static const float _ps_coscof_p0[4] = {2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005};
|
||||||
static const float _ps_coscof_p1[4] = { -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003 };
|
static const float _ps_coscof_p1[4] = {-1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003};
|
||||||
static const float _ps_coscof_p2[4] = { 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002 };
|
static const float _ps_coscof_p2[4] = {4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002};
|
||||||
static const float _ps_sincof_p0[4] = { -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4 };
|
static const float _ps_sincof_p0[4] = {-1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4};
|
||||||
static const float _ps_sincof_p1[4] = { 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3 };
|
static const float _ps_sincof_p1[4] = {8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3};
|
||||||
static const float _ps_sincof_p2[4] = { -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1 };
|
static const float _ps_sincof_p2[4] = {-1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1};
|
||||||
static const float _ps_0p5[4] = { 0.5f, 0.5f, 0.5f, 0.5f };
|
static const float _ps_0p5[4] = {0.5f, 0.5f, 0.5f, 0.5f};
|
||||||
static const float _ps_1[4] = { 1.0f, 1.0f, 1.0f, 1.0f };
|
static const float _ps_1[4] = {1.0f, 1.0f, 1.0f, 1.0f};
|
||||||
|
|
||||||
float four_phases[4] = { _phase, _phase + phase_inc, _phase + 2 * phase_inc, _phase + 3 * phase_inc };
|
float four_phases[4] = {_phase, _phase + phase_inc, _phase + 2 * phase_inc, _phase + 3 * phase_inc};
|
||||||
float four_phases_inc[4] = { 4 * phase_inc, 4 * phase_inc, 4 * phase_inc, 4 * phase_inc };
|
float four_phases_inc[4] = {4 * phase_inc, 4 * phase_inc, 4 * phase_inc, 4 * phase_inc};
|
||||||
four_phases_reg = _mm_load_ps(four_phases);
|
four_phases_reg = _mm_load_ps(four_phases);
|
||||||
const __m128 four_phases_inc_reg = _mm_load_ps(four_phases_inc);
|
const __m128 four_phases_inc_reg = _mm_load_ps(four_phases_inc);
|
||||||
|
|
||||||
for(;number < sse_iters; number++)
|
for (; number < sse_iters; number++)
|
||||||
{
|
{
|
||||||
x = four_phases_reg;
|
x = four_phases_reg;
|
||||||
|
|
||||||
sign_bit_sin = x;
|
sign_bit_sin = x;
|
||||||
/* take the absolute value */
|
/* take the absolute value */
|
||||||
x = _mm_and_ps(x, *(__m128*)_ps_inv_sign_mask);
|
x = _mm_and_ps(x, *(__m128 *)_ps_inv_sign_mask);
|
||||||
/* extract the sign bit (upper one) */
|
/* extract the sign bit (upper one) */
|
||||||
sign_bit_sin = _mm_and_ps(sign_bit_sin, *(__m128*)_ps_sign_mask);
|
sign_bit_sin = _mm_and_ps(sign_bit_sin, *(__m128 *)_ps_sign_mask);
|
||||||
|
|
||||||
/* scale by 4/Pi */
|
/* scale by 4/Pi */
|
||||||
y = _mm_mul_ps(x, *(__m128*)_ps_cephes_FOPI);
|
y = _mm_mul_ps(x, *(__m128 *)_ps_cephes_FOPI);
|
||||||
|
|
||||||
/* store the integer part of y in emm2 */
|
/* store the integer part of y in emm2 */
|
||||||
emm2 = _mm_cvttps_epi32(y);
|
emm2 = _mm_cvttps_epi32(y);
|
||||||
@ -145,9 +145,9 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_a_sse2(lv_32fc_t* out, const fl
|
|||||||
|
|
||||||
/* The magic pass: "Extended precision modular arithmetic"
|
/* The magic pass: "Extended precision modular arithmetic"
|
||||||
x = ((x - y * DP1) - y * DP2) - y * DP3; */
|
x = ((x - y * DP1) - y * DP2) - y * DP3; */
|
||||||
xmm1 = *(__m128*)_ps_minus_cephes_DP1;
|
xmm1 = *(__m128 *)_ps_minus_cephes_DP1;
|
||||||
xmm2 = *(__m128*)_ps_minus_cephes_DP2;
|
xmm2 = *(__m128 *)_ps_minus_cephes_DP2;
|
||||||
xmm3 = *(__m128*)_ps_minus_cephes_DP3;
|
xmm3 = *(__m128 *)_ps_minus_cephes_DP3;
|
||||||
xmm1 = _mm_mul_ps(y, xmm1);
|
xmm1 = _mm_mul_ps(y, xmm1);
|
||||||
xmm2 = _mm_mul_ps(y, xmm2);
|
xmm2 = _mm_mul_ps(y, xmm2);
|
||||||
xmm3 = _mm_mul_ps(y, xmm3);
|
xmm3 = _mm_mul_ps(y, xmm3);
|
||||||
@ -163,25 +163,25 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_a_sse2(lv_32fc_t* out, const fl
|
|||||||
sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);
|
sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);
|
||||||
|
|
||||||
/* Evaluate the first polynomial (0 <= x <= Pi/4) */
|
/* Evaluate the first polynomial (0 <= x <= Pi/4) */
|
||||||
__m128 z = _mm_mul_ps(x,x);
|
__m128 z = _mm_mul_ps(x, x);
|
||||||
y = *(__m128*)_ps_coscof_p0;
|
y = *(__m128 *)_ps_coscof_p0;
|
||||||
|
|
||||||
y = _mm_mul_ps(y, z);
|
y = _mm_mul_ps(y, z);
|
||||||
y = _mm_add_ps(y, *(__m128*)_ps_coscof_p1);
|
y = _mm_add_ps(y, *(__m128 *)_ps_coscof_p1);
|
||||||
y = _mm_mul_ps(y, z);
|
y = _mm_mul_ps(y, z);
|
||||||
y = _mm_add_ps(y, *(__m128*)_ps_coscof_p2);
|
y = _mm_add_ps(y, *(__m128 *)_ps_coscof_p2);
|
||||||
y = _mm_mul_ps(y, z);
|
y = _mm_mul_ps(y, z);
|
||||||
y = _mm_mul_ps(y, z);
|
y = _mm_mul_ps(y, z);
|
||||||
__m128 tmp = _mm_mul_ps(z, *(__m128*)_ps_0p5);
|
__m128 tmp = _mm_mul_ps(z, *(__m128 *)_ps_0p5);
|
||||||
y = _mm_sub_ps(y, tmp);
|
y = _mm_sub_ps(y, tmp);
|
||||||
y = _mm_add_ps(y, *(__m128*)_ps_1);
|
y = _mm_add_ps(y, *(__m128 *)_ps_1);
|
||||||
|
|
||||||
/* Evaluate the second polynomial (Pi/4 <= x <= 0) */
|
/* Evaluate the second polynomial (Pi/4 <= x <= 0) */
|
||||||
__m128 y2 = *(__m128*)_ps_sincof_p0;
|
__m128 y2 = *(__m128 *)_ps_sincof_p0;
|
||||||
y2 = _mm_mul_ps(y2, z);
|
y2 = _mm_mul_ps(y2, z);
|
||||||
y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p1);
|
y2 = _mm_add_ps(y2, *(__m128 *)_ps_sincof_p1);
|
||||||
y2 = _mm_mul_ps(y2, z);
|
y2 = _mm_mul_ps(y2, z);
|
||||||
y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p2);
|
y2 = _mm_add_ps(y2, *(__m128 *)_ps_sincof_p2);
|
||||||
y2 = _mm_mul_ps(y2, z);
|
y2 = _mm_mul_ps(y2, z);
|
||||||
y2 = _mm_mul_ps(y2, x);
|
y2 = _mm_mul_ps(y2, x);
|
||||||
y2 = _mm_add_ps(y2, x);
|
y2 = _mm_add_ps(y2, x);
|
||||||
@ -190,11 +190,11 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_a_sse2(lv_32fc_t* out, const fl
|
|||||||
xmm3 = poly_mask;
|
xmm3 = poly_mask;
|
||||||
__m128 ysin2 = _mm_and_ps(xmm3, y2);
|
__m128 ysin2 = _mm_and_ps(xmm3, y2);
|
||||||
__m128 ysin1 = _mm_andnot_ps(xmm3, y);
|
__m128 ysin1 = _mm_andnot_ps(xmm3, y);
|
||||||
y2 = _mm_sub_ps(y2,ysin2);
|
y2 = _mm_sub_ps(y2, ysin2);
|
||||||
y = _mm_sub_ps(y, ysin1);
|
y = _mm_sub_ps(y, ysin1);
|
||||||
|
|
||||||
xmm1 = _mm_add_ps(ysin1,ysin2);
|
xmm1 = _mm_add_ps(ysin1, ysin2);
|
||||||
xmm2 = _mm_add_ps(y,y2);
|
xmm2 = _mm_add_ps(y, y2);
|
||||||
|
|
||||||
/* update the sign */
|
/* update the sign */
|
||||||
sine = _mm_xor_ps(xmm1, sign_bit_sin);
|
sine = _mm_xor_ps(xmm1, sign_bit_sin);
|
||||||
@ -202,19 +202,19 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_a_sse2(lv_32fc_t* out, const fl
|
|||||||
|
|
||||||
/* write the output */
|
/* write the output */
|
||||||
aux = _mm_unpacklo_ps(cosine, sine);
|
aux = _mm_unpacklo_ps(cosine, sine);
|
||||||
_mm_store_ps((float*)bPtr, aux);
|
_mm_store_ps((float *)bPtr, aux);
|
||||||
bPtr += 2;
|
bPtr += 2;
|
||||||
aux = _mm_unpackhi_ps(cosine, sine);
|
aux = _mm_unpackhi_ps(cosine, sine);
|
||||||
_mm_store_ps((float*)bPtr, aux);
|
_mm_store_ps((float *)bPtr, aux);
|
||||||
bPtr += 2;
|
bPtr += 2;
|
||||||
|
|
||||||
four_phases_reg = _mm_add_ps(four_phases_reg, four_phases_inc_reg);
|
four_phases_reg = _mm_add_ps(four_phases_reg, four_phases_inc_reg);
|
||||||
}
|
}
|
||||||
|
|
||||||
_phase = _phase + phase_inc * (sse_iters * 4);
|
_phase = _phase + phase_inc * (sse_iters * 4);
|
||||||
for(number = sse_iters * 4; number < num_points; number++)
|
for (number = sse_iters * 4; number < num_points; number++)
|
||||||
{
|
{
|
||||||
*bPtr++ = lv_cmake((float)cosf((_phase)), (float)sinf((_phase)) );
|
*bPtr++ = lv_cmake((float)cosf((_phase)), (float)sinf((_phase)));
|
||||||
_phase += phase_inc;
|
_phase += phase_inc;
|
||||||
}
|
}
|
||||||
(*phase) = _phase;
|
(*phase) = _phase;
|
||||||
@ -227,9 +227,9 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_a_sse2(lv_32fc_t* out, const fl
|
|||||||
#include <emmintrin.h>
|
#include <emmintrin.h>
|
||||||
/* Adapted from http://gruntthepeon.free.fr/ssemath/sse_mathfun.h, original code from Julien Pommier */
|
/* Adapted from http://gruntthepeon.free.fr/ssemath/sse_mathfun.h, original code from Julien Pommier */
|
||||||
/* Based on algorithms from the cephes library http://www.netlib.org/cephes/ */
|
/* Based on algorithms from the cephes library http://www.netlib.org/cephes/ */
|
||||||
static inline void volk_gnsssdr_s32f_sincos_32fc_u_sse2(lv_32fc_t* out, const float phase_inc, float* phase, unsigned int num_points)
|
static inline void volk_gnsssdr_s32f_sincos_32fc_u_sse2(lv_32fc_t *out, const float phase_inc, float *phase, unsigned int num_points)
|
||||||
{
|
{
|
||||||
lv_32fc_t* bPtr = out;
|
lv_32fc_t *bPtr = out;
|
||||||
|
|
||||||
const unsigned int sse_iters = num_points / 4;
|
const unsigned int sse_iters = num_points / 4;
|
||||||
unsigned int number = 0;
|
unsigned int number = 0;
|
||||||
@ -241,44 +241,64 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_u_sse2(lv_32fc_t* out, const fl
|
|||||||
__m128i emm0, emm2, emm4;
|
__m128i emm0, emm2, emm4;
|
||||||
|
|
||||||
/* declare some SSE constants */
|
/* declare some SSE constants */
|
||||||
__VOLK_ATTR_ALIGNED(16) static const int _ps_inv_sign_mask[4] = { ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000 };
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
__VOLK_ATTR_ALIGNED(16) static const int _ps_sign_mask[4] = { (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000 };
|
static const int _ps_inv_sign_mask[4] = {~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000};
|
||||||
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
static const int _ps_sign_mask[4] = {(int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000};
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(16) static const float _ps_cephes_FOPI[4] = { 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516 };
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
__VOLK_ATTR_ALIGNED(16) static const int _pi32_1[4] = { 1, 1, 1, 1 };
|
static const float _ps_cephes_FOPI[4] = {1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516};
|
||||||
__VOLK_ATTR_ALIGNED(16) static const int _pi32_inv1[4] = { ~1, ~1, ~1, ~1 };
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
__VOLK_ATTR_ALIGNED(16) static const int _pi32_2[4] = { 2, 2, 2, 2};
|
static const int _pi32_1[4] = {1, 1, 1, 1};
|
||||||
__VOLK_ATTR_ALIGNED(16) static const int _pi32_4[4] = { 4, 4, 4, 4};
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
static const int _pi32_inv1[4] = {~1, ~1, ~1, ~1};
|
||||||
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
static const int _pi32_2[4] = {2, 2, 2, 2};
|
||||||
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
static const int _pi32_4[4] = {4, 4, 4, 4};
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(16) static const float _ps_minus_cephes_DP1[4] = { -0.78515625, -0.78515625, -0.78515625, -0.78515625 };
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
__VOLK_ATTR_ALIGNED(16) static const float _ps_minus_cephes_DP2[4] = { -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4 };
|
static const float _ps_minus_cephes_DP1[4] = {-0.78515625, -0.78515625, -0.78515625, -0.78515625};
|
||||||
__VOLK_ATTR_ALIGNED(16) static const float _ps_minus_cephes_DP3[4] = { -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8 };
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
__VOLK_ATTR_ALIGNED(16) static const float _ps_coscof_p0[4] = { 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005 };
|
static const float _ps_minus_cephes_DP2[4] = {-2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4};
|
||||||
__VOLK_ATTR_ALIGNED(16) static const float _ps_coscof_p1[4] = { -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003 };
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
__VOLK_ATTR_ALIGNED(16) static const float _ps_coscof_p2[4] = { 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002 };
|
static const float _ps_minus_cephes_DP3[4] = {-3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8};
|
||||||
__VOLK_ATTR_ALIGNED(16) static const float _ps_sincof_p0[4] = { -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4 };
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
__VOLK_ATTR_ALIGNED(16) static const float _ps_sincof_p1[4] = { 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3 };
|
static const float _ps_coscof_p0[4] = {2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005};
|
||||||
__VOLK_ATTR_ALIGNED(16) static const float _ps_sincof_p2[4] = { -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1 };
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
__VOLK_ATTR_ALIGNED(16) static const float _ps_0p5[4] = { 0.5f, 0.5f, 0.5f, 0.5f };
|
static const float _ps_coscof_p1[4] = {-1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003};
|
||||||
__VOLK_ATTR_ALIGNED(16) static const float _ps_1[4] = { 1.0f, 1.0f, 1.0f, 1.0f };
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
static const float _ps_coscof_p2[4] = {4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002};
|
||||||
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
static const float _ps_sincof_p0[4] = {-1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4};
|
||||||
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
static const float _ps_sincof_p1[4] = {8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3};
|
||||||
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
static const float _ps_sincof_p2[4] = {-1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1};
|
||||||
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
static const float _ps_0p5[4] = {0.5f, 0.5f, 0.5f, 0.5f};
|
||||||
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
static const float _ps_1[4] = {1.0f, 1.0f, 1.0f, 1.0f};
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(16) float four_phases[4] = { _phase, _phase + phase_inc, _phase + 2 * phase_inc, _phase + 3 * phase_inc };
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
__VOLK_ATTR_ALIGNED(16) float four_phases_inc[4] = { 4 * phase_inc, 4 * phase_inc, 4 * phase_inc, 4 * phase_inc };
|
float four_phases[4] = {_phase, _phase + phase_inc, _phase + 2 * phase_inc, _phase + 3 * phase_inc};
|
||||||
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
float four_phases_inc[4] = {4 * phase_inc, 4 * phase_inc, 4 * phase_inc, 4 * phase_inc};
|
||||||
four_phases_reg = _mm_load_ps(four_phases);
|
four_phases_reg = _mm_load_ps(four_phases);
|
||||||
const __m128 four_phases_inc_reg = _mm_load_ps(four_phases_inc);
|
const __m128 four_phases_inc_reg = _mm_load_ps(four_phases_inc);
|
||||||
|
|
||||||
for(;number < sse_iters; number++)
|
for (; number < sse_iters; number++)
|
||||||
{
|
{
|
||||||
x = four_phases_reg;
|
x = four_phases_reg;
|
||||||
|
|
||||||
sign_bit_sin = x;
|
sign_bit_sin = x;
|
||||||
/* take the absolute value */
|
/* take the absolute value */
|
||||||
x = _mm_and_ps(x, *(__m128*)_ps_inv_sign_mask);
|
x = _mm_and_ps(x, *(__m128 *)_ps_inv_sign_mask);
|
||||||
/* extract the sign bit (upper one) */
|
/* extract the sign bit (upper one) */
|
||||||
sign_bit_sin = _mm_and_ps(sign_bit_sin, *(__m128*)_ps_sign_mask);
|
sign_bit_sin = _mm_and_ps(sign_bit_sin, *(__m128 *)_ps_sign_mask);
|
||||||
|
|
||||||
/* scale by 4/Pi */
|
/* scale by 4/Pi */
|
||||||
y = _mm_mul_ps(x, *(__m128*)_ps_cephes_FOPI);
|
y = _mm_mul_ps(x, *(__m128 *)_ps_cephes_FOPI);
|
||||||
|
|
||||||
/* store the integer part of y in emm2 */
|
/* store the integer part of y in emm2 */
|
||||||
emm2 = _mm_cvttps_epi32(y);
|
emm2 = _mm_cvttps_epi32(y);
|
||||||
@ -302,9 +322,9 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_u_sse2(lv_32fc_t* out, const fl
|
|||||||
|
|
||||||
/* The magic pass: "Extended precision modular arithmetic”
|
/* The magic pass: "Extended precision modular arithmetic”
|
||||||
x = ((x - y * DP1) - y * DP2) - y * DP3; */
|
x = ((x - y * DP1) - y * DP2) - y * DP3; */
|
||||||
xmm1 = *(__m128*)_ps_minus_cephes_DP1;
|
xmm1 = *(__m128 *)_ps_minus_cephes_DP1;
|
||||||
xmm2 = *(__m128*)_ps_minus_cephes_DP2;
|
xmm2 = *(__m128 *)_ps_minus_cephes_DP2;
|
||||||
xmm3 = *(__m128*)_ps_minus_cephes_DP3;
|
xmm3 = *(__m128 *)_ps_minus_cephes_DP3;
|
||||||
xmm1 = _mm_mul_ps(y, xmm1);
|
xmm1 = _mm_mul_ps(y, xmm1);
|
||||||
xmm2 = _mm_mul_ps(y, xmm2);
|
xmm2 = _mm_mul_ps(y, xmm2);
|
||||||
xmm3 = _mm_mul_ps(y, xmm3);
|
xmm3 = _mm_mul_ps(y, xmm3);
|
||||||
@ -320,25 +340,25 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_u_sse2(lv_32fc_t* out, const fl
|
|||||||
sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);
|
sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);
|
||||||
|
|
||||||
/* Evaluate the first polynom (0 <= x <= Pi/4) */
|
/* Evaluate the first polynom (0 <= x <= Pi/4) */
|
||||||
__m128 z = _mm_mul_ps(x,x);
|
__m128 z = _mm_mul_ps(x, x);
|
||||||
y = *(__m128*)_ps_coscof_p0;
|
y = *(__m128 *)_ps_coscof_p0;
|
||||||
|
|
||||||
y = _mm_mul_ps(y, z);
|
y = _mm_mul_ps(y, z);
|
||||||
y = _mm_add_ps(y, *(__m128*)_ps_coscof_p1);
|
y = _mm_add_ps(y, *(__m128 *)_ps_coscof_p1);
|
||||||
y = _mm_mul_ps(y, z);
|
y = _mm_mul_ps(y, z);
|
||||||
y = _mm_add_ps(y, *(__m128*)_ps_coscof_p2);
|
y = _mm_add_ps(y, *(__m128 *)_ps_coscof_p2);
|
||||||
y = _mm_mul_ps(y, z);
|
y = _mm_mul_ps(y, z);
|
||||||
y = _mm_mul_ps(y, z);
|
y = _mm_mul_ps(y, z);
|
||||||
__m128 tmp = _mm_mul_ps(z, *(__m128*)_ps_0p5);
|
__m128 tmp = _mm_mul_ps(z, *(__m128 *)_ps_0p5);
|
||||||
y = _mm_sub_ps(y, tmp);
|
y = _mm_sub_ps(y, tmp);
|
||||||
y = _mm_add_ps(y, *(__m128*)_ps_1);
|
y = _mm_add_ps(y, *(__m128 *)_ps_1);
|
||||||
|
|
||||||
/* Evaluate the second polynom (Pi/4 <= x <= 0) */
|
/* Evaluate the second polynom (Pi/4 <= x <= 0) */
|
||||||
__m128 y2 = *(__m128*)_ps_sincof_p0;
|
__m128 y2 = *(__m128 *)_ps_sincof_p0;
|
||||||
y2 = _mm_mul_ps(y2, z);
|
y2 = _mm_mul_ps(y2, z);
|
||||||
y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p1);
|
y2 = _mm_add_ps(y2, *(__m128 *)_ps_sincof_p1);
|
||||||
y2 = _mm_mul_ps(y2, z);
|
y2 = _mm_mul_ps(y2, z);
|
||||||
y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p2);
|
y2 = _mm_add_ps(y2, *(__m128 *)_ps_sincof_p2);
|
||||||
y2 = _mm_mul_ps(y2, z);
|
y2 = _mm_mul_ps(y2, z);
|
||||||
y2 = _mm_mul_ps(y2, x);
|
y2 = _mm_mul_ps(y2, x);
|
||||||
y2 = _mm_add_ps(y2, x);
|
y2 = _mm_add_ps(y2, x);
|
||||||
@ -347,11 +367,11 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_u_sse2(lv_32fc_t* out, const fl
|
|||||||
xmm3 = poly_mask;
|
xmm3 = poly_mask;
|
||||||
__m128 ysin2 = _mm_and_ps(xmm3, y2);
|
__m128 ysin2 = _mm_and_ps(xmm3, y2);
|
||||||
__m128 ysin1 = _mm_andnot_ps(xmm3, y);
|
__m128 ysin1 = _mm_andnot_ps(xmm3, y);
|
||||||
y2 = _mm_sub_ps(y2,ysin2);
|
y2 = _mm_sub_ps(y2, ysin2);
|
||||||
y = _mm_sub_ps(y, ysin1);
|
y = _mm_sub_ps(y, ysin1);
|
||||||
|
|
||||||
xmm1 = _mm_add_ps(ysin1,ysin2);
|
xmm1 = _mm_add_ps(ysin1, ysin2);
|
||||||
xmm2 = _mm_add_ps(y,y2);
|
xmm2 = _mm_add_ps(y, y2);
|
||||||
|
|
||||||
/* update the sign */
|
/* update the sign */
|
||||||
sine = _mm_xor_ps(xmm1, sign_bit_sin);
|
sine = _mm_xor_ps(xmm1, sign_bit_sin);
|
||||||
@ -359,19 +379,19 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_u_sse2(lv_32fc_t* out, const fl
|
|||||||
|
|
||||||
/* write the output */
|
/* write the output */
|
||||||
aux = _mm_unpacklo_ps(cosine, sine);
|
aux = _mm_unpacklo_ps(cosine, sine);
|
||||||
_mm_storeu_ps((float*)bPtr, aux);
|
_mm_storeu_ps((float *)bPtr, aux);
|
||||||
bPtr += 2;
|
bPtr += 2;
|
||||||
aux = _mm_unpackhi_ps(cosine, sine);
|
aux = _mm_unpackhi_ps(cosine, sine);
|
||||||
_mm_storeu_ps((float*)bPtr, aux);
|
_mm_storeu_ps((float *)bPtr, aux);
|
||||||
bPtr += 2;
|
bPtr += 2;
|
||||||
|
|
||||||
four_phases_reg = _mm_add_ps(four_phases_reg, four_phases_inc_reg);
|
four_phases_reg = _mm_add_ps(four_phases_reg, four_phases_inc_reg);
|
||||||
}
|
}
|
||||||
|
|
||||||
_phase = _phase + phase_inc * (sse_iters * 4);
|
_phase = _phase + phase_inc * (sse_iters * 4);
|
||||||
for(number = sse_iters * 4; number < num_points; number++)
|
for (number = sse_iters * 4; number < num_points; number++)
|
||||||
{
|
{
|
||||||
*bPtr++ = lv_cmake((float)cosf(_phase), (float)sinf(_phase) );
|
*bPtr++ = lv_cmake((float)cosf(_phase), (float)sinf(_phase));
|
||||||
_phase += phase_inc;
|
_phase += phase_inc;
|
||||||
}
|
}
|
||||||
(*phase) = _phase;
|
(*phase) = _phase;
|
||||||
@ -382,13 +402,13 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_u_sse2(lv_32fc_t* out, const fl
|
|||||||
|
|
||||||
#ifdef LV_HAVE_GENERIC
|
#ifdef LV_HAVE_GENERIC
|
||||||
|
|
||||||
static inline void volk_gnsssdr_s32f_sincos_32fc_generic(lv_32fc_t* out, const float phase_inc, float* phase, unsigned int num_points)
|
static inline void volk_gnsssdr_s32f_sincos_32fc_generic(lv_32fc_t *out, const float phase_inc, float *phase, unsigned int num_points)
|
||||||
{
|
{
|
||||||
float _phase = (*phase);
|
float _phase = (*phase);
|
||||||
unsigned int i;
|
unsigned int i;
|
||||||
for(i = 0; i < num_points; i++)
|
for (i = 0; i < num_points; i++)
|
||||||
{
|
{
|
||||||
*out++ = lv_cmake((float)cosf(_phase), (float)sinf(_phase) );
|
*out++ = lv_cmake((float)cosf(_phase), (float)sinf(_phase));
|
||||||
_phase += phase_inc;
|
_phase += phase_inc;
|
||||||
}
|
}
|
||||||
(*phase) = _phase;
|
(*phase) = _phase;
|
||||||
@ -400,7 +420,7 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_generic(lv_32fc_t* out, const f
|
|||||||
#ifdef LV_HAVE_GENERIC
|
#ifdef LV_HAVE_GENERIC
|
||||||
#include <volk_gnsssdr/volk_gnsssdr_sine_table.h>
|
#include <volk_gnsssdr/volk_gnsssdr_sine_table.h>
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
static inline void volk_gnsssdr_s32f_sincos_32fc_generic_fxpt(lv_32fc_t* out, const float phase_inc, float* phase, unsigned int num_points)
|
static inline void volk_gnsssdr_s32f_sincos_32fc_generic_fxpt(lv_32fc_t *out, const float phase_inc, float *phase, unsigned int num_points)
|
||||||
{
|
{
|
||||||
float _in, s, c;
|
float _in, s, c;
|
||||||
unsigned int i;
|
unsigned int i;
|
||||||
@ -413,12 +433,12 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_generic_fxpt(lv_32fc_t* out, co
|
|||||||
const int32_t diffbits = bitlength - Nbits;
|
const int32_t diffbits = bitlength - Nbits;
|
||||||
uint32_t ux;
|
uint32_t ux;
|
||||||
float _phase = (*phase);
|
float _phase = (*phase);
|
||||||
for(i = 0; i < num_points; i++)
|
for (i = 0; i < num_points; i++)
|
||||||
{
|
{
|
||||||
_in = _phase;
|
_in = _phase;
|
||||||
d = (int32_t)floor(_in / TWO_PI + 0.5);
|
d = (int32_t)floor(_in / TWO_PI + 0.5);
|
||||||
_in -= d * TWO_PI;
|
_in -= d * TWO_PI;
|
||||||
x = (int32_t) ((float)_in * TWO_TO_THE_31_DIV_PI);
|
x = (int32_t)((float)_in * TWO_TO_THE_31_DIV_PI);
|
||||||
|
|
||||||
ux = x;
|
ux = x;
|
||||||
sin_index = ux >> diffbits;
|
sin_index = ux >> diffbits;
|
||||||
@ -428,7 +448,7 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_generic_fxpt(lv_32fc_t* out, co
|
|||||||
cos_index = ux >> diffbits;
|
cos_index = ux >> diffbits;
|
||||||
c = sine_table_10bits[cos_index][0] * (ux >> 1) + sine_table_10bits[cos_index][1];
|
c = sine_table_10bits[cos_index][0] * (ux >> 1) + sine_table_10bits[cos_index][1];
|
||||||
|
|
||||||
*out++ = lv_cmake((float)c, (float)s );
|
*out++ = lv_cmake((float)c, (float)s);
|
||||||
_phase += phase_inc;
|
_phase += phase_inc;
|
||||||
}
|
}
|
||||||
(*phase) = _phase;
|
(*phase) = _phase;
|
||||||
@ -441,9 +461,9 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_generic_fxpt(lv_32fc_t* out, co
|
|||||||
#include <immintrin.h>
|
#include <immintrin.h>
|
||||||
/* Based on algorithms from the cephes library http://www.netlib.org/cephes/
|
/* Based on algorithms from the cephes library http://www.netlib.org/cephes/
|
||||||
* Adapted to AVX2 by Carles Fernandez, based on original SSE2 code by Julien Pommier*/
|
* Adapted to AVX2 by Carles Fernandez, based on original SSE2 code by Julien Pommier*/
|
||||||
static inline void volk_gnsssdr_s32f_sincos_32fc_a_avx2(lv_32fc_t* out, const float phase_inc, float* phase, unsigned int num_points)
|
static inline void volk_gnsssdr_s32f_sincos_32fc_a_avx2(lv_32fc_t *out, const float phase_inc, float *phase, unsigned int num_points)
|
||||||
{
|
{
|
||||||
lv_32fc_t* bPtr = out;
|
lv_32fc_t *bPtr = out;
|
||||||
|
|
||||||
const unsigned int avx_iters = num_points / 8;
|
const unsigned int avx_iters = num_points / 8;
|
||||||
unsigned int number = 0;
|
unsigned int number = 0;
|
||||||
@ -456,44 +476,64 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_a_avx2(lv_32fc_t* out, const fl
|
|||||||
__m128 aux, c1, s1;
|
__m128 aux, c1, s1;
|
||||||
|
|
||||||
/* declare some AXX2 constants */
|
/* declare some AXX2 constants */
|
||||||
__VOLK_ATTR_ALIGNED(32) static const int _ps_inv_sign_mask[8] = { ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000 };
|
__VOLK_ATTR_ALIGNED(32)
|
||||||
__VOLK_ATTR_ALIGNED(32) static const int _ps_sign_mask[8] = { (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000 };
|
static const int _ps_inv_sign_mask[8] = {~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000};
|
||||||
|
__VOLK_ATTR_ALIGNED(32)
|
||||||
|
static const int _ps_sign_mask[8] = {(int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000};
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(32) static const float _ps_cephes_FOPI[8] = { 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516 };
|
__VOLK_ATTR_ALIGNED(32)
|
||||||
__VOLK_ATTR_ALIGNED(32) static const int _pi32_1[8] = { 1, 1, 1, 1, 1, 1, 1, 1 };
|
static const float _ps_cephes_FOPI[8] = {1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516};
|
||||||
__VOLK_ATTR_ALIGNED(32) static const int _pi32_inv1[8] = { ~1, ~1, ~1, ~1, ~1, ~1, ~1, ~1 };
|
__VOLK_ATTR_ALIGNED(32)
|
||||||
__VOLK_ATTR_ALIGNED(32) static const int _pi32_2[8] = { 2, 2, 2, 2, 2, 2, 2, 2 };
|
static const int _pi32_1[8] = {1, 1, 1, 1, 1, 1, 1, 1};
|
||||||
__VOLK_ATTR_ALIGNED(32) static const int _pi32_4[8] = { 4, 4, 4, 4, 4, 4, 4, 4 };
|
__VOLK_ATTR_ALIGNED(32)
|
||||||
|
static const int _pi32_inv1[8] = {~1, ~1, ~1, ~1, ~1, ~1, ~1, ~1};
|
||||||
|
__VOLK_ATTR_ALIGNED(32)
|
||||||
|
static const int _pi32_2[8] = {2, 2, 2, 2, 2, 2, 2, 2};
|
||||||
|
__VOLK_ATTR_ALIGNED(32)
|
||||||
|
static const int _pi32_4[8] = {4, 4, 4, 4, 4, 4, 4, 4};
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(32) static const float _ps_minus_cephes_DP1[8] = { -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625 };
|
__VOLK_ATTR_ALIGNED(32)
|
||||||
__VOLK_ATTR_ALIGNED(32) static const float _ps_minus_cephes_DP2[8] = { -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4 };
|
static const float _ps_minus_cephes_DP1[8] = {-0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625};
|
||||||
__VOLK_ATTR_ALIGNED(32) static const float _ps_minus_cephes_DP3[8] = { -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8 };
|
__VOLK_ATTR_ALIGNED(32)
|
||||||
__VOLK_ATTR_ALIGNED(32) static const float _ps_coscof_p0[8] = { 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005 };
|
static const float _ps_minus_cephes_DP2[8] = {-2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4};
|
||||||
__VOLK_ATTR_ALIGNED(32) static const float _ps_coscof_p1[8] = { -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003 };
|
__VOLK_ATTR_ALIGNED(32)
|
||||||
__VOLK_ATTR_ALIGNED(32) static const float _ps_coscof_p2[8] = { 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002 };
|
static const float _ps_minus_cephes_DP3[8] = {-3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8};
|
||||||
__VOLK_ATTR_ALIGNED(32) static const float _ps_sincof_p0[8] = { -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4 };
|
__VOLK_ATTR_ALIGNED(32)
|
||||||
__VOLK_ATTR_ALIGNED(32) static const float _ps_sincof_p1[8] = { 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3 };
|
static const float _ps_coscof_p0[8] = {2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005};
|
||||||
__VOLK_ATTR_ALIGNED(32) static const float _ps_sincof_p2[8] = { -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1 };
|
__VOLK_ATTR_ALIGNED(32)
|
||||||
__VOLK_ATTR_ALIGNED(32) static const float _ps_0p5[8] = { 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f };
|
static const float _ps_coscof_p1[8] = {-1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003};
|
||||||
__VOLK_ATTR_ALIGNED(32) static const float _ps_1[8] = { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f };
|
__VOLK_ATTR_ALIGNED(32)
|
||||||
|
static const float _ps_coscof_p2[8] = {4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002};
|
||||||
|
__VOLK_ATTR_ALIGNED(32)
|
||||||
|
static const float _ps_sincof_p0[8] = {-1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4};
|
||||||
|
__VOLK_ATTR_ALIGNED(32)
|
||||||
|
static const float _ps_sincof_p1[8] = {8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3};
|
||||||
|
__VOLK_ATTR_ALIGNED(32)
|
||||||
|
static const float _ps_sincof_p2[8] = {-1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1};
|
||||||
|
__VOLK_ATTR_ALIGNED(32)
|
||||||
|
static const float _ps_0p5[8] = {0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f};
|
||||||
|
__VOLK_ATTR_ALIGNED(32)
|
||||||
|
static const float _ps_1[8] = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(32) float eight_phases[8] = { _phase, _phase + phase_inc, _phase + 2 * phase_inc, _phase + 3 * phase_inc, _phase + 4 * phase_inc, _phase + 5 * phase_inc, _phase + 6 * phase_inc, _phase + 7 * phase_inc };
|
__VOLK_ATTR_ALIGNED(32)
|
||||||
__VOLK_ATTR_ALIGNED(32) float eight_phases_inc[8] = { 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc };
|
float eight_phases[8] = {_phase, _phase + phase_inc, _phase + 2 * phase_inc, _phase + 3 * phase_inc, _phase + 4 * phase_inc, _phase + 5 * phase_inc, _phase + 6 * phase_inc, _phase + 7 * phase_inc};
|
||||||
|
__VOLK_ATTR_ALIGNED(32)
|
||||||
|
float eight_phases_inc[8] = {8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc};
|
||||||
eight_phases_reg = _mm256_load_ps(eight_phases);
|
eight_phases_reg = _mm256_load_ps(eight_phases);
|
||||||
const __m256 eight_phases_inc_reg = _mm256_load_ps(eight_phases_inc);
|
const __m256 eight_phases_inc_reg = _mm256_load_ps(eight_phases_inc);
|
||||||
|
|
||||||
for(;number < avx_iters; number++)
|
for (; number < avx_iters; number++)
|
||||||
{
|
{
|
||||||
x = eight_phases_reg;
|
x = eight_phases_reg;
|
||||||
|
|
||||||
sign_bit_sin = x;
|
sign_bit_sin = x;
|
||||||
/* take the absolute value */
|
/* take the absolute value */
|
||||||
x = _mm256_and_ps(x, *(__m256*)_ps_inv_sign_mask);
|
x = _mm256_and_ps(x, *(__m256 *)_ps_inv_sign_mask);
|
||||||
/* extract the sign bit (upper one) */
|
/* extract the sign bit (upper one) */
|
||||||
sign_bit_sin = _mm256_and_ps(sign_bit_sin, *(__m256*)_ps_sign_mask);
|
sign_bit_sin = _mm256_and_ps(sign_bit_sin, *(__m256 *)_ps_sign_mask);
|
||||||
|
|
||||||
/* scale by 4/Pi */
|
/* scale by 4/Pi */
|
||||||
y = _mm256_mul_ps(x, *(__m256*)_ps_cephes_FOPI);
|
y = _mm256_mul_ps(x, *(__m256 *)_ps_cephes_FOPI);
|
||||||
|
|
||||||
/* store the integer part of y in emm2 */
|
/* store the integer part of y in emm2 */
|
||||||
emm2 = _mm256_cvttps_epi32(y);
|
emm2 = _mm256_cvttps_epi32(y);
|
||||||
@ -517,9 +557,9 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_a_avx2(lv_32fc_t* out, const fl
|
|||||||
|
|
||||||
/* The magic pass: "Extended precision modular arithmetic”
|
/* The magic pass: "Extended precision modular arithmetic”
|
||||||
x = ((x - y * DP1) - y * DP2) - y * DP3; */
|
x = ((x - y * DP1) - y * DP2) - y * DP3; */
|
||||||
xmm1 = *(__m256*)_ps_minus_cephes_DP1;
|
xmm1 = *(__m256 *)_ps_minus_cephes_DP1;
|
||||||
xmm2 = *(__m256*)_ps_minus_cephes_DP2;
|
xmm2 = *(__m256 *)_ps_minus_cephes_DP2;
|
||||||
xmm3 = *(__m256*)_ps_minus_cephes_DP3;
|
xmm3 = *(__m256 *)_ps_minus_cephes_DP3;
|
||||||
xmm1 = _mm256_mul_ps(y, xmm1);
|
xmm1 = _mm256_mul_ps(y, xmm1);
|
||||||
xmm2 = _mm256_mul_ps(y, xmm2);
|
xmm2 = _mm256_mul_ps(y, xmm2);
|
||||||
xmm3 = _mm256_mul_ps(y, xmm3);
|
xmm3 = _mm256_mul_ps(y, xmm3);
|
||||||
@ -536,24 +576,24 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_a_avx2(lv_32fc_t* out, const fl
|
|||||||
|
|
||||||
/* Evaluate the first polynom (0 <= x <= Pi/4) */
|
/* Evaluate the first polynom (0 <= x <= Pi/4) */
|
||||||
__m256 z = _mm256_mul_ps(x, x);
|
__m256 z = _mm256_mul_ps(x, x);
|
||||||
y = *(__m256*)_ps_coscof_p0;
|
y = *(__m256 *)_ps_coscof_p0;
|
||||||
|
|
||||||
y = _mm256_mul_ps(y, z);
|
y = _mm256_mul_ps(y, z);
|
||||||
y = _mm256_add_ps(y, *(__m256*)_ps_coscof_p1);
|
y = _mm256_add_ps(y, *(__m256 *)_ps_coscof_p1);
|
||||||
y = _mm256_mul_ps(y, z);
|
y = _mm256_mul_ps(y, z);
|
||||||
y = _mm256_add_ps(y, *(__m256*)_ps_coscof_p2);
|
y = _mm256_add_ps(y, *(__m256 *)_ps_coscof_p2);
|
||||||
y = _mm256_mul_ps(y, z);
|
y = _mm256_mul_ps(y, z);
|
||||||
y = _mm256_mul_ps(y, z);
|
y = _mm256_mul_ps(y, z);
|
||||||
__m256 tmp = _mm256_mul_ps(z, *(__m256*)_ps_0p5);
|
__m256 tmp = _mm256_mul_ps(z, *(__m256 *)_ps_0p5);
|
||||||
y = _mm256_sub_ps(y, tmp);
|
y = _mm256_sub_ps(y, tmp);
|
||||||
y = _mm256_add_ps(y, *(__m256*)_ps_1);
|
y = _mm256_add_ps(y, *(__m256 *)_ps_1);
|
||||||
|
|
||||||
/* Evaluate the second polynom (Pi/4 <= x <= 0) */
|
/* Evaluate the second polynom (Pi/4 <= x <= 0) */
|
||||||
__m256 y2 = *(__m256*)_ps_sincof_p0;
|
__m256 y2 = *(__m256 *)_ps_sincof_p0;
|
||||||
y2 = _mm256_mul_ps(y2, z);
|
y2 = _mm256_mul_ps(y2, z);
|
||||||
y2 = _mm256_add_ps(y2, *(__m256*)_ps_sincof_p1);
|
y2 = _mm256_add_ps(y2, *(__m256 *)_ps_sincof_p1);
|
||||||
y2 = _mm256_mul_ps(y2, z);
|
y2 = _mm256_mul_ps(y2, z);
|
||||||
y2 = _mm256_add_ps(y2, *(__m256*)_ps_sincof_p2);
|
y2 = _mm256_add_ps(y2, *(__m256 *)_ps_sincof_p2);
|
||||||
y2 = _mm256_mul_ps(y2, z);
|
y2 = _mm256_mul_ps(y2, z);
|
||||||
y2 = _mm256_mul_ps(y2, x);
|
y2 = _mm256_mul_ps(y2, x);
|
||||||
y2 = _mm256_add_ps(y2, x);
|
y2 = _mm256_add_ps(y2, x);
|
||||||
@ -576,27 +616,27 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_a_avx2(lv_32fc_t* out, const fl
|
|||||||
s1 = _mm256_extractf128_ps(sine, 0);
|
s1 = _mm256_extractf128_ps(sine, 0);
|
||||||
c1 = _mm256_extractf128_ps(cosine, 0);
|
c1 = _mm256_extractf128_ps(cosine, 0);
|
||||||
aux = _mm_unpacklo_ps(c1, s1);
|
aux = _mm_unpacklo_ps(c1, s1);
|
||||||
_mm_store_ps((float*)bPtr, aux);
|
_mm_store_ps((float *)bPtr, aux);
|
||||||
bPtr += 2;
|
bPtr += 2;
|
||||||
aux = _mm_unpackhi_ps(c1, s1);
|
aux = _mm_unpackhi_ps(c1, s1);
|
||||||
_mm_store_ps((float*)bPtr, aux);
|
_mm_store_ps((float *)bPtr, aux);
|
||||||
bPtr += 2;
|
bPtr += 2;
|
||||||
s1 = _mm256_extractf128_ps(sine, 1);
|
s1 = _mm256_extractf128_ps(sine, 1);
|
||||||
c1 = _mm256_extractf128_ps(cosine, 1);
|
c1 = _mm256_extractf128_ps(cosine, 1);
|
||||||
aux = _mm_unpacklo_ps(c1, s1);
|
aux = _mm_unpacklo_ps(c1, s1);
|
||||||
_mm_store_ps((float*)bPtr, aux);
|
_mm_store_ps((float *)bPtr, aux);
|
||||||
bPtr += 2;
|
bPtr += 2;
|
||||||
aux = _mm_unpackhi_ps(c1, s1);
|
aux = _mm_unpackhi_ps(c1, s1);
|
||||||
_mm_store_ps((float*)bPtr, aux);
|
_mm_store_ps((float *)bPtr, aux);
|
||||||
bPtr += 2;
|
bPtr += 2;
|
||||||
|
|
||||||
eight_phases_reg = _mm256_add_ps(eight_phases_reg, eight_phases_inc_reg);
|
eight_phases_reg = _mm256_add_ps(eight_phases_reg, eight_phases_inc_reg);
|
||||||
}
|
}
|
||||||
_mm256_zeroupper();
|
_mm256_zeroupper();
|
||||||
_phase = _phase + phase_inc * (avx_iters * 8);
|
_phase = _phase + phase_inc * (avx_iters * 8);
|
||||||
for(number = avx_iters * 8; number < num_points; number++)
|
for (number = avx_iters * 8; number < num_points; number++)
|
||||||
{
|
{
|
||||||
out[number] = lv_cmake((float)cosf(_phase), (float)sinf(_phase) );
|
out[number] = lv_cmake((float)cosf(_phase), (float)sinf(_phase));
|
||||||
_phase += phase_inc;
|
_phase += phase_inc;
|
||||||
}
|
}
|
||||||
(*phase) = _phase;
|
(*phase) = _phase;
|
||||||
@ -609,9 +649,9 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_a_avx2(lv_32fc_t* out, const fl
|
|||||||
#include <immintrin.h>
|
#include <immintrin.h>
|
||||||
/* Based on algorithms from the cephes library http://www.netlib.org/cephes/
|
/* Based on algorithms from the cephes library http://www.netlib.org/cephes/
|
||||||
* Adapted to AVX2 by Carles Fernandez, based on original SSE2 code by Julien Pommier*/
|
* Adapted to AVX2 by Carles Fernandez, based on original SSE2 code by Julien Pommier*/
|
||||||
static inline void volk_gnsssdr_s32f_sincos_32fc_u_avx2(lv_32fc_t* out, const float phase_inc, float* phase, unsigned int num_points)
|
static inline void volk_gnsssdr_s32f_sincos_32fc_u_avx2(lv_32fc_t *out, const float phase_inc, float *phase, unsigned int num_points)
|
||||||
{
|
{
|
||||||
lv_32fc_t* bPtr = out;
|
lv_32fc_t *bPtr = out;
|
||||||
|
|
||||||
const unsigned int avx_iters = num_points / 8;
|
const unsigned int avx_iters = num_points / 8;
|
||||||
unsigned int number = 0;
|
unsigned int number = 0;
|
||||||
@ -624,44 +664,64 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_u_avx2(lv_32fc_t* out, const fl
|
|||||||
__m128 aux, c1, s1;
|
__m128 aux, c1, s1;
|
||||||
|
|
||||||
/* declare some AXX2 constants */
|
/* declare some AXX2 constants */
|
||||||
__VOLK_ATTR_ALIGNED(32) static const int _ps_inv_sign_mask[8] = { ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000 };
|
__VOLK_ATTR_ALIGNED(32)
|
||||||
__VOLK_ATTR_ALIGNED(32) static const int _ps_sign_mask[8] = { (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000 };
|
static const int _ps_inv_sign_mask[8] = {~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000};
|
||||||
|
__VOLK_ATTR_ALIGNED(32)
|
||||||
|
static const int _ps_sign_mask[8] = {(int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000};
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(32) static const float _ps_cephes_FOPI[8] = { 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516 };
|
__VOLK_ATTR_ALIGNED(32)
|
||||||
__VOLK_ATTR_ALIGNED(32) static const int _pi32_1[8] = { 1, 1, 1, 1, 1, 1, 1, 1 };
|
static const float _ps_cephes_FOPI[8] = {1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516};
|
||||||
__VOLK_ATTR_ALIGNED(32) static const int _pi32_inv1[8] = { ~1, ~1, ~1, ~1, ~1, ~1, ~1, ~1 };
|
__VOLK_ATTR_ALIGNED(32)
|
||||||
__VOLK_ATTR_ALIGNED(32) static const int _pi32_2[8] = { 2, 2, 2, 2, 2, 2, 2, 2 };
|
static const int _pi32_1[8] = {1, 1, 1, 1, 1, 1, 1, 1};
|
||||||
__VOLK_ATTR_ALIGNED(32) static const int _pi32_4[8] = { 4, 4, 4, 4, 4, 4, 4, 4 };
|
__VOLK_ATTR_ALIGNED(32)
|
||||||
|
static const int _pi32_inv1[8] = {~1, ~1, ~1, ~1, ~1, ~1, ~1, ~1};
|
||||||
|
__VOLK_ATTR_ALIGNED(32)
|
||||||
|
static const int _pi32_2[8] = {2, 2, 2, 2, 2, 2, 2, 2};
|
||||||
|
__VOLK_ATTR_ALIGNED(32)
|
||||||
|
static const int _pi32_4[8] = {4, 4, 4, 4, 4, 4, 4, 4};
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(32) static const float _ps_minus_cephes_DP1[8] = { -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625 };
|
__VOLK_ATTR_ALIGNED(32)
|
||||||
__VOLK_ATTR_ALIGNED(32) static const float _ps_minus_cephes_DP2[8] = { -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4 };
|
static const float _ps_minus_cephes_DP1[8] = {-0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625};
|
||||||
__VOLK_ATTR_ALIGNED(32) static const float _ps_minus_cephes_DP3[8] = { -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8 };
|
__VOLK_ATTR_ALIGNED(32)
|
||||||
__VOLK_ATTR_ALIGNED(32) static const float _ps_coscof_p0[8] = { 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005 };
|
static const float _ps_minus_cephes_DP2[8] = {-2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4};
|
||||||
__VOLK_ATTR_ALIGNED(32) static const float _ps_coscof_p1[8] = { -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003 };
|
__VOLK_ATTR_ALIGNED(32)
|
||||||
__VOLK_ATTR_ALIGNED(32) static const float _ps_coscof_p2[8] = { 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002 };
|
static const float _ps_minus_cephes_DP3[8] = {-3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8};
|
||||||
__VOLK_ATTR_ALIGNED(32) static const float _ps_sincof_p0[8] = { -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4 };
|
__VOLK_ATTR_ALIGNED(32)
|
||||||
__VOLK_ATTR_ALIGNED(32) static const float _ps_sincof_p1[8] = { 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3 };
|
static const float _ps_coscof_p0[8] = {2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005};
|
||||||
__VOLK_ATTR_ALIGNED(32) static const float _ps_sincof_p2[8] = { -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1 };
|
__VOLK_ATTR_ALIGNED(32)
|
||||||
__VOLK_ATTR_ALIGNED(32) static const float _ps_0p5[8] = { 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f };
|
static const float _ps_coscof_p1[8] = {-1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003};
|
||||||
__VOLK_ATTR_ALIGNED(32) static const float _ps_1[8] = { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f };
|
__VOLK_ATTR_ALIGNED(32)
|
||||||
|
static const float _ps_coscof_p2[8] = {4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002};
|
||||||
|
__VOLK_ATTR_ALIGNED(32)
|
||||||
|
static const float _ps_sincof_p0[8] = {-1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4};
|
||||||
|
__VOLK_ATTR_ALIGNED(32)
|
||||||
|
static const float _ps_sincof_p1[8] = {8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3};
|
||||||
|
__VOLK_ATTR_ALIGNED(32)
|
||||||
|
static const float _ps_sincof_p2[8] = {-1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1};
|
||||||
|
__VOLK_ATTR_ALIGNED(32)
|
||||||
|
static const float _ps_0p5[8] = {0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f};
|
||||||
|
__VOLK_ATTR_ALIGNED(32)
|
||||||
|
static const float _ps_1[8] = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(32) float eight_phases[8] = { _phase, _phase + phase_inc, _phase + 2 * phase_inc, _phase + 3 * phase_inc, _phase + 4 * phase_inc, _phase + 5 * phase_inc, _phase + 6 * phase_inc, _phase + 7 * phase_inc };
|
__VOLK_ATTR_ALIGNED(32)
|
||||||
__VOLK_ATTR_ALIGNED(32) float eight_phases_inc[8] = { 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc };
|
float eight_phases[8] = {_phase, _phase + phase_inc, _phase + 2 * phase_inc, _phase + 3 * phase_inc, _phase + 4 * phase_inc, _phase + 5 * phase_inc, _phase + 6 * phase_inc, _phase + 7 * phase_inc};
|
||||||
|
__VOLK_ATTR_ALIGNED(32)
|
||||||
|
float eight_phases_inc[8] = {8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc};
|
||||||
eight_phases_reg = _mm256_load_ps(eight_phases);
|
eight_phases_reg = _mm256_load_ps(eight_phases);
|
||||||
const __m256 eight_phases_inc_reg = _mm256_load_ps(eight_phases_inc);
|
const __m256 eight_phases_inc_reg = _mm256_load_ps(eight_phases_inc);
|
||||||
|
|
||||||
for(;number < avx_iters; number++)
|
for (; number < avx_iters; number++)
|
||||||
{
|
{
|
||||||
x = eight_phases_reg;
|
x = eight_phases_reg;
|
||||||
|
|
||||||
sign_bit_sin = x;
|
sign_bit_sin = x;
|
||||||
/* take the absolute value */
|
/* take the absolute value */
|
||||||
x = _mm256_and_ps(x, *(__m256*)_ps_inv_sign_mask);
|
x = _mm256_and_ps(x, *(__m256 *)_ps_inv_sign_mask);
|
||||||
/* extract the sign bit (upper one) */
|
/* extract the sign bit (upper one) */
|
||||||
sign_bit_sin = _mm256_and_ps(sign_bit_sin, *(__m256*)_ps_sign_mask);
|
sign_bit_sin = _mm256_and_ps(sign_bit_sin, *(__m256 *)_ps_sign_mask);
|
||||||
|
|
||||||
/* scale by 4/Pi */
|
/* scale by 4/Pi */
|
||||||
y = _mm256_mul_ps(x, *(__m256*)_ps_cephes_FOPI);
|
y = _mm256_mul_ps(x, *(__m256 *)_ps_cephes_FOPI);
|
||||||
|
|
||||||
/* store the integer part of y in emm2 */
|
/* store the integer part of y in emm2 */
|
||||||
emm2 = _mm256_cvttps_epi32(y);
|
emm2 = _mm256_cvttps_epi32(y);
|
||||||
@ -685,9 +745,9 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_u_avx2(lv_32fc_t* out, const fl
|
|||||||
|
|
||||||
/* The magic pass: "Extended precision modular arithmetic”
|
/* The magic pass: "Extended precision modular arithmetic”
|
||||||
x = ((x - y * DP1) - y * DP2) - y * DP3; */
|
x = ((x - y * DP1) - y * DP2) - y * DP3; */
|
||||||
xmm1 = *(__m256*)_ps_minus_cephes_DP1;
|
xmm1 = *(__m256 *)_ps_minus_cephes_DP1;
|
||||||
xmm2 = *(__m256*)_ps_minus_cephes_DP2;
|
xmm2 = *(__m256 *)_ps_minus_cephes_DP2;
|
||||||
xmm3 = *(__m256*)_ps_minus_cephes_DP3;
|
xmm3 = *(__m256 *)_ps_minus_cephes_DP3;
|
||||||
xmm1 = _mm256_mul_ps(y, xmm1);
|
xmm1 = _mm256_mul_ps(y, xmm1);
|
||||||
xmm2 = _mm256_mul_ps(y, xmm2);
|
xmm2 = _mm256_mul_ps(y, xmm2);
|
||||||
xmm3 = _mm256_mul_ps(y, xmm3);
|
xmm3 = _mm256_mul_ps(y, xmm3);
|
||||||
@ -704,24 +764,24 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_u_avx2(lv_32fc_t* out, const fl
|
|||||||
|
|
||||||
/* Evaluate the first polynom (0 <= x <= Pi/4) */
|
/* Evaluate the first polynom (0 <= x <= Pi/4) */
|
||||||
__m256 z = _mm256_mul_ps(x, x);
|
__m256 z = _mm256_mul_ps(x, x);
|
||||||
y = *(__m256*)_ps_coscof_p0;
|
y = *(__m256 *)_ps_coscof_p0;
|
||||||
|
|
||||||
y = _mm256_mul_ps(y, z);
|
y = _mm256_mul_ps(y, z);
|
||||||
y = _mm256_add_ps(y, *(__m256*)_ps_coscof_p1);
|
y = _mm256_add_ps(y, *(__m256 *)_ps_coscof_p1);
|
||||||
y = _mm256_mul_ps(y, z);
|
y = _mm256_mul_ps(y, z);
|
||||||
y = _mm256_add_ps(y, *(__m256*)_ps_coscof_p2);
|
y = _mm256_add_ps(y, *(__m256 *)_ps_coscof_p2);
|
||||||
y = _mm256_mul_ps(y, z);
|
y = _mm256_mul_ps(y, z);
|
||||||
y = _mm256_mul_ps(y, z);
|
y = _mm256_mul_ps(y, z);
|
||||||
__m256 tmp = _mm256_mul_ps(z, *(__m256*)_ps_0p5);
|
__m256 tmp = _mm256_mul_ps(z, *(__m256 *)_ps_0p5);
|
||||||
y = _mm256_sub_ps(y, tmp);
|
y = _mm256_sub_ps(y, tmp);
|
||||||
y = _mm256_add_ps(y, *(__m256*)_ps_1);
|
y = _mm256_add_ps(y, *(__m256 *)_ps_1);
|
||||||
|
|
||||||
/* Evaluate the second polynom (Pi/4 <= x <= 0) */
|
/* Evaluate the second polynom (Pi/4 <= x <= 0) */
|
||||||
__m256 y2 = *(__m256*)_ps_sincof_p0;
|
__m256 y2 = *(__m256 *)_ps_sincof_p0;
|
||||||
y2 = _mm256_mul_ps(y2, z);
|
y2 = _mm256_mul_ps(y2, z);
|
||||||
y2 = _mm256_add_ps(y2, *(__m256*)_ps_sincof_p1);
|
y2 = _mm256_add_ps(y2, *(__m256 *)_ps_sincof_p1);
|
||||||
y2 = _mm256_mul_ps(y2, z);
|
y2 = _mm256_mul_ps(y2, z);
|
||||||
y2 = _mm256_add_ps(y2, *(__m256*)_ps_sincof_p2);
|
y2 = _mm256_add_ps(y2, *(__m256 *)_ps_sincof_p2);
|
||||||
y2 = _mm256_mul_ps(y2, z);
|
y2 = _mm256_mul_ps(y2, z);
|
||||||
y2 = _mm256_mul_ps(y2, x);
|
y2 = _mm256_mul_ps(y2, x);
|
||||||
y2 = _mm256_add_ps(y2, x);
|
y2 = _mm256_add_ps(y2, x);
|
||||||
@ -744,27 +804,27 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_u_avx2(lv_32fc_t* out, const fl
|
|||||||
s1 = _mm256_extractf128_ps(sine, 0);
|
s1 = _mm256_extractf128_ps(sine, 0);
|
||||||
c1 = _mm256_extractf128_ps(cosine, 0);
|
c1 = _mm256_extractf128_ps(cosine, 0);
|
||||||
aux = _mm_unpacklo_ps(c1, s1);
|
aux = _mm_unpacklo_ps(c1, s1);
|
||||||
_mm_storeu_ps((float*)bPtr, aux);
|
_mm_storeu_ps((float *)bPtr, aux);
|
||||||
bPtr += 2;
|
bPtr += 2;
|
||||||
aux = _mm_unpackhi_ps(c1, s1);
|
aux = _mm_unpackhi_ps(c1, s1);
|
||||||
_mm_storeu_ps((float*)bPtr, aux);
|
_mm_storeu_ps((float *)bPtr, aux);
|
||||||
bPtr += 2;
|
bPtr += 2;
|
||||||
s1 = _mm256_extractf128_ps(sine, 1);
|
s1 = _mm256_extractf128_ps(sine, 1);
|
||||||
c1 = _mm256_extractf128_ps(cosine, 1);
|
c1 = _mm256_extractf128_ps(cosine, 1);
|
||||||
aux = _mm_unpacklo_ps(c1, s1);
|
aux = _mm_unpacklo_ps(c1, s1);
|
||||||
_mm_storeu_ps((float*)bPtr, aux);
|
_mm_storeu_ps((float *)bPtr, aux);
|
||||||
bPtr += 2;
|
bPtr += 2;
|
||||||
aux = _mm_unpackhi_ps(c1, s1);
|
aux = _mm_unpackhi_ps(c1, s1);
|
||||||
_mm_storeu_ps((float*)bPtr, aux);
|
_mm_storeu_ps((float *)bPtr, aux);
|
||||||
bPtr += 2;
|
bPtr += 2;
|
||||||
|
|
||||||
eight_phases_reg = _mm256_add_ps(eight_phases_reg, eight_phases_inc_reg);
|
eight_phases_reg = _mm256_add_ps(eight_phases_reg, eight_phases_inc_reg);
|
||||||
}
|
}
|
||||||
_mm256_zeroupper();
|
_mm256_zeroupper();
|
||||||
_phase = _phase + phase_inc * (avx_iters * 8);
|
_phase = _phase + phase_inc * (avx_iters * 8);
|
||||||
for(number = avx_iters * 8; number < num_points; number++)
|
for (number = avx_iters * 8; number < num_points; number++)
|
||||||
{
|
{
|
||||||
out[number] = lv_cmake((float)cosf(_phase), (float)sinf(_phase) );
|
out[number] = lv_cmake((float)cosf(_phase), (float)sinf(_phase));
|
||||||
_phase += phase_inc;
|
_phase += phase_inc;
|
||||||
}
|
}
|
||||||
(*phase) = _phase;
|
(*phase) = _phase;
|
||||||
@ -777,15 +837,17 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_u_avx2(lv_32fc_t* out, const fl
|
|||||||
#include <arm_neon.h>
|
#include <arm_neon.h>
|
||||||
/* Adapted from http://gruntthepeon.free.fr/ssemath/neon_mathfun.h, original code from Julien Pommier */
|
/* Adapted from http://gruntthepeon.free.fr/ssemath/neon_mathfun.h, original code from Julien Pommier */
|
||||||
/* Based on algorithms from the cephes library http://www.netlib.org/cephes/ */
|
/* Based on algorithms from the cephes library http://www.netlib.org/cephes/ */
|
||||||
static inline void volk_gnsssdr_s32f_sincos_32fc_neon(lv_32fc_t* out, const float phase_inc, float* phase, unsigned int num_points)
|
static inline void volk_gnsssdr_s32f_sincos_32fc_neon(lv_32fc_t *out, const float phase_inc, float *phase, unsigned int num_points)
|
||||||
{
|
{
|
||||||
lv_32fc_t* bPtr = out;
|
lv_32fc_t *bPtr = out;
|
||||||
const unsigned int neon_iters = num_points / 4;
|
const unsigned int neon_iters = num_points / 4;
|
||||||
float _phase = (*phase);
|
float _phase = (*phase);
|
||||||
|
|
||||||
__VOLK_ATTR_ALIGNED(16) float32_t four_phases[4] = { _phase, _phase + phase_inc, _phase + 2 * phase_inc, _phase + 3 * phase_inc };
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
float32_t four_phases[4] = {_phase, _phase + phase_inc, _phase + 2 * phase_inc, _phase + 3 * phase_inc};
|
||||||
float four_inc = 4 * phase_inc;
|
float four_inc = 4 * phase_inc;
|
||||||
__VOLK_ATTR_ALIGNED(16) float32_t four_phases_inc[4] = { four_inc, four_inc, four_inc, four_inc };
|
__VOLK_ATTR_ALIGNED(16)
|
||||||
|
float32_t four_phases_inc[4] = {four_inc, four_inc, four_inc, four_inc};
|
||||||
|
|
||||||
float32x4_t four_phases_reg = vld1q_f32(four_phases);
|
float32x4_t four_phases_reg = vld1q_f32(four_phases);
|
||||||
float32x4_t four_phases_inc_reg = vld1q_f32(four_phases_inc);
|
float32x4_t four_phases_inc_reg = vld1q_f32(four_phases_inc);
|
||||||
@ -808,7 +870,7 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_neon(lv_32fc_t* out, const floa
|
|||||||
|
|
||||||
uint32x4_t emm2, poly_mask, sign_mask_sin, sign_mask_cos;
|
uint32x4_t emm2, poly_mask, sign_mask_sin, sign_mask_cos;
|
||||||
|
|
||||||
for(;number < neon_iters; number++)
|
for (; number < neon_iters; number++)
|
||||||
{
|
{
|
||||||
x = four_phases_reg;
|
x = four_phases_reg;
|
||||||
|
|
||||||
@ -847,7 +909,7 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_neon(lv_32fc_t* out, const floa
|
|||||||
|
|
||||||
/* Evaluate the first polynom (0 <= x <= Pi/4) in y1,
|
/* Evaluate the first polynom (0 <= x <= Pi/4) in y1,
|
||||||
and the second polynom (Pi/4 <= x <= 0) in y2 */
|
and the second polynom (Pi/4 <= x <= 0) in y2 */
|
||||||
z = vmulq_f32(x,x);
|
z = vmulq_f32(x, x);
|
||||||
|
|
||||||
y1 = vmulq_n_f32(z, c_coscof_p0);
|
y1 = vmulq_n_f32(z, c_coscof_p0);
|
||||||
y2 = vmulq_n_f32(z, c_sincof_p0);
|
y2 = vmulq_n_f32(z, c_sincof_p0);
|
||||||
@ -871,16 +933,16 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_neon(lv_32fc_t* out, const floa
|
|||||||
result.val[1] = vbslq_f32(sign_mask_sin, vnegq_f32(ys), ys);
|
result.val[1] = vbslq_f32(sign_mask_sin, vnegq_f32(ys), ys);
|
||||||
result.val[0] = vbslq_f32(sign_mask_cos, yc, vnegq_f32(yc));
|
result.val[0] = vbslq_f32(sign_mask_cos, yc, vnegq_f32(yc));
|
||||||
|
|
||||||
vst2q_f32((float32_t*)bPtr, result);
|
vst2q_f32((float32_t *)bPtr, result);
|
||||||
bPtr += 4;
|
bPtr += 4;
|
||||||
|
|
||||||
four_phases_reg = vaddq_f32(four_phases_reg, four_phases_inc_reg);
|
four_phases_reg = vaddq_f32(four_phases_reg, four_phases_inc_reg);
|
||||||
}
|
}
|
||||||
|
|
||||||
_phase = _phase + phase_inc * (neon_iters * 4);
|
_phase = _phase + phase_inc * (neon_iters * 4);
|
||||||
for(number = neon_iters * 4; number < num_points; number++)
|
for (number = neon_iters * 4; number < num_points; number++)
|
||||||
{
|
{
|
||||||
*bPtr++ = lv_cmake((float)cosf(_phase), (float)sinf(_phase) );
|
*bPtr++ = lv_cmake((float)cosf(_phase), (float)sinf(_phase));
|
||||||
_phase += phase_inc;
|
_phase += phase_inc;
|
||||||
}
|
}
|
||||||
(*phase) = _phase;
|
(*phase) = _phase;
|
||||||
|
@ -49,7 +49,7 @@ static inline void volk_gnsssdr_s32f_sincospuppet_32fc_generic(lv_32fc_t* out, c
|
|||||||
volk_gnsssdr_s32f_sincos_32fc_generic(out, phase_inc, phase, num_points);
|
volk_gnsssdr_s32f_sincos_32fc_generic(out, phase_inc, phase, num_points);
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif /* LV_HAVE_GENERIC */
|
#endif /* LV_HAVE_GENERIC */
|
||||||
|
|
||||||
|
|
||||||
#ifdef LV_HAVE_GENERIC
|
#ifdef LV_HAVE_GENERIC
|
||||||
@ -60,7 +60,7 @@ static inline void volk_gnsssdr_s32f_sincospuppet_32fc_generic_fxpt(lv_32fc_t* o
|
|||||||
volk_gnsssdr_s32f_sincos_32fc_generic_fxpt(out, phase_inc, phase, num_points);
|
volk_gnsssdr_s32f_sincos_32fc_generic_fxpt(out, phase_inc, phase, num_points);
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif /* LV_HAVE_GENERIC */
|
#endif /* LV_HAVE_GENERIC */
|
||||||
|
|
||||||
|
|
||||||
#ifdef LV_HAVE_SSE2
|
#ifdef LV_HAVE_SSE2
|
||||||
@ -70,7 +70,7 @@ static inline void volk_gnsssdr_s32f_sincospuppet_32fc_a_sse2(lv_32fc_t* out, co
|
|||||||
phase[0] = 3;
|
phase[0] = 3;
|
||||||
volk_gnsssdr_s32f_sincos_32fc_a_sse2(out, phase_inc, phase, num_points);
|
volk_gnsssdr_s32f_sincos_32fc_a_sse2(out, phase_inc, phase, num_points);
|
||||||
}
|
}
|
||||||
#endif /* LV_HAVE_SSE2 */
|
#endif /* LV_HAVE_SSE2 */
|
||||||
|
|
||||||
|
|
||||||
#ifdef LV_HAVE_SSE2
|
#ifdef LV_HAVE_SSE2
|
||||||
@ -80,7 +80,7 @@ static inline void volk_gnsssdr_s32f_sincospuppet_32fc_u_sse2(lv_32fc_t* out, co
|
|||||||
phase[0] = 3;
|
phase[0] = 3;
|
||||||
volk_gnsssdr_s32f_sincos_32fc_u_sse2(out, phase_inc, phase, num_points);
|
volk_gnsssdr_s32f_sincos_32fc_u_sse2(out, phase_inc, phase, num_points);
|
||||||
}
|
}
|
||||||
#endif /* LV_HAVE_SSE2 */
|
#endif /* LV_HAVE_SSE2 */
|
||||||
|
|
||||||
|
|
||||||
#ifdef LV_HAVE_AVX2
|
#ifdef LV_HAVE_AVX2
|
||||||
@ -90,7 +90,7 @@ static inline void volk_gnsssdr_s32f_sincospuppet_32fc_a_avx2(lv_32fc_t* out, co
|
|||||||
phase[0] = 3;
|
phase[0] = 3;
|
||||||
volk_gnsssdr_s32f_sincos_32fc_a_avx2(out, phase_inc, phase, num_points);
|
volk_gnsssdr_s32f_sincos_32fc_a_avx2(out, phase_inc, phase, num_points);
|
||||||
}
|
}
|
||||||
#endif /* LV_HAVE_AVX2 */
|
#endif /* LV_HAVE_AVX2 */
|
||||||
|
|
||||||
|
|
||||||
#ifdef LV_HAVE_AVX2
|
#ifdef LV_HAVE_AVX2
|
||||||
@ -100,7 +100,7 @@ static inline void volk_gnsssdr_s32f_sincospuppet_32fc_u_avx2(lv_32fc_t* out, co
|
|||||||
phase[0] = 3;
|
phase[0] = 3;
|
||||||
volk_gnsssdr_s32f_sincos_32fc_u_avx2(out, phase_inc, phase, num_points);
|
volk_gnsssdr_s32f_sincos_32fc_u_avx2(out, phase_inc, phase, num_points);
|
||||||
}
|
}
|
||||||
#endif /* LV_HAVE_AVX2 */
|
#endif /* LV_HAVE_AVX2 */
|
||||||
|
|
||||||
|
|
||||||
#ifdef LV_HAVE_NEON
|
#ifdef LV_HAVE_NEON
|
||||||
@ -110,6 +110,6 @@ static inline void volk_gnsssdr_s32f_sincospuppet_32fc_neon(lv_32fc_t* out, cons
|
|||||||
phase[0] = 3;
|
phase[0] = 3;
|
||||||
volk_gnsssdr_s32f_sincos_32fc_neon(out, phase_inc, phase, num_points);
|
volk_gnsssdr_s32f_sincos_32fc_neon(out, phase_inc, phase, num_points);
|
||||||
}
|
}
|
||||||
#endif /* LV_HAVE_NEON */
|
#endif /* LV_HAVE_NEON */
|
||||||
|
|
||||||
#endif /* INCLUDED_volk_gnsssdr_s32f_sincospuppet_32fc_H */
|
#endif /* INCLUDED_volk_gnsssdr_s32f_sincospuppet_32fc_H */
|
||||||
|
@ -38,32 +38,31 @@
|
|||||||
|
|
||||||
// for puppets we need to get all the func_variants for the puppet and just
|
// for puppets we need to get all the func_variants for the puppet and just
|
||||||
// keep track of the actual function name to write to results
|
// keep track of the actual function name to write to results
|
||||||
#define VOLK_INIT_PUPP(func, puppet_master_func, test_params)\
|
#define VOLK_INIT_PUPP(func, puppet_master_func, test_params) \
|
||||||
volk_gnsssdr_test_case_t(func##_get_func_desc(), (void(*)())func##_manual, std::string(#func),\
|
volk_gnsssdr_test_case_t(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), \
|
||||||
std::string(#puppet_master_func), test_params)
|
std::string(#puppet_master_func), test_params)
|
||||||
|
|
||||||
#define VOLK_INIT_TEST(func, test_params)\
|
#define VOLK_INIT_TEST(func, test_params) \
|
||||||
volk_gnsssdr_test_case_t(func##_get_func_desc(), (void(*)())func##_manual, std::string(#func),\
|
volk_gnsssdr_test_case_t(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), \
|
||||||
test_params)
|
test_params)
|
||||||
|
|
||||||
#define QA(test) test_cases.push_back(test);
|
#define QA(test) test_cases.push_back(test);
|
||||||
|
|
||||||
std::vector<volk_gnsssdr_test_case_t> init_test_list(volk_gnsssdr_test_params_t test_params)
|
std::vector<volk_gnsssdr_test_case_t> init_test_list(volk_gnsssdr_test_params_t test_params)
|
||||||
{
|
{
|
||||||
|
|
||||||
// Some kernels need a lower tolerance
|
// Some kernels need a lower tolerance
|
||||||
volk_gnsssdr_test_params_t test_params_inacc = volk_gnsssdr_test_params_t(1e-3, test_params.scalar(),
|
volk_gnsssdr_test_params_t test_params_inacc = volk_gnsssdr_test_params_t(1e-3, test_params.scalar(),
|
||||||
test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex());
|
test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex());
|
||||||
volk_gnsssdr_test_params_t test_params_int1 = volk_gnsssdr_test_params_t(1, test_params.scalar(),
|
volk_gnsssdr_test_params_t test_params_int1 = volk_gnsssdr_test_params_t(1, test_params.scalar(),
|
||||||
test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex());
|
test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex());
|
||||||
// some others need more iterations ***** ADDED BY GNSS-SDR
|
// some others need more iterations ***** ADDED BY GNSS-SDR
|
||||||
volk_gnsssdr_test_params_t test_params_more_iters = volk_gnsssdr_test_params_t(test_params.tol(), test_params.scalar(),
|
volk_gnsssdr_test_params_t test_params_more_iters = volk_gnsssdr_test_params_t(test_params.tol(), test_params.scalar(),
|
||||||
test_params.vlen(), 100000, test_params.benchmark_mode(), test_params.kernel_regex());
|
test_params.vlen(), 100000, test_params.benchmark_mode(), test_params.kernel_regex());
|
||||||
// ... or more tolerance ***** ADDED BY GNSS-SDR
|
// ... or more tolerance ***** ADDED BY GNSS-SDR
|
||||||
volk_gnsssdr_test_params_t test_params_int16 = volk_gnsssdr_test_params_t(16, test_params.scalar(),
|
volk_gnsssdr_test_params_t test_params_int16 = volk_gnsssdr_test_params_t(16, test_params.scalar(),
|
||||||
test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex());
|
test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex());
|
||||||
volk_gnsssdr_test_params_t test_params_inacc2 = volk_gnsssdr_test_params_t(2e-1, test_params.scalar(),
|
volk_gnsssdr_test_params_t test_params_inacc2 = volk_gnsssdr_test_params_t(2e-1, test_params.scalar(),
|
||||||
test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex());
|
test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex());
|
||||||
|
|
||||||
std::vector<volk_gnsssdr_test_case_t> test_cases;
|
std::vector<volk_gnsssdr_test_case_t> test_cases;
|
||||||
|
|
||||||
@ -98,8 +97,7 @@ std::vector<volk_gnsssdr_test_case_t> init_test_list(volk_gnsssdr_test_params_t
|
|||||||
QA(VOLK_INIT_PUPP(volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic, volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn, test_params_int16))
|
QA(VOLK_INIT_PUPP(volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic, volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn, test_params_int16))
|
||||||
QA(VOLK_INIT_PUPP(volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic, volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn, test_params_int16))
|
QA(VOLK_INIT_PUPP(volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic, volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn, test_params_int16))
|
||||||
QA(VOLK_INIT_PUPP(volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc, volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn, test_params_int1))
|
QA(VOLK_INIT_PUPP(volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc, volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn, test_params_int1))
|
||||||
QA(VOLK_INIT_PUPP(volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc, volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn, test_params_int1))
|
QA(VOLK_INIT_PUPP(volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc, volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn, test_params_int1));
|
||||||
;
|
|
||||||
|
|
||||||
return test_cases;
|
return test_cases;
|
||||||
}
|
}
|
||||||
|
File diff suppressed because it is too large
Load Diff
@ -25,17 +25,18 @@
|
|||||||
|
|
||||||
#include "volk_gnsssdr/volk_gnsssdr_complex.h" // for lv_32fc_t
|
#include "volk_gnsssdr/volk_gnsssdr_complex.h" // for lv_32fc_t
|
||||||
#include "volk_gnsssdr/volk_gnsssdr.h" // for volk_gnsssdr_func_desc_t
|
#include "volk_gnsssdr/volk_gnsssdr.h" // for volk_gnsssdr_func_desc_t
|
||||||
#include <cstdbool> // for bool, false
|
#include <cstdbool> // for bool, false
|
||||||
#include <cstdlib> // for NULL
|
#include <cstdlib> // for NULL
|
||||||
#include <map> // for map
|
#include <map> // for map
|
||||||
#include <string> // for string, basic_string
|
#include <string> // for string, basic_string
|
||||||
#include <vector> // for vector
|
#include <vector> // for vector
|
||||||
|
|
||||||
|
|
||||||
/************************************************
|
/************************************************
|
||||||
* VOLK QA type definitions *
|
* VOLK QA type definitions *
|
||||||
************************************************/
|
************************************************/
|
||||||
struct volk_gnsssdr_type_t {
|
struct volk_gnsssdr_type_t
|
||||||
|
{
|
||||||
bool is_float;
|
bool is_float;
|
||||||
bool is_scalar;
|
bool is_scalar;
|
||||||
bool is_signed;
|
bool is_signed;
|
||||||
@ -44,80 +45,78 @@ struct volk_gnsssdr_type_t {
|
|||||||
std::string str;
|
std::string str;
|
||||||
};
|
};
|
||||||
|
|
||||||
class volk_gnsssdr_test_time_t {
|
class volk_gnsssdr_test_time_t
|
||||||
public:
|
{
|
||||||
std::string name;
|
public:
|
||||||
double time;
|
std::string name;
|
||||||
std::string units;
|
double time;
|
||||||
bool pass;
|
std::string units;
|
||||||
|
bool pass;
|
||||||
};
|
};
|
||||||
|
|
||||||
class volk_gnsssdr_test_results_t {
|
class volk_gnsssdr_test_results_t
|
||||||
public:
|
{
|
||||||
std::string name;
|
public:
|
||||||
std::string config_name;
|
std::string name;
|
||||||
unsigned int vlen;
|
std::string config_name;
|
||||||
unsigned int iter;
|
unsigned int vlen;
|
||||||
std::map<std::string, volk_gnsssdr_test_time_t> results;
|
unsigned int iter;
|
||||||
std::string best_arch_a;
|
std::map<std::string, volk_gnsssdr_test_time_t> results;
|
||||||
std::string best_arch_u;
|
std::string best_arch_a;
|
||||||
|
std::string best_arch_u;
|
||||||
};
|
};
|
||||||
|
|
||||||
class volk_gnsssdr_test_params_t {
|
class volk_gnsssdr_test_params_t
|
||||||
private:
|
{
|
||||||
float _tol;
|
private:
|
||||||
lv_32fc_t _scalar;
|
float _tol;
|
||||||
unsigned int _vlen;
|
lv_32fc_t _scalar;
|
||||||
unsigned int _iter;
|
unsigned int _vlen;
|
||||||
bool _benchmark_mode;
|
unsigned int _iter;
|
||||||
std::string _kernel_regex;
|
bool _benchmark_mode;
|
||||||
public:
|
std::string _kernel_regex;
|
||||||
// ctor
|
|
||||||
volk_gnsssdr_test_params_t(float tol, lv_32fc_t scalar, unsigned int vlen, unsigned int iter,
|
public:
|
||||||
bool benchmark_mode, std::string kernel_regex) :
|
// ctor
|
||||||
_tol(tol), _scalar(scalar), _vlen(vlen), _iter(iter),
|
volk_gnsssdr_test_params_t(float tol, lv_32fc_t scalar, unsigned int vlen, unsigned int iter,
|
||||||
_benchmark_mode(benchmark_mode), _kernel_regex(kernel_regex) {};
|
bool benchmark_mode, std::string kernel_regex) : _tol(tol), _scalar(scalar), _vlen(vlen), _iter(iter), _benchmark_mode(benchmark_mode), _kernel_regex(kernel_regex){};
|
||||||
// setters
|
// setters
|
||||||
void set_tol(float tol) {_tol=tol;};
|
void set_tol(float tol) { _tol = tol; };
|
||||||
void set_scalar(lv_32fc_t scalar) {_scalar=scalar;};
|
void set_scalar(lv_32fc_t scalar) { _scalar = scalar; };
|
||||||
void set_vlen(unsigned int vlen) {_vlen=vlen;};
|
void set_vlen(unsigned int vlen) { _vlen = vlen; };
|
||||||
void set_iter(unsigned int iter) {_iter=iter;};
|
void set_iter(unsigned int iter) { _iter = iter; };
|
||||||
void set_benchmark(bool benchmark) {_benchmark_mode=benchmark;};
|
void set_benchmark(bool benchmark) { _benchmark_mode = benchmark; };
|
||||||
void set_regex(std::string regex) {_kernel_regex=regex;};
|
void set_regex(std::string regex) { _kernel_regex = regex; };
|
||||||
// getters
|
// getters
|
||||||
float tol() {return _tol;};
|
float tol() { return _tol; };
|
||||||
lv_32fc_t scalar() {return _scalar;};
|
lv_32fc_t scalar() { return _scalar; };
|
||||||
unsigned int vlen() {return _vlen;};
|
unsigned int vlen() { return _vlen; };
|
||||||
unsigned int iter() {return _iter;};
|
unsigned int iter() { return _iter; };
|
||||||
bool benchmark_mode() {return _benchmark_mode;};
|
bool benchmark_mode() { return _benchmark_mode; };
|
||||||
std::string kernel_regex() {return _kernel_regex;};
|
std::string kernel_regex() { return _kernel_regex; };
|
||||||
};
|
};
|
||||||
|
|
||||||
class volk_gnsssdr_test_case_t {
|
class volk_gnsssdr_test_case_t
|
||||||
private:
|
{
|
||||||
volk_gnsssdr_func_desc_t _desc;
|
private:
|
||||||
void(*_kernel_ptr)();
|
volk_gnsssdr_func_desc_t _desc;
|
||||||
std::string _name;
|
void (*_kernel_ptr)();
|
||||||
volk_gnsssdr_test_params_t _test_parameters;
|
std::string _name;
|
||||||
std::string _puppet_master_name;
|
volk_gnsssdr_test_params_t _test_parameters;
|
||||||
public:
|
std::string _puppet_master_name;
|
||||||
volk_gnsssdr_func_desc_t desc() {return _desc;};
|
|
||||||
void (*kernel_ptr()) () {return _kernel_ptr;};
|
public:
|
||||||
std::string name() {return _name;};
|
volk_gnsssdr_func_desc_t desc() { return _desc; };
|
||||||
std::string puppet_master_name() {return _puppet_master_name;};
|
void (*kernel_ptr())() { return _kernel_ptr; };
|
||||||
volk_gnsssdr_test_params_t test_parameters() {return _test_parameters;};
|
std::string name() { return _name; };
|
||||||
// normal ctor
|
std::string puppet_master_name() { return _puppet_master_name; };
|
||||||
volk_gnsssdr_test_case_t(volk_gnsssdr_func_desc_t desc, void(*kernel_ptr)(), std::string name,
|
volk_gnsssdr_test_params_t test_parameters() { return _test_parameters; };
|
||||||
volk_gnsssdr_test_params_t test_parameters) :
|
// normal ctor
|
||||||
_desc(desc), _kernel_ptr(kernel_ptr), _name(name), _test_parameters(test_parameters),
|
volk_gnsssdr_test_case_t(volk_gnsssdr_func_desc_t desc, void (*kernel_ptr)(), std::string name,
|
||||||
_puppet_master_name("NULL")
|
volk_gnsssdr_test_params_t test_parameters) : _desc(desc), _kernel_ptr(kernel_ptr), _name(name), _test_parameters(test_parameters), _puppet_master_name("NULL"){};
|
||||||
{};
|
// ctor for puppets
|
||||||
// ctor for puppets
|
volk_gnsssdr_test_case_t(volk_gnsssdr_func_desc_t desc, void (*kernel_ptr)(), std::string name,
|
||||||
volk_gnsssdr_test_case_t(volk_gnsssdr_func_desc_t desc, void(*kernel_ptr)(), std::string name,
|
std::string puppet_master_name, volk_gnsssdr_test_params_t test_parameters) : _desc(desc), _kernel_ptr(kernel_ptr), _name(name), _test_parameters(test_parameters), _puppet_master_name(puppet_master_name){};
|
||||||
std::string puppet_master_name, volk_gnsssdr_test_params_t test_parameters) :
|
|
||||||
_desc(desc), _kernel_ptr(kernel_ptr), _name(name), _test_parameters(test_parameters),
|
|
||||||
_puppet_master_name(puppet_master_name)
|
|
||||||
{};
|
|
||||||
};
|
};
|
||||||
|
|
||||||
/************************************************
|
/************************************************
|
||||||
@ -130,58 +129,57 @@ void random_floats(float *buf, unsigned n);
|
|||||||
|
|
||||||
bool run_volk_gnsssdr_tests(
|
bool run_volk_gnsssdr_tests(
|
||||||
volk_gnsssdr_func_desc_t,
|
volk_gnsssdr_func_desc_t,
|
||||||
void(*)(),
|
void (*)(),
|
||||||
std::string,
|
std::string,
|
||||||
volk_gnsssdr_test_params_t,
|
volk_gnsssdr_test_params_t,
|
||||||
std::vector<volk_gnsssdr_test_results_t> *results = NULL,
|
std::vector<volk_gnsssdr_test_results_t> *results = NULL,
|
||||||
std::string puppet_master_name = "NULL"
|
std::string puppet_master_name = "NULL");
|
||||||
);
|
|
||||||
|
|
||||||
bool run_volk_gnsssdr_tests(
|
bool run_volk_gnsssdr_tests(
|
||||||
volk_gnsssdr_func_desc_t,
|
volk_gnsssdr_func_desc_t,
|
||||||
void(*)(),
|
void (*)(),
|
||||||
std::string,
|
std::string,
|
||||||
float,
|
float,
|
||||||
lv_32fc_t,
|
lv_32fc_t,
|
||||||
unsigned int,
|
unsigned int,
|
||||||
unsigned int,
|
unsigned int,
|
||||||
std::vector<volk_gnsssdr_test_results_t> *results = NULL,
|
std::vector<volk_gnsssdr_test_results_t> *results = NULL,
|
||||||
std::string puppet_master_name = "NULL",
|
std::string puppet_master_name = "NULL",
|
||||||
bool benchmark_mode = false
|
bool benchmark_mode = false);
|
||||||
);
|
|
||||||
|
|
||||||
|
|
||||||
#define VOLK_RUN_TESTS(func, tol, scalar, len, iter) \
|
#define VOLK_RUN_TESTS(func, tol, scalar, len, iter) \
|
||||||
BOOST_AUTO_TEST_CASE(func##_test) { \
|
BOOST_AUTO_TEST_CASE(func##_test) \
|
||||||
BOOST_CHECK_EQUAL(run_volk_gnsssdr_tests( \
|
{ \
|
||||||
func##_get_func_desc(), (void (*)())func##_manual, \
|
BOOST_CHECK_EQUAL(run_volk_gnsssdr_tests( \
|
||||||
std::string(#func), tol, scalar, len, iter, 0, "NULL"), \
|
func##_get_func_desc(), (void (*)())func##_manual, \
|
||||||
0); \
|
std::string(#func), tol, scalar, len, iter, 0, "NULL"), \
|
||||||
|
0); \
|
||||||
}
|
}
|
||||||
#define VOLK_PROFILE(func, test_params, results) run_volk_gnsssdr_tests(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), test_params, results, "NULL")
|
#define VOLK_PROFILE(func, test_params, results) run_volk_gnsssdr_tests(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), test_params, results, "NULL")
|
||||||
#define VOLK_PUPPET_PROFILE(func, puppet_master_func, test_params, results) run_volk_gnsssdr_tests(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), test_params, results, std::string(#puppet_master_func))
|
#define VOLK_PUPPET_PROFILE(func, puppet_master_func, test_params, results) run_volk_gnsssdr_tests(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), test_params, results, std::string(#puppet_master_func))
|
||||||
typedef void (*volk_gnsssdr_fn_1arg)(void *, unsigned int, const char*); //one input, operate in place
|
typedef void (*volk_gnsssdr_fn_1arg)(void *, unsigned int, const char *); //one input, operate in place
|
||||||
typedef void (*volk_gnsssdr_fn_2arg)(void *, void *, unsigned int, const char*);
|
typedef void (*volk_gnsssdr_fn_2arg)(void *, void *, unsigned int, const char *);
|
||||||
typedef void (*volk_gnsssdr_fn_3arg)(void *, void *, void *, unsigned int, const char*);
|
typedef void (*volk_gnsssdr_fn_3arg)(void *, void *, void *, unsigned int, const char *);
|
||||||
typedef void (*volk_gnsssdr_fn_4arg)(void *, void *, void *, void *, unsigned int, const char*);
|
typedef void (*volk_gnsssdr_fn_4arg)(void *, void *, void *, void *, unsigned int, const char *);
|
||||||
typedef void (*volk_gnsssdr_fn_1arg_s32f)(void *, float, unsigned int, const char*); //one input vector, one scalar float input
|
typedef void (*volk_gnsssdr_fn_1arg_s32f)(void *, float, unsigned int, const char *); //one input vector, one scalar float input
|
||||||
typedef void (*volk_gnsssdr_fn_2arg_s32f)(void *, void *, float, unsigned int, const char*);
|
typedef void (*volk_gnsssdr_fn_2arg_s32f)(void *, void *, float, unsigned int, const char *);
|
||||||
typedef void (*volk_gnsssdr_fn_3arg_s32f)(void *, void *, void *, float, unsigned int, const char*);
|
typedef void (*volk_gnsssdr_fn_3arg_s32f)(void *, void *, void *, float, unsigned int, const char *);
|
||||||
typedef void (*volk_gnsssdr_fn_1arg_s32fc)(void *, lv_32fc_t, unsigned int, const char*); //one input vector, one scalar float input
|
typedef void (*volk_gnsssdr_fn_1arg_s32fc)(void *, lv_32fc_t, unsigned int, const char *); //one input vector, one scalar float input
|
||||||
typedef void (*volk_gnsssdr_fn_2arg_s32fc)(void *, void *, lv_32fc_t, unsigned int, const char*);
|
typedef void (*volk_gnsssdr_fn_2arg_s32fc)(void *, void *, lv_32fc_t, unsigned int, const char *);
|
||||||
typedef void (*volk_gnsssdr_fn_3arg_s32fc)(void *, void *, void *, lv_32fc_t, unsigned int, const char*);
|
typedef void (*volk_gnsssdr_fn_3arg_s32fc)(void *, void *, void *, lv_32fc_t, unsigned int, const char *);
|
||||||
|
|
||||||
//ADDED BY GNSS-SDR. START
|
//ADDED BY GNSS-SDR. START
|
||||||
typedef void (*volk_gnsssdr_fn_1arg_s8i)(void *, char, unsigned int, const char*); //one input vector, one scalar char input
|
typedef void (*volk_gnsssdr_fn_1arg_s8i)(void *, char, unsigned int, const char *); //one input vector, one scalar char input
|
||||||
typedef void (*volk_gnsssdr_fn_2arg_s8i)(void *, void *, char, unsigned int, const char*);
|
typedef void (*volk_gnsssdr_fn_2arg_s8i)(void *, void *, char, unsigned int, const char *);
|
||||||
typedef void (*volk_gnsssdr_fn_3arg_s8i)(void *, void *, void *, char, unsigned int, const char*);
|
typedef void (*volk_gnsssdr_fn_3arg_s8i)(void *, void *, void *, char, unsigned int, const char *);
|
||||||
typedef void (*volk_gnsssdr_fn_1arg_s8ic)(void *, lv_8sc_t, unsigned int, const char*); //one input vector, one scalar lv_8sc_t vector input
|
typedef void (*volk_gnsssdr_fn_1arg_s8ic)(void *, lv_8sc_t, unsigned int, const char *); //one input vector, one scalar lv_8sc_t vector input
|
||||||
typedef void (*volk_gnsssdr_fn_2arg_s8ic)(void *, void *, lv_8sc_t, unsigned int, const char*);
|
typedef void (*volk_gnsssdr_fn_2arg_s8ic)(void *, void *, lv_8sc_t, unsigned int, const char *);
|
||||||
typedef void (*volk_gnsssdr_fn_3arg_s8ic)(void *, void *, void *, lv_8sc_t, unsigned int, const char*);
|
typedef void (*volk_gnsssdr_fn_3arg_s8ic)(void *, void *, void *, lv_8sc_t, unsigned int, const char *);
|
||||||
typedef void (*volk_gnsssdr_fn_1arg_s16ic)(void *, lv_16sc_t, unsigned int, const char*); //one input vector, one scalar lv_16sc_t vector input
|
typedef void (*volk_gnsssdr_fn_1arg_s16ic)(void *, lv_16sc_t, unsigned int, const char *); //one input vector, one scalar lv_16sc_t vector input
|
||||||
typedef void (*volk_gnsssdr_fn_2arg_s16ic)(void *, void *, lv_16sc_t, unsigned int, const char*);
|
typedef void (*volk_gnsssdr_fn_2arg_s16ic)(void *, void *, lv_16sc_t, unsigned int, const char *);
|
||||||
typedef void (*volk_gnsssdr_fn_3arg_s16ic)(void *, void *, void *, lv_16sc_t, unsigned int, const char*);
|
typedef void (*volk_gnsssdr_fn_3arg_s16ic)(void *, void *, void *, lv_16sc_t, unsigned int, const char *);
|
||||||
//ADDED BY GNSS-SDR. END
|
//ADDED BY GNSS-SDR. END
|
||||||
|
|
||||||
|
|
||||||
#endif // GNSS_SDR_VOLK_QA_UTILS_H
|
#endif // GNSS_SDR_VOLK_QA_UTILS_H
|
||||||
|
@ -18,16 +18,16 @@
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
|
|
||||||
#include "kernel_tests.h" // for init_test_list
|
#include "kernel_tests.h" // for init_test_list
|
||||||
#include "qa_utils.h" // for volk_gnsssdr_test_case_t, volk_gnsssdr_test_results_t
|
#include "qa_utils.h" // for volk_gnsssdr_test_case_t, volk_gnsssdr_test_results_t
|
||||||
#include "volk_gnsssdr/volk_gnsssdr_complex.h" // for lv_32fc_t
|
#include "volk_gnsssdr/volk_gnsssdr_complex.h" // for lv_32fc_t
|
||||||
#include <cstdbool> // for bool, false, true
|
#include <cstdbool> // for bool, false, true
|
||||||
#include <iostream> // for operator<<, basic_ostream, endl, char...
|
#include <iostream> // for operator<<, basic_ostream, endl, char...
|
||||||
#include <fstream> // IWYU pragma: keep
|
#include <fstream> // IWYU pragma: keep
|
||||||
#include <map> // for map, map<>::iterator, _Rb_tree_iterator
|
#include <map> // for map, map<>::iterator, _Rb_tree_iterator
|
||||||
#include <string> // for string, operator<<
|
#include <string> // for string, operator<<
|
||||||
#include <utility> // for pair
|
#include <utility> // for pair
|
||||||
#include <vector> // for vector
|
#include <vector> // for vector
|
||||||
|
|
||||||
void print_qa_xml(std::vector<volk_gnsssdr_test_results_t> results, unsigned int nfails);
|
void print_qa_xml(std::vector<volk_gnsssdr_test_results_t> results, unsigned int nfails);
|
||||||
|
|
||||||
@ -49,38 +49,44 @@ int main()
|
|||||||
std::vector<std::string> qa_failures;
|
std::vector<std::string> qa_failures;
|
||||||
std::vector<volk_gnsssdr_test_results_t> results;
|
std::vector<volk_gnsssdr_test_results_t> results;
|
||||||
// Test every kernel reporting failures when they occur
|
// Test every kernel reporting failures when they occur
|
||||||
for(unsigned int ii = 0; ii < test_cases.size(); ++ii) {
|
for (unsigned int ii = 0; ii < test_cases.size(); ++ii)
|
||||||
bool qa_result = false;
|
{
|
||||||
volk_gnsssdr_test_case_t test_case = test_cases[ii];
|
bool qa_result = false;
|
||||||
try {
|
volk_gnsssdr_test_case_t test_case = test_cases[ii];
|
||||||
qa_result = run_volk_gnsssdr_tests(test_case.desc(), test_case.kernel_ptr(), test_case.name(),
|
try
|
||||||
test_case.test_parameters(), &results, test_case.puppet_master_name());
|
{
|
||||||
}
|
qa_result = run_volk_gnsssdr_tests(test_case.desc(), test_case.kernel_ptr(), test_case.name(),
|
||||||
catch(...) {
|
test_case.test_parameters(), &results, test_case.puppet_master_name());
|
||||||
// TODO: what exceptions might we need to catch and how do we handle them?
|
}
|
||||||
std::cerr << "Exception found on kernel: " << test_case.name() << std::endl;
|
catch (...)
|
||||||
qa_result = false;
|
{
|
||||||
}
|
// TODO: what exceptions might we need to catch and how do we handle them?
|
||||||
|
std::cerr << "Exception found on kernel: " << test_case.name() << std::endl;
|
||||||
|
qa_result = false;
|
||||||
|
}
|
||||||
|
|
||||||
if(qa_result) {
|
if (qa_result)
|
||||||
std::cerr << "Failure on " << test_case.name() << std::endl;
|
{
|
||||||
qa_failures.push_back(test_case.name());
|
std::cerr << "Failure on " << test_case.name() << std::endl;
|
||||||
|
qa_failures.push_back(test_case.name());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
// Generate XML results
|
// Generate XML results
|
||||||
print_qa_xml(results, qa_failures.size());
|
print_qa_xml(results, qa_failures.size());
|
||||||
|
|
||||||
// Summarize QA results
|
// Summarize QA results
|
||||||
std::cerr << "Kernel QA finished: " << qa_failures.size() << " failures out of "
|
std::cerr << "Kernel QA finished: " << qa_failures.size() << " failures out of "
|
||||||
<< test_cases.size() << " tests." << std::endl;
|
<< test_cases.size() << " tests." << std::endl;
|
||||||
if(qa_failures.size() > 0) {
|
if (qa_failures.size() > 0)
|
||||||
std::cerr << "The following kernels failed QA:" << std::endl;
|
{
|
||||||
for(unsigned int ii = 0; ii < qa_failures.size(); ++ii) {
|
std::cerr << "The following kernels failed QA:" << std::endl;
|
||||||
std::cerr << " " << qa_failures[ii] << std::endl;
|
for (unsigned int ii = 0; ii < qa_failures.size(); ++ii)
|
||||||
|
{
|
||||||
|
std::cerr << " " << qa_failures[ii] << std::endl;
|
||||||
|
}
|
||||||
|
qa_ret_val = 1;
|
||||||
}
|
}
|
||||||
qa_ret_val = 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
return qa_ret_val;
|
return qa_ret_val;
|
||||||
}
|
}
|
||||||
@ -95,34 +101,34 @@ void print_qa_xml(std::vector<volk_gnsssdr_test_results_t> results, unsigned int
|
|||||||
qa_file.open(".unittest/kernels.xml");
|
qa_file.open(".unittest/kernels.xml");
|
||||||
|
|
||||||
qa_file << "<?xml version=\"1.0\" encoding=\"UTF-8\"?>" << std::endl;
|
qa_file << "<?xml version=\"1.0\" encoding=\"UTF-8\"?>" << std::endl;
|
||||||
qa_file << "<testsuites name=\"kernels\" " <<
|
qa_file << "<testsuites name=\"kernels\" "
|
||||||
"tests=\"" << results.size() << "\" " <<
|
<< "tests=\"" << results.size() << "\" "
|
||||||
"failures=\"" << nfails << "\" id=\"1\">" << std::endl;
|
<< "failures=\"" << nfails << "\" id=\"1\">" << std::endl;
|
||||||
|
|
||||||
// Results are in a vector by kernel. Each element has a result
|
// Results are in a vector by kernel. Each element has a result
|
||||||
// map containing time and arch name with test result
|
// map containing time and arch name with test result
|
||||||
for(unsigned int ii=0; ii < results.size(); ++ii) {
|
for (unsigned int ii = 0; ii < results.size(); ++ii)
|
||||||
volk_gnsssdr_test_results_t result = results[ii];
|
{
|
||||||
qa_file << " <testsuite name=\"" << result.name << "\">" << std::endl;
|
volk_gnsssdr_test_results_t result = results[ii];
|
||||||
|
qa_file << " <testsuite name=\"" << result.name << "\">" << std::endl;
|
||||||
|
|
||||||
std::map<std::string, volk_gnsssdr_test_time_t>::iterator kernel_time_pair;
|
std::map<std::string, volk_gnsssdr_test_time_t>::iterator kernel_time_pair;
|
||||||
for(kernel_time_pair = result.results.begin(); kernel_time_pair != result.results.end(); ++kernel_time_pair) {
|
for (kernel_time_pair = result.results.begin(); kernel_time_pair != result.results.end(); ++kernel_time_pair)
|
||||||
volk_gnsssdr_test_time_t test_time = kernel_time_pair->second;
|
{
|
||||||
qa_file << " <testcase name=\"" << test_time.name << "\" " <<
|
volk_gnsssdr_test_time_t test_time = kernel_time_pair->second;
|
||||||
"classname=\"" << result.name << "\" " <<
|
qa_file << " <testcase name=\"" << test_time.name << "\" "
|
||||||
"time=\"" << test_time.time << "\">" << std::endl;
|
<< "classname=\"" << result.name << "\" "
|
||||||
if(!test_time.pass)
|
<< "time=\"" << test_time.time << "\">" << std::endl;
|
||||||
qa_file << " <failure " <<
|
if (!test_time.pass)
|
||||||
"message=\"fail on arch " << test_time.name << "\">" <<
|
qa_file << " <failure "
|
||||||
"</failure>" << std::endl;
|
<< "message=\"fail on arch " << test_time.name << "\">"
|
||||||
qa_file << " </testcase>" << std::endl;
|
<< "</failure>" << std::endl;
|
||||||
|
qa_file << " </testcase>" << std::endl;
|
||||||
|
}
|
||||||
|
qa_file << " </testsuite>" << std::endl;
|
||||||
}
|
}
|
||||||
qa_file << " </testsuite>" << std::endl;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
qa_file << "</testsuites>" << std::endl;
|
qa_file << "</testsuites>" << std::endl;
|
||||||
qa_file.close();
|
qa_file.close();
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -43,15 +43,16 @@ void *volk_gnsssdr_malloc(size_t size, size_t alignment)
|
|||||||
return malloc(size);
|
return malloc(size);
|
||||||
|
|
||||||
int err = posix_memalign(&ptr, alignment, size);
|
int err = posix_memalign(&ptr, alignment, size);
|
||||||
if(err == 0)
|
if (err == 0)
|
||||||
{
|
{
|
||||||
return ptr;
|
return ptr;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
fprintf(stderr,
|
fprintf(stderr,
|
||||||
"VOLK_GNSSSDR: Error allocating memory "
|
"VOLK_GNSSSDR: Error allocating memory "
|
||||||
"(posix_memalign: error %d: %s)\n", err, strerror(err));
|
"(posix_memalign: error %d: %s)\n",
|
||||||
|
err, strerror(err));
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -68,7 +69,7 @@ void volk_gnsssdr_free(void *ptr)
|
|||||||
void *volk_gnsssdr_malloc(size_t size, size_t alignment)
|
void *volk_gnsssdr_malloc(size_t size, size_t alignment)
|
||||||
{
|
{
|
||||||
void *ptr = _aligned_malloc(size, alignment);
|
void *ptr = _aligned_malloc(size, alignment);
|
||||||
if(ptr == NULL)
|
if (ptr == NULL)
|
||||||
{
|
{
|
||||||
fprintf(stderr, "VOLK_GNSSSDR: Error allocating memory (_aligned_malloc)\n");
|
fprintf(stderr, "VOLK_GNSSSDR: Error allocating memory (_aligned_malloc)\n");
|
||||||
}
|
}
|
||||||
@ -81,7 +82,7 @@ void volk_gnsssdr_free(void *ptr)
|
|||||||
}
|
}
|
||||||
|
|
||||||
// No standard handlers; we'll do it ourselves.
|
// No standard handlers; we'll do it ourselves.
|
||||||
#else // _POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600 || HAVE_POSIX_MEMALIGN
|
#else // _POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600 || HAVE_POSIX_MEMALIGN
|
||||||
|
|
||||||
struct block_info
|
struct block_info
|
||||||
{
|
{
|
||||||
@ -102,7 +103,7 @@ volk_gnsssdr_malloc(size_t size, size_t alignment)
|
|||||||
real = malloc(size + (2 * alignment - 1));
|
real = malloc(size + (2 * alignment - 1));
|
||||||
|
|
||||||
/* Get pointer to the various zones */
|
/* Get pointer to the various zones */
|
||||||
user = (void *)((((uintptr_t) real) + sizeof(struct block_info) + alignment - 1) & ~(alignment - 1));
|
user = (void *)((((uintptr_t)real) + sizeof(struct block_info) + alignment - 1) & ~(alignment - 1));
|
||||||
info = (struct block_info *)(((uintptr_t)user) - sizeof(struct block_info));
|
info = (struct block_info *)(((uintptr_t)user) - sizeof(struct block_info));
|
||||||
|
|
||||||
/* Store the info for the free */
|
/* Store the info for the free */
|
||||||
@ -112,8 +113,7 @@ volk_gnsssdr_malloc(size_t size, size_t alignment)
|
|||||||
return user;
|
return user;
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void volk_gnsssdr_free(void *ptr)
|
||||||
volk_gnsssdr_free(void *ptr)
|
|
||||||
{
|
{
|
||||||
struct block_info *info;
|
struct block_info *info;
|
||||||
|
|
||||||
@ -124,6 +124,6 @@ volk_gnsssdr_free(void *ptr)
|
|||||||
free(info->real);
|
free(info->real);
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif // _POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600 || HAVE_POSIX_MEMALIGN
|
#endif // _POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600 || HAVE_POSIX_MEMALIGN
|
||||||
|
|
||||||
//#endif // _ISOC11_SOURCE
|
//#endif // _ISOC11_SOURCE
|
||||||
|
@ -26,16 +26,17 @@ void volk_gnsssdr_get_config_path(char *path)
|
|||||||
{
|
{
|
||||||
if (!path) return;
|
if (!path) return;
|
||||||
const char *suffix = "/.volk_gnsssdr/volk_gnsssdr_config";
|
const char *suffix = "/.volk_gnsssdr/volk_gnsssdr_config";
|
||||||
const char *suffix2 = "/volk_gnsssdr/volk_gnsssdr_config"; //non-hidden
|
const char *suffix2 = "/volk_gnsssdr/volk_gnsssdr_config"; // non-hidden
|
||||||
char *home = NULL;
|
char *home = NULL;
|
||||||
|
|
||||||
//allows config redirection via env variable
|
//allows config redirection via env variable
|
||||||
home = getenv("VOLK_CONFIGPATH");
|
home = getenv("VOLK_CONFIGPATH");
|
||||||
if(home!=NULL){
|
if (home != NULL)
|
||||||
strncpy(path,home,512);
|
{
|
||||||
strcat(path,suffix2);
|
strncpy(path, home, 512);
|
||||||
return;
|
strcat(path, suffix2);
|
||||||
}
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
if (home == NULL) home = getenv("HOME");
|
if (home == NULL) home = getenv("HOME");
|
||||||
if (home == NULL) home = getenv("APPDATA");
|
if (home == NULL) home = getenv("APPDATA");
|
||||||
@ -57,16 +58,16 @@ size_t volk_gnsssdr_load_preferences(volk_gnsssdr_arch_pref_t **prefs_res)
|
|||||||
|
|
||||||
//get the config path
|
//get the config path
|
||||||
volk_gnsssdr_get_config_path(path);
|
volk_gnsssdr_get_config_path(path);
|
||||||
if (!path[0]) return n_arch_prefs; //no prefs found
|
if (!path[0]) return n_arch_prefs; //no prefs found
|
||||||
config_file = fopen(path, "r");
|
config_file = fopen(path, "r");
|
||||||
if(!config_file) return n_arch_prefs; //no prefs found
|
if (!config_file) return n_arch_prefs; //no prefs found
|
||||||
|
|
||||||
//reset the file pointer and write the prefs into volk_gnsssdr_arch_prefs
|
//reset the file pointer and write the prefs into volk_gnsssdr_arch_prefs
|
||||||
while(fgets(line, sizeof(line), config_file) != NULL)
|
while (fgets(line, sizeof(line), config_file) != NULL)
|
||||||
{
|
{
|
||||||
prefs = (volk_gnsssdr_arch_pref_t *) realloc(prefs, (n_arch_prefs+1) * sizeof(*prefs));
|
prefs = (volk_gnsssdr_arch_pref_t *)realloc(prefs, (n_arch_prefs + 1) * sizeof(*prefs));
|
||||||
volk_gnsssdr_arch_pref_t *p = prefs + n_arch_prefs;
|
volk_gnsssdr_arch_pref_t *p = prefs + n_arch_prefs;
|
||||||
if(sscanf(line, "%s %s %s", p->name, p->impl_a, p->impl_u) == 3 && !strncmp(p->name, "volk_gnsssdr_", 5))
|
if (sscanf(line, "%s %s %s", p->name, p->impl_a, p->impl_u) == 3 && !strncmp(p->name, "volk_gnsssdr_", 5))
|
||||||
{
|
{
|
||||||
n_arch_prefs++;
|
n_arch_prefs++;
|
||||||
}
|
}
|
||||||
|
@ -29,7 +29,7 @@
|
|||||||
inline unsigned __popcnt(unsigned num)
|
inline unsigned __popcnt(unsigned num)
|
||||||
{
|
{
|
||||||
unsigned pop = 0;
|
unsigned pop = 0;
|
||||||
while(num)
|
while (num)
|
||||||
{
|
{
|
||||||
if (num & 0x1) pop++;
|
if (num & 0x1) pop++;
|
||||||
num >>= 1;
|
num >>= 1;
|
||||||
@ -39,15 +39,15 @@ inline unsigned __popcnt(unsigned num)
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
int volk_gnsssdr_get_index(
|
int volk_gnsssdr_get_index(
|
||||||
const char *impl_names[], //list of implementations by name
|
const char *impl_names[], //list of implementations by name
|
||||||
const size_t n_impls, //number of implementations available
|
const size_t n_impls, //number of implementations available
|
||||||
const char *impl_name //the implementation name to find
|
const char *impl_name //the implementation name to find
|
||||||
)
|
)
|
||||||
{
|
{
|
||||||
unsigned int i;
|
unsigned int i;
|
||||||
for (i = 0; i < n_impls; i++)
|
for (i = 0; i < n_impls; i++)
|
||||||
{
|
{
|
||||||
if(!strncmp(impl_names[i], impl_name, 20))
|
if (!strncmp(impl_names[i], impl_name, 20))
|
||||||
{
|
{
|
||||||
return i;
|
return i;
|
||||||
}
|
}
|
||||||
@ -55,24 +55,24 @@ int volk_gnsssdr_get_index(
|
|||||||
//TODO return -1;
|
//TODO return -1;
|
||||||
//something terrible should happen here
|
//something terrible should happen here
|
||||||
fprintf(stderr, "VOLK_GNSSSDR warning: no arch found, returning generic impl\n");
|
fprintf(stderr, "VOLK_GNSSSDR warning: no arch found, returning generic impl\n");
|
||||||
return volk_gnsssdr_get_index(impl_names, n_impls, "generic"); //but we'll fake it for now
|
return volk_gnsssdr_get_index(impl_names, n_impls, "generic"); //but we'll fake it for now
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
int volk_gnsssdr_rank_archs(
|
int volk_gnsssdr_rank_archs(
|
||||||
const char *kern_name, //name of the kernel to rank
|
const char *kern_name, //name of the kernel to rank
|
||||||
const char *impl_names[], //list of implementations by name
|
const char *impl_names[], //list of implementations by name
|
||||||
const int* impl_deps, //requirement mask per implementation
|
const int *impl_deps, //requirement mask per implementation
|
||||||
const bool* alignment, //alignment status of each implementation
|
const bool *alignment, //alignment status of each implementation
|
||||||
size_t n_impls, //number of implementations available
|
size_t n_impls, //number of implementations available
|
||||||
const bool align //if false, filter aligned implementations
|
const bool align //if false, filter aligned implementations
|
||||||
)
|
)
|
||||||
{
|
{
|
||||||
size_t i;
|
size_t i;
|
||||||
static volk_gnsssdr_arch_pref_t *volk_gnsssdr_arch_prefs;
|
static volk_gnsssdr_arch_pref_t *volk_gnsssdr_arch_prefs;
|
||||||
static size_t n_arch_prefs = 0;
|
static size_t n_arch_prefs = 0;
|
||||||
static int prefs_loaded = 0;
|
static int prefs_loaded = 0;
|
||||||
if(!prefs_loaded)
|
if (!prefs_loaded)
|
||||||
{
|
{
|
||||||
n_arch_prefs = volk_gnsssdr_load_preferences(&volk_gnsssdr_arch_prefs);
|
n_arch_prefs = volk_gnsssdr_load_preferences(&volk_gnsssdr_arch_prefs);
|
||||||
prefs_loaded = 1;
|
prefs_loaded = 1;
|
||||||
@ -81,17 +81,17 @@ int volk_gnsssdr_rank_archs(
|
|||||||
// If we've defined VOLK_GENERIC to be anything, always return the
|
// If we've defined VOLK_GENERIC to be anything, always return the
|
||||||
// 'generic' kernel. Used in GR's QA code.
|
// 'generic' kernel. Used in GR's QA code.
|
||||||
char *gen_env = getenv("VOLK_GENERIC");
|
char *gen_env = getenv("VOLK_GENERIC");
|
||||||
if(gen_env)
|
if (gen_env)
|
||||||
{
|
{
|
||||||
return volk_gnsssdr_get_index(impl_names, n_impls, "generic");
|
return volk_gnsssdr_get_index(impl_names, n_impls, "generic");
|
||||||
}
|
}
|
||||||
|
|
||||||
//now look for the function name in the prefs list
|
//now look for the function name in the prefs list
|
||||||
for(i = 0; i < n_arch_prefs; i++)
|
for (i = 0; i < n_arch_prefs; i++)
|
||||||
{
|
{
|
||||||
if(!strncmp(kern_name, volk_gnsssdr_arch_prefs[i].name, sizeof(volk_gnsssdr_arch_prefs[i].name))) //found it
|
if (!strncmp(kern_name, volk_gnsssdr_arch_prefs[i].name, sizeof(volk_gnsssdr_arch_prefs[i].name))) //found it
|
||||||
{
|
{
|
||||||
const char *impl_name = align? volk_gnsssdr_arch_prefs[i].impl_a : volk_gnsssdr_arch_prefs[i].impl_u;
|
const char *impl_name = align ? volk_gnsssdr_arch_prefs[i].impl_a : volk_gnsssdr_arch_prefs[i].impl_u;
|
||||||
return volk_gnsssdr_get_index(impl_names, n_impls, impl_name);
|
return volk_gnsssdr_get_index(impl_names, n_impls, impl_name);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -101,7 +101,7 @@ int volk_gnsssdr_rank_archs(
|
|||||||
size_t best_index_u = 0;
|
size_t best_index_u = 0;
|
||||||
int best_value_a = -1;
|
int best_value_a = -1;
|
||||||
int best_value_u = -1;
|
int best_value_u = -1;
|
||||||
for(i = 0; i < n_impls; i++)
|
for (i = 0; i < n_impls; i++)
|
||||||
{
|
{
|
||||||
const signed val = __popcnt(impl_deps[i]);
|
const signed val = __popcnt(impl_deps[i]);
|
||||||
if (alignment[i] && val > best_value_a)
|
if (alignment[i] && val > best_value_a)
|
||||||
|
@ -23,23 +23,24 @@
|
|||||||
#include <stdbool.h>
|
#include <stdbool.h>
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
extern "C" {
|
extern "C"
|
||||||
|
{
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
int volk_gnsssdr_get_index(
|
int volk_gnsssdr_get_index(
|
||||||
const char *impl_names[], //list of implementations by name
|
const char *impl_names[], //list of implementations by name
|
||||||
const size_t n_impls, //number of implementations available
|
const size_t n_impls, //number of implementations available
|
||||||
const char *impl_name //the implementation name to find
|
const char *impl_name //the implementation name to find
|
||||||
);
|
);
|
||||||
|
|
||||||
int volk_gnsssdr_rank_archs(
|
int volk_gnsssdr_rank_archs(
|
||||||
const char *kern_name, //name of the kernel to rank
|
const char *kern_name, //name of the kernel to rank
|
||||||
const char *impl_names[], //list of implementations by name
|
const char *impl_names[], //list of implementations by name
|
||||||
const int* impl_deps, //requirement mask per implementation
|
const int *impl_deps, //requirement mask per implementation
|
||||||
const bool* alignment, //alignment status of each implementation
|
const bool *alignment, //alignment status of each implementation
|
||||||
size_t n_impls, //number of implementations available
|
size_t n_impls, //number of implementations available
|
||||||
const bool align //if false, filter aligned implementations
|
const bool align //if false, filter aligned implementations
|
||||||
);
|
);
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
|
@ -31,80 +31,90 @@ static intptr_t __alignment_mask = 0;
|
|||||||
|
|
||||||
struct volk_gnsssdr_machine *get_machine(void)
|
struct volk_gnsssdr_machine *get_machine(void)
|
||||||
{
|
{
|
||||||
extern struct volk_gnsssdr_machine *volk_gnsssdr_machines[];
|
extern struct volk_gnsssdr_machine *volk_gnsssdr_machines[];
|
||||||
extern unsigned int n_volk_gnsssdr_machines;
|
extern unsigned int n_volk_gnsssdr_machines;
|
||||||
static struct volk_gnsssdr_machine *machine = NULL;
|
static struct volk_gnsssdr_machine *machine = NULL;
|
||||||
|
|
||||||
if(machine != NULL)
|
if (machine != NULL)
|
||||||
return machine;
|
return machine;
|
||||||
else {
|
else
|
||||||
unsigned int max_score = 0;
|
{
|
||||||
unsigned int i;
|
unsigned int max_score = 0;
|
||||||
struct volk_gnsssdr_machine *max_machine = NULL;
|
unsigned int i;
|
||||||
for(i=0; i<n_volk_gnsssdr_machines; i++) {
|
struct volk_gnsssdr_machine *max_machine = NULL;
|
||||||
if(!(volk_gnsssdr_machines[i]->caps & (~volk_gnsssdr_get_lvarch()))) {
|
for (i = 0; i < n_volk_gnsssdr_machines; i++)
|
||||||
if(volk_gnsssdr_machines[i]->caps > max_score) {
|
{
|
||||||
max_score = volk_gnsssdr_machines[i]->caps;
|
if (!(volk_gnsssdr_machines[i]->caps & (~volk_gnsssdr_get_lvarch())))
|
||||||
max_machine = volk_gnsssdr_machines[i];
|
{
|
||||||
|
if (volk_gnsssdr_machines[i]->caps > max_score)
|
||||||
|
{
|
||||||
|
max_score = volk_gnsssdr_machines[i]->caps;
|
||||||
|
max_machine = volk_gnsssdr_machines[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
machine = max_machine;
|
||||||
|
//printf("Using Volk machine: %s\n", machine->name);
|
||||||
|
__alignment = machine->alignment;
|
||||||
|
__alignment_mask = (intptr_t)(__alignment - 1);
|
||||||
|
return machine;
|
||||||
}
|
}
|
||||||
}
|
|
||||||
}
|
|
||||||
machine = max_machine;
|
|
||||||
//printf("Using Volk machine: %s\n", machine->name);
|
|
||||||
__alignment = machine->alignment;
|
|
||||||
__alignment_mask = (intptr_t)(__alignment-1);
|
|
||||||
return machine;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void volk_gnsssdr_list_machines(void)
|
void volk_gnsssdr_list_machines(void)
|
||||||
{
|
{
|
||||||
extern struct volk_gnsssdr_machine *volk_gnsssdr_machines[];
|
extern struct volk_gnsssdr_machine *volk_gnsssdr_machines[];
|
||||||
extern unsigned int n_volk_gnsssdr_machines;
|
extern unsigned int n_volk_gnsssdr_machines;
|
||||||
|
|
||||||
unsigned int i;
|
unsigned int i;
|
||||||
for(i=0; i<n_volk_gnsssdr_machines; i++) {
|
for (i = 0; i < n_volk_gnsssdr_machines; i++)
|
||||||
if(!(volk_gnsssdr_machines[i]->caps & (~volk_gnsssdr_get_lvarch()))) {
|
{
|
||||||
printf("%s;", volk_gnsssdr_machines[i]->name);
|
if (!(volk_gnsssdr_machines[i]->caps & (~volk_gnsssdr_get_lvarch())))
|
||||||
}
|
{
|
||||||
}
|
printf("%s;", volk_gnsssdr_machines[i]->name);
|
||||||
printf("\n");
|
}
|
||||||
|
}
|
||||||
|
printf("\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
const char* volk_gnsssdr_get_machine(void)
|
const char *volk_gnsssdr_get_machine(void)
|
||||||
{
|
{
|
||||||
extern struct volk_gnsssdr_machine *volk_gnsssdr_machines[];
|
extern struct volk_gnsssdr_machine *volk_gnsssdr_machines[];
|
||||||
extern unsigned int n_volk_gnsssdr_machines;
|
extern unsigned int n_volk_gnsssdr_machines;
|
||||||
static struct volk_gnsssdr_machine *machine = NULL;
|
static struct volk_gnsssdr_machine *machine = NULL;
|
||||||
|
|
||||||
if(machine != NULL)
|
if (machine != NULL)
|
||||||
return machine->name;
|
return machine->name;
|
||||||
else {
|
else
|
||||||
unsigned int max_score = 0;
|
{
|
||||||
unsigned int i;
|
unsigned int max_score = 0;
|
||||||
struct volk_gnsssdr_machine *max_machine = NULL;
|
unsigned int i;
|
||||||
for(i=0; i<n_volk_gnsssdr_machines; i++) {
|
struct volk_gnsssdr_machine *max_machine = NULL;
|
||||||
if(!(volk_gnsssdr_machines[i]->caps & (~volk_gnsssdr_get_lvarch()))) {
|
for (i = 0; i < n_volk_gnsssdr_machines; i++)
|
||||||
if(volk_gnsssdr_machines[i]->caps > max_score) {
|
{
|
||||||
max_score = volk_gnsssdr_machines[i]->caps;
|
if (!(volk_gnsssdr_machines[i]->caps & (~volk_gnsssdr_get_lvarch())))
|
||||||
max_machine = volk_gnsssdr_machines[i];
|
{
|
||||||
|
if (volk_gnsssdr_machines[i]->caps > max_score)
|
||||||
|
{
|
||||||
|
max_score = volk_gnsssdr_machines[i]->caps;
|
||||||
|
max_machine = volk_gnsssdr_machines[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
machine = max_machine;
|
||||||
|
return machine->name;
|
||||||
}
|
}
|
||||||
}
|
|
||||||
}
|
|
||||||
machine = max_machine;
|
|
||||||
return machine->name;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t volk_gnsssdr_get_alignment(void)
|
size_t volk_gnsssdr_get_alignment(void)
|
||||||
{
|
{
|
||||||
get_machine(); //ensures alignment is set
|
get_machine(); //ensures alignment is set
|
||||||
return __alignment;
|
return __alignment;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool volk_gnsssdr_is_aligned(const void *ptr)
|
bool volk_gnsssdr_is_aligned(const void *ptr)
|
||||||
{
|
{
|
||||||
return ((intptr_t)(ptr) & __alignment_mask) == 0;
|
return ((intptr_t)(ptr)&__alignment_mask) == 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
#define LV_HAVE_GENERIC
|
#define LV_HAVE_GENERIC
|
||||||
@ -113,13 +123,12 @@ bool volk_gnsssdr_is_aligned(const void *ptr)
|
|||||||
%for kern in kernels:
|
%for kern in kernels:
|
||||||
|
|
||||||
%if kern.has_dispatcher:
|
%if kern.has_dispatcher:
|
||||||
#include <volk_gnsssdr/${kern.name}.h> //pulls in the dispatcher
|
#include <volk_gnsssdr/${kern.name}.h> //pulls in the dispatcher
|
||||||
%endif
|
%endif
|
||||||
|
|
||||||
static inline void __${kern.name}_d(${kern.arglist_full})
|
static inline void __${kern.name}_d(${kern.arglist_full})
|
||||||
{
|
{
|
||||||
%if kern.has_dispatcher:
|
% if kern.has_dispatcher : ${kern.name} _dispatcher(${kern.arglist_names});
|
||||||
${kern.name}_dispatcher(${kern.arglist_names});
|
|
||||||
return;
|
return;
|
||||||
%endif
|
%endif
|
||||||
|
|
||||||
@ -131,41 +140,41 @@ static inline void __${kern.name}_d(${kern.arglist_full})
|
|||||||
%endfor
|
%endfor
|
||||||
0<% end_open_parens = ')'*num_open_parens %>${end_open_parens}
|
0<% end_open_parens = ')'*num_open_parens %>${end_open_parens}
|
||||||
)){
|
)){
|
||||||
${kern.name}_a(${kern.arglist_names});
|
${kern.name} _a(${kern.arglist_names});
|
||||||
}
|
}
|
||||||
else{
|
else{
|
||||||
${kern.name}_u(${kern.arglist_names});
|
${kern.name} _u(${kern.arglist_names});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline void __init_${kern.name}(void)
|
static inline void __init_${kern.name}(void)
|
||||||
{
|
{
|
||||||
const char *name = get_machine()->${kern.name}_name;
|
const char *name = get_machine()->${kern.name} _name;
|
||||||
const char **impl_names = get_machine()->${kern.name}_impl_names;
|
const char **impl_names = get_machine()->${kern.name} _impl_names;
|
||||||
const int *impl_deps = get_machine()->${kern.name}_impl_deps;
|
const int *impl_deps = get_machine()->${kern.name} _impl_deps;
|
||||||
const bool *alignment = get_machine()->${kern.name}_impl_alignment;
|
const bool *alignment = get_machine()->${kern.name} _impl_alignment;
|
||||||
const size_t n_impls = get_machine()->${kern.name}_n_impls;
|
const size_t n_impls = get_machine()->${kern.name} _n_impls;
|
||||||
const size_t index_a = volk_gnsssdr_rank_archs(name, impl_names, impl_deps, alignment, n_impls, true/*aligned*/);
|
const size_t index_a = volk_gnsssdr_rank_archs(name, impl_names, impl_deps, alignment, n_impls, true /*aligned*/);
|
||||||
const size_t index_u = volk_gnsssdr_rank_archs(name, impl_names, impl_deps, alignment, n_impls, false/*unaligned*/);
|
const size_t index_u = volk_gnsssdr_rank_archs(name, impl_names, impl_deps, alignment, n_impls, false /*unaligned*/);
|
||||||
${kern.name}_a = get_machine()->${kern.name}_impls[index_a];
|
${kern.name} _a = get_machine()->${kern.name} _impls[index_a];
|
||||||
${kern.name}_u = get_machine()->${kern.name}_impls[index_u];
|
${kern.name} _u = get_machine()->${kern.name} _impls[index_u];
|
||||||
|
|
||||||
assert(${kern.name}_a);
|
assert(${kern.name} _a);
|
||||||
assert(${kern.name}_u);
|
assert(${kern.name} _u);
|
||||||
|
|
||||||
${kern.name} = &__${kern.name}_d;
|
${kern.name} = &__${kern.name} _d;
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline void __${kern.name}_a(${kern.arglist_full})
|
static inline void __${kern.name} _a(${kern.arglist_full})
|
||||||
{
|
{
|
||||||
__init_${kern.name}();
|
__init_${kern.name}();
|
||||||
${kern.name}_a(${kern.arglist_names});
|
${kern.name} _a(${kern.arglist_names});
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline void __${kern.name}_u(${kern.arglist_full})
|
static inline void __${kern.name} _u(${kern.arglist_full})
|
||||||
{
|
{
|
||||||
__init_${kern.name}();
|
__init_${kern.name}();
|
||||||
${kern.name}_u(${kern.arglist_names});
|
${kern.name} _u(${kern.arglist_names});
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline void __${kern.name}(${kern.arglist_full})
|
static inline void __${kern.name}(${kern.arglist_full})
|
||||||
@ -174,34 +183,32 @@ static inline void __${kern.name}(${kern.arglist_full})
|
|||||||
${kern.name}(${kern.arglist_names});
|
${kern.name}(${kern.arglist_names});
|
||||||
}
|
}
|
||||||
|
|
||||||
${kern.pname} ${kern.name}_a = &__${kern.name}_a;
|
${kern.pname} ${kern.name} _a = &__${kern.name} _a;
|
||||||
${kern.pname} ${kern.name}_u = &__${kern.name}_u;
|
${kern.pname} ${kern.name} _u = &__${kern.name} _u;
|
||||||
${kern.pname} ${kern.name} = &__${kern.name};
|
${kern.pname} ${kern.name} = &__${kern.name};
|
||||||
|
|
||||||
void ${kern.name}_manual(${kern.arglist_full}, const char* impl_name)
|
void ${kern.name} _manual(${kern.arglist_full}, const char *impl_name)
|
||||||
{
|
{
|
||||||
const int index = volk_gnsssdr_get_index(
|
const int index = volk_gnsssdr_get_index(
|
||||||
get_machine()->${kern.name}_impl_names,
|
get_machine()->${kern.name} _impl_names,
|
||||||
get_machine()->${kern.name}_n_impls,
|
get_machine()->${kern.name} _n_impls,
|
||||||
impl_name
|
impl_name);
|
||||||
);
|
get_machine()->${kern.name} _impls[index](
|
||||||
get_machine()->${kern.name}_impls[index](
|
${kern.arglist_names});
|
||||||
${kern.arglist_names}
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
volk_gnsssdr_func_desc_t ${kern.name}_get_func_desc(void) {
|
volk_gnsssdr_func_desc_t ${kern.name} _get_func_desc(void)
|
||||||
const char **impl_names = get_machine()->${kern.name}_impl_names;
|
{
|
||||||
const int *impl_deps = get_machine()->${kern.name}_impl_deps;
|
const char **impl_names = get_machine()->${kern.name} _impl_names;
|
||||||
const bool *alignment = get_machine()->${kern.name}_impl_alignment;
|
const int *impl_deps = get_machine()->${kern.name} _impl_deps;
|
||||||
const size_t n_impls = get_machine()->${kern.name}_n_impls;
|
const bool *alignment = get_machine()->${kern.name} _impl_alignment;
|
||||||
|
const size_t n_impls = get_machine()->${kern.name} _n_impls;
|
||||||
volk_gnsssdr_func_desc_t desc = {
|
volk_gnsssdr_func_desc_t desc = {
|
||||||
impl_names,
|
impl_names,
|
||||||
impl_deps,
|
impl_deps,
|
||||||
alignment,
|
alignment,
|
||||||
n_impls
|
n_impls};
|
||||||
};
|
|
||||||
return desc;
|
return desc;
|
||||||
}
|
}
|
||||||
|
|
||||||
%endfor
|
% endfor
|
||||||
|
@ -42,7 +42,7 @@ typedef struct volk_gnsssdr_func_desc
|
|||||||
VOLK_API void volk_gnsssdr_list_machines(void);
|
VOLK_API void volk_gnsssdr_list_machines(void);
|
||||||
|
|
||||||
//! Returns the name of the machine this instance will use
|
//! Returns the name of the machine this instance will use
|
||||||
VOLK_API const char* volk_gnsssdr_get_machine(void);
|
VOLK_API const char *volk_gnsssdr_get_machine(void);
|
||||||
|
|
||||||
//! Get the machine alignment in bytes
|
//! Get the machine alignment in bytes
|
||||||
VOLK_API size_t volk_gnsssdr_get_alignment(void);
|
VOLK_API size_t volk_gnsssdr_get_alignment(void);
|
||||||
@ -74,19 +74,19 @@ VOLK_API bool volk_gnsssdr_is_aligned(const void *ptr);
|
|||||||
extern VOLK_API ${kern.pname} ${kern.name};
|
extern VOLK_API ${kern.pname} ${kern.name};
|
||||||
|
|
||||||
//! A function pointer to the fastest aligned implementation
|
//! A function pointer to the fastest aligned implementation
|
||||||
extern VOLK_API ${kern.pname} ${kern.name}_a;
|
extern VOLK_API ${kern.pname} ${kern.name} _a;
|
||||||
|
|
||||||
//! A function pointer to the fastest unaligned implementation
|
//! A function pointer to the fastest unaligned implementation
|
||||||
extern VOLK_API ${kern.pname} ${kern.name}_u;
|
extern VOLK_API ${kern.pname} ${kern.name} _u;
|
||||||
|
|
||||||
//! Call into a specific implementation given by name
|
//! Call into a specific implementation given by name
|
||||||
extern VOLK_API void ${kern.name}_manual(${kern.arglist_full}, const char* impl_name);
|
extern VOLK_API void ${kern.name} _manual(${kern.arglist_full}, const char *impl_name);
|
||||||
|
|
||||||
//! Get description parameters for this kernel
|
//! Get description parameters for this kernel
|
||||||
extern VOLK_API volk_gnsssdr_func_desc_t ${kern.name}_get_func_desc(void);
|
extern VOLK_API volk_gnsssdr_func_desc_t ${kern.name} _get_func_desc(void);
|
||||||
%endfor
|
% endfor
|
||||||
|
|
||||||
__VOLK_DECL_END
|
__VOLK_DECL_END
|
||||||
|
|
||||||
|
|
||||||
#endif /*INCLUDED_VOLK_GNSSSDR_RUNTIME*/
|
#endif /*INCLUDED_VOLK_GNSSSDR_RUNTIME*/
|
||||||
|
@ -21,7 +21,8 @@
|
|||||||
|
|
||||||
%for i, arch in enumerate(archs):
|
%for i, arch in enumerate(archs):
|
||||||
//#ifndef LV_${arch.name.upper()}
|
//#ifndef LV_${arch.name.upper()}
|
||||||
#define LV_${arch.name.upper()} ${i}
|
#define LV_$ \
|
||||||
|
{arch.name.upper()} $ { i }
|
||||||
//#endif
|
//#endif
|
||||||
%endfor
|
%endfor
|
||||||
|
|
||||||
|
@ -24,50 +24,54 @@
|
|||||||
struct VOLK_CPU volk_gnsssdr_cpu;
|
struct VOLK_CPU volk_gnsssdr_cpu;
|
||||||
|
|
||||||
#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
|
#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
|
||||||
#define VOLK_CPU_x86
|
#define VOLK_CPU_x86
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(VOLK_CPU_x86)
|
#if defined(VOLK_CPU_x86)
|
||||||
|
|
||||||
//implement get cpuid for gcc compilers using a system or local copy of cpuid.h
|
//implement get cpuid for gcc compilers using a system or local copy of cpuid.h
|
||||||
#if defined(__GNUC__)
|
#if defined(__GNUC__)
|
||||||
#include <cpuid.h>
|
#include <cpuid.h>
|
||||||
#define cpuid_x86(op, r) __get_cpuid(op, (unsigned int *)r+0, (unsigned int *)r+1, (unsigned int *)r+2, (unsigned int *)r+3)
|
#define cpuid_x86(op, r) __get_cpuid(op, (unsigned int *)r + 0, (unsigned int *)r + 1, (unsigned int *)r + 2, (unsigned int *)r + 3)
|
||||||
#define cpuid_x86_count(op, count, regs) __cpuid_count(op, count, *((unsigned int*)regs), *((unsigned int*)regs+1), *((unsigned int*)regs+2), *((unsigned int*)regs+3))
|
#define cpuid_x86_count(op, count, regs) __cpuid_count(op, count, *((unsigned int *)regs), *((unsigned int *)regs + 1), *((unsigned int *)regs + 2), *((unsigned int *)regs + 3))
|
||||||
|
|
||||||
/* Return Intel AVX extended CPU capabilities register.
|
/* Return Intel AVX extended CPU capabilities register.
|
||||||
* This function will bomb on non-AVX-capable machines, so
|
* This function will bomb on non-AVX-capable machines, so
|
||||||
* check for AVX capability before executing.
|
* check for AVX capability before executing.
|
||||||
*/
|
*/
|
||||||
#if ((__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 2) || (__clang_major__ >= 3)) && defined(HAVE_XGETBV)
|
#if ((__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 2) || (__clang_major__ >= 3)) && defined(HAVE_XGETBV)
|
||||||
static inline unsigned long long _xgetbv(unsigned int index){
|
static inline unsigned long long _xgetbv(unsigned int index)
|
||||||
unsigned int eax, edx;
|
{
|
||||||
__VOLK_ASM __VOLK_VOLATILE ("xgetbv" : "=a"(eax), "=d"(edx) : "c"(index));
|
unsigned int eax, edx;
|
||||||
return ((unsigned long long)edx << 32) | eax;
|
__VOLK_ASM __VOLK_VOLATILE("xgetbv"
|
||||||
}
|
: "=a"(eax), "=d"(edx)
|
||||||
#define __xgetbv() _xgetbv(0)
|
: "c"(index));
|
||||||
#else
|
return ((unsigned long long)edx << 32) | eax;
|
||||||
#define __xgetbv() 0
|
}
|
||||||
#endif
|
#define __xgetbv() _xgetbv(0)
|
||||||
|
#else
|
||||||
|
#define __xgetbv() 0
|
||||||
|
#endif
|
||||||
|
|
||||||
//implement get cpuid for MSVC compilers using __cpuid intrinsic
|
//implement get cpuid for MSVC compilers using __cpuid intrinsic
|
||||||
#elif defined(_MSC_VER) && defined(HAVE_INTRIN_H)
|
#elif defined(_MSC_VER) && defined(HAVE_INTRIN_H)
|
||||||
#include <intrin.h>
|
#include <intrin.h>
|
||||||
#define cpuid_x86(op, r) __cpuid(((int*)r), op)
|
#define cpuid_x86(op, r) __cpuid(((int *)r), op)
|
||||||
|
|
||||||
#if defined(_XCR_XFEATURE_ENABLED_MASK)
|
#if defined(_XCR_XFEATURE_ENABLED_MASK)
|
||||||
#define __xgetbv() _xgetbv(_XCR_XFEATURE_ENABLED_MASK)
|
#define __xgetbv() _xgetbv(_XCR_XFEATURE_ENABLED_MASK)
|
||||||
#else
|
#else
|
||||||
#define __xgetbv() 0
|
#define __xgetbv() 0
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#else
|
#else
|
||||||
#error "A get cpuid for volk_gnsssdr is not available on this compiler..."
|
#error "A get cpuid for volk_gnsssdr is not available on this compiler..."
|
||||||
#endif //defined(__GNUC__)
|
#endif //defined(__GNUC__)
|
||||||
|
|
||||||
#endif //defined(VOLK_CPU_x86)
|
#endif //defined(VOLK_CPU_x86)
|
||||||
|
|
||||||
static inline unsigned int cpuid_count_x86_bit(unsigned int level, unsigned int count, unsigned int reg, unsigned int bit) {
|
static inline unsigned int cpuid_count_x86_bit(unsigned int level, unsigned int count, unsigned int reg, unsigned int bit)
|
||||||
|
{
|
||||||
#if defined(VOLK_CPU_x86)
|
#if defined(VOLK_CPU_x86)
|
||||||
unsigned int regs[4] = {0};
|
unsigned int regs[4] = {0};
|
||||||
cpuid_x86_count(level, count, regs);
|
cpuid_x86_count(level, count, regs);
|
||||||
@ -77,10 +81,11 @@ static inline unsigned int cpuid_count_x86_bit(unsigned int level, unsigned int
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline unsigned int cpuid_x86_bit(unsigned int reg, unsigned int op, unsigned int bit) {
|
static inline unsigned int cpuid_x86_bit(unsigned int reg, unsigned int op, unsigned int bit)
|
||||||
|
{
|
||||||
#if defined(VOLK_CPU_x86)
|
#if defined(VOLK_CPU_x86)
|
||||||
unsigned int regs[4];
|
unsigned int regs[4];
|
||||||
memset(regs, 0, sizeof(unsigned int)*4);
|
memset(regs, 0, sizeof(unsigned int) * 4);
|
||||||
cpuid_x86(op, regs);
|
cpuid_x86(op, regs);
|
||||||
return regs[reg] >> bit & 0x01;
|
return regs[reg] >> bit & 0x01;
|
||||||
#else
|
#else
|
||||||
@ -88,10 +93,11 @@ static inline unsigned int cpuid_x86_bit(unsigned int reg, unsigned int op, unsi
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline unsigned int check_extended_cpuid(unsigned int val) {
|
static inline unsigned int check_extended_cpuid(unsigned int val)
|
||||||
|
{
|
||||||
#if defined(VOLK_CPU_x86)
|
#if defined(VOLK_CPU_x86)
|
||||||
unsigned int regs[4];
|
unsigned int regs[4];
|
||||||
memset(regs, 0, sizeof(unsigned int)*4);
|
memset(regs, 0, sizeof(unsigned int) * 4);
|
||||||
cpuid_x86(0x80000000, regs);
|
cpuid_x86(0x80000000, regs);
|
||||||
return regs[0] >= val;
|
return regs[0] >= val;
|
||||||
#else
|
#else
|
||||||
@ -99,7 +105,8 @@ static inline unsigned int check_extended_cpuid(unsigned int val) {
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline unsigned int get_avx_enabled(void) {
|
static inline unsigned int get_avx_enabled(void)
|
||||||
|
{
|
||||||
#if defined(VOLK_CPU_x86)
|
#if defined(VOLK_CPU_x86)
|
||||||
return __xgetbv() & 0x6;
|
return __xgetbv() & 0x6;
|
||||||
#else
|
#else
|
||||||
@ -107,7 +114,8 @@ static inline unsigned int get_avx_enabled(void) {
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline unsigned int get_avx2_enabled(void) {
|
static inline unsigned int get_avx2_enabled(void)
|
||||||
|
{
|
||||||
#if defined(VOLK_CPU_x86)
|
#if defined(VOLK_CPU_x86)
|
||||||
return __xgetbv() & 0x6;
|
return __xgetbv() & 0x6;
|
||||||
#else
|
#else
|
||||||
@ -117,28 +125,30 @@ static inline unsigned int get_avx2_enabled(void) {
|
|||||||
|
|
||||||
//neon detection is linux specific
|
//neon detection is linux specific
|
||||||
#if defined(__arm__) && defined(__linux__)
|
#if defined(__arm__) && defined(__linux__)
|
||||||
#include <asm/hwcap.h>
|
#include <asm/hwcap.h>
|
||||||
#include <linux/auxvec.h>
|
#include <linux/auxvec.h>
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#define VOLK_CPU_ARM
|
#define VOLK_CPU_ARM
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
static int has_neon(void){
|
static int has_neon(void)
|
||||||
|
{
|
||||||
#if defined(VOLK_CPU_ARM)
|
#if defined(VOLK_CPU_ARM)
|
||||||
FILE *auxvec_f;
|
FILE *auxvec_f;
|
||||||
unsigned long auxvec[2];
|
unsigned long auxvec[2];
|
||||||
unsigned int found_neon = 0;
|
unsigned int found_neon = 0;
|
||||||
auxvec_f = fopen("/proc/self/auxv", "rb");
|
auxvec_f = fopen("/proc/self/auxv", "rb");
|
||||||
if(!auxvec_f) return 0;
|
if (!auxvec_f) return 0;
|
||||||
|
|
||||||
size_t r = 1;
|
size_t r = 1;
|
||||||
//so auxv is basically 32b of ID and 32b of value
|
//so auxv is basically 32b of ID and 32b of value
|
||||||
//so it goes like this
|
//so it goes like this
|
||||||
while(!found_neon && r) {
|
while (!found_neon && r)
|
||||||
r = fread(auxvec, sizeof(unsigned long), 2, auxvec_f);
|
{
|
||||||
if((auxvec[0] == AT_HWCAP) && (auxvec[1] & HWCAP_NEON))
|
r = fread(auxvec, sizeof(unsigned long), 2, auxvec_f);
|
||||||
found_neon = 1;
|
if ((auxvec[0] == AT_HWCAP) && (auxvec[1] & HWCAP_NEON))
|
||||||
}
|
found_neon = 1;
|
||||||
|
}
|
||||||
|
|
||||||
fclose(auxvec_f);
|
fclose(auxvec_f);
|
||||||
return found_neon;
|
return found_neon;
|
||||||
@ -148,50 +158,59 @@ static int has_neon(void){
|
|||||||
}
|
}
|
||||||
|
|
||||||
%for arch in archs:
|
%for arch in archs:
|
||||||
static int i_can_has_${arch.name} (void) {
|
static int i_can_has_${arch.name} (void)
|
||||||
|
{
|
||||||
%for check, params in arch.checks:
|
%for check, params in arch.checks:
|
||||||
if (${check}(<% joined_params = ', '.join(params)%>${joined_params}) == 0) return 0;
|
if (${check}(<% joined_params = ', '.join(params)%>${joined_params}) == 0) return 0;
|
||||||
%endfor
|
% endfor return 1;
|
||||||
return 1;
|
|
||||||
}
|
}
|
||||||
%endfor
|
% endfor
|
||||||
|
|
||||||
#if defined(HAVE_FENV_H)
|
#if defined(HAVE_FENV_H)
|
||||||
#if defined(FE_TONEAREST)
|
#if defined(FE_TONEAREST)
|
||||||
#include <fenv.h>
|
#include <fenv.h>
|
||||||
static inline void set_float_rounding(void){
|
static inline void
|
||||||
fesetround(FE_TONEAREST);
|
set_float_rounding(void)
|
||||||
}
|
{
|
||||||
#else
|
fesetround(FE_TONEAREST);
|
||||||
static inline void set_float_rounding(void){
|
}
|
||||||
//do nothing
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
#elif defined(_MSC_VER)
|
|
||||||
#include <float.h>
|
|
||||||
static inline void set_float_rounding(void){
|
|
||||||
unsigned int cwrd;
|
|
||||||
_controlfp_s(&cwrd, 0, 0);
|
|
||||||
_controlfp_s(&cwrd, _RC_NEAR, _MCW_RC);
|
|
||||||
}
|
|
||||||
#else
|
#else
|
||||||
static inline void set_float_rounding(void){
|
static inline void
|
||||||
//do nothing
|
set_float_rounding(void)
|
||||||
}
|
{
|
||||||
|
//do nothing
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
#elif defined(_MSC_VER)
|
||||||
|
#include <float.h>
|
||||||
|
static inline void
|
||||||
|
set_float_rounding(void)
|
||||||
|
{
|
||||||
|
unsigned int cwrd;
|
||||||
|
_controlfp_s(&cwrd, 0, 0);
|
||||||
|
_controlfp_s(&cwrd, _RC_NEAR, _MCW_RC);
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
static inline void
|
||||||
|
set_float_rounding(void)
|
||||||
|
{
|
||||||
|
//do nothing
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
void volk_gnsssdr_cpu_init() {
|
void volk_gnsssdr_cpu_init()
|
||||||
|
{
|
||||||
%for arch in archs:
|
%for arch in archs:
|
||||||
volk_gnsssdr_cpu.has_${arch.name} = &i_can_has_${arch.name};
|
volk_gnsssdr_cpu.has_${arch.name} = &i_can_has_${arch.name};
|
||||||
%endfor
|
% endfor
|
||||||
set_float_rounding();
|
set_float_rounding();
|
||||||
}
|
}
|
||||||
|
|
||||||
unsigned int volk_gnsssdr_get_lvarch() {
|
unsigned int volk_gnsssdr_get_lvarch()
|
||||||
|
{
|
||||||
unsigned int retval = 0;
|
unsigned int retval = 0;
|
||||||
volk_gnsssdr_cpu_init();
|
volk_gnsssdr_cpu_init();
|
||||||
%for arch in archs:
|
%for arch in archs:
|
||||||
retval += volk_gnsssdr_cpu.has_${arch.name}() << LV_${arch.name.upper()};
|
retval += volk_gnsssdr_cpu.has_${arch.name}() << LV_${arch.name.upper()};
|
||||||
%endfor
|
% endfor return retval;
|
||||||
return retval;
|
|
||||||
}
|
}
|
||||||
|
@ -23,16 +23,17 @@
|
|||||||
|
|
||||||
__VOLK_DECL_BEGIN
|
__VOLK_DECL_BEGIN
|
||||||
|
|
||||||
struct VOLK_CPU {
|
struct VOLK_CPU
|
||||||
|
{
|
||||||
%for arch in archs:
|
%for arch in archs:
|
||||||
int (*has_${arch.name}) ();
|
int (*has_${arch.name}) ();
|
||||||
%endfor
|
% endfor
|
||||||
};
|
};
|
||||||
|
|
||||||
extern struct VOLK_CPU volk_gnsssdr_cpu;
|
extern struct VOLK_CPU volk_gnsssdr_cpu;
|
||||||
|
|
||||||
void volk_gnsssdr_cpu_init ();
|
void volk_gnsssdr_cpu_init();
|
||||||
unsigned int volk_gnsssdr_get_lvarch ();
|
unsigned int volk_gnsssdr_get_lvarch();
|
||||||
|
|
||||||
__VOLK_DECL_END
|
__VOLK_DECL_END
|
||||||
|
|
||||||
|
@ -20,7 +20,11 @@
|
|||||||
<% arch_names = this_machine.arch_names %>
|
<% arch_names = this_machine.arch_names %>
|
||||||
|
|
||||||
%for arch in this_machine.archs:
|
%for arch in this_machine.archs:
|
||||||
#define LV_HAVE_${arch.name.upper()} 1
|
#define LV_HAVE_$ \
|
||||||
|
{ \
|
||||||
|
arch.name.upper() \
|
||||||
|
} \
|
||||||
|
1
|
||||||
%endfor
|
%endfor
|
||||||
|
|
||||||
#include <volk_gnsssdr/volk_gnsssdr_common.h>
|
#include <volk_gnsssdr/volk_gnsssdr_common.h>
|
||||||
@ -35,7 +39,9 @@
|
|||||||
#include <volk_gnsssdr/${kern.name}.h>
|
#include <volk_gnsssdr/${kern.name}.h>
|
||||||
%endfor
|
%endfor
|
||||||
|
|
||||||
struct volk_gnsssdr_machine volk_gnsssdr_machine_${this_machine.name} = {
|
struct volk_gnsssdr_machine volk_gnsssdr_machine_$
|
||||||
|
{
|
||||||
|
this_machine.name} = {
|
||||||
<% make_arch_have_list = (' | '.join(['(1 << LV_%s)'%a.name.upper() for a in this_machine.archs])) %> ${make_arch_have_list},
|
<% make_arch_have_list = (' | '.join(['(1 << LV_%s)'%a.name.upper() for a in this_machine.archs])) %> ${make_arch_have_list},
|
||||||
<% this_machine_name = "\""+this_machine.name+"\"" %> ${this_machine_name},
|
<% this_machine_name = "\""+this_machine.name+"\"" %> ${this_machine_name},
|
||||||
${this_machine.alignment},
|
${this_machine.alignment},
|
||||||
|
@ -22,10 +22,10 @@
|
|||||||
|
|
||||||
struct volk_gnsssdr_machine *volk_gnsssdr_machines[] = {
|
struct volk_gnsssdr_machine *volk_gnsssdr_machines[] = {
|
||||||
%for machine in machines:
|
%for machine in machines:
|
||||||
#ifdef LV_MACHINE_${machine.name.upper()}
|
#ifdef LV_MACHINE_${machine.name.upper() }
|
||||||
&volk_gnsssdr_machine_${machine.name},
|
&volk_gnsssdr_machine_${machine.name},
|
||||||
#endif
|
#endif
|
||||||
%endfor
|
%endfor
|
||||||
};
|
};
|
||||||
|
|
||||||
unsigned int n_volk_gnsssdr_machines = sizeof(volk_gnsssdr_machines)/sizeof(*volk_gnsssdr_machines);
|
unsigned int n_volk_gnsssdr_machines = sizeof(volk_gnsssdr_machines) / sizeof(*volk_gnsssdr_machines);
|
||||||
|
@ -27,26 +27,30 @@
|
|||||||
|
|
||||||
__VOLK_DECL_BEGIN
|
__VOLK_DECL_BEGIN
|
||||||
|
|
||||||
struct volk_gnsssdr_machine {
|
struct volk_gnsssdr_machine
|
||||||
const unsigned int caps; //capabilities (i.e., archs compiled into this machine, in the volk_gnsssdr_get_lvarch format)
|
{
|
||||||
|
const unsigned int caps; //capabilities (i.e., archs compiled into this machine, in the volk_gnsssdr_get_lvarch format)
|
||||||
const char *name;
|
const char *name;
|
||||||
const size_t alignment; //the maximum byte alignment required for functions in this library
|
const size_t alignment; //the maximum byte alignment required for functions in this library
|
||||||
%for kern in kernels:
|
%for kern in kernels:
|
||||||
const char *${kern.name}_name;
|
const char *${kern.name}_name;
|
||||||
const char *${kern.name}_impl_names[<%len_archs=len(archs)%>${len_archs}];
|
const char *${kern.name} _impl_names[<% len_archs = len(archs) %> ${len_archs}];
|
||||||
const int ${kern.name}_impl_deps[${len_archs}];
|
const int ${kern.name} _impl_deps[${len_archs}];
|
||||||
const bool ${kern.name}_impl_alignment[${len_archs}];
|
const bool ${kern.name} _impl_alignment[${len_archs}];
|
||||||
const ${kern.pname} ${kern.name}_impls[${len_archs}];
|
const ${kern.pname} ${kern.name} _impls[${len_archs}];
|
||||||
const size_t ${kern.name}_n_impls;
|
const size_t ${kern.name} _n_impls;
|
||||||
%endfor
|
% endfor
|
||||||
};
|
};
|
||||||
|
|
||||||
%for machine in machines:
|
%for machine in machines:
|
||||||
#ifdef LV_MACHINE_${machine.name.upper()}
|
#ifdef LV_MACHINE_${machine.name.upper() }
|
||||||
extern struct volk_gnsssdr_machine volk_gnsssdr_machine_${machine.name};
|
extern struct volk_gnsssdr_machine volk_gnsssdr_machine_$
|
||||||
|
{
|
||||||
|
machine.name
|
||||||
|
};
|
||||||
#endif
|
#endif
|
||||||
%endfor
|
% endfor
|
||||||
|
|
||||||
__VOLK_DECL_END
|
__VOLK_DECL_END
|
||||||
|
|
||||||
#endif //INCLUDED_LIBVOLK_GNSSSDR_MACHINES_H
|
#endif //INCLUDED_LIBVOLK_GNSSSDR_MACHINES_H
|
||||||
|
@ -24,6 +24,6 @@
|
|||||||
|
|
||||||
%for kern in kernels:
|
%for kern in kernels:
|
||||||
typedef void (*${kern.pname})(${kern.arglist_types});
|
typedef void (*${kern.pname})(${kern.arglist_types});
|
||||||
%endfor
|
% endfor
|
||||||
|
|
||||||
#endif /*INCLUDED_VOLK_GNSSSDR_TYPEDEFS*/
|
#endif /*INCLUDED_VOLK_GNSSSDR_TYPEDEFS*/
|
||||||
|
Loading…
Reference in New Issue
Block a user