
Apply automated code formatting to volk-gnsssdr

See http://gnss-sdr.org/coding-style/#use-tools-for-automated-code-formatting
Carles Fernandez 2018-03-03 12:09:45 +01:00
parent f924005733
commit 891478cf2c
75 changed files with 6642 additions and 6120 deletions
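
The change is purely mechanical: an automated formatter normalizes spacing around casts, brace placement, and indentation across volk-gnsssdr, which is why the addition and deletion counts nearly match. A representative before/after pair, drawn only from the hunks below (shown as C++ comments):

    // Before (hand-formatted):
    //     callback((void (*)()) callback) { option_type = INT_CALLBACK; }
    //     if( config_file != "" ) read_results(&results, config_file);
    //
    // After (automated formatting):
    //     callback((void (*)())callback) { option_type = INT_CALLBACK; }
    //     if (config_file != "")
    //         read_results(&results, config_file);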

View File

@@ -20,30 +20,30 @@
#include <config.h>
#endif

#include "volk_gnsssdr/volk_gnsssdr.h"    // for volk_gnsssdr_get_alignment, volk_gnsssdr_get_machine
#include "volk_gnsssdr_option_helpers.h"  // for option_list, option_t
#include <volk_gnsssdr/constants.h>       // for volk_gnsssdr_available_machines, volk_gnsssdr_c_compiler ...
#include <iostream>                       // for operator<<, endl, cout, ostream
#include <string>                         // for string


void print_alignment()
{
    std::cout << "Alignment in bytes: " << volk_gnsssdr_get_alignment() << std::endl;
}


void print_malloc()
{
    // You don't want to change the volk_malloc code, so just copy the if/else
    // structure from there and give an explanation for the implementations
    std::cout << "Used malloc implementation: ";
#if _POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600 || HAVE_POSIX_MEMALIGN
    std::cout << "posix_memalign" << std::endl;
#elif _MSC_VER >= 1400
    std::cout << "aligned_malloc" << std::endl;
#else
    std::cout << "No standard handler available, using own implementation." << std::endl;
#endif
}
@@ -54,22 +54,24 @@ int main(int argc, char **argv)
    our_options.add(option_t("cc", "", "print the VOLK_GNSSDR C compiler version", volk_gnsssdr_c_compiler()));
    our_options.add(option_t("cflags", "", "print the VOLK_GNSSSDR CFLAGS", volk_gnsssdr_compiler_flags()));
    our_options.add(option_t("all-machines", "", "print VOLK_GNSSSDR machines built", volk_gnsssdr_available_machines()));
    our_options.add(option_t("avail-machines", "",
        "print VOLK_GNSSSDR machines on the current "
        "platform",
        volk_gnsssdr_list_machines));
    our_options.add(option_t("machine", "", "print the current VOLK_GNSSSDR machine that will be used",
        volk_gnsssdr_get_machine()));
    our_options.add(option_t("alignment", "", "print the memory alignment", print_alignment));
    our_options.add(option_t("malloc", "", "print the malloc implementation used in volk_gnsssdr_malloc",
        print_malloc));
    our_options.add(option_t("version", "v", "print the VOLK_GNSSSDR version", volk_gnsssdr_version()));

    try
        {
            our_options.parse(argc, argv);
        }
    catch (...)
        {
            return 1;
        }
    return 0;
}

View File

@@ -17,157 +17,182 @@
 */

#include "volk_gnsssdr_option_helpers.h"
#include <climits>    // IWYU pragma: keep
#include <cstdlib>    // IWYU pragma: keep
#include <cstring>    // IWYU pragma: keep
#include <exception>  // for exception
#include <iostream>   // for operator<<, endl, basic_ostream, cout, ostream
#include <utility>    // for pair


/*
 * Option type
 */

option_t::option_t(std::string longform, std::string shortform, std::string msg, void (*callback)())
    : longform("--" + longform),
      shortform("-" + shortform),
      msg(msg),
      callback(callback) { option_type = VOID_CALLBACK; }

option_t::option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(int))
    : longform("--" + longform),
      shortform("-" + shortform),
      msg(msg),
      callback((void (*)())callback) { option_type = INT_CALLBACK; }

option_t::option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(float))
    : longform("--" + longform),
      shortform("-" + shortform),
      msg(msg),
      callback((void (*)())callback) { option_type = FLOAT_CALLBACK; }

option_t::option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(bool))
    : longform("--" + longform),
      shortform("-" + shortform),
      msg(msg),
      callback((void (*)())callback) { option_type = BOOL_CALLBACK; }

option_t::option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(std::string))
    : longform("--" + longform),
      shortform("-" + shortform),
      msg(msg),
      callback((void (*)())callback) { option_type = STRING_CALLBACK; }

option_t::option_t(std::string longform, std::string shortform, std::string msg, std::string printval)
    : longform("--" + longform),
      shortform("-" + shortform),
      msg(msg),
      printval(printval) { option_type = STRING; }


/*
 * Option List
 */

option_list::option_list(std::string program_name) : program_name(program_name)
{
    {
        internal_list = std::vector<option_t>();
    }
}

void option_list::add(const option_t &opt) { internal_list.push_back(opt); }

void option_list::parse(int argc, char **argv)
{
    for (int arg_number = 0; arg_number < argc; ++arg_number)
        {
            for (std::vector<option_t>::iterator this_option = internal_list.begin();
                 this_option != internal_list.end();
                 this_option++)
                {
                    if (this_option->longform == std::string(argv[arg_number]) ||
                        this_option->shortform == std::string(argv[arg_number]))
                        {
                            switch (this_option->option_type)
                                {
                                case VOID_CALLBACK:
                                    this_option->callback();
                                    break;
                                case INT_CALLBACK:
                                    try
                                        {
                                            int int_val = std::stoi(argv[++arg_number]);
                                            ((void (*)(int))this_option->callback)(int_val);
                                        }
                                    catch (std::exception &exc)
                                        {
                                            std::cout << "An int option can only receive a number" << std::endl;
                                            throw std::exception();
                                        };
                                    break;
                                case FLOAT_CALLBACK:
                                    try
                                        {
                                            int int_val = std::stof(argv[++arg_number]);
                                            ((void (*)(float))this_option->callback)(int_val);
                                        }
                                    catch (std::exception &exc)
                                        {
                                            std::cout << "A float option can only receive a number" << std::endl;
                                            throw std::exception();
                                        };
                                    break;
                                case BOOL_CALLBACK:
                                    try
                                        {
                                            bool int_val = (bool)std::stoi(argv[++arg_number]);
                                            ((void (*)(bool))this_option->callback)(int_val);
                                        }
                                    catch (std::exception &exc)
                                        {
                                            std::cout << "A bool option can only receive 0 or 1" << std::endl;
                                            throw std::exception();
                                        };
                                    break;
                                case STRING_CALLBACK:
                                    try
                                        {
                                            ((void (*)(std::string))this_option->callback)(argv[++arg_number]);
                                        }
                                    catch (std::exception &exc)
                                        {
                                            throw std::exception();
                                        };
                                    break;
                                case STRING:
                                    std::cout << this_option->printval << std::endl;
                                    break;
                                default:
                                    this_option->callback();
                                    break;
                                }
                        }
                }
            if (std::string("--help") == std::string(argv[arg_number]) ||
                std::string("-h") == std::string(argv[arg_number]))
                {
                    help();
                }
        }
}

void option_list::help()
{
    std::cout << program_name << std::endl;
    std::cout << "  -h [ --help ] \t\tDisplay this help message" << std::endl;
    for (std::vector<option_t>::iterator this_option = internal_list.begin();
         this_option != internal_list.end();
         this_option++)
        {
            std::string help_line("  ");
            if (this_option->shortform == "-")
                {
                    help_line += this_option->longform + " ";
                }
            else
                {
                    help_line += this_option->shortform + " [ " + this_option->longform + " ]";
                }

            switch (help_line.size() / 8)
                {
                case 0:
                    help_line += "\t\t\t\t";
                    break;
                case 1:
                    help_line += "\t\t\t";
                    break;
                case 2:
                    help_line += "\t\t";
                    break;
                case 3:
                    help_line += "\t";
                    break;
                default:
                    break;
                }
            help_line += this_option->msg;
            std::cout << help_line << std::endl;
        }
}
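
The constructors above store every handler through a single void (*)() field and cast back to the real signature at dispatch time, which is why each option_type tag must match the stored pointer. A minimal self-contained sketch of that type-erasure pattern (print_level and its argument are hypothetical; only the cast round-trip mirrors the code above):

    #include <iostream>

    static void print_level(int level) { std::cout << "level " << level << std::endl; }

    int main()
    {
        void (*erased)() = (void (*)())print_level;  // store with the type erased, as option_t does
        ((void (*)(int))erased)(3);                  // cast back to the true signature before calling
        return 0;
    }

Calling through the original signature after the round-trip is well defined; calling through the wrong one is not, which is what the option_type enum guards against.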

View File

@@ -36,7 +36,8 @@ typedef enum
    STRING,
} VOLK_OPTYPE;

class option_t
{
public:
    option_t(std::string longform, std::string shortform, std::string msg, void (*callback)());
    option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(int));

@@ -51,7 +52,6 @@ public:
    VOLK_OPTYPE option_type;
    std::string printval;
    void (*callback)();
};

class option_list

@@ -59,15 +59,16 @@ class option_list
public:
    option_list(std::string program_name);

    void add(const option_t &opt);

    void parse(int argc, char **argv);

    void help();

private:
    std::string program_name;
    std::vector<option_t> internal_list;
};

#endif  //VOLK_VOLK_OPTION_HELPERS_H

View File

@@ -16,23 +16,22 @@
 * along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
 */

#include "kernel_tests.h"                       // for init_test_list
#include "qa_utils.h"                           // for volk_gnsssdr_test_results_t
#include "volk_gnsssdr/volk_gnsssdr_complex.h"  // for lv_32fc_t
#include "volk_gnsssdr_option_helpers.h"        // for option_list, option_t
#include "volk_gnsssdr_profile.h"
#include "volk_gnsssdr/volk_gnsssdr_prefs.h"  // for volk_gnsssdr_get_config_path
#include <boost/filesystem/operations.hpp>    // for create_directories, exists
#include <boost/filesystem/path.hpp>          // for path, operator<<
#include <boost/filesystem/path_traits.hpp>   // for filesystem
#include <sys/stat.h>                         // for stat
#include <cstddef>                            // for size_t
#include <iostream>                           // for operator<<, basic_ostream
#include <fstream>                            // IWYU pragma: keep
#include <map>                                // for map, map<>::iterator
#include <utility>                            // for pair
#include <vector>                             // for vector, vector<>::const_..

namespace fs = boost::filesystem;
@@ -67,92 +66,112 @@ int main(int argc, char *argv[])
    profile_options.add((option_t("path", "p", "Specify the volk_config path", set_volk_config)));

    try
        {
            profile_options.parse(argc, argv);
        }
    catch (...)
        {
            return 1;
        }

    for (int arg_number = 0; arg_number < argc; ++arg_number)
        {
            if (std::string("--help") == std::string(argv[arg_number]) ||
                std::string("-h") == std::string(argv[arg_number]))
                {
                    return 0;
                }
        }

    // Adding program options
    std::ofstream json_file;
    std::string config_file;

    if (json_filename != "")
        {
            json_file.open(json_filename.c_str());
        }

    if (volk_config_path != "")
        {
            config_file = volk_config_path + "/volk_config";
        }

    // Run tests
    std::vector<volk_gnsssdr_test_results_t> results;
    if (update_mode)
        {
            if (config_file != "")
                read_results(&results, config_file);
            else
                read_results(&results);
        }

    // Initialize the list of tests
    std::vector<volk_gnsssdr_test_case_t> test_cases = init_test_list(test_params);

    // Iterate through list of tests running each one
    std::string substr_to_match(test_params.kernel_regex());
    for (unsigned int ii = 0; ii < test_cases.size(); ++ii)
        {
            bool regex_match = true;

            volk_gnsssdr_test_case_t test_case = test_cases[ii];
            // if the kernel name matches regex then do the test
            std::string test_case_name = test_case.name();
            if (test_case_name.find(substr_to_match) == std::string::npos)
                {
                    regex_match = false;
                }

            // if we are in update mode check if we've already got results
            // if we have any, then no need to test that kernel
            bool update = true;
            if (update_mode)
                {
                    for (unsigned int jj = 0; jj < results.size(); ++jj)
                        {
                            if (results[jj].name == test_case.name() ||
                                results[jj].name == test_case.puppet_master_name())
                                {
                                    update = false;
                                    break;
                                }
                        }
                }

            if (regex_match && update)
                {
                    try
                        {
                            run_volk_gnsssdr_tests(test_case.desc(), test_case.kernel_ptr(), test_case.name(),
                                test_case.test_parameters(), &results, test_case.puppet_master_name());
                        }
                    catch (std::string &error)
                        {
                            std::cerr << "Caught Exception in 'run_volk_gnssdr_tests': " << error << std::endl;
                        }
                }
        }

    // Output results according to provided options
    if (json_filename != "")
        {
            write_json(json_file, results);
            json_file.close();
        }

    if (!dry_run)
        {
            if (config_file != "")
                write_results(&results, false, config_file);
            else
                write_results(&results, false);
        }
    else
        {
            std::cout << "Warning: this was a dry-run. Config not generated" << std::endl;
        }
}
@@ -167,51 +186,55 @@ void read_results(std::vector<volk_gnsssdr_test_results_t> *results)

void read_results(std::vector<volk_gnsssdr_test_results_t> *results, std::string path)
{
    struct stat buffer;
    bool config_status = (stat(path.c_str(), &buffer) == 0);

    if (config_status)
        {
            // a config exists and we are reading results from it
            std::ifstream config(path.c_str());
            char config_line[256];
            while (config.getline(config_line, 255))
                {
                    // tokenize the input line by kernel_name unaligned aligned
                    // then push back in the results vector with fields filled in
                    std::vector<std::string> single_kernel_result;
                    std::string config_str(config_line);
                    std::size_t str_size = config_str.size();
                    std::size_t found = 1;

                    found = config_str.find(' ');
                    // Split line by spaces
                    while (found && found < str_size)
                        {
                            found = config_str.find(' ');
                            // kernel names MUST be less than 128 chars, which is
                            // a length restricted by volk/volk_prefs.c
                            // on the last token in the parsed string we won't find a space
                            // so make sure we copy at most 128 chars.
                            if (found > 127)
                                {
                                    found = 127;
                                }
                            str_size = config_str.size();
                            char buffer[128] = {'\0'};
                            config_str.copy(buffer, found + 1, 0);
                            buffer[found] = '\0';
                            single_kernel_result.push_back(std::string(buffer));
                            config_str.erase(0, found + 1);
                        }

                    if (single_kernel_result.size() == 3)
                        {
                            volk_gnsssdr_test_results_t kernel_result;
                            kernel_result.name = std::string(single_kernel_result[0]);
                            kernel_result.config_name = std::string(single_kernel_result[0]);
                            kernel_result.best_arch_u = std::string(single_kernel_result[1]);
                            kernel_result.best_arch_a = std::string(single_kernel_result[2]);
                            results->push_back(kernel_result);
                        }
                }
        }
}

void write_results(const std::vector<volk_gnsssdr_test_results_t> *results, bool update_result)
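
read_results expects each volk_gnsssdr_config line to hold three space-separated tokens, the kernel name followed by two implementation names (assigned to best_arch_u and then best_arch_a in the code above), with every token capped at 127 characters to match volk/volk_prefs.c. A short stand-alone sketch of that line format (the entry shown is an invented example):

    #include <iostream>
    #include <sstream>
    #include <string>

    int main()
    {
        std::string line = "volk_gnsssdr_16i_resamplerxnpuppet_16i u_sse3 a_sse3";  // hypothetical entry
        std::istringstream iss(line);
        std::string name, best_u, best_a;  // token order as consumed by read_results above
        if (iss >> name >> best_u >> best_a)
            {
                std::cout << name << ": unaligned=" << best_u << ", aligned=" << best_a << std::endl;
            }
        return 0;
    }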
@@ -219,7 +242,7 @@ void write_results(const std::vector<volk_gnsssdr_test_results_t> *results, bool
    char path[1024];
    volk_gnsssdr_get_config_path(path);

    write_results(results, update_result, std::string(path));
}

void write_results(const std::vector<volk_gnsssdr_test_results_t> *results, bool update_result, const std::string path)
@@ -227,39 +250,44 @@ void write_results(const std::vector<volk_gnsssdr_test_results_t> *results, bool
    const fs::path config_path(path);
    // Until we can update the config on a kernel by kernel basis
    // do not overwrite volk_gnsssdr_config when using a regex.
    if (!fs::exists(config_path.branch_path()))
        {
            std::cout << "Creating " << config_path.branch_path() << " ..." << std::endl;
            fs::create_directories(config_path.branch_path());
        }

    std::ofstream config;
    if (update_result)
        {
            std::cout << "Updating " << path << " ..." << std::endl;
            config.open(path.c_str(), std::ofstream::app);
            if (!config.is_open())
                {  //either we don't have write access or we don't have the dir yet
                    std::cout << "Error opening file " << path << std::endl;
                }
        }
    else
        {
            std::cout << "Writing " << path << " ..." << std::endl;
            config.open(path.c_str());
            if (!config.is_open())
                {  //either we don't have write access or we don't have the dir yet
                    std::cout << "Error opening file " << path << std::endl;
                }

            config << "\
#this file is generated by volk_gnsssdr_profile.\n\
#the function name is followed by the preferred architecture.\n\
";
        }

    std::vector<volk_gnsssdr_test_results_t>::const_iterator profile_results;
    for (profile_results = results->begin(); profile_results != results->end(); ++profile_results)
        {
            config << profile_results->config_name << " "
                   << profile_results->best_arch_a << " "
                   << profile_results->best_arch_u << std::endl;
        }

    config.close();
}
@@ -270,43 +298,45 @@ void write_json(std::ofstream &json_file, std::vector<volk_gnsssdr_test_results_
    size_t len = results.size();
    size_t i = 0;
    std::vector<volk_gnsssdr_test_results_t>::iterator result;
    for (result = results.begin(); result != results.end(); ++result)
        {
            json_file << " {" << std::endl;
            json_file << "  \"name\": \"" << result->name << "\"," << std::endl;
            json_file << "  \"vlen\": " << (int)(result->vlen) << "," << std::endl;
            json_file << "  \"iter\": " << result->iter << "," << std::endl;
            json_file << "  \"best_arch_a\": \"" << result->best_arch_a
                      << "\"," << std::endl;
            json_file << "  \"best_arch_u\": \"" << result->best_arch_u
                      << "\"," << std::endl;
            json_file << "  \"results\": {" << std::endl;
            size_t results_len = result->results.size();
            size_t ri = 0;

            std::map<std::string, volk_gnsssdr_test_time_t>::iterator kernel_time_pair;
            for (kernel_time_pair = result->results.begin(); kernel_time_pair != result->results.end(); ++kernel_time_pair)
                {
                    volk_gnsssdr_test_time_t time = kernel_time_pair->second;
                    json_file << "   \"" << time.name << "\": {" << std::endl;
                    json_file << "    \"name\": \"" << time.name << "\"," << std::endl;
                    json_file << "    \"time\": " << time.time << "," << std::endl;
                    json_file << "    \"units\": \"" << time.units << "\"" << std::endl;
                    json_file << "   }";
                    if (ri + 1 != results_len)
                        {
                            json_file << ",";
                        }
                    json_file << std::endl;
                    ri++;
                }
            json_file << "  }" << std::endl;
            json_file << " }";
            if (i + 1 != len)
                {
                    json_file << ",";
                }
            json_file << std::endl;
            i++;
        }
    json_file << " ]" << std::endl;
    json_file << "}" << std::endl;
}

View File

@@ -27,10 +27,10 @@
 * -------------------------------------------------------------------------
 */

#include <cstdbool>  // for bool
#include <iosfwd>    // for ofstream
#include <string>    // for string
#include <vector>    // for vector

class volk_test_results_t;

View File

@@ -29,7 +29,7 @@
static inline int16_t sat_adds16i(int16_t x, int16_t y)
{
    int32_t res = (int32_t)x + (int32_t)y;

    if (res < SHRT_MIN) res = SHRT_MIN;
    if (res > SHRT_MAX) res = SHRT_MAX;

@@ -39,7 +39,7 @@ static inline int16_t sat_adds16i(int16_t x, int16_t y)
static inline int16_t sat_muls16i(int16_t x, int16_t y)
{
    int32_t res = (int32_t)x * (int32_t)y;

    if (res < SHRT_MIN) res = SHRT_MIN;
    if (res > SHRT_MAX) res = SHRT_MAX;
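
A quick check of the saturating behavior: widening to int32_t before the add lets out-of-range sums be clamped instead of wrapping. A minimal sketch reusing the function above (the trailing return cast is added here for completeness, since the hunk cuts off before it):

    #include <climits>
    #include <cstdint>
    #include <cstdio>

    static inline int16_t sat_adds16i(int16_t x, int16_t y)
    {
        int32_t res = (int32_t)x + (int32_t)y;
        if (res < SHRT_MIN) res = SHRT_MIN;
        if (res > SHRT_MAX) res = SHRT_MAX;
        return (int16_t)res;
    }

    int main()
    {
        printf("%d\n", sat_adds16i(30000, 10000));  // clamps to 32767 instead of wrapping negative
        return 0;
    }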

View File

@@ -30,38 +30,42 @@
static inline __m256
_mm256_complexmul_ps(__m256 x, __m256 y)
{
    __m256 yl, yh, tmp1, tmp2;
    yl = _mm256_moveldup_ps(y);           // Load yl with cr,cr,dr,dr ...
    yh = _mm256_movehdup_ps(y);           // Load yh with ci,ci,di,di ...
    tmp1 = _mm256_mul_ps(x, yl);          // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ...
    x = _mm256_shuffle_ps(x, x, 0xB1);    // Re-arrange x to be ai,ar,bi,br ...
    tmp2 = _mm256_mul_ps(x, yh);          // tmp2 = ai*ci,ar*ci,bi*di,br*di
    return _mm256_addsub_ps(tmp1, tmp2);  // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
}

static inline __m256
_mm256_conjugate_ps(__m256 x)
{
    const __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
    return _mm256_xor_ps(x, conjugator);  // conjugate y
}

static inline __m256
_mm256_complexconjugatemul_ps(__m256 x, __m256 y)
{
    y = _mm256_conjugate_ps(y);
    return _mm256_complexmul_ps(x, y);
}

static inline __m256
_mm256_magnitudesquared_ps(__m256 cplxValue1, __m256 cplxValue2)
{
    __m256 complex1, complex2;
    cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1);  // Square the values
    cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2);  // Square the Values
    complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
    complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
    return _mm256_hadd_ps(complex1, complex2);  // Add the I2 and Q2 values
}

static inline __m256 _mm256_complexnormalise_ps(__m256 z)
{
    __m256 tmp1 = _mm256_mul_ps(z, z);
    __m256 tmp2 = _mm256_hadd_ps(tmp1, tmp1);
    tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);

@@ -70,8 +74,9 @@ static inline __m256 _mm256_complexnormalise_ps( __m256 z ){
}

static inline __m256
_mm256_magnitude_ps(__m256 cplxValue1, __m256 cplxValue2)
{
    return _mm256_sqrt_ps(_mm256_magnitudesquared_ps(cplxValue1, cplxValue2));
}

#endif /* INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_ */
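
Per complex pair, _mm256_complexmul_ps evaluates the usual product (ar + i*ai)(cr + i*ci); the moveldup/movehdup/addsub sequence just computes four such products in one register. A scalar sketch of what each lane pair works out to:

    #include <cstdio>

    int main()
    {
        float ar = 1.f, ai = 2.f, cr = 3.f, ci = 4.f;
        float re = ar * cr - ai * ci;  // even lanes: addsub subtracts
        float im = ai * cr + ar * ci;  // odd lanes: addsub adds
        printf("(%g, %g)\n", re, im);  // (1+2i)(3+4i) = -5 + 10i
        return 0;
    }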

View File

@@ -28,14 +28,14 @@
// Cross-platform attribute macros not included in VOLK
////////////////////////////////////////////////////////////////////////
#if defined __GNUC__
#define __VOLK_GNSSSDR_PREFETCH(addr) __builtin_prefetch(addr)
#define __VOLK_GNSSSDR_PREFETCH_LOCALITY(addr, rw, locality) __builtin_prefetch(addr, rw, locality)
#elif _MSC_VER
#define __VOLK_GNSSSDR_PREFETCH(addr)
#define __VOLK_GNSSSDR_PREFETCH_LOCALITY(addr, rw, locality)
#else
#define __VOLK_GNSSSDR_PREFETCH(addr)
#define __VOLK_GNSSSDR_PREFETCH_LOCALITY(addr, rw, locality)
#endif

#ifndef INCLUDED_LIBVOLK_COMMON_H

@@ -45,45 +45,45 @@
// Cross-platform attribute macros
////////////////////////////////////////////////////////////////////////
#if defined __GNUC__
#define __VOLK_ATTR_ALIGNED(x) __attribute__((aligned(x)))
#define __VOLK_ATTR_UNUSED __attribute__((unused))
#define __VOLK_ATTR_INLINE __attribute__((always_inline))
#define __VOLK_ATTR_DEPRECATED __attribute__((deprecated))
#define __VOLK_ASM __asm__
#define __VOLK_VOLATILE __volatile__
#if __GNUC__ >= 4
#define __VOLK_ATTR_EXPORT __attribute__((visibility("default")))
#define __VOLK_ATTR_IMPORT __attribute__((visibility("default")))
#else
#define __VOLK_ATTR_EXPORT
#define __VOLK_ATTR_IMPORT
#endif
#elif _MSC_VER
#define __VOLK_ATTR_ALIGNED(x) __declspec(align(x))
#define __VOLK_ATTR_UNUSED
#define __VOLK_ATTR_INLINE __forceinline
#define __VOLK_ATTR_DEPRECATED __declspec(deprecated)
#define __VOLK_ATTR_EXPORT __declspec(dllexport)
#define __VOLK_ATTR_IMPORT __declspec(dllimport)
#define __VOLK_ASM __asm
#define __VOLK_VOLATILE
#else
#define __VOLK_ATTR_ALIGNED(x)
#define __VOLK_ATTR_UNUSED
#define __VOLK_ATTR_INLINE
#define __VOLK_ATTR_DEPRECATED
#define __VOLK_ATTR_EXPORT
#define __VOLK_ATTR_IMPORT
#define __VOLK_ASM __asm__
#define __VOLK_VOLATILE __volatile__
#endif

////////////////////////////////////////////////////////////////////////
// Ignore annoying warnings in MSVC
////////////////////////////////////////////////////////////////////////
#if defined(_MSC_VER)
#pragma warning(disable : 4244)  //'conversion' conversion from 'type1' to 'type2', possible loss of data
#pragma warning(disable : 4305)  //'identifier' : truncation from 'type1' to 'type2'
#endif

////////////////////////////////////////////////////////////////////////

@@ -91,11 +91,13 @@
// FIXME: due to the usage of complex.h, require gcc for c-linkage
////////////////////////////////////////////////////////////////////////
#if defined(__cplusplus) && (__GNUC__)
#define __VOLK_DECL_BEGIN \
    extern "C"            \
    {
#define __VOLK_DECL_END }
#else
#define __VOLK_DECL_BEGIN
#define __VOLK_DECL_END
#endif

////////////////////////////////////////////////////////////////////////

@@ -103,9 +105,9 @@
// http://gcc.gnu.org/wiki/Visibility
////////////////////////////////////////////////////////////////////////
#ifdef volk_gnsssdr_EXPORTS
#define VOLK_API __VOLK_ATTR_EXPORT
#else
#define VOLK_API __VOLK_ATTR_IMPORT
#endif

////////////////////////////////////////////////////////////////////////

@@ -121,35 +123,37 @@
#endif
#endif

union bit128
{
    uint8_t i8[16];
    uint16_t i16[8];
    uint32_t i[4];
    float f[4];
    double d[2];

#ifdef LV_HAVE_SSE
    __m128 float_vec;
#endif

#ifdef LV_HAVE_SSE2
    __m128i int_vec;
    __m128d double_vec;
#endif
};

union bit256
{
    uint8_t i8[32];
    uint16_t i16[16];
    uint32_t i[8];
    float f[8];
    double d[4];

#ifdef LV_HAVE_AVX
    __m256 float_vec;
    __m256i int_vec;
    __m256d double_vec;
#endif
};

#define bit128_p(x) ((union bit128 *)(x))
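
The bit128/bit256 unions give kernels a way to inspect individual lanes of a SIMD register without extra loads and stores; bit128_p reinterprets a suitably aligned pointer as such a union. A sketch of the idiom, trimmed to the portable members (this union type punning is the C-kernel usage pattern, not strictly conforming C++):

    #include <cstdint>
    #include <cstdio>

    union bit128
    {
        uint8_t i8[16];
        uint16_t i16[8];
        uint32_t i[4];
        float f[4];
        double d[2];
    };

    #define bit128_p(x) ((union bit128 *)(x))

    int main()
    {
        float buf[4] = {1.0f, 2.0f, 3.0f, 4.0f};
        printf("0x%08x\n", bit128_p(buf)->i[0]);  // bit pattern of 1.0f: 0x3f800000
        return 0;
    }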

View File

@@ -48,26 +48,34 @@
#include <complex>
#include <stdint.h>

typedef std::complex<int8_t> lv_8sc_t;
typedef std::complex<int16_t> lv_16sc_t;
typedef std::complex<int32_t> lv_32sc_t;
typedef std::complex<int64_t> lv_64sc_t;
typedef std::complex<float> lv_32fc_t;
typedef std::complex<double> lv_64fc_t;

template <typename T>
inline std::complex<T> lv_cmake(const T &r, const T &i)
{
    return std::complex<T>(r, i);
}

template <typename T>
inline typename T::value_type lv_creal(const T &x)
{
    return x.real();
}

template <typename T>
inline typename T::value_type lv_cimag(const T &x)
{
    return x.imag();
}

template <typename T>
inline T lv_conj(const T &x)
{
    return std::conj(x);
}

@@ -80,14 +88,14 @@ template <typename T> inline T lv_conj(const T &x){
#include <complex.h>

typedef char complex lv_8sc_t;
typedef short complex lv_16sc_t;
typedef long complex lv_32sc_t;
typedef long long complex lv_64sc_t;
typedef float complex lv_32fc_t;
typedef double complex lv_64fc_t;

#define lv_cmake(r, i) ((r) + _Complex_I * (i))

// When GNUC is available, use the complex extensions.
// The extensions always return the correct value type.
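
Both language modes expose the same names, so kernels compile unchanged as C or C++: lv_cmake builds a complex value, and lv_creal/lv_cimag/lv_conj access it. A small C++ usage sketch of the helpers defined above:

    #include <complex>
    #include <iostream>

    typedef std::complex<float> lv_32fc_t;

    template <typename T>
    inline std::complex<T> lv_cmake(const T &r, const T &i) { return std::complex<T>(r, i); }

    template <typename T>
    inline T lv_conj(const T &x) { return std::conj(x); }

    int main()
    {
        lv_32fc_t z = lv_cmake(3.0f, -4.0f);
        std::cout << lv_conj(z) << " |z| = " << std::abs(z) << std::endl;  // (3,4) |z| = 5
        return 0;
    }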

View File

@@ -27,30 +27,30 @@
#include <arm_neon.h>

static inline float32x4_t vdivq_f32(float32x4_t num, float32x4_t den)
{
    const float32x4_t q_inv0 = vrecpeq_f32(den);
    const float32x4_t q_step0 = vrecpsq_f32(q_inv0, den);
    const float32x4_t q_inv1 = vmulq_f32(q_step0, q_inv0);
    return vmulq_f32(num, q_inv1);
}


static inline float32x4_t vsqrtq_f32(float32x4_t q_x)
{
    const float32x4_t q_step_0 = vrsqrteq_f32(q_x);
    // step
    const float32x4_t q_step_parm0 = vmulq_f32(q_x, q_step_0);
    const float32x4_t q_step_result0 = vrsqrtsq_f32(q_step_parm0, q_step_0);
    // step
    const float32x4_t q_step_1 = vmulq_f32(q_step_0, q_step_result0);
    const float32x4_t q_step_parm1 = vmulq_f32(q_x, q_step_1);
    const float32x4_t q_step_result1 = vrsqrtsq_f32(q_step_parm1, q_step_1);
    // take the res
    const float32x4_t q_step_2 = vmulq_f32(q_step_1, q_step_result1);
    // mul by x to get sqrt, not rsqrt
    return vmulq_f32(q_x, q_step_2);
}

#endif /* INCLUDED_VOLK_GNSSSDR_NEON_INTRINSICS_H_ */
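
NEON has no vector divide or square root instruction, so vdivq_f32 and vsqrtq_f32 refine the hardware's reciprocal estimates with Newton-Raphson steps: vrecpsq_f32(a, b) computes 2 - a*b, so multiplying it into the estimate improves the reciprocal. A scalar sketch of one refinement step (the values are illustrative):

    #include <cstdio>

    int main()
    {
        float d = 3.0f;
        float e = 0.3f;             // rough estimate of 1/d, like vrecpeq_f32
        float step = 2.0f - d * e;  // like vrecpsq_f32
        e = e * step;               // refined estimate: 0.33
        printf("%f vs %f\n", 2.0f * e, 2.0f / 3.0f);  // 0.660000 vs 0.666667
        return 0;
    }

Each step roughly doubles the number of correct bits, which is why vsqrtq_f32 runs two of them.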

View File

@@ -32,9 +32,9 @@ __VOLK_DECL_BEGIN
typedef struct volk_gnsssdr_arch_pref
{
    char name[128];    //name of the kernel
    char impl_a[128];  //best aligned impl
    char impl_u[128];  //best unaligned impl
} volk_gnsssdr_arch_pref_t;

////////////////////////////////////////////////////////////////////////

View File

@@ -30,33 +30,35 @@
static inline __m128
_mm_complexmul_ps(__m128 x, __m128 y)
{
    __m128 yl, yh, tmp1, tmp2;
    yl = _mm_moveldup_ps(y);           // Load yl with cr,cr,dr,dr
    yh = _mm_movehdup_ps(y);           // Load yh with ci,ci,di,di
    tmp1 = _mm_mul_ps(x, yl);          // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
    x = _mm_shuffle_ps(x, x, 0xB1);    // Re-arrange x to be ai,ar,bi,br
    tmp2 = _mm_mul_ps(x, yh);          // tmp2 = ai*ci,ar*ci,bi*di,br*di
    return _mm_addsub_ps(tmp1, tmp2);  // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
}

static inline __m128
_mm_complexconjugatemul_ps(__m128 x, __m128 y)
{
    const __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f);
    y = _mm_xor_ps(y, conjugator);  // conjugate y
    return _mm_complexmul_ps(x, y);
}

static inline __m128
_mm_magnitudesquared_ps_sse3(__m128 cplxValue1, __m128 cplxValue2)
{
    cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1);  // Square the values
    cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2);  // Square the Values
    return _mm_hadd_ps(cplxValue1, cplxValue2);       // Add the I2 and Q2 values
}

static inline __m128
_mm_magnitude_ps_sse3(__m128 cplxValue1, __m128 cplxValue2)
{
    return _mm_sqrt_ps(_mm_magnitudesquared_ps_sse3(cplxValue1, cplxValue2));
}

#endif /* INCLUDE_VOLK_VOLK_SSE3_INTRINSICS_H_ */

View File

@@ -27,20 +27,22 @@
#include <xmmintrin.h>

static inline __m128
_mm_magnitudesquared_ps(__m128 cplxValue1, __m128 cplxValue2)
{
    __m128 iValue, qValue;
    // Arrange in i1i2i3i4 format
    iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
    // Arrange in q1q2q3q4 format
    qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
    iValue = _mm_mul_ps(iValue, iValue);  // Square the I values
    qValue = _mm_mul_ps(qValue, qValue);  // Square the Q Values
    return _mm_add_ps(iValue, qValue);    // Add the I2 and Q2 values
}

static inline __m128
_mm_magnitude_ps(__m128 cplxValue1, __m128 cplxValue2)
{
    return _mm_sqrt_ps(_mm_magnitudesquared_ps(cplxValue1, cplxValue2));
}

#endif /* INCLUDED_VOLK_VOLK_SSE_INTRINSICS_H_ */
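
Complex samples are stored interleaved (I, Q, I, Q, ...), so _mm_magnitudesquared_ps first de-interleaves with _MM_SHUFFLE before squaring and adding. A scalar picture of the same computation, four results at a time:

    #include <cstdio>

    int main()
    {
        float iq[8] = {1, 2, 3, 4, 5, 6, 7, 8};  // i1 q1 i2 q2 i3 q3 i4 q4
        for (int k = 0; k < 4; k++)
            {
                float mag2 = iq[2 * k] * iq[2 * k] + iq[2 * k + 1] * iq[2 * k + 1];
                printf("%g ", mag2);  // 5 25 61 113
            }
        printf("\n");
        return 0;
    }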

View File

@ -45,26 +45,26 @@
static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_generic(int16_t* result, const int16_t* local_code, unsigned int num_points) static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_generic(int16_t* result, const int16_t* local_code, unsigned int num_points)
{ {
int code_length_chips = 2046; int code_length_chips = 2046;
float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
int num_out_vectors = 3; int num_out_vectors = 3;
unsigned int n; unsigned int n;
float rem_code_phase_chips = -0.234; float rem_code_phase_chips = -0.234;
float shifts_chips[3] = { -0.1, 0.0, 0.1 }; float shifts_chips[3] = {-0.1, 0.0, 0.1};
int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
for(n = 0; n < num_out_vectors; n++) for (n = 0; n < num_out_vectors; n++)
{ {
result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
} }
volk_gnsssdr_16i_xn_resampler_16i_xn_generic(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); volk_gnsssdr_16i_xn_resampler_16i_xn_generic(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
memcpy((int16_t*)result, (int16_t*)result_aux[0], sizeof(int16_t) * num_points); memcpy((int16_t*)result, (int16_t*)result_aux[0], sizeof(int16_t) * num_points);
for(n = 0; n < num_out_vectors; n++) for (n = 0; n < num_out_vectors; n++)
{ {
volk_gnsssdr_free(result_aux[n]); volk_gnsssdr_free(result_aux[n]);
} }
volk_gnsssdr_free(result_aux); volk_gnsssdr_free(result_aux);
} }
@ -74,26 +74,26 @@ static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_generic(int16_t* resul
static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_a_sse3(int16_t* result, const int16_t* local_code, unsigned int num_points) static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_a_sse3(int16_t* result, const int16_t* local_code, unsigned int num_points)
{ {
int code_length_chips = 2046; int code_length_chips = 2046;
float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
int num_out_vectors = 3; int num_out_vectors = 3;
float rem_code_phase_chips = -0.234; float rem_code_phase_chips = -0.234;
unsigned int n; unsigned int n;
float shifts_chips[3] = { -0.1, 0.0, 0.1 }; float shifts_chips[3] = {-0.1, 0.0, 0.1};
int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
for(n = 0; n < num_out_vectors; n++) for (n = 0; n < num_out_vectors; n++)
{ {
result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment()); result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
} }
volk_gnsssdr_16i_xn_resampler_16i_xn_a_sse3(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); volk_gnsssdr_16i_xn_resampler_16i_xn_a_sse3(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
memcpy((int16_t*)result, (int16_t*)result_aux[0], sizeof(int16_t) * num_points); memcpy((int16_t*)result, (int16_t*)result_aux[0], sizeof(int16_t) * num_points);
for(n = 0; n < num_out_vectors; n++) for (n = 0; n < num_out_vectors; n++)
{ {
volk_gnsssdr_free(result_aux[n]); volk_gnsssdr_free(result_aux[n]);
} }
volk_gnsssdr_free(result_aux); volk_gnsssdr_free(result_aux);
} }
@@ -103,26 +103,26 @@ static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_a_sse3(int16_t* result
static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_u_sse3(int16_t* result, const int16_t* local_code, unsigned int num_points)
{
    int code_length_chips = 2046;
    float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
    int num_out_vectors = 3;
    float rem_code_phase_chips = -0.234;
    unsigned int n;
    float shifts_chips[3] = {-0.1, 0.0, 0.1};
    int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
    for (n = 0; n < num_out_vectors; n++)
        {
            result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
        }
    volk_gnsssdr_16i_xn_resampler_16i_xn_u_sse3(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
    memcpy((int16_t*)result, (int16_t*)result_aux[0], sizeof(int16_t) * num_points);
    for (n = 0; n < num_out_vectors; n++)
        {
            volk_gnsssdr_free(result_aux[n]);
        }
    volk_gnsssdr_free(result_aux);
}
@@ -133,26 +133,26 @@ static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_u_sse3(int16_t* result
static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_u_sse4_1(int16_t* result, const int16_t* local_code, unsigned int num_points)
{
    int code_length_chips = 2046;
    float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
    int num_out_vectors = 3;
    float rem_code_phase_chips = -0.234;
    unsigned int n;
    float shifts_chips[3] = {-0.1, 0.0, 0.1};
    int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
    for (n = 0; n < num_out_vectors; n++)
        {
            result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
        }
    volk_gnsssdr_16i_xn_resampler_16i_xn_u_sse4_1(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
    memcpy((int16_t*)result, (int16_t*)result_aux[0], sizeof(int16_t) * num_points);
    for (n = 0; n < num_out_vectors; n++)
        {
            volk_gnsssdr_free(result_aux[n]);
        }
    volk_gnsssdr_free(result_aux);
}
@@ -163,26 +163,26 @@ static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_u_sse4_1(int16_t* resu
static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_a_sse4_1(int16_t* result, const int16_t* local_code, unsigned int num_points)
{
    int code_length_chips = 2046;
    float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
    int num_out_vectors = 3;
    float rem_code_phase_chips = -0.234;
    unsigned int n;
    float shifts_chips[3] = {-0.1, 0.0, 0.1};
    int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
    for (n = 0; n < num_out_vectors; n++)
        {
            result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
        }
    volk_gnsssdr_16i_xn_resampler_16i_xn_a_sse4_1(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
    memcpy((int16_t*)result, (int16_t*)result_aux[0], sizeof(int16_t) * num_points);
    for (n = 0; n < num_out_vectors; n++)
        {
            volk_gnsssdr_free(result_aux[n]);
        }
    volk_gnsssdr_free(result_aux);
}
@@ -193,26 +193,26 @@ static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_a_sse4_1(int16_t* resu
static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_u_avx(int16_t* result, const int16_t* local_code, unsigned int num_points)
{
    int code_length_chips = 2046;
    float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
    int num_out_vectors = 3;
    float rem_code_phase_chips = -0.234;
    unsigned int n;
    float shifts_chips[3] = {-0.1, 0.0, 0.1};
    int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
    for (n = 0; n < num_out_vectors; n++)
        {
            result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
        }
    volk_gnsssdr_16i_xn_resampler_16i_xn_u_avx(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
    memcpy((int16_t*)result, (int16_t*)result_aux[0], sizeof(int16_t) * num_points);
    for (n = 0; n < num_out_vectors; n++)
        {
            volk_gnsssdr_free(result_aux[n]);
        }
    volk_gnsssdr_free(result_aux);
}
@@ -223,26 +223,26 @@ static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_u_avx(int16_t* result,
static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_a_avx(int16_t* result, const int16_t* local_code, unsigned int num_points)
{
    int code_length_chips = 2046;
    float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
    int num_out_vectors = 3;
    float rem_code_phase_chips = -0.234;
    unsigned int n;
    float shifts_chips[3] = {-0.1, 0.0, 0.1};
    int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
    for (n = 0; n < num_out_vectors; n++)
        {
            result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
        }
    volk_gnsssdr_16i_xn_resampler_16i_xn_a_avx(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
    memcpy((int16_t*)result, (int16_t*)result_aux[0], sizeof(int16_t) * num_points);
    for (n = 0; n < num_out_vectors; n++)
        {
            volk_gnsssdr_free(result_aux[n]);
        }
    volk_gnsssdr_free(result_aux);
}
@@ -253,30 +253,29 @@ static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_a_avx(int16_t* result,
static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_neon(int16_t* result, const int16_t* local_code, unsigned int num_points)
{
    int code_length_chips = 2046;
    float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
    int num_out_vectors = 3;
    float rem_code_phase_chips = -0.234;
    unsigned int n;
    float shifts_chips[3] = {-0.1, 0.0, 0.1};
    int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
    for (n = 0; n < num_out_vectors; n++)
        {
            result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
        }
    volk_gnsssdr_16i_xn_resampler_16i_xn_neon(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
    memcpy((int16_t*)result, (int16_t*)result_aux[0], sizeof(int16_t) * num_points);
    for (n = 0; n < num_out_vectors; n++)
        {
            volk_gnsssdr_free(result_aux[n]);
        }
    volk_gnsssdr_free(result_aux);
}
#endif
#endif // INCLUDED_volk_gnsssdr_16i_resamplerpuppet_16i_H
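These *puppet* wrappers exist so the volk_gnsssdr profiler, which only drives kernels with a (result, input, num_points) signature, can benchmark the multi-output _xn_ kernels: each wrapper hard-codes representative tracking parameters and copies out tap 0 only. A hedged sketch of calling one through the generated dispatcher; the harness function name, buffer sizes, and the fill step are illustrative:

#include <volk_gnsssdr/volk_gnsssdr.h>

void profile_resampler_once(unsigned int num_points)
{
    int16_t* code = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * 2046, volk_gnsssdr_get_alignment());
    int16_t* out = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
    // ... fill `code` with a local PRN replica ...
    volk_gnsssdr_16i_resamplerxnpuppet_16i(out, code, num_points);  // dispatches to the best available machine
    volk_gnsssdr_free(out);
    volk_gnsssdr_free(code);
}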
View File
@@ -107,7 +107,8 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_a_sse4_1(int16_t** resul
    const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
    const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
    __VOLK_ATTR_ALIGNED(16)
    int local_code_chip_index[4];
    int local_code_chip_index_;
    const __m128i zeros = _mm_setzero_si128();
@@ -121,7 +122,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_a_sse4_1(int16_t** resul
            shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]);
            aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
            __m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
            for (n = 0; n < quarterPoints; n++)
                {
                    aux = _mm_mul_ps(code_phase_step_chips_reg, indexn);
                    aux = _mm_add_ps(aux, aux2);
@@ -139,13 +140,13 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_a_sse4_1(int16_t** resul
                    aux_i = _mm_and_si128(code_length_chips_reg_i, negatives);
                    local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i);
                    _mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg);
                    for (k = 0; k < 4; ++k)
                        {
                            _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
                        }
                    indexn = _mm_add_ps(indexn, fours);
                }
            for (n = quarterPoints * 4; n < num_points; n++)
                {
                    // resample code for current tap
                    local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
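A worked instance of the index computation shared by the vector body and this scalar tail; the numbers are illustrative, not from the commit:

/* code_phase_step_chips = 0.5115, n = 10, shift = -0.1, rem = -0.234:
 *   floor(0.5115 * 10 - 0.1 + 0.234) = floor(5.249) = 5
 * A negative result is wrapped by adding code_length_chips, which is
 * exactly what the masked _mm_add_epi32 with `negatives` does above,
 * four lanes at a time. */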
@@ -173,7 +174,8 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_u_sse4_1(int16_t** resul
    const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
    const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
    __VOLK_ATTR_ALIGNED(16)
    int local_code_chip_index[4];
    int local_code_chip_index_;
    const __m128i zeros = _mm_setzero_si128();
@@ -187,7 +189,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_u_sse4_1(int16_t** resul
            shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]);
            aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
            __m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
            for (n = 0; n < quarterPoints; n++)
                {
                    aux = _mm_mul_ps(code_phase_step_chips_reg, indexn);
                    aux = _mm_add_ps(aux, aux2);
@@ -205,13 +207,13 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_u_sse4_1(int16_t** resul
                    aux_i = _mm_and_si128(code_length_chips_reg_i, negatives);
                    local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i);
                    _mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg);
                    for (k = 0; k < 4; ++k)
                        {
                            _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
                        }
                    indexn = _mm_add_ps(indexn, fours);
                }
            for (n = quarterPoints * 4; n < num_points; n++)
                {
                    // resample code for current tap
                    local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
@@ -240,7 +242,8 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_a_sse3(int16_t** result,
    const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
    const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
    __VOLK_ATTR_ALIGNED(16)
    int local_code_chip_index[4];
    int local_code_chip_index_;
    const __m128i zeros = _mm_setzero_si128();
@@ -254,7 +257,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_a_sse3(int16_t** result,
            shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]);
            aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
            __m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
            for (n = 0; n < quarterPoints; n++)
                {
                    aux = _mm_mul_ps(code_phase_step_chips_reg, indexn);
                    aux = _mm_add_ps(aux, aux2);
@@ -275,13 +278,13 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_a_sse3(int16_t** result,
                    aux_i = _mm_and_si128(code_length_chips_reg_i, negatives);
                    local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i);
                    _mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg);
                    for (k = 0; k < 4; ++k)
                        {
                            _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
                        }
                    indexn = _mm_add_ps(indexn, fours);
                }
            for (n = quarterPoints * 4; n < num_points; n++)
                {
                    // resample code for current tap
                    local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
@@ -310,7 +313,8 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_u_sse3(int16_t** result,
    const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
    const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
    __VOLK_ATTR_ALIGNED(16)
    int local_code_chip_index[4];
    int local_code_chip_index_;
    const __m128i zeros = _mm_setzero_si128();
@@ -324,7 +328,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_u_sse3(int16_t** result,
            shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]);
            aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
            __m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
            for (n = 0; n < quarterPoints; n++)
                {
                    aux = _mm_mul_ps(code_phase_step_chips_reg, indexn);
                    aux = _mm_add_ps(aux, aux2);
@@ -345,13 +349,13 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_u_sse3(int16_t** result,
                    aux_i = _mm_and_si128(code_length_chips_reg_i, negatives);
                    local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i);
                    _mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg);
                    for (k = 0; k < 4; ++k)
                        {
                            _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
                        }
                    indexn = _mm_add_ps(indexn, fours);
                }
            for (n = quarterPoints * 4; n < num_points; n++)
                {
                    // resample code for current tap
                    local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
@@ -379,7 +383,8 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_a_avx(int16_t** result,
    const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips);
    const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips);
    __VOLK_ATTR_ALIGNED(32)
    int local_code_chip_index[8];
    int local_code_chip_index_;
    const __m256 zeros = _mm256_setzero_ps();
@@ -394,7 +399,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_a_avx(int16_t** result,
            shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]);
            aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
            indexn = n0;
            for (n = 0; n < avx_iters; n++)
                {
                    __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0);
                    __VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3);
@@ -412,13 +417,13 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_a_avx(int16_t** result,
                    // no negatives
                    c = _mm256_cvtepi32_ps(local_code_chip_index_reg);
                    negatives = _mm256_cmp_ps(c, zeros, 0x01);
                    aux3 = _mm256_and_ps(code_length_chips_reg_f, negatives);
                    aux = _mm256_add_ps(c, aux3);
                    local_code_chip_index_reg = _mm256_cvttps_epi32(aux);
                    _mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg);
                    for (k = 0; k < 8; ++k)
                        {
                            _result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]];
                        }
@@ -428,7 +433,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_a_avx(int16_t** result,
    _mm256_zeroupper();
    for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
        {
            for (n = avx_iters * 8; n < num_points; n++)
                {
                    // resample code for current tap
                    local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
@@ -456,7 +461,8 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_u_avx(int16_t** result,
    const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips);
    const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips);
    __VOLK_ATTR_ALIGNED(32)
    int local_code_chip_index[8];
    int local_code_chip_index_;
    const __m256 zeros = _mm256_setzero_ps();
@@ -471,7 +477,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_u_avx(int16_t** result,
            shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]);
            aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
            indexn = n0;
            for (n = 0; n < avx_iters; n++)
                {
                    __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0);
                    __VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3);
@@ -489,13 +495,13 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_u_avx(int16_t** result,
                    // no negatives
                    c = _mm256_cvtepi32_ps(local_code_chip_index_reg);
                    negatives = _mm256_cmp_ps(c, zeros, 0x01);
                    aux3 = _mm256_and_ps(code_length_chips_reg_f, negatives);
                    aux = _mm256_add_ps(c, aux3);
                    local_code_chip_index_reg = _mm256_cvttps_epi32(aux);
                    _mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg);
                    for (k = 0; k < 8; ++k)
                        {
                            _result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]];
                        }
@@ -505,7 +511,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_u_avx(int16_t** result,
    _mm256_zeroupper();
    for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
        {
            for (n = avx_iters * 8; n < num_points; n++)
                {
                    // resample code for current tap
                    local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
@@ -531,7 +537,8 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_neon(int16_t** result, c
    const float32x4_t rem_code_phase_chips_reg = vdupq_n_f32(rem_code_phase_chips);
    const float32x4_t code_phase_step_chips_reg = vdupq_n_f32(code_phase_step_chips);
    __VOLK_ATTR_ALIGNED(16)
    int32_t local_code_chip_index[4];
    int32_t local_code_chip_index_;
    const int32x4_t zeros = vdupq_n_s32(0);
@@ -539,11 +546,12 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_neon(int16_t** result, c
    const int32x4_t code_length_chips_reg_i = vdupq_n_s32((int32_t)code_length_chips);
    int32x4_t local_code_chip_index_reg, aux_i, negatives, i;
    float32x4_t aux, aux2, shifts_chips_reg, fi, c, j, cTrunc, base, indexn, reciprocal;
    __VOLK_ATTR_ALIGNED(16)
    const float vec[4] = {0.0f, 1.0f, 2.0f, 3.0f};
    uint32x4_t igx;
    reciprocal = vrecpeq_f32(code_length_chips_reg_f);
    reciprocal = vmulq_f32(vrecpsq_f32(code_length_chips_reg_f, reciprocal), reciprocal);
    reciprocal = vmulq_f32(vrecpsq_f32(code_length_chips_reg_f, reciprocal), reciprocal);  // this refinement is required!
    float32x4_t n0 = vld1q_f32((float*)vec);
    int current_correlator_tap;
    unsigned int n;
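vrecpeq_f32 only returns a rough (~8-bit) estimate of 1/code_length_chips, which is why the two vrecpsq_f32/vmulq_f32 lines above follow it; a comment-style sketch of what they do, not part of the commit:

/* vrecpsq_f32(x, r) computes (2 - x*r), so each of the two lines above is
 * one Newton-Raphson step r' = r * (2 - x*r) toward 1/x, roughly doubling
 * the correct bits: ~8 from vrecpeq_f32 -> ~16 -> ~23 (near full float).
 * Without the second step, the fmod emulation below can misplace a chip. */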
@@ -553,7 +561,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_neon(int16_t** result, c
            shifts_chips_reg = vdupq_n_f32((float)shifts_chips[current_correlator_tap]);
            aux2 = vsubq_f32(shifts_chips_reg, rem_code_phase_chips_reg);
            indexn = n0;
            for (n = 0; n < neon_iters; n++)
                {
                    __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][4 * n + 3], 1, 0);
                    __VOLK_GNSSSDR_PREFETCH(&local_code_chip_index[4]);
@@ -569,7 +577,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_neon(int16_t** result, c
                    // fmod
                    c = vmulq_f32(aux, reciprocal);
                    i = vcvtq_s32_f32(c);
                    cTrunc = vcvtq_f32_s32(i);
                    base = vmulq_f32(cTrunc, code_length_chips_reg_f);
                    aux = vsubq_f32(aux, base);
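The five instructions above emulate fmod() lane-by-lane, since NEON has no division. A worked example with illustrative numbers, one lane:

/* aux = 4100.0, code_length_chips = 2046.0, reciprocal ~= 1/2046:
 *   c = 4100.0 * (1/2046) = 2.0039...  -> i = 2 (truncated)
 *   base = 2 * 2046.0 = 4092.0
 *   aux - base = 8.0, i.e. fmod(4100, 2046) = 8 */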
@@ -581,13 +589,13 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_neon(int16_t** result, c
                    vst1q_s32((int32_t*)local_code_chip_index, local_code_chip_index_reg);
                    for (k = 0; k < 4; ++k)
                        {
                            _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
                        }
                    indexn = vaddq_f32(indexn, fours);
                }
            for (n = neon_iters * 4; n < num_points; n++)
                {
                    __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][n], 1, 0);
                    // resample code for current tap
@@ -605,4 +613,3 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_neon(int16_t** result, c
#endif /*INCLUDED_volk_gnsssdr_16i_xn_resampler_16i_xn_H*/
View File
@@ -41,7 +41,7 @@
#include <string.h>
#ifdef LV_HAVE_GENERIC
static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_generic(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points)
{
    // phases must be normalized. Phase rotator expects a complex exponential input!
    float rem_carrier_phase_in_rad = 0.345;
@@ -53,14 +53,14 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_generic(lv
    unsigned int n;
    int num_a_vectors = 3;
    int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
    for (n = 0; n < num_a_vectors; n++)
        {
            in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
            memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points);
        }
    volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_generic(result, local_code, phase_inc[0], phase, (const int16_t**)in_a, num_a_vectors, num_points);
    for (n = 0; n < num_a_vectors; n++)
        {
            volk_gnsssdr_free(in_a[n]);
        }
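The hunk elides the phase set-up, but the normalization the comment insists on simply means the initial phase and the per-sample increment are unit-modulus complex exponentials, so the rotator's running product rotates the samples without rescaling them. The construction, mirroring the commented-out variants further down:

float rem_carrier_phase_in_rad = 0.345;
float phase_step_rad = 0.1;
lv_32fc_t phase[1];
lv_32fc_t phase_inc[1];
phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad));  // |phase[0]| = 1
phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad));                  // |phase_inc[0]| = 1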
@@ -71,7 +71,7 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_generic(lv
#ifdef LV_HAVE_GENERIC
static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_generic_reload(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points)
{
    // phases must be normalized. Phase rotator expects a complex exponential input!
    float rem_carrier_phase_in_rad = 0.345;
@@ -83,14 +83,14 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_generic_re
    unsigned int n;
    int num_a_vectors = 3;
    int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
    for (n = 0; n < num_a_vectors; n++)
        {
            in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
            memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points);
        }
    volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_generic_reload(result, local_code, phase_inc[0], phase, (const int16_t**)in_a, num_a_vectors, num_points);
    for (n = 0; n < num_a_vectors; n++)
        {
            volk_gnsssdr_free(in_a[n]);
        }
@@ -113,50 +113,50 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_a_sse3(lv_
    unsigned int n;
    int num_a_vectors = 3;
    int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
    for (n = 0; n < num_a_vectors; n++)
        {
            in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
            memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points);
        }
    volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_sse3(result, local_code, phase_inc[0], phase, (const int16_t**)in_a, num_a_vectors, num_points);
    for (n = 0; n < num_a_vectors; n++)
        {
            volk_gnsssdr_free(in_a[n]);
        }
    volk_gnsssdr_free(in_a);
}
#endif // SSE3
//#ifdef LV_HAVE_SSE3
//static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_a_sse3_reload(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points)
//{
//// phases must be normalized. Phase rotator expects a complex exponential input!
//float rem_carrier_phase_in_rad = 0.345;
//float phase_step_rad = 0.1;
//lv_32fc_t phase[1];
//phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad));
//lv_32fc_t phase_inc[1];
//phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad));
//unsigned int n;
//int num_a_vectors = 3;
//int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
//for(n = 0; n < num_a_vectors; n++)
//{
//in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
//memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points);
//}
//volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_sse3_reload(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points);
//for(n = 0; n < num_a_vectors; n++)
//{
//volk_gnsssdr_free(in_a[n]);
//}
//volk_gnsssdr_free(in_a);
//}
//#endif // SSE3
@@ -175,22 +175,22 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_u_sse3(lv_
    unsigned int n;
    int num_a_vectors = 3;
    int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
    for (n = 0; n < num_a_vectors; n++)
        {
            in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
            memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points);
        }
    volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_u_sse3(result, local_code, phase_inc[0], phase, (const int16_t**)in_a, num_a_vectors, num_points);
    for (n = 0; n < num_a_vectors; n++)
        {
            volk_gnsssdr_free(in_a[n]);
        }
    volk_gnsssdr_free(in_a);
}
#endif // SSE3
#ifdef LV_HAVE_AVX2
@@ -206,50 +206,50 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_a_avx2(lv_
    unsigned int n;
    int num_a_vectors = 3;
    int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
    for (n = 0; n < num_a_vectors; n++)
        {
            in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
            memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points);
        }
    volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_avx2(result, local_code, phase_inc[0], phase, (const int16_t**)in_a, num_a_vectors, num_points);
    for (n = 0; n < num_a_vectors; n++)
        {
            volk_gnsssdr_free(in_a[n]);
        }
    volk_gnsssdr_free(in_a);
}
#endif // AVX2
//#ifdef LV_HAVE_AVX2
//static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_a_avx2_reload(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points)
//{
//// phases must be normalized. Phase rotator expects a complex exponential input!
//float rem_carrier_phase_in_rad = 0.345;
//float phase_step_rad = 0.1;
//lv_32fc_t phase[1];
//phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad));
//lv_32fc_t phase_inc[1];
//phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad));
//unsigned int n;
//int num_a_vectors = 3;
//int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
//for(n = 0; n < num_a_vectors; n++)
//{
//in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
//memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points);
//}
//volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_avx2_reload(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points);
//for(n = 0; n < num_a_vectors; n++)
//{
//volk_gnsssdr_free(in_a[n]);
//}
//volk_gnsssdr_free(in_a);
//}
//#endif // AVX2
@@ -268,50 +268,50 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_u_avx2(lv_
    unsigned int n;
    int num_a_vectors = 3;
    int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
    for (n = 0; n < num_a_vectors; n++)
        {
            in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
            memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points);
        }
    volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_u_avx2(result, local_code, phase_inc[0], phase, (const int16_t**)in_a, num_a_vectors, num_points);
    for (n = 0; n < num_a_vectors; n++)
        {
            volk_gnsssdr_free(in_a[n]);
        }
    volk_gnsssdr_free(in_a);
}
#endif // AVX2
//#ifdef LV_HAVE_AVX2
//static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_u_avx2_reload(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points)
//{
//// phases must be normalized. Phase rotator expects a complex exponential input!
//float rem_carrier_phase_in_rad = 0.345;
//float phase_step_rad = 0.1;
//lv_32fc_t phase[1];
//phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad));
//lv_32fc_t phase_inc[1];
//phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad));
//unsigned int n;
//int num_a_vectors = 3;
//int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
//for(n = 0; n < num_a_vectors; n++)
//{
//in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
//memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points);
//}
//volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_avx2_reload(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points);
//for(n = 0; n < num_a_vectors; n++)
//{
//volk_gnsssdr_free(in_a[n]);
//}
//volk_gnsssdr_free(in_a);
//}
//#endif // AVX2
@@ -320,29 +320,29 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_u_avx2(lv_
//#ifdef LV_HAVE_NEON
//static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_neon(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points)
//{
//// phases must be normalized. Phase rotator expects a complex exponential input!
//float rem_carrier_phase_in_rad = 0.345;
//float phase_step_rad = 0.1;
//lv_32fc_t phase[1];
//phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad));
//lv_32fc_t phase_inc[1];
//phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad));
//unsigned int n;
//int num_a_vectors = 3;
//int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
//for(n = 0; n < num_a_vectors; n++)
//{
//in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
//memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points);
//}
//volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_neon(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points);
//for(n = 0; n < num_a_vectors; n++)
//{
//volk_gnsssdr_free(in_a[n]);
//}
//volk_gnsssdr_free(in_a);
//}
//#endif // NEON
@@ -351,34 +351,31 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_u_avx2(lv_
//#ifdef LV_HAVE_NEON
//static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_neon_vma(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points)
//{
//// phases must be normalized. Phase rotator expects a complex exponential input!
//float rem_carrier_phase_in_rad = 0.345;
//float phase_step_rad = 0.1;
//lv_32fc_t phase[1];
//phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad));
//lv_32fc_t phase_inc[1];
//phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad));
//unsigned int n;
//int num_a_vectors = 3;
//int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
//for(n = 0; n < num_a_vectors; n++)
//{
//in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
//memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points);
//}
//volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_neon_vma(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points);
//for(n = 0; n < num_a_vectors; n++)
//{
//volk_gnsssdr_free(in_a[n]);
//}
//volk_gnsssdr_free(in_a);
//}
//#endif // NEON
#endif // INCLUDED_volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_H
View File
@@ -68,7 +68,7 @@ static inline void volk_gnsssdr_16ic_conjugate_16ic_generic(lv_16sc_t* cVector,
    const lv_16sc_t* aPtr = aVector;
    unsigned int number;
    for (number = 0; number < num_points; number++)
        {
            *cPtr++ = lv_conj(*aPtr++);
        }
@@ -231,4 +231,3 @@ static inline void volk_gnsssdr_16ic_conjugate_16ic_u_avx2(lv_16sc_t* cVector, c
//#endif /* LV_HAVE_NEON */
#endif /* INCLUDED_volk_gnsssdr_16ic_conjugate_16ic_H */
View File
@@ -63,7 +63,7 @@
static inline void volk_gnsssdr_16ic_convert_32fc_generic(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, unsigned int num_points)
{
    unsigned int i;
    for (i = 0; i < num_points; i++)
        {
            outputVector[i] = lv_cmake((float)lv_creal(inputVector[i]), (float)lv_cimag(inputVector[i]));
        }
@@ -82,9 +82,9 @@ static inline void volk_gnsssdr_16ic_convert_32fc_a_sse2(lv_32fc_t* outputVector
    lv_32fc_t* _out = outputVector;
    __m128 a;
    for (i = 0; i < sse_iters; i++)
        {
            a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0])));  // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
            _mm_store_ps((float*)_out, a);
            _in += 2;
            _out += 2;
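Why two complex samples per __m128: lv_16sc_t stores an interleaved (real, imag) int16_t pair, so two samples widen to four floats. _mm_set_ps takes its arguments from the highest lane down, which is why the imaginary part of _in[1] comes first; a comment-style sketch:

/* memory (int16):          re0 im0 re1 im1
 * after convert + store:   re0 im0 re1 im1   -- same interleaved order,
 * each element now a 32-bit float */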
@@ -109,9 +109,9 @@ static inline void volk_gnsssdr_16ic_convert_32fc_u_sse2(lv_32fc_t* outputVector
    lv_32fc_t* _out = outputVector;
    __m128 a;
    for (i = 0; i < sse_iters; i++)
        {
            a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0])));  // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
            _mm_storeu_ps((float*)_out, a);
            _in += 2;
            _out += 2;
@@ -136,15 +136,15 @@ static inline void volk_gnsssdr_16ic_convert_32fc_u_axv(lv_32fc_t* outputVector,
    lv_32fc_t* _out = outputVector;
    __m256 a;
    for (i = 0; i < sse_iters; i++)
        {
            a = _mm256_set_ps((float)(lv_cimag(_in[3])), (float)(lv_creal(_in[3])), (float)(lv_cimag(_in[2])), (float)(lv_creal(_in[2])), (float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0])));  // load (2 byte imag, 2 byte real) x 4 into 256 bits reg
            _mm256_storeu_ps((float*)_out, a);
            _in += 4;
            _out += 4;
        }
    _mm256_zeroupper();
    for (i = 0; i < (num_points % 4); ++i)
        {
            *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in));
            _in++;
@ -163,15 +163,15 @@ static inline void volk_gnsssdr_16ic_convert_32fc_a_axv(lv_32fc_t* outputVector,
lv_32fc_t* _out = outputVector; lv_32fc_t* _out = outputVector;
__m256 a; __m256 a;
for(i = 0; i < sse_iters; i++) for (i = 0; i < sse_iters; i++)
{ {
a = _mm256_set_ps((float)(lv_cimag(_in[3])), (float)(lv_creal(_in[3])), (float)(lv_cimag(_in[2])), (float)(lv_creal(_in[2])), (float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg a = _mm256_set_ps((float)(lv_cimag(_in[3])), (float)(lv_creal(_in[3])), (float)(lv_cimag(_in[2])), (float)(lv_creal(_in[2])), (float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
_mm256_store_ps((float*)_out, a); _mm256_store_ps((float*)_out, a);
_in += 4; _in += 4;
_out += 4; _out += 4;
} }
_mm256_zeroupper(); _mm256_zeroupper();
for(i = 0; i < (num_points % 4); ++i) for (i = 0; i < (num_points % 4); ++i)
{ {
*_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in)); *_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in));
_in++; _in++;
@ -194,7 +194,7 @@ static inline void volk_gnsssdr_16ic_convert_32fc_neon(lv_32fc_t* outputVector,
int32x4_t a32x4; int32x4_t a32x4;
float32x4_t f32x4; float32x4_t f32x4;
for(i = 0; i < sse_iters; i++) for (i = 0; i < sse_iters; i++)
{ {
a16x4 = vld1_s16((const int16_t*)_in); a16x4 = vld1_s16((const int16_t*)_in);
__VOLK_GNSSSDR_PREFETCH(_in + 4); __VOLK_GNSSSDR_PREFETCH(_in + 4);
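All of these implementations share one contract: read num_points interleaved int16 complex samples and write num_points float complex samples. A minimal usage sketch through the public dispatcher (aligned buffers so the _a_ variants are eligible; error handling omitted, test data arbitrary):

#include <volk_gnsssdr/volk_gnsssdr.h>
#include <volk_gnsssdr/volk_gnsssdr_complex.h>

void convert_16ic_to_32fc_example(void)
{
    const unsigned int num_points = 1024;
    lv_16sc_t* in = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
    lv_32fc_t* out = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
    unsigned int i;
    for (i = 0; i < num_points; i++)
        {
            in[i] = lv_cmake((int16_t)i, (int16_t)(-(int16_t)i)); /* arbitrary test data */
        }
    volk_gnsssdr_16ic_convert_32fc(out, in, num_points); /* dispatcher picks the best machine kernel */
    volk_gnsssdr_free(out);
    volk_gnsssdr_free(in);
}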

View File

@@ -78,7 +78,7 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_generic(lv_16sc_t* resu
// resample code for current tap
local_code_chip_index = round(code_phase_step_chips * (float)n + rem_code_phase_chips - 0.5f);
if (local_code_chip_index < 0.0) local_code_chip_index += code_length_chips;
-if (local_code_chip_index > (code_length_chips-1)) local_code_chip_index -= code_length_chips;
+if (local_code_chip_index > (code_length_chips - 1)) local_code_chip_index -= code_length_chips;
result[n] = local_code[local_code_chip_index];
}
}
@@ -89,61 +89,66 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_generic(lv_16sc_t* resu
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>
-static inline void volk_gnsssdr_16ic_resampler_fast_16ic_a_sse2(lv_16sc_t* result, const lv_16sc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, int code_length_chips, unsigned int num_output_samples)//, int* scratch_buffer, float* scratch_buffer_float)
+static inline void volk_gnsssdr_16ic_resampler_fast_16ic_a_sse2(lv_16sc_t* result, const lv_16sc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, int code_length_chips, unsigned int num_output_samples) //, int* scratch_buffer, float* scratch_buffer_float)
{
-_MM_SET_ROUNDING_MODE (_MM_ROUND_NEAREST);//_MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO
+_MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); //_MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO
unsigned int number;
const unsigned int quarterPoints = num_output_samples / 4;
lv_16sc_t* _result = result;
-__VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4];
+__VOLK_ATTR_ALIGNED(16)
+int local_code_chip_index[4];
__m128 _rem_code_phase, _code_phase_step_chips;
__m128i _code_length_chips, _code_length_chips_minus1;
__m128 _code_phase_out, _code_phase_out_with_offset;
rem_code_phase_chips = rem_code_phase_chips - 0.5f;
_rem_code_phase = _mm_load1_ps(&rem_code_phase_chips); //load float to all four float values in m128 register
_code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips); //load float to all four float values in m128 register
-__VOLK_ATTR_ALIGNED(16) int four_times_code_length_chips_minus1[4];
-four_times_code_length_chips_minus1[0] = code_length_chips-1;
-four_times_code_length_chips_minus1[1] = code_length_chips-1;
-four_times_code_length_chips_minus1[2] = code_length_chips-1;
-four_times_code_length_chips_minus1[3] = code_length_chips-1;
+__VOLK_ATTR_ALIGNED(16)
+int four_times_code_length_chips_minus1[4];
+four_times_code_length_chips_minus1[0] = code_length_chips - 1;
+four_times_code_length_chips_minus1[1] = code_length_chips - 1;
+four_times_code_length_chips_minus1[2] = code_length_chips - 1;
+four_times_code_length_chips_minus1[3] = code_length_chips - 1;
-__VOLK_ATTR_ALIGNED(16) int four_times_code_length_chips[4];
+__VOLK_ATTR_ALIGNED(16)
+int four_times_code_length_chips[4];
four_times_code_length_chips[0] = code_length_chips;
four_times_code_length_chips[1] = code_length_chips;
four_times_code_length_chips[2] = code_length_chips;
four_times_code_length_chips[3] = code_length_chips;
_code_length_chips = _mm_load_si128((__m128i*)&four_times_code_length_chips); //load float to all four float values in m128 register
_code_length_chips_minus1 = _mm_load_si128((__m128i*)&four_times_code_length_chips_minus1); //load float to all four float values in m128 register
__m128i negative_indexes, overflow_indexes, _code_phase_out_int, _code_phase_out_int_neg, _code_phase_out_int_over;
__m128i zero = _mm_setzero_si128();
-__VOLK_ATTR_ALIGNED(16) float init_idx_float[4] = { 0.0f, 1.0f, 2.0f, 3.0f };
+__VOLK_ATTR_ALIGNED(16)
+float init_idx_float[4] = {0.0f, 1.0f, 2.0f, 3.0f};
__m128 _4output_index = _mm_load_ps(init_idx_float);
-__VOLK_ATTR_ALIGNED(16) float init_4constant_float[4] = { 4.0f, 4.0f, 4.0f, 4.0f };
+__VOLK_ATTR_ALIGNED(16)
+float init_4constant_float[4] = {4.0f, 4.0f, 4.0f, 4.0f};
__m128 _4constant_float = _mm_load_ps(init_4constant_float);
-for(number = 0; number < quarterPoints; number++)
+for (number = 0; number < quarterPoints; number++)
{
_code_phase_out = _mm_mul_ps(_code_phase_step_chips, _4output_index); //compute the code phase point with the phase step
_code_phase_out_with_offset = _mm_add_ps(_code_phase_out, _rem_code_phase); //add the phase offset
_code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset); //convert to integer
negative_indexes = _mm_cmplt_epi32(_code_phase_out_int, zero); //test for negative values
_code_phase_out_int_neg = _mm_add_epi32(_code_phase_out_int, _code_length_chips); //the negative values branch
-_code_phase_out_int_neg = _mm_xor_si128(_code_phase_out_int, _mm_and_si128( negative_indexes, _mm_xor_si128( _code_phase_out_int_neg, _code_phase_out_int )));
+_code_phase_out_int_neg = _mm_xor_si128(_code_phase_out_int, _mm_and_si128(negative_indexes, _mm_xor_si128(_code_phase_out_int_neg, _code_phase_out_int)));
overflow_indexes = _mm_cmpgt_epi32(_code_phase_out_int_neg, _code_length_chips_minus1); //test for overflow values
_code_phase_out_int_over = _mm_sub_epi32(_code_phase_out_int_neg, _code_length_chips); //the negative values branch
-_code_phase_out_int_over = _mm_xor_si128(_code_phase_out_int_neg, _mm_and_si128( overflow_indexes, _mm_xor_si128( _code_phase_out_int_over, _code_phase_out_int_neg )));
+_code_phase_out_int_over = _mm_xor_si128(_code_phase_out_int_neg, _mm_and_si128(overflow_indexes, _mm_xor_si128(_code_phase_out_int_over, _code_phase_out_int_neg)));
_mm_store_si128((__m128i*)local_code_chip_index, _code_phase_out_int_over); // Store the results back
//todo: optimize the local code lookup table with intrinsics, if possible
*_result++ = local_code[local_code_chip_index[0]];
@@ -154,7 +159,7 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_a_sse2(lv_16sc_t* resul
_4output_index = _mm_add_ps(_4output_index, _4constant_float);
}
-for(number = quarterPoints * 4; number < num_output_samples; number++)
+for (number = quarterPoints * 4; number < num_output_samples; number++)
{
local_code_chip_index[0] = (int)(code_phase_step_chips * (float)number + rem_code_phase_chips + 0.5f);
if (local_code_chip_index[0] < 0.0) local_code_chip_index[0] += code_length_chips - 1;
@@ -169,61 +174,66 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_a_sse2(lv_16sc_t* resul
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>
-static inline void volk_gnsssdr_16ic_resampler_fast_16ic_u_sse2(lv_16sc_t* result, const lv_16sc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, int code_length_chips, unsigned int num_output_samples)//, int* scratch_buffer, float* scratch_buffer_float)
+static inline void volk_gnsssdr_16ic_resampler_fast_16ic_u_sse2(lv_16sc_t* result, const lv_16sc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, int code_length_chips, unsigned int num_output_samples) //, int* scratch_buffer, float* scratch_buffer_float)
{
-_MM_SET_ROUNDING_MODE (_MM_ROUND_NEAREST);//_MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO
+_MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); //_MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO
unsigned int number;
const unsigned int quarterPoints = num_output_samples / 4;
lv_16sc_t* _result = result;
-__VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4];
+__VOLK_ATTR_ALIGNED(16)
+int local_code_chip_index[4];
__m128 _rem_code_phase, _code_phase_step_chips;
__m128i _code_length_chips, _code_length_chips_minus1;
__m128 _code_phase_out, _code_phase_out_with_offset;
rem_code_phase_chips = rem_code_phase_chips - 0.5f;
_rem_code_phase = _mm_load1_ps(&rem_code_phase_chips); //load float to all four float values in m128 register
_code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips); //load float to all four float values in m128 register
-__VOLK_ATTR_ALIGNED(16) int four_times_code_length_chips_minus1[4];
-four_times_code_length_chips_minus1[0] = code_length_chips-1;
-four_times_code_length_chips_minus1[1] = code_length_chips-1;
-four_times_code_length_chips_minus1[2] = code_length_chips-1;
-four_times_code_length_chips_minus1[3] = code_length_chips-1;
+__VOLK_ATTR_ALIGNED(16)
+int four_times_code_length_chips_minus1[4];
+four_times_code_length_chips_minus1[0] = code_length_chips - 1;
+four_times_code_length_chips_minus1[1] = code_length_chips - 1;
+four_times_code_length_chips_minus1[2] = code_length_chips - 1;
+four_times_code_length_chips_minus1[3] = code_length_chips - 1;
-__VOLK_ATTR_ALIGNED(16) int four_times_code_length_chips[4];
+__VOLK_ATTR_ALIGNED(16)
+int four_times_code_length_chips[4];
four_times_code_length_chips[0] = code_length_chips;
four_times_code_length_chips[1] = code_length_chips;
four_times_code_length_chips[2] = code_length_chips;
four_times_code_length_chips[3] = code_length_chips;
_code_length_chips = _mm_loadu_si128((__m128i*)&four_times_code_length_chips); //load float to all four float values in m128 register
_code_length_chips_minus1 = _mm_loadu_si128((__m128i*)&four_times_code_length_chips_minus1); //load float to all four float values in m128 register
__m128i negative_indexes, overflow_indexes, _code_phase_out_int, _code_phase_out_int_neg, _code_phase_out_int_over;
__m128i zero = _mm_setzero_si128();
-__VOLK_ATTR_ALIGNED(16) float init_idx_float[4] = { 0.0f, 1.0f, 2.0f, 3.0f };
+__VOLK_ATTR_ALIGNED(16)
+float init_idx_float[4] = {0.0f, 1.0f, 2.0f, 3.0f};
__m128 _4output_index = _mm_loadu_ps(init_idx_float);
-__VOLK_ATTR_ALIGNED(16) float init_4constant_float[4] = { 4.0f, 4.0f, 4.0f, 4.0f };
+__VOLK_ATTR_ALIGNED(16)
+float init_4constant_float[4] = {4.0f, 4.0f, 4.0f, 4.0f};
__m128 _4constant_float = _mm_loadu_ps(init_4constant_float);
-for(number = 0; number < quarterPoints; number++)
+for (number = 0; number < quarterPoints; number++)
{
_code_phase_out = _mm_mul_ps(_code_phase_step_chips, _4output_index); //compute the code phase point with the phase step
_code_phase_out_with_offset = _mm_add_ps(_code_phase_out, _rem_code_phase); //add the phase offset
_code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset); //convert to integer
negative_indexes = _mm_cmplt_epi32(_code_phase_out_int, zero); //test for negative values
_code_phase_out_int_neg = _mm_add_epi32(_code_phase_out_int, _code_length_chips); //the negative values branch
-_code_phase_out_int_neg = _mm_xor_si128(_code_phase_out_int, _mm_and_si128( negative_indexes, _mm_xor_si128( _code_phase_out_int_neg, _code_phase_out_int )));
+_code_phase_out_int_neg = _mm_xor_si128(_code_phase_out_int, _mm_and_si128(negative_indexes, _mm_xor_si128(_code_phase_out_int_neg, _code_phase_out_int)));
overflow_indexes = _mm_cmpgt_epi32(_code_phase_out_int_neg, _code_length_chips_minus1); //test for overflow values
_code_phase_out_int_over = _mm_sub_epi32(_code_phase_out_int_neg, _code_length_chips); //the negative values branch
-_code_phase_out_int_over = _mm_xor_si128(_code_phase_out_int_neg, _mm_and_si128( overflow_indexes, _mm_xor_si128( _code_phase_out_int_over, _code_phase_out_int_neg )));
+_code_phase_out_int_over = _mm_xor_si128(_code_phase_out_int_neg, _mm_and_si128(overflow_indexes, _mm_xor_si128(_code_phase_out_int_over, _code_phase_out_int_neg)));
_mm_storeu_si128((__m128i*)local_code_chip_index, _code_phase_out_int_over); // Store the results back
//todo: optimize the local code lookup table with intrinsics, if possible
*_result++ = local_code[local_code_chip_index[0]];
@@ -234,7 +244,7 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_u_sse2(lv_16sc_t* resul
_4output_index = _mm_add_ps(_4output_index, _4constant_float);
}
-for(number = quarterPoints * 4; number < num_output_samples; number++)
+for (number = quarterPoints * 4; number < num_output_samples; number++)
{
local_code_chip_index[0] = (int)(code_phase_step_chips * (float)number + rem_code_phase_chips + 0.5f);
if (local_code_chip_index[0] < 0.0) local_code_chip_index[0] += code_length_chips - 1;
@@ -249,7 +259,7 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_u_sse2(lv_16sc_t* resul
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
-static inline void volk_gnsssdr_16ic_resampler_fast_16ic_neon(lv_16sc_t* result, const lv_16sc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, int code_length_chips, unsigned int num_output_samples)//, int* scratch_buffer, float* scratch_buffer_float)
+static inline void volk_gnsssdr_16ic_resampler_fast_16ic_neon(lv_16sc_t* result, const lv_16sc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, int code_length_chips, unsigned int num_output_samples) //, int* scratch_buffer, float* scratch_buffer_float)
{
unsigned int number;
const unsigned int quarterPoints = num_output_samples / 4;
@@ -257,57 +267,62 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_neon(lv_16sc_t* result,
lv_16sc_t* _result = result;
-__VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4];
+__VOLK_ATTR_ALIGNED(16)
+int local_code_chip_index[4];
float32x4_t _rem_code_phase, _code_phase_step_chips;
int32x4_t _code_length_chips, _code_length_chips_minus1;
float32x4_t _code_phase_out, _code_phase_out_with_offset;
rem_code_phase_chips = rem_code_phase_chips - 0.5f;
float32x4_t sign, PlusHalf, Round;
_rem_code_phase = vld1q_dup_f32(&rem_code_phase_chips); //load float to all four float values in m128 register
_code_phase_step_chips = vld1q_dup_f32(&code_phase_step_chips); //load float to all four float values in m128 register
-__VOLK_ATTR_ALIGNED(16) int four_times_code_length_chips_minus1[4];
+__VOLK_ATTR_ALIGNED(16)
+int four_times_code_length_chips_minus1[4];
four_times_code_length_chips_minus1[0] = code_length_chips - 1;
four_times_code_length_chips_minus1[1] = code_length_chips - 1;
four_times_code_length_chips_minus1[2] = code_length_chips - 1;
four_times_code_length_chips_minus1[3] = code_length_chips - 1;
-__VOLK_ATTR_ALIGNED(16) int four_times_code_length_chips[4];
+__VOLK_ATTR_ALIGNED(16)
+int four_times_code_length_chips[4];
four_times_code_length_chips[0] = code_length_chips;
four_times_code_length_chips[1] = code_length_chips;
four_times_code_length_chips[2] = code_length_chips;
four_times_code_length_chips[3] = code_length_chips;
_code_length_chips = vld1q_s32((int32_t*)&four_times_code_length_chips); //load float to all four float values in m128 register
_code_length_chips_minus1 = vld1q_s32((int32_t*)&four_times_code_length_chips_minus1); //load float to all four float values in m128 register
int32x4_t _code_phase_out_int, _code_phase_out_int_neg, _code_phase_out_int_over;
uint32x4_t negative_indexes, overflow_indexes;
int32x4_t zero = vmovq_n_s32(0);
-__VOLK_ATTR_ALIGNED(16) float init_idx_float[4] = { 0.0f, 1.0f, 2.0f, 3.0f };
+__VOLK_ATTR_ALIGNED(16)
+float init_idx_float[4] = {0.0f, 1.0f, 2.0f, 3.0f};
float32x4_t _4output_index = vld1q_f32(init_idx_float);
-__VOLK_ATTR_ALIGNED(16) float init_4constant_float[4] = { 4.0f, 4.0f, 4.0f, 4.0f };
+__VOLK_ATTR_ALIGNED(16)
+float init_4constant_float[4] = {4.0f, 4.0f, 4.0f, 4.0f};
float32x4_t _4constant_float = vld1q_f32(init_4constant_float);
-for(number = 0; number < quarterPoints; number++)
+for (number = 0; number < quarterPoints; number++)
{
_code_phase_out = vmulq_f32(_code_phase_step_chips, _4output_index); //compute the code phase point with the phase step
_code_phase_out_with_offset = vaddq_f32(_code_phase_out, _rem_code_phase); //add the phase offset
sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(_code_phase_out_with_offset), 31)));
PlusHalf = vaddq_f32(_code_phase_out_with_offset, half);
Round = vsubq_f32(PlusHalf, sign);
_code_phase_out_int = vcvtq_s32_f32(Round);
negative_indexes = vcltq_s32(_code_phase_out_int, zero); //test for negative values
_code_phase_out_int_neg = vaddq_s32(_code_phase_out_int, _code_length_chips); //the negative values branch
-_code_phase_out_int_neg = veorq_s32(_code_phase_out_int, vandq_s32( (int32x4_t)negative_indexes, veorq_s32( _code_phase_out_int_neg, _code_phase_out_int )));
+_code_phase_out_int_neg = veorq_s32(_code_phase_out_int, vandq_s32((int32x4_t)negative_indexes, veorq_s32(_code_phase_out_int_neg, _code_phase_out_int)));
overflow_indexes = vcgtq_s32(_code_phase_out_int_neg, _code_length_chips_minus1); //test for overflow values
_code_phase_out_int_over = vsubq_s32(_code_phase_out_int_neg, _code_length_chips); //the negative values branch
-_code_phase_out_int_over = veorq_s32(_code_phase_out_int_neg, vandq_s32( (int32x4_t)overflow_indexes, veorq_s32( _code_phase_out_int_over, _code_phase_out_int_neg )));
+_code_phase_out_int_over = veorq_s32(_code_phase_out_int_neg, vandq_s32((int32x4_t)overflow_indexes, veorq_s32(_code_phase_out_int_over, _code_phase_out_int_neg)));
vst1q_s32((int32_t*)local_code_chip_index, _code_phase_out_int_over); // Store the results back
//todo: optimize the local code lookup table with intrinsics, if possible
*_result++ = local_code[local_code_chip_index[0]];
@@ -318,7 +333,7 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_neon(lv_16sc_t* result,
_4output_index = vaddq_f32(_4output_index, _4constant_float);
}
-for(number = quarterPoints * 4; number < num_output_samples; number++)
+for (number = quarterPoints * 4; number < num_output_samples; number++)
{
local_code_chip_index[0] = (int)(code_phase_step_chips * (float)number + rem_code_phase_chips + 0.5f);
if (local_code_chip_index[0] < 0.0) local_code_chip_index[0] += code_length_chips - 1;
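Two idioms recur in the SIMD bodies above. First, out-of-range code indices are wrapped without branches through the classic select trick r = a ^ (mask & (b ^ a)), where the comparison intrinsic yields an all-ones mask in the lanes where the condition holds, so r equals b there and a elsewhere. Second, the NEON path has no rounding float-to-int conversion available here, so it builds round-to-nearest from truncation: sign extracts bit 31 (1.0f for negative lanes, 0.0f otherwise) and Round = x + 0.5f - sign before vcvtq_s32_f32 truncates toward zero. A scalar sketch of the select trick (helper name illustrative):

/* Scalar equivalent of _mm_xor_si128(a, _mm_and_si128(mask, _mm_xor_si128(b, a)))
   and of the matching veorq_s32/vandq_s32 sequence above. mask must be 0 or ~0. */
static inline int select_branchless(int mask, int a, int b)
{
    return a ^ (mask & (b ^ a)); /* mask == ~0 -> b, mask == 0 -> a */
}

/* Wrapping an index into [0, code_length_chips) with it:
   idx = select_branchless(-(idx < 0), idx, idx + code_length_chips);
   idx = select_branchless(-(idx > code_length_chips - 1), idx, idx - code_length_chips); */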

View File

@@ -44,7 +44,7 @@ static inline void volk_gnsssdr_16ic_resamplerfastpuppet_16ic_generic(lv_16sc_t*
float rem_code_phase_chips = -0.123;
float code_phase_step_chips = 0.1;
int code_length_chips = 1023;
volk_gnsssdr_16ic_resampler_fast_16ic_generic(result, local_code, rem_code_phase_chips, code_phase_step_chips, code_length_chips, num_points);
}
#endif /* LV_HAVE_GENERIC */
@@ -55,7 +55,7 @@ static inline void volk_gnsssdr_16ic_resamplerfastpuppet_16ic_a_sse2(lv_16sc_t*
float rem_code_phase_chips = -0.123;
float code_phase_step_chips = 0.1;
int code_length_chips = 1023;
-volk_gnsssdr_16ic_resampler_fast_16ic_a_sse2(result, local_code, rem_code_phase_chips, code_phase_step_chips, code_length_chips, num_points );
+volk_gnsssdr_16ic_resampler_fast_16ic_a_sse2(result, local_code, rem_code_phase_chips, code_phase_step_chips, code_length_chips, num_points);
}
#endif /* LV_HAVE_SSE2 */
@@ -67,7 +67,7 @@ static inline void volk_gnsssdr_16ic_resamplerfastpuppet_16ic_u_sse2(lv_16sc_t*
float rem_code_phase_chips = -0.123;
float code_phase_step_chips = 0.1;
int code_length_chips = 1023;
-volk_gnsssdr_16ic_resampler_fast_16ic_u_sse2(result, local_code, rem_code_phase_chips, code_phase_step_chips, code_length_chips, num_points );
+volk_gnsssdr_16ic_resampler_fast_16ic_u_sse2(result, local_code, rem_code_phase_chips, code_phase_step_chips, code_length_chips, num_points);
}
#endif /* LV_HAVE_SSE2 */
@@ -79,9 +79,9 @@ static inline void volk_gnsssdr_16ic_resamplerfastpuppet_16ic_neon(lv_16sc_t* re
float rem_code_phase_chips = -0.123;
float code_phase_step_chips = 0.1;
int code_length_chips = 1023;
-volk_gnsssdr_16ic_resampler_fast_16ic_neon(result, local_code, rem_code_phase_chips, code_phase_step_chips, code_length_chips, num_points );
+volk_gnsssdr_16ic_resampler_fast_16ic_neon(result, local_code, rem_code_phase_chips, code_phase_step_chips, code_length_chips, num_points);
}
#endif /* LV_HAVE_NEON */
#endif // INCLUDED_volk_gnsssdr_16ic_resamplerfastpuppet_16ic_H

View File

@@ -49,21 +49,21 @@ static inline void volk_gnsssdr_16ic_resamplerfastxnpuppet_16ic_generic(lv_16sc_
int num_out_vectors = 3;
unsigned int n;
float* rem_code_phase_chips = (float*)volk_gnsssdr_malloc(sizeof(float) * num_out_vectors, volk_gnsssdr_get_alignment());
lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
-for(n = 0; n < num_out_vectors; n++)
+for (n = 0; n < num_out_vectors; n++)
{
rem_code_phase_chips[n] = -0.234;
result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
}
volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_generic(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, code_length_chips, num_out_vectors, num_points);
memcpy((lv_16sc_t*)result, (lv_16sc_t*)result_aux[0], sizeof(lv_16sc_t) * num_points);
volk_gnsssdr_free(rem_code_phase_chips);
-for(n = 0; n < num_out_vectors; n++)
+for (n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
volk_gnsssdr_free(result_aux);
}
@@ -77,22 +77,22 @@ static inline void volk_gnsssdr_16ic_resamplerfastxnpuppet_16ic_a_sse2(lv_16sc_t
int code_length_chips = 2046;
int num_out_vectors = 3;
unsigned int n;
-float * rem_code_phase_chips = (float*)volk_gnsssdr_malloc(sizeof(float) * num_out_vectors, volk_gnsssdr_get_alignment());
+float* rem_code_phase_chips = (float*)volk_gnsssdr_malloc(sizeof(float) * num_out_vectors, volk_gnsssdr_get_alignment());
lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
-for(n = 0; n < num_out_vectors; n++)
+for (n = 0; n < num_out_vectors; n++)
{
rem_code_phase_chips[n] = -0.234;
result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
}
volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_a_sse2(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, code_length_chips, num_out_vectors, num_points);
memcpy(result, result_aux[0], sizeof(lv_16sc_t) * num_points);
volk_gnsssdr_free(rem_code_phase_chips);
-for(n = 0; n < num_out_vectors; n++)
+for (n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
volk_gnsssdr_free(result_aux);
}
@@ -106,22 +106,22 @@ static inline void volk_gnsssdr_16ic_resamplerfastxnpuppet_16ic_u_sse2(lv_16sc_t
int code_length_chips = 2046;
int num_out_vectors = 3;
unsigned int n;
-float * rem_code_phase_chips = (float*)volk_gnsssdr_malloc(sizeof(float) * num_out_vectors, volk_gnsssdr_get_alignment());
+float* rem_code_phase_chips = (float*)volk_gnsssdr_malloc(sizeof(float) * num_out_vectors, volk_gnsssdr_get_alignment());
lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
-for(n = 0; n < num_out_vectors; n++)
+for (n = 0; n < num_out_vectors; n++)
{
rem_code_phase_chips[n] = -0.234;
result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
}
volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_u_sse2(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, code_length_chips, num_out_vectors, num_points);
memcpy(result, result_aux[0], sizeof(lv_16sc_t) * num_points);
volk_gnsssdr_free(rem_code_phase_chips);
-for(n = 0; n < num_out_vectors; n++)
+for (n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
volk_gnsssdr_free(result_aux);
}
@@ -135,26 +135,26 @@ static inline void volk_gnsssdr_16ic_resamplerfastxnpuppet_16ic_neon(lv_16sc_t*
int code_length_chips = 2046;
int num_out_vectors = 3;
unsigned int n;
-float * rem_code_phase_chips = (float*)volk_gnsssdr_malloc(sizeof(float) * num_out_vectors, volk_gnsssdr_get_alignment());
+float* rem_code_phase_chips = (float*)volk_gnsssdr_malloc(sizeof(float) * num_out_vectors, volk_gnsssdr_get_alignment());
lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
-for(n = 0; n < num_out_vectors; n++)
+for (n = 0; n < num_out_vectors; n++)
{
rem_code_phase_chips[n] = -0.234;
result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
}
volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_neon(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, code_length_chips, num_out_vectors, num_points);
memcpy(result, result_aux[0], sizeof(lv_16sc_t) * num_points);
volk_gnsssdr_free(rem_code_phase_chips);
-for(n = 0; n < num_out_vectors; n++)
+for (n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
volk_gnsssdr_free(result_aux);
}
#endif
#endif // INCLUDED_volk_gnsssdr_16ic_resamplerpuppet_16ic_H
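The wrappers in this file all follow the same "puppet" pattern: the test machinery only knows the one-input/one-output signature (result, local_code, num_points), so each puppet fixes the remaining parameters of the multi-vector _xn kernel, allocates num_out_vectors scratch outputs, runs the kernel, and copies the first output back for comparison. A condensed sketch of that pattern (kernel_xn_under_test is a hypothetical stand-in for any of the _xn implementations):

lv_16sc_t** outs = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
for (n = 0; n < num_out_vectors; n++)
    {
        outs[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
    }
kernel_xn_under_test(outs, local_code, /* ...fixed parameters... */ num_out_vectors, num_points);
memcpy(result, outs[0], sizeof(lv_16sc_t) * num_points); /* only the first vector is compared */
for (n = 0; n < num_out_vectors; n++)
    {
        volk_gnsssdr_free(outs[n]);
    }
volk_gnsssdr_free(outs);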

View File

@@ -45,26 +45,26 @@
static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_generic(lv_16sc_t* result, const lv_16sc_t* local_code, unsigned int num_points)
{
int code_length_chips = 2046;
-float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points );
+float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
int num_out_vectors = 3;
unsigned int n;
float rem_code_phase_chips = -0.234;
-float shifts_chips[3] = { -0.1, 0.0, 0.1 };
+float shifts_chips[3] = {-0.1, 0.0, 0.1};
lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
-for(n = 0; n < num_out_vectors; n++)
+for (n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
}
volk_gnsssdr_16ic_xn_resampler_16ic_xn_generic(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
memcpy((lv_16sc_t*)result, (lv_16sc_t*)result_aux[0], sizeof(lv_16sc_t) * num_points);
-for(n = 0; n < num_out_vectors; n++)
+for (n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
volk_gnsssdr_free(result_aux);
}
@@ -75,26 +75,26 @@ static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_generic(lv_16sc_t* r
static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_a_sse3(lv_16sc_t* result, const lv_16sc_t* local_code, unsigned int num_points)
{
int code_length_chips = 2046;
-float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points );
+float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
int num_out_vectors = 3;
float rem_code_phase_chips = -0.234;
unsigned int n;
-float shifts_chips[3] = { -0.1, 0.0, 0.1 };
+float shifts_chips[3] = {-0.1, 0.0, 0.1};
lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
-for(n = 0; n < num_out_vectors; n++)
+for (n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
}
volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_sse3(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
memcpy((lv_16sc_t*)result, (lv_16sc_t*)result_aux[0], sizeof(lv_16sc_t) * num_points);
-for(n = 0; n < num_out_vectors; n++)
+for (n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
volk_gnsssdr_free(result_aux);
}
@@ -104,26 +104,26 @@ static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_a_sse3(lv_16sc_t* re
static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_u_sse3(lv_16sc_t* result, const lv_16sc_t* local_code, unsigned int num_points)
{
int code_length_chips = 2046;
-float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points );
+float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
int num_out_vectors = 3;
float rem_code_phase_chips = -0.234;
unsigned int n;
-float shifts_chips[3] = { -0.1, 0.0, 0.1 };
+float shifts_chips[3] = {-0.1, 0.0, 0.1};
lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
-for(n = 0; n < num_out_vectors; n++)
+for (n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
}
volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_sse3(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
memcpy((lv_16sc_t*)result, (lv_16sc_t*)result_aux[0], sizeof(lv_16sc_t) * num_points);
-for(n = 0; n < num_out_vectors; n++)
+for (n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
volk_gnsssdr_free(result_aux);
}
@@ -134,26 +134,26 @@ static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_u_sse3(lv_16sc_t* re
static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_u_sse4_1(lv_16sc_t* result, const lv_16sc_t* local_code, unsigned int num_points)
{
int code_length_chips = 2046;
-float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points );
+float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
int num_out_vectors = 3;
float rem_code_phase_chips = -0.234;
unsigned int n;
-float shifts_chips[3] = { -0.1, 0.0, 0.1 };
+float shifts_chips[3] = {-0.1, 0.0, 0.1};
lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
-for(n = 0; n < num_out_vectors; n++)
+for (n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
}
volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_sse4_1(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
memcpy((lv_16sc_t*)result, (lv_16sc_t*)result_aux[0], sizeof(lv_16sc_t) * num_points);
-for(n = 0; n < num_out_vectors; n++)
+for (n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
volk_gnsssdr_free(result_aux);
}
@@ -164,26 +164,26 @@ static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_u_sse4_1(lv_16sc_t*
static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_a_sse4_1(lv_16sc_t* result, const lv_16sc_t* local_code, unsigned int num_points)
{
int code_length_chips = 2046;
-float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points );
+float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
int num_out_vectors = 3;
float rem_code_phase_chips = -0.234;
unsigned int n;
-float shifts_chips[3] = { -0.1, 0.0, 0.1 };
+float shifts_chips[3] = {-0.1, 0.0, 0.1};
lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
-for(n = 0; n < num_out_vectors; n++)
+for (n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
}
volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_sse4_1(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
memcpy((lv_16sc_t*)result, (lv_16sc_t*)result_aux[0], sizeof(lv_16sc_t) * num_points);
-for(n = 0; n < num_out_vectors; n++)
+for (n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
volk_gnsssdr_free(result_aux);
}
@@ -194,26 +194,26 @@ static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_a_sse4_1(lv_16sc_t*
static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_u_avx(lv_16sc_t* result, const lv_16sc_t* local_code, unsigned int num_points)
{
int code_length_chips = 2046;
-float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points );
+float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
int num_out_vectors = 3;
float rem_code_phase_chips = -0.234;
unsigned int n;
-float shifts_chips[3] = { -0.1, 0.0, 0.1 };
+float shifts_chips[3] = {-0.1, 0.0, 0.1};
lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
-for(n = 0; n < num_out_vectors; n++)
+for (n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
}
volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_avx(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
memcpy((lv_16sc_t*)result, (lv_16sc_t*)result_aux[0], sizeof(lv_16sc_t) * num_points);
-for(n = 0; n < num_out_vectors; n++)
+for (n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
volk_gnsssdr_free(result_aux);
}
@@ -224,26 +224,26 @@ static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_u_avx(lv_16sc_t* res
static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_a_avx(lv_16sc_t* result, const lv_16sc_t* local_code, unsigned int num_points)
{
int code_length_chips = 2046;
-float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points );
+float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
int num_out_vectors = 3;
float rem_code_phase_chips = -0.234;
unsigned int n;
-float shifts_chips[3] = { -0.1, 0.0, 0.1 };
+float shifts_chips[3] = {-0.1, 0.0, 0.1};
lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
-for(n = 0; n < num_out_vectors; n++)
+for (n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
}
volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_avx(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
memcpy((lv_16sc_t*)result, (lv_16sc_t*)result_aux[0], sizeof(lv_16sc_t) * num_points);
-for(n = 0; n < num_out_vectors; n++)
+for (n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
volk_gnsssdr_free(result_aux);
}
@@ -254,29 +254,29 @@ static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_a_avx(lv_16sc_t* res
static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_neon(lv_16sc_t* result, const lv_16sc_t* local_code, unsigned int num_points)
{
int code_length_chips = 2046;
-float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points );
+float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
int num_out_vectors = 3;
float rem_code_phase_chips = -0.234;
unsigned int n;
-float shifts_chips[3] = { -0.1, 0.0, 0.1 };
+float shifts_chips[3] = {-0.1, 0.0, 0.1};
lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
-for(n = 0; n < num_out_vectors; n++)
+for (n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
}
volk_gnsssdr_16ic_xn_resampler_16ic_xn_neon(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
memcpy((lv_16sc_t*)result, (lv_16sc_t*)result_aux[0], sizeof(lv_16sc_t) * num_points);
-for(n = 0; n < num_out_vectors; n++)
+for (n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
volk_gnsssdr_free(result_aux);
}
#endif
#endif // INCLUDED_volk_gnsssdr_16ic_resamplerpuppet_16ic_H

View File

@ -70,7 +70,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_generic(lv_16sc_t* ou
unsigned int i = 0; unsigned int i = 0;
lv_16sc_t tmp16; lv_16sc_t tmp16;
lv_32fc_t tmp32; lv_32fc_t tmp32;
for(i = 0; i < (unsigned int)(num_points); ++i) for (i = 0; i < (unsigned int)(num_points); ++i)
{ {
tmp16 = *inVector++; tmp16 = *inVector++;
tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase); tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase);
@ -111,8 +111,8 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_generic_reload(lv_16s
*outVector++ = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32))); *outVector++ = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32)));
(*phase) *= phase_inc; (*phase) *= phase_inc;
} }
// Regenerate phase // Regenerate phase
//printf("Phase before regeneration %i: %f,%f Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase)); //printf("Phase before regeneration %i: %f,%f Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase));
#ifdef __cplusplus #ifdef __cplusplus
(*phase) /= std::abs((*phase)); (*phase) /= std::abs((*phase));
#else #else
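The hunk above is cut off inside the phase-regeneration step, but the step itself is short: after many finite-precision complex multiplies the modulus of the accumulated phasor drifts away from 1, which would scale the output amplitude, so every ROTATOR_RELOAD samples the kernel renormalizes

\[ p \leftarrow \frac{p}{|p|}, \qquad |p| = \sqrt{\Re(p)^2 + \Im(p)^2}. \]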
@@ -141,11 +141,13 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3(lv_16sc_t* out
unsigned int number; unsigned int number;
__m128 a, b, two_phase_acc_reg, two_phase_inc_reg; __m128 a, b, two_phase_acc_reg, two_phase_inc_reg;
__m128i c1, c2, result; __m128i c1, c2, result;
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_inc[2]; __VOLK_ATTR_ALIGNED(16)
lv_32fc_t two_phase_inc[2];
two_phase_inc[0] = phase_inc * phase_inc; two_phase_inc[0] = phase_inc * phase_inc;
two_phase_inc[1] = phase_inc * phase_inc; two_phase_inc[1] = phase_inc * phase_inc;
two_phase_inc_reg = _mm_load_ps((float*) two_phase_inc); two_phase_inc_reg = _mm_load_ps((float*)two_phase_inc);
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_acc[2]; __VOLK_ATTR_ALIGNED(16)
lv_32fc_t two_phase_acc[2];
two_phase_acc[0] = (*phase); two_phase_acc[0] = (*phase);
two_phase_acc[1] = (*phase) * phase_inc; two_phase_acc[1] = (*phase) * phase_inc;
two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc); two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc);
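The setup above is what lets the SSE3 kernel advance the rotator two samples per step: a 128-bit register holds two complex floats, the accumulator is seeded with \((p,\, p\,\Delta p)\), and the increment register carries \(\Delta p^{2}\) in both lanes, so each vector multiply moves both lanes forward by two samples:

\[ (p,\; p\,\Delta p)\cdot(\Delta p^{2},\; \Delta p^{2}) = (p\,\Delta p^{2},\; p\,\Delta p^{3}). \]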
@@ -157,49 +159,49 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3(lv_16sc_t* out
lv_16sc_t tmp16; lv_16sc_t tmp16;
lv_32fc_t tmp32; lv_32fc_t tmp32;
for(number = 0; number < sse_iters; number++) for (number = 0; number < sse_iters; number++)
{ {
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
//complex 32fc multiplication b=a*two_phase_acc_reg //complex 32fc multiplication b=a*two_phase_acc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
//next two samples //next two samples
_in += 2; _in += 2;
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
__VOLK_GNSSSDR_PREFETCH(_in + 8); __VOLK_GNSSSDR_PREFETCH(_in + 8);
//complex 32fc multiplication b=a*two_phase_acc_reg //complex 32fc multiplication b=a*two_phase_acc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
// store four output samples // store four output samples
result = _mm_packs_epi32(c1, c2);// convert from 32ic to 16ic result = _mm_packs_epi32(c1, c2); // convert from 32ic to 16ic
_mm_store_si128((__m128i*)_out, result); _mm_store_si128((__m128i*)_out, result);
// Regenerate phase // Regenerate phase
@@ -232,7 +234,6 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3(lv_16sc_t* out
#endif /* LV_HAVE_SSE3 */ #endif /* LV_HAVE_SSE3 */
#ifdef LV_HAVE_SSE3 #ifdef LV_HAVE_SSE3
#include <pmmintrin.h> #include <pmmintrin.h>
@@ -244,11 +245,13 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3_reload(lv_16sc
unsigned int j; unsigned int j;
__m128 a, b, two_phase_acc_reg, two_phase_inc_reg; __m128 a, b, two_phase_acc_reg, two_phase_inc_reg;
__m128i c1, c2, result; __m128i c1, c2, result;
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_inc[2]; __VOLK_ATTR_ALIGNED(16)
lv_32fc_t two_phase_inc[2];
two_phase_inc[0] = phase_inc * phase_inc; two_phase_inc[0] = phase_inc * phase_inc;
two_phase_inc[1] = phase_inc * phase_inc; two_phase_inc[1] = phase_inc * phase_inc;
two_phase_inc_reg = _mm_load_ps((float*) two_phase_inc); two_phase_inc_reg = _mm_load_ps((float*)two_phase_inc);
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_acc[2]; __VOLK_ATTR_ALIGNED(16)
lv_32fc_t two_phase_acc[2];
two_phase_acc[0] = (*phase); two_phase_acc[0] = (*phase);
two_phase_acc[1] = (*phase) * phase_inc; two_phase_acc[1] = (*phase) * phase_inc;
two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc); two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc);
@@ -265,47 +268,47 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3_reload(lv_16sc
{ {
for (j = 0; j < ROTATOR_RELOAD; j++) for (j = 0; j < ROTATOR_RELOAD; j++)
{ {
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
//complex 32fc multiplication b=a*two_phase_acc_reg //complex 32fc multiplication b=a*two_phase_acc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
//next two samples //next two samples
_in += 2; _in += 2;
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
__VOLK_GNSSSDR_PREFETCH(_in + 8); __VOLK_GNSSSDR_PREFETCH(_in + 8);
//complex 32fc multiplication b=a*two_phase_acc_reg //complex 32fc multiplication b=a*two_phase_acc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
// store four output samples // store four output samples
result = _mm_packs_epi32(c1, c2);// convert from 32ic to 16ic result = _mm_packs_epi32(c1, c2); // convert from 32ic to 16ic
_mm_store_si128((__m128i*)_out, result); _mm_store_si128((__m128i*)_out, result);
//next two samples //next two samples
@@ -322,47 +325,47 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3_reload(lv_16sc
for (j = 0; j < sse_iters % ROTATOR_RELOAD; j++) for (j = 0; j < sse_iters % ROTATOR_RELOAD; j++)
{ {
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
//complex 32fc multiplication b=a*two_phase_acc_reg //complex 32fc multiplication b=a*two_phase_acc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
//next two samples //next two samples
_in += 2; _in += 2;
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
__VOLK_GNSSSDR_PREFETCH(_in + 8); __VOLK_GNSSSDR_PREFETCH(_in + 8);
//complex 32fc multiplication b=a*two_phase_acc_reg //complex 32fc multiplication b=a*two_phase_acc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
// store four output samples // store four output samples
result = _mm_packs_epi32(c1, c2);// convert from 32ic to 16ic result = _mm_packs_epi32(c1, c2); // convert from 32ic to 16ic
_mm_store_si128((__m128i*)_out, result); _mm_store_si128((__m128i*)_out, result);
//next two samples //next two samples
@@ -385,7 +388,6 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3_reload(lv_16sc
#endif /* LV_HAVE_SSE3 */ #endif /* LV_HAVE_SSE3 */
#ifdef LV_HAVE_SSE3 #ifdef LV_HAVE_SSE3
#include <pmmintrin.h> #include <pmmintrin.h>
@@ -395,14 +397,16 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3(lv_16sc_t* out
unsigned int number; unsigned int number;
__m128 a, b, two_phase_acc_reg, two_phase_inc_reg; __m128 a, b, two_phase_acc_reg, two_phase_inc_reg;
__m128i c1, c2, result; __m128i c1, c2, result;
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_inc[2]; __VOLK_ATTR_ALIGNED(16)
lv_32fc_t two_phase_inc[2];
two_phase_inc[0] = phase_inc * phase_inc; two_phase_inc[0] = phase_inc * phase_inc;
two_phase_inc[1] = phase_inc * phase_inc; two_phase_inc[1] = phase_inc * phase_inc;
two_phase_inc_reg = _mm_load_ps((float*) two_phase_inc); two_phase_inc_reg = _mm_load_ps((float*)two_phase_inc);
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_acc[2]; __VOLK_ATTR_ALIGNED(16)
lv_32fc_t two_phase_acc[2];
two_phase_acc[0] = (*phase); two_phase_acc[0] = (*phase);
two_phase_acc[1] = (*phase) * phase_inc; two_phase_acc[1] = (*phase) * phase_inc;
two_phase_acc_reg = _mm_load_ps((float*) two_phase_acc); two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc);
const lv_16sc_t* _in = inVector; const lv_16sc_t* _in = inVector;
@@ -412,49 +416,49 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3(lv_16sc_t* out
lv_16sc_t tmp16; lv_16sc_t tmp16;
lv_32fc_t tmp32; lv_32fc_t tmp32;
for(number = 0; number < sse_iters; number++) for (number = 0; number < sse_iters; number++)
{ {
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
//complex 32fc multiplication b=a*two_phase_acc_reg //complex 32fc multiplication b=a*two_phase_acc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
//next two samples //next two samples
_in += 2; _in += 2;
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
__VOLK_GNSSSDR_PREFETCH(_in + 8); __VOLK_GNSSSDR_PREFETCH(_in + 8);
//complex 32fc multiplication b=a*two_phase_acc_reg //complex 32fc multiplication b=a*two_phase_acc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
// store four output samples // store four output samples
result = _mm_packs_epi32(c1, c2);// convert from 32ic to 16ic result = _mm_packs_epi32(c1, c2); // convert from 32ic to 16ic
_mm_storeu_si128((__m128i*)_out, result); _mm_storeu_si128((__m128i*)_out, result);
// Regenerate phase // Regenerate phase
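Before the hunk above trails off into the same regeneration step, note the one real difference from the aligned kernel: the _u_ variants use unaligned stores (_mm_storeu_si128) while the _a_ variants use _mm_store_si128 and therefore require buffers allocated at the machine alignment. A minimal allocation sketch, using only the API already shown in this changeset:

    lv_16sc_t* buf = (lv_16sc_t*)volk_gnsssdr_malloc(num_points * sizeof(lv_16sc_t), volk_gnsssdr_get_alignment());
    /* ... fill buf, call an _a_ kernel on it ... */
    volk_gnsssdr_free(buf);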
@@ -493,147 +497,149 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3(lv_16sc_t* out
static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3_reload(lv_16sc_t* outVector, const lv_16sc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points) static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3_reload(lv_16sc_t* outVector, const lv_16sc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points)
{ {
const unsigned int sse_iters = num_points / 4; const unsigned int sse_iters = num_points / 4;
unsigned int ROTATOR_RELOAD = 512; unsigned int ROTATOR_RELOAD = 512;
unsigned int n; unsigned int n;
unsigned int j; unsigned int j;
__m128 a, b, two_phase_acc_reg, two_phase_inc_reg; __m128 a, b, two_phase_acc_reg, two_phase_inc_reg;
__m128i c1, c2, result; __m128i c1, c2, result;
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_inc[2]; __VOLK_ATTR_ALIGNED(16)
two_phase_inc[0] = phase_inc * phase_inc; lv_32fc_t two_phase_inc[2];
two_phase_inc[1] = phase_inc * phase_inc; two_phase_inc[0] = phase_inc * phase_inc;
two_phase_inc_reg = _mm_load_ps((float*) two_phase_inc); two_phase_inc[1] = phase_inc * phase_inc;
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_acc[2]; two_phase_inc_reg = _mm_load_ps((float*)two_phase_inc);
two_phase_acc[0] = (*phase); __VOLK_ATTR_ALIGNED(16)
two_phase_acc[1] = (*phase) * phase_inc; lv_32fc_t two_phase_acc[2];
two_phase_acc_reg = _mm_load_ps((float*) two_phase_acc); two_phase_acc[0] = (*phase);
two_phase_acc[1] = (*phase) * phase_inc;
two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc);
const lv_16sc_t* _in = inVector; const lv_16sc_t* _in = inVector;
lv_16sc_t* _out = outVector; lv_16sc_t* _out = outVector;
__m128 yl, yh, tmp1, tmp2, tmp3; __m128 yl, yh, tmp1, tmp2, tmp3;
lv_16sc_t tmp16; lv_16sc_t tmp16;
lv_32fc_t tmp32; lv_32fc_t tmp32;
for (n = 0; n < sse_iters / ROTATOR_RELOAD; n++) for (n = 0; n < sse_iters / ROTATOR_RELOAD; n++)
{ {
for (j = 0; j < ROTATOR_RELOAD; j++) for (j = 0; j < ROTATOR_RELOAD; j++)
{ {
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
//complex 32fc multiplication b=a*two_phase_acc_reg //complex 32fc multiplication b=a*two_phase_acc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
//next two samples //next two samples
_in += 2; _in += 2;
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
__VOLK_GNSSSDR_PREFETCH(_in + 8); __VOLK_GNSSSDR_PREFETCH(_in + 8);
//complex 32fc multiplication b=a*two_phase_acc_reg //complex 32fc multiplication b=a*two_phase_acc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
// store four output samples // store four output samples
result = _mm_packs_epi32(c1, c2);// convert from 32ic to 16ic result = _mm_packs_epi32(c1, c2); // convert from 32ic to 16ic
_mm_storeu_si128((__m128i*)_out, result); _mm_storeu_si128((__m128i*)_out, result);
//next two samples //next two samples
_in += 2; _in += 2;
_out += 4; _out += 4;
} }
// Regenerate phase // Regenerate phase
tmp1 = _mm_mul_ps(two_phase_acc_reg, two_phase_acc_reg); tmp1 = _mm_mul_ps(two_phase_acc_reg, two_phase_acc_reg);
tmp2 = _mm_hadd_ps(tmp1, tmp1); tmp2 = _mm_hadd_ps(tmp1, tmp1);
tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8); tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
tmp2 = _mm_sqrt_ps(tmp1); tmp2 = _mm_sqrt_ps(tmp1);
two_phase_acc_reg = _mm_div_ps(two_phase_acc_reg, tmp2); two_phase_acc_reg = _mm_div_ps(two_phase_acc_reg, tmp2);
} }
for (j = 0; j < sse_iters % ROTATOR_RELOAD; j++) for (j = 0; j < sse_iters % ROTATOR_RELOAD; j++)
{ {
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
//complex 32fc multiplication b=a*two_phase_acc_reg //complex 32fc multiplication b=a*two_phase_acc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
//next two samples //next two samples
_in += 2; _in += 2;
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0]))); // //load (2 byte imag, 2 byte real) x 2 into 128 bits reg
__VOLK_GNSSSDR_PREFETCH(_in + 8); __VOLK_GNSSSDR_PREFETCH(_in + 8);
//complex 32fc multiplication b=a*two_phase_acc_reg //complex 32fc multiplication b=a*two_phase_acc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg //complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
// store four output samples // store four output samples
result = _mm_packs_epi32(c1, c2);// convert from 32ic to 16ic result = _mm_packs_epi32(c1, c2); // convert from 32ic to 16ic
_mm_storeu_si128((__m128i*)_out, result); _mm_storeu_si128((__m128i*)_out, result);
//next two samples //next two samples
_in += 2; _in += 2;
_out += 4; _out += 4;
} }
_mm_store_ps((float*)two_phase_acc, two_phase_acc_reg); _mm_store_ps((float*)two_phase_acc, two_phase_acc_reg);
(*phase) = two_phase_acc[0]; (*phase) = two_phase_acc[0];
for (n = sse_iters * 4; n < num_points; ++n) for (n = sse_iters * 4; n < num_points; ++n)
{ {
tmp16 = *_in++; tmp16 = *_in++;
tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase); tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase);
*_out++ = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32))); *_out++ = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32)));
(*phase) *= phase_inc; (*phase) *= phase_inc;
} }
} }
#endif /* LV_HAVE_SSE3 */ #endif /* LV_HAVE_SSE3 */
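It is worth spelling out the vectorized renormalization that closes each ROTATOR_RELOAD block in the kernels above. Squaring the accumulator gives \((r_0^2, i_0^2, r_1^2, i_1^2)\); _mm_hadd_ps collapses adjacent pairs into the two squared moduli; the 0xD8 shuffle replicates each modulus across its own complex lane; the square root and division then rescale both phasors to unit modulus:

\[ (r_0, i_0, r_1, i_1) \;\mapsto\; \left(\tfrac{r_0}{|p_0|},\; \tfrac{i_0}{|p_0|},\; \tfrac{r_1}{|p_1|},\; \tfrac{i_1}{|p_1|}\right), \qquad |p_k| = \sqrt{r_k^2 + i_k^2}. \]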
@@ -657,8 +663,10 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_neon(lv_16sc_t* outVe
lv_16sc_t* _out = outVector; lv_16sc_t* _out = outVector;
lv_32fc_t ___phase4 = phase_inc * phase_inc * phase_inc * phase_inc; lv_32fc_t ___phase4 = phase_inc * phase_inc * phase_inc * phase_inc;
__VOLK_ATTR_ALIGNED(16) float32_t __phase4_real[4] = { lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4) }; __VOLK_ATTR_ALIGNED(16)
__VOLK_ATTR_ALIGNED(16) float32_t __phase4_imag[4] = { lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4) }; float32_t __phase4_real[4] = {lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4)};
__VOLK_ATTR_ALIGNED(16)
float32_t __phase4_imag[4] = {lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4)};
float32x4_t _phase4_real = vld1q_f32(__phase4_real); float32x4_t _phase4_real = vld1q_f32(__phase4_real);
float32x4_t _phase4_imag = vld1q_f32(__phase4_imag); float32x4_t _phase4_imag = vld1q_f32(__phase4_imag);
@@ -667,8 +675,10 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_neon(lv_16sc_t* outVe
lv_32fc_t phase3 = phase2 * phase_inc; lv_32fc_t phase3 = phase2 * phase_inc;
lv_32fc_t phase4 = phase3 * phase_inc; lv_32fc_t phase4 = phase3 * phase_inc;
__VOLK_ATTR_ALIGNED(16) float32_t __phase_real[4] = { lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) }; __VOLK_ATTR_ALIGNED(16)
__VOLK_ATTR_ALIGNED(16) float32_t __phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; float32_t __phase_real[4] = {lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4)};
__VOLK_ATTR_ALIGNED(16)
float32_t __phase_imag[4] = {lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4)};
float32x4_t _phase_real = vld1q_f32(__phase_real); float32x4_t _phase_real = vld1q_f32(__phase_real);
float32x4_t _phase_imag = vld1q_f32(__phase_imag); float32x4_t _phase_imag = vld1q_f32(__phase_imag);
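The NEON rotator keeps its phases in planar form: one register of four consecutive phasor real parts and one of the matching imaginary parts, seeded with \((p,\, p\Delta p,\, p\Delta p^{2},\, p\Delta p^{3})\). Each iteration then advances all four lanes at once by multiplying with the broadcast \(\Delta p^{4}\) built just above:

\[ (p_n,\; p_n\Delta p,\; p_n\Delta p^{2},\; p_n\Delta p^{3})\cdot \Delta p^{4} = (p_{n+4},\; p_{n+5},\; p_{n+6},\; p_{n+7}). \]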
@@ -681,7 +691,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_neon(lv_16sc_t* outVe
if (neon_iters > 0) if (neon_iters > 0)
{ {
for(; i < neon_iters; ++i) for (; i < neon_iters; ++i)
{ {
/* load 4 complex numbers (int 16 bits each component) */ /* load 4 complex numbers (int 16 bits each component) */
tmp16 = vld2_s16((int16_t*)_in); tmp16 = vld2_s16((int16_t*)_in);
@@ -745,8 +755,10 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_neon(lv_16sc_t* outVe
phase3 = phase2 * phase_inc; phase3 = phase2 * phase_inc;
phase4 = phase3 * phase_inc; phase4 = phase3 * phase_inc;
__VOLK_ATTR_ALIGNED(16) float32_t ____phase_real[4] = { lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) }; __VOLK_ATTR_ALIGNED(16)
__VOLK_ATTR_ALIGNED(16) float32_t ____phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; float32_t ____phase_real[4] = {lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4)};
__VOLK_ATTR_ALIGNED(16)
float32_t ____phase_imag[4] = {lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4)};
_phase_real = vld1q_f32(____phase_real); _phase_real = vld1q_f32(____phase_real);
_phase_imag = vld1q_f32(____phase_imag); _phase_imag = vld1q_f32(____phase_imag);
@@ -757,7 +769,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_neon(lv_16sc_t* outVe
(*phase) = lv_cmake((float32_t)__phase_real[0], (float32_t)__phase_imag[0]); (*phase) = lv_cmake((float32_t)__phase_real[0], (float32_t)__phase_imag[0]);
} }
for(i = 0; i < neon_iters % 4; ++i) for (i = 0; i < neon_iters % 4; ++i)
{ {
tmp16_ = *_in++; tmp16_ = *_in++;
tmp32_ = lv_cmake((float32_t)lv_creal(tmp16_), (float32_t)lv_cimag(tmp16_)) * (*phase); tmp32_ = lv_cmake((float32_t)lv_creal(tmp16_), (float32_t)lv_cimag(tmp16_)) * (*phase);
@@ -791,8 +803,10 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_neon_reload(lv_16sc_t
lv_16sc_t* _out = outVector; lv_16sc_t* _out = outVector;
lv_32fc_t ___phase4 = phase_inc * phase_inc * phase_inc * phase_inc; lv_32fc_t ___phase4 = phase_inc * phase_inc * phase_inc * phase_inc;
__VOLK_ATTR_ALIGNED(16) float32_t __phase4_real[4] = { lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4) }; __VOLK_ATTR_ALIGNED(16)
__VOLK_ATTR_ALIGNED(16) float32_t __phase4_imag[4] = { lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4) }; float32_t __phase4_real[4] = {lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4)};
__VOLK_ATTR_ALIGNED(16)
float32_t __phase4_imag[4] = {lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4)};
float32x4_t _phase4_real = vld1q_f32(__phase4_real); float32x4_t _phase4_real = vld1q_f32(__phase4_real);
float32x4_t _phase4_imag = vld1q_f32(__phase4_imag); float32x4_t _phase4_imag = vld1q_f32(__phase4_imag);
@@ -801,8 +815,10 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_neon_reload(lv_16sc_t
lv_32fc_t phase3 = phase2 * phase_inc; lv_32fc_t phase3 = phase2 * phase_inc;
lv_32fc_t phase4 = phase3 * phase_inc; lv_32fc_t phase4 = phase3 * phase_inc;
__VOLK_ATTR_ALIGNED(16) float32_t __phase_real[4] = { lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) }; __VOLK_ATTR_ALIGNED(16)
__VOLK_ATTR_ALIGNED(16) float32_t __phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; float32_t __phase_real[4] = {lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4)};
__VOLK_ATTR_ALIGNED(16)
float32_t __phase_imag[4] = {lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4)};
float32x4_t _phase_real = vld1q_f32(__phase_real); float32x4_t _phase_real = vld1q_f32(__phase_real);
float32x4_t _phase_imag = vld1q_f32(__phase_imag); float32x4_t _phase_imag = vld1q_f32(__phase_imag);
@@ -879,8 +895,10 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_neon_reload(lv_16sc_t
phase3 = phase2 * phase_inc; phase3 = phase2 * phase_inc;
phase4 = phase3 * phase_inc; phase4 = phase3 * phase_inc;
__VOLK_ATTR_ALIGNED(16) float32_t ____phase_real[4] = { lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) }; __VOLK_ATTR_ALIGNED(16)
__VOLK_ATTR_ALIGNED(16) float32_t ____phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; float32_t ____phase_real[4] = {lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4)};
__VOLK_ATTR_ALIGNED(16)
float32_t ____phase_imag[4] = {lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4)};
_phase_real = vld1q_f32(____phase_real); _phase_real = vld1q_f32(____phase_real);
_phase_imag = vld1q_f32(____phase_imag); _phase_imag = vld1q_f32(____phase_imag);
@@ -945,7 +963,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_neon_reload(lv_16sc_t
(*phase) = lv_cmake((float32_t)__phase_real[0], (float32_t)__phase_imag[0]); (*phase) = lv_cmake((float32_t)__phase_real[0], (float32_t)__phase_imag[0]);
} }
for(i = 0; i < neon_iters % 4; ++i) for (i = 0; i < neon_iters % 4; ++i)
{ {
tmp16_ = *_in++; tmp16_ = *_in++;
tmp32_ = lv_cmake((float32_t)lv_creal(tmp16_), (float32_t)lv_cimag(tmp16_)) * (*phase); tmp32_ = lv_cmake((float32_t)lv_creal(tmp16_), (float32_t)lv_cimag(tmp16_)) * (*phase);
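In that planar layout a complex multiply decomposes into four real multiplies and two fused accumulations. A hedged sketch of the pattern the NEON loop relies on (register names a_re/a_im/b_re/b_im are illustrative, not taken from the kernel; requires arm_neon.h):

    float32x4_t re = vmulq_f32(a_re, b_re);
    re = vmlsq_f32(re, a_im, b_im);     /* re = a_re*b_re - a_im*b_im */
    float32x4_t im = vmulq_f32(a_re, b_im);
    im = vmlaq_f32(im, a_im, b_re);     /* im = a_re*b_im + a_im*b_re */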
@@ -73,7 +73,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_generic(lv_16sc_t* result,
for (n = 0; n < num_points; n++) for (n = 0; n < num_points; n++)
{ {
lv_16sc_t tmp = in_a[n] * in_b[n]; lv_16sc_t tmp = in_a[n] * in_b[n];
result[0] = lv_cmake(sat_adds16i(lv_creal(result[0]), lv_creal(tmp)), sat_adds16i(lv_cimag(result[0]), lv_cimag(tmp) )); result[0] = lv_cmake(sat_adds16i(lv_creal(result[0]), lv_creal(tmp)), sat_adds16i(lv_cimag(result[0]), lv_cimag(tmp)));
} }
} }
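The generic kernel accumulates with sat_adds16i rather than a plain + so that its overflow behavior matches the SIMD variants, which use the saturating _mm_adds_epi16 / _mm256_adds_epi16 instructions. A minimal sketch of such a saturating add (the library ships its own version; this one is illustrative):

    #include <limits.h> /* SHRT_MAX, SHRT_MIN */
    #include <stdint.h> /* int16_t, int32_t */

    static inline int16_t sat_adds16i_sketch(int16_t x, int16_t y)
    {
        int32_t res = (int32_t)x + (int32_t)y; /* widen so the true sum is representable */
        if (res > SHRT_MAX) res = SHRT_MAX;    /* clamp instead of wrapping */
        if (res < SHRT_MIN) res = SHRT_MIN;
        return (int16_t)res;
    }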
@@ -96,7 +96,8 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_a_sse2(lv_16sc_t* out, con
if (sse_iters > 0) if (sse_iters > 0)
{ {
__m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, realcacc, imagcacc; __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, realcacc, imagcacc;
__VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; __VOLK_ATTR_ALIGNED(16)
lv_16sc_t dotProductVector[4];
realcacc = _mm_setzero_si128(); realcacc = _mm_setzero_si128();
imagcacc = _mm_setzero_si128(); imagcacc = _mm_setzero_si128();
@@ -104,25 +105,25 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_a_sse2(lv_16sc_t* out, con
mask_imag = _mm_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); mask_imag = _mm_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
mask_real = _mm_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); mask_real = _mm_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
for(number = 0; number < sse_iters; number++) for (number = 0; number < sse_iters; number++)
{ {
// a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r] // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
a = _mm_load_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg a = _mm_load_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
__VOLK_GNSSSDR_PREFETCH(_in_a + 8); __VOLK_GNSSSDR_PREFETCH(_in_a + 8);
b = _mm_load_si128((__m128i*)_in_b); b = _mm_load_si128((__m128i*)_in_b);
__VOLK_GNSSSDR_PREFETCH(_in_b + 8); __VOLK_GNSSSDR_PREFETCH(_in_b + 8);
c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....
c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
real = _mm_subs_epi16(c, c_sr); real = _mm_subs_epi16(c, c_sr);
b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....
imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
imag = _mm_adds_epi16(imag1, imag2); //with saturation arithmetic! imag = _mm_adds_epi16(imag1, imag2); //with saturation arithmetic!
realcacc = _mm_adds_epi16(realcacc, real); realcacc = _mm_adds_epi16(realcacc, real);
imagcacc = _mm_adds_epi16(imagcacc, imag); imagcacc = _mm_adds_epi16(imagcacc, imag);
@@ -136,7 +137,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_a_sse2(lv_16sc_t* out, con
a = _mm_or_si128(realcacc, imagcacc); a = _mm_or_si128(realcacc, imagcacc);
_mm_store_si128((__m128i*)dotProductVector, a); // Store the results back into the dot product vector _mm_store_si128((__m128i*)dotProductVector, a); // Store the results back into the dot product vector
for (number = 0; number < 4; ++number) for (number = 0; number < 4; ++number)
{ {
@@ -174,7 +175,8 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_u_sse2(lv_16sc_t* out, con
if (sse_iters > 0) if (sse_iters > 0)
{ {
__m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, realcacc, imagcacc, result; __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, realcacc, imagcacc, result;
__VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; __VOLK_ATTR_ALIGNED(16)
lv_16sc_t dotProductVector[4];
realcacc = _mm_setzero_si128(); realcacc = _mm_setzero_si128();
imagcacc = _mm_setzero_si128(); imagcacc = _mm_setzero_si128();
@@ -182,27 +184,27 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_u_sse2(lv_16sc_t* out, con
mask_imag = _mm_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); mask_imag = _mm_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
mask_real = _mm_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); mask_real = _mm_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
for(number = 0; number < sse_iters; number++) for (number = 0; number < sse_iters; number++)
{ {
//std::complex<T> memory structure: real part -> reinterpret_cast<cv T*>(a)[2*i] //std::complex<T> memory structure: real part -> reinterpret_cast<cv T*>(a)[2*i]
//imaginary part -> reinterpret_cast<cv T*>(a)[2*i + 1] //imaginary part -> reinterpret_cast<cv T*>(a)[2*i + 1]
// a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r] // a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
a = _mm_loadu_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg a = _mm_loadu_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
__VOLK_GNSSSDR_PREFETCH(_in_a + 8); __VOLK_GNSSSDR_PREFETCH(_in_a + 8);
b = _mm_loadu_si128((__m128i*)_in_b); b = _mm_loadu_si128((__m128i*)_in_b);
__VOLK_GNSSSDR_PREFETCH(_in_b + 8); __VOLK_GNSSSDR_PREFETCH(_in_b + 8);
c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....
c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
real = _mm_subs_epi16(c, c_sr); real = _mm_subs_epi16(c, c_sr);
b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i .... b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i .... a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....
imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, .... imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, .... imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
imag = _mm_adds_epi16(imag1, imag2); //with saturation arithmetic! imag = _mm_adds_epi16(imag1, imag2); //with saturation arithmetic!
realcacc = _mm_adds_epi16(realcacc, real); realcacc = _mm_adds_epi16(realcacc, real);
imagcacc = _mm_adds_epi16(imagcacc, imag); imagcacc = _mm_adds_epi16(imagcacc, imag);
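The SSE2 (and, below, AVX2) dot products realize the textbook complex product directly on interleaved 16-bit lanes: _mm_mullo_epi16 on a and b gives the pairwise products; subtracting the copy shifted right by 2 bytes leaves the real parts a_r*b_r - a_i*b_i in the even lanes; the two products against byte-shifted operands supply a_i*b_r and a_r*b_i, whose saturating sum forms the imaginary parts in the odd lanes; mask_real/mask_imag finally blend the two accumulators back into interleaved complex form. In short, per sample pair:

\[ \Re(ab) = a_r b_r - a_i b_i, \qquad \Im(ab) = a_r b_i + a_i b_r. \]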
@@ -216,7 +218,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_u_sse2(lv_16sc_t* out, con
result = _mm_or_si128(realcacc, imagcacc); result = _mm_or_si128(realcacc, imagcacc);
_mm_storeu_si128((__m128i*)dotProductVector, result); // Store the results back into the dot product vector _mm_storeu_si128((__m128i*)dotProductVector, result); // Store the results back into the dot product vector
for (i = 0; i < 4; ++i) for (i = 0; i < 4; ++i)
{ {
@@ -253,7 +255,8 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_u_axv2(lv_16sc_t* out, con
if (avx_iters > 0) if (avx_iters > 0)
{ {
__m256i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, realcacc, imagcacc, result; __m256i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, realcacc, imagcacc, result;
__VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8]; __VOLK_ATTR_ALIGNED(32)
lv_16sc_t dotProductVector[8];
realcacc = _mm256_setzero_si256(); realcacc = _mm256_setzero_si256();
imagcacc = _mm256_setzero_si256(); imagcacc = _mm256_setzero_si256();
@@ -261,7 +264,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_u_axv2(lv_16sc_t* out, con
mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
for(number = 0; number < avx_iters; number++) for (number = 0; number < avx_iters; number++)
{ {
a = _mm256_loadu_si256((__m256i*)_in_a); a = _mm256_loadu_si256((__m256i*)_in_a);
__VOLK_GNSSSDR_PREFETCH(_in_a + 16); __VOLK_GNSSSDR_PREFETCH(_in_a + 16);
@@ -269,7 +272,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_u_axv2(lv_16sc_t* out, con
__VOLK_GNSSSDR_PREFETCH(_in_b + 16); __VOLK_GNSSSDR_PREFETCH(_in_b + 16);
c = _mm256_mullo_epi16(a, b); c = _mm256_mullo_epi16(a, b);
c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
real = _mm256_subs_epi16(c, c_sr); real = _mm256_subs_epi16(c, c_sr);
b_sl = _mm256_slli_si256(b, 2); b_sl = _mm256_slli_si256(b, 2);
@@ -278,7 +281,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_u_axv2(lv_16sc_t* out, con
imag1 = _mm256_mullo_epi16(a, b_sl); imag1 = _mm256_mullo_epi16(a, b_sl);
imag2 = _mm256_mullo_epi16(b, a_sl); imag2 = _mm256_mullo_epi16(b, a_sl);
imag = _mm256_adds_epi16(imag1, imag2); //with saturation arithmetic! imag = _mm256_adds_epi16(imag1, imag2); //with saturation arithmetic!
realcacc = _mm256_adds_epi16(realcacc, real); realcacc = _mm256_adds_epi16(realcacc, real);
imagcacc = _mm256_adds_epi16(imagcacc, imag); imagcacc = _mm256_adds_epi16(imagcacc, imag);
@@ -292,7 +295,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_u_axv2(lv_16sc_t* out, con
result = _mm256_or_si256(realcacc, imagcacc); result = _mm256_or_si256(realcacc, imagcacc);
_mm256_storeu_si256((__m256i*)dotProductVector, result); // Store the results back into the dot product vector _mm256_storeu_si256((__m256i*)dotProductVector, result); // Store the results back into the dot product vector
_mm256_zeroupper(); _mm256_zeroupper();
for (i = 0; i < 8; ++i) for (i = 0; i < 8; ++i)
@@ -330,7 +333,8 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_a_axv2(lv_16sc_t* out, con
if (avx_iters > 0) if (avx_iters > 0)
{ {
__m256i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, realcacc, imagcacc, result; __m256i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, realcacc, imagcacc, result;
__VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8]; __VOLK_ATTR_ALIGNED(32)
lv_16sc_t dotProductVector[8];
realcacc = _mm256_setzero_si256(); realcacc = _mm256_setzero_si256();
imagcacc = _mm256_setzero_si256(); imagcacc = _mm256_setzero_si256();
@@ -338,7 +342,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_a_axv2(lv_16sc_t* out, con
mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
for(number = 0; number < avx_iters; number++) for (number = 0; number < avx_iters; number++)
{ {
a = _mm256_load_si256((__m256i*)_in_a); a = _mm256_load_si256((__m256i*)_in_a);
__VOLK_GNSSSDR_PREFETCH(_in_a + 16); __VOLK_GNSSSDR_PREFETCH(_in_a + 16);
@@ -346,7 +350,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_a_axv2(lv_16sc_t* out, con
__VOLK_GNSSSDR_PREFETCH(_in_b + 16); __VOLK_GNSSSDR_PREFETCH(_in_b + 16);
c = _mm256_mullo_epi16(a, b); c = _mm256_mullo_epi16(a, b);
c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
real = _mm256_subs_epi16(c, c_sr); real = _mm256_subs_epi16(c, c_sr);
b_sl = _mm256_slli_si256(b, 2); b_sl = _mm256_slli_si256(b, 2);
@@ -355,7 +359,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_a_axv2(lv_16sc_t* out, con
imag1 = _mm256_mullo_epi16(a, b_sl); imag1 = _mm256_mullo_epi16(a, b_sl);
imag2 = _mm256_mullo_epi16(b, a_sl); imag2 = _mm256_mullo_epi16(b, a_sl);
imag = _mm256_adds_epi16(imag1, imag2); //with saturation arithmetic! imag = _mm256_adds_epi16(imag1, imag2); //with saturation arithmetic!
realcacc = _mm256_adds_epi16(realcacc, real); realcacc = _mm256_adds_epi16(realcacc, real);
imagcacc = _mm256_adds_epi16(imagcacc, imag); imagcacc = _mm256_adds_epi16(imagcacc, imag);
@@ -369,7 +373,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_a_axv2(lv_16sc_t* out, con
result = _mm256_or_si256(realcacc, imagcacc); result = _mm256_or_si256(realcacc, imagcacc);
_mm256_store_si256((__m256i*)dotProductVector, result); // Store the results back into the dot product vector _mm256_store_si256((__m256i*)dotProductVector, result); // Store the results back into the dot product vector
_mm256_zeroupper(); _mm256_zeroupper();
for (i = 0; i < 8; ++i) for (i = 0; i < 8; ++i)
@@ -397,8 +401,8 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_neon(lv_16sc_t* out, const
unsigned int quarter_points = num_points / 4; unsigned int quarter_points = num_points / 4;
unsigned int number; unsigned int number;
lv_16sc_t* a_ptr = (lv_16sc_t*) in_a; lv_16sc_t* a_ptr = (lv_16sc_t*)in_a;
lv_16sc_t* b_ptr = (lv_16sc_t*) in_b; lv_16sc_t* b_ptr = (lv_16sc_t*)in_b;
*out = lv_cmake((int16_t)0, (int16_t)0); *out = lv_cmake((int16_t)0, (int16_t)0);
if (quarter_points > 0) if (quarter_points > 0)
@@ -407,15 +411,16 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_neon(lv_16sc_t* out, const
// 2nd lane holds the imaginary part // 2nd lane holds the imaginary part
int16x4x2_t a_val, b_val, c_val, accumulator; int16x4x2_t a_val, b_val, c_val, accumulator;
int16x4x2_t tmp_real, tmp_imag; int16x4x2_t tmp_real, tmp_imag;
__VOLK_ATTR_ALIGNED(16) lv_16sc_t accum_result[4]; __VOLK_ATTR_ALIGNED(16)
lv_16sc_t accum_result[4];
accumulator.val[0] = vdup_n_s16(0); accumulator.val[0] = vdup_n_s16(0);
accumulator.val[1] = vdup_n_s16(0); accumulator.val[1] = vdup_n_s16(0);
lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0); lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);
for(number = 0; number < quarter_points; ++number) for (number = 0; number < quarter_points; ++number)
{ {
a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
__VOLK_GNSSSDR_PREFETCH(a_ptr + 8); __VOLK_GNSSSDR_PREFETCH(a_ptr + 8);
__VOLK_GNSSSDR_PREFETCH(b_ptr + 8); __VOLK_GNSSSDR_PREFETCH(b_ptr + 8);
@@ -451,7 +456,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_neon(lv_16sc_t* out, const
} }
// tail case // tail case
for(number = quarter_points * 4; number < num_points; ++number) for (number = quarter_points * 4; number < num_points; ++number)
{ {
*out += (*a_ptr++) * (*b_ptr++); *out += (*a_ptr++) * (*b_ptr++);
} }
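Note the tail loop above: the NEON body consumes num_points in groups of four, and whatever remains past quarter_points * 4 is finished scalar-wise. The vector loads themselves lean on vld2_s16, which de-interleaves the (real, imag) pairs into two 4-lane vectors on the way in. A self-contained sketch of that de-interleaving (buffer names are illustrative only):

#include <arm_neon.h>
#include <stdint.h>

static int16_t deinterleave_demo(const int16_t* interleaved /* r0,i0,r1,i1,... */)
{
    int16x4x2_t v = vld2_s16(interleaved); /* val[0] = r0..r3, val[1] = i0..i3 */
    int16_t re[4], im[4];
    vst1_s16(re, v.val[0]); /* contiguous real parts */
    vst1_s16(im, v.val[1]); /* contiguous imaginary parts */
    return (int16_t)(re[0] + im[3]); /* use the results so the demo is not dead code */
}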
@@ -468,20 +473,21 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_neon_vma(lv_16sc_t* out, c
unsigned int quarter_points = num_points / 4; unsigned int quarter_points = num_points / 4;
unsigned int number; unsigned int number;
lv_16sc_t* a_ptr = (lv_16sc_t*) in_a; lv_16sc_t* a_ptr = (lv_16sc_t*)in_a;
lv_16sc_t* b_ptr = (lv_16sc_t*) in_b; lv_16sc_t* b_ptr = (lv_16sc_t*)in_b;
// for 2-lane vectors, 1st lane holds the real part, // for 2-lane vectors, 1st lane holds the real part,
// 2nd lane holds the imaginary part // 2nd lane holds the imaginary part
int16x4x2_t a_val, b_val, accumulator; int16x4x2_t a_val, b_val, accumulator;
int16x4x2_t tmp; int16x4x2_t tmp;
__VOLK_ATTR_ALIGNED(16) lv_16sc_t accum_result[4]; __VOLK_ATTR_ALIGNED(16)
lv_16sc_t accum_result[4];
accumulator.val[0] = vdup_n_s16(0); accumulator.val[0] = vdup_n_s16(0);
accumulator.val[1] = vdup_n_s16(0); accumulator.val[1] = vdup_n_s16(0);
for(number = 0; number < quarter_points; ++number) for (number = 0; number < quarter_points; ++number)
{ {
a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
__VOLK_GNSSSDR_PREFETCH(a_ptr + 8); __VOLK_GNSSSDR_PREFETCH(a_ptr + 8);
__VOLK_GNSSSDR_PREFETCH(b_ptr + 8); __VOLK_GNSSSDR_PREFETCH(b_ptr + 8);
@@ -503,7 +509,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_neon_vma(lv_16sc_t* out, c
*out = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3]; *out = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];
// tail case // tail case
for(number = quarter_points * 4; number < num_points; ++number) for (number = quarter_points * 4; number < num_points; ++number)
{ {
*out += (*a_ptr++) * (*b_ptr++); *out += (*a_ptr++) * (*b_ptr++);
} }
@@ -520,22 +526,23 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_neon_optvma(lv_16sc_t* out
unsigned int quarter_points = num_points / 4; unsigned int quarter_points = num_points / 4;
unsigned int number; unsigned int number;
lv_16sc_t* a_ptr = (lv_16sc_t*) in_a; lv_16sc_t* a_ptr = (lv_16sc_t*)in_a;
lv_16sc_t* b_ptr = (lv_16sc_t*) in_b; lv_16sc_t* b_ptr = (lv_16sc_t*)in_b;
// for 2-lane vectors, 1st lane holds the real part, // for 2-lane vectors, 1st lane holds the real part,
// 2nd lane holds the imaginary part // 2nd lane holds the imaginary part
int16x4x2_t a_val, b_val, accumulator1, accumulator2; int16x4x2_t a_val, b_val, accumulator1, accumulator2;
__VOLK_ATTR_ALIGNED(16) lv_16sc_t accum_result[4]; __VOLK_ATTR_ALIGNED(16)
lv_16sc_t accum_result[4];
accumulator1.val[0] = vdup_n_s16(0); accumulator1.val[0] = vdup_n_s16(0);
accumulator1.val[1] = vdup_n_s16(0); accumulator1.val[1] = vdup_n_s16(0);
accumulator2.val[0] = vdup_n_s16(0); accumulator2.val[0] = vdup_n_s16(0);
accumulator2.val[1] = vdup_n_s16(0); accumulator2.val[1] = vdup_n_s16(0);
for(number = 0; number < quarter_points; ++number) for (number = 0; number < quarter_points; ++number)
{ {
a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
__VOLK_GNSSSDR_PREFETCH(a_ptr + 8); __VOLK_GNSSSDR_PREFETCH(a_ptr + 8);
__VOLK_GNSSSDR_PREFETCH(b_ptr + 8); __VOLK_GNSSSDR_PREFETCH(b_ptr + 8);
@@ -556,7 +563,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_neon_optvma(lv_16sc_t* out
*out = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3]; *out = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];
// tail case // tail case
for(number = quarter_points * 4; number < num_points; ++number) for (number = quarter_points * 4; number < num_points; ++number)
{ {
*out += (*a_ptr++) * (*b_ptr++); *out += (*a_ptr++) * (*b_ptr++);
} }
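The _optvma variant above keeps two accumulator pairs and defers the real-part subtraction and imag-part addition until after the loop, so each iteration is pure fused multiply-accumulate. As an illustration of the underlying identity (not the kernel's exact scheme, which postpones the subtract), a complex MAC can also be written with vmla_s16/vmls_s16 directly:

#include <arm_neon.h>

/* acc.val[0] += ar*br - ai*bi ; acc.val[1] += ar*bi + ai*br */
static inline int16x4x2_t cmac_s16(int16x4x2_t acc, int16x4x2_t a, int16x4x2_t b)
{
    acc.val[0] = vmla_s16(acc.val[0], a.val[0], b.val[0]); /* + ar*br */
    acc.val[0] = vmls_s16(acc.val[0], a.val[1], b.val[1]); /* - ai*bi */
    acc.val[1] = vmla_s16(acc.val[1], a.val[0], b.val[1]); /* + ar*bi */
    acc.val[1] = vmla_s16(acc.val[1], a.val[1], b.val[0]); /* + ai*br */
    return acc;
}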


@@ -74,7 +74,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_generic(lv_16sc_t* resu
unsigned int n; unsigned int n;
for (n_vec = 0; n_vec < num_a_vectors; n_vec++) for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{ {
result[n_vec] = lv_cmake(0,0); result[n_vec] = lv_cmake(0, 0);
for (n = 0; n < num_points; n++) for (n = 0; n < num_points; n++)
{ {
//r*a.r - i*a.i, i*a.r + r*a.i //r*a.r - i*a.i, i*a.r + r*a.i
@@ -96,11 +96,11 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_generic_sat(lv_16sc_t*
unsigned int n; unsigned int n;
for (n_vec = 0; n_vec < num_a_vectors; n_vec++) for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{ {
result[n_vec] = lv_cmake(0,0); result[n_vec] = lv_cmake(0, 0);
for (n = 0; n < num_points; n++) for (n = 0; n < num_points; n++)
{ {
lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(in_common[n]), lv_creal(in_a[n_vec][n])), - sat_muls16i(lv_cimag(in_common[n]), lv_cimag(in_a[n_vec][n]))), lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(in_common[n]), lv_creal(in_a[n_vec][n])), -sat_muls16i(lv_cimag(in_common[n]), lv_cimag(in_a[n_vec][n]))),
sat_adds16i(sat_muls16i(lv_creal(in_common[n]), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(in_common[n]), lv_creal(in_a[n_vec][n])))); sat_adds16i(sat_muls16i(lv_creal(in_common[n]), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(in_common[n]), lv_creal(in_a[n_vec][n]))));
result[n_vec] = lv_cmake(sat_adds16i(lv_creal(result[n_vec]), lv_creal(tmp)), sat_adds16i(lv_cimag(result[n_vec]), lv_cimag(tmp))); result[n_vec] = lv_cmake(sat_adds16i(lv_creal(result[n_vec]), lv_creal(tmp)), sat_adds16i(lv_cimag(result[n_vec]), lv_cimag(tmp)));
} }
} }
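The _sat variant mirrors the SIMD kernels' _mm_adds_epi16-style arithmetic in scalar code: every add and multiply is clamped to the int16 range instead of wrapping. Assuming the usual clamp definition for sat_adds16i (a sketch, not the project's exact implementation):

#include <stdint.h>

static inline int16_t sat_adds16i_ref(int16_t x, int16_t y)
{
    int32_t s = (int32_t)x + (int32_t)y; /* widen so the sum cannot overflow */
    if (s > INT16_MAX) return INT16_MAX;  /* clamp high */
    if (s < INT16_MIN) return INT16_MIN;  /* clamp low */
    return (int16_t)s;
}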
@@ -112,9 +112,9 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_generic_sat(lv_16sc_t*
#ifdef LV_HAVE_SSE2 #ifdef LV_HAVE_SSE2
#include <emmintrin.h> #include <emmintrin.h>
static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_sse2(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points) static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_sse2(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points)
{ {
lv_16sc_t dotProduct = lv_cmake(0,0); lv_16sc_t dotProduct = lv_cmake(0, 0);
int n_vec; int n_vec;
unsigned int index; unsigned int index;
const unsigned int sse_iters = num_points / 4; const unsigned int sse_iters = num_points / 4;
@@ -125,7 +125,8 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_sse2(lv_16sc_t* resul
if (sse_iters > 0) if (sse_iters > 0)
{ {
__VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; __VOLK_ATTR_ALIGNED(16)
lv_16sc_t dotProductVector[4];
__m128i* realcacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment()); __m128i* realcacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment());
__m128i* imagcacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment()); __m128i* imagcacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment());
@@ -141,25 +142,25 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_sse2(lv_16sc_t* resul
mask_imag = _mm_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); mask_imag = _mm_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
mask_real = _mm_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); mask_real = _mm_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
for(index = 0; index < sse_iters; index++) for (index = 0; index < sse_iters; index++)
{ {
// b[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r] // b[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
b = _mm_load_si128((__m128i*)_in_common); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg b = _mm_load_si128((__m128i*)_in_common); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
__VOLK_GNSSSDR_PREFETCH(_in_common + 8); __VOLK_GNSSSDR_PREFETCH(_in_common + 8);
for (n_vec = 0; n_vec < num_a_vectors; n_vec++) for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{ {
a = _mm_load_si128((__m128i*)&(_in_a[n_vec][index*4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg a = _mm_load_si128((__m128i*)&(_in_a[n_vec][index * 4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....
c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
real = _mm_subs_epi16(c, c_sr); real = _mm_subs_epi16(c, c_sr);
c_sr = _mm_slli_si128(b, 2); // b3.r, b2.i .... c_sr = _mm_slli_si128(b, 2); // b3.r, b2.i ....
c = _mm_mullo_epi16(a, c_sr); // a3.i*b3.r, .... c = _mm_mullo_epi16(a, c_sr); // a3.i*b3.r, ....
c_sr = _mm_slli_si128(a, 2); // a3.r, a2.i .... c_sr = _mm_slli_si128(a, 2); // a3.r, a2.i ....
imag = _mm_mullo_epi16(b, c_sr); // b3.i*a3.r, .... imag = _mm_mullo_epi16(b, c_sr); // b3.i*a3.r, ....
imag = _mm_adds_epi16(c, imag); imag = _mm_adds_epi16(c, imag);
@@ -176,12 +177,12 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_sse2(lv_16sc_t* resul
a = _mm_or_si128(realcacc[n_vec], imagcacc[n_vec]); a = _mm_or_si128(realcacc[n_vec], imagcacc[n_vec]);
_mm_store_si128((__m128i*)dotProductVector, a); // Store the results back into the dot product vector _mm_store_si128((__m128i*)dotProductVector, a); // Store the results back into the dot product vector
dotProduct = lv_cmake(0,0); dotProduct = lv_cmake(0, 0);
for (index = 0; index < 4; ++index) for (index = 0; index < 4; ++index)
{ {
dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[index])), dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[index])),
sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[index]))); sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[index])));
} }
_out[n_vec] = dotProduct; _out[n_vec] = dotProduct;
} }
@@ -191,12 +192,12 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_sse2(lv_16sc_t* resul
for (n_vec = 0; n_vec < num_a_vectors; n_vec++) for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{ {
for(index = sse_iters * 4; index < num_points; index++) for (index = sse_iters * 4; index < num_points; index++)
{ {
lv_16sc_t tmp = in_common[index] * in_a[n_vec][index]; lv_16sc_t tmp = in_common[index] * in_a[n_vec][index];
_out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), _out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)),
sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp)));
} }
} }
} }
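The _a_sse2 kernel assumes every buffer meets the machine's alignment (it uses _mm_load_si128 throughout); the _u_sse2 twin below is identical except for unaligned loads. Buffers obtained through the library's own allocator satisfy the aligned variant by construction. A short usage sketch (buffer size and header visibility of lv_16sc_t are assumptions):

#include <volk_gnsssdr/volk_gnsssdr.h>
#include <volk_gnsssdr/volk_gnsssdr_complex.h>

static void aligned_buffer_demo(void)
{
    /* volk_gnsssdr_malloc returns memory aligned for the widest SIMD
       machine built, so the _a_ kernels' aligned loads are safe on it. */
    lv_16sc_t* buf = (lv_16sc_t*)volk_gnsssdr_malloc(1024 * sizeof(lv_16sc_t),
        volk_gnsssdr_get_alignment());
    /* ... fill buf and hand it to an _a_ kernel ... */
    volk_gnsssdr_free(buf);
}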
@@ -206,9 +207,9 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_sse2(lv_16sc_t* resul
#ifdef LV_HAVE_SSE2 #ifdef LV_HAVE_SSE2
#include <emmintrin.h> #include <emmintrin.h>
static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_sse2(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points) static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_sse2(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points)
{ {
lv_16sc_t dotProduct = lv_cmake(0,0); lv_16sc_t dotProduct = lv_cmake(0, 0);
int n_vec; int n_vec;
unsigned int index; unsigned int index;
const unsigned int sse_iters = num_points / 4; const unsigned int sse_iters = num_points / 4;
@@ -219,7 +220,8 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_sse2(lv_16sc_t* resul
if (sse_iters > 0) if (sse_iters > 0)
{ {
__VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; __VOLK_ATTR_ALIGNED(16)
lv_16sc_t dotProductVector[4];
__m128i* realcacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment()); __m128i* realcacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment());
__m128i* imagcacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment()); __m128i* imagcacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment());
@@ -235,25 +237,25 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_sse2(lv_16sc_t* resul
mask_imag = _mm_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); mask_imag = _mm_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
mask_real = _mm_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); mask_real = _mm_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
for(index = 0; index < sse_iters; index++) for (index = 0; index < sse_iters; index++)
{ {
// b[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r] // b[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
b = _mm_loadu_si128((__m128i*)_in_common); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg b = _mm_loadu_si128((__m128i*)_in_common); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
__VOLK_GNSSSDR_PREFETCH(_in_common + 8); __VOLK_GNSSSDR_PREFETCH(_in_common + 8);
for (n_vec = 0; n_vec < num_a_vectors; n_vec++) for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{ {
a = _mm_loadu_si128((__m128i*)&(_in_a[n_vec][index*4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg a = _mm_loadu_si128((__m128i*)&(_in_a[n_vec][index * 4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, .... c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....
c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
real = _mm_subs_epi16(c, c_sr); real = _mm_subs_epi16(c, c_sr);
c_sr = _mm_slli_si128(b, 2); // b3.r, b2.i .... c_sr = _mm_slli_si128(b, 2); // b3.r, b2.i ....
c = _mm_mullo_epi16(a, c_sr); // a3.i*b3.r, .... c = _mm_mullo_epi16(a, c_sr); // a3.i*b3.r, ....
c_sr = _mm_slli_si128(a, 2); // a3.r, a2.i .... c_sr = _mm_slli_si128(a, 2); // a3.r, a2.i ....
imag = _mm_mullo_epi16(b, c_sr); // b3.i*a3.r, .... imag = _mm_mullo_epi16(b, c_sr); // b3.i*a3.r, ....
imag = _mm_adds_epi16(c, imag); imag = _mm_adds_epi16(c, imag);
@@ -270,12 +272,12 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_sse2(lv_16sc_t* resul
a = _mm_or_si128(realcacc[n_vec], imagcacc[n_vec]); a = _mm_or_si128(realcacc[n_vec], imagcacc[n_vec]);
_mm_store_si128((__m128i*)dotProductVector, a); // Store the results back into the dot product vector _mm_store_si128((__m128i*)dotProductVector, a); // Store the results back into the dot product vector
dotProduct = lv_cmake(0,0); dotProduct = lv_cmake(0, 0);
for (index = 0; index < 4; ++index) for (index = 0; index < 4; ++index)
{ {
dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[index])), dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[index])),
sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[index]))); sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[index])));
} }
_out[n_vec] = dotProduct; _out[n_vec] = dotProduct;
} }
@@ -285,12 +287,12 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_sse2(lv_16sc_t* resul
for (n_vec = 0; n_vec < num_a_vectors; n_vec++) for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{ {
for(index = sse_iters * 4; index < num_points; index++) for (index = sse_iters * 4; index < num_points; index++)
{ {
lv_16sc_t tmp = in_common[index] * in_a[n_vec][index]; lv_16sc_t tmp = in_common[index] * in_a[n_vec][index];
_out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), _out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)),
sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp)));
} }
} }
} }
@@ -300,9 +302,9 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_sse2(lv_16sc_t* resul
#ifdef LV_HAVE_AVX2 #ifdef LV_HAVE_AVX2
#include <immintrin.h> #include <immintrin.h>
static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_avx2(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points) static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_avx2(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points)
{ {
lv_16sc_t dotProduct = lv_cmake(0,0); lv_16sc_t dotProduct = lv_cmake(0, 0);
int n_vec; int n_vec;
unsigned int index; unsigned int index;
const unsigned int sse_iters = num_points / 8; const unsigned int sse_iters = num_points / 8;
@@ -313,7 +315,8 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_avx2(lv_16sc_t* resul
if (sse_iters > 0) if (sse_iters > 0)
{ {
__VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8]; __VOLK_ATTR_ALIGNED(32)
lv_16sc_t dotProductVector[8];
__m256i* realcacc = (__m256i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256i), volk_gnsssdr_get_alignment()); __m256i* realcacc = (__m256i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256i), volk_gnsssdr_get_alignment());
__m256i* imagcacc = (__m256i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256i), volk_gnsssdr_get_alignment()); __m256i* imagcacc = (__m256i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256i), volk_gnsssdr_get_alignment());
@@ -329,24 +332,24 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_avx2(lv_16sc_t* resul
mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
for(index = 0; index < sse_iters; index++) for (index = 0; index < sse_iters; index++)
{ {
b = _mm256_load_si256((__m256i*)_in_common); b = _mm256_load_si256((__m256i*)_in_common);
__VOLK_GNSSSDR_PREFETCH(_in_common + 16); __VOLK_GNSSSDR_PREFETCH(_in_common + 16);
for (n_vec = 0; n_vec < num_a_vectors; n_vec++) for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{ {
a = _mm256_load_si256((__m256i*)&(_in_a[n_vec][index*8])); a = _mm256_load_si256((__m256i*)&(_in_a[n_vec][index * 8]));
c = _mm256_mullo_epi16(a, b); c = _mm256_mullo_epi16(a, b);
c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
real = _mm256_subs_epi16(c, c_sr); real = _mm256_subs_epi16(c, c_sr);
c_sr = _mm256_slli_si256(b, 2); // b3.r, b2.i .... c_sr = _mm256_slli_si256(b, 2); // b3.r, b2.i ....
c = _mm256_mullo_epi16(a, c_sr); // a3.i*b3.r, .... c = _mm256_mullo_epi16(a, c_sr); // a3.i*b3.r, ....
c_sr = _mm256_slli_si256(a, 2); // a3.r, a2.i .... c_sr = _mm256_slli_si256(a, 2); // a3.r, a2.i ....
imag = _mm256_mullo_epi16(b, c_sr); // b3.i*a3.r, .... imag = _mm256_mullo_epi16(b, c_sr); // b3.i*a3.r, ....
imag = _mm256_adds_epi16(c, imag); imag = _mm256_adds_epi16(c, imag);
@@ -363,12 +366,12 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_avx2(lv_16sc_t* resul
a = _mm256_or_si256(realcacc[n_vec], imagcacc[n_vec]); a = _mm256_or_si256(realcacc[n_vec], imagcacc[n_vec]);
_mm256_store_si256((__m256i*)dotProductVector, a); // Store the results back into the dot product vector _mm256_store_si256((__m256i*)dotProductVector, a); // Store the results back into the dot product vector
dotProduct = lv_cmake(0,0); dotProduct = lv_cmake(0, 0);
for (index = 0; index < 8; ++index) for (index = 0; index < 8; ++index)
{ {
dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[index])), dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[index])),
sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[index]))); sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[index])));
} }
_out[n_vec] = dotProduct; _out[n_vec] = dotProduct;
} }
@@ -379,12 +382,12 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_avx2(lv_16sc_t* resul
for (n_vec = 0; n_vec < num_a_vectors; n_vec++) for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{ {
for(index = sse_iters * 8; index < num_points; index++) for (index = sse_iters * 8; index < num_points; index++)
{ {
lv_16sc_t tmp = in_common[index] * in_a[n_vec][index]; lv_16sc_t tmp = in_common[index] * in_a[n_vec][index];
_out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), _out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)),
sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp)));
} }
} }
} }
@@ -394,9 +397,9 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_avx2(lv_16sc_t* resul
#ifdef LV_HAVE_AVX2 #ifdef LV_HAVE_AVX2
#include <immintrin.h> #include <immintrin.h>
static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_avx2(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points) static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_avx2(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points)
{ {
lv_16sc_t dotProduct = lv_cmake(0,0); lv_16sc_t dotProduct = lv_cmake(0, 0);
const unsigned int sse_iters = num_points / 8; const unsigned int sse_iters = num_points / 8;
int n_vec; int n_vec;
@@ -407,7 +410,8 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_avx2(lv_16sc_t* resul
if (sse_iters > 0) if (sse_iters > 0)
{ {
__VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8]; __VOLK_ATTR_ALIGNED(32)
lv_16sc_t dotProductVector[8];
__m256i* realcacc = (__m256i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256i), volk_gnsssdr_get_alignment()); __m256i* realcacc = (__m256i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256i), volk_gnsssdr_get_alignment());
__m256i* imagcacc = (__m256i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256i), volk_gnsssdr_get_alignment()); __m256i* imagcacc = (__m256i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256i), volk_gnsssdr_get_alignment());
@@ -423,24 +427,24 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_avx2(lv_16sc_t* resul
mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0); mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF); mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
for(index = 0; index < sse_iters; index++) for (index = 0; index < sse_iters; index++)
{ {
b = _mm256_loadu_si256((__m256i*)_in_common); b = _mm256_loadu_si256((__m256i*)_in_common);
__VOLK_GNSSSDR_PREFETCH(_in_common + 16); __VOLK_GNSSSDR_PREFETCH(_in_common + 16);
for (n_vec = 0; n_vec < num_a_vectors; n_vec++) for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{ {
a = _mm256_loadu_si256((__m256i*)&(_in_a[n_vec][index*8])); a = _mm256_loadu_si256((__m256i*)&(_in_a[n_vec][index * 8]));
c = _mm256_mullo_epi16(a, b); c = _mm256_mullo_epi16(a, b);
c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst. c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
real = _mm256_subs_epi16(c, c_sr); real = _mm256_subs_epi16(c, c_sr);
c_sr = _mm256_slli_si256(b, 2); // b3.r, b2.i .... c_sr = _mm256_slli_si256(b, 2); // b3.r, b2.i ....
c = _mm256_mullo_epi16(a, c_sr); // a3.i*b3.r, .... c = _mm256_mullo_epi16(a, c_sr); // a3.i*b3.r, ....
c_sr = _mm256_slli_si256(a, 2); // a3.r, a2.i .... c_sr = _mm256_slli_si256(a, 2); // a3.r, a2.i ....
imag = _mm256_mullo_epi16(b, c_sr); // b3.i*a3.r, .... imag = _mm256_mullo_epi16(b, c_sr); // b3.i*a3.r, ....
imag = _mm256_adds_epi16(c, imag); imag = _mm256_adds_epi16(c, imag);
@@ -457,12 +461,12 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_avx2(lv_16sc_t* resul
a = _mm256_or_si256(realcacc[n_vec], imagcacc[n_vec]); a = _mm256_or_si256(realcacc[n_vec], imagcacc[n_vec]);
_mm256_store_si256((__m256i*)dotProductVector, a); // Store the results back into the dot product vector _mm256_store_si256((__m256i*)dotProductVector, a); // Store the results back into the dot product vector
dotProduct = lv_cmake(0,0); dotProduct = lv_cmake(0, 0);
for (index = 0; index < 8; ++index) for (index = 0; index < 8; ++index)
{ {
dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[index])), dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[index])),
sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[index]))); sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[index])));
} }
_out[n_vec] = dotProduct; _out[n_vec] = dotProduct;
} }
@@ -473,12 +477,12 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_avx2(lv_16sc_t* resul
for (n_vec = 0; n_vec < num_a_vectors; n_vec++) for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{ {
for(index = sse_iters * 8; index < num_points; index++) for (index = sse_iters * 8; index < num_points; index++)
{ {
lv_16sc_t tmp = in_common[index] * in_a[n_vec][index]; lv_16sc_t tmp = in_common[index] * in_a[n_vec][index];
_out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), _out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)),
sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp)));
} }
} }
} }
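One subtlety in the AVX2 ports: _mm256_srli_si256 and _mm256_slli_si256 shift each 128-bit lane independently, not the whole 256-bit register. Since the real/imag interleaving repeats within every lane, the per-lane shift is exactly what the kernel needs and no cross-lane permute is required. A small self-checking demo (values are arbitrary):

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    __m256i v = _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7,
        8, 9, 10, 11, 12, 13, 14, 15);
    __m256i s = _mm256_srli_si256(v, 2); /* 2-byte right shift, per 128-bit lane */
    int16_t out[16];
    _mm256_storeu_si256((__m256i*)out, s);
    /* out = 1..7,0, 9..15,0 : zeros are shifted in at each lane boundary */
    printf("%d %d\n", out[7], out[15]);
    return 0;
}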
@@ -488,9 +492,9 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon(lv_16sc_t* result,
#ifdef LV_HAVE_NEON #ifdef LV_HAVE_NEON
#include <arm_neon.h> #include <arm_neon.h>
static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points) static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points)
{ {
lv_16sc_t dotProduct = lv_cmake(0,0); lv_16sc_t dotProduct = lv_cmake(0, 0);
int n_vec; int n_vec;
unsigned int index; unsigned int index;
const unsigned int neon_iters = num_points / 4; const unsigned int neon_iters = num_points / 4;
@@ -501,7 +505,8 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon(lv_16sc_t* result,
if (neon_iters > 0) if (neon_iters > 0)
{ {
__VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; __VOLK_ATTR_ALIGNED(16)
lv_16sc_t dotProductVector[4];
int16x4x2_t a_val, b_val, c_val; int16x4x2_t a_val, b_val, c_val;
@@ -509,19 +514,19 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon(lv_16sc_t* result,
int16x4x2_t tmp_real, tmp_imag; int16x4x2_t tmp_real, tmp_imag;
for(n_vec = 0; n_vec < num_a_vectors; n_vec++) for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{ {
accumulator[n_vec].val[0] = vdup_n_s16(0); accumulator[n_vec].val[0] = vdup_n_s16(0);
accumulator[n_vec].val[1] = vdup_n_s16(0); accumulator[n_vec].val[1] = vdup_n_s16(0);
} }
for(index = 0; index < neon_iters; index++) for (index = 0; index < neon_iters; index++)
{ {
b_val = vld2_s16((int16_t*)_in_common); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg b_val = vld2_s16((int16_t*)_in_common); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
__VOLK_GNSSSDR_PREFETCH(_in_common + 8); __VOLK_GNSSSDR_PREFETCH(_in_common + 8);
for (n_vec = 0; n_vec < num_a_vectors; n_vec++) for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{ {
a_val = vld2_s16((int16_t*)&(_in_a[n_vec][index*4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg a_val = vld2_s16((int16_t*)&(_in_a[n_vec][index * 4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
//__VOLK_GNSSSDR_PREFETCH(&_in_a[n_vec][index*4] + 8); //__VOLK_GNSSSDR_PREFETCH(&_in_a[n_vec][index*4] + 8);
// multiply the real*real and imag*imag to get real result // multiply the real*real and imag*imag to get real result
@@ -547,12 +552,12 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon(lv_16sc_t* result,
for (n_vec = 0; n_vec < num_a_vectors; n_vec++) for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{ {
vst2_s16((int16_t*)dotProductVector, accumulator[n_vec]); // Store the results back into the dot product vector vst2_s16((int16_t*)dotProductVector, accumulator[n_vec]); // Store the results back into the dot product vector
dotProduct = lv_cmake(0,0); dotProduct = lv_cmake(0, 0);
for (index = 0; index < 4; ++index) for (index = 0; index < 4; ++index)
{ {
dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[index])), dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[index])),
sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[index]))); sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[index])));
} }
_out[n_vec] = dotProduct; _out[n_vec] = dotProduct;
} }
@@ -561,12 +566,12 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon(lv_16sc_t* result,
for (n_vec = 0; n_vec < num_a_vectors; n_vec++) for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{ {
for(index = neon_iters * 4; index < num_points; index++) for (index = neon_iters * 4; index < num_points; index++)
{ {
lv_16sc_t tmp = in_common[index] * in_a[n_vec][index]; lv_16sc_t tmp = in_common[index] * in_a[n_vec][index];
_out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), _out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)),
sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp)));
} }
} }
} }
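After the vector loop, each replica's accumulator still holds four partial complex sums; vst2_s16 spills them (re-interleaved) and the kernel folds them with saturating scalar adds. A compact sketch of that horizontal reduction (the wrapping adds below stand in for the kernel's sat_adds16i):

#include <arm_neon.h>
#include <stdint.h>

static void reduce_acc(int16x4x2_t acc, int16_t* re, int16_t* im)
{
    int16_t partial[8];
    int k;
    vst2_s16(partial, acc); /* stored back interleaved: re0,im0,re1,im1,... */
    *re = 0;
    *im = 0;
    for (k = 0; k < 4; k++)
        {
            *re = (int16_t)(*re + partial[2 * k]);
            *im = (int16_t)(*im + partial[2 * k + 1]);
        }
}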
@@ -576,9 +581,9 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_vma(lv_16sc_t* res
#ifdef LV_HAVE_NEON #ifdef LV_HAVE_NEON
#include <arm_neon.h> #include <arm_neon.h>
static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_vma(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points) static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_vma(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points)
{ {
lv_16sc_t dotProduct = lv_cmake(0,0); lv_16sc_t dotProduct = lv_cmake(0, 0);
const unsigned int neon_iters = num_points / 4; const unsigned int neon_iters = num_points / 4;
int n_vec; int n_vec;
@@ -589,25 +594,26 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_vma(lv_16sc_t* res
if (neon_iters > 0) if (neon_iters > 0)
{ {
__VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; __VOLK_ATTR_ALIGNED(16)
lv_16sc_t dotProductVector[4];
int16x4x2_t a_val, b_val, tmp; int16x4x2_t a_val, b_val, tmp;
int16x4x2_t* accumulator = (int16x4x2_t*)volk_gnsssdr_malloc(num_a_vectors * sizeof(int16x4x2_t), volk_gnsssdr_get_alignment()); int16x4x2_t* accumulator = (int16x4x2_t*)volk_gnsssdr_malloc(num_a_vectors * sizeof(int16x4x2_t), volk_gnsssdr_get_alignment());
for(n_vec = 0; n_vec < num_a_vectors; n_vec++) for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{ {
accumulator[n_vec].val[0] = vdup_n_s16(0); accumulator[n_vec].val[0] = vdup_n_s16(0);
accumulator[n_vec].val[1] = vdup_n_s16(0); accumulator[n_vec].val[1] = vdup_n_s16(0);
} }
for(index = 0; index < neon_iters; index++) for (index = 0; index < neon_iters; index++)
{ {
b_val = vld2_s16((int16_t*)_in_common); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg b_val = vld2_s16((int16_t*)_in_common); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
__VOLK_GNSSSDR_PREFETCH(_in_common + 8); __VOLK_GNSSSDR_PREFETCH(_in_common + 8);
for (n_vec = 0; n_vec < num_a_vectors; n_vec++) for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{ {
a_val = vld2_s16((int16_t*)&(_in_a[n_vec][index*4])); a_val = vld2_s16((int16_t*)&(_in_a[n_vec][index * 4]));
tmp.val[0] = vmul_s16(a_val.val[0], b_val.val[0]); tmp.val[0] = vmul_s16(a_val.val[0], b_val.val[0]);
tmp.val[1] = vmul_s16(a_val.val[1], b_val.val[0]); tmp.val[1] = vmul_s16(a_val.val[1], b_val.val[0]);
@@ -624,12 +630,12 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_vma(lv_16sc_t* res
for (n_vec = 0; n_vec < num_a_vectors; n_vec++) for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{ {
vst2_s16((int16_t*)dotProductVector, accumulator[n_vec]); // Store the results back into the dot product vector vst2_s16((int16_t*)dotProductVector, accumulator[n_vec]); // Store the results back into the dot product vector
dotProduct = lv_cmake(0,0); dotProduct = lv_cmake(0, 0);
for (index = 0; index < 4; ++index) for (index = 0; index < 4; ++index)
{ {
dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[index])), dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[index])),
sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[index]))); sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[index])));
} }
_out[n_vec] = dotProduct; _out[n_vec] = dotProduct;
} }
@@ -638,12 +644,12 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_vma(lv_16sc_t* res
for (n_vec = 0; n_vec < num_a_vectors; n_vec++) for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{ {
for(index = neon_iters * 4; index < num_points; index++) for (index = neon_iters * 4; index < num_points; index++)
{ {
lv_16sc_t tmp = in_common[index] * in_a[n_vec][index]; lv_16sc_t tmp = in_common[index] * in_a[n_vec][index];
_out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), _out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)),
sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp)));
} }
} }
} }
@@ -653,9 +659,9 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_optvma(lv_16sc_t*
#ifdef LV_HAVE_NEON #ifdef LV_HAVE_NEON
#include <arm_neon.h> #include <arm_neon.h>
static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_optvma(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points) static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_optvma(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points)
{ {
lv_16sc_t dotProduct = lv_cmake(0,0); lv_16sc_t dotProduct = lv_cmake(0, 0);
const unsigned int neon_iters = num_points / 4; const unsigned int neon_iters = num_points / 4;
int n_vec; int n_vec;
@@ -666,14 +672,15 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_optvma(lv_16sc_t*
if (neon_iters > 0) if (neon_iters > 0)
{ {
__VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; __VOLK_ATTR_ALIGNED(16)
lv_16sc_t dotProductVector[4];
int16x4x2_t a_val, b_val; int16x4x2_t a_val, b_val;
int16x4x2_t* accumulator1 = (int16x4x2_t*)volk_gnsssdr_malloc(num_a_vectors * sizeof(int16x4x2_t), volk_gnsssdr_get_alignment()); int16x4x2_t* accumulator1 = (int16x4x2_t*)volk_gnsssdr_malloc(num_a_vectors * sizeof(int16x4x2_t), volk_gnsssdr_get_alignment());
int16x4x2_t* accumulator2 = (int16x4x2_t*)volk_gnsssdr_malloc(num_a_vectors * sizeof(int16x4x2_t), volk_gnsssdr_get_alignment()); int16x4x2_t* accumulator2 = (int16x4x2_t*)volk_gnsssdr_malloc(num_a_vectors * sizeof(int16x4x2_t), volk_gnsssdr_get_alignment());
for(n_vec = 0; n_vec < num_a_vectors; n_vec++) for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{ {
accumulator1[n_vec].val[0] = vdup_n_s16(0); accumulator1[n_vec].val[0] = vdup_n_s16(0);
accumulator1[n_vec].val[1] = vdup_n_s16(0); accumulator1[n_vec].val[1] = vdup_n_s16(0);
@@ -681,13 +688,13 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_optvma(lv_16sc_t*
accumulator2[n_vec].val[1] = vdup_n_s16(0); accumulator2[n_vec].val[1] = vdup_n_s16(0);
} }
for(index = 0; index < neon_iters; index++) for (index = 0; index < neon_iters; index++)
{ {
b_val = vld2_s16((int16_t*)_in_common); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg b_val = vld2_s16((int16_t*)_in_common); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
__VOLK_GNSSSDR_PREFETCH(_in_common + 8); __VOLK_GNSSSDR_PREFETCH(_in_common + 8);
for (n_vec = 0; n_vec < num_a_vectors; n_vec++) for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{ {
a_val = vld2_s16((int16_t*)&(_in_a[n_vec][index*4])); a_val = vld2_s16((int16_t*)&(_in_a[n_vec][index * 4]));
accumulator1[n_vec].val[0] = vmla_s16(accumulator1[n_vec].val[0], a_val.val[0], b_val.val[0]); accumulator1[n_vec].val[0] = vmla_s16(accumulator1[n_vec].val[0], a_val.val[0], b_val.val[0]);
accumulator1[n_vec].val[1] = vmla_s16(accumulator1[n_vec].val[1], a_val.val[0], b_val.val[1]); accumulator1[n_vec].val[1] = vmla_s16(accumulator1[n_vec].val[1], a_val.val[0], b_val.val[1]);
@@ -705,12 +712,12 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_optvma(lv_16sc_t*
for (n_vec = 0; n_vec < num_a_vectors; n_vec++) for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{ {
vst2_s16((int16_t*)dotProductVector, accumulator1[n_vec]); // Store the results back into the dot product vector vst2_s16((int16_t*)dotProductVector, accumulator1[n_vec]); // Store the results back into the dot product vector
dotProduct = lv_cmake(0,0); dotProduct = lv_cmake(0, 0);
for (index = 0; index < 4; ++index) for (index = 0; index < 4; ++index)
{ {
dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[index])), dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[index])),
sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[index]))); sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[index])));
} }
_out[n_vec] = dotProduct; _out[n_vec] = dotProduct;
} }
@@ -720,12 +727,12 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_optvma(lv_16sc_t*
for (n_vec = 0; n_vec < num_a_vectors; n_vec++) for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{ {
for(index = neon_iters * 4; index < num_points; index++) for (index = neon_iters * 4; index < num_points; index++)
{ {
lv_16sc_t tmp = in_common[index] * in_a[n_vec][index]; lv_16sc_t tmp = in_common[index] * in_a[n_vec][index];
_out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)), _out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)),
sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp)));
} }
} }
} }


@@ -47,22 +47,22 @@ static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_generic(lv_16sc_t*
int num_a_vectors = 3; int num_a_vectors = 3;
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
unsigned int n; unsigned int n;
for(n = 0; n < num_a_vectors; n++) for (n = 0; n < num_a_vectors; n++)
{ {
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points); memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
} }
volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_generic(result, local_code, (const lv_16sc_t**) in_a, num_a_vectors, num_points); volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_generic(result, local_code, (const lv_16sc_t**)in_a, num_a_vectors, num_points);
for(n = 0; n < num_a_vectors; n++) for (n = 0; n < num_a_vectors; n++)
{ {
volk_gnsssdr_free(in_a[n]); volk_gnsssdr_free(in_a[n]);
} }
volk_gnsssdr_free(in_a); volk_gnsssdr_free(in_a);
} }
#endif /* Generic */ #endif /* Generic */
#ifdef LV_HAVE_GENERIC #ifdef LV_HAVE_GENERIC
@@ -71,22 +71,22 @@ static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_generic_sat(lv_16sc
int num_a_vectors = 3; int num_a_vectors = 3;
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
unsigned int n; unsigned int n;
for(n = 0; n < num_a_vectors; n++) for (n = 0; n < num_a_vectors; n++)
{ {
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points); memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
} }
volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_generic_sat(result, local_code, (const lv_16sc_t**) in_a, num_a_vectors, num_points); volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_generic_sat(result, local_code, (const lv_16sc_t**)in_a, num_a_vectors, num_points);
for(n = 0; n < num_a_vectors; n++) for (n = 0; n < num_a_vectors; n++)
{ {
volk_gnsssdr_free(in_a[n]); volk_gnsssdr_free(in_a[n]);
} }
volk_gnsssdr_free(in_a); volk_gnsssdr_free(in_a);
} }
#endif /* Generic */ #endif /* Generic */
#ifdef LV_HAVE_SSE2 #ifdef LV_HAVE_SSE2
@@ -95,18 +95,18 @@ static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_a_sse2(lv_16sc_t* r
int num_a_vectors = 3; int num_a_vectors = 3;
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
unsigned int n; unsigned int n;
for(n = 0; n < num_a_vectors; n++) for (n = 0; n < num_a_vectors; n++)
{ {
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment()); in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points); memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
} }
volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_sse2(result, local_code, (const lv_16sc_t**) in_a, num_a_vectors, num_points); volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_sse2(result, local_code, (const lv_16sc_t**)in_a, num_a_vectors, num_points);
for(n = 0; n < num_a_vectors; n++) for (n = 0; n < num_a_vectors; n++)
{ {
volk_gnsssdr_free(in_a[n]); volk_gnsssdr_free(in_a[n]);
} }
volk_gnsssdr_free(in_a); volk_gnsssdr_free(in_a);
} }
@@ -120,18 +120,18 @@ static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_u_sse2(lv_16sc_t* r
int num_a_vectors = 3; int num_a_vectors = 3;
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
unsigned int n; unsigned int n;
for(n = 0; n < num_a_vectors; n++) for (n = 0; n < num_a_vectors; n++)
{ {
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t)*num_points, volk_gnsssdr_get_alignment()); in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t)*num_points); memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
} }
volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_sse2(result, local_code, (const lv_16sc_t**) in_a, num_a_vectors, num_points); volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_sse2(result, local_code, (const lv_16sc_t**)in_a, num_a_vectors, num_points);
for(n = 0; n < num_a_vectors; n++) for (n = 0; n < num_a_vectors; n++)
{ {
volk_gnsssdr_free(in_a[n]); volk_gnsssdr_free(in_a[n]);
} }
volk_gnsssdr_free(in_a); volk_gnsssdr_free(in_a);
} }
@@ -145,18 +145,18 @@ static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_a_avx2(lv_16sc_t* r
int num_a_vectors = 3; int num_a_vectors = 3;
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
unsigned int n; unsigned int n;
for(n = 0; n < num_a_vectors; n++) for (n = 0; n < num_a_vectors; n++)
{ {
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t)*num_points, volk_gnsssdr_get_alignment()); in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t)*num_points); memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
} }
volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_avx2(result, local_code, (const lv_16sc_t**) in_a, num_a_vectors, num_points); volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_avx2(result, local_code, (const lv_16sc_t**)in_a, num_a_vectors, num_points);
for(n = 0; n < num_a_vectors; n++) for (n = 0; n < num_a_vectors; n++)
{ {
volk_gnsssdr_free(in_a[n]); volk_gnsssdr_free(in_a[n]);
} }
volk_gnsssdr_free(in_a); volk_gnsssdr_free(in_a);
} }
@@ -170,18 +170,18 @@ static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_u_avx2(lv_16sc_t* r
int num_a_vectors = 3; int num_a_vectors = 3;
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
unsigned int n; unsigned int n;
for(n = 0; n < num_a_vectors; n++) for (n = 0; n < num_a_vectors; n++)
{ {
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t)*num_points, volk_gnsssdr_get_alignment()); in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t)*num_points); memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
} }
volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_avx2(result, local_code, (const lv_16sc_t**) in_a, num_a_vectors, num_points); volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_avx2(result, local_code, (const lv_16sc_t**)in_a, num_a_vectors, num_points);
for(n = 0; n < num_a_vectors; n++) for (n = 0; n < num_a_vectors; n++)
{ {
volk_gnsssdr_free(in_a[n]); volk_gnsssdr_free(in_a[n]);
} }
volk_gnsssdr_free(in_a); volk_gnsssdr_free(in_a);
} }
@@ -195,22 +195,22 @@ static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_neon(lv_16sc_t* res
int num_a_vectors = 3; int num_a_vectors = 3;
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
unsigned int n; unsigned int n;
for(n = 0; n < num_a_vectors; n++) for (n = 0; n < num_a_vectors; n++)
{ {
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t)*num_points, volk_gnsssdr_get_alignment()); in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t)*num_points); memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
} }
volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon(result, local_code, (const lv_16sc_t**) in_a, num_a_vectors, num_points); volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon(result, local_code, (const lv_16sc_t**)in_a, num_a_vectors, num_points);
for(n = 0; n < num_a_vectors; n++) for (n = 0; n < num_a_vectors; n++)
{ {
volk_gnsssdr_free(in_a[n]); volk_gnsssdr_free(in_a[n]);
} }
volk_gnsssdr_free(in_a); volk_gnsssdr_free(in_a);
} }
#endif // NEON #endif // NEON
#ifdef LV_HAVE_NEON #ifdef LV_HAVE_NEON
@@ -220,22 +220,22 @@ static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_neon_vma(lv_16sc_t*
int num_a_vectors = 3; int num_a_vectors = 3;
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
unsigned int n; unsigned int n;
for(n = 0; n < num_a_vectors; n++) for (n = 0; n < num_a_vectors; n++)
{ {
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t)*num_points, volk_gnsssdr_get_alignment()); in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t)*num_points); memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
} }
volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_vma(result, local_code, (const lv_16sc_t**) in_a, num_a_vectors, num_points); volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_vma(result, local_code, (const lv_16sc_t**)in_a, num_a_vectors, num_points);
for(n = 0; n < num_a_vectors; n++) for (n = 0; n < num_a_vectors; n++)
{ {
volk_gnsssdr_free(in_a[n]); volk_gnsssdr_free(in_a[n]);
} }
volk_gnsssdr_free(in_a); volk_gnsssdr_free(in_a);
} }
#endif // NEON #endif // NEON
#ifdef LV_HAVE_NEON #ifdef LV_HAVE_NEON
@ -244,23 +244,21 @@ static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_neon_optvma(lv_16sc
int num_a_vectors = 3; int num_a_vectors = 3;
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
unsigned int n; unsigned int n;
for(n = 0; n < num_a_vectors; n++) for (n = 0; n < num_a_vectors; n++)
{ {
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t)*num_points, volk_gnsssdr_get_alignment()); in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t)*num_points); memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
} }
volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_optvma(result, local_code, (const lv_16sc_t**) in_a, num_a_vectors, num_points); volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_optvma(result, local_code, (const lv_16sc_t**)in_a, num_a_vectors, num_points);
for(n = 0; n < num_a_vectors; n++) for (n = 0; n < num_a_vectors; n++)
{ {
volk_gnsssdr_free(in_a[n]); volk_gnsssdr_free(in_a[n]);
} }
volk_gnsssdr_free(in_a); volk_gnsssdr_free(in_a);
} }
#endif // NEON #endif // NEON
#endif // INCLUDED_volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_H #endif // INCLUDED_volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_H
@@ -91,29 +91,29 @@ static inline void volk_gnsssdr_16ic_x2_multiply_16ic_a_sse2(lv_16sc_t* out, con
    const lv_16sc_t* _in_a = in_a;
    const lv_16sc_t* _in_b = in_b;
    lv_16sc_t* _out = out;

    for (number = 0; number < sse_iters; number++)
        {
            // std::complex<T> memory structure: real part -> reinterpret_cast<cv T*>(a)[2*i]
            // imaginary part -> reinterpret_cast<cv T*>(a)[2*i + 1]
            // a[127:0] = [a3.i, a3.r, a2.i, a2.r, a1.i, a1.r, a0.i, a0.r]
            a = _mm_load_si128((__m128i*)_in_a);  // load (2-byte imag, 2-byte real) x 4 into a 128-bit register
            b = _mm_load_si128((__m128i*)_in_b);
            c = _mm_mullo_epi16(a, b);  // a3.i*b3.i, a3.r*b3.r, ....
            c_sr = _mm_srli_si128(c, 2);  // shift c right by 2 bytes while shifting in zeros
            real = _mm_subs_epi16(c, c_sr);
            real = _mm_and_si128(real, mask_real);  // a3.r*b3.r - a3.i*b3.i, 0, ...
            b_sl = _mm_slli_si128(b, 2);  // b3.r, b2.i ....
            a_sl = _mm_slli_si128(a, 2);  // a3.r, a2.i ....
            imag1 = _mm_mullo_epi16(a, b_sl);  // a3.i*b3.r, ....
            imag2 = _mm_mullo_epi16(b, a_sl);  // b3.i*a3.r, ....
            imag = _mm_adds_epi16(imag1, imag2);
            imag = _mm_and_si128(imag, mask_imag);  // a3.i*b3.r + b3.i*a3.r, 0, ...
            result = _mm_or_si128(real, imag);
            _mm_store_si128((__m128i*)_out, result);
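For reference, each SSE2 iteration above computes four textbook complex products at once; _mm_mullo_epi16 keeps only the low 16 bits of each product, and only the final combine (_mm_subs_epi16/_mm_adds_epi16) saturates, which the scalar sketch below ignores. A per-lane scalar equivalent, assuming the standard VOLK accessors lv_creal/lv_cimag/lv_cmake:

/* Scalar sketch of one complex lane (truncating 16-bit multiplies,
   like _mm_mullo_epi16; saturation of the combine step omitted): */
int16_t ar = lv_creal(*_in_a), ai = lv_cimag(*_in_a);
int16_t br = lv_creal(*_in_b), bi = lv_cimag(*_in_b);
int16_t re = (int16_t)(ar * br) - (int16_t)(ai * bi);  /* real part */
int16_t im = (int16_t)(ar * bi) + (int16_t)(ai * br);  /* imaginary part */
*_out = lv_cmake(re, im);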
@@ -137,7 +137,7 @@ static inline void volk_gnsssdr_16ic_x2_multiply_16ic_u_sse2(lv_16sc_t* out, con
{
    const unsigned int sse_iters = num_points / 4;
    unsigned int number;
    __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, result;

    mask_imag = _mm_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
    mask_real = _mm_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
@@ -145,29 +145,29 @@ static inline void volk_gnsssdr_16ic_x2_multiply_16ic_u_sse2(lv_16sc_t* out, con
    const lv_16sc_t* _in_a = in_a;
    const lv_16sc_t* _in_b = in_b;
    lv_16sc_t* _out = out;

    for (number = 0; number < sse_iters; number++)
        {
            // std::complex<T> memory structure: real part -> reinterpret_cast<cv T*>(a)[2*i]
            // imaginary part -> reinterpret_cast<cv T*>(a)[2*i + 1]
            // a[127:0] = [a3.i, a3.r, a2.i, a2.r, a1.i, a1.r, a0.i, a0.r]
            a = _mm_loadu_si128((__m128i*)_in_a);  // load (2-byte imag, 2-byte real) x 4 into a 128-bit register
            b = _mm_loadu_si128((__m128i*)_in_b);
            c = _mm_mullo_epi16(a, b);  // a3.i*b3.i, a3.r*b3.r, ....
            c_sr = _mm_srli_si128(c, 2);  // shift c right by 2 bytes while shifting in zeros
            real = _mm_subs_epi16(c, c_sr);
            real = _mm_and_si128(real, mask_real);  // a3.r*b3.r - a3.i*b3.i, 0, ...
            b_sl = _mm_slli_si128(b, 2);  // b3.r, b2.i ....
            a_sl = _mm_slli_si128(a, 2);  // a3.r, a2.i ....
            imag1 = _mm_mullo_epi16(a, b_sl);  // a3.i*b3.r, ....
            imag2 = _mm_mullo_epi16(b, a_sl);  // b3.i*a3.r, ....
            imag = _mm_adds_epi16(imag1, imag2);
            imag = _mm_and_si128(imag, mask_imag);  // a3.i*b3.r + b3.i*a3.r, 0, ...
            result = _mm_or_si128(real, imag);
            _mm_storeu_si128((__m128i*)_out, result);
@@ -196,29 +196,29 @@ static inline void volk_gnsssdr_16ic_x2_multiply_16ic_u_avx2(lv_16sc_t* out, con
    const lv_16sc_t* _in_b = in_b;
    lv_16sc_t* _out = out;

    __m256i a, b, c, c_sr, real, imag, imag1, imag2, b_sl, a_sl, result;

    const __m256i mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
    const __m256i mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);

    for (; number < avx2_points; number++)
        {
            a = _mm256_loadu_si256((__m256i*)_in_a);  // load the ar + ai, br + bi as ar, ai, br, bi
            b = _mm256_loadu_si256((__m256i*)_in_b);  // load the cr + ci, dr + di as cr, ci, dr, di
            c = _mm256_mullo_epi16(a, b);
            c_sr = _mm256_srli_si256(c, 2);  // shift c right by 2 bytes (within each 128-bit lane) while shifting in zeros
            real = _mm256_subs_epi16(c, c_sr);
            real = _mm256_and_si256(real, mask_real);  // a3.r*b3.r - a3.i*b3.i, 0, ...
            b_sl = _mm256_slli_si256(b, 2);  // b3.r, b2.i ....
            a_sl = _mm256_slli_si256(a, 2);  // a3.r, a2.i ....
            imag1 = _mm256_mullo_epi16(a, b_sl);  // a3.i*b3.r, ....
            imag2 = _mm256_mullo_epi16(b, a_sl);  // b3.i*a3.r, ....
            imag = _mm256_adds_epi16(imag1, imag2);
            imag = _mm256_and_si256(imag, mask_imag);  // a3.i*b3.r + b3.i*a3.r, 0, ...
            result = _mm256_or_si256(real, imag);
@@ -230,7 +230,7 @@ static inline void volk_gnsssdr_16ic_x2_multiply_16ic_u_avx2(lv_16sc_t* out, con
        }
    _mm256_zeroupper();
    number = avx2_points * 8;
    for (; number < num_points; number++)
        {
            *_out++ = (*_in_a++) * (*_in_b++);
        }
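Each __m256i holds eight lv_16sc_t samples (16-bit real plus 16-bit imaginary), so avx2_points is presumably num_points / 8, a derivation this hunk does not show. A sketch of the assumed loop-bound setup:

/* Assumed derivation of the bounds used above: the vector loop runs
   avx2_points times, then the scalar tail starts at avx2_points * 8
   and finishes the remaining (num_points % 8) samples one by one. */
const unsigned int avx2_points = num_points / 8;  /* 8 complex samples per __m256i */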
@@ -250,29 +250,29 @@ static inline void volk_gnsssdr_16ic_x2_multiply_16ic_a_avx2(lv_16sc_t* out, con
    const lv_16sc_t* _in_b = in_b;
    lv_16sc_t* _out = out;

    __m256i a, b, c, c_sr, real, imag, imag1, imag2, b_sl, a_sl, result;

    const __m256i mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
    const __m256i mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);

    for (; number < avx2_points; number++)
        {
            a = _mm256_load_si256((__m256i*)_in_a);  // load the ar + ai, br + bi as ar, ai, br, bi
            b = _mm256_load_si256((__m256i*)_in_b);  // load the cr + ci, dr + di as cr, ci, dr, di
            c = _mm256_mullo_epi16(a, b);
            c_sr = _mm256_srli_si256(c, 2);  // shift c right by 2 bytes (within each 128-bit lane) while shifting in zeros
            real = _mm256_subs_epi16(c, c_sr);
            real = _mm256_and_si256(real, mask_real);  // a3.r*b3.r - a3.i*b3.i, 0, ...
            b_sl = _mm256_slli_si256(b, 2);  // b3.r, b2.i ....
            a_sl = _mm256_slli_si256(a, 2);  // a3.r, a2.i ....
            imag1 = _mm256_mullo_epi16(a, b_sl);  // a3.i*b3.r, ....
            imag2 = _mm256_mullo_epi16(b, a_sl);  // b3.i*a3.r, ....
            imag = _mm256_adds_epi16(imag1, imag2);
            imag = _mm256_and_si256(imag, mask_imag);  // a3.i*b3.r + b3.i*a3.r, 0, ...
            result = _mm256_or_si256(real, imag);
@@ -284,7 +284,7 @@ static inline void volk_gnsssdr_16ic_x2_multiply_16ic_a_avx2(lv_16sc_t* out, con
        }
    _mm256_zeroupper();
    number = avx2_points * 8;
    for (; number < num_points; number++)
        {
            *_out++ = (*_in_a++) * (*_in_b++);
        }
@@ -292,23 +292,22 @@ static inline void volk_gnsssdr_16ic_x2_multiply_16ic_a_avx2(lv_16sc_t* out, con
#endif /* LV_HAVE_AVX2 */

#ifdef LV_HAVE_NEON
#include <arm_neon.h>

static inline void volk_gnsssdr_16ic_x2_multiply_16ic_neon(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points)
{
    lv_16sc_t* a_ptr = (lv_16sc_t*)in_a;
    lv_16sc_t* b_ptr = (lv_16sc_t*)in_b;
    unsigned int quarter_points = num_points / 4;
    int16x4x2_t a_val, b_val, c_val;
    int16x4x2_t tmp_real, tmp_imag;
    unsigned int number = 0;

    for (number = 0; number < quarter_points; ++number)
        {
            a_val = vld2_s16((int16_t*)a_ptr);  // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
            b_val = vld2_s16((int16_t*)b_ptr);  // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
            __VOLK_GNSSSDR_PREFETCH(a_ptr + 4);
            __VOLK_GNSSSDR_PREFETCH(b_ptr + 4);
@@ -334,7 +333,7 @@ static inline void volk_gnsssdr_16ic_x2_multiply_16ic_neon(lv_16sc_t* out, const
            out += 4;
        }
    for (number = quarter_points * 4; number < num_points; number++)
        {
            *out++ = (*a_ptr++) * (*b_ptr++);
        }
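The hunk elides the arithmetic between the vld2_s16 loads and the final store. Given the planar layout (.val[0] holds four real parts, .val[1] four imaginary parts), the missing middle is presumably the usual deinterleaved complex product; a reconstruction sketched with standard NEON intrinsics, not copied from this diff:

/* Assumed reconstruction of the elided loop body: */
tmp_real.val[0] = vmul_s16(a_val.val[0], b_val.val[0]);  /* ar*br */
tmp_real.val[1] = vmul_s16(a_val.val[1], b_val.val[1]);  /* ai*bi */
tmp_imag.val[0] = vmul_s16(a_val.val[0], b_val.val[1]);  /* ar*bi */
tmp_imag.val[1] = vmul_s16(a_val.val[1], b_val.val[0]);  /* ai*br */
c_val.val[0] = vsub_s16(tmp_real.val[0], tmp_real.val[1]);  /* real part */
c_val.val[1] = vadd_s16(tmp_imag.val[0], tmp_imag.val[1]);  /* imaginary part */
vst2_s16((int16_t*)out, c_val);  /* re-interleave on store */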
@@ -41,7 +41,7 @@
#include <string.h>

#ifdef LV_HAVE_GENERIC
static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_generic(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points)
{
    // phases must be normalized. Phase rotator expects a complex exponential input!
    float rem_carrier_phase_in_rad = 0.345;
@@ -53,14 +53,14 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_generic(lv_
    unsigned int n;
    int num_a_vectors = 3;
    lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
    for (n = 0; n < num_a_vectors; n++)
        {
            in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
            memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
        }
    volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_generic(result, local_code, phase_inc[0], phase, (const lv_16sc_t**)in_a, num_a_vectors, num_points);
    for (n = 0; n < num_a_vectors; n++)
        {
            volk_gnsssdr_free(in_a[n]);
        }
@@ -71,7 +71,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_generic(lv_
#ifdef LV_HAVE_GENERIC
static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_generic_reload(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points)
{
    // phases must be normalized. Phase rotator expects a complex exponential input!
    float rem_carrier_phase_in_rad = 0.345;
@@ -83,14 +83,14 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_generic_rel
    unsigned int n;
    int num_a_vectors = 3;
    lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
    for (n = 0; n < num_a_vectors; n++)
        {
            in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
            memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
        }
    volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_generic_reload(result, local_code, phase_inc[0], phase, (const lv_16sc_t**)in_a, num_a_vectors, num_points);
    for (n = 0; n < num_a_vectors; n++)
        {
            volk_gnsssdr_free(in_a[n]);
        }
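The comment "phases must be normalized" means the rotator kernels take the initial phase and the per-sample increment as unit-magnitude complex exponentials, not raw angles. A sketch of the setup the hunks elide (variable names and the 0.123 step value are assumptions, not taken from this diff):

/* Assumed setup, mirroring the normalization comment above: */
#include <math.h>
float phase_step_rad = 0.123f;  /* hypothetical per-sample carrier step */
lv_32fc_t phase[1], phase_inc[1];
phase[0] = lv_cmake(cosf(rem_carrier_phase_in_rad), sinf(rem_carrier_phase_in_rad));
phase_inc[0] = lv_cmake(cosf(phase_step_rad), sinf(phase_step_rad));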
@@ -113,22 +113,22 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_a_sse3(lv_1
    unsigned int n;
    int num_a_vectors = 3;
    lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
    for (n = 0; n < num_a_vectors; n++)
        {
            in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
            memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
        }
    volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3(result, local_code, phase_inc[0], phase, (const lv_16sc_t**)in_a, num_a_vectors, num_points);
    for (n = 0; n < num_a_vectors; n++)
        {
            volk_gnsssdr_free(in_a[n]);
        }
    volk_gnsssdr_free(in_a);
}
#endif // SSE3

#ifdef LV_HAVE_SSE3
@@ -144,22 +144,22 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_a_sse3_relo
    unsigned int n;
    int num_a_vectors = 3;
    lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
    for (n = 0; n < num_a_vectors; n++)
        {
            in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
            memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
        }
    volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(result, local_code, phase_inc[0], phase, (const lv_16sc_t**)in_a, num_a_vectors, num_points);
    for (n = 0; n < num_a_vectors; n++)
        {
            volk_gnsssdr_free(in_a[n]);
        }
    volk_gnsssdr_free(in_a);
}
#endif // SSE3

#ifdef LV_HAVE_SSE3
@@ -175,22 +175,22 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_u_sse3(lv_1
    unsigned int n;
    int num_a_vectors = 3;
    lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
    for (n = 0; n < num_a_vectors; n++)
        {
            in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
            memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
        }
    volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_u_sse3(result, local_code, phase_inc[0], phase, (const lv_16sc_t**)in_a, num_a_vectors, num_points);
    for (n = 0; n < num_a_vectors; n++)
        {
            volk_gnsssdr_free(in_a[n]);
        }
    volk_gnsssdr_free(in_a);
}
#endif // SSE3

#ifdef LV_HAVE_AVX2
@@ -206,22 +206,22 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_a_avx2(lv_1
    unsigned int n;
    int num_a_vectors = 3;
    lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
    for (n = 0; n < num_a_vectors; n++)
        {
            in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
            memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
        }
    volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(result, local_code, phase_inc[0], phase, (const lv_16sc_t**)in_a, num_a_vectors, num_points);
    for (n = 0; n < num_a_vectors; n++)
        {
            volk_gnsssdr_free(in_a[n]);
        }
    volk_gnsssdr_free(in_a);
}
#endif // AVX2

#ifdef LV_HAVE_AVX2
@@ -237,22 +237,22 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_a_avx2_relo
    unsigned int n;
    int num_a_vectors = 3;
    lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
    for (n = 0; n < num_a_vectors; n++)
        {
            in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
            memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
        }
    volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(result, local_code, phase_inc[0], phase, (const lv_16sc_t**)in_a, num_a_vectors, num_points);
    for (n = 0; n < num_a_vectors; n++)
        {
            volk_gnsssdr_free(in_a[n]);
        }
    volk_gnsssdr_free(in_a);
}
#endif // AVX2

#ifdef LV_HAVE_AVX2
@@ -268,22 +268,22 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_u_avx2(lv_1
    unsigned int n;
    int num_a_vectors = 3;
    lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
    for (n = 0; n < num_a_vectors; n++)
        {
            in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
            memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
        }
    volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(result, local_code, phase_inc[0], phase, (const lv_16sc_t**)in_a, num_a_vectors, num_points);
    for (n = 0; n < num_a_vectors; n++)
        {
            volk_gnsssdr_free(in_a[n]);
        }
    volk_gnsssdr_free(in_a);
}
#endif // AVX2

#ifdef LV_HAVE_AVX2
@@ -299,22 +299,22 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_u_avx2_relo
    unsigned int n;
    int num_a_vectors = 3;
    lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
    for (n = 0; n < num_a_vectors; n++)
        {
            in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
            memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
        }
    volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(result, local_code, phase_inc[0], phase, (const lv_16sc_t**)in_a, num_a_vectors, num_points);
    for (n = 0; n < num_a_vectors; n++)
        {
            volk_gnsssdr_free(in_a[n]);
        }
    volk_gnsssdr_free(in_a);
}
#endif // AVX2

#ifdef LV_HAVE_NEON
@@ -330,22 +330,22 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_neon(lv_16s
    unsigned int n;
    int num_a_vectors = 3;
    lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
    for (n = 0; n < num_a_vectors; n++)
        {
            in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
            memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
        }
    volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon(result, local_code, phase_inc[0], phase, (const lv_16sc_t**)in_a, num_a_vectors, num_points);
    for (n = 0; n < num_a_vectors; n++)
        {
            volk_gnsssdr_free(in_a[n]);
        }
    volk_gnsssdr_free(in_a);
}
#endif // NEON

#ifdef LV_HAVE_NEON
@@ -361,23 +361,21 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_neon_vma(lv
    unsigned int n;
    int num_a_vectors = 3;
    lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
    for (n = 0; n < num_a_vectors; n++)
        {
            in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
            memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
        }
    volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_vma(result, local_code, phase_inc[0], phase, (const lv_16sc_t**)in_a, num_a_vectors, num_points);
    for (n = 0; n < num_a_vectors; n++)
        {
            volk_gnsssdr_free(in_a[n]);
        }
    volk_gnsssdr_free(in_a);
}
#endif // NEON

#endif // INCLUDED_volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_H
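For orientation, the rotator dot product family computes, for each code replica v, the sum over n of (in[n] rotated by the running phase) multiplied by in_a[v][n], advancing the phase by one phase_inc multiplication per sample. A plain-C sketch of those assumed semantics (names hypothetical, 16-bit saturation ignored):

/* Assumed-semantics sketch; not the library implementation. */
float pr = lv_creal(phase[0]), pi = lv_cimag(phase[0]);
const float incr = lv_creal(phase_inc[0]), inci = lv_cimag(phase_inc[0]);
int32_t res_r[3] = {0, 0, 0}, res_i[3] = {0, 0, 0};
unsigned int n;
int v;
for (n = 0; n < num_points; n++)
    {
        /* rotate the common input sample by the running carrier phase */
        const float sr = (float)lv_creal(in[n]), si = (float)lv_cimag(in[n]);
        const int16_t rot_r = (int16_t)(sr * pr - si * pi);
        const int16_t rot_i = (int16_t)(sr * pi + si * pr);
        for (v = 0; v < num_a_vectors; v++)
            {
                /* accumulate the complex product with each code replica */
                res_r[v] += rot_r * lv_creal(in_a[v][n]) - rot_i * lv_cimag(in_a[v][n]);
                res_i[v] += rot_r * lv_cimag(in_a[v][n]) + rot_i * lv_creal(in_a[v][n]);
            }
        /* advance the phase: phase *= phase_inc */
        const float new_pr = pr * incr - pi * inci;
        pi = pr * inci + pi * incr;
        pr = new_pr;
    }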
@@ -106,7 +106,8 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_sse4_1(lv_16sc_t** r
    const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
    const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
    __VOLK_ATTR_ALIGNED(16)
    int local_code_chip_index[4];
    int local_code_chip_index_;

    const __m128i zeros = _mm_setzero_si128();
@@ -120,7 +121,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_sse4_1(lv_16sc_t** r
            shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]);
            aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
            __m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
            for (n = 0; n < quarterPoints; n++)
                {
                    aux = _mm_mul_ps(code_phase_step_chips_reg, indexn);
                    aux = _mm_add_ps(aux, aux2);
@@ -138,13 +139,13 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_sse4_1(lv_16sc_t** r
                    aux_i = _mm_and_si128(code_length_chips_reg_i, negatives);
                    local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i);
                    _mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg);
                    for (k = 0; k < 4; ++k)
                        {
                            _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
                        }
                    indexn = _mm_add_ps(indexn, fours);
                }
            for (n = quarterPoints * 4; n < num_points; n++)
                {
                    // resample code for current tap
                    local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
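The hunk cuts off right after the floor(); the scalar tail presumably wraps the index into [0, code_length_chips) before reading the local code, along these lines:

/* Assumed continuation of the scalar tail shown above: */
local_code_chip_index_ = local_code_chip_index_ % (int)code_length_chips;
if (local_code_chip_index_ < 0)
    {
        local_code_chip_index_ += (int)code_length_chips;  /* C's % can be negative */
    }
_result[current_correlator_tap][n] = local_code[local_code_chip_index_];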
@@ -172,7 +173,8 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_sse4_1(lv_16sc_t** r
    const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
    const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
    __VOLK_ATTR_ALIGNED(16)
    int local_code_chip_index[4];
    int local_code_chip_index_;

    const __m128i zeros = _mm_setzero_si128();
@@ -186,7 +188,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_sse4_1(lv_16sc_t** r
            shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]);
            aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
            __m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
            for (n = 0; n < quarterPoints; n++)
                {
                    aux = _mm_mul_ps(code_phase_step_chips_reg, indexn);
                    aux = _mm_add_ps(aux, aux2);
@@ -204,13 +206,13 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_sse4_1(lv_16sc_t** r
                    aux_i = _mm_and_si128(code_length_chips_reg_i, negatives);
                    local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i);
                    _mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg);
                    for (k = 0; k < 4; ++k)
                        {
                            _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
                        }
                    indexn = _mm_add_ps(indexn, fours);
                }
            for (n = quarterPoints * 4; n < num_points; n++)
                {
                    // resample code for current tap
                    local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
@@ -239,7 +241,8 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_sse3(lv_16sc_t** res
    const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
    const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
    __VOLK_ATTR_ALIGNED(16)
    int local_code_chip_index[4];
    int local_code_chip_index_;

    const __m128i zeros = _mm_setzero_si128();
@@ -253,7 +256,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_sse3(lv_16sc_t** res
            shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]);
            aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
            __m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
            for (n = 0; n < quarterPoints; n++)
                {
                    aux = _mm_mul_ps(code_phase_step_chips_reg, indexn);
                    aux = _mm_add_ps(aux, aux2);
@@ -274,13 +277,13 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_sse3(lv_16sc_t** res
                    aux_i = _mm_and_si128(code_length_chips_reg_i, negatives);
                    local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i);
                    _mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg);
                    for (k = 0; k < 4; ++k)
                        {
                            _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
                        }
                    indexn = _mm_add_ps(indexn, fours);
                }
            for (n = quarterPoints * 4; n < num_points; n++)
                {
                    // resample code for current tap
                    local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
@@ -309,7 +312,8 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_sse3(lv_16sc_t** res
    const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
    const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
    __VOLK_ATTR_ALIGNED(16)
    int local_code_chip_index[4];
    int local_code_chip_index_;

    const __m128i zeros = _mm_setzero_si128();
@@ -323,7 +327,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_sse3(lv_16sc_t** res
            shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]);
            aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
            __m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
            for (n = 0; n < quarterPoints; n++)
                {
                    aux = _mm_mul_ps(code_phase_step_chips_reg, indexn);
                    aux = _mm_add_ps(aux, aux2);
@@ -344,13 +348,13 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_sse3(lv_16sc_t** res
                    aux_i = _mm_and_si128(code_length_chips_reg_i, negatives);
                    local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i);
                    _mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg);
                    for (k = 0; k < 4; ++k)
                        {
                            _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
                        }
                    indexn = _mm_add_ps(indexn, fours);
                }
            for (n = quarterPoints * 4; n < num_points; n++)
                {
                    // resample code for current tap
                    local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
@@ -378,7 +382,8 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_avx(lv_16sc_t** resu
    const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips);
    const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips);
    __VOLK_ATTR_ALIGNED(32)
    int local_code_chip_index[8];
    int local_code_chip_index_;

    const __m256 zeros = _mm256_setzero_ps();
@@ -393,7 +398,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_avx(lv_16sc_t** resu
            shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]);
            aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
            indexn = n0;
            for (n = 0; n < avx_iters; n++)
                {
                    __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0);
                    __VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3);
@@ -411,13 +416,13 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_avx(lv_16sc_t** resu
                    // no negatives
                    c = _mm256_cvtepi32_ps(local_code_chip_index_reg);
                    negatives = _mm256_cmp_ps(c, zeros, 0x01);
                    aux3 = _mm256_and_ps(code_length_chips_reg_f, negatives);
                    aux = _mm256_add_ps(c, aux3);
                    local_code_chip_index_reg = _mm256_cvttps_epi32(aux);
                    _mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg);
                    for (k = 0; k < 8; ++k)
                        {
                            _result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]];
                        }
@@ -427,7 +432,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_avx(lv_16sc_t** resu
    _mm256_zeroupper();
    for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
        {
            for (n = avx_iters * 8; n < num_points; n++)
                {
                    // resample code for current tap
                    local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
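The magic constant in _mm256_cmp_ps(c, zeros, 0x01) is AVX's _CMP_LT_OS predicate (ordered, signaling less-than): lanes holding a negative index become all-ones, so ANDing the mask with the code length and adding it wraps those lanes back into range. The self-documenting equivalent:

/* Same comparison as above, using immintrin.h's named predicate: */
negatives = _mm256_cmp_ps(c, zeros, _CMP_LT_OS);  /* mask of lanes where c < 0 */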
@@ -455,7 +460,8 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_avx(lv_16sc_t** resu
    const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips);
    const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips);
    __VOLK_ATTR_ALIGNED(32)
    int local_code_chip_index[8];
    int local_code_chip_index_;

    const __m256 zeros = _mm256_setzero_ps();
@@ -470,7 +476,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_avx(lv_16sc_t** resu
            shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]);
            aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
            indexn = n0;
            for (n = 0; n < avx_iters; n++)
                {
                    __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0);
                    __VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3);
@@ -488,13 +494,13 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_avx(lv_16sc_t** resu
                    // no negatives
                    c = _mm256_cvtepi32_ps(local_code_chip_index_reg);
                    negatives = _mm256_cmp_ps(c, zeros, 0x01);
                    aux3 = _mm256_and_ps(code_length_chips_reg_f, negatives);
                    aux = _mm256_add_ps(c, aux3);
                    local_code_chip_index_reg = _mm256_cvttps_epi32(aux);
                    _mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg);
                    for (k = 0; k < 8; ++k)
                        {
                            _result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]];
                        }
@@ -504,7 +510,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_avx(lv_16sc_t** resu
    _mm256_zeroupper();
    for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
        {
            for (n = avx_iters * 8; n < num_points; n++)
                {
                    // resample code for current tap
                    local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
@@ -530,7 +536,8 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_neon(lv_16sc_t** resul
    const float32x4_t rem_code_phase_chips_reg = vdupq_n_f32(rem_code_phase_chips);
    const float32x4_t code_phase_step_chips_reg = vdupq_n_f32(code_phase_step_chips);
    __VOLK_ATTR_ALIGNED(16)
    int32_t local_code_chip_index[4];
    int32_t local_code_chip_index_;

    const int32x4_t zeros = vdupq_n_s32(0);
@@ -538,11 +545,12 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_neon(lv_16sc_t** resul
    const int32x4_t code_length_chips_reg_i = vdupq_n_s32((int32_t)code_length_chips);
    int32x4_t local_code_chip_index_reg, aux_i, negatives, i;
    float32x4_t aux, aux2, shifts_chips_reg, fi, c, j, cTrunc, base, indexn, reciprocal;
    __VOLK_ATTR_ALIGNED(16)
    const float vec[4] = {0.0f, 1.0f, 2.0f, 3.0f};
    uint32x4_t igx;
    reciprocal = vrecpeq_f32(code_length_chips_reg_f);
    reciprocal = vmulq_f32(vrecpsq_f32(code_length_chips_reg_f, reciprocal), reciprocal);
    reciprocal = vmulq_f32(vrecpsq_f32(code_length_chips_reg_f, reciprocal), reciprocal);  // this refinement is required!
    float32x4_t n0 = vld1q_f32((float*)vec);
    int current_correlator_tap;
    unsigned int n;
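vrecpeq_f32 only returns a coarse (~8-bit) reciprocal estimate; each vrecpsq_f32/vmulq_f32 pair above is one Newton-Raphson step, since VRECPS computes (2 - d*r), and two steps are needed to approach full single precision, hence the "refinement is required" comment. In isolation:

/* Newton-Raphson reciprocal refinement, r' = r * (2 - d * r): */
float32x4_t d = code_length_chips_reg_f;  /* the divisor vector */
float32x4_t r = vrecpeq_f32(d);           /* ~8 significant bits */
r = vmulq_f32(vrecpsq_f32(d, r), r);      /* first refinement step */
r = vmulq_f32(vrecpsq_f32(d, r), r);      /* second step: close to 1.0f / d */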
@@ -552,7 +560,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_neon(lv_16sc_t** resul
            shifts_chips_reg = vdupq_n_f32((float)shifts_chips[current_correlator_tap]);
            aux2 = vsubq_f32(shifts_chips_reg, rem_code_phase_chips_reg);
            indexn = n0;
            for (n = 0; n < neon_iters; n++)
                {
                    __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][4 * n + 3], 1, 0);
                    __VOLK_GNSSSDR_PREFETCH(&local_code_chip_index[4]);
@@ -568,7 +576,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_neon(lv_16sc_t** resul
                    // fmod
                    c = vmulq_f32(aux, reciprocal);
                    i = vcvtq_s32_f32(c);
                    cTrunc = vcvtq_f32_s32(i);
                    base = vmulq_f32(cTrunc, code_length_chips_reg_f);
                    aux = vsubq_f32(aux, base);
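This block is a vectorized fmod: multiply by the precomputed reciprocal, truncate toward zero (vcvtq_s32_f32 truncates), scale back up, and subtract. A scalar equivalent for one lane (names hypothetical):

/* Scalar sketch of the lane-wise fmod above: */
float x = aux_lane;                             /* code phase in chips */
float q = x * recip_code_length;                /* x / code_length via reciprocal */
float fm = x - truncf(q) * code_length_chips_f; /* == fmodf(x, code_length) here */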
@@ -580,13 +588,13 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_neon(lv_16sc_t** resul
                    vst1q_s32((int32_t*)local_code_chip_index, local_code_chip_index_reg);
                    for (k = 0; k < 4; ++k)
                        {
                            _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
                        }
                    indexn = vaddq_f32(indexn, fours);
                }
            for (n = neon_iters * 4; n < num_points; n++)
                {
                    __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][n], 1, 0);
                    // resample code for current tap
@@ -604,4 +612,3 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_neon(lv_16sc_t** resul
#endif /*INCLUDED_volk_gnsssdr_16ic_xn_resampler_16ic_xn_H*/
@@ -95,69 +95,74 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_generic(lv_16sc_t
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>

static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_a_sse2(lv_16sc_t** result, const lv_16sc_t* local_code, float* rem_code_phase_chips, float code_phase_step_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_output_samples)
{
    _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);  // _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO
    unsigned int number;
    const unsigned int quarterPoints = num_output_samples / 4;

    lv_16sc_t** _result = result;
    __VOLK_ATTR_ALIGNED(16)
    int local_code_chip_index[4];
    float tmp_rem_code_phase_chips;
    __m128 _rem_code_phase, _code_phase_step_chips;
    __m128i _code_length_chips, _code_length_chips_minus1;
    __m128 _code_phase_out, _code_phase_out_with_offset;

    _code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips);  // broadcast the float to all four lanes of the m128 register
    __VOLK_ATTR_ALIGNED(16)
    int four_times_code_length_chips_minus1[4];
    four_times_code_length_chips_minus1[0] = code_length_chips - 1;
    four_times_code_length_chips_minus1[1] = code_length_chips - 1;
    four_times_code_length_chips_minus1[2] = code_length_chips - 1;
    four_times_code_length_chips_minus1[3] = code_length_chips - 1;

    __VOLK_ATTR_ALIGNED(16)
    int four_times_code_length_chips[4];
    four_times_code_length_chips[0] = code_length_chips;
    four_times_code_length_chips[1] = code_length_chips;
    four_times_code_length_chips[2] = code_length_chips;
    four_times_code_length_chips[3] = code_length_chips;

    _code_length_chips = _mm_load_si128((__m128i*)&four_times_code_length_chips);  // load the four integer copies into the m128i register
    _code_length_chips_minus1 = _mm_load_si128((__m128i*)&four_times_code_length_chips_minus1);  // same for code_length_chips - 1

    __m128i negative_indexes, overflow_indexes, _code_phase_out_int, _code_phase_out_int_neg, _code_phase_out_int_over;
    __m128i zero = _mm_setzero_si128();

    __VOLK_ATTR_ALIGNED(16)
    float init_idx_float[4] = {0.0f, 1.0f, 2.0f, 3.0f};
    __m128 _4output_index = _mm_load_ps(init_idx_float);
    __VOLK_ATTR_ALIGNED(16)
    float init_4constant_float[4] = {4.0f, 4.0f, 4.0f, 4.0f};
    __m128 _4constant_float = _mm_load_ps(init_4constant_float);

    int current_vector = 0;
    int sample_idx = 0;
    for (number = 0; number < quarterPoints; number++)
        {
            // common to all outputs
            _code_phase_out = _mm_mul_ps(_code_phase_step_chips, _4output_index);  // compute the code phase point with the phase step
            // output vector dependent (different code phase offset)
            for (current_vector = 0; current_vector < num_out_vectors; current_vector++)
                {
                    tmp_rem_code_phase_chips = rem_code_phase_chips[current_vector] - 0.5f;  // adjust offset to perform correct rounding (chip transition at 0)
                    _rem_code_phase = _mm_load1_ps(&tmp_rem_code_phase_chips);  // broadcast the float to all four lanes
                    _code_phase_out_with_offset = _mm_add_ps(_code_phase_out, _rem_code_phase);  // add the phase offset
                    _code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset);  // convert to integer
                    negative_indexes = _mm_cmplt_epi32(_code_phase_out_int, zero);  // test for negative values
_code_phase_out_int_neg = _mm_add_epi32(_code_phase_out_int, _code_length_chips); //the negative values branch _code_phase_out_int_neg = _mm_add_epi32(_code_phase_out_int, _code_length_chips); //the negative values branch
_code_phase_out_int_neg = _mm_xor_si128(_code_phase_out_int, _mm_and_si128( negative_indexes, _mm_xor_si128( _code_phase_out_int_neg, _code_phase_out_int ))); _code_phase_out_int_neg = _mm_xor_si128(_code_phase_out_int, _mm_and_si128(negative_indexes, _mm_xor_si128(_code_phase_out_int_neg, _code_phase_out_int)));
overflow_indexes = _mm_cmpgt_epi32(_code_phase_out_int_neg, _code_length_chips_minus1); //test for overflow values overflow_indexes = _mm_cmpgt_epi32(_code_phase_out_int_neg, _code_length_chips_minus1); //test for overflow values
_code_phase_out_int_over = _mm_sub_epi32(_code_phase_out_int_neg, _code_length_chips); //the negative values branch _code_phase_out_int_over = _mm_sub_epi32(_code_phase_out_int_neg, _code_length_chips); //the negative values branch
_code_phase_out_int_over = _mm_xor_si128(_code_phase_out_int_neg, _mm_and_si128( overflow_indexes, _mm_xor_si128( _code_phase_out_int_over, _code_phase_out_int_neg ))); _code_phase_out_int_over = _mm_xor_si128(_code_phase_out_int_neg, _mm_and_si128(overflow_indexes, _mm_xor_si128(_code_phase_out_int_over, _code_phase_out_int_neg)));
_mm_store_si128((__m128i*)local_code_chip_index, _code_phase_out_int_over); // Store the results back _mm_store_si128((__m128i*)local_code_chip_index, _code_phase_out_int_over); // Store the results back
//todo: optimize the local code lookup table with intrinsics, if possible //todo: optimize the local code lookup table with intrinsics, if possible
_result[current_vector][sample_idx] = local_code[local_code_chip_index[0]]; _result[current_vector][sample_idx] = local_code[local_code_chip_index[0]];
@ -169,9 +174,9 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_a_sse2(lv_16sc_t*
sample_idx += 4; sample_idx += 4;
} }
for(number = quarterPoints * 4; number < num_output_samples; number++) for (number = quarterPoints * 4; number < num_output_samples; number++)
{ {
for(current_vector = 0; current_vector < num_out_vectors; current_vector++) for (current_vector = 0; current_vector < num_out_vectors; current_vector++)
{ {
local_code_chip_index[0] = (int)(code_phase_step_chips * (float)(number) + rem_code_phase_chips[current_vector]); local_code_chip_index[0] = (int)(code_phase_step_chips * (float)(number) + rem_code_phase_chips[current_vector]);
if (local_code_chip_index[0] < 0.0) local_code_chip_index[0] += code_length_chips - 1; if (local_code_chip_index[0] < 0.0) local_code_chip_index[0] += code_length_chips - 1;
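
The under/overflow wrap in this kernel never branches: it uses the identity r = a ^ (mask & (b ^ a)), which yields b in every lane where mask is all-ones (the compare fired) and a where it is all-zeros. SSE2 has no blendv instruction, so the compare result itself serves as the mask; note also that _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST) is configured up front so _mm_cvtps_epi32 rounds rather than truncates. A scalar sketch of the select identity (names are illustrative):

#include <stdint.h>

/* Branchless select: yields b where mask bits are set, a elsewhere.
 * With a SIMD compare producing all-ones/all-zeros lanes, this applies
 * the wrapped index only to lanes that actually went out of range. */
static inline int32_t select_bits(int32_t mask, int32_t a, int32_t b)
{
    return a ^ (mask & (b ^ a));
}

/* Example: fold a possibly negative code-phase index back into range. */
static inline int32_t wrap_negative(int32_t idx, int32_t len)
{
    int32_t mask = idx >> 31;  /* all-ones iff idx < 0 (arithmetic shift assumed) */
    return select_bits(mask, idx, idx + len);
}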
@ -186,69 +191,74 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_a_sse2(lv_16sc_t*
#ifdef LV_HAVE_SSE2 #ifdef LV_HAVE_SSE2
#include <emmintrin.h> #include <emmintrin.h>
static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_u_sse2(lv_16sc_t** result, const lv_16sc_t* local_code, float* rem_code_phase_chips ,float code_phase_step_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_output_samples) static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_u_sse2(lv_16sc_t** result, const lv_16sc_t* local_code, float* rem_code_phase_chips, float code_phase_step_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_output_samples)
{ {
_MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);//_MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); //_MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO
unsigned int number; unsigned int number;
const unsigned int quarterPoints = num_output_samples / 4; const unsigned int quarterPoints = num_output_samples / 4;
lv_16sc_t** _result = result; lv_16sc_t** _result = result;
__VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; __VOLK_ATTR_ALIGNED(16)
int local_code_chip_index[4];
float tmp_rem_code_phase_chips; float tmp_rem_code_phase_chips;
__m128 _rem_code_phase,_code_phase_step_chips; __m128 _rem_code_phase, _code_phase_step_chips;
__m128i _code_length_chips,_code_length_chips_minus1; __m128i _code_length_chips, _code_length_chips_minus1;
__m128 _code_phase_out,_code_phase_out_with_offset; __m128 _code_phase_out, _code_phase_out_with_offset;
_code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips); //load float to all four float values in m128 register _code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips); //load float to all four float values in m128 register
__VOLK_ATTR_ALIGNED(16) int four_times_code_length_chips_minus1[4]; __VOLK_ATTR_ALIGNED(16)
int four_times_code_length_chips_minus1[4];
four_times_code_length_chips_minus1[0] = code_length_chips - 1; four_times_code_length_chips_minus1[0] = code_length_chips - 1;
four_times_code_length_chips_minus1[1] = code_length_chips - 1; four_times_code_length_chips_minus1[1] = code_length_chips - 1;
four_times_code_length_chips_minus1[2] = code_length_chips - 1; four_times_code_length_chips_minus1[2] = code_length_chips - 1;
four_times_code_length_chips_minus1[3] = code_length_chips - 1; four_times_code_length_chips_minus1[3] = code_length_chips - 1;
__VOLK_ATTR_ALIGNED(16) int four_times_code_length_chips[4]; __VOLK_ATTR_ALIGNED(16)
int four_times_code_length_chips[4];
four_times_code_length_chips[0] = code_length_chips; four_times_code_length_chips[0] = code_length_chips;
four_times_code_length_chips[1] = code_length_chips; four_times_code_length_chips[1] = code_length_chips;
four_times_code_length_chips[2] = code_length_chips; four_times_code_length_chips[2] = code_length_chips;
four_times_code_length_chips[3] = code_length_chips; four_times_code_length_chips[3] = code_length_chips;
_code_length_chips = _mm_loadu_si128((__m128i*)&four_times_code_length_chips); //load float to all four float values in m128 register _code_length_chips = _mm_loadu_si128((__m128i*)&four_times_code_length_chips); //load float to all four float values in m128 register
_code_length_chips_minus1 = _mm_loadu_si128((__m128i*)&four_times_code_length_chips_minus1); //load float to all four float values in m128 register _code_length_chips_minus1 = _mm_loadu_si128((__m128i*)&four_times_code_length_chips_minus1); //load float to all four float values in m128 register
__m128i negative_indexes, overflow_indexes,_code_phase_out_int, _code_phase_out_int_neg,_code_phase_out_int_over; __m128i negative_indexes, overflow_indexes, _code_phase_out_int, _code_phase_out_int_neg, _code_phase_out_int_over;
__m128i zero = _mm_setzero_si128(); __m128i zero = _mm_setzero_si128();
__VOLK_ATTR_ALIGNED(16) float init_idx_float[4] = { 0.0f, 1.0f, 2.0f, 3.0f }; __VOLK_ATTR_ALIGNED(16)
float init_idx_float[4] = {0.0f, 1.0f, 2.0f, 3.0f};
__m128 _4output_index = _mm_loadu_ps(init_idx_float); __m128 _4output_index = _mm_loadu_ps(init_idx_float);
__VOLK_ATTR_ALIGNED(16) float init_4constant_float[4] = { 4.0f, 4.0f, 4.0f, 4.0f }; __VOLK_ATTR_ALIGNED(16)
float init_4constant_float[4] = {4.0f, 4.0f, 4.0f, 4.0f};
__m128 _4constant_float = _mm_loadu_ps(init_4constant_float); __m128 _4constant_float = _mm_loadu_ps(init_4constant_float);
int current_vector = 0; int current_vector = 0;
int sample_idx = 0; int sample_idx = 0;
for(number = 0; number < quarterPoints; number++) for (number = 0; number < quarterPoints; number++)
{ {
//common to all outputs //common to all outputs
_code_phase_out = _mm_mul_ps(_code_phase_step_chips, _4output_index); //compute the code phase point with the phase step _code_phase_out = _mm_mul_ps(_code_phase_step_chips, _4output_index); //compute the code phase point with the phase step
//output vector dependant (different code phase offset) //output vector dependant (different code phase offset)
for(current_vector = 0; current_vector < num_out_vectors; current_vector++) for (current_vector = 0; current_vector < num_out_vectors; current_vector++)
{ {
tmp_rem_code_phase_chips = rem_code_phase_chips[current_vector] - 0.5f; // adjust offset to perform correct rounding (chip transition at 0) tmp_rem_code_phase_chips = rem_code_phase_chips[current_vector] - 0.5f; // adjust offset to perform correct rounding (chip transition at 0)
_rem_code_phase = _mm_load1_ps(&tmp_rem_code_phase_chips); //load float to all four float values in m128 register _rem_code_phase = _mm_load1_ps(&tmp_rem_code_phase_chips); //load float to all four float values in m128 register
_code_phase_out_with_offset = _mm_add_ps(_code_phase_out, _rem_code_phase); //add the phase offset _code_phase_out_with_offset = _mm_add_ps(_code_phase_out, _rem_code_phase); //add the phase offset
_code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset); //convert to integer _code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset); //convert to integer
negative_indexes = _mm_cmplt_epi32(_code_phase_out_int, zero); //test for negative values negative_indexes = _mm_cmplt_epi32(_code_phase_out_int, zero); //test for negative values
_code_phase_out_int_neg = _mm_add_epi32(_code_phase_out_int, _code_length_chips); //the negative values branch _code_phase_out_int_neg = _mm_add_epi32(_code_phase_out_int, _code_length_chips); //the negative values branch
_code_phase_out_int_neg = _mm_xor_si128(_code_phase_out_int, _mm_and_si128( negative_indexes, _mm_xor_si128( _code_phase_out_int_neg, _code_phase_out_int ))); _code_phase_out_int_neg = _mm_xor_si128(_code_phase_out_int, _mm_and_si128(negative_indexes, _mm_xor_si128(_code_phase_out_int_neg, _code_phase_out_int)));
overflow_indexes = _mm_cmpgt_epi32(_code_phase_out_int_neg, _code_length_chips_minus1); //test for overflow values overflow_indexes = _mm_cmpgt_epi32(_code_phase_out_int_neg, _code_length_chips_minus1); //test for overflow values
_code_phase_out_int_over = _mm_sub_epi32(_code_phase_out_int_neg, _code_length_chips); //the negative values branch _code_phase_out_int_over = _mm_sub_epi32(_code_phase_out_int_neg, _code_length_chips); //the negative values branch
_code_phase_out_int_over = _mm_xor_si128(_code_phase_out_int_neg, _mm_and_si128( overflow_indexes, _mm_xor_si128( _code_phase_out_int_over, _code_phase_out_int_neg ))); _code_phase_out_int_over = _mm_xor_si128(_code_phase_out_int_neg, _mm_and_si128(overflow_indexes, _mm_xor_si128(_code_phase_out_int_over, _code_phase_out_int_neg)));
_mm_storeu_si128((__m128i*)local_code_chip_index, _code_phase_out_int_over); // Store the results back _mm_storeu_si128((__m128i*)local_code_chip_index, _code_phase_out_int_over); // Store the results back
//todo: optimize the local code lookup table with intrinsics, if possible //todo: optimize the local code lookup table with intrinsics, if possible
_result[current_vector][sample_idx] = local_code[local_code_chip_index[0]]; _result[current_vector][sample_idx] = local_code[local_code_chip_index[0]];
@ -260,9 +270,9 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_u_sse2(lv_16sc_t*
sample_idx += 4; sample_idx += 4;
} }
for(number = quarterPoints * 4; number < num_output_samples; number++) for (number = quarterPoints * 4; number < num_output_samples; number++)
{ {
for(current_vector = 0; current_vector < num_out_vectors; current_vector++) for (current_vector = 0; current_vector < num_out_vectors; current_vector++)
{ {
local_code_chip_index[0] = (int)(code_phase_step_chips * (float)(number) + rem_code_phase_chips[current_vector]); local_code_chip_index[0] = (int)(code_phase_step_chips * (float)(number) + rem_code_phase_chips[current_vector]);
if (local_code_chip_index[0] < 0.0) local_code_chip_index[0] += code_length_chips - 1; if (local_code_chip_index[0] < 0.0) local_code_chip_index[0] += code_length_chips - 1;
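
The _a_sse2 and _u_sse2 bodies are identical except for their memory instructions: the aligned variant uses _mm_load_ps/_mm_load_si128/_mm_store_si128, while this one substitutes the unaligned _mm_loadu_*/_mm_storeu_* forms. A VOLK-style dispatcher selects the _a kernel only when every buffer satisfies the machine alignment; a hedged sketch of that test (buffers_are_aligned is illustrative, not library API):

#include <stdint.h>
#include <volk_gnsssdr/volk_gnsssdr.h>

/* Sketch: the aligned (_a) kernel is only safe when every pointer is
 * aligned. volk_gnsssdr_get_alignment() returns a power of two, so one
 * mask test over the OR of the addresses covers all buffers at once. */
static inline int buffers_are_aligned(const void* a, const void* b)
{
    const uintptr_t mask = (uintptr_t)volk_gnsssdr_get_alignment() - 1;
    return (((uintptr_t)a | (uintptr_t)b) & mask) == 0;
}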
@ -278,74 +288,79 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_u_sse2(lv_16sc_t*
#ifdef LV_HAVE_NEON #ifdef LV_HAVE_NEON
#include <arm_neon.h> #include <arm_neon.h>
static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_neon(lv_16sc_t** result, const lv_16sc_t* local_code, float* rem_code_phase_chips ,float code_phase_step_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_output_samples) static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_neon(lv_16sc_t** result, const lv_16sc_t* local_code, float* rem_code_phase_chips, float code_phase_step_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_output_samples)
{ {
unsigned int number; unsigned int number;
const unsigned int quarterPoints = num_output_samples / 4; const unsigned int quarterPoints = num_output_samples / 4;
float32x4_t half = vdupq_n_f32(0.5f); float32x4_t half = vdupq_n_f32(0.5f);
lv_16sc_t** _result = result; lv_16sc_t** _result = result;
__VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; __VOLK_ATTR_ALIGNED(16)
int local_code_chip_index[4];
float tmp_rem_code_phase_chips; float tmp_rem_code_phase_chips;
float32x4_t _rem_code_phase, _code_phase_step_chips; float32x4_t _rem_code_phase, _code_phase_step_chips;
int32x4_t _code_length_chips, _code_length_chips_minus1; int32x4_t _code_length_chips, _code_length_chips_minus1;
float32x4_t _code_phase_out, _code_phase_out_with_offset; float32x4_t _code_phase_out, _code_phase_out_with_offset;
float32x4_t sign, PlusHalf, Round; float32x4_t sign, PlusHalf, Round;
_code_phase_step_chips = vld1q_dup_f32(&code_phase_step_chips); //load float to all four float values in float32x4_t register _code_phase_step_chips = vld1q_dup_f32(&code_phase_step_chips); //load float to all four float values in float32x4_t register
__VOLK_ATTR_ALIGNED(16) int four_times_code_length_chips_minus1[4]; __VOLK_ATTR_ALIGNED(16)
int four_times_code_length_chips_minus1[4];
four_times_code_length_chips_minus1[0] = code_length_chips - 1; four_times_code_length_chips_minus1[0] = code_length_chips - 1;
four_times_code_length_chips_minus1[1] = code_length_chips - 1; four_times_code_length_chips_minus1[1] = code_length_chips - 1;
four_times_code_length_chips_minus1[2] = code_length_chips - 1; four_times_code_length_chips_minus1[2] = code_length_chips - 1;
four_times_code_length_chips_minus1[3] = code_length_chips - 1; four_times_code_length_chips_minus1[3] = code_length_chips - 1;
__VOLK_ATTR_ALIGNED(16) int four_times_code_length_chips[4]; __VOLK_ATTR_ALIGNED(16)
int four_times_code_length_chips[4];
four_times_code_length_chips[0] = code_length_chips; four_times_code_length_chips[0] = code_length_chips;
four_times_code_length_chips[1] = code_length_chips; four_times_code_length_chips[1] = code_length_chips;
four_times_code_length_chips[2] = code_length_chips; four_times_code_length_chips[2] = code_length_chips;
four_times_code_length_chips[3] = code_length_chips; four_times_code_length_chips[3] = code_length_chips;
_code_length_chips = vld1q_s32((int32_t*)&four_times_code_length_chips); //load float to all four float values in float32x4_t register _code_length_chips = vld1q_s32((int32_t*)&four_times_code_length_chips); //load float to all four float values in float32x4_t register
_code_length_chips_minus1 = vld1q_s32((int32_t*)&four_times_code_length_chips_minus1); //load float to all four float values in float32x4_t register _code_length_chips_minus1 = vld1q_s32((int32_t*)&four_times_code_length_chips_minus1); //load float to all four float values in float32x4_t register
int32x4_t _code_phase_out_int, _code_phase_out_int_neg, _code_phase_out_int_over; int32x4_t _code_phase_out_int, _code_phase_out_int_neg, _code_phase_out_int_over;
uint32x4_t negative_indexes, overflow_indexes; uint32x4_t negative_indexes, overflow_indexes;
int32x4_t zero = vmovq_n_s32(0); int32x4_t zero = vmovq_n_s32(0);
__VOLK_ATTR_ALIGNED(16) float init_idx_float[4] = { 0.0f, 1.0f, 2.0f, 3.0f }; __VOLK_ATTR_ALIGNED(16)
float init_idx_float[4] = {0.0f, 1.0f, 2.0f, 3.0f};
float32x4_t _4output_index = vld1q_f32(init_idx_float); float32x4_t _4output_index = vld1q_f32(init_idx_float);
__VOLK_ATTR_ALIGNED(16) float init_4constant_float[4] = { 4.0f, 4.0f, 4.0f, 4.0f }; __VOLK_ATTR_ALIGNED(16)
float init_4constant_float[4] = {4.0f, 4.0f, 4.0f, 4.0f};
float32x4_t _4constant_float = vld1q_f32(init_4constant_float); float32x4_t _4constant_float = vld1q_f32(init_4constant_float);
int current_vector = 0; int current_vector = 0;
int sample_idx = 0; int sample_idx = 0;
for(number = 0; number < quarterPoints; number++) for (number = 0; number < quarterPoints; number++)
{ {
//common to all outputs //common to all outputs
_code_phase_out = vmulq_f32(_code_phase_step_chips, _4output_index); //compute the code phase point with the phase step _code_phase_out = vmulq_f32(_code_phase_step_chips, _4output_index); //compute the code phase point with the phase step
//output vector dependant (different code phase offset) //output vector dependant (different code phase offset)
for(current_vector = 0; current_vector < num_out_vectors; current_vector++) for (current_vector = 0; current_vector < num_out_vectors; current_vector++)
{ {
tmp_rem_code_phase_chips = rem_code_phase_chips[current_vector] - 0.5f; // adjust offset to perform correct rounding (chip transition at 0) tmp_rem_code_phase_chips = rem_code_phase_chips[current_vector] - 0.5f; // adjust offset to perform correct rounding (chip transition at 0)
_rem_code_phase = vld1q_dup_f32(&tmp_rem_code_phase_chips); //load float to all four float values in float32x4_t register _rem_code_phase = vld1q_dup_f32(&tmp_rem_code_phase_chips); //load float to all four float values in float32x4_t register
_code_phase_out_with_offset = vaddq_f32(_code_phase_out, _rem_code_phase); //add the phase offset _code_phase_out_with_offset = vaddq_f32(_code_phase_out, _rem_code_phase); //add the phase offset
//_code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset); //convert to integer //_code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset); //convert to integer
sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(_code_phase_out_with_offset), 31))); sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(_code_phase_out_with_offset), 31)));
PlusHalf = vaddq_f32(_code_phase_out_with_offset, half); PlusHalf = vaddq_f32(_code_phase_out_with_offset, half);
Round = vsubq_f32(PlusHalf, sign); Round = vsubq_f32(PlusHalf, sign);
_code_phase_out_int = vcvtq_s32_f32(Round); _code_phase_out_int = vcvtq_s32_f32(Round);
negative_indexes = vcltq_s32(_code_phase_out_int, zero); //test for negative values negative_indexes = vcltq_s32(_code_phase_out_int, zero); //test for negative values
_code_phase_out_int_neg = vaddq_s32(_code_phase_out_int, _code_length_chips); //the negative values branch _code_phase_out_int_neg = vaddq_s32(_code_phase_out_int, _code_length_chips); //the negative values branch
_code_phase_out_int_neg = veorq_s32(_code_phase_out_int, vandq_s32( (int32x4_t)negative_indexes, veorq_s32( _code_phase_out_int_neg, _code_phase_out_int ))); _code_phase_out_int_neg = veorq_s32(_code_phase_out_int, vandq_s32((int32x4_t)negative_indexes, veorq_s32(_code_phase_out_int_neg, _code_phase_out_int)));
overflow_indexes = vcgtq_s32(_code_phase_out_int_neg, _code_length_chips_minus1); //test for overflow values overflow_indexes = vcgtq_s32(_code_phase_out_int_neg, _code_length_chips_minus1); //test for overflow values
_code_phase_out_int_over = vsubq_s32(_code_phase_out_int_neg, _code_length_chips); //the negative values branch _code_phase_out_int_over = vsubq_s32(_code_phase_out_int_neg, _code_length_chips); //the negative values branch
_code_phase_out_int_over = veorq_s32(_code_phase_out_int_neg, vandq_s32( (int32x4_t)overflow_indexes, veorq_s32( _code_phase_out_int_over, _code_phase_out_int_neg ))); _code_phase_out_int_over = veorq_s32(_code_phase_out_int_neg, vandq_s32((int32x4_t)overflow_indexes, veorq_s32(_code_phase_out_int_over, _code_phase_out_int_neg)));
vst1q_s32((int32_t*)local_code_chip_index, _code_phase_out_int_over); // Store the results back vst1q_s32((int32_t*)local_code_chip_index, _code_phase_out_int_over); // Store the results back
//todo: optimize the local code lookup table with intrinsics, if possible //todo: optimize the local code lookup table with intrinsics, if possible
_result[current_vector][sample_idx] = local_code[local_code_chip_index[0]]; _result[current_vector][sample_idx] = local_code[local_code_chip_index[0]];
@ -357,9 +372,9 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_neon(lv_16sc_t**
sample_idx += 4; sample_idx += 4;
} }
for(number = quarterPoints * 4; number < num_output_samples; number++) for (number = quarterPoints * 4; number < num_output_samples; number++)
{ {
for(current_vector = 0; current_vector < num_out_vectors; current_vector++) for (current_vector = 0; current_vector < num_out_vectors; current_vector++)
{ {
local_code_chip_index[0] = (int)(code_phase_step_chips * (float)(number) + rem_code_phase_chips[current_vector]); local_code_chip_index[0] = (int)(code_phase_step_chips * (float)(number) + rem_code_phase_chips[current_vector]);
if (local_code_chip_index[0] < 0.0) local_code_chip_index[0] += code_length_chips - 1; if (local_code_chip_index[0] < 0.0) local_code_chip_index[0] += code_length_chips - 1;
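
Unlike the SSE2 paths, which lean on _mm_cvtps_epi32 under the rounding mode configured at function entry, vcvtq_s32_f32 always truncates toward zero, so the NEON kernel above reconstructs round-half-away-from-zero by hand: it extracts the sign bit (the vshrq_n_u32 by 31), adds 0.5, subtracts 1.0 in negative lanes, and then truncates. The result can differ from x86's round-half-to-even on exact .5 ties, which is tolerable here. A scalar sketch of the same arithmetic:

#include <stdint.h>

/* Sketch: round to nearest (half away from zero) built on truncation,
 * mirroring the sign/PlusHalf/Round sequence in the NEON kernel. */
static inline int32_t round_nearest_trunc(float x)
{
    float sign = (x < 0.0f) ? 1.0f : 0.0f;  /* NEON reads this off bit 31 */
    return (int32_t)(x + 0.5f - sign);      /* truncation completes the round */
}

/* round_nearest_trunc( 2.3f) ==  2    round_nearest_trunc( 2.7f) ==  3
   round_nearest_trunc(-2.3f) == -2    round_nearest_trunc(-2.7f) == -3 */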
@ -29,7 +29,6 @@
*/ */
/*! /*!
* \page volk_gnsssdr_32f_index_max_32u.h * \page volk_gnsssdr_32f_index_max_32u.h
* *
@ -63,7 +62,7 @@
static inline void volk_gnsssdr_32f_index_max_32u_a_avx(uint32_t* target, const float* src0, uint32_t num_points) static inline void volk_gnsssdr_32f_index_max_32u_a_avx(uint32_t* target, const float* src0, uint32_t num_points)
{ {
if(num_points > 0) if (num_points > 0)
{ {
uint32_t number = 0; uint32_t number = 0;
const uint32_t quarterPoints = num_points / 8; const uint32_t quarterPoints = num_points / 8;
@ -71,7 +70,7 @@ static inline void volk_gnsssdr_32f_index_max_32u_a_avx(uint32_t* target, const
float* inputPtr = (float*)src0; float* inputPtr = (float*)src0;
__m256 indexIncrementValues = _mm256_set1_ps(8); __m256 indexIncrementValues = _mm256_set1_ps(8);
__m256 currentIndexes = _mm256_set_ps(-1,-2,-3,-4,-5,-6,-7,-8); __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
float max = src0[0]; float max = src0[0];
float index = 0; float index = 0;
@ -80,25 +79,28 @@ static inline void volk_gnsssdr_32f_index_max_32u_a_avx(uint32_t* target, const
__m256 compareResults; __m256 compareResults;
__m256 currentValues; __m256 currentValues;
__VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8]; __VOLK_ATTR_ALIGNED(32)
__VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8]; float maxValuesBuffer[8];
__VOLK_ATTR_ALIGNED(32)
float maxIndexesBuffer[8];
for(;number < quarterPoints; number++) for (; number < quarterPoints; number++)
{ {
currentValues = _mm256_load_ps(inputPtr); inputPtr += 8; currentValues = _mm256_load_ps(inputPtr);
inputPtr += 8;
currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues); currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
compareResults = _mm256_cmp_ps(maxValues, currentValues, 0x1e); compareResults = _mm256_cmp_ps(maxValues, currentValues, 0x1e);
maxValuesIndex = _mm256_blendv_ps(currentIndexes, maxValuesIndex, compareResults); maxValuesIndex = _mm256_blendv_ps(currentIndexes, maxValuesIndex, compareResults);
maxValues = _mm256_blendv_ps(currentValues, maxValues, compareResults); maxValues = _mm256_blendv_ps(currentValues, maxValues, compareResults);
} }
// Calculate the largest value from the remaining 8 points // Calculate the largest value from the remaining 8 points
_mm256_store_ps(maxValuesBuffer, maxValues); _mm256_store_ps(maxValuesBuffer, maxValues);
_mm256_store_ps(maxIndexesBuffer, maxValuesIndex); _mm256_store_ps(maxIndexesBuffer, maxValuesIndex);
for(number = 0; number < 8; number++) for (number = 0; number < 8; number++)
{ {
if(maxValuesBuffer[number] > max) if (maxValuesBuffer[number] > max)
{ {
index = maxIndexesBuffer[number]; index = maxIndexesBuffer[number];
max = maxValuesBuffer[number]; max = maxValuesBuffer[number];
@ -106,9 +108,9 @@ static inline void volk_gnsssdr_32f_index_max_32u_a_avx(uint32_t* target, const
} }
number = quarterPoints * 8; number = quarterPoints * 8;
for(;number < num_points; number++) for (; number < num_points; number++)
{ {
if(src0[number] > max) if (src0[number] > max)
{ {
index = number; index = number;
max = src0[number]; max = src0[number];
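
Every kernel in this index-max file follows the same shape: one vector of running maxima, one vector of their indexes encoded as floats, a compare-and-blend per iteration, and a final horizontal pass over the lanes plus the scalar tail. The 0x1e immediate passed to _mm256_cmp_ps is _CMP_GT_OQ (ordered greater-than). A compact sketch of the per-iteration update (argmax_step is illustrative):

#include <immintrin.h>

/* Sketch of the AVX argmax update step. The mask is true where the
 * running max already beats the new values, so blendv keeps the old
 * value/index in those lanes and adopts the new ones elsewhere. */
static inline void argmax_step(__m256* maxv, __m256* maxi,
    __m256 vals, __m256 idxs)
{
    __m256 keep = _mm256_cmp_ps(*maxv, vals, _CMP_GT_OQ);  /* == 0x1e */
    *maxi = _mm256_blendv_ps(idxs, *maxi, keep);
    *maxv = _mm256_blendv_ps(vals, *maxv, keep);
}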
@ -126,7 +128,7 @@ static inline void volk_gnsssdr_32f_index_max_32u_a_avx(uint32_t* target, const
static inline void volk_gnsssdr_32f_index_max_32u_u_avx(uint32_t* target, const float* src0, uint32_t num_points) static inline void volk_gnsssdr_32f_index_max_32u_u_avx(uint32_t* target, const float* src0, uint32_t num_points)
{ {
if(num_points > 0) if (num_points > 0)
{ {
uint32_t number = 0; uint32_t number = 0;
const uint32_t quarterPoints = num_points / 8; const uint32_t quarterPoints = num_points / 8;
@ -134,7 +136,7 @@ static inline void volk_gnsssdr_32f_index_max_32u_u_avx(uint32_t* target, const
float* inputPtr = (float*)src0; float* inputPtr = (float*)src0;
__m256 indexIncrementValues = _mm256_set1_ps(8); __m256 indexIncrementValues = _mm256_set1_ps(8);
__m256 currentIndexes = _mm256_set_ps(-1,-2,-3,-4,-5,-6,-7,-8); __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
float max = src0[0]; float max = src0[0];
float index = 0; float index = 0;
@ -143,25 +145,28 @@ static inline void volk_gnsssdr_32f_index_max_32u_u_avx(uint32_t* target, const
__m256 compareResults; __m256 compareResults;
__m256 currentValues; __m256 currentValues;
__VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8]; __VOLK_ATTR_ALIGNED(32)
__VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8]; float maxValuesBuffer[8];
__VOLK_ATTR_ALIGNED(32)
float maxIndexesBuffer[8];
for(;number < quarterPoints; number++) for (; number < quarterPoints; number++)
{ {
currentValues = _mm256_loadu_ps(inputPtr); inputPtr += 8; currentValues = _mm256_loadu_ps(inputPtr);
inputPtr += 8;
currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues); currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
compareResults = _mm256_cmp_ps(maxValues, currentValues, 0x1e); compareResults = _mm256_cmp_ps(maxValues, currentValues, 0x1e);
maxValuesIndex = _mm256_blendv_ps(currentIndexes, maxValuesIndex, compareResults); maxValuesIndex = _mm256_blendv_ps(currentIndexes, maxValuesIndex, compareResults);
maxValues = _mm256_blendv_ps(currentValues, maxValues, compareResults); maxValues = _mm256_blendv_ps(currentValues, maxValues, compareResults);
} }
// Calculate the largest value from the remaining 8 points // Calculate the largest value from the remaining 8 points
_mm256_store_ps(maxValuesBuffer, maxValues); _mm256_store_ps(maxValuesBuffer, maxValues);
_mm256_store_ps(maxIndexesBuffer, maxValuesIndex); _mm256_store_ps(maxIndexesBuffer, maxValuesIndex);
for(number = 0; number < 8; number++) for (number = 0; number < 8; number++)
{ {
if(maxValuesBuffer[number] > max) if (maxValuesBuffer[number] > max)
{ {
index = maxIndexesBuffer[number]; index = maxIndexesBuffer[number];
max = maxValuesBuffer[number]; max = maxValuesBuffer[number];
@ -169,9 +174,9 @@ static inline void volk_gnsssdr_32f_index_max_32u_u_avx(uint32_t* target, const
} }
number = quarterPoints * 8; number = quarterPoints * 8;
for(;number < num_points; number++) for (; number < num_points; number++)
{ {
if(src0[number] > max) if (src0[number] > max)
{ {
index = number; index = number;
max = src0[number]; max = src0[number];
@ -185,11 +190,11 @@ static inline void volk_gnsssdr_32f_index_max_32u_u_avx(uint32_t* target, const
#ifdef LV_HAVE_SSE4_1 #ifdef LV_HAVE_SSE4_1
#include<smmintrin.h> #include <smmintrin.h>
static inline void volk_gnsssdr_32f_index_max_32u_a_sse4_1(uint32_t* target, const float* src0, uint32_t num_points) static inline void volk_gnsssdr_32f_index_max_32u_a_sse4_1(uint32_t* target, const float* src0, uint32_t num_points)
{ {
if(num_points > 0) if (num_points > 0)
{ {
uint32_t number = 0; uint32_t number = 0;
const uint32_t quarterPoints = num_points / 4; const uint32_t quarterPoints = num_points / 4;
@ -197,7 +202,7 @@ static inline void volk_gnsssdr_32f_index_max_32u_a_sse4_1(uint32_t* target, con
float* inputPtr = (float*)src0; float* inputPtr = (float*)src0;
__m128 indexIncrementValues = _mm_set1_ps(4); __m128 indexIncrementValues = _mm_set1_ps(4);
__m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4); __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
float max = src0[0]; float max = src0[0];
float index = 0; float index = 0;
@ -206,25 +211,28 @@ static inline void volk_gnsssdr_32f_index_max_32u_a_sse4_1(uint32_t* target, con
__m128 compareResults; __m128 compareResults;
__m128 currentValues; __m128 currentValues;
__VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; __VOLK_ATTR_ALIGNED(16)
__VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; float maxValuesBuffer[4];
__VOLK_ATTR_ALIGNED(16)
float maxIndexesBuffer[4];
for(;number < quarterPoints; number++) for (; number < quarterPoints; number++)
{ {
currentValues = _mm_load_ps(inputPtr); inputPtr += 4; currentValues = _mm_load_ps(inputPtr);
inputPtr += 4;
currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
compareResults = _mm_cmpgt_ps(maxValues, currentValues); compareResults = _mm_cmpgt_ps(maxValues, currentValues);
maxValuesIndex = _mm_blendv_ps(currentIndexes, maxValuesIndex, compareResults); maxValuesIndex = _mm_blendv_ps(currentIndexes, maxValuesIndex, compareResults);
maxValues = _mm_blendv_ps(currentValues, maxValues, compareResults); maxValues = _mm_blendv_ps(currentValues, maxValues, compareResults);
} }
// Calculate the largest value from the remaining 4 points // Calculate the largest value from the remaining 4 points
_mm_store_ps(maxValuesBuffer, maxValues); _mm_store_ps(maxValuesBuffer, maxValues);
_mm_store_ps(maxIndexesBuffer, maxValuesIndex); _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
for(number = 0; number < 4; number++) for (number = 0; number < 4; number++)
{ {
if(maxValuesBuffer[number] > max) if (maxValuesBuffer[number] > max)
{ {
index = maxIndexesBuffer[number]; index = maxIndexesBuffer[number];
max = maxValuesBuffer[number]; max = maxValuesBuffer[number];
@ -232,9 +240,9 @@ static inline void volk_gnsssdr_32f_index_max_32u_a_sse4_1(uint32_t* target, con
} }
number = quarterPoints * 4; number = quarterPoints * 4;
for(;number < num_points; number++) for (; number < num_points; number++)
{ {
if(src0[number] > max) if (src0[number] > max)
{ {
index = number; index = number;
max = src0[number]; max = src0[number];
@ -248,11 +256,11 @@ static inline void volk_gnsssdr_32f_index_max_32u_a_sse4_1(uint32_t* target, con
#ifdef LV_HAVE_SSE4_1 #ifdef LV_HAVE_SSE4_1
#include<smmintrin.h> #include <smmintrin.h>
static inline void volk_gnsssdr_32f_index_max_32u_u_sse4_1(uint32_t* target, const float* src0, uint32_t num_points) static inline void volk_gnsssdr_32f_index_max_32u_u_sse4_1(uint32_t* target, const float* src0, uint32_t num_points)
{ {
if(num_points > 0) if (num_points > 0)
{ {
uint32_t number = 0; uint32_t number = 0;
const uint32_t quarterPoints = num_points / 4; const uint32_t quarterPoints = num_points / 4;
@ -260,7 +268,7 @@ static inline void volk_gnsssdr_32f_index_max_32u_u_sse4_1(uint32_t* target, con
float* inputPtr = (float*)src0; float* inputPtr = (float*)src0;
__m128 indexIncrementValues = _mm_set1_ps(4); __m128 indexIncrementValues = _mm_set1_ps(4);
__m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4); __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
float max = src0[0]; float max = src0[0];
float index = 0; float index = 0;
@ -269,25 +277,28 @@ static inline void volk_gnsssdr_32f_index_max_32u_u_sse4_1(uint32_t* target, con
__m128 compareResults; __m128 compareResults;
__m128 currentValues; __m128 currentValues;
__VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; __VOLK_ATTR_ALIGNED(16)
__VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; float maxValuesBuffer[4];
__VOLK_ATTR_ALIGNED(16)
float maxIndexesBuffer[4];
for(;number < quarterPoints; number++) for (; number < quarterPoints; number++)
{ {
currentValues = _mm_loadu_ps(inputPtr); inputPtr += 4; currentValues = _mm_loadu_ps(inputPtr);
inputPtr += 4;
currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
compareResults = _mm_cmpgt_ps(maxValues, currentValues); compareResults = _mm_cmpgt_ps(maxValues, currentValues);
maxValuesIndex = _mm_blendv_ps(currentIndexes, maxValuesIndex, compareResults); maxValuesIndex = _mm_blendv_ps(currentIndexes, maxValuesIndex, compareResults);
maxValues = _mm_blendv_ps(currentValues, maxValues, compareResults); maxValues = _mm_blendv_ps(currentValues, maxValues, compareResults);
} }
// Calculate the largest value from the remaining 4 points // Calculate the largest value from the remaining 4 points
_mm_store_ps(maxValuesBuffer, maxValues); _mm_store_ps(maxValuesBuffer, maxValues);
_mm_store_ps(maxIndexesBuffer, maxValuesIndex); _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
for(number = 0; number < 4; number++) for (number = 0; number < 4; number++)
{ {
if(maxValuesBuffer[number] > max) if (maxValuesBuffer[number] > max)
{ {
index = maxIndexesBuffer[number]; index = maxIndexesBuffer[number];
max = maxValuesBuffer[number]; max = maxValuesBuffer[number];
@ -295,9 +306,9 @@ static inline void volk_gnsssdr_32f_index_max_32u_u_sse4_1(uint32_t* target, con
} }
number = quarterPoints * 4; number = quarterPoints * 4;
for(;number < num_points; number++) for (; number < num_points; number++)
{ {
if(src0[number] > max) if (src0[number] > max)
{ {
index = number; index = number;
max = src0[number]; max = src0[number];
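
One detail worth noting in the reduction step: even the unaligned (_u_) variants finish with aligned _mm_store_ps calls, which is safe only because the scratch buffers are stack arrays declared with __VOLK_ATTR_ALIGNED(16); the _u_ suffix refers to the caller's input buffer, not to these temporaries. A sketch of that horizontal pass (hmax_ps is illustrative; plain GCC/Clang alignment syntax stands in for the library macro):

#include <smmintrin.h>

/* Sketch: reduce four (value, index) lanes to the scalar winner.
 * The scratch arrays are stack-aligned, so aligned stores are legal
 * even inside an unaligned-input kernel. */
static inline float hmax_ps(__m128 v, __m128 idx, float* out_index)
{
    float vals[4] __attribute__((aligned(16)));  /* assumption: GCC/Clang syntax */
    float inds[4] __attribute__((aligned(16)));
    _mm_store_ps(vals, v);
    _mm_store_ps(inds, idx);
    float best = vals[0];
    *out_index = inds[0];
    for (int i = 1; i < 4; i++)
        if (vals[i] > best) { best = vals[i]; *out_index = inds[i]; }
    return best;
}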
@ -312,11 +323,11 @@ static inline void volk_gnsssdr_32f_index_max_32u_u_sse4_1(uint32_t* target, con
#ifdef LV_HAVE_SSE #ifdef LV_HAVE_SSE
#include<xmmintrin.h> #include <xmmintrin.h>
static inline void volk_gnsssdr_32f_index_max_32u_a_sse(uint32_t* target, const float* src0, uint32_t num_points) static inline void volk_gnsssdr_32f_index_max_32u_a_sse(uint32_t* target, const float* src0, uint32_t num_points)
{ {
if(num_points > 0) if (num_points > 0)
{ {
uint32_t number = 0; uint32_t number = 0;
const uint32_t quarterPoints = num_points / 4; const uint32_t quarterPoints = num_points / 4;
@ -324,7 +335,7 @@ static inline void volk_gnsssdr_32f_index_max_32u_a_sse(uint32_t* target, const
float* inputPtr = (float*)src0; float* inputPtr = (float*)src0;
__m128 indexIncrementValues = _mm_set1_ps(4); __m128 indexIncrementValues = _mm_set1_ps(4);
__m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4); __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
float max = src0[0]; float max = src0[0];
float index = 0; float index = 0;
@ -333,25 +344,28 @@ static inline void volk_gnsssdr_32f_index_max_32u_a_sse(uint32_t* target, const
__m128 compareResults; __m128 compareResults;
__m128 currentValues; __m128 currentValues;
__VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; __VOLK_ATTR_ALIGNED(16)
__VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; float maxValuesBuffer[4];
__VOLK_ATTR_ALIGNED(16)
float maxIndexesBuffer[4];
for(;number < quarterPoints; number++) for (; number < quarterPoints; number++)
{ {
currentValues = _mm_load_ps(inputPtr); inputPtr += 4; currentValues = _mm_load_ps(inputPtr);
inputPtr += 4;
currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
compareResults = _mm_cmpgt_ps(maxValues, currentValues); compareResults = _mm_cmpgt_ps(maxValues, currentValues);
maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, maxValuesIndex) , _mm_andnot_ps(compareResults, currentIndexes)); maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, maxValuesIndex), _mm_andnot_ps(compareResults, currentIndexes));
maxValues = _mm_or_ps(_mm_and_ps(compareResults, maxValues) , _mm_andnot_ps(compareResults, currentValues)); maxValues = _mm_or_ps(_mm_and_ps(compareResults, maxValues), _mm_andnot_ps(compareResults, currentValues));
} }
// Calculate the largest value from the remaining 4 points // Calculate the largest value from the remaining 4 points
_mm_store_ps(maxValuesBuffer, maxValues); _mm_store_ps(maxValuesBuffer, maxValues);
_mm_store_ps(maxIndexesBuffer, maxValuesIndex); _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
for(number = 0; number < 4; number++) for (number = 0; number < 4; number++)
{ {
if(maxValuesBuffer[number] > max) if (maxValuesBuffer[number] > max)
{ {
index = maxIndexesBuffer[number]; index = maxIndexesBuffer[number];
max = maxValuesBuffer[number]; max = maxValuesBuffer[number];
@ -359,9 +373,9 @@ static inline void volk_gnsssdr_32f_index_max_32u_a_sse(uint32_t* target, const
} }
number = quarterPoints * 4; number = quarterPoints * 4;
for(;number < num_points; number++) for (; number < num_points; number++)
{ {
if(src0[number] > max) if (src0[number] > max)
{ {
index = number; index = number;
max = src0[number]; max = src0[number];
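
Plain SSE has no blendv instruction, so the kernel above (and its unaligned twin below) synthesizes the blend as (mask & old) | (~mask & new) using _mm_and_ps, _mm_andnot_ps and _mm_or_ps; the result is identical to SSE4.1's _mm_blendv_ps whenever the mask lanes are fully set or fully clear, as compare results always are. A sketch of the equivalence (blend_sse is illustrative):

#include <xmmintrin.h>

/* Sketch: select 'kept' where mask lanes are all-ones, 'other' where
 * they are all-zeros -- the three-instruction SSE stand-in for blendv. */
static inline __m128 blend_sse(__m128 mask, __m128 kept, __m128 other)
{
    return _mm_or_ps(_mm_and_ps(mask, kept), _mm_andnot_ps(mask, other));
}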
@ -376,11 +390,11 @@ static inline void volk_gnsssdr_32f_index_max_32u_a_sse(uint32_t* target, const
#ifdef LV_HAVE_SSE #ifdef LV_HAVE_SSE
#include<xmmintrin.h> #include <xmmintrin.h>
static inline void volk_gnsssdr_32f_index_max_32u_u_sse(uint32_t* target, const float* src0, uint32_t num_points) static inline void volk_gnsssdr_32f_index_max_32u_u_sse(uint32_t* target, const float* src0, uint32_t num_points)
{ {
if(num_points > 0) if (num_points > 0)
{ {
uint32_t number = 0; uint32_t number = 0;
const uint32_t quarterPoints = num_points / 4; const uint32_t quarterPoints = num_points / 4;
@ -388,7 +402,7 @@ static inline void volk_gnsssdr_32f_index_max_32u_u_sse(uint32_t* target, const
float* inputPtr = (float*)src0; float* inputPtr = (float*)src0;
__m128 indexIncrementValues = _mm_set1_ps(4); __m128 indexIncrementValues = _mm_set1_ps(4);
__m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4); __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
float max = src0[0]; float max = src0[0];
float index = 0; float index = 0;
@ -397,25 +411,28 @@ static inline void volk_gnsssdr_32f_index_max_32u_u_sse(uint32_t* target, const
__m128 compareResults; __m128 compareResults;
__m128 currentValues; __m128 currentValues;
__VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; __VOLK_ATTR_ALIGNED(16)
__VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; float maxValuesBuffer[4];
__VOLK_ATTR_ALIGNED(16)
float maxIndexesBuffer[4];
for(;number < quarterPoints; number++) for (; number < quarterPoints; number++)
{ {
currentValues = _mm_loadu_ps(inputPtr); inputPtr += 4; currentValues = _mm_loadu_ps(inputPtr);
inputPtr += 4;
currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
compareResults = _mm_cmpgt_ps(maxValues, currentValues); compareResults = _mm_cmpgt_ps(maxValues, currentValues);
maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, maxValuesIndex) , _mm_andnot_ps(compareResults, currentIndexes)); maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, maxValuesIndex), _mm_andnot_ps(compareResults, currentIndexes));
maxValues = _mm_or_ps(_mm_and_ps(compareResults, maxValues) , _mm_andnot_ps(compareResults, currentValues)); maxValues = _mm_or_ps(_mm_and_ps(compareResults, maxValues), _mm_andnot_ps(compareResults, currentValues));
} }
// Calculate the largest value from the remaining 4 points // Calculate the largest value from the remaining 4 points
_mm_store_ps(maxValuesBuffer, maxValues); _mm_store_ps(maxValuesBuffer, maxValues);
_mm_store_ps(maxIndexesBuffer, maxValuesIndex); _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
for(number = 0; number < 4; number++) for (number = 0; number < 4; number++)
{ {
if(maxValuesBuffer[number] > max) if (maxValuesBuffer[number] > max)
{ {
index = maxIndexesBuffer[number]; index = maxIndexesBuffer[number];
max = maxValuesBuffer[number]; max = maxValuesBuffer[number];
@ -423,9 +440,9 @@ static inline void volk_gnsssdr_32f_index_max_32u_u_sse(uint32_t* target, const
} }
number = quarterPoints * 4; number = quarterPoints * 4;
for(;number < num_points; number++) for (; number < num_points; number++)
{ {
if(src0[number] > max) if (src0[number] > max)
{ {
index = number; index = number;
max = src0[number]; max = src0[number];
@ -442,16 +459,16 @@ static inline void volk_gnsssdr_32f_index_max_32u_u_sse(uint32_t* target, const
static inline void volk_gnsssdr_32f_index_max_32u_generic(uint32_t* target, const float* src0, uint32_t num_points) static inline void volk_gnsssdr_32f_index_max_32u_generic(uint32_t* target, const float* src0, uint32_t num_points)
{ {
if(num_points > 0) if (num_points > 0)
{ {
float max = src0[0]; float max = src0[0];
uint32_t index = 0; uint32_t index = 0;
uint32_t i = 1; uint32_t i = 1;
for(; i < num_points; ++i) for (; i < num_points; ++i)
{ {
if(src0[i] > max) if (src0[i] > max)
{ {
index = i; index = i;
max = src0[i]; max = src0[i];
@ -469,14 +486,15 @@ static inline void volk_gnsssdr_32f_index_max_32u_generic(uint32_t* target, cons
static inline void volk_gnsssdr_32f_index_max_32u_neon(uint32_t* target, const float* src0, uint32_t num_points) static inline void volk_gnsssdr_32f_index_max_32u_neon(uint32_t* target, const float* src0, uint32_t num_points)
{ {
if(num_points > 0) if (num_points > 0)
{ {
uint32_t number = 0; uint32_t number = 0;
const uint32_t quarterPoints = num_points / 4; const uint32_t quarterPoints = num_points / 4;
float* inputPtr = (float*)src0; float* inputPtr = (float*)src0;
float32x4_t indexIncrementValues = vdupq_n_f32(4); float32x4_t indexIncrementValues = vdupq_n_f32(4);
__VOLK_ATTR_ALIGNED(16) float currentIndexes_float[4] = { -4.0f, -3.0f, -2.0f, -1.0f }; __VOLK_ATTR_ALIGNED(16)
float currentIndexes_float[4] = {-4.0f, -3.0f, -2.0f, -1.0f};
float32x4_t currentIndexes = vld1q_f32(currentIndexes_float); float32x4_t currentIndexes = vld1q_f32(currentIndexes_float);
float max = src0[0]; float max = src0[0];
@ -487,25 +505,28 @@ static inline void volk_gnsssdr_32f_index_max_32u_neon(uint32_t* target, const f
uint32x4_t currentIndexes_u; uint32x4_t currentIndexes_u;
float32x4_t currentValues; float32x4_t currentValues;
__VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; __VOLK_ATTR_ALIGNED(16)
__VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; float maxValuesBuffer[4];
__VOLK_ATTR_ALIGNED(16)
float maxIndexesBuffer[4];
for(;number < quarterPoints; number++) for (; number < quarterPoints; number++)
{ {
currentValues = vld1q_f32(inputPtr); inputPtr += 4; currentValues = vld1q_f32(inputPtr);
currentIndexes = vaddq_f32(currentIndexes, indexIncrementValues); inputPtr += 4;
currentIndexes = vaddq_f32(currentIndexes, indexIncrementValues);
currentIndexes_u = vcvtq_u32_f32(currentIndexes); currentIndexes_u = vcvtq_u32_f32(currentIndexes);
compareResults = vcgtq_f32( maxValues, currentValues); compareResults = vcgtq_f32(maxValues, currentValues);
maxValuesIndex = vorrq_u32( vandq_u32( compareResults, maxValuesIndex ), vbicq_u32(currentIndexes_u, compareResults) ); maxValuesIndex = vorrq_u32(vandq_u32(compareResults, maxValuesIndex), vbicq_u32(currentIndexes_u, compareResults));
maxValues = vmaxq_f32(currentValues, maxValues); maxValues = vmaxq_f32(currentValues, maxValues);
} }
// Calculate the largest value from the remaining 4 points // Calculate the largest value from the remaining 4 points
vst1q_f32(maxValuesBuffer, maxValues); vst1q_f32(maxValuesBuffer, maxValues);
vst1q_f32(maxIndexesBuffer, vcvtq_f32_u32(maxValuesIndex)); vst1q_f32(maxIndexesBuffer, vcvtq_f32_u32(maxValuesIndex));
for(number = 0; number < 4; number++) for (number = 0; number < 4; number++)
{ {
if(maxValuesBuffer[number] > max) if (maxValuesBuffer[number] > max)
{ {
index = maxIndexesBuffer[number]; index = maxIndexesBuffer[number];
max = maxValuesBuffer[number]; max = maxValuesBuffer[number];
@ -513,9 +534,9 @@ static inline void volk_gnsssdr_32f_index_max_32u_neon(uint32_t* target, const f
} }
number = quarterPoints * 4; number = quarterPoints * 4;
for(;number < num_points; number++) for (; number < num_points; number++)
{ {
if(src0[number] > max) if (src0[number] > max)
{ {
index = number; index = number;
max = src0[number]; max = src0[number];
@ -528,4 +549,3 @@ static inline void volk_gnsssdr_32f_index_max_32u_neon(uint32_t* target, const f
#endif /*LV_HAVE_NEON*/ #endif /*LV_HAVE_NEON*/
#endif /*INCLUDED_volk_gnsssdr_32f_index_max_32u_H*/ #endif /*INCLUDED_volk_gnsssdr_32f_index_max_32u_H*/
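
All of these implementations sit behind a single generated dispatcher, so callers never name an architecture. A hedged usage sketch follows; the dispatcher call matches the usual VOLK naming convention and is assumed here rather than taken from this diff. Since the SIMD paths carry indexes as 32-bit floats, results are only guaranteed exact while num_points stays below 2^24.

#include <volk_gnsssdr/volk_gnsssdr.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    const uint32_t n = 1024;
    float* buf = (float*)volk_gnsssdr_malloc(n * sizeof(float),
        volk_gnsssdr_get_alignment());
    for (uint32_t i = 0; i < n; i++) buf[i] = (float)(i % 97);  /* toy data */

    uint32_t idx = 0;
    volk_gnsssdr_32f_index_max_32u(&idx, buf, n);  /* dispatched kernel */
    printf("max of %u samples at index %u\n", n, idx);

    volk_gnsssdr_free(buf);
    return 0;
}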
@ -42,31 +42,30 @@
#include <string.h> #include <string.h>
#ifdef LV_HAVE_GENERIC #ifdef LV_HAVE_GENERIC
static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_generic(float* result, const float* local_code, unsigned int num_points) static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_generic(float* result, const float* local_code, unsigned int num_points)
{ {
int code_length_chips = 2046; int code_length_chips = 2046;
float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
int num_out_vectors = 3; int num_out_vectors = 3;
float rem_code_phase_chips = -0.234; float rem_code_phase_chips = -0.234;
unsigned int n; unsigned int n;
float shifts_chips[3] = { -0.1, 0.0, 0.1 }; float shifts_chips[3] = {-0.1, 0.0, 0.1};
float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment()); float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment());
for(n = 0; n < num_out_vectors; n++) for (n = 0; n < num_out_vectors; n++)
{ {
result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment()); result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment());
} }
volk_gnsssdr_32f_xn_resampler_32f_xn_generic(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); volk_gnsssdr_32f_xn_resampler_32f_xn_generic(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
memcpy((float*)result, (float*)result_aux[0], sizeof(float) * num_points); memcpy((float*)result, (float*)result_aux[0], sizeof(float) * num_points);
for(n = 0; n < num_out_vectors; n++) for (n = 0; n < num_out_vectors; n++)
{ {
volk_gnsssdr_free(result_aux[n]); volk_gnsssdr_free(result_aux[n]);
} }
volk_gnsssdr_free(result_aux); volk_gnsssdr_free(result_aux);
} }
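
The *_resamplerxnpuppet_* functions are QA shims rather than user-facing kernels: the real resampler produces num_out_vectors outputs, but the test framework compares implementations through a single-output signature, so each puppet allocates scratch vectors, pins the phase parameters to fixed test constants, runs one architecture-specific kernel, and copies vector 0 back. The shape, distilled (puppet and the simplified run_kernel signature are illustrative):

#include <string.h>
#include <volk_gnsssdr/volk_gnsssdr.h>

/* Sketch of the puppet pattern: adapt an N-output kernel to the
 * single-output signature the QA harness expects. */
static void puppet(float* result, const float* local_code,
    unsigned int num_points,
    void (*run_kernel)(float**, const float*, unsigned int))
{
    const int n_out = 3;
    float** aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * n_out,
        volk_gnsssdr_get_alignment());
    for (int i = 0; i < n_out; i++)
        aux[i] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points,
            volk_gnsssdr_get_alignment());
    run_kernel(aux, local_code, num_points);             /* exercise the kernel */
    memcpy(result, aux[0], sizeof(float) * num_points);  /* expose vector 0 */
    for (int i = 0; i < n_out; i++)
        volk_gnsssdr_free(aux[i]);
    volk_gnsssdr_free(aux);
}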
@ -77,26 +76,26 @@ static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_generic(float* result,
static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_a_sse3(float* result, const float* local_code, unsigned int num_points) static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_a_sse3(float* result, const float* local_code, unsigned int num_points)
{ {
int code_length_chips = 2046; int code_length_chips = 2046;
float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
int num_out_vectors = 3; int num_out_vectors = 3;
float rem_code_phase_chips = -0.234; float rem_code_phase_chips = -0.234;
unsigned int n; unsigned int n;
float shifts_chips[3] = { -0.1, 0.0, 0.1 }; float shifts_chips[3] = {-0.1, 0.0, 0.1};
float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment()); float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment());
for(n = 0; n < num_out_vectors; n++) for (n = 0; n < num_out_vectors; n++)
{ {
result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment()); result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment());
} }
volk_gnsssdr_32f_xn_resampler_32f_xn_a_sse3(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); volk_gnsssdr_32f_xn_resampler_32f_xn_a_sse3(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
memcpy((float*)result, (float*)result_aux[0], sizeof(float) * num_points); memcpy((float*)result, (float*)result_aux[0], sizeof(float) * num_points);
for(n = 0; n < num_out_vectors; n++) for (n = 0; n < num_out_vectors; n++)
{ {
volk_gnsssdr_free(result_aux[n]); volk_gnsssdr_free(result_aux[n]);
} }
volk_gnsssdr_free(result_aux); volk_gnsssdr_free(result_aux);
} }
@ -106,26 +105,26 @@ static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_a_sse3(float* result,
static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_u_sse3(float* result, const float* local_code, unsigned int num_points) static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_u_sse3(float* result, const float* local_code, unsigned int num_points)
{ {
int code_length_chips = 2046; int code_length_chips = 2046;
float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
int num_out_vectors = 3; int num_out_vectors = 3;
float rem_code_phase_chips = -0.234; float rem_code_phase_chips = -0.234;
unsigned int n; unsigned int n;
float shifts_chips[3] = { -0.1, 0.0, 0.1 }; float shifts_chips[3] = {-0.1, 0.0, 0.1};
float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment()); float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment());
for(n = 0; n < num_out_vectors; n++) for (n = 0; n < num_out_vectors; n++)
{ {
result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment()); result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment());
} }
volk_gnsssdr_32f_xn_resampler_32f_xn_u_sse3(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); volk_gnsssdr_32f_xn_resampler_32f_xn_u_sse3(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
memcpy((float*)result, (float*)result_aux[0], sizeof(float) * num_points); memcpy((float*)result, (float*)result_aux[0], sizeof(float) * num_points);
for(n = 0; n < num_out_vectors; n++) for (n = 0; n < num_out_vectors; n++)
{ {
volk_gnsssdr_free(result_aux[n]); volk_gnsssdr_free(result_aux[n]);
} }
volk_gnsssdr_free(result_aux); volk_gnsssdr_free(result_aux);
} }
@ -136,26 +135,26 @@ static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_u_sse3(float* result,
static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_u_sse4_1(float* result, const float* local_code, unsigned int num_points) static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_u_sse4_1(float* result, const float* local_code, unsigned int num_points)
{ {
int code_length_chips = 2046; int code_length_chips = 2046;
float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
int num_out_vectors = 3; int num_out_vectors = 3;
float rem_code_phase_chips = -0.234; float rem_code_phase_chips = -0.234;
unsigned int n; unsigned int n;
float shifts_chips[3] = { -0.1, 0.0, 0.1 }; float shifts_chips[3] = {-0.1, 0.0, 0.1};
float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment()); float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment());
for(n = 0; n < num_out_vectors; n++) for (n = 0; n < num_out_vectors; n++)
{ {
result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment()); result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment());
} }
volk_gnsssdr_32f_xn_resampler_32f_xn_u_sse4_1(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); volk_gnsssdr_32f_xn_resampler_32f_xn_u_sse4_1(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
memcpy((float*)result, (float*)result_aux[0], sizeof(float) * num_points); memcpy((float*)result, (float*)result_aux[0], sizeof(float) * num_points);
for(n = 0; n < num_out_vectors; n++) for (n = 0; n < num_out_vectors; n++)
{ {
volk_gnsssdr_free(result_aux[n]); volk_gnsssdr_free(result_aux[n]);
} }
volk_gnsssdr_free(result_aux); volk_gnsssdr_free(result_aux);
} }
@@ -165,26 +164,26 @@ static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_u_sse4_1(float* result
static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_a_sse4_1(float* result, const float* local_code, unsigned int num_points)
{
    int code_length_chips = 2046;
    float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
    int num_out_vectors = 3;
    float rem_code_phase_chips = -0.234;
    unsigned int n;
    float shifts_chips[3] = {-0.1, 0.0, 0.1};
    float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment());
    for (n = 0; n < num_out_vectors; n++)
        {
            result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment());
        }
    volk_gnsssdr_32f_xn_resampler_32f_xn_a_sse4_1(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
    memcpy((float*)result, (float*)result_aux[0], sizeof(float) * num_points);
    for (n = 0; n < num_out_vectors; n++)
        {
            volk_gnsssdr_free(result_aux[n]);
        }
    volk_gnsssdr_free(result_aux);
}
@@ -194,26 +193,26 @@ static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_a_sse4_1(float* result
static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_a_avx(float* result, const float* local_code, unsigned int num_points)
{
    int code_length_chips = 2046;
    float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
    int num_out_vectors = 3;
    float rem_code_phase_chips = -0.234;
    unsigned int n;
    float shifts_chips[3] = {-0.1, 0.0, 0.1};
    float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment());
    for (n = 0; n < num_out_vectors; n++)
        {
            result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment());
        }
    volk_gnsssdr_32f_xn_resampler_32f_xn_a_avx(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
    memcpy((float*)result, (float*)result_aux[0], sizeof(float) * num_points);
    for (n = 0; n < num_out_vectors; n++)
        {
            volk_gnsssdr_free(result_aux[n]);
        }
    volk_gnsssdr_free(result_aux);
}
#endif
@@ -223,26 +222,26 @@ static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_a_avx(float* result, c
static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_u_avx(float* result, const float* local_code, unsigned int num_points)
{
    int code_length_chips = 2046;
    float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
    int num_out_vectors = 3;
    float rem_code_phase_chips = -0.234;
    unsigned int n;
    float shifts_chips[3] = {-0.1, 0.0, 0.1};
    float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment());
    for (n = 0; n < num_out_vectors; n++)
        {
            result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment());
        }
    volk_gnsssdr_32f_xn_resampler_32f_xn_u_avx(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
    memcpy((float*)result, (float*)result_aux[0], sizeof(float) * num_points);
    for (n = 0; n < num_out_vectors; n++)
        {
            volk_gnsssdr_free(result_aux[n]);
        }
    volk_gnsssdr_free(result_aux);
}
#endif
@@ -251,29 +250,28 @@ static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_u_avx(float* result, c
static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_neon(float* result, const float* local_code, unsigned int num_points)
{
    int code_length_chips = 2046;
    float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
    int num_out_vectors = 3;
    float rem_code_phase_chips = -0.234;
    unsigned int n;
    float shifts_chips[3] = {-0.1, 0.0, 0.1};
    float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment());
    for (n = 0; n < num_out_vectors; n++)
        {
            result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment());
        }
    volk_gnsssdr_32f_xn_resampler_32f_xn_neon(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
    memcpy((float*)result, (float*)result_aux[0], sizeof(float) * num_points);
    for (n = 0; n < num_out_vectors; n++)
        {
            volk_gnsssdr_free(result_aux[n]);
        }
    volk_gnsssdr_free(result_aux);
}
#endif
#endif // INCLUDED_volk_gnsssdr_32f_resamplerpuppet_32f_H
View File
@@ -97,7 +97,7 @@ static inline void volk_gnsssdr_32f_sincos_32fc_u_sse4_1(lv_32fc_t* out, const f
    cp4 = _mm_set1_ps(0.49603e-4);
    cp5 = _mm_set1_ps(0.551e-6);
    for (; number < quarterPoints; number++)
        {
            aVal = _mm_loadu_ps(aPtr);
            __VOLK_GNSSSDR_PREFETCH(aPtr + 8);
@@ -108,12 +108,12 @@ static inline void volk_gnsssdr_32f_sincos_32fc_u_sse4_1(lv_32fc_t* out, const f
            s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
            s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
            s = _mm_div_ps(s, _mm_set1_ps(8.0));  // The constant is 2^N, for 3 times argument reduction
            s = _mm_mul_ps(s, s);
            // Evaluate Taylor series
            s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s);
            for (i = 0; i < 3; i++)
                {
                    s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
                }
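            /* The loop above undoes the /8 from the reduction step: the series
               evaluates s = 2 * (1 - cos(theta)), and the double-angle identity
               2 * (1 - cos(2*theta)) = s * (4 - s), applied three times, scales
               the angle back up by 2^3 = 8. */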
@@ -145,7 +145,7 @@ static inline void volk_gnsssdr_32f_sincos_32fc_u_sse4_1(lv_32fc_t* out, const f
        }
    number = quarterPoints * 4;
    for (; number < num_points; number++)
        {
            float _in = *aPtr++;
            *bPtr++ = lv_cmake(cosf(_in), sinf(_in));
@@ -191,7 +191,7 @@ static inline void volk_gnsssdr_32f_sincos_32fc_a_sse4_1(lv_32fc_t* out, const f
    cp4 = _mm_set1_ps(0.49603e-4);
    cp5 = _mm_set1_ps(0.551e-6);
    for (; number < quarterPoints; number++)
        {
            aVal = _mm_load_ps(aPtr);
            __VOLK_GNSSSDR_PREFETCH(aPtr + 8);
@@ -202,12 +202,12 @@ static inline void volk_gnsssdr_32f_sincos_32fc_a_sse4_1(lv_32fc_t* out, const f
            s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
            s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
            s = _mm_div_ps(s, _mm_set1_ps(8.0));  // The constant is 2^N, for 3 times argument reduction
            s = _mm_mul_ps(s, s);
            // Evaluate Taylor series
            s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s);
            for (i = 0; i < 3; i++)
                {
                    s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
                }
@@ -239,7 +239,7 @@ static inline void volk_gnsssdr_32f_sincos_32fc_a_sse4_1(lv_32fc_t* out, const f
        }
    number = quarterPoints * 4;
    for (; number < num_points; number++)
        {
            float _in = *aPtr++;
            *bPtr++ = lv_cmake(cosf(_in), sinf(_in));
@@ -265,31 +265,49 @@ static inline void volk_gnsssdr_32f_sincos_32fc_a_sse2(lv_32fc_t* out, const flo
    __m128 sine, cosine, aux, x;
    __m128 xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
    __m128i emm0, emm2, emm4;
    /* declare some SSE constants */
    __VOLK_ATTR_ALIGNED(16)
    static const int _ps_inv_sign_mask[4] = {~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000};
    __VOLK_ATTR_ALIGNED(16)
    static const int _ps_sign_mask[4] = {(int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000};
    __VOLK_ATTR_ALIGNED(16)
    static const float _ps_cephes_FOPI[4] = {1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516};
    __VOLK_ATTR_ALIGNED(16)
    static const int _pi32_1[4] = {1, 1, 1, 1};
    __VOLK_ATTR_ALIGNED(16)
    static const int _pi32_inv1[4] = {~1, ~1, ~1, ~1};
    __VOLK_ATTR_ALIGNED(16)
    static const int _pi32_2[4] = {2, 2, 2, 2};
    __VOLK_ATTR_ALIGNED(16)
    static const int _pi32_4[4] = {4, 4, 4, 4};
    __VOLK_ATTR_ALIGNED(16)
    static const float _ps_minus_cephes_DP1[4] = {-0.78515625, -0.78515625, -0.78515625, -0.78515625};
    __VOLK_ATTR_ALIGNED(16)
    static const float _ps_minus_cephes_DP2[4] = {-2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4};
    __VOLK_ATTR_ALIGNED(16)
    static const float _ps_minus_cephes_DP3[4] = {-3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8};
    __VOLK_ATTR_ALIGNED(16)
    static const float _ps_coscof_p0[4] = {2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005};
    __VOLK_ATTR_ALIGNED(16)
    static const float _ps_coscof_p1[4] = {-1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003};
    __VOLK_ATTR_ALIGNED(16)
    static const float _ps_coscof_p2[4] = {4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002};
    __VOLK_ATTR_ALIGNED(16)
    static const float _ps_sincof_p0[4] = {-1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4};
    __VOLK_ATTR_ALIGNED(16)
    static const float _ps_sincof_p1[4] = {8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3};
    __VOLK_ATTR_ALIGNED(16)
    static const float _ps_sincof_p2[4] = {-1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1};
    __VOLK_ATTR_ALIGNED(16)
    static const float _ps_0p5[4] = {0.5f, 0.5f, 0.5f, 0.5f};
    __VOLK_ATTR_ALIGNED(16)
    static const float _ps_1[4] = {1.0f, 1.0f, 1.0f, 1.0f};
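    /* These are the classic Cephes sin/cos constants: _ps_cephes_FOPI is 4/pi;
       the three _ps_minus_cephes_DP terms split -pi/4 across three floats for
       extended-precision (Cody-Waite style) argument reduction; coscof/sincof
       are the minimax polynomial coefficients for the two octant polynomials. */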
    for (; number < sse_iters; number++)
        {
            x = _mm_load_ps(aPtr);
            __VOLK_GNSSSDR_PREFETCH(aPtr + 8);
@@ -307,19 +325,19 @@ static inline void volk_gnsssdr_32f_sincos_32fc_a_sse2(lv_32fc_t* out, const flo
            emm2 = _mm_cvttps_epi32(y);
            /* j=(j+1) & (~1) (see the cephes sources) */
            emm2 = _mm_add_epi32(emm2, *(__m128i*)_pi32_1);
            emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_inv1);
            y = _mm_cvtepi32_ps(emm2);
            emm4 = emm2;
            /* get the swap sign flag for the sine */
            emm0 = _mm_and_si128(emm2, *(__m128i*)_pi32_4);
            emm0 = _mm_slli_epi32(emm0, 29);
            __m128 swap_sign_bit_sin = _mm_castsi128_ps(emm0);
            /* get the polynom selection mask for the sine*/
            emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_2);
            emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
            __m128 poly_mask = _mm_castsi128_ps(emm2);
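            /* Cephes-style octant logic: j = (j+1) & ~1 rounds the octant index
               to an even value; the index bit masked by _pi32_4, shifted left by
               29 into the float sign-bit position, flips the sign of the sine,
               while the bit masked by _pi32_2 selects which of the two
               polynomials applies to each lane. */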
@@ -335,15 +353,15 @@ static inline void volk_gnsssdr_32f_sincos_32fc_a_sse2(lv_32fc_t* out, const flo
            x = _mm_add_ps(x, xmm2);
            x = _mm_add_ps(x, xmm3);
            emm4 = _mm_sub_epi32(emm4, *(__m128i*)_pi32_2);
            emm4 = _mm_andnot_si128(emm4, *(__m128i*)_pi32_4);
            emm4 = _mm_slli_epi32(emm4, 29);
            __m128 sign_bit_cos = _mm_castsi128_ps(emm4);
            sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);
            /* Evaluate the first polynom (0 <= x <= Pi/4) */
            __m128 z = _mm_mul_ps(x, x);
            y = *(__m128*)_ps_coscof_p0;
            y = _mm_mul_ps(y, z);
@@ -371,11 +389,11 @@ static inline void volk_gnsssdr_32f_sincos_32fc_a_sse2(lv_32fc_t* out, const flo
            xmm3 = poly_mask;
            __m128 ysin2 = _mm_and_ps(xmm3, y2);
            __m128 ysin1 = _mm_andnot_ps(xmm3, y);
            y2 = _mm_sub_ps(y2, ysin2);
            y = _mm_sub_ps(y, ysin1);
            xmm1 = _mm_add_ps(ysin1, ysin2);
            xmm2 = _mm_add_ps(y, y2);
            /* update the sign */
            sine = _mm_xor_ps(xmm1, sign_bit_sin);
@@ -392,12 +410,11 @@ static inline void volk_gnsssdr_32f_sincos_32fc_a_sse2(lv_32fc_t* out, const flo
            aPtr += 4;
        }
    for (number = sse_iters * 4; number < num_points; number++)
        {
            _in = *aPtr++;
            *bPtr++ = lv_cmake((float)cosf(_in), (float)sinf(_in));
        }
}
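/* The unaligned variant below is line-for-line the same algorithm; it differs
   only in using _mm_loadu_ps, so the input buffer need not be 16-byte aligned. */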
#endif /* LV_HAVE_SSE2 */
@@ -418,31 +435,49 @@ static inline void volk_gnsssdr_32f_sincos_32fc_u_sse2(lv_32fc_t* out, const flo
    __m128 sine, cosine, aux, x;
    __m128 xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
    __m128i emm0, emm2, emm4;
    /* declare some SSE constants */
    __VOLK_ATTR_ALIGNED(16)
    static const int _ps_inv_sign_mask[4] = {~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000};
    __VOLK_ATTR_ALIGNED(16)
    static const int _ps_sign_mask[4] = {(int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000};
    __VOLK_ATTR_ALIGNED(16)
    static const float _ps_cephes_FOPI[4] = {1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516};
    __VOLK_ATTR_ALIGNED(16)
    static const int _pi32_1[4] = {1, 1, 1, 1};
    __VOLK_ATTR_ALIGNED(16)
    static const int _pi32_inv1[4] = {~1, ~1, ~1, ~1};
    __VOLK_ATTR_ALIGNED(16)
    static const int _pi32_2[4] = {2, 2, 2, 2};
    __VOLK_ATTR_ALIGNED(16)
    static const int _pi32_4[4] = {4, 4, 4, 4};
    __VOLK_ATTR_ALIGNED(16)
    static const float _ps_minus_cephes_DP1[4] = {-0.78515625, -0.78515625, -0.78515625, -0.78515625};
    __VOLK_ATTR_ALIGNED(16)
    static const float _ps_minus_cephes_DP2[4] = {-2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4};
    __VOLK_ATTR_ALIGNED(16)
    static const float _ps_minus_cephes_DP3[4] = {-3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8};
    __VOLK_ATTR_ALIGNED(16)
    static const float _ps_coscof_p0[4] = {2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005};
    __VOLK_ATTR_ALIGNED(16)
    static const float _ps_coscof_p1[4] = {-1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003};
    __VOLK_ATTR_ALIGNED(16)
    static const float _ps_coscof_p2[4] = {4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002};
    __VOLK_ATTR_ALIGNED(16)
    static const float _ps_sincof_p0[4] = {-1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4};
    __VOLK_ATTR_ALIGNED(16)
    static const float _ps_sincof_p1[4] = {8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3};
    __VOLK_ATTR_ALIGNED(16)
    static const float _ps_sincof_p2[4] = {-1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1};
    __VOLK_ATTR_ALIGNED(16)
    static const float _ps_0p5[4] = {0.5f, 0.5f, 0.5f, 0.5f};
    __VOLK_ATTR_ALIGNED(16)
    static const float _ps_1[4] = {1.0f, 1.0f, 1.0f, 1.0f};
    for (; number < sse_iters; number++)
        {
            x = _mm_loadu_ps(aPtr);
            __VOLK_GNSSSDR_PREFETCH(aPtr + 8);
@@ -460,19 +495,19 @@ static inline void volk_gnsssdr_32f_sincos_32fc_u_sse2(lv_32fc_t* out, const flo
            emm2 = _mm_cvttps_epi32(y);
            /* j=(j+1) & (~1) (see the cephes sources) */
            emm2 = _mm_add_epi32(emm2, *(__m128i*)_pi32_1);
            emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_inv1);
            y = _mm_cvtepi32_ps(emm2);
            emm4 = emm2;
            /* get the swap sign flag for the sine */
            emm0 = _mm_and_si128(emm2, *(__m128i*)_pi32_4);
            emm0 = _mm_slli_epi32(emm0, 29);
            __m128 swap_sign_bit_sin = _mm_castsi128_ps(emm0);
            /* get the polynom selection mask for the sine*/
            emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_2);
            emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
            __m128 poly_mask = _mm_castsi128_ps(emm2);
@@ -488,15 +523,15 @@ static inline void volk_gnsssdr_32f_sincos_32fc_u_sse2(lv_32fc_t* out, const flo
            x = _mm_add_ps(x, xmm2);
            x = _mm_add_ps(x, xmm3);
            emm4 = _mm_sub_epi32(emm4, *(__m128i*)_pi32_2);
            emm4 = _mm_andnot_si128(emm4, *(__m128i*)_pi32_4);
            emm4 = _mm_slli_epi32(emm4, 29);
            __m128 sign_bit_cos = _mm_castsi128_ps(emm4);
            sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);
            /* Evaluate the first polynom (0 <= x <= Pi/4) */
            __m128 z = _mm_mul_ps(x, x);
            y = *(__m128*)_ps_coscof_p0;
            y = _mm_mul_ps(y, z);
@@ -524,11 +559,11 @@ static inline void volk_gnsssdr_32f_sincos_32fc_u_sse2(lv_32fc_t* out, const flo
            xmm3 = poly_mask;
            __m128 ysin2 = _mm_and_ps(xmm3, y2);
            __m128 ysin1 = _mm_andnot_ps(xmm3, y);
            y2 = _mm_sub_ps(y2, ysin2);
            y = _mm_sub_ps(y, ysin1);
            xmm1 = _mm_add_ps(ysin1, ysin2);
            xmm2 = _mm_add_ps(y, y2);
            /* update the sign */
            sine = _mm_xor_ps(xmm1, sign_bit_sin);
@@ -545,12 +580,11 @@ static inline void volk_gnsssdr_32f_sincos_32fc_u_sse2(lv_32fc_t* out, const flo
            aPtr += 4;
        }
    for (number = sse_iters * 4; number < num_points; number++)
        {
            _in = *aPtr++;
            *bPtr++ = lv_cmake((float)cosf(_in), (float)sinf(_in));
        }
}
#endif /* LV_HAVE_SSE2 */
@@ -561,10 +595,10 @@ static inline void volk_gnsssdr_32f_sincos_32fc_generic(lv_32fc_t* out, const fl
{
    float _in;
    unsigned int i;
    for (i = 0; i < num_points; i++)
        {
            _in = *in++;
            *out++ = lv_cmake((float)cosf(_in), (float)sinf(_in));
        }
}
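/* A minimal usage sketch for this kernel, assuming the auto-generated
   VOLK_GNSSSDR dispatcher volk_gnsssdr_32f_sincos_32fc with the same argument
   order as the protokernels above:

       unsigned int n = 1000;
       float* phase = (float*)volk_gnsssdr_malloc(n * sizeof(float), volk_gnsssdr_get_alignment());
       lv_32fc_t* iq = (lv_32fc_t*)volk_gnsssdr_malloc(n * sizeof(lv_32fc_t), volk_gnsssdr_get_alignment());
       // ... fill phase[] with angles in radians ...
       volk_gnsssdr_32f_sincos_32fc(iq, phase, n);  // iq[k] = cos(phase[k]) + j sin(phase[k])
       volk_gnsssdr_free(iq);
       volk_gnsssdr_free(phase);
*/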
@@ -586,12 +620,12 @@ static inline void volk_gnsssdr_32f_sincos_32fc_generic_fxpt(lv_32fc_t* out, con
    const int32_t diffbits = bitlength - Nbits;
    uint32_t ux;
    unsigned int i;
    for (i = 0; i < num_points; i++)
        {
            _in = *in++;
            d = (int32_t)floor(_in / TWO_PI + 0.5);
            _in -= d * TWO_PI;
            x = (int32_t)((float)_in * TWO_TO_THE_31_DIV_PI);
            ux = x;
            sin_index = ux >> diffbits;
@@ -601,7 +635,7 @@ static inline void volk_gnsssdr_32f_sincos_32fc_generic_fxpt(lv_32fc_t* out, con
            cos_index = ux >> diffbits;
            c = sine_table_10bits[cos_index][0] * (ux >> 1) + sine_table_10bits[cos_index][1];
            *out++ = lv_cmake((float)c, (float)s);
        }
}
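/* The fixed-point variant above trades libm calls for a table lookup: the
   angle is wrapped to one turn, mapped to 32-bit fixed point where pi
   corresponds to 2^31 (hence TWO_TO_THE_31_DIV_PI), and the top bits index
   sine_table_10bits, whose [slope, intercept] pairs evaluate a piecewise
   linear approximation of the sine; the cosine reuses the same table after
   the index is advanced a quarter turn (in code elided from this hunk). */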
@@ -637,7 +671,7 @@ static inline void volk_gnsssdr_32f_sincos_32fc_neon(lv_32fc_t* out, const float
    uint32x4_t emm2, poly_mask, sign_mask_sin, sign_mask_cos;
    for (; number < neon_iters; number++)
        {
            x = vld1q_f32(aPtr);
            __VOLK_GNSSSDR_PREFETCH(aPtr + 8);
@@ -677,7 +711,7 @@ static inline void volk_gnsssdr_32f_sincos_32fc_neon(lv_32fc_t* out, const float
            /* Evaluate the first polynom (0 <= x <= Pi/4) in y1,
               and the second polynom (Pi/4 <= x <= 0) in y2 */
            z = vmulq_f32(x, x);
            y1 = vmulq_n_f32(z, c_coscof_p0);
            y2 = vmulq_n_f32(z, c_sincof_p0);
@@ -706,10 +740,10 @@ static inline void volk_gnsssdr_32f_sincos_32fc_neon(lv_32fc_t* out, const float
            aPtr += 4;
        }
    for (number = neon_iters * 4; number < num_points; number++)
        {
            _in = *aPtr++;
            *bPtr++ = lv_cmake((float)cosf(_in), (float)sinf(_in));
        }
}
View File
@@ -110,7 +110,8 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_a_sse3(float** result, c
    const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
    const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
    __VOLK_ATTR_ALIGNED(16)
    int local_code_chip_index[4];
    int local_code_chip_index_;
    const __m128i zeros = _mm_setzero_si128();
@@ -124,7 +125,7 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_a_sse3(float** result, c
            shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]);
            aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
            __m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
            for (n = 0; n < quarterPoints; n++)
                {
                    aux = _mm_mul_ps(code_phase_step_chips_reg, indexn);
                    aux = _mm_add_ps(aux, aux2);
@@ -145,18 +146,18 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_a_sse3(float** result, c
                    aux_i = _mm_and_si128(code_length_chips_reg_i, negatives);
                    local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i);
                    _mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg);
                    for (k = 0; k < 4; ++k)
                        {
                            _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
                        }
                    indexn = _mm_add_ps(indexn, fours);
                }
            for (n = quarterPoints * 4; n < num_points; n++)
                {
                    // resample code for current tap
                    local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
                    //Take into account that in multitap correlators, the shifts can be negative!
                    if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
                    local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
                    _result[current_correlator_tap][n] = local_code[local_code_chip_index_];
                }
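            /* Note on the scalar tail above: in C the % operator takes the sign
               of the dividend, so a negative local_code_chip_index_ (possible
               with negative multitap shifts) is first pushed non-negative by
               adding code_length_chips * (abs(index) / code_length_chips + 1);
               only then does the final modulo yield a valid array index. */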
@@ -180,7 +181,8 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_u_sse3(float** result, c
    const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
    const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
    __VOLK_ATTR_ALIGNED(16)
    int local_code_chip_index[4];
    int local_code_chip_index_;
    const __m128i zeros = _mm_setzero_si128();
@@ -194,7 +196,7 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_u_sse3(float** result, c
            shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]);
            aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
            __m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
            for (n = 0; n < quarterPoints; n++)
                {
                    aux = _mm_mul_ps(code_phase_step_chips_reg, indexn);
                    aux = _mm_add_ps(aux, aux2);
@@ -215,18 +217,18 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_u_sse3(float** result, c
                    aux_i = _mm_and_si128(code_length_chips_reg_i, negatives);
                    local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i);
                    _mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg);
                    for (k = 0; k < 4; ++k)
                        {
                            _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
                        }
                    indexn = _mm_add_ps(indexn, fours);
                }
            for (n = quarterPoints * 4; n < num_points; n++)
                {
                    // resample code for current tap
                    local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
                    //Take into account that in multitap correlators, the shifts can be negative!
                    if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
                    local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
                    _result[current_correlator_tap][n] = local_code[local_code_chip_index_];
                }
@@ -248,7 +250,8 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_a_sse4_1(float** result,
    const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
    const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
    __VOLK_ATTR_ALIGNED(16)
    int local_code_chip_index[4];
    int local_code_chip_index_;
    const __m128i zeros = _mm_setzero_si128();
@@ -262,7 +265,7 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_a_sse4_1(float** result,
            shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]);
            aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
            __m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
            for (n = 0; n < quarterPoints; n++)
                {
                    aux = _mm_mul_ps(code_phase_step_chips_reg, indexn);
                    aux = _mm_add_ps(aux, aux2);
@@ -280,18 +283,18 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_a_sse4_1(float** result,
                    aux_i = _mm_and_si128(code_length_chips_reg_i, negatives);
                    local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i);
                    _mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg);
                    for (k = 0; k < 4; ++k)
                        {
                            _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
                        }
                    indexn = _mm_add_ps(indexn, fours);
                }
            for (n = quarterPoints * 4; n < num_points; n++)
                {
                    // resample code for current tap
                    local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
                    //Take into account that in multitap correlators, the shifts can be negative!
                    if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
                    local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
                    _result[current_correlator_tap][n] = local_code[local_code_chip_index_];
                }
@@ -314,7 +317,8 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_u_sse4_1(float** result,
    const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
    const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
    __VOLK_ATTR_ALIGNED(16)
    int local_code_chip_index[4];
    int local_code_chip_index_;
    const __m128i zeros = _mm_setzero_si128();
@@ -328,7 +332,7 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_u_sse4_1(float** result,
            shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]);
            aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
            __m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
            for (n = 0; n < quarterPoints; n++)
                {
                    aux = _mm_mul_ps(code_phase_step_chips_reg, indexn);
                    aux = _mm_add_ps(aux, aux2);
@@ -346,18 +350,18 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_u_sse4_1(float** result,
                    aux_i = _mm_and_si128(code_length_chips_reg_i, negatives);
                    local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i);
                    _mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg);
                    for (k = 0; k < 4; ++k)
                        {
                            _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
                        }
                    indexn = _mm_add_ps(indexn, fours);
                }
            for (n = quarterPoints * 4; n < num_points; n++)
                {
                    // resample code for current tap
                    local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
                    //Take into account that in multitap correlators, the shifts can be negative!
                    if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
                    local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
                    _result[current_correlator_tap][n] = local_code[local_code_chip_index_];
                }
@@ -380,7 +384,8 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_a_avx(float** result, co
    const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips);
    const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips);
    __VOLK_ATTR_ALIGNED(32)
    int local_code_chip_index[8];
    int local_code_chip_index_;
    const __m256 zeros = _mm256_setzero_ps();
@@ -395,7 +400,7 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_a_avx(float** result, co
            shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]);
            aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
            indexn = n0;
            for (n = 0; n < avx_iters; n++)
                {
                    __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0);
                    __VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3);
@@ -413,13 +418,13 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_a_avx(float** result, co
                    // no negatives
                    c = _mm256_cvtepi32_ps(local_code_chip_index_reg);
                    negatives = _mm256_cmp_ps(c, zeros, 0x01);
                    aux3 = _mm256_and_ps(code_length_chips_reg_f, negatives);
                    aux = _mm256_add_ps(c, aux3);
                    local_code_chip_index_reg = _mm256_cvttps_epi32(aux);
                    _mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg);
                    for (k = 0; k < 8; ++k)
                        {
                            _result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]];
                        }
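                    /* Wrap-around in the float domain: predicate 0x01 in
                       _mm256_cmp_ps is _CMP_LT_OS, so `negatives` flags lanes
                       whose index fell below zero; ANDing it with
                       code_length_chips_reg_f and adding the result shifts only
                       those lanes back into [0, code_length_chips) before the
                       truncating convert. */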
@@ -429,12 +434,12 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_a_avx(float** result, co
    _mm256_zeroupper();
    for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
        {
            for (n = avx_iters * 8; n < num_points; n++)
                {
                    // resample code for current tap
                    local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
                    //Take into account that in multitap correlators, the shifts can be negative!
                    if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
                    local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
                    _result[current_correlator_tap][n] = local_code[local_code_chip_index_];
                }
@@ -457,7 +462,8 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_u_avx(float** result, co
    const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips);
    const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips);
    __VOLK_ATTR_ALIGNED(32)
    int local_code_chip_index[8];
    int local_code_chip_index_;
    const __m256 zeros = _mm256_setzero_ps();
@@ -472,7 +478,7 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_u_avx(float** result, co
            shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]);
            aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
            indexn = n0;
            for (n = 0; n < avx_iters; n++)
                {
                    __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0);
                    __VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3);
@@ -490,13 +496,13 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_u_avx(float** result, co
                    // no negatives
                    c = _mm256_cvtepi32_ps(local_code_chip_index_reg);
                    negatives = _mm256_cmp_ps(c, zeros, 0x01);
                    aux3 = _mm256_and_ps(code_length_chips_reg_f, negatives);
                    aux = _mm256_add_ps(c, aux3);
                    local_code_chip_index_reg = _mm256_cvttps_epi32(aux);
                    _mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg);
                    for (k = 0; k < 8; ++k)
                        {
                            _result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]];
                        }
@@ -506,12 +512,12 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_u_avx(float** result, co
    _mm256_zeroupper();
    for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
        {
            for (n = avx_iters * 8; n < num_points; n++)
                {
                    // resample code for current tap
                    local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
                    //Take into account that in multitap correlators, the shifts can be negative!
                    if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
                    local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
                    _result[current_correlator_tap][n] = local_code[local_code_chip_index_];
                }
@@ -536,19 +542,21 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_neon(float** result, con
    const float32x4_t rem_code_phase_chips_reg = vdupq_n_f32(rem_code_phase_chips);
    const float32x4_t code_phase_step_chips_reg = vdupq_n_f32(code_phase_step_chips);
    __VOLK_ATTR_ALIGNED(16)
    int32_t local_code_chip_index[4];
    int32_t local_code_chip_index_;
    const int32x4_t zeros = vdupq_n_s32(0);
    const float32x4_t code_length_chips_reg_f = vdupq_n_f32((float)code_length_chips);
    const int32x4_t code_length_chips_reg_i = vdupq_n_s32((int32_t)code_length_chips);
    int32x4_t local_code_chip_index_reg, aux_i, negatives, i;
    float32x4_t aux, aux2, shifts_chips_reg, fi, c, j, cTrunc, base, indexn, reciprocal;
    __VOLK_ATTR_ALIGNED(16)
    const float vec[4] = {0.0f, 1.0f, 2.0f, 3.0f};
    uint32x4_t igx;
    reciprocal = vrecpeq_f32(code_length_chips_reg_f);
    reciprocal = vmulq_f32(vrecpsq_f32(code_length_chips_reg_f, reciprocal), reciprocal);
    reciprocal = vmulq_f32(vrecpsq_f32(code_length_chips_reg_f, reciprocal), reciprocal);  // this refinement is required!
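    /* vrecpeq_f32 returns only a ~8-bit reciprocal estimate; each vrecpsq_f32
       Newton-Raphson step roughly doubles the number of good bits, and without
       the second refinement the truncated quotient used for the fmod below can
       land one chip off. */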
    float32x4_t n0 = vld1q_f32((float*)vec);
    for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
@@ -556,7 +564,7 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_neon(float** result, con
            shifts_chips_reg = vdupq_n_f32((float)shifts_chips[current_correlator_tap]);
            aux2 = vsubq_f32(shifts_chips_reg, rem_code_phase_chips_reg);
            indexn = n0;
            for (n = 0; n < neon_iters; n++)
                {
                    __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][4 * n + 3], 1, 0);
                    __VOLK_GNSSSDR_PREFETCH(&local_code_chip_index[4]);
@@ -572,7 +580,7 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_neon(float** result, con
                    // fmod
                    c = vmulq_f32(aux, reciprocal);
                    i = vcvtq_s32_f32(c);
                    cTrunc = vcvtq_f32_s32(i);
                    base = vmulq_f32(cTrunc, code_length_chips_reg_f);
                    aux = vsubq_f32(aux, base);
@@ -584,13 +592,13 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_neon(float** result, con
                    vst1q_s32((int32_t*)local_code_chip_index, local_code_chip_index_reg);
                    for (k = 0; k < 4; ++k)
                        {
                            _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
                        }
                    indexn = vaddq_f32(indexn, fours);
                }
            for (n = neon_iters * 4; n < num_points; n++)
                {
                    __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][n], 1, 0);
                    // resample code for current tap
@@ -606,5 +614,3 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_neon(float** result, con
#endif
#endif /*INCLUDED_volk_gnsssdr_32f_xn_resampler_32f_xn_H*/
View File
@@ -85,11 +85,11 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic(lv_32f
    unsigned int n;
    for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
        {
            result[n_vec] = lv_cmake(0, 0);
        }
    for (n = 0; n < num_points; n++)
        {
            tmp32_1 = *in_common++ * (*phase);  //if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase));
            // Regenerate phase
            if (n % 256 == 0)
@ -126,7 +126,7 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic_reload
unsigned int j;
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
result[n_vec] = lv_cmake(0, 0);
}
for (n = 0; n < num_points / ROTATOR_RELOAD; n++)
@ -141,7 +141,7 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic_reload
result[n_vec] += tmp32_2;
}
}
/* Regenerate phase */
#ifdef __cplusplus
(*phase) /= std::abs((*phase));
#else
@ -175,8 +175,8 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_
const unsigned int sixteenthPoints = num_points / 16;
const float* aPtr = (float*)in_common;
const float* bPtr[num_a_vectors];
for (vec_ind = 0; vec_ind < num_a_vectors; ++vec_ind)
{
bPtr[vec_ind] = in_a[vec_ind];
}
@ -194,7 +194,7 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_
__m256 dotProdVal2[num_a_vectors];
__m256 dotProdVal3[num_a_vectors];
for (vec_ind = 0; vec_ind < num_a_vectors; vec_ind++)
{
dotProdVal0[vec_ind] = _mm256_setzero_ps();
dotProdVal1[vec_ind] = _mm256_setzero_ps();
@ -204,57 +204,62 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_
// Set up the complex rotator
__m256 z0, z1, z2, z3;
__VOLK_ATTR_ALIGNED(32)
lv_32fc_t phase_vec[16];
for (vec_ind = 0; vec_ind < 16; ++vec_ind)
{
phase_vec[vec_ind] = _phase;
_phase *= phase_inc;
}
z0 = _mm256_load_ps((float*)phase_vec);
z1 = _mm256_load_ps((float*)(phase_vec + 4));
z2 = _mm256_load_ps((float*)(phase_vec + 8));
z3 = _mm256_load_ps((float*)(phase_vec + 12));
lv_32fc_t dz = phase_inc;
dz *= dz;
dz *= dz;
dz *= dz;
dz *= dz;  // dz = phase_inc^16;
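// Four squarings advance dz from phase_inc to phase_inc^16 with only four
// complex multiplies; dz is then broadcast so all four z registers can be
// stepped by a whole 16-sample block per loop iteration.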
for (vec_ind = 0; vec_ind < 4; ++vec_ind)
{
phase_vec[vec_ind] = dz;
}
__m256 dz_reg = _mm256_load_ps((float*)phase_vec);
dz_reg = _mm256_complexnormalise_ps(dz_reg);
for (; number < sixteenthPoints; number++)
{
a0Val = _mm256_loadu_ps(aPtr);
a1Val = _mm256_loadu_ps(aPtr + 8);
a2Val = _mm256_loadu_ps(aPtr + 16);
a3Val = _mm256_loadu_ps(aPtr + 24);
a0Val = _mm256_complexmul_ps(a0Val, z0);
a1Val = _mm256_complexmul_ps(a1Val, z1);
a2Val = _mm256_complexmul_ps(a2Val, z2);
a3Val = _mm256_complexmul_ps(a3Val, z3);
z0 = _mm256_complexmul_ps(z0, dz_reg);
z1 = _mm256_complexmul_ps(z1, dz_reg);
z2 = _mm256_complexmul_ps(z2, dz_reg);
z3 = _mm256_complexmul_ps(z3, dz_reg);
for (vec_ind = 0; vec_ind < num_a_vectors; ++vec_ind)
{
x0Val[vec_ind] = _mm256_loadu_ps(bPtr[vec_ind]);  // t0|t1|t2|t3|t4|t5|t6|t7
x1Val[vec_ind] = _mm256_loadu_ps(bPtr[vec_ind] + 8);
x0loVal[vec_ind] = _mm256_unpacklo_ps(x0Val[vec_ind], x0Val[vec_ind]);  // t0|t0|t1|t1|t4|t4|t5|t5
x0hiVal[vec_ind] = _mm256_unpackhi_ps(x0Val[vec_ind], x0Val[vec_ind]);  // t2|t2|t3|t3|t6|t6|t7|t7
x1loVal[vec_ind] = _mm256_unpacklo_ps(x1Val[vec_ind], x1Val[vec_ind]);
x1hiVal[vec_ind] = _mm256_unpackhi_ps(x1Val[vec_ind], x1Val[vec_ind]);
// TODO: it may be possible to rearrange swizzling to better pipeline data
b0Val[vec_ind] = _mm256_permute2f128_ps(x0loVal[vec_ind], x0hiVal[vec_ind], 0x20);  // t0|t0|t1|t1|t2|t2|t3|t3
b1Val[vec_ind] = _mm256_permute2f128_ps(x0loVal[vec_ind], x0hiVal[vec_ind], 0x31);  // t4|t4|t5|t5|t6|t6|t7|t7
b2Val[vec_ind] = _mm256_permute2f128_ps(x1loVal[vec_ind], x1hiVal[vec_ind], 0x20);
b3Val[vec_ind] = _mm256_permute2f128_ps(x1loVal[vec_ind], x1hiVal[vec_ind], 0x31);
@ -274,43 +279,44 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_
// Force the rotators back onto the unit circle
if ((number % 64) == 0)
{
z0 = _mm256_complexnormalise_ps(z0);
z1 = _mm256_complexnormalise_ps(z1);
z2 = _mm256_complexnormalise_ps(z2);
z3 = _mm256_complexnormalise_ps(z3);
}
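// Repeated complex multiplies let rounding error accumulate in |z|, so
// every 64 loop passes (1024 samples) the rotator registers are rescaled
// back to unit magnitude before the error can bias the dot products.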
aPtr += 32;
}
__VOLK_ATTR_ALIGNED(32)
lv_32fc_t dotProductVector[4];
for (vec_ind = 0; vec_ind < num_a_vectors; ++vec_ind)
{
dotProdVal0[vec_ind] = _mm256_add_ps(dotProdVal0[vec_ind], dotProdVal1[vec_ind]);
dotProdVal0[vec_ind] = _mm256_add_ps(dotProdVal0[vec_ind], dotProdVal2[vec_ind]);
dotProdVal0[vec_ind] = _mm256_add_ps(dotProdVal0[vec_ind], dotProdVal3[vec_ind]);
_mm256_store_ps((float*)dotProductVector, dotProdVal0[vec_ind]);  // Store the results back into the dot product vector
result[vec_ind] = lv_cmake(0, 0);
for (i = 0; i < 4; ++i)
{
result[vec_ind] += dotProductVector[i];
}
}
z0 = _mm256_complexnormalise_ps(z0);
_mm256_store_ps((float*)phase_vec, z0);
_phase = phase_vec[0];
_mm256_zeroupper();
number = sixteenthPoints * 16;
for (; number < num_points; number++)
{
wo = (*aPtr++) * _phase;
_phase *= phase_inc;
for (vec_ind = 0; vec_ind < num_a_vectors; ++vec_ind)
{
result[vec_ind] += wo * in_a[vec_ind][number];
}
@ -333,8 +339,8 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_
const unsigned int sixteenthPoints = num_points / 16;
const float* aPtr = (float*)in_common;
const float* bPtr[num_a_vectors];
for (vec_ind = 0; vec_ind < num_a_vectors; ++vec_ind)
{
bPtr[vec_ind] = in_a[vec_ind];
}
@ -352,7 +358,7 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_
__m256 dotProdVal2[num_a_vectors];
__m256 dotProdVal3[num_a_vectors];
for (vec_ind = 0; vec_ind < num_a_vectors; vec_ind++)
{
dotProdVal0[vec_ind] = _mm256_setzero_ps();
dotProdVal1[vec_ind] = _mm256_setzero_ps();
@ -362,58 +368,62 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_
// Set up the complex rotator
__m256 z0, z1, z2, z3;
__VOLK_ATTR_ALIGNED(32)
lv_32fc_t phase_vec[16];
for (vec_ind = 0; vec_ind < 16; ++vec_ind)
{
phase_vec[vec_ind] = _phase;
_phase *= phase_inc;
}
z0 = _mm256_load_ps((float*)phase_vec);
z1 = _mm256_load_ps((float*)(phase_vec + 4));
z2 = _mm256_load_ps((float*)(phase_vec + 8));
z3 = _mm256_load_ps((float*)(phase_vec + 12));
lv_32fc_t dz = phase_inc;
dz *= dz;
dz *= dz;
dz *= dz;
dz *= dz;  // dz = phase_inc^16;
for (vec_ind = 0; vec_ind < 4; ++vec_ind)
{
phase_vec[vec_ind] = dz;
}
__m256 dz_reg = _mm256_load_ps((float*)phase_vec);
dz_reg = _mm256_complexnormalise_ps(dz_reg);
for (; number < sixteenthPoints; number++)
{
a0Val = _mm256_load_ps(aPtr);
a1Val = _mm256_load_ps(aPtr + 8);
a2Val = _mm256_load_ps(aPtr + 16);
a3Val = _mm256_load_ps(aPtr + 24);
a0Val = _mm256_complexmul_ps(a0Val, z0);
a1Val = _mm256_complexmul_ps(a1Val, z1);
a2Val = _mm256_complexmul_ps(a2Val, z2);
a3Val = _mm256_complexmul_ps(a3Val, z3);
z0 = _mm256_complexmul_ps(z0, dz_reg);
z1 = _mm256_complexmul_ps(z1, dz_reg);
z2 = _mm256_complexmul_ps(z2, dz_reg);
z3 = _mm256_complexmul_ps(z3, dz_reg);
for (vec_ind = 0; vec_ind < num_a_vectors; ++vec_ind)
{
x0Val[vec_ind] = _mm256_loadu_ps(bPtr[vec_ind]);  // t0|t1|t2|t3|t4|t5|t6|t7
x1Val[vec_ind] = _mm256_loadu_ps(bPtr[vec_ind] + 8);
x0loVal[vec_ind] = _mm256_unpacklo_ps(x0Val[vec_ind], x0Val[vec_ind]);  // t0|t0|t1|t1|t4|t4|t5|t5
x0hiVal[vec_ind] = _mm256_unpackhi_ps(x0Val[vec_ind], x0Val[vec_ind]);  // t2|t2|t3|t3|t6|t6|t7|t7
x1loVal[vec_ind] = _mm256_unpacklo_ps(x1Val[vec_ind], x1Val[vec_ind]);
x1hiVal[vec_ind] = _mm256_unpackhi_ps(x1Val[vec_ind], x1Val[vec_ind]);
// TODO: it may be possible to rearrange swizzling to better pipeline data
b0Val[vec_ind] = _mm256_permute2f128_ps(x0loVal[vec_ind], x0hiVal[vec_ind], 0x20);  // t0|t0|t1|t1|t2|t2|t3|t3
b1Val[vec_ind] = _mm256_permute2f128_ps(x0loVal[vec_ind], x0hiVal[vec_ind], 0x31);  // t4|t4|t5|t5|t6|t6|t7|t7
b2Val[vec_ind] = _mm256_permute2f128_ps(x1loVal[vec_ind], x1hiVal[vec_ind], 0x20);
b3Val[vec_ind] = _mm256_permute2f128_ps(x1loVal[vec_ind], x1hiVal[vec_ind], 0x31);
@ -433,43 +443,44 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_
// Force the rotators back onto the unit circle
if ((number % 64) == 0)
{
z0 = _mm256_complexnormalise_ps(z0);
z1 = _mm256_complexnormalise_ps(z1);
z2 = _mm256_complexnormalise_ps(z2);
z3 = _mm256_complexnormalise_ps(z3);
}
aPtr += 32;
}
__VOLK_ATTR_ALIGNED(32)
lv_32fc_t dotProductVector[4];
for (vec_ind = 0; vec_ind < num_a_vectors; ++vec_ind)
{
dotProdVal0[vec_ind] = _mm256_add_ps(dotProdVal0[vec_ind], dotProdVal1[vec_ind]);
dotProdVal0[vec_ind] = _mm256_add_ps(dotProdVal0[vec_ind], dotProdVal2[vec_ind]);
dotProdVal0[vec_ind] = _mm256_add_ps(dotProdVal0[vec_ind], dotProdVal3[vec_ind]);
_mm256_store_ps((float*)dotProductVector, dotProdVal0[vec_ind]);  // Store the results back into the dot product vector
result[vec_ind] = lv_cmake(0, 0);
for (i = 0; i < 4; ++i)
{
result[vec_ind] += dotProductVector[i];
}
}
z0 = _mm256_complexnormalise_ps(z0);
_mm256_store_ps((float*)phase_vec, z0);
_phase = phase_vec[0];
_mm256_zeroupper();
number = sixteenthPoints * 16;
for (; number < num_points; number++)
{
wo = (*aPtr++) * _phase;
_phase *= phase_inc;
for (vec_ind = 0; vec_ind < num_a_vectors; ++vec_ind)
{
result[vec_ind] += wo * in_a[vec_ind][number];
}
@ -482,5 +493,3 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_
#endif /* LV_HAVE_AVX */
#endif /* INCLUDED_volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_H */
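The generic and AVX flavors above all implement the same recurrence; here is a compact C99 model of it (a sketch with illustrative names, assuming lv_32fc_t behaves like C99 float complex):

#include <complex.h>
#include <math.h>

/* Rotate the common signal by a running phasor, accumulate against each
   real-valued vector, and renormalize the phasor periodically, just as the
   SIMD versions do every 64/256 iterations. Hypothetical helper name. */
static void rotator_dot_prod_sketch(float complex* result, const float complex* in_common,
    const float* const* in_a, float complex phase_inc, float complex* phase,
    int num_a_vectors, unsigned int num_points)
{
    int v;
    unsigned int n;
    for (v = 0; v < num_a_vectors; v++) result[v] = 0.0f + 0.0f * I;
    for (n = 0; n < num_points; n++)
        {
            float complex rotated = in_common[n] * (*phase);
            (*phase) *= phase_inc;
            if ((n % 256) == 0) (*phase) /= cabsf(*phase);  // keep |phase| == 1
            for (v = 0; v < num_a_vectors; v++)
                {
                    result[v] += rotated * in_a[v][n];
                }
        }
}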
@ -42,7 +42,7 @@
#ifdef LV_HAVE_GENERIC
static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_generic(lv_32fc_t* result, const lv_32fc_t* local_code, const float* in, unsigned int num_points)
{
// phases must be normalized. Phase rotator expects a complex exponential input!
float rem_carrier_phase_in_rad = 0.25;
@ -53,15 +53,15 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_generic(lv
phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad));
unsigned int n;
int num_a_vectors = 3;
float** in_a = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_a_vectors, volk_gnsssdr_get_alignment());
for (n = 0; n < num_a_vectors; n++)
{
in_a[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment());
memcpy((float*)in_a[n], (float*)in, sizeof(float) * num_points);
}
volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic(result, local_code, phase_inc[0], phase, (const float**)in_a, num_a_vectors, num_points);
for (n = 0; n < num_a_vectors; n++)
{
volk_gnsssdr_free(in_a[n]);
}
@ -71,7 +71,7 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_generic(lv
#ifdef LV_HAVE_GENERIC
static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_generic_reload(lv_32fc_t* result, const lv_32fc_t* local_code, const float* in, unsigned int num_points)
{
// phases must be normalized. Phase rotator expects a complex exponential input!
float rem_carrier_phase_in_rad = 0.25;
@ -82,15 +82,15 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_generic_re
phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad));
unsigned int n;
int num_a_vectors = 3;
float** in_a = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_a_vectors, volk_gnsssdr_get_alignment());
for (n = 0; n < num_a_vectors; n++)
{
in_a[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment());
memcpy((float*)in_a[n], (float*)in, sizeof(float) * num_points);
}
volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic_reload(result, local_code, phase_inc[0], phase, (const float**)in_a, num_a_vectors, num_points);
for (n = 0; n < num_a_vectors; n++)
{
volk_gnsssdr_free(in_a[n]);
}
@ -100,7 +100,7 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_generic_re
#endif  // Generic
#ifdef LV_HAVE_AVX
static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_u_avx(lv_32fc_t* result, const lv_32fc_t* local_code, const float* in, unsigned int num_points)
{
// phases must be normalized. Phase rotator expects a complex exponential input!
float rem_carrier_phase_in_rad = 0.25;
@ -111,15 +111,15 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_u_avx(lv_3
phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad));
unsigned int n;
int num_a_vectors = 3;
float** in_a = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_a_vectors, volk_gnsssdr_get_alignment());
for (n = 0; n < num_a_vectors; n++)
{
in_a[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment());
memcpy((float*)in_a[n], (float*)in, sizeof(float) * num_points);
}
volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_u_avx(result, local_code, phase_inc[0], phase, (const float**)in_a, num_a_vectors, num_points);
for (n = 0; n < num_a_vectors; n++)
{
volk_gnsssdr_free(in_a[n]);
}
@ -130,7 +130,7 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_u_avx(lv_3
#ifdef LV_HAVE_AVX
static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_a_avx(lv_32fc_t* result, const lv_32fc_t* local_code, const float* in, unsigned int num_points)
{
// phases must be normalized. Phase rotator expects a complex exponential input!
float rem_carrier_phase_in_rad = 0.25;
@ -141,15 +141,15 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_a_avx(lv_3
phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad));
unsigned int n;
int num_a_vectors = 3;
float** in_a = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_a_vectors, volk_gnsssdr_get_alignment());
for (n = 0; n < num_a_vectors; n++)
{
in_a[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment());
memcpy((float*)in_a[n], (float*)in, sizeof(float) * num_points);
}
volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_a_avx(result, local_code, phase_inc[0], phase, (const float**)in_a, num_a_vectors, num_points);
for (n = 0; n < num_a_vectors; n++)
{
volk_gnsssdr_free(in_a[n]);
}
@ -159,4 +159,3 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_a_avx(lv_3
#endif  // AVX
#endif  // INCLUDED_volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_H
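A note on the pattern above: the "puppet" wrappers adapt the multi-vector rotator kernels to the single-input signature that the VOLK_GNSSSDR test machinery drives, fanning the test signal out into num_a_vectors aligned copies before the call and freeing them afterwards. This is why every flavor repeats the same allocate/memcpy/free scaffolding around a different kernel name.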
@ -80,10 +80,12 @@ static inline void volk_gnsssdr_32fc_convert_16ic_u_sse2(lv_16sc_t* outputVector
const __m128 vmin_val = _mm_set_ps1(min_val);
const __m128 vmax_val = _mm_set_ps1(max_val);
for (i = 0; i < sse_iters; i++)
{
inputVal1 = _mm_loadu_ps((float*)inputVectorPtr);
inputVectorPtr += 4;
inputVal2 = _mm_loadu_ps((float*)inputVectorPtr);
inputVectorPtr += 4;
__VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 8);
// Clip
@ -99,12 +101,12 @@ static inline void volk_gnsssdr_32fc_convert_16ic_u_sse2(lv_16sc_t* outputVector
outputVectorPtr += 8;
}
for (i = sse_iters * 8; i < num_points * 2; i++)
{
aux = *inputVectorPtr++;
if (aux > max_val)
aux = max_val;
else if (aux < min_val)
aux = min_val;
*outputVectorPtr++ = (int16_t)rintf(aux);
}
@ -128,15 +130,17 @@ static inline void volk_gnsssdr_32fc_convert_16ic_u_sse(lv_16sc_t* outputVector,
const float max_val = (float)SHRT_MAX;
__m128 inputVal1, inputVal2;
__m128i intInputVal1, intInputVal2;  // is __m128i defined in xmmintrin.h?
__m128 ret1, ret2;
const __m128 vmin_val = _mm_set_ps1(min_val);
const __m128 vmax_val = _mm_set_ps1(max_val);
for (i = 0; i < sse_iters; i++)
{
inputVal1 = _mm_loadu_ps((float*)inputVectorPtr);
inputVectorPtr += 4;
inputVal2 = _mm_loadu_ps((float*)inputVectorPtr);
inputVectorPtr += 4;
__VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 8);
// Clip
@ -152,12 +156,12 @@ static inline void volk_gnsssdr_32fc_convert_16ic_u_sse(lv_16sc_t* outputVector,
outputVectorPtr += 8;
}
for (i = sse_iters * 8; i < num_points * 2; i++)
{
aux = *inputVectorPtr++;
if (aux > max_val)
aux = max_val;
else if (aux < min_val)
aux = min_val;
*outputVectorPtr++ = (int16_t)rintf(aux);
}
@ -175,7 +179,7 @@ static inline void volk_gnsssdr_32fc_convert_16ic_u_avx2(lv_16sc_t* outputVector
int16_t* outputVectorPtr = (int16_t*)outputVector;
float aux;
unsigned int i;
const float min_val = (float)SHRT_MIN;  ///todo Something off here, compiler does not perform right cast
const float max_val = (float)SHRT_MAX;
__m256 inputVal1, inputVal2;
@ -184,10 +188,12 @@ static inline void volk_gnsssdr_32fc_convert_16ic_u_avx2(lv_16sc_t* outputVector
const __m256 vmin_val = _mm256_set1_ps(min_val);
const __m256 vmax_val = _mm256_set1_ps(max_val);
for (i = 0; i < avx2_iters; i++)
{
inputVal1 = _mm256_loadu_ps((float*)inputVectorPtr);
inputVectorPtr += 8;
inputVal2 = _mm256_loadu_ps((float*)inputVectorPtr);
inputVectorPtr += 8;
__VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 16);
// Clip
@ -204,12 +210,12 @@ static inline void volk_gnsssdr_32fc_convert_16ic_u_avx2(lv_16sc_t* outputVector
outputVectorPtr += 16;
}
for (i = avx2_iters * 16; i < num_points * 2; i++)
{
aux = *inputVectorPtr++;
if (aux > max_val)
aux = max_val;
else if (aux < min_val)
aux = min_val;
*outputVectorPtr++ = (int16_t)rintf(aux);
}
@ -238,10 +244,12 @@ static inline void volk_gnsssdr_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector
const __m128 vmin_val = _mm_set_ps1(min_val);
const __m128 vmax_val = _mm_set_ps1(max_val);
for (i = 0; i < sse_iters; i++)
{
inputVal1 = _mm_load_ps((float*)inputVectorPtr);
inputVectorPtr += 4;
inputVal2 = _mm_load_ps((float*)inputVectorPtr);
inputVectorPtr += 4;
__VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 8);
// Clip
@ -257,12 +265,12 @@ static inline void volk_gnsssdr_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector
outputVectorPtr += 8;
}
for (i = sse_iters * 8; i < num_points * 2; i++)
{
aux = *inputVectorPtr++;
if (aux > max_val)
aux = max_val;
else if (aux < min_val)
aux = min_val;
*outputVectorPtr++ = (int16_t)rintf(aux);
}
@ -289,10 +297,12 @@ static inline void volk_gnsssdr_32fc_convert_16ic_a_sse(lv_16sc_t* outputVector,
const __m128 vmin_val = _mm_set_ps1(min_val);
const __m128 vmax_val = _mm_set_ps1(max_val);
for (i = 0; i < sse_iters; i++)
{
inputVal1 = _mm_load_ps((float*)inputVectorPtr);
inputVectorPtr += 4;
inputVal2 = _mm_load_ps((float*)inputVectorPtr);
inputVectorPtr += 4;
__VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 8);
// Clip
@ -308,12 +318,12 @@ static inline void volk_gnsssdr_32fc_convert_16ic_a_sse(lv_16sc_t* outputVector,
outputVectorPtr += 8;
}
for (i = sse_iters * 8; i < num_points * 2; i++)
{
aux = *inputVectorPtr++;
if (aux > max_val)
aux = max_val;
else if (aux < min_val)
aux = min_val;
*outputVectorPtr++ = (int16_t)rintf(aux);
}
@ -332,7 +342,7 @@ static inline void volk_gnsssdr_32fc_convert_16ic_a_avx2(lv_16sc_t* outputVector
int16_t* outputVectorPtr = (int16_t*)outputVector;
float aux;
unsigned int i;
const float min_val = (float)SHRT_MIN;  ///todo Something off here, compiler does not perform right cast
const float max_val = (float)SHRT_MAX;
__m256 inputVal1, inputVal2;
@ -341,10 +351,12 @@ static inline void volk_gnsssdr_32fc_convert_16ic_a_avx2(lv_16sc_t* outputVector
const __m256 vmin_val = _mm256_set1_ps(min_val);
const __m256 vmax_val = _mm256_set1_ps(max_val);
for (i = 0; i < avx2_iters; i++)
{
inputVal1 = _mm256_load_ps((float*)inputVectorPtr);
inputVectorPtr += 8;
inputVal2 = _mm256_load_ps((float*)inputVectorPtr);
inputVectorPtr += 8;
__VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 16);
// Clip
@ -361,12 +373,12 @@ static inline void volk_gnsssdr_32fc_convert_16ic_a_avx2(lv_16sc_t* outputVector
outputVectorPtr += 16;
}
for (i = avx2_iters * 16; i < num_points * 2; i++)
{
aux = *inputVectorPtr++;
if (aux > max_val)
aux = max_val;
else if (aux < min_val)
aux = min_val;
*outputVectorPtr++ = (int16_t)rintf(aux);
}
@ -397,10 +409,12 @@ static inline void volk_gnsssdr_32fc_convert_16ic_neon(lv_16sc_t* outputVector,
int16x4_t intInputVal1, intInputVal2;
int16x8_t res;
for (i = 0; i < neon_iters; i++)
{
a = vld1q_f32((const float32_t*)(inputVectorPtr));
inputVectorPtr += 4;
b = vld1q_f32((const float32_t*)(inputVectorPtr));
inputVectorPtr += 4;
__VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 8);
ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val);
@ -425,12 +439,12 @@ static inline void volk_gnsssdr_32fc_convert_16ic_neon(lv_16sc_t* outputVector,
outputVectorPtr += 8;
}
for (i = neon_iters * 8; i < num_points * 2; i++)
{
aux = *inputVectorPtr++;
if (aux > max_val_f)
aux = max_val_f;
else if (aux < min_val_f)
aux = min_val_f;
*outputVectorPtr++ = (int16_t)rintf(aux);
}
@ -449,14 +463,14 @@ static inline void volk_gnsssdr_32fc_convert_16ic_generic(lv_16sc_t* outputVecto
const float max_val = (float)SHRT_MAX;
float aux;
unsigned int i;
for (i = 0; i < num_points * 2; i++)
{
aux = *inputVectorPtr++;
if (aux > max_val)
aux = max_val;
else if (aux < min_val)
aux = min_val;
*outputVectorPtr++ = (int16_t)rintf(aux);
}
}
#endif /* LV_HAVE_GENERIC */
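Since every flavor in this file ends with the same scalar tail, the clip-then-rintf idiom is worth making explicit. A standalone sketch (hypothetical helper name) showing why a bare cast would not match the SIMD saturating behavior:

#include <limits.h>
#include <math.h>
#include <stdint.h>

/* Saturating float-to-int16 conversion: a bare (int16_t) cast truncates
   toward zero and overflows on out-of-range inputs, while clamping to
   [SHRT_MIN, SHRT_MAX] and rounding with rintf() matches the packed
   saturating conversions used by the SIMD kernels above. */
static int16_t float_to_int16_saturating(float x)
{
    if (x > (float)SHRT_MAX)
        x = (float)SHRT_MAX;
    else if (x < (float)SHRT_MIN)
        x = (float)SHRT_MIN;
    return (int16_t)rintf(x);
}

/* Example: float_to_int16_saturating(70000.0f) == 32767, and
   float_to_int16_saturating(-1.5f) == -2 under the default
   round-to-nearest-even mode. */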
@ -72,12 +72,12 @@ static inline void volk_gnsssdr_32fc_convert_8ic_generic(lv_8sc_t* outputVector,
const float max_val = (float)SCHAR_MAX;
float aux;
unsigned int i;
for (i = 0; i < num_points * 2; i++)
{
aux = *inputVectorPtr++ * max_val;
if (aux > max_val)
aux = max_val;
else if (aux < min_val)
aux = min_val;
*outputVectorPtr++ = (int8_t)rintf(aux);
}
@ -107,12 +107,16 @@ static inline void volk_gnsssdr_32fc_convert_8ic_u_avx2(lv_8sc_t* outputVector,
const __m256 vmin_val = _mm256_set1_ps(min_val);
const __m256 vmax_val = _mm256_set1_ps(max_val);
for (i = 0; i < avx2_iters; i++)
{
inputVal1 = _mm256_loadu_ps((float*)inputVectorPtr);
inputVectorPtr += 8;
inputVal2 = _mm256_loadu_ps((float*)inputVectorPtr);
inputVectorPtr += 8;
inputVal3 = _mm256_loadu_ps((float*)inputVectorPtr);
inputVectorPtr += 8;
inputVal4 = _mm256_loadu_ps((float*)inputVectorPtr);
inputVectorPtr += 8;
__VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 32);
inputVal1 = _mm256_mul_ps(inputVal1, vmax_val);
@ -142,12 +146,12 @@ static inline void volk_gnsssdr_32fc_convert_8ic_u_avx2(lv_8sc_t* outputVector,
outputVectorPtr += 32;
}
for (i = avx2_iters * 32; i < num_points * 2; i++)
{
aux = *inputVectorPtr++ * max_val;
if (aux > max_val)
aux = max_val;
else if (aux < min_val)
aux = min_val;
*outputVectorPtr++ = (int8_t)rintf(aux);
}
@ -177,12 +181,16 @@ static inline void volk_gnsssdr_32fc_convert_8ic_a_avx2(lv_8sc_t* outputVector,
const __m256 vmin_val = _mm256_set1_ps(min_val);
const __m256 vmax_val = _mm256_set1_ps(max_val);
for (i = 0; i < avx2_iters; i++)
{
inputVal1 = _mm256_load_ps((float*)inputVectorPtr);
inputVectorPtr += 8;
inputVal2 = _mm256_load_ps((float*)inputVectorPtr);
inputVectorPtr += 8;
inputVal3 = _mm256_load_ps((float*)inputVectorPtr);
inputVectorPtr += 8;
inputVal4 = _mm256_load_ps((float*)inputVectorPtr);
inputVectorPtr += 8;
__VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 32);
inputVal1 = _mm256_mul_ps(inputVal1, vmax_val);
@ -212,12 +220,12 @@ static inline void volk_gnsssdr_32fc_convert_8ic_a_avx2(lv_8sc_t* outputVector,
outputVectorPtr += 32;
}
for (i = avx2_iters * 32; i < num_points * 2; i++)
{
aux = *inputVectorPtr++ * max_val;
if (aux > max_val)
aux = max_val;
else if (aux < min_val)
aux = min_val;
*outputVectorPtr++ = (int8_t)rintf(aux);
}
@ -247,12 +255,16 @@ static inline void volk_gnsssdr_32fc_convert_8ic_u_sse2(lv_8sc_t* outputVector,
const __m128 vmin_val = _mm_set_ps1(min_val);
const __m128 vmax_val = _mm_set_ps1(max_val);
for (i = 0; i < sse_iters; i++)
{
inputVal1 = _mm_loadu_ps((float*)inputVectorPtr);
inputVectorPtr += 4;
inputVal2 = _mm_loadu_ps((float*)inputVectorPtr);
inputVectorPtr += 4;
inputVal3 = _mm_loadu_ps((float*)inputVectorPtr);
inputVectorPtr += 4;
inputVal4 = _mm_loadu_ps((float*)inputVectorPtr);
inputVectorPtr += 4;
inputVal1 = _mm_mul_ps(inputVal1, vmax_val);
inputVal2 = _mm_mul_ps(inputVal2, vmax_val);
@ -278,12 +290,12 @@ static inline void volk_gnsssdr_32fc_convert_8ic_u_sse2(lv_8sc_t* outputVector,
outputVectorPtr += 16;
}
for (i = sse_iters * 16; i < num_points * 2; i++)
{
aux = *inputVectorPtr++ * max_val;
if (aux > max_val)
aux = max_val;
else if (aux < min_val)
aux = min_val;
*outputVectorPtr++ = (int8_t)rintf(aux);
}
@ -313,12 +325,16 @@ static inline void volk_gnsssdr_32fc_convert_8ic_a_sse2(lv_8sc_t* outputVector,
const __m128 vmin_val = _mm_set_ps1(min_val);
const __m128 vmax_val = _mm_set_ps1(max_val);
for (i = 0; i < sse_iters; i++)
{
inputVal1 = _mm_load_ps((float*)inputVectorPtr);
inputVectorPtr += 4;
inputVal2 = _mm_load_ps((float*)inputVectorPtr);
inputVectorPtr += 4;
inputVal3 = _mm_load_ps((float*)inputVectorPtr);
inputVectorPtr += 4;
inputVal4 = _mm_load_ps((float*)inputVectorPtr);
inputVectorPtr += 4;
inputVal1 = _mm_mul_ps(inputVal1, vmax_val);
inputVal2 = _mm_mul_ps(inputVal2, vmax_val);
@ -344,12 +360,12 @@ static inline void volk_gnsssdr_32fc_convert_8ic_a_sse2(lv_8sc_t* outputVector,
outputVectorPtr += 16;
}
for (i = sse_iters * 16; i < num_points * 2; i++)
{
aux = *inputVectorPtr++ * max_val;
if (aux > max_val)
aux = max_val;
else if (aux < min_val)
aux = min_val;
*outputVectorPtr++ = (int8_t)rintf(aux);
}
@ -383,9 +399,10 @@ static inline void volk_gnsssdr_32fc_convert_8ic_neon(lv_8sc_t* outputVector, co
int8x8_t res8_1, res8_2;
int8x16_t outputVal;
for (i = 0; i < neon_iters; i++)
{
a = vld1q_f32((const float32_t*)inputVectorPtr);
inputVectorPtr += 4;
a = vmulq_f32(a, max_val);
ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val);
sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(ret1), 31)));
@ -394,7 +411,8 @@ static inline void volk_gnsssdr_32fc_convert_8ic_neon(lv_8sc_t* outputVector, co
toint_a = vcvtq_s32_f32(Round);
intInputVal1 = vqmovn_s32(toint_a);
a = vld1q_f32((const float32_t*)inputVectorPtr);
inputVectorPtr += 4;
a = vmulq_f32(a, max_val);
ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val);
sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(ret1), 31)));
@ -406,7 +424,8 @@ static inline void volk_gnsssdr_32fc_convert_8ic_neon(lv_8sc_t* outputVector, co
pack16_8_1 = vcombine_s16(intInputVal1, intInputVal2);
res8_1 = vqmovn_s16(pack16_8_1);
a = vld1q_f32((const float32_t*)inputVectorPtr);
inputVectorPtr += 4;
a = vmulq_f32(a, max_val);
ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val);
sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(ret1), 31)));
@ -415,7 +434,8 @@ static inline void volk_gnsssdr_32fc_convert_8ic_neon(lv_8sc_t* outputVector, co
toint_a = vcvtq_s32_f32(Round);
intInputVal1 = vqmovn_s32(toint_a);
a = vld1q_f32((const float32_t*)inputVectorPtr);
inputVectorPtr += 4;
a = vmulq_f32(a, max_val);
ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val);
sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(ret1), 31)));
@ -433,12 +453,12 @@ static inline void volk_gnsssdr_32fc_convert_8ic_neon(lv_8sc_t* outputVector, co
outputVectorPtr += 16;
}
for (i = neon_iters * 16; i < num_points * 2; i++)
{
aux = *inputVectorPtr++ * max_val_f;
if (aux > max_val_f)
aux = max_val_f;
else if (aux < min_val_f)
aux = min_val_f;
*outputVectorPtr++ = (int8_t)rintf(aux);
}
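A scalar sketch of the rounding trick behind the sign/Round lines above (assuming the elided code adds one half and subtracts the extracted sign bit, which is what the visible vshrq_n_u32 sign extraction supports): vcvtq_s32_f32 truncates toward zero, so the correction turns truncation into round-half-away-from-zero.

#include <math.h>

/* Hypothetical helper, illustrative only: replicate the NEON
   add-half-minus-sign rounding in plain C. */
static int round_half_away_sketch(float x)
{
    float sign = (x < 0.0f) ? 1.0f : 0.0f;  // matches the shifted sign bit
    return (int)(x + 0.5f - sign);          // truncation finishes the rounding
}

/* round_half_away_sketch(1.5f) == 2, round_half_away_sketch(-1.5f) == -2,
   round_half_away_sketch(-1.2f) == -1. */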
@ -42,31 +42,30 @@
#include <string.h>
#ifdef LV_HAVE_GENERIC
static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_generic(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points)
{
int code_length_chips = 2046;
float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
int num_out_vectors = 3;
float rem_code_phase_chips = -0.234;
unsigned int n;
float shifts_chips[3] = {-0.1, 0.0, 0.1};
lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
for (n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
}
volk_gnsssdr_32fc_xn_resampler_32fc_xn_generic(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points);
for (n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
volk_gnsssdr_free(result_aux);
}
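// With code_phase_step_chips = (2046 + 0.1) / num_points, the test sweep
// covers just over one full 2046-chip code period regardless of num_points,
// so the modulo/wraparound path of every SIMD flavor below gets exercised.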
@ -78,26 +77,26 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_generic(lv_32fc_t* r
static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_a_sse3(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points)
{
int code_length_chips = 2046;
float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
int num_out_vectors = 3;
float rem_code_phase_chips = -0.234;
unsigned int n;
float shifts_chips[3] = {-0.1, 0.0, 0.1};
lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
for (n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
}
volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_sse3(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points);
for (n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
volk_gnsssdr_free(result_aux);
}
@ -107,26 +106,26 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_a_sse3(lv_32fc_t* re
static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_u_sse3(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points)
{
int code_length_chips = 2046;
float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
int num_out_vectors = 3;
float rem_code_phase_chips = -0.234;
unsigned int n;
float shifts_chips[3] = {-0.1, 0.0, 0.1};
lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
for (n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
}
volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_sse3(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points);
for (n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
volk_gnsssdr_free(result_aux);
}
@ -137,26 +136,26 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_u_sse3(lv_32fc_t* re
static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_u_sse4_1(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points)
{
int code_length_chips = 2046;
float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
int num_out_vectors = 3;
float rem_code_phase_chips = -0.234;
unsigned int n;
float shifts_chips[3] = {-0.1, 0.0, 0.1};
lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
for (n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
}
volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_sse4_1(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points);
for (n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
volk_gnsssdr_free(result_aux);
}
@ -166,26 +165,26 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_u_sse4_1(lv_32fc_t*
static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_a_sse4_1(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points)
{
int code_length_chips = 2046;
float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
int num_out_vectors = 3;
float rem_code_phase_chips = -0.234;
unsigned int n;
float shifts_chips[3] = {-0.1, 0.0, 0.1};
lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
for (n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
}
volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_sse4_1(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points);
for (n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
volk_gnsssdr_free(result_aux);
}
@@ -195,26 +194,26 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_a_sse4_1(lv_32fc_t*
static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_a_avx(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points) static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_a_avx(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points)
{ {
int code_length_chips = 2046; int code_length_chips = 2046;
float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
int num_out_vectors = 3; int num_out_vectors = 3;
float rem_code_phase_chips = -0.234; float rem_code_phase_chips = -0.234;
unsigned int n; unsigned int n;
float shifts_chips[3] = { -0.1, 0.0, 0.1 }; float shifts_chips[3] = {-0.1, 0.0, 0.1};
lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
for(n = 0; n < num_out_vectors; n++) for (n = 0; n < num_out_vectors; n++)
{ {
result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment()); result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
} }
volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points); memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points);
for(n = 0; n < num_out_vectors; n++) for (n = 0; n < num_out_vectors; n++)
{ {
volk_gnsssdr_free(result_aux[n]); volk_gnsssdr_free(result_aux[n]);
} }
volk_gnsssdr_free(result_aux); volk_gnsssdr_free(result_aux);
} }
#endif #endif
@@ -224,26 +223,26 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_a_avx(lv_32fc_t* res
static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_u_avx(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points) static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_u_avx(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points)
{ {
int code_length_chips = 2046; int code_length_chips = 2046;
float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
int num_out_vectors = 3; int num_out_vectors = 3;
float rem_code_phase_chips = -0.234; float rem_code_phase_chips = -0.234;
unsigned int n; unsigned int n;
float shifts_chips[3] = { -0.1, 0.0, 0.1 }; float shifts_chips[3] = {-0.1, 0.0, 0.1};
lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
for(n = 0; n < num_out_vectors; n++) for (n = 0; n < num_out_vectors; n++)
{ {
result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment()); result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
} }
volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points); memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points);
for(n = 0; n < num_out_vectors; n++) for (n = 0; n < num_out_vectors; n++)
{ {
volk_gnsssdr_free(result_aux[n]); volk_gnsssdr_free(result_aux[n]);
} }
volk_gnsssdr_free(result_aux); volk_gnsssdr_free(result_aux);
} }
#endif #endif
@@ -253,26 +252,26 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_u_avx(lv_32fc_t* res
static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_a_avx2(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points) static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_a_avx2(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points)
{ {
int code_length_chips = 2046; int code_length_chips = 2046;
float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
int num_out_vectors = 3; int num_out_vectors = 3;
float rem_code_phase_chips = -0.234; float rem_code_phase_chips = -0.234;
unsigned int n; unsigned int n;
float shifts_chips[3] = { -0.1, 0.0, 0.1 }; float shifts_chips[3] = {-0.1, 0.0, 0.1};
lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
for(n = 0; n < num_out_vectors; n++) for (n = 0; n < num_out_vectors; n++)
{ {
result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment()); result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
} }
volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx2(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx2(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points); memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points);
for(n = 0; n < num_out_vectors; n++) for (n = 0; n < num_out_vectors; n++)
{ {
volk_gnsssdr_free(result_aux[n]); volk_gnsssdr_free(result_aux[n]);
} }
volk_gnsssdr_free(result_aux); volk_gnsssdr_free(result_aux);
} }
#endif #endif
@@ -282,26 +281,26 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_a_avx2(lv_32fc_t* re
static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_u_avx2(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points) static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_u_avx2(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points)
{ {
int code_length_chips = 2046; int code_length_chips = 2046;
float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
int num_out_vectors = 3; int num_out_vectors = 3;
float rem_code_phase_chips = -0.234; float rem_code_phase_chips = -0.234;
unsigned int n; unsigned int n;
float shifts_chips[3] = { -0.1, 0.0, 0.1 }; float shifts_chips[3] = {-0.1, 0.0, 0.1};
lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
for(n = 0; n < num_out_vectors; n++) for (n = 0; n < num_out_vectors; n++)
{ {
result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment()); result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
} }
volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx2(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx2(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points); memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points);
for(n = 0; n < num_out_vectors; n++) for (n = 0; n < num_out_vectors; n++)
{ {
volk_gnsssdr_free(result_aux[n]); volk_gnsssdr_free(result_aux[n]);
} }
volk_gnsssdr_free(result_aux); volk_gnsssdr_free(result_aux);
} }
#endif #endif
@@ -311,28 +310,28 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_u_avx2(lv_32fc_t* re
static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_neon(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points) static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_neon(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points)
{ {
int code_length_chips = 2046; int code_length_chips = 2046;
float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points ); float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
int num_out_vectors = 3; int num_out_vectors = 3;
float rem_code_phase_chips = -0.234; float rem_code_phase_chips = -0.234;
unsigned int n; unsigned int n;
float shifts_chips[3] = { -0.1, 0.0, 0.1 }; float shifts_chips[3] = {-0.1, 0.0, 0.1};
lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment()); lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
for(n = 0; n < num_out_vectors; n++) for (n = 0; n < num_out_vectors; n++)
{ {
result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment()); result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
} }
volk_gnsssdr_32fc_xn_resampler_32fc_xn_neon(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points); volk_gnsssdr_32fc_xn_resampler_32fc_xn_neon(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points); memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points);
for(n = 0; n < num_out_vectors; n++) for (n = 0; n < num_out_vectors; n++)
{ {
volk_gnsssdr_free(result_aux[n]); volk_gnsssdr_free(result_aux[n]);
} }
volk_gnsssdr_free(result_aux); volk_gnsssdr_free(result_aux);
} }
#endif #endif
#endif // INCLUDED_volk_gnsssdr_32fc_resamplerpuppet_32fc_H #endif // INCLUDED_volk_gnsssdr_32fc_resamplerpuppet_32fc_H
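The puppet functions in this header exist so the test harness can drive the multi-output resampler kernel through a single (result, local_code, num_points) signature: each one fabricates the same fixed parameters (a 2046-chip code, three correlator taps at -0.1, 0.0 and +0.1 chips, a code-phase step slightly above one chip per point), runs its SIMD variant, and copies the first output vector back. A minimal usage sketch, assuming only the puppet entry points shown above (the buffer sizes and the comparison step are illustrative, not part of this commit):

    /* Hedged sketch: exercising two puppets on the same aligned input. */
    unsigned int num_points = 8112;
    lv_32fc_t* code = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
    lv_32fc_t* out_a = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
    lv_32fc_t* out_b = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
    /* ... fill 'code' with a local replica code ... */
    volk_gnsssdr_32fc_resamplerxnpuppet_32fc_u_sse3(out_a, code, num_points);
    volk_gnsssdr_32fc_resamplerxnpuppet_32fc_a_sse4_1(out_b, code, num_points);
    /* out_a and out_b should agree to within float tolerance */
    volk_gnsssdr_free(code);
    volk_gnsssdr_free(out_a);
    volk_gnsssdr_free(out_b);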
View File
@@ -85,11 +85,11 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_generic(lv_32fc
unsigned int n; unsigned int n;
for (n_vec = 0; n_vec < num_a_vectors; n_vec++) for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{ {
result[n_vec] = lv_cmake(0,0); result[n_vec] = lv_cmake(0, 0);
} }
for (n = 0; n < num_points; n++) for (n = 0; n < num_points; n++)
{ {
tmp32_1 = *in_common++ * (*phase);//if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase)); tmp32_1 = *in_common++ * (*phase); //if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase));
// Regenerate phase // Regenerate phase
if (n % 256 == 0) if (n % 256 == 0)
@@ -126,7 +126,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_generic_reload(
unsigned int j; unsigned int j;
for (n_vec = 0; n_vec < num_a_vectors; n_vec++) for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{ {
result[n_vec] = lv_cmake(0,0); result[n_vec] = lv_cmake(0, 0);
} }
for (n = 0; n < num_points / ROTATOR_RELOAD; n++) for (n = 0; n < num_points / ROTATOR_RELOAD; n++)
@@ -141,7 +141,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_generic_reload(
result[n_vec] += tmp32_2; result[n_vec] += tmp32_2;
} }
} }
/* Regenerate phase */ /* Regenerate phase */
#ifdef __cplusplus #ifdef __cplusplus
(*phase) /= std::abs((*phase)); (*phase) /= std::abs((*phase));
#else #else
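The renormalization in the hunk above exists because the phase is advanced by repeated complex multiplication, and in float arithmetic its magnitude drifts away from 1, which would slowly scale the dot product. Dividing by the magnitude every ROTATOR_RELOAD iterations (or every 256 samples in the plain generic kernel) keeps the rotator on the unit circle. A minimal scalar sketch of the drift being corrected, assuming C99 complex arithmetic for lv_32fc_t; using hypotf on the C side is our assumption, mirroring what std::abs does in the C++ branch:

    #include <math.h>
    /* Sketch: unit-magnitude rotator with periodic renormalization. */
    lv_32fc_t phase = lv_cmake(cosf(0.25f), sinf(0.25f)); /* must start as a complex exponential */
    const lv_32fc_t phase_inc = lv_cmake(cosf(-0.3f), sinf(-0.3f));
    unsigned int n;
    for (n = 0; n < 8192; n++)
        {
            phase *= phase_inc; /* rounding error accumulates in |phase| */
            if (n % 256 == 0)
                {
                    phase /= hypotf(lv_creal(phase), lv_cimag(phase)); /* back to magnitude 1 */
                }
        }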
@@ -169,7 +169,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_generic_reload(
#include <pmmintrin.h> #include <pmmintrin.h>
static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_32fc_t** in_a, int num_a_vectors, unsigned int num_points) static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_32fc_t** in_a, int num_a_vectors, unsigned int num_points)
{ {
lv_32fc_t dotProduct = lv_cmake(0,0); lv_32fc_t dotProduct = lv_cmake(0, 0);
lv_32fc_t tmp32_1, tmp32_2; lv_32fc_t tmp32_1, tmp32_2;
const unsigned int sse_iters = num_points / 2; const unsigned int sse_iters = num_points / 2;
int n_vec; int n_vec;
@@ -179,7 +179,8 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(lv_32fc_
const lv_32fc_t** _in_a = in_a; const lv_32fc_t** _in_a = in_a;
const lv_32fc_t* _in_common = in_common; const lv_32fc_t* _in_common = in_common;
__VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2]; __VOLK_ATTR_ALIGNED(16)
lv_32fc_t dotProductVector[2];
__m128* acc = (__m128*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128), volk_gnsssdr_get_alignment()); __m128* acc = (__m128*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128), volk_gnsssdr_get_alignment());
@@ -191,11 +192,13 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(lv_32fc_
// phase rotation registers // phase rotation registers
__m128 a, two_phase_acc_reg, two_phase_inc_reg, yl, yh, tmp1, tmp1p, tmp2, tmp2p, z1; __m128 a, two_phase_acc_reg, two_phase_inc_reg, yl, yh, tmp1, tmp1p, tmp2, tmp2p, z1;
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_inc[2]; __VOLK_ATTR_ALIGNED(16)
lv_32fc_t two_phase_inc[2];
two_phase_inc[0] = phase_inc * phase_inc; two_phase_inc[0] = phase_inc * phase_inc;
two_phase_inc[1] = phase_inc * phase_inc; two_phase_inc[1] = phase_inc * phase_inc;
two_phase_inc_reg = _mm_load_ps((float*) two_phase_inc); two_phase_inc_reg = _mm_load_ps((float*)two_phase_inc);
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_acc[2]; __VOLK_ATTR_ALIGNED(16)
lv_32fc_t two_phase_acc[2];
two_phase_acc[0] = (*phase); two_phase_acc[0] = (*phase);
two_phase_acc[1] = (*phase) * phase_inc; two_phase_acc[1] = (*phase) * phase_inc;
two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc); two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc);
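The registers set up above give the SSE3 kernels two phase lanes, so each loop iteration rotates two complex samples: two_phase_acc starts as {phase, phase*phase_inc} and two_phase_inc holds phase_inc squared in both lanes, so one packed complex multiply advances both lanes by two samples. The same bookkeeping in scalar form (a sketch, not kernel code):

    #include <math.h>
    /* Scalar view of the two-lane phase stepping used by the SSE3 kernels. */
    lv_32fc_t phase = lv_cmake(1.0f, 0.0f);
    const lv_32fc_t phase_inc = lv_cmake(cosf(0.1f), sinf(0.1f));
    const lv_32fc_t inc2 = phase_inc * phase_inc; /* both lanes of two_phase_inc_reg */
    lv_32fc_t lane0 = phase;                      /* two_phase_acc[0]: phase for sample n   */
    lv_32fc_t lane1 = phase * phase_inc;          /* two_phase_acc[1]: phase for sample n+1 */
    lane0 *= inc2;                                /* phase for sample n+2 */
    lane1 *= inc2;                                /* phase for sample n+3 */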
@@ -203,12 +206,12 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(lv_32fc_
const __m128 ylp = _mm_moveldup_ps(two_phase_inc_reg); const __m128 ylp = _mm_moveldup_ps(two_phase_inc_reg);
const __m128 yhp = _mm_movehdup_ps(two_phase_inc_reg); const __m128 yhp = _mm_movehdup_ps(two_phase_inc_reg);
for(number = 0; number < sse_iters; number++) for (number = 0; number < sse_iters; number++)
{ {
// Phase rotation on operand in_common starts here: // Phase rotation on operand in_common starts here:
a = _mm_loadu_ps((float*)_in_common); a = _mm_loadu_ps((float*)_in_common);
// __VOLK_GNSSSDR_PREFETCH(_in_common + 4); // __VOLK_GNSSSDR_PREFETCH(_in_common + 4);
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); yh = _mm_movehdup_ps(two_phase_acc_reg);
tmp1 = _mm_mul_ps(a, yl); tmp1 = _mm_mul_ps(a, yl);
tmp1p = _mm_mul_ps(two_phase_acc_reg, ylp); tmp1p = _mm_mul_ps(two_phase_acc_reg, ylp);
@@ -219,7 +222,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(lv_32fc_
z1 = _mm_addsub_ps(tmp1, tmp2); z1 = _mm_addsub_ps(tmp1, tmp2);
two_phase_acc_reg = _mm_addsub_ps(tmp1p, tmp2p); two_phase_acc_reg = _mm_addsub_ps(tmp1p, tmp2p);
yl = _mm_moveldup_ps(z1); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(z1); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(z1); yh = _mm_movehdup_ps(z1);
//next two samples //next two samples
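The sequence moveldup / shuffle(0xB1) / movehdup / addsub that recurs in these loops is the standard SSE3 packed complex multiply: addsub subtracts in the even (real) lanes and adds in the odd (imaginary) lanes, yielding ar*yr - ai*yi and ai*yr + ar*yi. A self-contained restatement of the idiom (the helper name is ours, not the kernel's):

    #include <pmmintrin.h>
    /* z = a * y for two packed lv_32fc_t values, exactly as in the loops above. */
    static inline __m128 cmul2_sse3(__m128 a, __m128 y)
    {
        const __m128 yl = _mm_moveldup_ps(y);         /* {yr, yr, ...}            */
        const __m128 yh = _mm_movehdup_ps(y);         /* {yi, yi, ...}            */
        const __m128 t1 = _mm_mul_ps(a, yl);          /* {ar*yr, ai*yr, ...}      */
        const __m128 sw = _mm_shuffle_ps(a, a, 0xB1); /* swap re/im: {ai, ar, ...} */
        const __m128 t2 = _mm_mul_ps(sw, yh);         /* {ai*yi, ar*yi, ...}      */
        return _mm_addsub_ps(t1, t2);                 /* {ar*yr - ai*yi, ai*yr + ar*yi, ...} */
    }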
@@ -227,7 +230,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(lv_32fc_
for (n_vec = 0; n_vec < num_a_vectors; n_vec++) for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{ {
a = _mm_loadu_ps((float*)&(_in_a[n_vec][number*2])); a = _mm_loadu_ps((float*)&(_in_a[n_vec][number * 2]));
tmp1 = _mm_mul_ps(a, yl); tmp1 = _mm_mul_ps(a, yl);
a = _mm_shuffle_ps(a, a, 0xB1); a = _mm_shuffle_ps(a, a, 0xB1);
tmp2 = _mm_mul_ps(a, yh); tmp2 = _mm_mul_ps(a, yh);
@@ -247,8 +250,8 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(lv_32fc_
for (n_vec = 0; n_vec < num_a_vectors; n_vec++) for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{ {
_mm_store_ps((float*)dotProductVector, acc[n_vec]); // Store the results back into the dot product vector _mm_store_ps((float*)dotProductVector, acc[n_vec]); // Store the results back into the dot product vector
dotProduct = lv_cmake(0,0); dotProduct = lv_cmake(0, 0);
for (i = 0; i < 2; ++i) for (i = 0; i < 2; ++i)
{ {
dotProduct = dotProduct + dotProductVector[i]; dotProduct = dotProduct + dotProductVector[i];
@@ -260,7 +263,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(lv_32fc_
_mm_store_ps((float*)two_phase_acc, two_phase_acc_reg); _mm_store_ps((float*)two_phase_acc, two_phase_acc_reg);
(*phase) = two_phase_acc[0]; (*phase) = two_phase_acc[0];
for(n = sse_iters * 2; n < num_points; n++) for (n = sse_iters * 2; n < num_points; n++)
{ {
tmp32_1 = in_common[n] * (*phase); tmp32_1 = in_common[n] * (*phase);
(*phase) *= phase_inc; (*phase) *= phase_inc;
@@ -278,7 +281,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(lv_32fc_
#include <pmmintrin.h> #include <pmmintrin.h>
static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_32fc_t** in_a, int num_a_vectors, unsigned int num_points) static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_32fc_t** in_a, int num_a_vectors, unsigned int num_points)
{ {
lv_32fc_t dotProduct = lv_cmake(0,0); lv_32fc_t dotProduct = lv_cmake(0, 0);
lv_32fc_t tmp32_1, tmp32_2; lv_32fc_t tmp32_1, tmp32_2;
const unsigned int sse_iters = num_points / 2; const unsigned int sse_iters = num_points / 2;
int n_vec; int n_vec;
@@ -288,7 +291,8 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(lv_32fc_
const lv_32fc_t** _in_a = in_a; const lv_32fc_t** _in_a = in_a;
const lv_32fc_t* _in_common = in_common; const lv_32fc_t* _in_common = in_common;
__VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2]; __VOLK_ATTR_ALIGNED(16)
lv_32fc_t dotProductVector[2];
__m128* acc = (__m128*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128), volk_gnsssdr_get_alignment()); __m128* acc = (__m128*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128), volk_gnsssdr_get_alignment());
@@ -300,11 +304,13 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(lv_32fc_
// phase rotation registers // phase rotation registers
__m128 a, two_phase_acc_reg, two_phase_inc_reg, yl, yh, tmp1, tmp1p, tmp2, tmp2p, z1; __m128 a, two_phase_acc_reg, two_phase_inc_reg, yl, yh, tmp1, tmp1p, tmp2, tmp2p, z1;
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_inc[2]; __VOLK_ATTR_ALIGNED(16)
lv_32fc_t two_phase_inc[2];
two_phase_inc[0] = phase_inc * phase_inc; two_phase_inc[0] = phase_inc * phase_inc;
two_phase_inc[1] = phase_inc * phase_inc; two_phase_inc[1] = phase_inc * phase_inc;
two_phase_inc_reg = _mm_load_ps((float*) two_phase_inc); two_phase_inc_reg = _mm_load_ps((float*)two_phase_inc);
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_acc[2]; __VOLK_ATTR_ALIGNED(16)
lv_32fc_t two_phase_acc[2];
two_phase_acc[0] = (*phase); two_phase_acc[0] = (*phase);
two_phase_acc[1] = (*phase) * phase_inc; two_phase_acc[1] = (*phase) * phase_inc;
two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc); two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc);
@@ -312,12 +318,12 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(lv_32fc_
const __m128 ylp = _mm_moveldup_ps(two_phase_inc_reg); const __m128 ylp = _mm_moveldup_ps(two_phase_inc_reg);
const __m128 yhp = _mm_movehdup_ps(two_phase_inc_reg); const __m128 yhp = _mm_movehdup_ps(two_phase_inc_reg);
for(number = 0; number < sse_iters; number++) for (number = 0; number < sse_iters; number++)
{ {
// Phase rotation on operand in_common starts here: // Phase rotation on operand in_common starts here:
a = _mm_load_ps((float*)_in_common); a = _mm_load_ps((float*)_in_common);
// __VOLK_GNSSSDR_PREFETCH(_in_common + 4); // __VOLK_GNSSSDR_PREFETCH(_in_common + 4);
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); yh = _mm_movehdup_ps(two_phase_acc_reg);
tmp1 = _mm_mul_ps(a, yl); tmp1 = _mm_mul_ps(a, yl);
tmp1p = _mm_mul_ps(two_phase_acc_reg, ylp); tmp1p = _mm_mul_ps(two_phase_acc_reg, ylp);
@@ -328,7 +334,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(lv_32fc_
z1 = _mm_addsub_ps(tmp1, tmp2); z1 = _mm_addsub_ps(tmp1, tmp2);
two_phase_acc_reg = _mm_addsub_ps(tmp1p, tmp2p); two_phase_acc_reg = _mm_addsub_ps(tmp1p, tmp2p);
yl = _mm_moveldup_ps(z1); // Load yl with cr,cr,dr,dr yl = _mm_moveldup_ps(z1); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(z1); yh = _mm_movehdup_ps(z1);
//next two samples //next two samples
@@ -336,7 +342,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(lv_32fc_
for (n_vec = 0; n_vec < num_a_vectors; n_vec++) for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{ {
a = _mm_load_ps((float*)&(_in_a[n_vec][number*2])); a = _mm_load_ps((float*)&(_in_a[n_vec][number * 2]));
tmp1 = _mm_mul_ps(a, yl); tmp1 = _mm_mul_ps(a, yl);
a = _mm_shuffle_ps(a, a, 0xB1); a = _mm_shuffle_ps(a, a, 0xB1);
tmp2 = _mm_mul_ps(a, yh); tmp2 = _mm_mul_ps(a, yh);
@@ -356,8 +362,8 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(lv_32fc_
for (n_vec = 0; n_vec < num_a_vectors; n_vec++) for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{ {
_mm_store_ps((float*)dotProductVector, acc[n_vec]); // Store the results back into the dot product vector _mm_store_ps((float*)dotProductVector, acc[n_vec]); // Store the results back into the dot product vector
dotProduct = lv_cmake(0,0); dotProduct = lv_cmake(0, 0);
for (i = 0; i < 2; ++i) for (i = 0; i < 2; ++i)
{ {
dotProduct = dotProduct + dotProductVector[i]; dotProduct = dotProduct + dotProductVector[i];
@@ -369,7 +375,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(lv_32fc_
_mm_store_ps((float*)two_phase_acc, two_phase_acc_reg); _mm_store_ps((float*)two_phase_acc, two_phase_acc_reg);
(*phase) = two_phase_acc[0]; (*phase) = two_phase_acc[0];
for(n = sse_iters * 2; n < num_points; n++) for (n = sse_iters * 2; n < num_points; n++)
{ {
tmp32_1 = in_common[n] * (*phase); tmp32_1 = in_common[n] * (*phase);
(*phase) *= phase_inc; (*phase) *= phase_inc;
@@ -387,7 +393,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(lv_32fc_
#include <immintrin.h> #include <immintrin.h>
static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_32fc_t** in_a, int num_a_vectors, unsigned int num_points) static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_32fc_t** in_a, int num_a_vectors, unsigned int num_points)
{ {
lv_32fc_t dotProduct = lv_cmake(0,0); lv_32fc_t dotProduct = lv_cmake(0, 0);
lv_32fc_t tmp32_1, tmp32_2; lv_32fc_t tmp32_1, tmp32_2;
const unsigned int avx_iters = num_points / 4; const unsigned int avx_iters = num_points / 4;
int n_vec; int n_vec;
@@ -398,7 +404,8 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t
const lv_32fc_t* _in_common = in_common; const lv_32fc_t* _in_common = in_common;
lv_32fc_t _phase = (*phase); lv_32fc_t _phase = (*phase);
__VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4]; __VOLK_ATTR_ALIGNED(32)
lv_32fc_t dotProductVector[4];
__m256* acc = (__m256*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256), volk_gnsssdr_get_alignment()); __m256* acc = (__m256*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256), volk_gnsssdr_get_alignment());
@@ -431,12 +438,12 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t
const __m256 ylp = _mm256_moveldup_ps(four_phase_inc_reg); const __m256 ylp = _mm256_moveldup_ps(four_phase_inc_reg);
const __m256 yhp = _mm256_movehdup_ps(four_phase_inc_reg); const __m256 yhp = _mm256_movehdup_ps(four_phase_inc_reg);
for(number = 0; number < avx_iters; number++) for (number = 0; number < avx_iters; number++)
{ {
// Phase rotation on operand in_common starts here: // Phase rotation on operand in_common starts here:
a = _mm256_loadu_ps((float*)_in_common); a = _mm256_loadu_ps((float*)_in_common);
__VOLK_GNSSSDR_PREFETCH(_in_common + 16); __VOLK_GNSSSDR_PREFETCH(_in_common + 16);
yl = _mm256_moveldup_ps(four_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm256_moveldup_ps(four_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm256_movehdup_ps(four_phase_acc_reg); yh = _mm256_movehdup_ps(four_phase_acc_reg);
tmp1 = _mm256_mul_ps(a, yl); tmp1 = _mm256_mul_ps(a, yl);
tmp1p = _mm256_mul_ps(four_phase_acc_reg, ylp); tmp1p = _mm256_mul_ps(four_phase_acc_reg, ylp);
@@ -447,7 +454,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t
z = _mm256_addsub_ps(tmp1, tmp2); z = _mm256_addsub_ps(tmp1, tmp2);
four_phase_acc_reg = _mm256_addsub_ps(tmp1p, tmp2p); four_phase_acc_reg = _mm256_addsub_ps(tmp1p, tmp2p);
yl = _mm256_moveldup_ps(z); // Load yl with cr,cr,dr,dr yl = _mm256_moveldup_ps(z); // Load yl with cr,cr,dr,dr
yh = _mm256_movehdup_ps(z); yh = _mm256_movehdup_ps(z);
//next two samples //next two samples
@@ -475,8 +482,8 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t
for (n_vec = 0; n_vec < num_a_vectors; n_vec++) for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{ {
_mm256_store_ps((float*)dotProductVector, acc[n_vec]); // Store the results back into the dot product vector _mm256_store_ps((float*)dotProductVector, acc[n_vec]); // Store the results back into the dot product vector
dotProduct = lv_cmake(0,0); dotProduct = lv_cmake(0, 0);
for (i = 0; i < 4; ++i) for (i = 0; i < 4; ++i)
{ {
dotProduct = dotProduct + dotProductVector[i]; dotProduct = dotProduct + dotProductVector[i];
@@ -492,10 +499,10 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t
four_phase_acc_reg = _mm256_div_ps(four_phase_acc_reg, tmp2); four_phase_acc_reg = _mm256_div_ps(four_phase_acc_reg, tmp2);
_mm256_store_ps((float*)four_phase_acc, four_phase_acc_reg); _mm256_store_ps((float*)four_phase_acc, four_phase_acc_reg);
_phase = four_phase_acc[0]; _phase = four_phase_acc[0];
_mm256_zeroupper(); _mm256_zeroupper();
for(n = avx_iters * 4; n < num_points; n++) for (n = avx_iters * 4; n < num_points; n++)
{ {
tmp32_1 = *_in_common++ * _phase; tmp32_1 = *_in_common++ * _phase;
_phase *= phase_inc; _phase *= phase_inc;
@@ -514,7 +521,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t
#include <immintrin.h> #include <immintrin.h>
static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_32fc_t** in_a, int num_a_vectors, unsigned int num_points) static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_32fc_t** in_a, int num_a_vectors, unsigned int num_points)
{ {
lv_32fc_t dotProduct = lv_cmake(0,0); lv_32fc_t dotProduct = lv_cmake(0, 0);
lv_32fc_t tmp32_1, tmp32_2; lv_32fc_t tmp32_1, tmp32_2;
const unsigned int avx_iters = num_points / 4; const unsigned int avx_iters = num_points / 4;
int n_vec; int n_vec;
@@ -525,7 +532,8 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t
const lv_32fc_t* _in_common = in_common; const lv_32fc_t* _in_common = in_common;
lv_32fc_t _phase = (*phase); lv_32fc_t _phase = (*phase);
__VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4]; __VOLK_ATTR_ALIGNED(32)
lv_32fc_t dotProductVector[4];
__m256* acc = (__m256*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256), volk_gnsssdr_get_alignment()); __m256* acc = (__m256*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256), volk_gnsssdr_get_alignment());
@@ -538,7 +546,8 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t
// phase rotation registers // phase rotation registers
__m256 a, four_phase_acc_reg, yl, yh, tmp1, tmp1p, tmp2, tmp2p, z; __m256 a, four_phase_acc_reg, yl, yh, tmp1, tmp1p, tmp2, tmp2p, z;
__VOLK_ATTR_ALIGNED(32) lv_32fc_t four_phase_inc[4]; __VOLK_ATTR_ALIGNED(32)
lv_32fc_t four_phase_inc[4];
const lv_32fc_t phase_inc2 = phase_inc * phase_inc; const lv_32fc_t phase_inc2 = phase_inc * phase_inc;
const lv_32fc_t phase_inc3 = phase_inc2 * phase_inc; const lv_32fc_t phase_inc3 = phase_inc2 * phase_inc;
const lv_32fc_t phase_inc4 = phase_inc3 * phase_inc; const lv_32fc_t phase_inc4 = phase_inc3 * phase_inc;
@@ -548,7 +557,8 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t
four_phase_inc[3] = phase_inc4; four_phase_inc[3] = phase_inc4;
const __m256 four_phase_inc_reg = _mm256_load_ps((float*)four_phase_inc); const __m256 four_phase_inc_reg = _mm256_load_ps((float*)four_phase_inc);
__VOLK_ATTR_ALIGNED(32) lv_32fc_t four_phase_acc[4]; __VOLK_ATTR_ALIGNED(32)
lv_32fc_t four_phase_acc[4];
four_phase_acc[0] = _phase; four_phase_acc[0] = _phase;
four_phase_acc[1] = _phase * phase_inc; four_phase_acc[1] = _phase * phase_inc;
four_phase_acc[2] = _phase * phase_inc2; four_phase_acc[2] = _phase * phase_inc2;
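The AVX kernels widen the same scheme to four complex lanes: four_phase_inc carries the fourth power of phase_inc in every lane, and four_phase_acc is seeded with {phase, phase*inc, phase*inc^2, phase*inc^3}, so one packed multiply advances all four lanes by four samples. The same seeding in scalar form (a sketch under those assumptions):

    #include <math.h>
    lv_32fc_t phase = lv_cmake(1.0f, 0.0f);
    const lv_32fc_t inc = lv_cmake(cosf(0.1f), sinf(0.1f));
    const lv_32fc_t inc2 = inc * inc;
    const lv_32fc_t inc4 = inc2 * inc2; /* every lane of four_phase_inc_reg */
    lv_32fc_t lanes[4] = {phase, phase * inc, phase * inc2, phase * inc2 * inc};
    int k;
    for (k = 0; k < 4; k++)
        {
            lanes[k] *= inc4; /* phases for samples n+4 .. n+7 */
        }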
@@ -558,12 +568,12 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t
const __m256 ylp = _mm256_moveldup_ps(four_phase_inc_reg); const __m256 ylp = _mm256_moveldup_ps(four_phase_inc_reg);
const __m256 yhp = _mm256_movehdup_ps(four_phase_inc_reg); const __m256 yhp = _mm256_movehdup_ps(four_phase_inc_reg);
for(number = 0; number < avx_iters; number++) for (number = 0; number < avx_iters; number++)
{ {
// Phase rotation on operand in_common starts here: // Phase rotation on operand in_common starts here:
a = _mm256_load_ps((float*)_in_common); a = _mm256_load_ps((float*)_in_common);
__VOLK_GNSSSDR_PREFETCH(_in_common + 16); __VOLK_GNSSSDR_PREFETCH(_in_common + 16);
yl = _mm256_moveldup_ps(four_phase_acc_reg); // Load yl with cr,cr,dr,dr yl = _mm256_moveldup_ps(four_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm256_movehdup_ps(four_phase_acc_reg); yh = _mm256_movehdup_ps(four_phase_acc_reg);
tmp1 = _mm256_mul_ps(a, yl); tmp1 = _mm256_mul_ps(a, yl);
tmp1p = _mm256_mul_ps(four_phase_acc_reg, ylp); tmp1p = _mm256_mul_ps(four_phase_acc_reg, ylp);
@@ -574,7 +584,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t
z = _mm256_addsub_ps(tmp1, tmp2); z = _mm256_addsub_ps(tmp1, tmp2);
four_phase_acc_reg = _mm256_addsub_ps(tmp1p, tmp2p); four_phase_acc_reg = _mm256_addsub_ps(tmp1p, tmp2p);
yl = _mm256_moveldup_ps(z); // Load yl with cr,cr,dr,dr yl = _mm256_moveldup_ps(z); // Load yl with cr,cr,dr,dr
yh = _mm256_movehdup_ps(z); yh = _mm256_movehdup_ps(z);
//next two samples //next two samples
@@ -602,8 +612,8 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t
for (n_vec = 0; n_vec < num_a_vectors; n_vec++) for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{ {
_mm256_store_ps((float*)dotProductVector, acc[n_vec]); // Store the results back into the dot product vector _mm256_store_ps((float*)dotProductVector, acc[n_vec]); // Store the results back into the dot product vector
dotProduct = lv_cmake(0,0); dotProduct = lv_cmake(0, 0);
for (i = 0; i < 4; ++i) for (i = 0; i < 4; ++i)
{ {
dotProduct = dotProduct + dotProductVector[i]; dotProduct = dotProduct + dotProductVector[i];
@@ -619,10 +629,10 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t
four_phase_acc_reg = _mm256_div_ps(four_phase_acc_reg, tmp2); four_phase_acc_reg = _mm256_div_ps(four_phase_acc_reg, tmp2);
_mm256_store_ps((float*)four_phase_acc, four_phase_acc_reg); _mm256_store_ps((float*)four_phase_acc, four_phase_acc_reg);
_phase = four_phase_acc[0]; _phase = four_phase_acc[0];
_mm256_zeroupper(); _mm256_zeroupper();
for(n = avx_iters * 4; n < num_points; n++) for (n = avx_iters * 4; n < num_points; n++)
{ {
tmp32_1 = *_in_common++ * _phase; tmp32_1 = *_in_common++ * _phase;
_phase *= phase_inc; _phase *= phase_inc;
@@ -646,7 +656,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_neon(lv_32fc_t*
int n_vec; int n_vec;
int i; int i;
unsigned int number; unsigned int number;
unsigned int n ; unsigned int n;
const lv_32fc_t** _in_a = in_a; const lv_32fc_t** _in_a = in_a;
const lv_32fc_t* _in_common = in_common; const lv_32fc_t* _in_common = in_common;
lv_32fc_t* _out = result; lv_32fc_t* _out = result;
@@ -656,36 +666,41 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_neon(lv_32fc_t*
if (neon_iters > 0) if (neon_iters > 0)
{ {
lv_32fc_t dotProduct = lv_cmake(0,0); lv_32fc_t dotProduct = lv_cmake(0, 0);
float32_t arg_phase0 = cargf(_phase); float32_t arg_phase0 = cargf(_phase);
float32_t arg_phase_inc = cargf(phase_inc); float32_t arg_phase_inc = cargf(phase_inc);
float32_t phase_est; float32_t phase_est;
lv_32fc_t ___phase4 = phase_inc * phase_inc * phase_inc * phase_inc; lv_32fc_t ___phase4 = phase_inc * phase_inc * phase_inc * phase_inc;
__VOLK_ATTR_ALIGNED(16) float32_t __phase4_real[4] = { lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4) }; __VOLK_ATTR_ALIGNED(16)
__VOLK_ATTR_ALIGNED(16) float32_t __phase4_imag[4] = { lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4) }; float32_t __phase4_real[4] = {lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4)};
__VOLK_ATTR_ALIGNED(16)
float32_t __phase4_imag[4] = {lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4)};
float32x4_t _phase4_real = vld1q_f32(__phase4_real); float32x4_t _phase4_real = vld1q_f32(__phase4_real);
float32x4_t _phase4_imag = vld1q_f32(__phase4_imag); float32x4_t _phase4_imag = vld1q_f32(__phase4_imag);
lv_32fc_t phase2 = (lv_32fc_t)(_phase) * phase_inc; lv_32fc_t phase2 = (lv_32fc_t)(_phase)*phase_inc;
lv_32fc_t phase3 = phase2 * phase_inc; lv_32fc_t phase3 = phase2 * phase_inc;
lv_32fc_t phase4 = phase3 * phase_inc; lv_32fc_t phase4 = phase3 * phase_inc;
__VOLK_ATTR_ALIGNED(16) float32_t __phase_real[4] = { lv_creal((_phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) }; __VOLK_ATTR_ALIGNED(16)
__VOLK_ATTR_ALIGNED(16) float32_t __phase_imag[4] = { lv_cimag((_phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; float32_t __phase_real[4] = {lv_creal((_phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4)};
__VOLK_ATTR_ALIGNED(16)
float32_t __phase_imag[4] = {lv_cimag((_phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4)};
float32x4_t _phase_real = vld1q_f32(__phase_real); float32x4_t _phase_real = vld1q_f32(__phase_real);
float32x4_t _phase_imag = vld1q_f32(__phase_imag); float32x4_t _phase_imag = vld1q_f32(__phase_imag);
__VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4]; __VOLK_ATTR_ALIGNED(32)
lv_32fc_t dotProductVector[4];
float32x4x2_t a_val, b_val, tmp32_real, tmp32_imag; float32x4x2_t a_val, b_val, tmp32_real, tmp32_imag;
float32x4x2_t* accumulator1 = (float32x4x2_t*)volk_gnsssdr_malloc(num_a_vectors * sizeof(float32x4x2_t), volk_gnsssdr_get_alignment()); float32x4x2_t* accumulator1 = (float32x4x2_t*)volk_gnsssdr_malloc(num_a_vectors * sizeof(float32x4x2_t), volk_gnsssdr_get_alignment());
float32x4x2_t* accumulator2 = (float32x4x2_t*)volk_gnsssdr_malloc(num_a_vectors * sizeof(float32x4x2_t), volk_gnsssdr_get_alignment()); float32x4x2_t* accumulator2 = (float32x4x2_t*)volk_gnsssdr_malloc(num_a_vectors * sizeof(float32x4x2_t), volk_gnsssdr_get_alignment());
for(n_vec = 0; n_vec < num_a_vectors; n_vec++) for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{ {
accumulator1[n_vec].val[0] = vdupq_n_f32(0.0f); accumulator1[n_vec].val[0] = vdupq_n_f32(0.0f);
accumulator1[n_vec].val[1] = vdupq_n_f32(0.0f); accumulator1[n_vec].val[1] = vdupq_n_f32(0.0f);
@@ -693,7 +708,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_neon(lv_32fc_t*
accumulator2[n_vec].val[1] = vdupq_n_f32(0.0f); accumulator2[n_vec].val[1] = vdupq_n_f32(0.0f);
} }
for(number = 0; number < neon_iters; number++) for (number = 0; number < neon_iters; number++)
{ {
/* load 4 complex numbers (float 32 bits each component) */ /* load 4 complex numbers (float 32 bits each component) */
b_val = vld2q_f32((float32_t*)_in_common); b_val = vld2q_f32((float32_t*)_in_common);
@@ -728,8 +743,10 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_neon(lv_32fc_t*
phase3 = phase2 * phase_inc; phase3 = phase2 * phase_inc;
phase4 = phase3 * phase_inc; phase4 = phase3 * phase_inc;
__VOLK_ATTR_ALIGNED(16) float32_t ____phase_real[4] = { lv_creal((_phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) }; __VOLK_ATTR_ALIGNED(16)
__VOLK_ATTR_ALIGNED(16) float32_t ____phase_imag[4] = { lv_cimag((_phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; float32_t ____phase_real[4] = {lv_creal((_phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4)};
__VOLK_ATTR_ALIGNED(16)
float32_t ____phase_imag[4] = {lv_cimag((_phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4)};
_phase_real = vld1q_f32(____phase_real); _phase_real = vld1q_f32(____phase_real);
_phase_imag = vld1q_f32(____phase_imag); _phase_imag = vld1q_f32(____phase_imag);
@@ -753,8 +770,8 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_neon(lv_32fc_t*
} }
for (n_vec = 0; n_vec < num_a_vectors; n_vec++) for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{ {
vst2q_f32((float32_t*)dotProductVector, accumulator1[n_vec]); // Store the results back into the dot product vector vst2q_f32((float32_t*)dotProductVector, accumulator1[n_vec]); // Store the results back into the dot product vector
dotProduct = lv_cmake(0,0); dotProduct = lv_cmake(0, 0);
for (i = 0; i < 4; ++i) for (i = 0; i < 4; ++i)
{ {
dotProduct = dotProduct + dotProductVector[i]; dotProduct = dotProduct + dotProductVector[i];
@@ -770,7 +787,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_neon(lv_32fc_t*
_phase = lv_cmake((float32_t)__phase_real[0], (float32_t)__phase_imag[0]); _phase = lv_cmake((float32_t)__phase_real[0], (float32_t)__phase_imag[0]);
} }
for(n = neon_iters * 4; n < num_points; n++) for (n = neon_iters * 4; n < num_points; n++)
{ {
tmp32_1 = in_common[n] * _phase; tmp32_1 = in_common[n] * _phase;
_phase *= phase_inc; _phase *= phase_inc;
@@ -786,4 +803,3 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_neon(lv_32fc_t*
#endif /* LV_HAVE_NEON */ #endif /* LV_HAVE_NEON */
#endif /* INCLUDED_volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_H */ #endif /* INCLUDED_volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_H */
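Every variant in this file computes the same quantity, so a compact scalar restatement is useful as the reference the SIMD paths are checked against: each output accumulates its own vector against the common signal after rotation, and the caller's phase is left advanced past the processed block. A sketch modeled on the generic kernel above (the function name is ours; no conjugation is involved in this kernel family, and the periodic renormalization the shipped kernels perform is omitted here for brevity):

    /* Reference: result[v] = sum_n in_a[v][n] * (in_common[n] * phase_n),
       with phase_{n+1} = phase_n * phase_inc; *phase is updated on return. */
    static inline void rotator_dot_prod_ref(lv_32fc_t* result, const lv_32fc_t* in_common,
        const lv_32fc_t phase_inc, lv_32fc_t* phase,
        const lv_32fc_t** in_a, int num_a_vectors, unsigned int num_points)
    {
        int v;
        unsigned int n;
        for (v = 0; v < num_a_vectors; v++)
            {
                result[v] = lv_cmake(0, 0);
            }
        for (n = 0; n < num_points; n++)
            {
                const lv_32fc_t rotated = in_common[n] * (*phase);
                (*phase) *= phase_inc;
                for (v = 0; v < num_a_vectors; v++)
                    {
                        result[v] += rotated * in_a[v][n];
                    }
            }
    }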
View File
@@ -41,7 +41,7 @@
#include <string.h> #include <string.h>
#ifdef LV_HAVE_GENERIC #ifdef LV_HAVE_GENERIC
static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_generic(lv_32fc_t* result, const lv_32fc_t* local_code, const lv_32fc_t* in, unsigned int num_points) static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_generic(lv_32fc_t* result, const lv_32fc_t* local_code, const lv_32fc_t* in, unsigned int num_points)
{ {
// phases must be normalized. Phase rotator expects a complex exponential input! // phases must be normalized. Phase rotator expects a complex exponential input!
float rem_carrier_phase_in_rad = 0.25; float rem_carrier_phase_in_rad = 0.25;
@@ -53,14 +53,14 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_generic(lv_
unsigned int n; unsigned int n;
int num_a_vectors = 3; int num_a_vectors = 3;
lv_32fc_t** in_a = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); lv_32fc_t** in_a = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
for(n = 0; n < num_a_vectors; n++) for (n = 0; n < num_a_vectors; n++)
{ {
in_a[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment()); in_a[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
memcpy((lv_32fc_t*)in_a[n], (lv_32fc_t*)in, sizeof(lv_32fc_t) * num_points); memcpy((lv_32fc_t*)in_a[n], (lv_32fc_t*)in, sizeof(lv_32fc_t) * num_points);
} }
volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_generic_reload(result, local_code, phase_inc[0], phase, (const lv_32fc_t**) in_a, num_a_vectors, num_points); volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_generic_reload(result, local_code, phase_inc[0], phase, (const lv_32fc_t**)in_a, num_a_vectors, num_points);
for(n = 0; n < num_a_vectors; n++) for (n = 0; n < num_a_vectors; n++)
{ {
volk_gnsssdr_free(in_a[n]); volk_gnsssdr_free(in_a[n]);
} }
@@ -71,7 +71,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_generic(lv_
#ifdef LV_HAVE_GENERIC #ifdef LV_HAVE_GENERIC
static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_generic_reload(lv_32fc_t* result, const lv_32fc_t* local_code, const lv_32fc_t* in, unsigned int num_points) static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_generic_reload(lv_32fc_t* result, const lv_32fc_t* local_code, const lv_32fc_t* in, unsigned int num_points)
{ {
// phases must be normalized. Phase rotator expects a complex exponential input! // phases must be normalized. Phase rotator expects a complex exponential input!
float rem_carrier_phase_in_rad = 0.25; float rem_carrier_phase_in_rad = 0.25;
@ -83,14 +83,14 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_generic_rel
unsigned int n; unsigned int n;
int num_a_vectors = 3; int num_a_vectors = 3;
lv_32fc_t** in_a = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); lv_32fc_t** in_a = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
for(n = 0; n < num_a_vectors; n++) for (n = 0; n < num_a_vectors; n++)
{ {
in_a[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment()); in_a[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
memcpy((lv_32fc_t*)in_a[n], (lv_32fc_t*)in, sizeof(lv_32fc_t) * num_points); memcpy((lv_32fc_t*)in_a[n], (lv_32fc_t*)in, sizeof(lv_32fc_t) * num_points);
} }
volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_generic_reload(result, local_code, phase_inc[0], phase, (const lv_32fc_t**) in_a, num_a_vectors, num_points); volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_generic_reload(result, local_code, phase_inc[0], phase, (const lv_32fc_t**)in_a, num_a_vectors, num_points);
for(n = 0; n < num_a_vectors; n++) for (n = 0; n < num_a_vectors; n++)
{ {
volk_gnsssdr_free(in_a[n]); volk_gnsssdr_free(in_a[n]);
} }
@@ -101,7 +101,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_generic_rel
#ifdef LV_HAVE_SSE3 #ifdef LV_HAVE_SSE3
static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_u_sse3(lv_32fc_t* result, const lv_32fc_t* local_code, const lv_32fc_t* in, unsigned int num_points) static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_u_sse3(lv_32fc_t* result, const lv_32fc_t* local_code, const lv_32fc_t* in, unsigned int num_points)
{ {
// phases must be normalized. Phase rotator expects a complex exponential input! // phases must be normalized. Phase rotator expects a complex exponential input!
float rem_carrier_phase_in_rad = 0.25; float rem_carrier_phase_in_rad = 0.25;
@@ -113,14 +113,14 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_u_sse3(lv_3
unsigned int n; unsigned int n;
int num_a_vectors = 3; int num_a_vectors = 3;
lv_32fc_t** in_a = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); lv_32fc_t** in_a = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
for(n = 0; n < num_a_vectors; n++) for (n = 0; n < num_a_vectors; n++)
{ {
in_a[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment()); in_a[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
memcpy((lv_32fc_t*)in_a[n], (lv_32fc_t*)in, sizeof(lv_32fc_t) * num_points); memcpy((lv_32fc_t*)in_a[n], (lv_32fc_t*)in, sizeof(lv_32fc_t) * num_points);
} }
volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(result, local_code, phase_inc[0], phase, (const lv_32fc_t**) in_a, num_a_vectors, num_points); volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(result, local_code, phase_inc[0], phase, (const lv_32fc_t**)in_a, num_a_vectors, num_points);
for(n = 0; n < num_a_vectors; n++) for (n = 0; n < num_a_vectors; n++)
{ {
volk_gnsssdr_free(in_a[n]); volk_gnsssdr_free(in_a[n]);
} }
@@ -131,7 +131,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_u_sse3(lv_3
#ifdef LV_HAVE_SSE3 #ifdef LV_HAVE_SSE3
static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_a_sse3(lv_32fc_t* result, const lv_32fc_t* local_code, const lv_32fc_t* in, unsigned int num_points) static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_a_sse3(lv_32fc_t* result, const lv_32fc_t* local_code, const lv_32fc_t* in, unsigned int num_points)
{ {
// phases must be normalized. Phase rotator expects a complex exponential input! // phases must be normalized. Phase rotator expects a complex exponential input!
float rem_carrier_phase_in_rad = 0.25; float rem_carrier_phase_in_rad = 0.25;
@@ -143,14 +143,14 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_a_sse3(lv_3
unsigned int n; unsigned int n;
int num_a_vectors = 3; int num_a_vectors = 3;
lv_32fc_t** in_a = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); lv_32fc_t** in_a = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
for(n = 0; n < num_a_vectors; n++) for (n = 0; n < num_a_vectors; n++)
{ {
in_a[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment()); in_a[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
memcpy((lv_32fc_t*)in_a[n], (lv_32fc_t*)in, sizeof(lv_32fc_t) * num_points); memcpy((lv_32fc_t*)in_a[n], (lv_32fc_t*)in, sizeof(lv_32fc_t) * num_points);
} }
volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(result, local_code, phase_inc[0], phase, (const lv_32fc_t**) in_a, num_a_vectors, num_points); volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(result, local_code, phase_inc[0], phase, (const lv_32fc_t**)in_a, num_a_vectors, num_points);
for(n = 0; n < num_a_vectors; n++) for (n = 0; n < num_a_vectors; n++)
{ {
volk_gnsssdr_free(in_a[n]); volk_gnsssdr_free(in_a[n]);
} }
@@ -161,7 +161,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_a_sse3(lv_3
#ifdef LV_HAVE_AVX #ifdef LV_HAVE_AVX
static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_u_avx(lv_32fc_t* result, const lv_32fc_t* local_code, const lv_32fc_t* in, unsigned int num_points) static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_u_avx(lv_32fc_t* result, const lv_32fc_t* local_code, const lv_32fc_t* in, unsigned int num_points)
{ {
// phases must be normalized. Phase rotator expects a complex exponential input! // phases must be normalized. Phase rotator expects a complex exponential input!
float rem_carrier_phase_in_rad = 0.25; float rem_carrier_phase_in_rad = 0.25;
@@ -173,14 +173,14 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_u_avx(lv_32
unsigned int n; unsigned int n;
int num_a_vectors = 3; int num_a_vectors = 3;
lv_32fc_t** in_a = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); lv_32fc_t** in_a = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
for(n = 0; n < num_a_vectors; n++) for (n = 0; n < num_a_vectors; n++)
{ {
in_a[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment()); in_a[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
memcpy((lv_32fc_t*)in_a[n], (lv_32fc_t*)in, sizeof(lv_32fc_t) * num_points); memcpy((lv_32fc_t*)in_a[n], (lv_32fc_t*)in, sizeof(lv_32fc_t) * num_points);
} }
volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(result, local_code, phase_inc[0], phase, (const lv_32fc_t**) in_a, num_a_vectors, num_points); volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(result, local_code, phase_inc[0], phase, (const lv_32fc_t**)in_a, num_a_vectors, num_points);
for(n = 0; n < num_a_vectors; n++) for (n = 0; n < num_a_vectors; n++)
{ {
volk_gnsssdr_free(in_a[n]); volk_gnsssdr_free(in_a[n]);
} }
@@ -191,7 +191,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_u_avx(lv_32
#ifdef LV_HAVE_AVX #ifdef LV_HAVE_AVX
static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_a_avx(lv_32fc_t* result, const lv_32fc_t* local_code, const lv_32fc_t* in, unsigned int num_points) static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_a_avx(lv_32fc_t* result, const lv_32fc_t* local_code, const lv_32fc_t* in, unsigned int num_points)
{ {
// phases must be normalized. Phase rotator expects a complex exponential input! // phases must be normalized. Phase rotator expects a complex exponential input!
float rem_carrier_phase_in_rad = 0.25; float rem_carrier_phase_in_rad = 0.25;
@@ -203,14 +203,14 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_a_avx(lv_32
unsigned int n; unsigned int n;
int num_a_vectors = 3; int num_a_vectors = 3;
lv_32fc_t** in_a = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); lv_32fc_t** in_a = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
for(n = 0; n < num_a_vectors; n++) for (n = 0; n < num_a_vectors; n++)
{ {
in_a[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment()); in_a[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
memcpy((lv_32fc_t*)in_a[n], (lv_32fc_t*)in, sizeof(lv_32fc_t) * num_points); memcpy((lv_32fc_t*)in_a[n], (lv_32fc_t*)in, sizeof(lv_32fc_t) * num_points);
} }
volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(result, local_code, phase_inc[0], phase, (const lv_32fc_t**) in_a, num_a_vectors, num_points); volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(result, local_code, phase_inc[0], phase, (const lv_32fc_t**)in_a, num_a_vectors, num_points);
for(n = 0; n < num_a_vectors; n++) for (n = 0; n < num_a_vectors; n++)
{ {
volk_gnsssdr_free(in_a[n]); volk_gnsssdr_free(in_a[n]);
} }
@@ -221,7 +221,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_a_avx(lv_32
#ifdef LV_HAVE_NEON #ifdef LV_HAVE_NEON
static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_neon(lv_32fc_t* result, const lv_32fc_t* local_code, const lv_32fc_t* in, unsigned int num_points) static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_neon(lv_32fc_t* result, const lv_32fc_t* local_code, const lv_32fc_t* in, unsigned int num_points)
{ {
// phases must be normalized. Phase rotator expects a complex exponential input! // phases must be normalized. Phase rotator expects a complex exponential input!
float rem_carrier_phase_in_rad = 0.25; float rem_carrier_phase_in_rad = 0.25;
@@ -233,14 +233,14 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_neon(lv_32f
unsigned int n; unsigned int n;
int num_a_vectors = 3; int num_a_vectors = 3;
lv_32fc_t** in_a = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_a_vectors, volk_gnsssdr_get_alignment()); lv_32fc_t** in_a = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
for(n = 0; n < num_a_vectors; n++) for (n = 0; n < num_a_vectors; n++)
{ {
in_a[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment()); in_a[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
memcpy((lv_32fc_t*)in_a[n], (lv_32fc_t*)in, sizeof(lv_32fc_t) * num_points); memcpy((lv_32fc_t*)in_a[n], (lv_32fc_t*)in, sizeof(lv_32fc_t) * num_points);
} }
volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_neon(result, local_code, phase_inc[0], phase, (const lv_32fc_t**) in_a, num_a_vectors, num_points); volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_neon(result, local_code, phase_inc[0], phase, (const lv_32fc_t**)in_a, num_a_vectors, num_points);
for(n = 0; n < num_a_vectors; n++) for (n = 0; n < num_a_vectors; n++)
{ {
volk_gnsssdr_free(in_a[n]); volk_gnsssdr_free(in_a[n]);
} }
View File
@@ -107,7 +107,8 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_sse3(lv_32fc_t** res
const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips); const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips); const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
__VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; __VOLK_ATTR_ALIGNED(16)
int local_code_chip_index[4];
int local_code_chip_index_; int local_code_chip_index_;
const __m128i zeros = _mm_setzero_si128(); const __m128i zeros = _mm_setzero_si128();
@@ -121,7 +122,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_sse3(lv_32fc_t** res
shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]); shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]);
aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
__m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f); __m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
for(n = 0; n < quarterPoints; n++) for (n = 0; n < quarterPoints; n++)
{ {
aux = _mm_mul_ps(code_phase_step_chips_reg, indexn); aux = _mm_mul_ps(code_phase_step_chips_reg, indexn);
aux = _mm_add_ps(aux, aux2); aux = _mm_add_ps(aux, aux2);
@ -142,18 +143,18 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_sse3(lv_32fc_t** res
aux_i = _mm_and_si128(code_length_chips_reg_i, negatives); aux_i = _mm_and_si128(code_length_chips_reg_i, negatives);
local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i); local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i);
_mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg); _mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg);
for(k = 0; k < 4; ++k) for (k = 0; k < 4; ++k)
{ {
_result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]]; _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
} }
indexn = _mm_add_ps(indexn, fours); indexn = _mm_add_ps(indexn, fours);
} }
for(n = quarterPoints * 4; n < num_points; n++) for (n = quarterPoints * 4; n < num_points; n++)
{ {
// resample code for current tap // resample code for current tap
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
//Take into account that in multitap correlators, the shifts can be negative! //Take into account that in multitap correlators, the shifts can be negative!
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ; if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
local_code_chip_index_ = local_code_chip_index_ % code_length_chips; local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
_result[current_correlator_tap][n] = local_code[local_code_chip_index_]; _result[current_correlator_tap][n] = local_code[local_code_chip_index_];
} }
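The scalar tail above has to cope with chip indices driven negative by the correlator tap shifts. Its two-step wrap is a non-negative modulo; a standalone sketch of the same arithmetic (wrap_code_index is an illustrative helper, not part of the file):

#include <stdlib.h> /* abs */

/* Non-negative modulo, as in the scalar tail: lift a negative raw
 * index by a sufficient multiple of the code length, then reduce. */
static int wrap_code_index(int raw_index, int code_length_chips)
{
    if (raw_index < 0)
        {
            raw_index += code_length_chips * (abs(raw_index) / code_length_chips + 1);
        }
    return raw_index % code_length_chips; /* result in [0, code_length_chips) */
}

For example, wrap_code_index(-3, 1023) lifts the index by one code length to 1020.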
@ -177,7 +178,8 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_sse3(lv_32fc_t** res
const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips); const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips); const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
__VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; __VOLK_ATTR_ALIGNED(16)
int local_code_chip_index[4];
int local_code_chip_index_; int local_code_chip_index_;
const __m128i zeros = _mm_setzero_si128(); const __m128i zeros = _mm_setzero_si128();
@ -191,7 +193,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_sse3(lv_32fc_t** res
shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]); shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]);
aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
__m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f); __m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
for(n = 0; n < quarterPoints; n++) for (n = 0; n < quarterPoints; n++)
{ {
aux = _mm_mul_ps(code_phase_step_chips_reg, indexn); aux = _mm_mul_ps(code_phase_step_chips_reg, indexn);
aux = _mm_add_ps(aux, aux2); aux = _mm_add_ps(aux, aux2);
@ -212,18 +214,18 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_sse3(lv_32fc_t** res
aux_i = _mm_and_si128(code_length_chips_reg_i, negatives); aux_i = _mm_and_si128(code_length_chips_reg_i, negatives);
local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i); local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i);
_mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg); _mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg);
for(k = 0; k < 4; ++k) for (k = 0; k < 4; ++k)
{ {
_result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]]; _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
} }
indexn = _mm_add_ps(indexn, fours); indexn = _mm_add_ps(indexn, fours);
} }
for(n = quarterPoints * 4; n < num_points; n++) for (n = quarterPoints * 4; n < num_points; n++)
{ {
// resample code for current tap // resample code for current tap
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
//Take into account that in multitap correlators, the shifts can be negative! //Take into account that in multitap correlators, the shifts can be negative!
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ; if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
local_code_chip_index_ = local_code_chip_index_ % code_length_chips; local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
_result[current_correlator_tap][n] = local_code[local_code_chip_index_]; _result[current_correlator_tap][n] = local_code[local_code_chip_index_];
} }
@ -245,7 +247,8 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_sse4_1(lv_32fc_t** r
const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips); const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips); const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
__VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; __VOLK_ATTR_ALIGNED(16)
int local_code_chip_index[4];
int local_code_chip_index_; int local_code_chip_index_;
const __m128i zeros = _mm_setzero_si128(); const __m128i zeros = _mm_setzero_si128();
@ -259,7 +262,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_sse4_1(lv_32fc_t** r
shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]); shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]);
aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
__m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f); __m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
for(n = 0; n < quarterPoints; n++) for (n = 0; n < quarterPoints; n++)
{ {
aux = _mm_mul_ps(code_phase_step_chips_reg, indexn); aux = _mm_mul_ps(code_phase_step_chips_reg, indexn);
aux = _mm_add_ps(aux, aux2); aux = _mm_add_ps(aux, aux2);
@ -277,18 +280,18 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_sse4_1(lv_32fc_t** r
aux_i = _mm_and_si128(code_length_chips_reg_i, negatives); aux_i = _mm_and_si128(code_length_chips_reg_i, negatives);
local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i); local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i);
_mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg); _mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg);
for(k = 0; k < 4; ++k) for (k = 0; k < 4; ++k)
{ {
_result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]]; _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
} }
indexn = _mm_add_ps(indexn, fours); indexn = _mm_add_ps(indexn, fours);
} }
for(n = quarterPoints * 4; n < num_points; n++) for (n = quarterPoints * 4; n < num_points; n++)
{ {
// resample code for current tap // resample code for current tap
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
//Take into account that in multitap correlators, the shifts can be negative! //Take into account that in multitap correlators, the shifts can be negative!
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ; if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
local_code_chip_index_ = local_code_chip_index_ % code_length_chips; local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
_result[current_correlator_tap][n] = local_code[local_code_chip_index_]; _result[current_correlator_tap][n] = local_code[local_code_chip_index_];
} }
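In the vector body, the same negative-index wrap is done branchlessly: the compare produces an all-ones mask in negative lanes, the mask is ANDed with the code length, and the result is added back. A per-lane model of that idiom (wrap_lane_branchless is illustrative; like the vector body, it assumes a single addition of the code length is enough):

/* Per-lane model of the branchless wrap in the SIMD bodies:
 * mask = (idx < 0) ? all-ones : 0;  idx += mask & code_length. */
static int wrap_lane_branchless(int idx, int code_length_chips)
{
    const int mask = -(idx < 0);             /* 0 or 0xFFFFFFFF */
    return idx + (mask & code_length_chips); /* add only when negative */
}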
@ -311,7 +314,8 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_sse4_1(lv_32fc_t** r
const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips); const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips); const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
__VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; __VOLK_ATTR_ALIGNED(16)
int local_code_chip_index[4];
int local_code_chip_index_; int local_code_chip_index_;
const __m128i zeros = _mm_setzero_si128(); const __m128i zeros = _mm_setzero_si128();
@ -325,7 +329,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_sse4_1(lv_32fc_t** r
shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]); shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]);
aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
__m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f); __m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
for(n = 0; n < quarterPoints; n++) for (n = 0; n < quarterPoints; n++)
{ {
aux = _mm_mul_ps(code_phase_step_chips_reg, indexn); aux = _mm_mul_ps(code_phase_step_chips_reg, indexn);
aux = _mm_add_ps(aux, aux2); aux = _mm_add_ps(aux, aux2);
@ -343,18 +347,18 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_sse4_1(lv_32fc_t** r
aux_i = _mm_and_si128(code_length_chips_reg_i, negatives); aux_i = _mm_and_si128(code_length_chips_reg_i, negatives);
local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i); local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i);
_mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg); _mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg);
for(k = 0; k < 4; ++k) for (k = 0; k < 4; ++k)
{ {
_result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]]; _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
} }
indexn = _mm_add_ps(indexn, fours); indexn = _mm_add_ps(indexn, fours);
} }
for(n = quarterPoints * 4; n < num_points; n++) for (n = quarterPoints * 4; n < num_points; n++)
{ {
// resample code for current tap // resample code for current tap
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
//Take into account that in multitap correlators, the shifts can be negative! //Take into account that in multitap correlators, the shifts can be negative!
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ; if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
local_code_chip_index_ = local_code_chip_index_ % code_length_chips; local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
_result[current_correlator_tap][n] = local_code[local_code_chip_index_]; _result[current_correlator_tap][n] = local_code[local_code_chip_index_];
} }
@ -377,7 +381,8 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx(lv_32fc_t** resu
const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips); const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips);
const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips); const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips);
__VOLK_ATTR_ALIGNED(32) int local_code_chip_index[8]; __VOLK_ATTR_ALIGNED(32)
int local_code_chip_index[8];
int local_code_chip_index_; int local_code_chip_index_;
const __m256 zeros = _mm256_setzero_ps(); const __m256 zeros = _mm256_setzero_ps();
@ -392,7 +397,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx(lv_32fc_t** resu
shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]); shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]);
aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
indexn = n0; indexn = n0;
for(n = 0; n < avx_iters; n++) for (n = 0; n < avx_iters; n++)
{ {
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0); __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0);
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3); __VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3);
@ -410,13 +415,13 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx(lv_32fc_t** resu
// no negatives // no negatives
c = _mm256_cvtepi32_ps(local_code_chip_index_reg); c = _mm256_cvtepi32_ps(local_code_chip_index_reg);
negatives = _mm256_cmp_ps(c, zeros, 0x01 ); negatives = _mm256_cmp_ps(c, zeros, 0x01);
aux3 = _mm256_and_ps(code_length_chips_reg_f, negatives); aux3 = _mm256_and_ps(code_length_chips_reg_f, negatives);
aux = _mm256_add_ps(c, aux3); aux = _mm256_add_ps(c, aux3);
local_code_chip_index_reg = _mm256_cvttps_epi32(aux); local_code_chip_index_reg = _mm256_cvttps_epi32(aux);
_mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg); _mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg);
for(k = 0; k < 8; ++k) for (k = 0; k < 8; ++k)
{ {
_result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]]; _result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]];
} }
@ -426,12 +431,12 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx(lv_32fc_t** resu
_mm256_zeroupper(); _mm256_zeroupper();
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++) for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
{ {
for(n = avx_iters * 8; n < num_points; n++) for (n = avx_iters * 8; n < num_points; n++)
{ {
// resample code for current tap // resample code for current tap
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
//Take into account that in multitap correlators, the shifts can be negative! //Take into account that in multitap correlators, the shifts can be negative!
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ; if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
local_code_chip_index_ = local_code_chip_index_ % code_length_chips; local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
_result[current_correlator_tap][n] = local_code[local_code_chip_index_]; _result[current_correlator_tap][n] = local_code[local_code_chip_index_];
} }
@ -454,7 +459,8 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx(lv_32fc_t** resu
const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips); const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips);
const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips); const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips);
__VOLK_ATTR_ALIGNED(32) int local_code_chip_index[8]; __VOLK_ATTR_ALIGNED(32)
int local_code_chip_index[8];
int local_code_chip_index_; int local_code_chip_index_;
const __m256 zeros = _mm256_setzero_ps(); const __m256 zeros = _mm256_setzero_ps();
@ -469,7 +475,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx(lv_32fc_t** resu
shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]); shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]);
aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
indexn = n0; indexn = n0;
for(n = 0; n < avx_iters; n++) for (n = 0; n < avx_iters; n++)
{ {
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0); __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0);
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3); __VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3);
@ -487,13 +493,13 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx(lv_32fc_t** resu
// no negatives // no negatives
c = _mm256_cvtepi32_ps(local_code_chip_index_reg); c = _mm256_cvtepi32_ps(local_code_chip_index_reg);
negatives = _mm256_cmp_ps(c, zeros, 0x01 ); negatives = _mm256_cmp_ps(c, zeros, 0x01);
aux3 = _mm256_and_ps(code_length_chips_reg_f, negatives); aux3 = _mm256_and_ps(code_length_chips_reg_f, negatives);
aux = _mm256_add_ps(c, aux3); aux = _mm256_add_ps(c, aux3);
local_code_chip_index_reg = _mm256_cvttps_epi32(aux); local_code_chip_index_reg = _mm256_cvttps_epi32(aux);
_mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg); _mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg);
for(k = 0; k < 8; ++k) for (k = 0; k < 8; ++k)
{ {
_result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]]; _result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]];
} }
@ -503,12 +509,12 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx(lv_32fc_t** resu
_mm256_zeroupper(); _mm256_zeroupper();
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++) for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
{ {
for(n = avx_iters * 8; n < num_points; n++) for (n = avx_iters * 8; n < num_points; n++)
{ {
// resample code for current tap // resample code for current tap
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
//Take into account that in multitap correlators, the shifts can be negative! //Take into account that in multitap correlators, the shifts can be negative!
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ; if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
local_code_chip_index_ = local_code_chip_index_ % code_length_chips; local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
_result[current_correlator_tap][n] = local_code[local_code_chip_index_]; _result[current_correlator_tap][n] = local_code[local_code_chip_index_];
} }
@ -531,7 +537,8 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx2(lv_32fc_t** res
const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips); const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips);
const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips); const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips);
__VOLK_ATTR_ALIGNED(32) int local_code_chip_index[8]; __VOLK_ATTR_ALIGNED(32)
int local_code_chip_index[8];
int local_code_chip_index_; int local_code_chip_index_;
const __m256 zeros = _mm256_setzero_ps(); const __m256 zeros = _mm256_setzero_ps();
@ -546,7 +553,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx2(lv_32fc_t** res
shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]); shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]);
aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
indexn = n0; indexn = n0;
for(n = 0; n < avx_iters; n++) for (n = 0; n < avx_iters; n++)
{ {
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0); __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0);
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3); __VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3);
@ -565,13 +572,13 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx2(lv_32fc_t** res
// no negatives // no negatives
c = _mm256_cvtepi32_ps(local_code_chip_index_reg); c = _mm256_cvtepi32_ps(local_code_chip_index_reg);
negatives = _mm256_cmp_ps(c, zeros, 0x01 ); negatives = _mm256_cmp_ps(c, zeros, 0x01);
aux3 = _mm256_and_ps(code_length_chips_reg_f, negatives); aux3 = _mm256_and_ps(code_length_chips_reg_f, negatives);
aux = _mm256_add_ps(c, aux3); aux = _mm256_add_ps(c, aux3);
local_code_chip_index_reg = _mm256_cvttps_epi32(aux); local_code_chip_index_reg = _mm256_cvttps_epi32(aux);
_mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg); _mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg);
for(k = 0; k < 8; ++k) for (k = 0; k < 8; ++k)
{ {
_result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]]; _result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]];
} }
@ -581,12 +588,12 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx2(lv_32fc_t** res
_mm256_zeroupper(); _mm256_zeroupper();
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++) for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
{ {
for(n = avx_iters * 8; n < num_points; n++) for (n = avx_iters * 8; n < num_points; n++)
{ {
// resample code for current tap // resample code for current tap
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
//Take into account that in multitap correlators, the shifts can be negative! //Take into account that in multitap correlators, the shifts can be negative!
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ; if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
local_code_chip_index_ = local_code_chip_index_ % code_length_chips; local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
_result[current_correlator_tap][n] = local_code[local_code_chip_index_]; _result[current_correlator_tap][n] = local_code[local_code_chip_index_];
} }
@ -609,7 +616,8 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx2(lv_32fc_t** res
const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips); const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips);
const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips); const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips);
__VOLK_ATTR_ALIGNED(32) int local_code_chip_index[8]; __VOLK_ATTR_ALIGNED(32)
int local_code_chip_index[8];
int local_code_chip_index_; int local_code_chip_index_;
const __m256 zeros = _mm256_setzero_ps(); const __m256 zeros = _mm256_setzero_ps();
@ -624,7 +632,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx2(lv_32fc_t** res
shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]); shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]);
aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg); aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
indexn = n0; indexn = n0;
for(n = 0; n < avx_iters; n++) for (n = 0; n < avx_iters; n++)
{ {
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0); __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0);
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3); __VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3);
@ -643,13 +651,13 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx2(lv_32fc_t** res
// no negatives // no negatives
c = _mm256_cvtepi32_ps(local_code_chip_index_reg); c = _mm256_cvtepi32_ps(local_code_chip_index_reg);
negatives = _mm256_cmp_ps(c, zeros, 0x01 ); negatives = _mm256_cmp_ps(c, zeros, 0x01);
aux3 = _mm256_and_ps(code_length_chips_reg_f, negatives); aux3 = _mm256_and_ps(code_length_chips_reg_f, negatives);
aux = _mm256_add_ps(c, aux3); aux = _mm256_add_ps(c, aux3);
local_code_chip_index_reg = _mm256_cvttps_epi32(aux); local_code_chip_index_reg = _mm256_cvttps_epi32(aux);
_mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg); _mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg);
for(k = 0; k < 8; ++k) for (k = 0; k < 8; ++k)
{ {
_result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]]; _result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]];
} }
@ -659,12 +667,12 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx2(lv_32fc_t** res
_mm256_zeroupper(); _mm256_zeroupper();
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++) for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
{ {
for(n = avx_iters * 8; n < num_points; n++) for (n = avx_iters * 8; n < num_points; n++)
{ {
// resample code for current tap // resample code for current tap
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips); local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
//Take into account that in multitap correlators, the shifts can be negative! //Take into account that in multitap correlators, the shifts can be negative!
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ; if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
local_code_chip_index_ = local_code_chip_index_ % code_length_chips; local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
_result[current_correlator_tap][n] = local_code[local_code_chip_index_]; _result[current_correlator_tap][n] = local_code[local_code_chip_index_];
} }
@ -689,19 +697,21 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_neon(lv_32fc_t** resul
const float32x4_t rem_code_phase_chips_reg = vdupq_n_f32(rem_code_phase_chips); const float32x4_t rem_code_phase_chips_reg = vdupq_n_f32(rem_code_phase_chips);
const float32x4_t code_phase_step_chips_reg = vdupq_n_f32(code_phase_step_chips); const float32x4_t code_phase_step_chips_reg = vdupq_n_f32(code_phase_step_chips);
__VOLK_ATTR_ALIGNED(16) int32_t local_code_chip_index[4]; __VOLK_ATTR_ALIGNED(16)
int32_t local_code_chip_index[4];
int32_t local_code_chip_index_; int32_t local_code_chip_index_;
const int32x4_t zeros = vdupq_n_s32(0); const int32x4_t zeros = vdupq_n_s32(0);
const float32x4_t code_length_chips_reg_f = vdupq_n_f32((float)code_length_chips); const float32x4_t code_length_chips_reg_f = vdupq_n_f32((float)code_length_chips);
const int32x4_t code_length_chips_reg_i = vdupq_n_s32((int32_t)code_length_chips); const int32x4_t code_length_chips_reg_i = vdupq_n_s32((int32_t)code_length_chips);
int32x4_t local_code_chip_index_reg, aux_i, negatives, i; int32x4_t local_code_chip_index_reg, aux_i, negatives, i;
float32x4_t aux, aux2, shifts_chips_reg, fi, c, j, cTrunc, base, indexn, reciprocal; float32x4_t aux, aux2, shifts_chips_reg, fi, c, j, cTrunc, base, indexn, reciprocal;
__VOLK_ATTR_ALIGNED(16) const float vec[4] = { 0.0f, 1.0f, 2.0f, 3.0f }; __VOLK_ATTR_ALIGNED(16)
const float vec[4] = {0.0f, 1.0f, 2.0f, 3.0f};
uint32x4_t igx; uint32x4_t igx;
reciprocal = vrecpeq_f32(code_length_chips_reg_f); reciprocal = vrecpeq_f32(code_length_chips_reg_f);
reciprocal = vmulq_f32(vrecpsq_f32(code_length_chips_reg_f, reciprocal), reciprocal); reciprocal = vmulq_f32(vrecpsq_f32(code_length_chips_reg_f, reciprocal), reciprocal);
reciprocal = vmulq_f32(vrecpsq_f32(code_length_chips_reg_f, reciprocal), reciprocal); // this refinement is required! reciprocal = vmulq_f32(vrecpsq_f32(code_length_chips_reg_f, reciprocal), reciprocal); // this refinement is required!
float32x4_t n0 = vld1q_f32((float*)vec); float32x4_t n0 = vld1q_f32((float*)vec);
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++) for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
@ -709,7 +719,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_neon(lv_32fc_t** resul
shifts_chips_reg = vdupq_n_f32((float)shifts_chips[current_correlator_tap]); shifts_chips_reg = vdupq_n_f32((float)shifts_chips[current_correlator_tap]);
aux2 = vsubq_f32(shifts_chips_reg, rem_code_phase_chips_reg); aux2 = vsubq_f32(shifts_chips_reg, rem_code_phase_chips_reg);
indexn = n0; indexn = n0;
for(n = 0; n < neon_iters; n++) for (n = 0; n < neon_iters; n++)
{ {
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][4 * n + 3], 1, 0); __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][4 * n + 3], 1, 0);
__VOLK_GNSSSDR_PREFETCH(&local_code_chip_index[4]); __VOLK_GNSSSDR_PREFETCH(&local_code_chip_index[4]);
@ -725,7 +735,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_neon(lv_32fc_t** resul
// fmod // fmod
c = vmulq_f32(aux, reciprocal); c = vmulq_f32(aux, reciprocal);
i = vcvtq_s32_f32(c); i = vcvtq_s32_f32(c);
cTrunc = vcvtq_f32_s32(i); cTrunc = vcvtq_f32_s32(i);
base = vmulq_f32(cTrunc, code_length_chips_reg_f); base = vmulq_f32(cTrunc, code_length_chips_reg_f);
aux = vsubq_f32(aux, base); aux = vsubq_f32(aux, base);
@ -737,13 +747,13 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_neon(lv_32fc_t** resul
vst1q_s32((int32_t*)local_code_chip_index, local_code_chip_index_reg); vst1q_s32((int32_t*)local_code_chip_index, local_code_chip_index_reg);
for(k = 0; k < 4; ++k) for (k = 0; k < 4; ++k)
{ {
_result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]]; _result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
} }
indexn = vaddq_f32(indexn, fours); indexn = vaddq_f32(indexn, fours);
} }
for(n = neon_iters * 4; n < num_points; n++) for (n = neon_iters * 4; n < num_points; n++)
{ {
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][n], 1, 0); __VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][n], 1, 0);
// resample code for current tap // resample code for current tap
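The NEON path has no vector division, so fmod against the code length is computed as x - trunc(x * (1/L)) * L, where 1/L starts from the vrecpeq_f32 estimate and is refined with the vrecpsq_f32 Newton-Raphson steps shown above ("this refinement is required!"). A scalar model of the trick, assuming the reciprocal is already refined (fmod_by_reciprocal is an illustrative name):

/* Scalar model of the NEON fmod trick: x mod L computed as
 * x - trunc(x * recip_L) * L, with recip_L ~= 1/L. */
static float fmod_by_reciprocal(float x, float L, float recip_L)
{
    const float q = x * recip_L;         /* approximate x / L */
    const float q_trunc = (float)(int)q; /* truncate toward zero, as vcvtq_s32_f32 does */
    return x - q_trunc * L;              /* remainder, sign follows x */
}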
View File
@ -69,11 +69,12 @@ static inline void volk_gnsssdr_64f_accumulator_64f_u_avx(double* result, const
unsigned int i; unsigned int i;
const double* aPtr = inputBuffer; const double* aPtr = inputBuffer;
__VOLK_ATTR_ALIGNED(32) double tempBuffer[4]; __VOLK_ATTR_ALIGNED(32)
double tempBuffer[4];
__m256d accumulator = _mm256_setzero_pd(); __m256d accumulator = _mm256_setzero_pd();
__m256d aVal = _mm256_setzero_pd(); __m256d aVal = _mm256_setzero_pd();
for(number = 0; number < sse_iters; number++) for (number = 0; number < sse_iters; number++)
{ {
aVal = _mm256_loadu_pd(aPtr); aVal = _mm256_loadu_pd(aPtr);
accumulator = _mm256_add_pd(accumulator, aVal); accumulator = _mm256_add_pd(accumulator, aVal);
@ -82,12 +83,12 @@ static inline void volk_gnsssdr_64f_accumulator_64f_u_avx(double* result, const
_mm256_storeu_pd((double*)tempBuffer, accumulator); _mm256_storeu_pd((double*)tempBuffer, accumulator);
for(i = 0; i < 4; ++i) for (i = 0; i < 4; ++i)
{ {
returnValue += tempBuffer[i]; returnValue += tempBuffer[i];
} }
for(i = 0; i < (num_points % 4); ++i) for (i = 0; i < (num_points % 4); ++i)
{ {
returnValue += (*aPtr++); returnValue += (*aPtr++);
} }
@ -100,7 +101,7 @@ static inline void volk_gnsssdr_64f_accumulator_64f_u_avx(double* result, const
#ifdef LV_HAVE_SSE3 #ifdef LV_HAVE_SSE3
#include <pmmintrin.h> #include <pmmintrin.h>
static inline void volk_gnsssdr_64f_accumulator_64f_u_sse3(double* result,const double* inputBuffer, unsigned int num_points) static inline void volk_gnsssdr_64f_accumulator_64f_u_sse3(double* result, const double* inputBuffer, unsigned int num_points)
{ {
double returnValue = 0; double returnValue = 0;
const unsigned int sse_iters = num_points / 2; const unsigned int sse_iters = num_points / 2;
@ -108,11 +109,12 @@ static inline void volk_gnsssdr_64f_accumulator_64f_u_sse3(double* result,const
unsigned int i; unsigned int i;
const double* aPtr = inputBuffer; const double* aPtr = inputBuffer;
__VOLK_ATTR_ALIGNED(16) double tempBuffer[2]; __VOLK_ATTR_ALIGNED(16)
double tempBuffer[2];
__m128d accumulator = _mm_setzero_pd(); __m128d accumulator = _mm_setzero_pd();
__m128d aVal = _mm_setzero_pd(); __m128d aVal = _mm_setzero_pd();
for(number = 0; number < sse_iters; number++) for (number = 0; number < sse_iters; number++)
{ {
aVal = _mm_loadu_pd(aPtr); aVal = _mm_loadu_pd(aPtr);
accumulator = _mm_add_pd(accumulator, aVal); accumulator = _mm_add_pd(accumulator, aVal);
@ -121,12 +123,12 @@ static inline void volk_gnsssdr_64f_accumulator_64f_u_sse3(double* result,const
_mm_storeu_pd((double*)tempBuffer, accumulator); _mm_storeu_pd((double*)tempBuffer, accumulator);
for(i = 0; i < 2; ++i) for (i = 0; i < 2; ++i)
{ {
returnValue += tempBuffer[i]; returnValue += tempBuffer[i];
} }
for(i = 0; i < (num_points % 2); ++i) for (i = 0; i < (num_points % 2); ++i)
{ {
returnValue += (*aPtr++); returnValue += (*aPtr++);
} }
@ -138,13 +140,13 @@ static inline void volk_gnsssdr_64f_accumulator_64f_u_sse3(double* result,const
#ifdef LV_HAVE_GENERIC #ifdef LV_HAVE_GENERIC
static inline void volk_gnsssdr_64f_accumulator_64f_generic(double* result,const double* inputBuffer, unsigned int num_points) static inline void volk_gnsssdr_64f_accumulator_64f_generic(double* result, const double* inputBuffer, unsigned int num_points)
{ {
const double* aPtr = inputBuffer; const double* aPtr = inputBuffer;
double returnValue = 0; double returnValue = 0;
unsigned int number; unsigned int number;
for(number = 0; number < num_points; number++) for (number = 0; number < num_points; number++)
{ {
returnValue += (*aPtr++); returnValue += (*aPtr++);
} }
@ -156,7 +158,7 @@ static inline void volk_gnsssdr_64f_accumulator_64f_generic(double* result,const
#ifdef LV_HAVE_AVX #ifdef LV_HAVE_AVX
#include <immintrin.h> #include <immintrin.h>
static inline void volk_gnsssdr_64f_accumulator_64f_a_avx(double* result,const double* inputBuffer, unsigned int num_points) static inline void volk_gnsssdr_64f_accumulator_64f_a_avx(double* result, const double* inputBuffer, unsigned int num_points)
{ {
double returnValue = 0; double returnValue = 0;
const unsigned int sse_iters = num_points / 4; const unsigned int sse_iters = num_points / 4;
@ -164,11 +166,12 @@ static inline void volk_gnsssdr_64f_accumulator_64f_a_avx(double* result,const d
unsigned int i; unsigned int i;
const double* aPtr = inputBuffer; const double* aPtr = inputBuffer;
__VOLK_ATTR_ALIGNED(32) double tempBuffer[4]; __VOLK_ATTR_ALIGNED(32)
double tempBuffer[4];
__m256d accumulator = _mm256_setzero_pd(); __m256d accumulator = _mm256_setzero_pd();
__m256d aVal = _mm256_setzero_pd(); __m256d aVal = _mm256_setzero_pd();
for(number = 0; number < sse_iters; number++) for (number = 0; number < sse_iters; number++)
{ {
aVal = _mm256_load_pd(aPtr); aVal = _mm256_load_pd(aPtr);
accumulator = _mm256_add_pd(accumulator, aVal); accumulator = _mm256_add_pd(accumulator, aVal);
@ -177,12 +180,12 @@ static inline void volk_gnsssdr_64f_accumulator_64f_a_avx(double* result,const d
_mm256_store_pd((double*)tempBuffer, accumulator); _mm256_store_pd((double*)tempBuffer, accumulator);
for(i = 0; i < 4; ++i) for (i = 0; i < 4; ++i)
{ {
returnValue += tempBuffer[i]; returnValue += tempBuffer[i];
} }
for(i = 0; i < (num_points % 4); ++i) for (i = 0; i < (num_points % 4); ++i)
{ {
returnValue += (*aPtr++); returnValue += (*aPtr++);
} }
@ -195,7 +198,7 @@ static inline void volk_gnsssdr_64f_accumulator_64f_a_avx(double* result,const d
#ifdef LV_HAVE_SSE3 #ifdef LV_HAVE_SSE3
#include <pmmintrin.h> #include <pmmintrin.h>
static inline void volk_gnsssdr_64f_accumulator_64f_a_sse3(double* result,const double* inputBuffer, unsigned int num_points) static inline void volk_gnsssdr_64f_accumulator_64f_a_sse3(double* result, const double* inputBuffer, unsigned int num_points)
{ {
double returnValue = 0; double returnValue = 0;
const unsigned int sse_iters = num_points / 2; const unsigned int sse_iters = num_points / 2;
@ -203,11 +206,12 @@ static inline void volk_gnsssdr_64f_accumulator_64f_a_sse3(double* result,const
unsigned int i; unsigned int i;
const double* aPtr = inputBuffer; const double* aPtr = inputBuffer;
__VOLK_ATTR_ALIGNED(16) double tempBuffer[2]; __VOLK_ATTR_ALIGNED(16)
double tempBuffer[2];
__m128d accumulator = _mm_setzero_pd(); __m128d accumulator = _mm_setzero_pd();
__m128d aVal = _mm_setzero_pd(); __m128d aVal = _mm_setzero_pd();
for(number = 0; number < sse_iters; number++) for (number = 0; number < sse_iters; number++)
{ {
aVal = _mm_load_pd(aPtr); aVal = _mm_load_pd(aPtr);
accumulator = _mm_add_pd(accumulator, aVal); accumulator = _mm_add_pd(accumulator, aVal);
@ -216,12 +220,12 @@ static inline void volk_gnsssdr_64f_accumulator_64f_a_sse3(double* result,const
_mm_store_pd((double*)tempBuffer, accumulator); _mm_store_pd((double*)tempBuffer, accumulator);
for(i = 0; i < 2; ++i) for (i = 0; i < 2; ++i)
{ {
returnValue += tempBuffer[i]; returnValue += tempBuffer[i];
} }
for(i = 0; i < (num_points % 2); ++i) for (i = 0; i < (num_points % 2); ++i)
{ {
returnValue += (*aPtr++); returnValue += (*aPtr++);
} }
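All accumulator kernels in this file share one shape: add whole vectors into a SIMD register, spill the register to an aligned temporary for the horizontal sum, then finish the num_points % width remainder in scalar code. A width-agnostic sketch of that shape, with plain C standing in for the intrinsics (accumulate_sketch is illustrative only):

/* Generic shape of the accumulator kernels: vector accumulation,
 * horizontal reduction through a temporary buffer, scalar tail.
 * 'width' is the SIMD lane count (2 for SSE3 doubles, 4 for AVX). */
static double accumulate_sketch(const double* in, unsigned int num_points,
    unsigned int width)
{
    double lanes[8] = {0}; /* stands in for the SIMD accumulator register */
    double total = 0;
    unsigned int n, k;
    for (n = 0; n + width <= num_points; n += width)
        {
            for (k = 0; k < width; k++) lanes[k] += in[n + k]; /* vector add */
        }
    for (k = 0; k < width; k++) total += lanes[k]; /* horizontal sum */
    for (n = num_points - (num_points % width); n < num_points; n++)
        {
            total += in[n]; /* scalar tail */
        }
    return total;
}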
View File
@ -70,11 +70,12 @@ static inline void volk_gnsssdr_8i_accumulator_s8i_u_sse3(char* result, const ch
unsigned int i; unsigned int i;
const char* aPtr = inputBuffer; const char* aPtr = inputBuffer;
__VOLK_ATTR_ALIGNED(16) char tempBuffer[16]; __VOLK_ATTR_ALIGNED(16)
char tempBuffer[16];
__m128i accumulator = _mm_setzero_si128(); __m128i accumulator = _mm_setzero_si128();
__m128i aVal = _mm_setzero_si128(); __m128i aVal = _mm_setzero_si128();
for(number = 0; number < sse_iters; number++) for (number = 0; number < sse_iters; number++)
{ {
aVal = _mm_lddqu_si128((__m128i*)aPtr); aVal = _mm_lddqu_si128((__m128i*)aPtr);
accumulator = _mm_add_epi8(accumulator, aVal); accumulator = _mm_add_epi8(accumulator, aVal);
@ -82,12 +83,12 @@ static inline void volk_gnsssdr_8i_accumulator_s8i_u_sse3(char* result, const ch
} }
_mm_storeu_si128((__m128i*)tempBuffer, accumulator); _mm_storeu_si128((__m128i*)tempBuffer, accumulator);
for(i = 0; i < 16; ++i) for (i = 0; i < 16; ++i)
{ {
returnValue += tempBuffer[i]; returnValue += tempBuffer[i];
} }
for(i = 0; i < (num_points % 16); ++i) for (i = 0; i < (num_points % 16); ++i)
{ {
returnValue += (*aPtr++); returnValue += (*aPtr++);
} }
@ -104,7 +105,7 @@ static inline void volk_gnsssdr_8i_accumulator_s8i_generic(char* result, const c
const char* aPtr = inputBuffer; const char* aPtr = inputBuffer;
char returnValue = 0; char returnValue = 0;
unsigned int number; unsigned int number;
for(number = 0;number < num_points; number++) for (number = 0; number < num_points; number++)
{ {
returnValue += (*aPtr++); returnValue += (*aPtr++);
} }
@ -125,24 +126,25 @@ static inline void volk_gnsssdr_8i_accumulator_s8i_a_sse3(char* result, const ch
const char* aPtr = inputBuffer; const char* aPtr = inputBuffer;
__VOLK_ATTR_ALIGNED(16) char tempBuffer[16]; __VOLK_ATTR_ALIGNED(16)
char tempBuffer[16];
__m128i accumulator = _mm_setzero_si128(); __m128i accumulator = _mm_setzero_si128();
__m128i aVal = _mm_setzero_si128(); __m128i aVal = _mm_setzero_si128();
for(number = 0; number < sse_iters; number++) for (number = 0; number < sse_iters; number++)
{ {
aVal = _mm_load_si128((__m128i*)aPtr); aVal = _mm_load_si128((__m128i*)aPtr);
accumulator = _mm_add_epi8(accumulator, aVal); accumulator = _mm_add_epi8(accumulator, aVal);
aPtr += 16; aPtr += 16;
} }
_mm_store_si128((__m128i*)tempBuffer,accumulator); _mm_store_si128((__m128i*)tempBuffer, accumulator);
for(i = 0; i < 16; ++i) for (i = 0; i < 16; ++i)
{ {
returnValue += tempBuffer[i]; returnValue += tempBuffer[i];
} }
for(i = 0; i < (num_points % 16); ++i) for (i = 0; i < (num_points % 16); ++i)
{ {
returnValue += (*aPtr++); returnValue += (*aPtr++);
} }
@ -164,24 +166,25 @@ static inline void volk_gnsssdr_8i_accumulator_s8i_a_avx2(char* result, const ch
const char* aPtr = inputBuffer; const char* aPtr = inputBuffer;
__VOLK_ATTR_ALIGNED(32) char tempBuffer[32]; __VOLK_ATTR_ALIGNED(32)
char tempBuffer[32];
__m256i accumulator = _mm256_setzero_si256(); __m256i accumulator = _mm256_setzero_si256();
__m256i aVal = _mm256_setzero_si256(); __m256i aVal = _mm256_setzero_si256();
for(number = 0; number < sse_iters; number++) for (number = 0; number < sse_iters; number++)
{ {
aVal = _mm256_load_si256((__m256i*)aPtr); aVal = _mm256_load_si256((__m256i*)aPtr);
accumulator = _mm256_add_epi8(accumulator, aVal); accumulator = _mm256_add_epi8(accumulator, aVal);
aPtr += 32; aPtr += 32;
} }
_mm256_store_si256((__m256i*)tempBuffer,accumulator); _mm256_store_si256((__m256i*)tempBuffer, accumulator);
for(i = 0; i < 32; ++i) for (i = 0; i < 32; ++i)
{ {
returnValue += tempBuffer[i]; returnValue += tempBuffer[i];
} }
for(i = 0; i < (num_points % 32); ++i) for (i = 0; i < (num_points % 32); ++i)
{ {
returnValue += (*aPtr++); returnValue += (*aPtr++);
} }
@ -202,11 +205,12 @@ static inline void volk_gnsssdr_8i_accumulator_s8i_u_avx2(char* result, const ch
unsigned int i; unsigned int i;
const char* aPtr = inputBuffer; const char* aPtr = inputBuffer;
__VOLK_ATTR_ALIGNED(32) char tempBuffer[32]; __VOLK_ATTR_ALIGNED(32)
char tempBuffer[32];
__m256i accumulator = _mm256_setzero_si256(); __m256i accumulator = _mm256_setzero_si256();
__m256i aVal = _mm256_setzero_si256(); __m256i aVal = _mm256_setzero_si256();
for(number = 0; number < sse_iters; number++) for (number = 0; number < sse_iters; number++)
{ {
aVal = _mm256_lddqu_si256((__m256i*)aPtr); aVal = _mm256_lddqu_si256((__m256i*)aPtr);
accumulator = _mm256_add_epi8(accumulator, aVal); accumulator = _mm256_add_epi8(accumulator, aVal);
@ -214,12 +218,12 @@ static inline void volk_gnsssdr_8i_accumulator_s8i_u_avx2(char* result, const ch
} }
_mm256_storeu_si256((__m256i*)tempBuffer, accumulator); _mm256_storeu_si256((__m256i*)tempBuffer, accumulator);
for(i = 0; i < 32; ++i) for (i = 0; i < 32; ++i)
{ {
returnValue += tempBuffer[i]; returnValue += tempBuffer[i];
} }
for(i = 0; i < (num_points % 32); ++i) for (i = 0; i < (num_points % 32); ++i)
{ {
returnValue += (*aPtr++); returnValue += (*aPtr++);
} }
View File
@ -60,11 +60,11 @@
#ifdef LV_HAVE_AVX2 #ifdef LV_HAVE_AVX2
#include<immintrin.h> #include <immintrin.h>
static inline void volk_gnsssdr_8i_index_max_16u_u_avx2(unsigned int* target, const char* src0, unsigned int num_points) static inline void volk_gnsssdr_8i_index_max_16u_u_avx2(unsigned int* target, const char* src0, unsigned int num_points)
{ {
if(num_points > 0) if (num_points > 0)
{ {
const unsigned int avx2_iters = num_points / 32; const unsigned int avx2_iters = num_points / 32;
unsigned int number; unsigned int number;
@ -74,14 +74,15 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_avx2(unsigned int* target, co
char max = src0[0]; char max = src0[0];
unsigned int index = 0; unsigned int index = 0;
unsigned int mask; unsigned int mask;
__VOLK_ATTR_ALIGNED(32) char currentValuesBuffer[32]; __VOLK_ATTR_ALIGNED(32)
char currentValuesBuffer[32];
__m256i maxValues, compareResults, currentValues; __m256i maxValues, compareResults, currentValues;
maxValues = _mm256_set1_epi8(max); maxValues = _mm256_set1_epi8(max);
for(number = 0; number < avx2_iters; number++) for (number = 0; number < avx2_iters; number++)
{ {
currentValues = _mm256_loadu_si256((__m256i*)inputPtr); currentValues = _mm256_loadu_si256((__m256i*)inputPtr);
compareResults = _mm256_cmpgt_epi8(maxValues, currentValues); compareResults = _mm256_cmpgt_epi8(maxValues, currentValues);
mask = _mm256_movemask_epi8(compareResults); mask = _mm256_movemask_epi8(compareResults);
@ -94,7 +95,7 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_avx2(unsigned int* target, co
{ {
if ((mask & 1) == 1) if ((mask & 1) == 1)
{ {
if(currentValuesBuffer[i] > max) if (currentValuesBuffer[i] > max)
{ {
index = inputPtr - basePtr + i; index = inputPtr - basePtr + i;
max = currentValuesBuffer[i]; max = currentValuesBuffer[i];
@ -108,9 +109,9 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_avx2(unsigned int* target, co
inputPtr += 32; inputPtr += 32;
} }
for(i = 0; i<(num_points % 32); ++i) for (i = 0; i < (num_points % 32); ++i)
{ {
if(src0[i] > max) if (src0[i] > max)
{ {
index = i; index = i;
max = src0[i]; max = src0[i];
@ -128,7 +129,7 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_avx2(unsigned int* target, co
static inline void volk_gnsssdr_8i_index_max_16u_u_avx(unsigned int* target, const char* src0, unsigned int num_points) static inline void volk_gnsssdr_8i_index_max_16u_u_avx(unsigned int* target, const char* src0, unsigned int num_points)
{ {
if(num_points > 0) if (num_points > 0)
{ {
const unsigned int sse_iters = num_points / 32; const unsigned int sse_iters = num_points / 32;
unsigned int number; unsigned int number;
@ -137,33 +138,34 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_avx(unsigned int* target, con
char* inputPtr = (char*)src0; char* inputPtr = (char*)src0;
char max = src0[0]; char max = src0[0];
unsigned int index = 0; unsigned int index = 0;
__VOLK_ATTR_ALIGNED(32) char currentValuesBuffer[32]; __VOLK_ATTR_ALIGNED(32)
char currentValuesBuffer[32];
__m256i ones, compareResults, currentValues; __m256i ones, compareResults, currentValues;
__m128i compareResultslo, compareResultshi, maxValues, lo, hi; __m128i compareResultslo, compareResultshi, maxValues, lo, hi;
ones = _mm256_set1_epi8(0xFF); ones = _mm256_set1_epi8(0xFF);
maxValues = _mm_set1_epi8(max); maxValues = _mm_set1_epi8(max);
for(number = 0; number < sse_iters; number++) for (number = 0; number < sse_iters; number++)
{ {
currentValues = _mm256_lddqu_si256((__m256i*)inputPtr); currentValues = _mm256_lddqu_si256((__m256i*)inputPtr);
lo = _mm256_castsi256_si128(currentValues); lo = _mm256_castsi256_si128(currentValues);
hi = _mm256_extractf128_si256(currentValues,1); hi = _mm256_extractf128_si256(currentValues, 1);
compareResultslo = _mm_cmpgt_epi8(maxValues, lo); compareResultslo = _mm_cmpgt_epi8(maxValues, lo);
compareResultshi = _mm_cmpgt_epi8(maxValues, hi); compareResultshi = _mm_cmpgt_epi8(maxValues, hi);
//compareResults = _mm256_set_m128i(compareResultshi , compareResultslo); //not defined in some versions of immintrin.h //compareResults = _mm256_set_m128i(compareResultshi , compareResultslo); //not defined in some versions of immintrin.h
compareResults = _mm256_insertf128_si256(_mm256_castsi128_si256(compareResultslo),(compareResultshi),1); compareResults = _mm256_insertf128_si256(_mm256_castsi128_si256(compareResultslo), (compareResultshi), 1);
if (!_mm256_testc_si256(compareResults, ones)) if (!_mm256_testc_si256(compareResults, ones))
{ {
_mm256_storeu_si256((__m256i*)&currentValuesBuffer, currentValues); _mm256_storeu_si256((__m256i*)&currentValuesBuffer, currentValues);
for(i = 0; i < 32; i++) for (i = 0; i < 32; i++)
{ {
if(currentValuesBuffer[i] > max) if (currentValuesBuffer[i] > max)
{ {
index = inputPtr - basePtr + i; index = inputPtr - basePtr + i;
max = currentValuesBuffer[i]; max = currentValuesBuffer[i];
@ -175,9 +177,9 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_avx(unsigned int* target, con
inputPtr += 32; inputPtr += 32;
} }
for(i = 0; i<(num_points % 32); ++i) for (i = 0; i < (num_points % 32); ++i)
{ {
if(src0[i] > max) if (src0[i] > max)
{ {
index = i; index = i;
max = src0[i]; max = src0[i];
@ -195,7 +197,7 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_avx(unsigned int* target, con
static inline void volk_gnsssdr_8i_index_max_16u_u_sse4_1(unsigned int* target, const char* src0, unsigned int num_points) static inline void volk_gnsssdr_8i_index_max_16u_u_sse4_1(unsigned int* target, const char* src0, unsigned int num_points)
{ {
if(num_points > 0) if (num_points > 0)
{ {
const unsigned int sse_iters = num_points / 16; const unsigned int sse_iters = num_points / 16;
unsigned int number; unsigned int number;
@ -204,14 +206,15 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_sse4_1(unsigned int* target,
char* inputPtr = (char*)src0; char* inputPtr = (char*)src0;
char max = src0[0]; char max = src0[0];
unsigned int index = 0; unsigned int index = 0;
__VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16]; __VOLK_ATTR_ALIGNED(16)
char currentValuesBuffer[16];
__m128i maxValues, compareResults, currentValues; __m128i maxValues, compareResults, currentValues;
maxValues = _mm_set1_epi8(max); maxValues = _mm_set1_epi8(max);
for(number = 0; number < sse_iters; number++) for (number = 0; number < sse_iters; number++)
{ {
currentValues = _mm_lddqu_si128((__m128i*)inputPtr); currentValues = _mm_lddqu_si128((__m128i*)inputPtr);
compareResults = _mm_cmpgt_epi8(maxValues, currentValues); compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
@ -219,9 +222,9 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_sse4_1(unsigned int* target,
{ {
_mm_storeu_si128((__m128i*)&currentValuesBuffer, currentValues); _mm_storeu_si128((__m128i*)&currentValuesBuffer, currentValues);
for(i = 0; i < 16; i++) for (i = 0; i < 16; i++)
{ {
if(currentValuesBuffer[i] > max) if (currentValuesBuffer[i] > max)
{ {
index = inputPtr - basePtr + i; index = inputPtr - basePtr + i;
max = currentValuesBuffer[i]; max = currentValuesBuffer[i];
@ -233,9 +236,9 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_sse4_1(unsigned int* target,
inputPtr += 16; inputPtr += 16;
} }
for(i = 0; i<(num_points % 16); ++i) for (i = 0; i < (num_points % 16); ++i)
{ {
if(src0[i] > max) if (src0[i] > max)
{ {
index = i; index = i;
max = src0[i]; max = src0[i];
@ -249,11 +252,11 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_sse4_1(unsigned int* target,
#ifdef LV_HAVE_SSE2 #ifdef LV_HAVE_SSE2
#include<emmintrin.h> #include <emmintrin.h>
static inline void volk_gnsssdr_8i_index_max_16u_u_sse2(unsigned int* target, const char* src0, unsigned int num_points) static inline void volk_gnsssdr_8i_index_max_16u_u_sse2(unsigned int* target, const char* src0, unsigned int num_points)
{ {
if(num_points > 0) if (num_points > 0)
{ {
const unsigned int sse_iters = num_points / 16; const unsigned int sse_iters = num_points / 16;
unsigned int number; unsigned int number;
@ -263,14 +266,15 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_sse2(unsigned int* target, co
char max = src0[0]; char max = src0[0];
unsigned int index = 0; unsigned int index = 0;
unsigned short mask; unsigned short mask;
__VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16]; __VOLK_ATTR_ALIGNED(16)
char currentValuesBuffer[16];
__m128i maxValues, compareResults, currentValues; __m128i maxValues, compareResults, currentValues;
maxValues = _mm_set1_epi8(max); maxValues = _mm_set1_epi8(max);
for(number = 0; number < sse_iters; number++) for (number = 0; number < sse_iters; number++)
{ {
currentValues = _mm_loadu_si128((__m128i*)inputPtr); currentValues = _mm_loadu_si128((__m128i*)inputPtr);
compareResults = _mm_cmpgt_epi8(maxValues, currentValues); compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
mask = _mm_movemask_epi8(compareResults); mask = _mm_movemask_epi8(compareResults);
@ -283,7 +287,7 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_sse2(unsigned int* target, co
{ {
if ((mask & 1) == 1) if ((mask & 1) == 1)
{ {
if(currentValuesBuffer[i] > max) if (currentValuesBuffer[i] > max)
{ {
index = inputPtr - basePtr + i; index = inputPtr - basePtr + i;
                                            max = currentValuesBuffer[i];
@@ -297,9 +301,9 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_sse2(unsigned int* target, co
                    inputPtr += 16;
                }
            for (i = 0; i < (num_points % 16); ++i)
                {
                    if (src0[i] > max)
                        {
                            index = i;
                            max = src0[i];
@@ -316,14 +320,14 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_sse2(unsigned int* target, co
static inline void volk_gnsssdr_8i_index_max_16u_generic(unsigned int* target, const char* src0, unsigned int num_points)
{
    if (num_points > 0)
        {
            char max = src0[0];
            unsigned int index = 0;
            unsigned int i;
            for (i = 1; i < num_points; ++i)
                {
                    if (src0[i] > max)
                        {
                            index = i;
                            max = src0[i];
@@ -337,11 +341,11 @@ static inline void volk_gnsssdr_8i_index_max_16u_generic(unsigned int* target, c
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
static inline void volk_gnsssdr_8i_index_max_16u_a_avx2(unsigned int* target, const char* src0, unsigned int num_points)
{
    if (num_points > 0)
        {
            const unsigned int avx2_iters = num_points / 32;
            unsigned int number;
@@ -351,14 +355,15 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_avx2(unsigned int* target, co
            char max = src0[0];
            unsigned int index = 0;
            unsigned int mask;
            __VOLK_ATTR_ALIGNED(32)
            char currentValuesBuffer[32];
            __m256i maxValues, compareResults, currentValues;
            maxValues = _mm256_set1_epi8(max);
            for (number = 0; number < avx2_iters; number++)
                {
                    currentValues = _mm256_load_si256((__m256i*)inputPtr);
                    compareResults = _mm256_cmpgt_epi8(maxValues, currentValues);
                    mask = _mm256_movemask_epi8(compareResults);
@@ -371,7 +376,7 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_avx2(unsigned int* target, co
                        {
                            if ((mask & 1) == 1)
                                {
                                    if (currentValuesBuffer[i] > max)
                                        {
                                            index = inputPtr - basePtr + i;
                                            max = currentValuesBuffer[i];
@@ -385,9 +390,9 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_avx2(unsigned int* target, co
                    inputPtr += 32;
                }
            for (i = 0; i < (num_points % 32); ++i)
                {
                    if (src0[i] > max)
                        {
                            index = i;
                            max = src0[i];
@@ -405,7 +410,7 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_avx2(unsigned int* target, co
static inline void volk_gnsssdr_8i_index_max_16u_a_avx(unsigned int* target, const char* src0, unsigned int num_points)
{
    if (num_points > 0)
        {
            const unsigned int sse_iters = num_points / 32;
            unsigned int number;
@@ -414,19 +419,20 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_avx(unsigned int* target, con
            char* inputPtr = (char*)src0;
            char max = src0[0];
            unsigned int index = 0;
            __VOLK_ATTR_ALIGNED(32)
            char currentValuesBuffer[32];
            __m256i ones, compareResults, currentValues;
            __m128i compareResultslo, compareResultshi, maxValues, lo, hi;
            ones = _mm256_set1_epi8(0xFF);
            maxValues = _mm_set1_epi8(max);
            for (number = 0; number < sse_iters; number++)
                {
                    currentValues = _mm256_load_si256((__m256i*)inputPtr);
                    lo = _mm256_castsi256_si128(currentValues);
                    hi = _mm256_extractf128_si256(currentValues, 1);
                    compareResultslo = _mm_cmpgt_epi8(maxValues, lo);
                    compareResultshi = _mm_cmpgt_epi8(maxValues, hi);
@@ -438,9 +444,9 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_avx(unsigned int* target, con
                        {
                            _mm256_store_si256((__m256i*)&currentValuesBuffer, currentValues);
                            for (i = 0; i < 32; i++)
                                {
                                    if (currentValuesBuffer[i] > max)
                                        {
                                            index = inputPtr - basePtr + i;
                                            max = currentValuesBuffer[i];
@@ -452,9 +458,9 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_avx(unsigned int* target, con
                    inputPtr += 32;
                }
            for (i = 0; i < (num_points % 32); ++i)
                {
                    if (src0[i] > max)
                        {
                            index = i;
                            max = src0[i];
@@ -472,7 +478,7 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_avx(unsigned int* target, con
static inline void volk_gnsssdr_8i_index_max_16u_a_sse4_1(unsigned int* target, const char* src0, unsigned int num_points)
{
    if (num_points > 0)
        {
            const unsigned int sse_iters = num_points / 16;
            unsigned int number;
@@ -481,14 +487,15 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_sse4_1(unsigned int* target,
            char* inputPtr = (char*)src0;
            char max = src0[0];
            unsigned int index = 0;
            __VOLK_ATTR_ALIGNED(16)
            char currentValuesBuffer[16];
            __m128i maxValues, compareResults, currentValues;
            maxValues = _mm_set1_epi8(max);
            for (number = 0; number < sse_iters; number++)
                {
                    currentValues = _mm_load_si128((__m128i*)inputPtr);
                    compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
@@ -496,9 +503,9 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_sse4_1(unsigned int* target,
                        {
                            _mm_store_si128((__m128i*)&currentValuesBuffer, currentValues);
                            for (i = 0; i < 16; i++)
                                {
                                    if (currentValuesBuffer[i] > max)
                                        {
                                            index = inputPtr - basePtr + i;
                                            max = currentValuesBuffer[i];
@@ -510,9 +517,9 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_sse4_1(unsigned int* target,
                    inputPtr += 16;
                }
            for (i = 0; i < (num_points % 16); ++i)
                {
                    if (src0[i] > max)
                        {
                            index = i;
                            max = src0[i];
@@ -530,7 +537,7 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_sse4_1(unsigned int* target,
static inline void volk_gnsssdr_8i_index_max_16u_a_sse2(unsigned int* target, const char* src0, unsigned int num_points)
{
    if (num_points > 0)
        {
            const unsigned int sse_iters = num_points / 16;
            unsigned int number;
@@ -540,14 +547,15 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_sse2(unsigned int* target, co
            char max = src0[0];
            unsigned int index = 0;
            unsigned short mask;
            __VOLK_ATTR_ALIGNED(16)
            char currentValuesBuffer[16];
            __m128i maxValues, compareResults, currentValues;
            maxValues = _mm_set1_epi8(max);
            for (number = 0; number < sse_iters; number++)
                {
                    currentValues = _mm_load_si128((__m128i*)inputPtr);
                    compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
                    mask = _mm_movemask_epi8(compareResults);
@@ -560,7 +568,7 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_sse2(unsigned int* target, co
                        {
                            if ((mask & 1) == 1)
                                {
                                    if (currentValuesBuffer[i] > max)
                                        {
                                            index = inputPtr - basePtr + i;
                                            max = currentValuesBuffer[i];
@@ -574,9 +582,9 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_sse2(unsigned int* target, co
                    inputPtr += 16;
                }
            for (i = 0; i < (num_points % 16); ++i)
                {
                    if (src0[i] > max)
                        {
                            index = i;
                            max = src0[i];
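
Every branch above computes the same thing: the index of the first maximum of a vector of signed bytes, returned through target. A minimal standalone caller, not part of the patch; the generic protokernel signature is taken from the hunks above, while the include path is an assumption:

    #include <stdio.h>
    #include <volk_gnsssdr/volk_gnsssdr_8i_index_max_16u.h>  /* assumed install path */

    int main(void)
    {
        char v[8] = {3, -7, 12, 5, 12, 0, -1, 9};
        unsigned int idx = 0;
        /* Strict '>' comparison, so this yields the FIRST index of the maximum (2, not 4) */
        volk_gnsssdr_8i_index_max_16u_generic(&idx, v, 8);
        printf("max %d at index %u\n", v[idx], idx);
        return 0;
    }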
View File
@@ -63,21 +63,22 @@
static inline void volk_gnsssdr_8i_max_s8i_u_avx2(char* target, const char* src0, unsigned int num_points)
{
    if (num_points > 0)
        {
            const unsigned int avx_iters = num_points / 32;
            unsigned int number;
            unsigned int i;
            char* inputPtr = (char*)src0;
            char max = src0[0];
            __VOLK_ATTR_ALIGNED(32)
            char maxValuesBuffer[32];
            __m256i maxValues, compareResults, currentValues;
            maxValues = _mm256_set1_epi8(max);
            for (number = 0; number < avx_iters; number++)
                {
                    currentValues = _mm256_loadu_si256((__m256i*)inputPtr);
                    compareResults = _mm256_max_epi8(maxValues, currentValues);
                    maxValues = compareResults;
                    inputPtr += 32;
@@ -85,17 +86,17 @@ static inline void volk_gnsssdr_8i_max_s8i_u_avx2(char* target, const char* src0
            _mm256_storeu_si256((__m256i*)maxValuesBuffer, maxValues);
            for (i = 0; i < 32; ++i)
                {
                    if (maxValuesBuffer[i] > max)
                        {
                            max = maxValuesBuffer[i];
                        }
                }
            for (i = avx_iters * 32; i < num_points; ++i)
                {
                    if (src0[i] > max)
                        {
                            max = src0[i];
                        }
@@ -112,21 +113,22 @@ static inline void volk_gnsssdr_8i_max_s8i_u_avx2(char* target, const char* src0
static inline void volk_gnsssdr_8i_max_s8i_u_sse4_1(char* target, const char* src0, unsigned int num_points)
{
    if (num_points > 0)
        {
            const unsigned int sse_iters = num_points / 16;
            unsigned int number;
            unsigned int i;
            char* inputPtr = (char*)src0;
            char max = src0[0];
            __VOLK_ATTR_ALIGNED(16)
            char maxValuesBuffer[16];
            __m128i maxValues, compareResults, currentValues;
            maxValues = _mm_set1_epi8(max);
            for (number = 0; number < sse_iters; number++)
                {
                    currentValues = _mm_loadu_si128((__m128i*)inputPtr);
                    compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
                    maxValues = _mm_blendv_epi8(currentValues, maxValues, compareResults);
                    inputPtr += 16;
@@ -134,17 +136,17 @@ static inline void volk_gnsssdr_8i_max_s8i_u_sse4_1(char* target, const char* sr
            _mm_storeu_si128((__m128i*)maxValuesBuffer, maxValues);
            for (i = 0; i < 16; ++i)
                {
                    if (maxValuesBuffer[i] > max)
                        {
                            max = maxValuesBuffer[i];
                        }
                }
            for (i = sse_iters * 16; i < num_points; ++i)
                {
                    if (src0[i] > max)
                        {
                            max = src0[i];
                        }
@@ -157,11 +159,11 @@ static inline void volk_gnsssdr_8i_max_s8i_u_sse4_1(char* target, const char* sr
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>
static inline void volk_gnsssdr_8i_max_s8i_u_sse2(char* target, const char* src0, unsigned int num_points)
{
    if (num_points > 0)
        {
            const unsigned int sse_iters = num_points / 16;
            unsigned int number;
@@ -169,14 +171,15 @@ static inline void volk_gnsssdr_8i_max_s8i_u_sse2(char* target, const char* src0
            char* inputPtr = (char*)src0;
            char max = src0[0];
            unsigned short mask;
            __VOLK_ATTR_ALIGNED(16)
            char currentValuesBuffer[16];
            __m128i maxValues, compareResults, currentValues;
            maxValues = _mm_set1_epi8(max);
            for (number = 0; number < sse_iters; number++)
                {
                    currentValues = _mm_loadu_si128((__m128i*)inputPtr);
                    compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
                    mask = _mm_movemask_epi8(compareResults);
@@ -189,7 +192,7 @@ static inline void volk_gnsssdr_8i_max_s8i_u_sse2(char* target, const char* src0
                        {
                            if ((mask & 1) == 1)
                                {
                                    if (currentValuesBuffer[i] > max)
                                        {
                                            max = currentValuesBuffer[i];
                                        }
@@ -202,9 +205,9 @@ static inline void volk_gnsssdr_8i_max_s8i_u_sse2(char* target, const char* src0
                    inputPtr += 16;
                }
            for (i = sse_iters * 16; i < num_points; ++i)
                {
                    if (src0[i] > max)
                        {
                            max = src0[i];
                        }
@@ -220,13 +223,13 @@ static inline void volk_gnsssdr_8i_max_s8i_u_sse2(char* target, const char* src0
static inline void volk_gnsssdr_8i_max_s8i_generic(char* target, const char* src0, unsigned int num_points)
{
    if (num_points > 0)
        {
            char max = src0[0];
            unsigned int i;
            for (i = 1; i < num_points; ++i)
                {
                    if (src0[i] > max)
                        {
                            max = src0[i];
                        }
@@ -243,21 +246,22 @@ static inline void volk_gnsssdr_8i_max_s8i_generic(char* target, const char* src
static inline void volk_gnsssdr_8i_max_s8i_a_sse4_1(char* target, const char* src0, unsigned int num_points)
{
    if (num_points > 0)
        {
            const unsigned int sse_iters = num_points / 16;
            unsigned int number;
            unsigned int i;
            char* inputPtr = (char*)src0;
            char max = src0[0];
            __VOLK_ATTR_ALIGNED(16)
            char maxValuesBuffer[16];
            __m128i maxValues, compareResults, currentValues;
            maxValues = _mm_set1_epi8(max);
            for (number = 0; number < sse_iters; number++)
                {
                    currentValues = _mm_load_si128((__m128i*)inputPtr);
                    compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
                    maxValues = _mm_blendv_epi8(currentValues, maxValues, compareResults);
                    inputPtr += 16;
@@ -265,17 +269,17 @@ static inline void volk_gnsssdr_8i_max_s8i_a_sse4_1(char* target, const char* sr
            _mm_store_si128((__m128i*)maxValuesBuffer, maxValues);
            for (i = 0; i < 16; ++i)
                {
                    if (maxValuesBuffer[i] > max)
                        {
                            max = maxValuesBuffer[i];
                        }
                }
            for (i = sse_iters * 16; i < num_points; ++i)
                {
                    if (src0[i] > max)
                        {
                            max = src0[i];
                        }
@@ -292,39 +296,40 @@ static inline void volk_gnsssdr_8i_max_s8i_a_sse4_1(char* target, const char* sr
static inline void volk_gnsssdr_8i_max_s8i_a_avx2(char* target, const char* src0, unsigned int num_points)
{
    if (num_points > 0)
        {
            const unsigned int avx_iters = num_points / 32;
            unsigned int number;
            unsigned int i;
            char* inputPtr = (char*)src0;
            char max = src0[0];
            __VOLK_ATTR_ALIGNED(32)
            char maxValuesBuffer[32];
            __m256i maxValues, compareResults, currentValues;
            maxValues = _mm256_set1_epi8(max);
            for (number = 0; number < avx_iters; number++)
                {
                    currentValues = _mm256_load_si256((__m256i*)inputPtr);
                    compareResults = _mm256_max_epi8(maxValues, currentValues);
                    maxValues = compareResults;  //_mm256_blendv_epi8(currentValues, maxValues, compareResults);
                    inputPtr += 32;
                }
            _mm256_store_si256((__m256i*)maxValuesBuffer, maxValues);
            for (i = 0; i < 32; ++i)
                {
                    if (maxValuesBuffer[i] > max)
                        {
                            max = maxValuesBuffer[i];
                        }
                }
            for (i = avx_iters * 32; i < num_points; ++i)
                {
                    if (src0[i] > max)
                        {
                            max = src0[i];
                        }
@@ -341,7 +346,7 @@ static inline void volk_gnsssdr_8i_max_s8i_a_avx2(char* target, const char* src0
static inline void volk_gnsssdr_8i_max_s8i_a_sse2(char* target, const char* src0, unsigned int num_points)
{
    if (num_points > 0)
        {
            const unsigned int sse_iters = num_points / 16;
            unsigned int number;
@@ -349,14 +354,15 @@ static inline void volk_gnsssdr_8i_max_s8i_a_sse2(char* target, const char* src0
            char* inputPtr = (char*)src0;
            char max = src0[0];
            unsigned short mask;
            __VOLK_ATTR_ALIGNED(16)
            char currentValuesBuffer[16];
            __m128i maxValues, compareResults, currentValues;
            maxValues = _mm_set1_epi8(max);
            for (number = 0; number < sse_iters; number++)
                {
                    currentValues = _mm_load_si128((__m128i*)inputPtr);
                    compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
                    mask = _mm_movemask_epi8(compareResults);
@@ -369,7 +375,7 @@ static inline void volk_gnsssdr_8i_max_s8i_a_sse2(char* target, const char* src0
                        {
                            if ((mask & 1) == 1)
                                {
                                    if (currentValuesBuffer[i] > max)
                                        {
                                            max = currentValuesBuffer[i];
                                        }
@@ -382,9 +388,9 @@ static inline void volk_gnsssdr_8i_max_s8i_a_sse2(char* target, const char* src0
                    inputPtr += 16;
                }
            for (i = sse_iters * 16; i < num_points; ++i)
                {
                    if (src0[i] > max)
                        {
                            max = src0[i];
                        }
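
Same idea as the kernel above but returning the maximum value itself rather than its index. A minimal standalone caller, not part of the patch; the generic signature is taken from the hunks above and the include path is an assumption:

    #include <stdio.h>
    #include <volk_gnsssdr/volk_gnsssdr_8i_max_s8i.h>  /* assumed install path */

    int main(void)
    {
        char v[5] = {-128, 4, 99, -3, 57};
        char m = 0;
        volk_gnsssdr_8i_max_s8i_generic(&m, v, 5);
        printf("max = %d\n", m);  /* 99 */
        return 0;
    }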
View File
@@ -72,21 +72,21 @@ static inline void volk_gnsssdr_8i_x2_add_8i_u_sse2(char* cVector, const char* a
    __m128i aVal, bVal, cVal;
    for (number = 0; number < sse_iters; number++)
        {
            aVal = _mm_loadu_si128((__m128i*)aPtr);
            bVal = _mm_loadu_si128((__m128i*)bPtr);
            cVal = _mm_add_epi8(aVal, bVal);
            _mm_storeu_si128((__m128i*)cPtr, cVal);  // Store the results back into the C container
            aPtr += 16;
            bPtr += 16;
            cPtr += 16;
        }
    for (i = sse_iters * 16; i < num_points; ++i)
        {
            *cPtr++ = (*aPtr++) + (*bPtr++);
        }
@@ -108,21 +108,21 @@ static inline void volk_gnsssdr_8i_x2_add_8i_u_avx2(char* cVector, const char* a
    __m256i aVal, bVal, cVal;
    for (number = 0; number < avx_iters; number++)
        {
            aVal = _mm256_loadu_si256((__m256i*)aPtr);
            bVal = _mm256_loadu_si256((__m256i*)bPtr);
            cVal = _mm256_add_epi8(aVal, bVal);
            _mm256_storeu_si256((__m256i*)cPtr, cVal);  // Store the results back into the C container
            aPtr += 32;
            bPtr += 32;
            cPtr += 32;
        }
    for (i = avx_iters * 32; i < num_points; ++i)
        {
            *cPtr++ = (*aPtr++) + (*bPtr++);
        }
@@ -139,7 +139,7 @@ static inline void volk_gnsssdr_8i_x2_add_8i_generic(char* cVector, const char*
    const char* bPtr = bVector;
    unsigned int number;
    for (number = 0; number < num_points; number++)
        {
            *cPtr++ = (*aPtr++) + (*bPtr++);
        }
@@ -161,21 +161,21 @@ static inline void volk_gnsssdr_8i_x2_add_8i_a_sse2(char* cVector, const char* a
    __m128i aVal, bVal, cVal;
    for (number = 0; number < sse_iters; number++)
        {
            aVal = _mm_load_si128((__m128i*)aPtr);
            bVal = _mm_load_si128((__m128i*)bPtr);
            cVal = _mm_add_epi8(aVal, bVal);
            _mm_store_si128((__m128i*)cPtr, cVal);  // Store the results back into the C container
            aPtr += 16;
            bPtr += 16;
            cPtr += 16;
        }
    for (i = sse_iters * 16; i < num_points; ++i)
        {
            *cPtr++ = (*aPtr++) + (*bPtr++);
        }
@@ -197,21 +197,21 @@ static inline void volk_gnsssdr_8i_x2_add_8i_a_avx2(char* cVector, const char* a
    __m256i aVal, bVal, cVal;
    for (number = 0; number < avx_iters; number++)
        {
            aVal = _mm256_load_si256((__m256i*)aPtr);
            bVal = _mm256_load_si256((__m256i*)bPtr);
            cVal = _mm256_add_epi8(aVal, bVal);
            _mm256_store_si256((__m256i*)cPtr, cVal);  // Store the results back into the C container
            aPtr += 32;
            bPtr += 32;
            cPtr += 32;
        }
    for (i = avx_iters * 32; i < num_points; ++i)
        {
            *cPtr++ = (*aPtr++) + (*bPtr++);
        }
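
Every branch above computes c[i] = a[i] + b[i] on signed bytes: the SIMD paths handle 16 (SSE2) or 32 (AVX2) elements per iteration and fall back to the scalar tail loop for the remainder. A standalone scalar model of the same semantics, not part of the patch:

    #include <stdio.h>

    /* Scalar model: elementwise 8-bit addition, wrapping as char arithmetic does. */
    static void add_8i_model(char* c, const char* a, const char* b, unsigned int n)
    {
        unsigned int i;
        for (i = 0; i < n; i++)
            {
                c[i] = a[i] + b[i];
            }
    }

    int main(void)
    {
        char a[3] = {1, 2, 3}, b[3] = {10, 20, 30}, c[3];
        add_8i_model(c, a, b, 3);
        printf("%d %d %d\n", c[0], c[1], c[2]);  /* 11 22 33 */
        return 0;
    }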
View File
@@ -111,10 +111,10 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_u_avx(lv_8sc_t* cVector, const
            tmp = _mm256_xor_ps(tmp, conjugator1);
            tmp128lo = _mm256_castsi256_si128(_mm256_castps_si256(tmp));
            tmp128lo = _mm_add_epi8(tmp128lo, conjugator2);
            tmp128hi = _mm256_extractf128_si256(_mm256_castps_si256(tmp), 1);
            tmp128hi = _mm_add_epi8(tmp128hi, conjugator2);
            //tmp = _mm256_set_m128i(tmp128hi , tmp128lo); //not defined in some versions of immintrin.h
            tmp = _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(tmp128lo), (tmp128hi), 1));
            _mm256_storeu_ps((float*)c, tmp);
            a += 16;
@@ -155,7 +155,6 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_u_ssse3(lv_8sc_t* cVector, con
        {
            *c++ = lv_conj(*a++);
        }
}
#endif /* LV_HAVE_SSSE3 */
@@ -188,7 +187,6 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_u_sse3(lv_8sc_t* cVector, cons
        {
            *c++ = lv_conj(*a++);
        }
}
#endif /* LV_HAVE_SSE3 */
@@ -201,7 +199,7 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_generic(lv_8sc_t* cVector, con
    const lv_8sc_t* aPtr = aVector;
    unsigned int number;
    for (number = 0; number < num_points; number++)
        {
            *cPtr++ = lv_conj(*aPtr++);
        }
@@ -230,10 +228,10 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_a_avx(lv_8sc_t* cVector, const
            tmp = _mm256_xor_ps(tmp, conjugator1);
            tmp128lo = _mm256_castsi256_si128(_mm256_castps_si256(tmp));
            tmp128lo = _mm_add_epi8(tmp128lo, conjugator2);
            tmp128hi = _mm256_extractf128_si256(_mm256_castps_si256(tmp), 1);
            tmp128hi = _mm_add_epi8(tmp128hi, conjugator2);
            //tmp = _mm256_set_m128i(tmp128hi , tmp128lo); //not defined in some versions of immintrin.h
            tmp = _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(tmp128lo), (tmp128hi), 1));
            _mm256_store_ps((float*)c, tmp);
            a += 16;
@@ -336,7 +334,6 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_a_sse3(lv_8sc_t* cVector, cons
        {
            *c++ = lv_conj(*a++);
        }
}
#endif /* LV_HAVE_SSE3 */
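
The conjugate kernel negates the imaginary part of each 8-bit complex sample. The AVX path does this in two's complement: XOR the imaginary bytes with 0xFF (conjugator1), then add 1 (conjugator2), since -x == ~x + 1. A standalone scalar model of the same trick, not part of the patch; the c8_t struct is a stand-in for lv_8sc_t:

    #include <stdint.h>
    #include <stdio.h>

    typedef struct { int8_t r, i; } c8_t;  /* stand-in for lv_8sc_t */

    static void conj_model(c8_t* c, const c8_t* a, unsigned int n)
    {
        unsigned int k;
        for (k = 0; k < n; k++)
            {
                c[k].r = a[k].r;
                c[k].i = (int8_t)(~a[k].i + 1);  /* two's complement: == -a[k].i */
            }
    }

    int main(void)
    {
        c8_t in = {3, -5}, out;
        conj_model(&out, &in, 1);
        printf("(%d, %d)\n", out.r, out.i);  /* (3, 5) */
        return 0;
    }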
View File
@@ -78,23 +78,23 @@ static inline void volk_gnsssdr_8ic_magnitude_squared_8i_u_sse3(char* magnitudeV
    maska = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
    maskb = _mm_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
    for (number = 0; number < sse_iters; number++)
        {
            avector = _mm_lddqu_si128((__m128i*)complexVectorPtr);
            avectorlo = _mm_unpacklo_epi8(avector, zero);
            avectorhi = _mm_unpackhi_epi8(avector, zero);
            avectorlomult = _mm_mullo_epi16(avectorlo, avectorlo);
            avectorhimult = _mm_mullo_epi16(avectorhi, avectorhi);
            aadded = _mm_hadd_epi16(avectorlomult, avectorhimult);
            complexVectorPtr += 16;
            bvector = _mm_lddqu_si128((__m128i*)complexVectorPtr);
            bvectorlo = _mm_unpacklo_epi8(bvector, zero);
            bvectorhi = _mm_unpackhi_epi8(bvector, zero);
            bvectorlomult = _mm_mullo_epi16(bvectorlo, bvectorlo);
            bvectorhimult = _mm_mullo_epi16(bvectorhi, bvectorhi);
            badded = _mm_hadd_epi16(bvectorlomult, bvectorhimult);
            complexVectorPtr += 16;
@@ -162,11 +162,11 @@ static inline void volk_gnsssdr_8ic_magnitude_squared_8i_generic(char* magnitude
    const char* complexVectorPtr = (char*)complexVector;
    char* magnitudeVectorPtr = magnitudeVector;
    unsigned int number;
    for (number = 0; number < num_points; number++)
        {
            const char real = *complexVectorPtr++;
            const char imag = *complexVectorPtr++;
            *magnitudeVectorPtr++ = (real * real) + (imag * imag);
        }
}
#endif /* LV_HAVE_GENERIC */
@@ -192,23 +192,23 @@ static inline void volk_gnsssdr_8ic_magnitude_squared_8i_a_sse3(char* magnitudeV
    maska = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
    maskb = _mm_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
    for (number = 0; number < sse_iters; number++)
        {
            avector = _mm_load_si128((__m128i*)complexVectorPtr);
            avectorlo = _mm_unpacklo_epi8(avector, zero);
            avectorhi = _mm_unpackhi_epi8(avector, zero);
            avectorlomult = _mm_mullo_epi16(avectorlo, avectorlo);
            avectorhimult = _mm_mullo_epi16(avectorhi, avectorhi);
            aadded = _mm_hadd_epi16(avectorlomult, avectorhimult);
            complexVectorPtr += 16;
            bvector = _mm_load_si128((__m128i*)complexVectorPtr);
            bvectorlo = _mm_unpacklo_epi8(bvector, zero);
            bvectorhi = _mm_unpackhi_epi8(bvector, zero);
            bvectorlomult = _mm_mullo_epi16(bvectorlo, bvectorlo);
            bvectorhimult = _mm_mullo_epi16(bvectorhi, bvectorhi);
            badded = _mm_hadd_epi16(bvectorlomult, bvectorhimult);
            complexVectorPtr += 16;
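
The generic branch shows the operation: per 8-bit complex sample, output re*re + im*im. Note the result is stored back into a char, so magnitudes above 127 wrap, exactly as in the kernel. A standalone scalar model, not part of the patch:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        /* Interleaved I/Q bytes: two samples, 3+4j and 1-2j */
        const int8_t z[4] = {3, 4, 1, -2};
        int8_t mag2[2];
        unsigned int k;
        for (k = 0; k < 2; k++)
            {
                /* |z|^2 = re*re + im*im, truncated to 8 bits like the kernel's char math */
                mag2[k] = (int8_t)(z[2 * k] * z[2 * k] + z[2 * k + 1] * z[2 * k + 1]);
            }
        printf("%d %d\n", mag2[0], mag2[1]);  /* 25 5 */
        return 0;
    }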
View File
@@ -80,7 +80,7 @@ static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_u_sse3(lv_8sc_t* cVector,
    imagy = _mm_and_si128(imagy, mult1);
    realy = _mm_and_si128(y, mult1);
    for (; number < sse_iters; number++)
        {
            x = _mm_lddqu_si128((__m128i*)a);
@@ -111,7 +111,6 @@ static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_u_sse3(lv_8sc_t* cVector,
        {
            *c++ = (*a++) * scalar;
        }
}
#endif /* LV_HAVE_SSE3 */
@@ -173,7 +172,7 @@ static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_a_sse3(lv_8sc_t* cVector,
    imagy = _mm_and_si128(imagy, mult1);
    realy = _mm_and_si128(y, mult1);
    for (; number < sse_iters; number++)
        {
            x = _mm_load_si128((__m128i*)a);
@@ -204,7 +203,6 @@ static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_a_sse3(lv_8sc_t* cVector,
        {
            *c++ = (*a++) * scalar;
        }
}
#endif /* LV_HAVE_SSE3 */
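
This kernel multiplies every sample by one complex scalar; the SSE3 path splits the scalar into broadcast real and imaginary registers (realy, imagy) to apply the same product sixteen bytes at a time. A standalone scalar model, not part of the patch; c8_t is a stand-in for lv_8sc_t:

    #include <stdint.h>
    #include <stdio.h>

    typedef struct { int8_t r, i; } c8_t;  /* stand-in for lv_8sc_t */

    int main(void)
    {
        c8_t a[2] = {{1, 1}, {0, 2}};
        c8_t s = {0, 1};  /* multiply by j: rotate each sample by 90 degrees */
        unsigned int k;
        for (k = 0; k < 2; k++)
            {
                c8_t p;
                p.r = a[k].r * s.r - a[k].i * s.i;
                p.i = a[k].r * s.i + a[k].i * s.r;
                printf("(%d, %d)\n", p.r, p.i);  /* (-1, 1) then (-2, 0) */
            }
        return 0;
    }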
View File
@@ -75,17 +75,17 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_generic(lv_8sc_t* result, co
            *cPtr += (*aPtr++) * (*bPtr++);
        }*/
    char* res = (char*)result;
    char* in = (char*)in_a;
    char* tp = (char*)in_b;
    unsigned int n_2_ccomplex_blocks = num_points / 2;
    unsigned int isodd = num_points & 1;
    char sum0[2] = {0, 0};
    char sum1[2] = {0, 0};
    unsigned int i = 0;
    for (i = 0; i < n_2_ccomplex_blocks; ++i)
        {
            sum0[0] += in[0] * tp[0] - in[1] * tp[1];
            sum0[1] += in[0] * tp[1] + in[1] * tp[0];
@@ -100,7 +100,7 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_generic(lv_8sc_t* result, co
    res[1] = sum0[1] + sum1[1];
    // Cleanup if we had an odd number of points
    for (i = 0; i < isodd; ++i)
        {
            *result += in_a[num_points - 1] * in_b[num_points - 1];
        }
@@ -115,13 +115,13 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_generic(lv_8sc_t* result, co
static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse2(lv_8sc_t* result, const lv_8sc_t* in_a, const lv_8sc_t* in_b, unsigned int num_points)
{
    lv_8sc_t dotProduct;
    memset(&dotProduct, 0x0, 2 * sizeof(char));
    unsigned int number;
    unsigned int i;
    const lv_8sc_t* a = in_a;
    const lv_8sc_t* b = in_b;
    const unsigned int sse_iters = num_points / 8;
    if (sse_iters > 0)
        {
@@ -131,7 +131,7 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse2(lv_8sc_t* result, con
            realcacc = _mm_setzero_si128();
            imagcacc = _mm_setzero_si128();
            for (number = 0; number < sse_iters; number++)
                {
                    x = _mm_loadu_si128((__m128i*)a);
                    y = _mm_loadu_si128((__m128i*)b);
@@ -165,9 +165,10 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse2(lv_8sc_t* result, con
            totalc = _mm_or_si128(realcacc, imagcacc);
            __VOLK_ATTR_ALIGNED(16)
            lv_8sc_t dotProductVector[8];
            _mm_storeu_si128((__m128i*)dotProductVector, totalc);  // Store the results back into the dot product vector
            for (i = 0; i < 8; ++i)
                {
@@ -192,13 +193,13 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse2(lv_8sc_t* result, con
static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse4_1(lv_8sc_t* result, const lv_8sc_t* in_a, const lv_8sc_t* in_b, unsigned int num_points)
{
    lv_8sc_t dotProduct;
    memset(&dotProduct, 0x0, 2 * sizeof(char));
    unsigned int number;
    unsigned int i;
    const lv_8sc_t* a = in_a;
    const lv_8sc_t* b = in_b;
    const unsigned int sse_iters = num_points / 8;
    if (sse_iters > 0)
        {
@@ -208,7 +209,7 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse4_1(lv_8sc_t* result, c
            realcacc = _mm_setzero_si128();
            imagcacc = _mm_setzero_si128();
            for (number = 0; number < sse_iters; number++)
                {
                    x = _mm_lddqu_si128((__m128i*)a);
                    y = _mm_lddqu_si128((__m128i*)b);
@@ -236,13 +237,14 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse4_1(lv_8sc_t* result, c
                    b += 8;
                }
            imagcacc = _mm_slli_si128(imagcacc, 1);
            totalc = _mm_blendv_epi8(imagcacc, realcacc, mult1);
            __VOLK_ATTR_ALIGNED(16)
            lv_8sc_t dotProductVector[8];
            _mm_storeu_si128((__m128i*)dotProductVector, totalc);  // Store the results back into the dot product vector
            for (i = 0; i < 8; ++i)
                {
@@ -267,13 +269,13 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse4_1(lv_8sc_t* result, c
static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_sse2(lv_8sc_t* result, const lv_8sc_t* in_a, const lv_8sc_t* in_b, unsigned int num_points)
{
    lv_8sc_t dotProduct;
    memset(&dotProduct, 0x0, 2 * sizeof(char));
    unsigned int number;
    unsigned int i;
    const lv_8sc_t* a = in_a;
    const lv_8sc_t* b = in_b;
    const unsigned int sse_iters = num_points / 8;
    if (sse_iters > 0)
        {
@@ -283,7 +285,7 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_sse2(lv_8sc_t* result, con
            realcacc = _mm_setzero_si128();
            imagcacc = _mm_setzero_si128();
            for (number = 0; number < sse_iters; number++)
                {
                    x = _mm_load_si128((__m128i*)a);
                    y = _mm_load_si128((__m128i*)b);
@@ -317,9 +319,10 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_sse2(lv_8sc_t* result, con
            totalc = _mm_or_si128(realcacc, imagcacc);
            __VOLK_ATTR_ALIGNED(16)
            lv_8sc_t dotProductVector[8];
            _mm_store_si128((__m128i*)dotProductVector, totalc);  // Store the results back into the dot product vector
            for (i = 0; i < 8; ++i)
                {
@@ -343,7 +346,7 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_sse2(lv_8sc_t* result, con
static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_sse4_1(lv_8sc_t* result, const lv_8sc_t* in_a, const lv_8sc_t* in_b, unsigned int num_points)
{
    lv_8sc_t dotProduct;
    memset(&dotProduct, 0x0, 2 * sizeof(char));
    unsigned int number;
    unsigned int i;
    const lv_8sc_t* a = in_a;
@@ -359,7 +362,7 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_sse4_1(lv_8sc_t* result, c
            realcacc = _mm_setzero_si128();
            imagcacc = _mm_setzero_si128();
            for (number = 0; number < sse_iters; number++)
                {
                    x = _mm_load_si128((__m128i*)a);
                    y = _mm_load_si128((__m128i*)b);
@@ -387,13 +390,14 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_sse4_1(lv_8sc_t* result, c
                    b += 8;
                }
            imagcacc = _mm_slli_si128(imagcacc, 1);
            totalc = _mm_blendv_epi8(imagcacc, realcacc, mult1);
            __VOLK_ATTR_ALIGNED(16)
            lv_8sc_t dotProductVector[8];
            _mm_store_si128((__m128i*)dotProductVector, totalc);  // Store the results back into the dot product vector
            for (i = 0; i < 8; ++i)
                {
@@ -438,22 +442,23 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_orc(lv_8sc_t* result, cons
static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_neon(lv_8sc_t* result, const lv_8sc_t* in_a, const lv_8sc_t* in_b, unsigned int num_points)
{
    lv_8sc_t dotProduct;
    dotProduct = lv_cmake(0, 0);
    *result = lv_cmake(0, 0);
    const lv_8sc_t* a = in_a;
    const lv_8sc_t* b = in_b;
    // for 2-lane vectors, 1st lane holds the real part,
    // 2nd lane holds the imaginary part
    int8x8x2_t a_val, b_val, c_val, accumulator, tmp_real, tmp_imag;
    __VOLK_ATTR_ALIGNED(16)
    lv_8sc_t accum_result[8] = {lv_cmake(0, 0)};
    accumulator.val[0] = vdup_n_s8(0);
    accumulator.val[1] = vdup_n_s8(0);
    unsigned int number;
    const unsigned int neon_iters = num_points / 8;
    for (number = 0; number < neon_iters; ++number)
        {
            a_val = vld2_s8((const int8_t*)a);
            b_val = vld2_s8((const int8_t*)b);
@@ -478,7 +483,7 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_neon(lv_8sc_t* result, const
            b += 8;
        }
    vst2_s8((int8_t*)accum_result, accumulator);
    for (number = 0; number < 8; ++number)
        {
            *result += accum_result[number];
        }
@@ -490,6 +495,6 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_neon(lv_8sc_t* result, const
    *result += dotProduct;
}
#endif /* LV_HAVE_NEON */
#endif /*INCLUDED_volk_gnsssdr_8ic_x2_dot_prod_8ic_H*/
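
All branches accumulate the complex dot product sum(a[i] * b[i]); the generic version keeps two partial sums (sum0, sum1) and folds in the odd trailing sample, and the arithmetic wraps in 8 bits just as the kernel's char math does. A standalone scalar model, not part of the patch; c8_t is a stand-in for lv_8sc_t:

    #include <stdint.h>
    #include <stdio.h>

    typedef struct { int8_t r, i; } c8_t;  /* stand-in for lv_8sc_t */

    static c8_t dot_model(const c8_t* a, const c8_t* b, unsigned int n)
    {
        c8_t acc = {0, 0};
        unsigned int k;
        for (k = 0; k < n; k++)
            {
                /* (ar + j*ai)(br + j*bi) accumulated with wrapping 8-bit sums */
                acc.r += a[k].r * b[k].r - a[k].i * b[k].i;
                acc.i += a[k].r * b[k].i + a[k].i * b[k].r;
            }
        return acc;
    }

    int main(void)
    {
        c8_t a[2] = {{1, 2}, {3, -1}}, b[2] = {{2, 0}, {0, 1}};
        c8_t r = dot_model(a, b, 2);
        printf("(%d, %d)\n", r.r, r.i);  /* (1+2j)*2 + (3-j)*j = (3, 7) */
        return 0;
    }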
View File
@@ -75,7 +75,7 @@ static inline void volk_gnsssdr_8ic_x2_multiply_8ic_u_sse2(lv_8sc_t* cVector, co
    mult1 = _mm_set_epi8(0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF);
    for (number = 0; number < sse_iters; number++)
        {
            x = _mm_loadu_si128((__m128i*)a);
            y = _mm_loadu_si128((__m128i*)b);
@@ -133,7 +133,7 @@ static inline void volk_gnsssdr_8ic_x2_multiply_8ic_u_sse4_1(lv_8sc_t* cVector,
    _mm_setzero_si128();
    mult1 = _mm_set_epi8(0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF);
    for (number = 0; number < sse_iters; number++)
        {
            x = _mm_lddqu_si128((__m128i*)a);
            y = _mm_lddqu_si128((__m128i*)b);
@@ -181,7 +181,7 @@ static inline void volk_gnsssdr_8ic_x2_multiply_8ic_generic(lv_8sc_t* cVector, c
    const lv_8sc_t* bPtr = bVector;
    unsigned int number;
    for (number = 0; number < num_points; number++)
        {
            *cPtr++ = (*aPtr++) * (*bPtr++);
        }
@@ -204,7 +204,7 @@ static inline void volk_gnsssdr_8ic_x2_multiply_8ic_a_sse2(lv_8sc_t* cVector, co
    mult1 = _mm_set_epi8(0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF);
    for (number = 0; number < sse_iters; number++)
        {
            x = _mm_load_si128((__m128i*)a);
            y = _mm_load_si128((__m128i*)b);
@@ -228,7 +228,7 @@ static inline void volk_gnsssdr_8ic_x2_multiply_8ic_a_sse2(lv_8sc_t* cVector, co
            imagc = _mm_and_si128(imagc, mult1);
            imagc = _mm_slli_si128(imagc, 1);
            totalc = _mm_or_si128(realc, imagc);
            _mm_store_si128((__m128i*)c, totalc);
@@ -262,7 +262,7 @@ static inline void volk_gnsssdr_8ic_x2_multiply_8ic_a_sse4_1(lv_8sc_t* cVector,
    _mm_setzero_si128();
    mult1 = _mm_set_epi8(0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF);
    for (number = 0; number < sse_iters; number++)
        {
            x = _mm_load_si128((__m128i*)a);
            y = _mm_load_si128((__m128i*)b);
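
Here the product is elementwise: c[i] = a[i] * b[i] on 8-bit complex samples, with the SSE paths using the mult1 byte mask to interleave the separately computed real and imaginary parts back together. A standalone scalar model, not part of the patch; c8_t is a stand-in for lv_8sc_t:

    #include <stdint.h>
    #include <stdio.h>

    typedef struct { int8_t r, i; } c8_t;  /* stand-in for lv_8sc_t */

    int main(void)
    {
        c8_t a = {2, 3}, b = {1, -1}, c;
        c.r = a.r * b.r - a.i * b.i;  /* 2*1 - 3*(-1) = 5 */
        c.i = a.r * b.i + a.i * b.r;  /* 2*(-1) + 3*1 = 1 */
        printf("(%d, %d)\n", c.r, c.i);  /* (5, 1) */
        return 0;
    }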
View File
@@ -72,7 +72,7 @@ static inline void volk_gnsssdr_8u_x2_multiply_8u_u_avx2(unsigned char* cChar, c
    const unsigned char* a = aChar;
    const unsigned char* b = bChar;
    for (number = 0; number < avx2_iters; number++)
        {
            x = _mm256_loadu_si256((__m256i*)a);
            y = _mm256_loadu_si256((__m256i*)b);
@@ -101,7 +101,7 @@ static inline void volk_gnsssdr_8u_x2_multiply_8u_u_avx2(unsigned char* cChar, c
            c += 32;
        }
    for (i = avx2_iters * 32; i < num_points; ++i)
        {
            *c++ = (*a++) * (*b++);
        }
@@ -123,7 +123,7 @@ static inline void volk_gnsssdr_8u_x2_multiply_8u_u_sse3(unsigned char* cChar, c
    const unsigned char* a = aChar;
    const unsigned char* b = bChar;
    for (number = 0; number < sse_iters; number++)
        {
            x = _mm_lddqu_si128((__m128i*)a);
            y = _mm_lddqu_si128((__m128i*)b);
@@ -152,7 +152,7 @@ static inline void volk_gnsssdr_8u_x2_multiply_8u_u_sse3(unsigned char* cChar, c
            c += 16;
        }
    for (i = sse_iters * 16; i < num_points; ++i)
        {
            *c++ = (*a++) * (*b++);
        }
@@ -168,7 +168,7 @@ static inline void volk_gnsssdr_8u_x2_multiply_8u_generic(unsigned char* cChar,
    const unsigned char* bPtr = bChar;
    unsigned int number;
    for (number = 0; number < num_points; number++)
        {
            *cPtr++ = (*aPtr++) * (*bPtr++);
        }
@@ -189,7 +189,7 @@ static inline void volk_gnsssdr_8u_x2_multiply_8u_a_sse3(unsigned char* cChar, c
    const unsigned char* a = aChar;
    const unsigned char* b = bChar;
    for (number = 0; number < sse_iters; number++)
        {
            x = _mm_load_si128((__m128i*)a);
            y = _mm_load_si128((__m128i*)b);
@@ -240,7 +240,7 @@ static inline void volk_gnsssdr_8u_x2_multiply_8u_a_avx2(unsigned char* cChar, c
    const unsigned char* a = aChar;
    const unsigned char* b = bChar;
    for (number = 0; number < avx2_iters; number++)
        {
            x = _mm256_load_si256((__m256i*)a);
            y = _mm256_load_si256((__m256i*)b);
@@ -269,7 +269,7 @@ static inline void volk_gnsssdr_8u_x2_multiply_8u_a_avx2(unsigned char* cChar, c
            c += 32;
        }
    for (i = avx2_iters * 32; i < num_points; ++i)
        {
            *c++ = (*a++) * (*b++);
        }
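
The unsigned variant computes c[i] = a[i] * b[i] with the product wrapping modulo 256, which is what the scalar tail loops above do and what the SIMD branches reproduce. A standalone scalar model, not part of the patch:

    #include <stdio.h>

    int main(void)
    {
        unsigned char a[3] = {2, 20, 200}, b[3] = {3, 13, 2}, c[3];
        unsigned int i;
        for (i = 0; i < 3; i++)
            {
                /* unsigned char arithmetic: the product wraps modulo 256 */
                c[i] = (unsigned char)(a[i] * b[i]);
            }
        printf("%u %u %u\n", c[0], c[1], c[2]);  /* 6, 4 (260 mod 256), 144 (400 mod 256) */
        return 0;
    }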
View File
@ -71,9 +71,9 @@
#include <emmintrin.h> #include <emmintrin.h>
/* Adapted from http://gruntthepeon.free.fr/ssemath/sse_mathfun.h, original code from Julien Pommier */ /* Adapted from http://gruntthepeon.free.fr/ssemath/sse_mathfun.h, original code from Julien Pommier */
/* Based on algorithms from the cephes library http://www.netlib.org/cephes/ */ /* Based on algorithms from the cephes library http://www.netlib.org/cephes/ */
static inline void volk_gnsssdr_s32f_sincos_32fc_a_sse2(lv_32fc_t* out, const float phase_inc, float* phase, unsigned int num_points) static inline void volk_gnsssdr_s32f_sincos_32fc_a_sse2(lv_32fc_t *out, const float phase_inc, float *phase, unsigned int num_points)
{ {
lv_32fc_t* bPtr = out; lv_32fc_t *bPtr = out;
const unsigned int sse_iters = num_points / 4; const unsigned int sse_iters = num_points / 4;
unsigned int number = 0; unsigned int number = 0;
@ -84,44 +84,44 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_a_sse2(lv_32fc_t* out, const fl
__m128i emm0, emm2, emm4; __m128i emm0, emm2, emm4;
/* declare some SSE constants */ /* declare some SSE constants */
static const int _ps_inv_sign_mask[4] = { ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000 }; static const int _ps_inv_sign_mask[4] = {~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000};
static const int _ps_sign_mask[4] = { (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000 }; static const int _ps_sign_mask[4] = {(int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000};
static const float _ps_cephes_FOPI[4] = { 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516 }; static const float _ps_cephes_FOPI[4] = {1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516};
static const int _pi32_1[4] = { 1, 1, 1, 1 }; static const int _pi32_1[4] = {1, 1, 1, 1};
static const int _pi32_inv1[4] = { ~1, ~1, ~1, ~1 }; static const int _pi32_inv1[4] = {~1, ~1, ~1, ~1};
static const int _pi32_2[4] = { 2, 2, 2, 2}; static const int _pi32_2[4] = {2, 2, 2, 2};
static const int _pi32_4[4] = { 4, 4, 4, 4}; static const int _pi32_4[4] = {4, 4, 4, 4};
static const float _ps_minus_cephes_DP1[4] = { -0.78515625, -0.78515625, -0.78515625, -0.78515625 }; static const float _ps_minus_cephes_DP1[4] = {-0.78515625, -0.78515625, -0.78515625, -0.78515625};
static const float _ps_minus_cephes_DP2[4] = { -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4 }; static const float _ps_minus_cephes_DP2[4] = {-2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4};
static const float _ps_minus_cephes_DP3[4] = { -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8 }; static const float _ps_minus_cephes_DP3[4] = {-3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8};
static const float _ps_coscof_p0[4] = { 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005 }; static const float _ps_coscof_p0[4] = {2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005};
static const float _ps_coscof_p1[4] = { -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003 }; static const float _ps_coscof_p1[4] = {-1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003};
static const float _ps_coscof_p2[4] = { 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002 }; static const float _ps_coscof_p2[4] = {4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002};
static const float _ps_sincof_p0[4] = { -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4 }; static const float _ps_sincof_p0[4] = {-1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4};
static const float _ps_sincof_p1[4] = { 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3 }; static const float _ps_sincof_p1[4] = {8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3};
static const float _ps_sincof_p2[4] = { -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1 }; static const float _ps_sincof_p2[4] = {-1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1};
static const float _ps_0p5[4] = { 0.5f, 0.5f, 0.5f, 0.5f }; static const float _ps_0p5[4] = {0.5f, 0.5f, 0.5f, 0.5f};
static const float _ps_1[4] = { 1.0f, 1.0f, 1.0f, 1.0f }; static const float _ps_1[4] = {1.0f, 1.0f, 1.0f, 1.0f};
float four_phases[4] = { _phase, _phase + phase_inc, _phase + 2 * phase_inc, _phase + 3 * phase_inc }; float four_phases[4] = {_phase, _phase + phase_inc, _phase + 2 * phase_inc, _phase + 3 * phase_inc};
float four_phases_inc[4] = { 4 * phase_inc, 4 * phase_inc, 4 * phase_inc, 4 * phase_inc }; float four_phases_inc[4] = {4 * phase_inc, 4 * phase_inc, 4 * phase_inc, 4 * phase_inc};
four_phases_reg = _mm_load_ps(four_phases); four_phases_reg = _mm_load_ps(four_phases);
const __m128 four_phases_inc_reg = _mm_load_ps(four_phases_inc); const __m128 four_phases_inc_reg = _mm_load_ps(four_phases_inc);
for(;number < sse_iters; number++) for (; number < sse_iters; number++)
{ {
x = four_phases_reg; x = four_phases_reg;
sign_bit_sin = x; sign_bit_sin = x;
/* take the absolute value */ /* take the absolute value */
x = _mm_and_ps(x, *(__m128*)_ps_inv_sign_mask); x = _mm_and_ps(x, *(__m128 *)_ps_inv_sign_mask);
/* extract the sign bit (upper one) */ /* extract the sign bit (upper one) */
sign_bit_sin = _mm_and_ps(sign_bit_sin, *(__m128*)_ps_sign_mask); sign_bit_sin = _mm_and_ps(sign_bit_sin, *(__m128 *)_ps_sign_mask);
/* scale by 4/Pi */ /* scale by 4/Pi */
y = _mm_mul_ps(x, *(__m128*)_ps_cephes_FOPI); y = _mm_mul_ps(x, *(__m128 *)_ps_cephes_FOPI);
/* store the integer part of y in emm2 */ /* store the integer part of y in emm2 */
emm2 = _mm_cvttps_epi32(y); emm2 = _mm_cvttps_epi32(y);
@ -145,9 +145,9 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_a_sse2(lv_32fc_t* out, const fl
/* The magic pass: "Extended precision modular arithmetic” /* The magic pass: "Extended precision modular arithmetic”
x = ((x - y * DP1) - y * DP2) - y * DP3; */ x = ((x - y * DP1) - y * DP2) - y * DP3; */
xmm1 = *(__m128*)_ps_minus_cephes_DP1; xmm1 = *(__m128 *)_ps_minus_cephes_DP1;
xmm2 = *(__m128*)_ps_minus_cephes_DP2; xmm2 = *(__m128 *)_ps_minus_cephes_DP2;
xmm3 = *(__m128*)_ps_minus_cephes_DP3; xmm3 = *(__m128 *)_ps_minus_cephes_DP3;
xmm1 = _mm_mul_ps(y, xmm1); xmm1 = _mm_mul_ps(y, xmm1);
xmm2 = _mm_mul_ps(y, xmm2); xmm2 = _mm_mul_ps(y, xmm2);
xmm3 = _mm_mul_ps(y, xmm3); xmm3 = _mm_mul_ps(y, xmm3);
@ -163,25 +163,25 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_a_sse2(lv_32fc_t* out, const fl
sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin); sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);
/* Evaluate the first polynom (0 <= x <= Pi/4) */ /* Evaluate the first polynom (0 <= x <= Pi/4) */
__m128 z = _mm_mul_ps(x,x); __m128 z = _mm_mul_ps(x, x);
y = *(__m128*)_ps_coscof_p0; y = *(__m128 *)_ps_coscof_p0;
y = _mm_mul_ps(y, z); y = _mm_mul_ps(y, z);
y = _mm_add_ps(y, *(__m128*)_ps_coscof_p1); y = _mm_add_ps(y, *(__m128 *)_ps_coscof_p1);
y = _mm_mul_ps(y, z); y = _mm_mul_ps(y, z);
y = _mm_add_ps(y, *(__m128*)_ps_coscof_p2); y = _mm_add_ps(y, *(__m128 *)_ps_coscof_p2);
y = _mm_mul_ps(y, z); y = _mm_mul_ps(y, z);
y = _mm_mul_ps(y, z); y = _mm_mul_ps(y, z);
__m128 tmp = _mm_mul_ps(z, *(__m128*)_ps_0p5); __m128 tmp = _mm_mul_ps(z, *(__m128 *)_ps_0p5);
y = _mm_sub_ps(y, tmp); y = _mm_sub_ps(y, tmp);
y = _mm_add_ps(y, *(__m128*)_ps_1); y = _mm_add_ps(y, *(__m128 *)_ps_1);
/* Evaluate the second polynom (Pi/4 <= x <= 0) */ /* Evaluate the second polynom (Pi/4 <= x <= 0) */
__m128 y2 = *(__m128*)_ps_sincof_p0; __m128 y2 = *(__m128 *)_ps_sincof_p0;
y2 = _mm_mul_ps(y2, z); y2 = _mm_mul_ps(y2, z);
y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p1); y2 = _mm_add_ps(y2, *(__m128 *)_ps_sincof_p1);
y2 = _mm_mul_ps(y2, z); y2 = _mm_mul_ps(y2, z);
y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p2); y2 = _mm_add_ps(y2, *(__m128 *)_ps_sincof_p2);
y2 = _mm_mul_ps(y2, z); y2 = _mm_mul_ps(y2, z);
y2 = _mm_mul_ps(y2, x); y2 = _mm_mul_ps(y2, x);
y2 = _mm_add_ps(y2, x); y2 = _mm_add_ps(y2, x);
@ -190,11 +190,11 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_a_sse2(lv_32fc_t* out, const fl
xmm3 = poly_mask; xmm3 = poly_mask;
__m128 ysin2 = _mm_and_ps(xmm3, y2); __m128 ysin2 = _mm_and_ps(xmm3, y2);
__m128 ysin1 = _mm_andnot_ps(xmm3, y); __m128 ysin1 = _mm_andnot_ps(xmm3, y);
y2 = _mm_sub_ps(y2,ysin2); y2 = _mm_sub_ps(y2, ysin2);
y = _mm_sub_ps(y, ysin1); y = _mm_sub_ps(y, ysin1);
xmm1 = _mm_add_ps(ysin1,ysin2); xmm1 = _mm_add_ps(ysin1, ysin2);
xmm2 = _mm_add_ps(y,y2); xmm2 = _mm_add_ps(y, y2);
/* update the sign */ /* update the sign */
sine = _mm_xor_ps(xmm1, sign_bit_sin); sine = _mm_xor_ps(xmm1, sign_bit_sin);
@ -202,19 +202,19 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_a_sse2(lv_32fc_t* out, const fl
/* write the output */ /* write the output */
aux = _mm_unpacklo_ps(cosine, sine); aux = _mm_unpacklo_ps(cosine, sine);
_mm_store_ps((float*)bPtr, aux); _mm_store_ps((float *)bPtr, aux);
bPtr += 2; bPtr += 2;
aux = _mm_unpackhi_ps(cosine, sine); aux = _mm_unpackhi_ps(cosine, sine);
_mm_store_ps((float*)bPtr, aux); _mm_store_ps((float *)bPtr, aux);
bPtr += 2; bPtr += 2;
four_phases_reg = _mm_add_ps(four_phases_reg, four_phases_inc_reg); four_phases_reg = _mm_add_ps(four_phases_reg, four_phases_inc_reg);
} }
_phase = _phase + phase_inc * (sse_iters * 4); _phase = _phase + phase_inc * (sse_iters * 4);
for(number = sse_iters * 4; number < num_points; number++) for (number = sse_iters * 4; number < num_points; number++)
{ {
*bPtr++ = lv_cmake((float)cosf((_phase)), (float)sinf((_phase)) ); *bPtr++ = lv_cmake((float)cosf((_phase)), (float)sinf((_phase)));
_phase += phase_inc; _phase += phase_inc;
} }
(*phase) = _phase; (*phase) = _phase;
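The hunks above are easier to follow without the intrinsics. Below is a minimal scalar sketch of the same Cephes recipe, written purely for reference (it is not part of the commit); the constants are the ones broadcast into the vector registers above, and the kernels run these exact steps on four or eight phases per iteration with the branches replaced by masks.

#include <math.h>

/* Editorial sketch (illustrative, not in the commit): scalar rendition of
 * the Cephes sin/cos evaluation used by the vector kernels above. */
static void sincos_cephes_sketch(float in, float *s, float *c)
{
    float x = fabsf(in);                  /* take the absolute value */
    int sign_sin = (in < 0.0f) ? 1 : 0;   /* remember the input sign */
    int sign_cos = 0;

    int j = (int)(x * 1.27323954473516f); /* scale by 4/Pi */
    j = (j + 1) & ~1;                     /* round up to an even octant */
    float y = (float)j;
    j &= 7;
    if (j > 3) { sign_sin ^= 1; sign_cos ^= 1; j -= 4; }
    if (j == 2) { sign_cos ^= 1; }

    /* extended precision modular arithmetic: x = ((x - y*DP1) - y*DP2) - y*DP3 */
    x = ((x - y * 0.78515625f) - y * 2.4187564849853515625e-4f) - y * 3.77489497744594108e-8f;

    float z = x * x;
    /* first polynomial (0 <= x <= Pi/4): the cosine approximation */
    float yc = ((2.443315711809948E-005f * z - 1.388731625493765E-003f) * z + 4.166664568298827E-002f) * z * z - 0.5f * z + 1.0f;
    /* second polynomial: the sine approximation */
    float ys = ((-1.9515295891E-4f * z + 8.3321608736E-3f) * z - 1.6666654611E-1f) * z * x + x;

    /* octant 2 swaps which polynomial feeds sine and cosine */
    *s = sign_sin ? -(j == 2 ? yc : ys) : (j == 2 ? yc : ys);
    *c = sign_cos ? -(j == 2 ? ys : yc) : (j == 2 ? ys : yc);
}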
@ -227,9 +227,9 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_a_sse2(lv_32fc_t* out, const fl
#include <emmintrin.h> #include <emmintrin.h>
/* Adapted from http://gruntthepeon.free.fr/ssemath/sse_mathfun.h, original code from Julien Pommier */ /* Adapted from http://gruntthepeon.free.fr/ssemath/sse_mathfun.h, original code from Julien Pommier */
/* Based on algorithms from the cephes library http://www.netlib.org/cephes/ */ /* Based on algorithms from the cephes library http://www.netlib.org/cephes/ */
static inline void volk_gnsssdr_s32f_sincos_32fc_u_sse2(lv_32fc_t* out, const float phase_inc, float* phase, unsigned int num_points) static inline void volk_gnsssdr_s32f_sincos_32fc_u_sse2(lv_32fc_t *out, const float phase_inc, float *phase, unsigned int num_points)
{ {
lv_32fc_t* bPtr = out; lv_32fc_t *bPtr = out;
const unsigned int sse_iters = num_points / 4; const unsigned int sse_iters = num_points / 4;
unsigned int number = 0; unsigned int number = 0;
@ -241,44 +241,64 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_u_sse2(lv_32fc_t* out, const fl
__m128i emm0, emm2, emm4; __m128i emm0, emm2, emm4;
/* declare some SSE constants */ /* declare some SSE constants */
__VOLK_ATTR_ALIGNED(16) static const int _ps_inv_sign_mask[4] = { ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000 }; __VOLK_ATTR_ALIGNED(16)
__VOLK_ATTR_ALIGNED(16) static const int _ps_sign_mask[4] = { (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000 }; static const int _ps_inv_sign_mask[4] = {~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000};
__VOLK_ATTR_ALIGNED(16)
static const int _ps_sign_mask[4] = {(int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000};
__VOLK_ATTR_ALIGNED(16) static const float _ps_cephes_FOPI[4] = { 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516 }; __VOLK_ATTR_ALIGNED(16)
__VOLK_ATTR_ALIGNED(16) static const int _pi32_1[4] = { 1, 1, 1, 1 }; static const float _ps_cephes_FOPI[4] = {1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516};
__VOLK_ATTR_ALIGNED(16) static const int _pi32_inv1[4] = { ~1, ~1, ~1, ~1 }; __VOLK_ATTR_ALIGNED(16)
__VOLK_ATTR_ALIGNED(16) static const int _pi32_2[4] = { 2, 2, 2, 2}; static const int _pi32_1[4] = {1, 1, 1, 1};
__VOLK_ATTR_ALIGNED(16) static const int _pi32_4[4] = { 4, 4, 4, 4}; __VOLK_ATTR_ALIGNED(16)
static const int _pi32_inv1[4] = {~1, ~1, ~1, ~1};
__VOLK_ATTR_ALIGNED(16)
static const int _pi32_2[4] = {2, 2, 2, 2};
__VOLK_ATTR_ALIGNED(16)
static const int _pi32_4[4] = {4, 4, 4, 4};
__VOLK_ATTR_ALIGNED(16) static const float _ps_minus_cephes_DP1[4] = { -0.78515625, -0.78515625, -0.78515625, -0.78515625 }; __VOLK_ATTR_ALIGNED(16)
__VOLK_ATTR_ALIGNED(16) static const float _ps_minus_cephes_DP2[4] = { -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4 }; static const float _ps_minus_cephes_DP1[4] = {-0.78515625, -0.78515625, -0.78515625, -0.78515625};
__VOLK_ATTR_ALIGNED(16) static const float _ps_minus_cephes_DP3[4] = { -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8 }; __VOLK_ATTR_ALIGNED(16)
__VOLK_ATTR_ALIGNED(16) static const float _ps_coscof_p0[4] = { 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005 }; static const float _ps_minus_cephes_DP2[4] = {-2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4};
__VOLK_ATTR_ALIGNED(16) static const float _ps_coscof_p1[4] = { -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003 }; __VOLK_ATTR_ALIGNED(16)
__VOLK_ATTR_ALIGNED(16) static const float _ps_coscof_p2[4] = { 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002 }; static const float _ps_minus_cephes_DP3[4] = {-3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8};
__VOLK_ATTR_ALIGNED(16) static const float _ps_sincof_p0[4] = { -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4 }; __VOLK_ATTR_ALIGNED(16)
__VOLK_ATTR_ALIGNED(16) static const float _ps_sincof_p1[4] = { 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3 }; static const float _ps_coscof_p0[4] = {2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005};
__VOLK_ATTR_ALIGNED(16) static const float _ps_sincof_p2[4] = { -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1 }; __VOLK_ATTR_ALIGNED(16)
__VOLK_ATTR_ALIGNED(16) static const float _ps_0p5[4] = { 0.5f, 0.5f, 0.5f, 0.5f }; static const float _ps_coscof_p1[4] = {-1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003};
__VOLK_ATTR_ALIGNED(16) static const float _ps_1[4] = { 1.0f, 1.0f, 1.0f, 1.0f }; __VOLK_ATTR_ALIGNED(16)
static const float _ps_coscof_p2[4] = {4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002};
__VOLK_ATTR_ALIGNED(16)
static const float _ps_sincof_p0[4] = {-1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4};
__VOLK_ATTR_ALIGNED(16)
static const float _ps_sincof_p1[4] = {8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3};
__VOLK_ATTR_ALIGNED(16)
static const float _ps_sincof_p2[4] = {-1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1};
__VOLK_ATTR_ALIGNED(16)
static const float _ps_0p5[4] = {0.5f, 0.5f, 0.5f, 0.5f};
__VOLK_ATTR_ALIGNED(16)
static const float _ps_1[4] = {1.0f, 1.0f, 1.0f, 1.0f};
__VOLK_ATTR_ALIGNED(16) float four_phases[4] = { _phase, _phase + phase_inc, _phase + 2 * phase_inc, _phase + 3 * phase_inc }; __VOLK_ATTR_ALIGNED(16)
__VOLK_ATTR_ALIGNED(16) float four_phases_inc[4] = { 4 * phase_inc, 4 * phase_inc, 4 * phase_inc, 4 * phase_inc }; float four_phases[4] = {_phase, _phase + phase_inc, _phase + 2 * phase_inc, _phase + 3 * phase_inc};
__VOLK_ATTR_ALIGNED(16)
float four_phases_inc[4] = {4 * phase_inc, 4 * phase_inc, 4 * phase_inc, 4 * phase_inc};
four_phases_reg = _mm_load_ps(four_phases); four_phases_reg = _mm_load_ps(four_phases);
const __m128 four_phases_inc_reg = _mm_load_ps(four_phases_inc); const __m128 four_phases_inc_reg = _mm_load_ps(four_phases_inc);
for(;number < sse_iters; number++) for (; number < sse_iters; number++)
{ {
x = four_phases_reg; x = four_phases_reg;
sign_bit_sin = x; sign_bit_sin = x;
/* take the absolute value */ /* take the absolute value */
x = _mm_and_ps(x, *(__m128*)_ps_inv_sign_mask); x = _mm_and_ps(x, *(__m128 *)_ps_inv_sign_mask);
/* extract the sign bit (upper one) */ /* extract the sign bit (upper one) */
sign_bit_sin = _mm_and_ps(sign_bit_sin, *(__m128*)_ps_sign_mask); sign_bit_sin = _mm_and_ps(sign_bit_sin, *(__m128 *)_ps_sign_mask);
/* scale by 4/Pi */ /* scale by 4/Pi */
y = _mm_mul_ps(x, *(__m128*)_ps_cephes_FOPI); y = _mm_mul_ps(x, *(__m128 *)_ps_cephes_FOPI);
/* store the integer part of y in emm2 */ /* store the integer part of y in emm2 */
emm2 = _mm_cvttps_epi32(y); emm2 = _mm_cvttps_epi32(y);
@ -302,9 +322,9 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_u_sse2(lv_32fc_t* out, const fl
/* The magic pass: "Extended precision modular arithmetic" /* The magic pass: "Extended precision modular arithmetic"
x = ((x - y * DP1) - y * DP2) - y * DP3; */ x = ((x - y * DP1) - y * DP2) - y * DP3; */
xmm1 = *(__m128*)_ps_minus_cephes_DP1; xmm1 = *(__m128 *)_ps_minus_cephes_DP1;
xmm2 = *(__m128*)_ps_minus_cephes_DP2; xmm2 = *(__m128 *)_ps_minus_cephes_DP2;
xmm3 = *(__m128*)_ps_minus_cephes_DP3; xmm3 = *(__m128 *)_ps_minus_cephes_DP3;
xmm1 = _mm_mul_ps(y, xmm1); xmm1 = _mm_mul_ps(y, xmm1);
xmm2 = _mm_mul_ps(y, xmm2); xmm2 = _mm_mul_ps(y, xmm2);
xmm3 = _mm_mul_ps(y, xmm3); xmm3 = _mm_mul_ps(y, xmm3);
@ -320,25 +340,25 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_u_sse2(lv_32fc_t* out, const fl
sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin); sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);
/* Evaluate the first polynomial (0 <= x <= Pi/4) */ /* Evaluate the first polynomial (0 <= x <= Pi/4) */
__m128 z = _mm_mul_ps(x,x); __m128 z = _mm_mul_ps(x, x);
y = *(__m128*)_ps_coscof_p0; y = *(__m128 *)_ps_coscof_p0;
y = _mm_mul_ps(y, z); y = _mm_mul_ps(y, z);
y = _mm_add_ps(y, *(__m128*)_ps_coscof_p1); y = _mm_add_ps(y, *(__m128 *)_ps_coscof_p1);
y = _mm_mul_ps(y, z); y = _mm_mul_ps(y, z);
y = _mm_add_ps(y, *(__m128*)_ps_coscof_p2); y = _mm_add_ps(y, *(__m128 *)_ps_coscof_p2);
y = _mm_mul_ps(y, z); y = _mm_mul_ps(y, z);
y = _mm_mul_ps(y, z); y = _mm_mul_ps(y, z);
__m128 tmp = _mm_mul_ps(z, *(__m128*)_ps_0p5); __m128 tmp = _mm_mul_ps(z, *(__m128 *)_ps_0p5);
y = _mm_sub_ps(y, tmp); y = _mm_sub_ps(y, tmp);
y = _mm_add_ps(y, *(__m128*)_ps_1); y = _mm_add_ps(y, *(__m128 *)_ps_1);
/* Evaluate the second polynomial (Pi/4 <= x <= 0) */ /* Evaluate the second polynomial (Pi/4 <= x <= 0) */
__m128 y2 = *(__m128*)_ps_sincof_p0; __m128 y2 = *(__m128 *)_ps_sincof_p0;
y2 = _mm_mul_ps(y2, z); y2 = _mm_mul_ps(y2, z);
y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p1); y2 = _mm_add_ps(y2, *(__m128 *)_ps_sincof_p1);
y2 = _mm_mul_ps(y2, z); y2 = _mm_mul_ps(y2, z);
y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p2); y2 = _mm_add_ps(y2, *(__m128 *)_ps_sincof_p2);
y2 = _mm_mul_ps(y2, z); y2 = _mm_mul_ps(y2, z);
y2 = _mm_mul_ps(y2, x); y2 = _mm_mul_ps(y2, x);
y2 = _mm_add_ps(y2, x); y2 = _mm_add_ps(y2, x);
@ -347,11 +367,11 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_u_sse2(lv_32fc_t* out, const fl
xmm3 = poly_mask; xmm3 = poly_mask;
__m128 ysin2 = _mm_and_ps(xmm3, y2); __m128 ysin2 = _mm_and_ps(xmm3, y2);
__m128 ysin1 = _mm_andnot_ps(xmm3, y); __m128 ysin1 = _mm_andnot_ps(xmm3, y);
y2 = _mm_sub_ps(y2,ysin2); y2 = _mm_sub_ps(y2, ysin2);
y = _mm_sub_ps(y, ysin1); y = _mm_sub_ps(y, ysin1);
xmm1 = _mm_add_ps(ysin1,ysin2); xmm1 = _mm_add_ps(ysin1, ysin2);
xmm2 = _mm_add_ps(y,y2); xmm2 = _mm_add_ps(y, y2);
/* update the sign */ /* update the sign */
sine = _mm_xor_ps(xmm1, sign_bit_sin); sine = _mm_xor_ps(xmm1, sign_bit_sin);
@ -359,19 +379,19 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_u_sse2(lv_32fc_t* out, const fl
/* write the output */ /* write the output */
aux = _mm_unpacklo_ps(cosine, sine); aux = _mm_unpacklo_ps(cosine, sine);
_mm_storeu_ps((float*)bPtr, aux); _mm_storeu_ps((float *)bPtr, aux);
bPtr += 2; bPtr += 2;
aux = _mm_unpackhi_ps(cosine, sine); aux = _mm_unpackhi_ps(cosine, sine);
_mm_storeu_ps((float*)bPtr, aux); _mm_storeu_ps((float *)bPtr, aux);
bPtr += 2; bPtr += 2;
four_phases_reg = _mm_add_ps(four_phases_reg, four_phases_inc_reg); four_phases_reg = _mm_add_ps(four_phases_reg, four_phases_inc_reg);
} }
_phase = _phase + phase_inc * (sse_iters * 4); _phase = _phase + phase_inc * (sse_iters * 4);
for(number = sse_iters * 4; number < num_points; number++) for (number = sse_iters * 4; number < num_points; number++)
{ {
*bPtr++ = lv_cmake((float)cosf(_phase), (float)sinf(_phase) ); *bPtr++ = lv_cmake((float)cosf(_phase), (float)sinf(_phase));
_phase += phase_inc; _phase += phase_inc;
} }
(*phase) = _phase; (*phase) = _phase;
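The unaligned variant above differs from the aligned one only in its stores (_mm_storeu_ps instead of _mm_store_ps). VOLK's generated dispatcher chooses between them by testing the buffer address against the machine alignment; a sketch of that test follows (illustrative only, the real dispatcher is machine-generated).

#include <stdint.h>
#include "volk_gnsssdr/volk_gnsssdr.h"

/* Hypothetical helper, not in the commit. */
static int buffer_is_aligned(const void *p)
{
    /* volk_gnsssdr_get_alignment() reports e.g. 16 bytes on SSE2
       machines and 32 bytes on AVX2 machines */
    return ((uintptr_t)p % volk_gnsssdr_get_alignment()) == 0;
}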
@ -382,13 +402,13 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_u_sse2(lv_32fc_t* out, const fl
#ifdef LV_HAVE_GENERIC #ifdef LV_HAVE_GENERIC
static inline void volk_gnsssdr_s32f_sincos_32fc_generic(lv_32fc_t* out, const float phase_inc, float* phase, unsigned int num_points) static inline void volk_gnsssdr_s32f_sincos_32fc_generic(lv_32fc_t *out, const float phase_inc, float *phase, unsigned int num_points)
{ {
float _phase = (*phase); float _phase = (*phase);
unsigned int i; unsigned int i;
for(i = 0; i < num_points; i++) for (i = 0; i < num_points; i++)
{ {
*out++ = lv_cmake((float)cosf(_phase), (float)sinf(_phase) ); *out++ = lv_cmake((float)cosf(_phase), (float)sinf(_phase));
_phase += phase_inc; _phase += phase_inc;
} }
(*phase) = _phase; (*phase) = _phase;
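The generic loop above is the portable reference against which the SIMD protokernels are validated. Callers do not normally pick a variant by hand; they go through the dispatcher, which VOLK names after the kernel. A usage sketch, with buffer handling that is illustrative rather than prescriptive:

#include "volk_gnsssdr/volk_gnsssdr.h"
#include "volk_gnsssdr/volk_gnsssdr_complex.h"

int main()
{
    const unsigned int n = 1000;
    /* allocate a buffer that satisfies the machine alignment so the
       _a_ protokernels are eligible */
    lv_32fc_t *nco = (lv_32fc_t *)volk_gnsssdr_malloc(n * sizeof(lv_32fc_t), volk_gnsssdr_get_alignment());
    float phase = 0.0f;
    volk_gnsssdr_s32f_sincos_32fc(nco, 0.1f, &phase, n); /* nco[k] = cos + j*sin */
    volk_gnsssdr_free(nco);
    return 0;
}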
@ -400,7 +420,7 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_generic(lv_32fc_t* out, const f
#ifdef LV_HAVE_GENERIC #ifdef LV_HAVE_GENERIC
#include <volk_gnsssdr/volk_gnsssdr_sine_table.h> #include <volk_gnsssdr/volk_gnsssdr_sine_table.h>
#include <stdint.h> #include <stdint.h>
static inline void volk_gnsssdr_s32f_sincos_32fc_generic_fxpt(lv_32fc_t* out, const float phase_inc, float* phase, unsigned int num_points) static inline void volk_gnsssdr_s32f_sincos_32fc_generic_fxpt(lv_32fc_t *out, const float phase_inc, float *phase, unsigned int num_points)
{ {
float _in, s, c; float _in, s, c;
unsigned int i; unsigned int i;
@ -413,12 +433,12 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_generic_fxpt(lv_32fc_t* out, co
const int32_t diffbits = bitlength - Nbits; const int32_t diffbits = bitlength - Nbits;
uint32_t ux; uint32_t ux;
float _phase = (*phase); float _phase = (*phase);
for(i = 0; i < num_points; i++) for (i = 0; i < num_points; i++)
{ {
_in = _phase; _in = _phase;
d = (int32_t)floor(_in / TWO_PI + 0.5); d = (int32_t)floor(_in / TWO_PI + 0.5);
_in -= d * TWO_PI; _in -= d * TWO_PI;
x = (int32_t) ((float)_in * TWO_TO_THE_31_DIV_PI); x = (int32_t)((float)_in * TWO_TO_THE_31_DIV_PI);
ux = x; ux = x;
sin_index = ux >> diffbits; sin_index = ux >> diffbits;
@ -428,7 +448,7 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_generic_fxpt(lv_32fc_t* out, co
cos_index = ux >> diffbits; cos_index = ux >> diffbits;
c = sine_table_10bits[cos_index][0] * (ux >> 1) + sine_table_10bits[cos_index][1]; c = sine_table_10bits[cos_index][0] * (ux >> 1) + sine_table_10bits[cos_index][1];
*out++ = lv_cmake((float)c, (float)s ); *out++ = lv_cmake((float)c, (float)s);
_phase += phase_inc; _phase += phase_inc;
} }
(*phase) = _phase; (*phase) = _phase;
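In the fixed-point variant above, TWO_TO_THE_31_DIV_PI maps a phase reduced to [-Pi, Pi) onto the full int32_t range, so the top Nbits bits of the reinterpreted value index the 2^10-entry table, and the stored slope/intercept pair performs the linear interpolation. A sketch of the index computation, with the constants written out for illustration:

#include <stdint.h>
#include <math.h>

/* Editorial sketch (not in the commit): how the fixed-point kernel maps
 * a phase to a table index.  Because [-Pi, Pi) fills the whole int32
 * range, subsequent phase arithmetic wraps around for free. */
static uint32_t phase_to_sine_index(float phase, int Nbits)
{
    const float TWO_PI = 6.28318530717959f;
    const float TWO_TO_THE_31_DIV_PI = 2147483648.0f / 3.14159265358979f;
    phase -= floorf(phase / TWO_PI + 0.5f) * TWO_PI;      /* reduce to [-Pi, Pi) */
    const uint32_t ux = (uint32_t)(int32_t)(phase * TWO_TO_THE_31_DIV_PI);
    return ux >> (32 - Nbits); /* top Nbits bits select the table entry; the
                                  kernel then refines with the slope/intercept
                                  pair (linear interpolation) */
}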
@ -441,9 +461,9 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_generic_fxpt(lv_32fc_t* out, co
#include <immintrin.h> #include <immintrin.h>
/* Based on algorithms from the cephes library http://www.netlib.org/cephes/ /* Based on algorithms from the cephes library http://www.netlib.org/cephes/
* Adapted to AVX2 by Carles Fernandez, based on original SSE2 code by Julien Pommier*/ * Adapted to AVX2 by Carles Fernandez, based on original SSE2 code by Julien Pommier*/
static inline void volk_gnsssdr_s32f_sincos_32fc_a_avx2(lv_32fc_t* out, const float phase_inc, float* phase, unsigned int num_points) static inline void volk_gnsssdr_s32f_sincos_32fc_a_avx2(lv_32fc_t *out, const float phase_inc, float *phase, unsigned int num_points)
{ {
lv_32fc_t* bPtr = out; lv_32fc_t *bPtr = out;
const unsigned int avx_iters = num_points / 8; const unsigned int avx_iters = num_points / 8;
unsigned int number = 0; unsigned int number = 0;
@ -456,44 +476,64 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_a_avx2(lv_32fc_t* out, const fl
__m128 aux, c1, s1; __m128 aux, c1, s1;
/* declare some AVX2 constants */ /* declare some AVX2 constants */
__VOLK_ATTR_ALIGNED(32) static const int _ps_inv_sign_mask[8] = { ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000 }; __VOLK_ATTR_ALIGNED(32)
__VOLK_ATTR_ALIGNED(32) static const int _ps_sign_mask[8] = { (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000 }; static const int _ps_inv_sign_mask[8] = {~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000};
__VOLK_ATTR_ALIGNED(32)
static const int _ps_sign_mask[8] = {(int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000};
__VOLK_ATTR_ALIGNED(32) static const float _ps_cephes_FOPI[8] = { 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516 }; __VOLK_ATTR_ALIGNED(32)
__VOLK_ATTR_ALIGNED(32) static const int _pi32_1[8] = { 1, 1, 1, 1, 1, 1, 1, 1 }; static const float _ps_cephes_FOPI[8] = {1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516};
__VOLK_ATTR_ALIGNED(32) static const int _pi32_inv1[8] = { ~1, ~1, ~1, ~1, ~1, ~1, ~1, ~1 }; __VOLK_ATTR_ALIGNED(32)
__VOLK_ATTR_ALIGNED(32) static const int _pi32_2[8] = { 2, 2, 2, 2, 2, 2, 2, 2 }; static const int _pi32_1[8] = {1, 1, 1, 1, 1, 1, 1, 1};
__VOLK_ATTR_ALIGNED(32) static const int _pi32_4[8] = { 4, 4, 4, 4, 4, 4, 4, 4 }; __VOLK_ATTR_ALIGNED(32)
static const int _pi32_inv1[8] = {~1, ~1, ~1, ~1, ~1, ~1, ~1, ~1};
__VOLK_ATTR_ALIGNED(32)
static const int _pi32_2[8] = {2, 2, 2, 2, 2, 2, 2, 2};
__VOLK_ATTR_ALIGNED(32)
static const int _pi32_4[8] = {4, 4, 4, 4, 4, 4, 4, 4};
__VOLK_ATTR_ALIGNED(32) static const float _ps_minus_cephes_DP1[8] = { -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625 }; __VOLK_ATTR_ALIGNED(32)
__VOLK_ATTR_ALIGNED(32) static const float _ps_minus_cephes_DP2[8] = { -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4 }; static const float _ps_minus_cephes_DP1[8] = {-0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625};
__VOLK_ATTR_ALIGNED(32) static const float _ps_minus_cephes_DP3[8] = { -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8 }; __VOLK_ATTR_ALIGNED(32)
__VOLK_ATTR_ALIGNED(32) static const float _ps_coscof_p0[8] = { 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005 }; static const float _ps_minus_cephes_DP2[8] = {-2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4};
__VOLK_ATTR_ALIGNED(32) static const float _ps_coscof_p1[8] = { -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003 }; __VOLK_ATTR_ALIGNED(32)
__VOLK_ATTR_ALIGNED(32) static const float _ps_coscof_p2[8] = { 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002 }; static const float _ps_minus_cephes_DP3[8] = {-3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8};
__VOLK_ATTR_ALIGNED(32) static const float _ps_sincof_p0[8] = { -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4 }; __VOLK_ATTR_ALIGNED(32)
__VOLK_ATTR_ALIGNED(32) static const float _ps_sincof_p1[8] = { 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3 }; static const float _ps_coscof_p0[8] = {2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005};
__VOLK_ATTR_ALIGNED(32) static const float _ps_sincof_p2[8] = { -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1 }; __VOLK_ATTR_ALIGNED(32)
__VOLK_ATTR_ALIGNED(32) static const float _ps_0p5[8] = { 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f }; static const float _ps_coscof_p1[8] = {-1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003};
__VOLK_ATTR_ALIGNED(32) static const float _ps_1[8] = { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f }; __VOLK_ATTR_ALIGNED(32)
static const float _ps_coscof_p2[8] = {4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002};
__VOLK_ATTR_ALIGNED(32)
static const float _ps_sincof_p0[8] = {-1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4};
__VOLK_ATTR_ALIGNED(32)
static const float _ps_sincof_p1[8] = {8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3};
__VOLK_ATTR_ALIGNED(32)
static const float _ps_sincof_p2[8] = {-1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1};
__VOLK_ATTR_ALIGNED(32)
static const float _ps_0p5[8] = {0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f};
__VOLK_ATTR_ALIGNED(32)
static const float _ps_1[8] = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
__VOLK_ATTR_ALIGNED(32) float eight_phases[8] = { _phase, _phase + phase_inc, _phase + 2 * phase_inc, _phase + 3 * phase_inc, _phase + 4 * phase_inc, _phase + 5 * phase_inc, _phase + 6 * phase_inc, _phase + 7 * phase_inc }; __VOLK_ATTR_ALIGNED(32)
__VOLK_ATTR_ALIGNED(32) float eight_phases_inc[8] = { 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc }; float eight_phases[8] = {_phase, _phase + phase_inc, _phase + 2 * phase_inc, _phase + 3 * phase_inc, _phase + 4 * phase_inc, _phase + 5 * phase_inc, _phase + 6 * phase_inc, _phase + 7 * phase_inc};
__VOLK_ATTR_ALIGNED(32)
float eight_phases_inc[8] = {8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc};
eight_phases_reg = _mm256_load_ps(eight_phases); eight_phases_reg = _mm256_load_ps(eight_phases);
const __m256 eight_phases_inc_reg = _mm256_load_ps(eight_phases_inc); const __m256 eight_phases_inc_reg = _mm256_load_ps(eight_phases_inc);
for(;number < avx_iters; number++) for (; number < avx_iters; number++)
{ {
x = eight_phases_reg; x = eight_phases_reg;
sign_bit_sin = x; sign_bit_sin = x;
/* take the absolute value */ /* take the absolute value */
x = _mm256_and_ps(x, *(__m256*)_ps_inv_sign_mask); x = _mm256_and_ps(x, *(__m256 *)_ps_inv_sign_mask);
/* extract the sign bit (upper one) */ /* extract the sign bit (upper one) */
sign_bit_sin = _mm256_and_ps(sign_bit_sin, *(__m256*)_ps_sign_mask); sign_bit_sin = _mm256_and_ps(sign_bit_sin, *(__m256 *)_ps_sign_mask);
/* scale by 4/Pi */ /* scale by 4/Pi */
y = _mm256_mul_ps(x, *(__m256*)_ps_cephes_FOPI); y = _mm256_mul_ps(x, *(__m256 *)_ps_cephes_FOPI);
/* store the integer part of y in emm2 */ /* store the integer part of y in emm2 */
emm2 = _mm256_cvttps_epi32(y); emm2 = _mm256_cvttps_epi32(y);
@ -517,9 +557,9 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_a_avx2(lv_32fc_t* out, const fl
/* The magic pass: "Extended precision modular arithmetic" /* The magic pass: "Extended precision modular arithmetic"
x = ((x - y * DP1) - y * DP2) - y * DP3; */ x = ((x - y * DP1) - y * DP2) - y * DP3; */
xmm1 = *(__m256*)_ps_minus_cephes_DP1; xmm1 = *(__m256 *)_ps_minus_cephes_DP1;
xmm2 = *(__m256*)_ps_minus_cephes_DP2; xmm2 = *(__m256 *)_ps_minus_cephes_DP2;
xmm3 = *(__m256*)_ps_minus_cephes_DP3; xmm3 = *(__m256 *)_ps_minus_cephes_DP3;
xmm1 = _mm256_mul_ps(y, xmm1); xmm1 = _mm256_mul_ps(y, xmm1);
xmm2 = _mm256_mul_ps(y, xmm2); xmm2 = _mm256_mul_ps(y, xmm2);
xmm3 = _mm256_mul_ps(y, xmm3); xmm3 = _mm256_mul_ps(y, xmm3);
@ -536,24 +576,24 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_a_avx2(lv_32fc_t* out, const fl
/* Evaluate the first polynomial (0 <= x <= Pi/4) */ /* Evaluate the first polynomial (0 <= x <= Pi/4) */
__m256 z = _mm256_mul_ps(x, x); __m256 z = _mm256_mul_ps(x, x);
y = *(__m256*)_ps_coscof_p0; y = *(__m256 *)_ps_coscof_p0;
y = _mm256_mul_ps(y, z); y = _mm256_mul_ps(y, z);
y = _mm256_add_ps(y, *(__m256*)_ps_coscof_p1); y = _mm256_add_ps(y, *(__m256 *)_ps_coscof_p1);
y = _mm256_mul_ps(y, z); y = _mm256_mul_ps(y, z);
y = _mm256_add_ps(y, *(__m256*)_ps_coscof_p2); y = _mm256_add_ps(y, *(__m256 *)_ps_coscof_p2);
y = _mm256_mul_ps(y, z); y = _mm256_mul_ps(y, z);
y = _mm256_mul_ps(y, z); y = _mm256_mul_ps(y, z);
__m256 tmp = _mm256_mul_ps(z, *(__m256*)_ps_0p5); __m256 tmp = _mm256_mul_ps(z, *(__m256 *)_ps_0p5);
y = _mm256_sub_ps(y, tmp); y = _mm256_sub_ps(y, tmp);
y = _mm256_add_ps(y, *(__m256*)_ps_1); y = _mm256_add_ps(y, *(__m256 *)_ps_1);
/* Evaluate the second polynomial (Pi/4 <= x <= 0) */ /* Evaluate the second polynomial (Pi/4 <= x <= 0) */
__m256 y2 = *(__m256*)_ps_sincof_p0; __m256 y2 = *(__m256 *)_ps_sincof_p0;
y2 = _mm256_mul_ps(y2, z); y2 = _mm256_mul_ps(y2, z);
y2 = _mm256_add_ps(y2, *(__m256*)_ps_sincof_p1); y2 = _mm256_add_ps(y2, *(__m256 *)_ps_sincof_p1);
y2 = _mm256_mul_ps(y2, z); y2 = _mm256_mul_ps(y2, z);
y2 = _mm256_add_ps(y2, *(__m256*)_ps_sincof_p2); y2 = _mm256_add_ps(y2, *(__m256 *)_ps_sincof_p2);
y2 = _mm256_mul_ps(y2, z); y2 = _mm256_mul_ps(y2, z);
y2 = _mm256_mul_ps(y2, x); y2 = _mm256_mul_ps(y2, x);
y2 = _mm256_add_ps(y2, x); y2 = _mm256_add_ps(y2, x);
@ -576,27 +616,27 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_a_avx2(lv_32fc_t* out, const fl
s1 = _mm256_extractf128_ps(sine, 0); s1 = _mm256_extractf128_ps(sine, 0);
c1 = _mm256_extractf128_ps(cosine, 0); c1 = _mm256_extractf128_ps(cosine, 0);
aux = _mm_unpacklo_ps(c1, s1); aux = _mm_unpacklo_ps(c1, s1);
_mm_store_ps((float*)bPtr, aux); _mm_store_ps((float *)bPtr, aux);
bPtr += 2; bPtr += 2;
aux = _mm_unpackhi_ps(c1, s1); aux = _mm_unpackhi_ps(c1, s1);
_mm_store_ps((float*)bPtr, aux); _mm_store_ps((float *)bPtr, aux);
bPtr += 2; bPtr += 2;
s1 = _mm256_extractf128_ps(sine, 1); s1 = _mm256_extractf128_ps(sine, 1);
c1 = _mm256_extractf128_ps(cosine, 1); c1 = _mm256_extractf128_ps(cosine, 1);
aux = _mm_unpacklo_ps(c1, s1); aux = _mm_unpacklo_ps(c1, s1);
_mm_store_ps((float*)bPtr, aux); _mm_store_ps((float *)bPtr, aux);
bPtr += 2; bPtr += 2;
aux = _mm_unpackhi_ps(c1, s1); aux = _mm_unpackhi_ps(c1, s1);
_mm_store_ps((float*)bPtr, aux); _mm_store_ps((float *)bPtr, aux);
bPtr += 2; bPtr += 2;
eight_phases_reg = _mm256_add_ps(eight_phases_reg, eight_phases_inc_reg); eight_phases_reg = _mm256_add_ps(eight_phases_reg, eight_phases_inc_reg);
} }
_mm256_zeroupper(); _mm256_zeroupper();
_phase = _phase + phase_inc * (avx_iters * 8); _phase = _phase + phase_inc * (avx_iters * 8);
for(number = avx_iters * 8; number < num_points; number++) for (number = avx_iters * 8; number < num_points; number++)
{ {
out[number] = lv_cmake((float)cosf(_phase), (float)sinf(_phase) ); out[number] = lv_cmake((float)cosf(_phase), (float)sinf(_phase));
_phase += phase_inc; _phase += phase_inc;
} }
(*phase) = _phase; (*phase) = _phase;
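Because lv_32fc_t output is interleaved (re, im, re, im, ...), the AVX2 store path above splits each 256-bit cosine/sine vector into 128-bit lanes and pairs them with unpack instructions; _mm256_zeroupper() then clears the upper YMM halves before the scalar tail to avoid AVX-to-SSE transition penalties. The interleave as a standalone sketch (hypothetical helper, not in the commit):

#include <immintrin.h>

/* Interleave 8 cosines and 8 sines into 8 (cos, sin) pairs, mirroring
 * the extract/unpack sequence used by the AVX2 kernels. */
static void interleave_cos_sin(const __m256 cosine, const __m256 sine, float *out)
{
    __m128 c1 = _mm256_extractf128_ps(cosine, 0);
    __m128 s1 = _mm256_extractf128_ps(sine, 0);
    _mm_storeu_ps(out + 0, _mm_unpacklo_ps(c1, s1)); /* c0 s0 c1 s1 */
    _mm_storeu_ps(out + 4, _mm_unpackhi_ps(c1, s1)); /* c2 s2 c3 s3 */
    c1 = _mm256_extractf128_ps(cosine, 1);
    s1 = _mm256_extractf128_ps(sine, 1);
    _mm_storeu_ps(out + 8, _mm_unpacklo_ps(c1, s1)); /* c4 s4 c5 s5 */
    _mm_storeu_ps(out + 12, _mm_unpackhi_ps(c1, s1)); /* c6 s6 c7 s7 */
}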
@ -609,9 +649,9 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_a_avx2(lv_32fc_t* out, const fl
#include <immintrin.h> #include <immintrin.h>
/* Based on algorithms from the cephes library http://www.netlib.org/cephes/ /* Based on algorithms from the cephes library http://www.netlib.org/cephes/
* Adapted to AVX2 by Carles Fernandez, based on original SSE2 code by Julien Pommier*/ * Adapted to AVX2 by Carles Fernandez, based on original SSE2 code by Julien Pommier*/
static inline void volk_gnsssdr_s32f_sincos_32fc_u_avx2(lv_32fc_t* out, const float phase_inc, float* phase, unsigned int num_points) static inline void volk_gnsssdr_s32f_sincos_32fc_u_avx2(lv_32fc_t *out, const float phase_inc, float *phase, unsigned int num_points)
{ {
lv_32fc_t* bPtr = out; lv_32fc_t *bPtr = out;
const unsigned int avx_iters = num_points / 8; const unsigned int avx_iters = num_points / 8;
unsigned int number = 0; unsigned int number = 0;
@ -624,44 +664,64 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_u_avx2(lv_32fc_t* out, const fl
__m128 aux, c1, s1; __m128 aux, c1, s1;
/* declare some AVX2 constants */ /* declare some AVX2 constants */
__VOLK_ATTR_ALIGNED(32) static const int _ps_inv_sign_mask[8] = { ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000 }; __VOLK_ATTR_ALIGNED(32)
__VOLK_ATTR_ALIGNED(32) static const int _ps_sign_mask[8] = { (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000 }; static const int _ps_inv_sign_mask[8] = {~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000};
__VOLK_ATTR_ALIGNED(32)
static const int _ps_sign_mask[8] = {(int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000};
__VOLK_ATTR_ALIGNED(32) static const float _ps_cephes_FOPI[8] = { 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516 }; __VOLK_ATTR_ALIGNED(32)
__VOLK_ATTR_ALIGNED(32) static const int _pi32_1[8] = { 1, 1, 1, 1, 1, 1, 1, 1 }; static const float _ps_cephes_FOPI[8] = {1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516};
__VOLK_ATTR_ALIGNED(32) static const int _pi32_inv1[8] = { ~1, ~1, ~1, ~1, ~1, ~1, ~1, ~1 }; __VOLK_ATTR_ALIGNED(32)
__VOLK_ATTR_ALIGNED(32) static const int _pi32_2[8] = { 2, 2, 2, 2, 2, 2, 2, 2 }; static const int _pi32_1[8] = {1, 1, 1, 1, 1, 1, 1, 1};
__VOLK_ATTR_ALIGNED(32) static const int _pi32_4[8] = { 4, 4, 4, 4, 4, 4, 4, 4 }; __VOLK_ATTR_ALIGNED(32)
static const int _pi32_inv1[8] = {~1, ~1, ~1, ~1, ~1, ~1, ~1, ~1};
__VOLK_ATTR_ALIGNED(32)
static const int _pi32_2[8] = {2, 2, 2, 2, 2, 2, 2, 2};
__VOLK_ATTR_ALIGNED(32)
static const int _pi32_4[8] = {4, 4, 4, 4, 4, 4, 4, 4};
__VOLK_ATTR_ALIGNED(32) static const float _ps_minus_cephes_DP1[8] = { -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625 }; __VOLK_ATTR_ALIGNED(32)
__VOLK_ATTR_ALIGNED(32) static const float _ps_minus_cephes_DP2[8] = { -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4 }; static const float _ps_minus_cephes_DP1[8] = {-0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625};
__VOLK_ATTR_ALIGNED(32) static const float _ps_minus_cephes_DP3[8] = { -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8 }; __VOLK_ATTR_ALIGNED(32)
__VOLK_ATTR_ALIGNED(32) static const float _ps_coscof_p0[8] = { 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005 }; static const float _ps_minus_cephes_DP2[8] = {-2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4};
__VOLK_ATTR_ALIGNED(32) static const float _ps_coscof_p1[8] = { -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003 }; __VOLK_ATTR_ALIGNED(32)
__VOLK_ATTR_ALIGNED(32) static const float _ps_coscof_p2[8] = { 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002 }; static const float _ps_minus_cephes_DP3[8] = {-3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8};
__VOLK_ATTR_ALIGNED(32) static const float _ps_sincof_p0[8] = { -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4 }; __VOLK_ATTR_ALIGNED(32)
__VOLK_ATTR_ALIGNED(32) static const float _ps_sincof_p1[8] = { 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3 }; static const float _ps_coscof_p0[8] = {2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005};
__VOLK_ATTR_ALIGNED(32) static const float _ps_sincof_p2[8] = { -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1 }; __VOLK_ATTR_ALIGNED(32)
__VOLK_ATTR_ALIGNED(32) static const float _ps_0p5[8] = { 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f }; static const float _ps_coscof_p1[8] = {-1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003};
__VOLK_ATTR_ALIGNED(32) static const float _ps_1[8] = { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f }; __VOLK_ATTR_ALIGNED(32)
static const float _ps_coscof_p2[8] = {4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002};
__VOLK_ATTR_ALIGNED(32)
static const float _ps_sincof_p0[8] = {-1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4};
__VOLK_ATTR_ALIGNED(32)
static const float _ps_sincof_p1[8] = {8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3};
__VOLK_ATTR_ALIGNED(32)
static const float _ps_sincof_p2[8] = {-1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1};
__VOLK_ATTR_ALIGNED(32)
static const float _ps_0p5[8] = {0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f};
__VOLK_ATTR_ALIGNED(32)
static const float _ps_1[8] = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
__VOLK_ATTR_ALIGNED(32) float eight_phases[8] = { _phase, _phase + phase_inc, _phase + 2 * phase_inc, _phase + 3 * phase_inc, _phase + 4 * phase_inc, _phase + 5 * phase_inc, _phase + 6 * phase_inc, _phase + 7 * phase_inc }; __VOLK_ATTR_ALIGNED(32)
__VOLK_ATTR_ALIGNED(32) float eight_phases_inc[8] = { 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc }; float eight_phases[8] = {_phase, _phase + phase_inc, _phase + 2 * phase_inc, _phase + 3 * phase_inc, _phase + 4 * phase_inc, _phase + 5 * phase_inc, _phase + 6 * phase_inc, _phase + 7 * phase_inc};
__VOLK_ATTR_ALIGNED(32)
float eight_phases_inc[8] = {8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc};
eight_phases_reg = _mm256_load_ps(eight_phases); eight_phases_reg = _mm256_load_ps(eight_phases);
const __m256 eight_phases_inc_reg = _mm256_load_ps(eight_phases_inc); const __m256 eight_phases_inc_reg = _mm256_load_ps(eight_phases_inc);
for(;number < avx_iters; number++) for (; number < avx_iters; number++)
{ {
x = eight_phases_reg; x = eight_phases_reg;
sign_bit_sin = x; sign_bit_sin = x;
/* take the absolute value */ /* take the absolute value */
x = _mm256_and_ps(x, *(__m256*)_ps_inv_sign_mask); x = _mm256_and_ps(x, *(__m256 *)_ps_inv_sign_mask);
/* extract the sign bit (upper one) */ /* extract the sign bit (upper one) */
sign_bit_sin = _mm256_and_ps(sign_bit_sin, *(__m256*)_ps_sign_mask); sign_bit_sin = _mm256_and_ps(sign_bit_sin, *(__m256 *)_ps_sign_mask);
/* scale by 4/Pi */ /* scale by 4/Pi */
y = _mm256_mul_ps(x, *(__m256*)_ps_cephes_FOPI); y = _mm256_mul_ps(x, *(__m256 *)_ps_cephes_FOPI);
/* store the integer part of y in emm2 */ /* store the integer part of y in emm2 */
emm2 = _mm256_cvttps_epi32(y); emm2 = _mm256_cvttps_epi32(y);
@ -685,9 +745,9 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_u_avx2(lv_32fc_t* out, const fl
/* The magic pass: "Extended precision modular arithmetic" /* The magic pass: "Extended precision modular arithmetic"
x = ((x - y * DP1) - y * DP2) - y * DP3; */ x = ((x - y * DP1) - y * DP2) - y * DP3; */
xmm1 = *(__m256*)_ps_minus_cephes_DP1; xmm1 = *(__m256 *)_ps_minus_cephes_DP1;
xmm2 = *(__m256*)_ps_minus_cephes_DP2; xmm2 = *(__m256 *)_ps_minus_cephes_DP2;
xmm3 = *(__m256*)_ps_minus_cephes_DP3; xmm3 = *(__m256 *)_ps_minus_cephes_DP3;
xmm1 = _mm256_mul_ps(y, xmm1); xmm1 = _mm256_mul_ps(y, xmm1);
xmm2 = _mm256_mul_ps(y, xmm2); xmm2 = _mm256_mul_ps(y, xmm2);
xmm3 = _mm256_mul_ps(y, xmm3); xmm3 = _mm256_mul_ps(y, xmm3);
@ -704,24 +764,24 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_u_avx2(lv_32fc_t* out, const fl
/* Evaluate the first polynomial (0 <= x <= Pi/4) */ /* Evaluate the first polynomial (0 <= x <= Pi/4) */
__m256 z = _mm256_mul_ps(x, x); __m256 z = _mm256_mul_ps(x, x);
y = *(__m256*)_ps_coscof_p0; y = *(__m256 *)_ps_coscof_p0;
y = _mm256_mul_ps(y, z); y = _mm256_mul_ps(y, z);
y = _mm256_add_ps(y, *(__m256*)_ps_coscof_p1); y = _mm256_add_ps(y, *(__m256 *)_ps_coscof_p1);
y = _mm256_mul_ps(y, z); y = _mm256_mul_ps(y, z);
y = _mm256_add_ps(y, *(__m256*)_ps_coscof_p2); y = _mm256_add_ps(y, *(__m256 *)_ps_coscof_p2);
y = _mm256_mul_ps(y, z); y = _mm256_mul_ps(y, z);
y = _mm256_mul_ps(y, z); y = _mm256_mul_ps(y, z);
__m256 tmp = _mm256_mul_ps(z, *(__m256*)_ps_0p5); __m256 tmp = _mm256_mul_ps(z, *(__m256 *)_ps_0p5);
y = _mm256_sub_ps(y, tmp); y = _mm256_sub_ps(y, tmp);
y = _mm256_add_ps(y, *(__m256*)_ps_1); y = _mm256_add_ps(y, *(__m256 *)_ps_1);
/* Evaluate the second polynomial (Pi/4 <= x <= 0) */ /* Evaluate the second polynomial (Pi/4 <= x <= 0) */
__m256 y2 = *(__m256*)_ps_sincof_p0; __m256 y2 = *(__m256 *)_ps_sincof_p0;
y2 = _mm256_mul_ps(y2, z); y2 = _mm256_mul_ps(y2, z);
y2 = _mm256_add_ps(y2, *(__m256*)_ps_sincof_p1); y2 = _mm256_add_ps(y2, *(__m256 *)_ps_sincof_p1);
y2 = _mm256_mul_ps(y2, z); y2 = _mm256_mul_ps(y2, z);
y2 = _mm256_add_ps(y2, *(__m256*)_ps_sincof_p2); y2 = _mm256_add_ps(y2, *(__m256 *)_ps_sincof_p2);
y2 = _mm256_mul_ps(y2, z); y2 = _mm256_mul_ps(y2, z);
y2 = _mm256_mul_ps(y2, x); y2 = _mm256_mul_ps(y2, x);
y2 = _mm256_add_ps(y2, x); y2 = _mm256_add_ps(y2, x);
@ -744,27 +804,27 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_u_avx2(lv_32fc_t* out, const fl
s1 = _mm256_extractf128_ps(sine, 0); s1 = _mm256_extractf128_ps(sine, 0);
c1 = _mm256_extractf128_ps(cosine, 0); c1 = _mm256_extractf128_ps(cosine, 0);
aux = _mm_unpacklo_ps(c1, s1); aux = _mm_unpacklo_ps(c1, s1);
_mm_storeu_ps((float*)bPtr, aux); _mm_storeu_ps((float *)bPtr, aux);
bPtr += 2; bPtr += 2;
aux = _mm_unpackhi_ps(c1, s1); aux = _mm_unpackhi_ps(c1, s1);
_mm_storeu_ps((float*)bPtr, aux); _mm_storeu_ps((float *)bPtr, aux);
bPtr += 2; bPtr += 2;
s1 = _mm256_extractf128_ps(sine, 1); s1 = _mm256_extractf128_ps(sine, 1);
c1 = _mm256_extractf128_ps(cosine, 1); c1 = _mm256_extractf128_ps(cosine, 1);
aux = _mm_unpacklo_ps(c1, s1); aux = _mm_unpacklo_ps(c1, s1);
_mm_storeu_ps((float*)bPtr, aux); _mm_storeu_ps((float *)bPtr, aux);
bPtr += 2; bPtr += 2;
aux = _mm_unpackhi_ps(c1, s1); aux = _mm_unpackhi_ps(c1, s1);
_mm_storeu_ps((float*)bPtr, aux); _mm_storeu_ps((float *)bPtr, aux);
bPtr += 2; bPtr += 2;
eight_phases_reg = _mm256_add_ps(eight_phases_reg, eight_phases_inc_reg); eight_phases_reg = _mm256_add_ps(eight_phases_reg, eight_phases_inc_reg);
} }
_mm256_zeroupper(); _mm256_zeroupper();
_phase = _phase + phase_inc * (avx_iters * 8); _phase = _phase + phase_inc * (avx_iters * 8);
for(number = avx_iters * 8; number < num_points; number++) for (number = avx_iters * 8; number < num_points; number++)
{ {
out[number] = lv_cmake((float)cosf(_phase), (float)sinf(_phase) ); out[number] = lv_cmake((float)cosf(_phase), (float)sinf(_phase));
_phase += phase_inc; _phase += phase_inc;
} }
(*phase) = _phase; (*phase) = _phase;
@ -777,15 +837,17 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_u_avx2(lv_32fc_t* out, const fl
#include <arm_neon.h> #include <arm_neon.h>
/* Adapted from http://gruntthepeon.free.fr/ssemath/neon_mathfun.h, original code from Julien Pommier */ /* Adapted from http://gruntthepeon.free.fr/ssemath/neon_mathfun.h, original code from Julien Pommier */
/* Based on algorithms from the cephes library http://www.netlib.org/cephes/ */ /* Based on algorithms from the cephes library http://www.netlib.org/cephes/ */
static inline void volk_gnsssdr_s32f_sincos_32fc_neon(lv_32fc_t* out, const float phase_inc, float* phase, unsigned int num_points) static inline void volk_gnsssdr_s32f_sincos_32fc_neon(lv_32fc_t *out, const float phase_inc, float *phase, unsigned int num_points)
{ {
lv_32fc_t* bPtr = out; lv_32fc_t *bPtr = out;
const unsigned int neon_iters = num_points / 4; const unsigned int neon_iters = num_points / 4;
float _phase = (*phase); float _phase = (*phase);
__VOLK_ATTR_ALIGNED(16) float32_t four_phases[4] = { _phase, _phase + phase_inc, _phase + 2 * phase_inc, _phase + 3 * phase_inc }; __VOLK_ATTR_ALIGNED(16)
float32_t four_phases[4] = {_phase, _phase + phase_inc, _phase + 2 * phase_inc, _phase + 3 * phase_inc};
float four_inc = 4 * phase_inc; float four_inc = 4 * phase_inc;
__VOLK_ATTR_ALIGNED(16) float32_t four_phases_inc[4] = { four_inc, four_inc, four_inc, four_inc }; __VOLK_ATTR_ALIGNED(16)
float32_t four_phases_inc[4] = {four_inc, four_inc, four_inc, four_inc};
float32x4_t four_phases_reg = vld1q_f32(four_phases); float32x4_t four_phases_reg = vld1q_f32(four_phases);
float32x4_t four_phases_inc_reg = vld1q_f32(four_phases_inc); float32x4_t four_phases_inc_reg = vld1q_f32(four_phases_inc);
@ -808,7 +870,7 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_neon(lv_32fc_t* out, const floa
uint32x4_t emm2, poly_mask, sign_mask_sin, sign_mask_cos; uint32x4_t emm2, poly_mask, sign_mask_sin, sign_mask_cos;
for(;number < neon_iters; number++) for (; number < neon_iters; number++)
{ {
x = four_phases_reg; x = four_phases_reg;
@ -847,7 +909,7 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_neon(lv_32fc_t* out, const floa
/* Evaluate the first polynomial (0 <= x <= Pi/4) in y1, /* Evaluate the first polynomial (0 <= x <= Pi/4) in y1,
and the second polynomial (Pi/4 <= x <= 0) in y2 */ and the second polynomial (Pi/4 <= x <= 0) in y2 */
z = vmulq_f32(x,x); z = vmulq_f32(x, x);
y1 = vmulq_n_f32(z, c_coscof_p0); y1 = vmulq_n_f32(z, c_coscof_p0);
y2 = vmulq_n_f32(z, c_sincof_p0); y2 = vmulq_n_f32(z, c_sincof_p0);
@ -871,16 +933,16 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_neon(lv_32fc_t* out, const floa
result.val[1] = vbslq_f32(sign_mask_sin, vnegq_f32(ys), ys); result.val[1] = vbslq_f32(sign_mask_sin, vnegq_f32(ys), ys);
result.val[0] = vbslq_f32(sign_mask_cos, yc, vnegq_f32(yc)); result.val[0] = vbslq_f32(sign_mask_cos, yc, vnegq_f32(yc));
vst2q_f32((float32_t*)bPtr, result); vst2q_f32((float32_t *)bPtr, result);
bPtr += 4; bPtr += 4;
four_phases_reg = vaddq_f32(four_phases_reg, four_phases_inc_reg); four_phases_reg = vaddq_f32(four_phases_reg, four_phases_inc_reg);
} }
_phase = _phase + phase_inc * (neon_iters * 4); _phase = _phase + phase_inc * (neon_iters * 4);
for(number = neon_iters * 4; number < num_points; number++) for (number = neon_iters * 4; number < num_points; number++)
{ {
*bPtr++ = lv_cmake((float)cosf(_phase), (float)sinf(_phase) ); *bPtr++ = lv_cmake((float)cosf(_phase), (float)sinf(_phase));
_phase += phase_inc; _phase += phase_inc;
} }
(*phase) = _phase; (*phase) = _phase;
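The NEON variant avoids the unpack dance entirely: vst2q_f32 performs an element-wise interleaving store, so loading cosine into val[0] and sine into val[1] emits (cos, sin) pairs directly. As a standalone sketch (hypothetical helper, not in the commit):

#include <arm_neon.h>

/* Write 4 (cos, sin) pairs in one interleaved store. */
static void store_cos_sin_neon(float32x4_t yc, float32x4_t ys, float32_t *out)
{
    float32x4x2_t result;
    result.val[0] = yc;     /* real part: cosine */
    result.val[1] = ys;     /* imaginary part: sine */
    vst2q_f32(out, result); /* out = c0 s0 c1 s1 c2 s2 c3 s3 */
}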
View File
@ -49,7 +49,7 @@ static inline void volk_gnsssdr_s32f_sincospuppet_32fc_generic(lv_32fc_t* out, c
volk_gnsssdr_s32f_sincos_32fc_generic(out, phase_inc, phase, num_points); volk_gnsssdr_s32f_sincos_32fc_generic(out, phase_inc, phase, num_points);
} }
#endif /* LV_HAVE_GENERIC */ #endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_GENERIC #ifdef LV_HAVE_GENERIC
@ -60,7 +60,7 @@ static inline void volk_gnsssdr_s32f_sincospuppet_32fc_generic_fxpt(lv_32fc_t* o
volk_gnsssdr_s32f_sincos_32fc_generic_fxpt(out, phase_inc, phase, num_points); volk_gnsssdr_s32f_sincos_32fc_generic_fxpt(out, phase_inc, phase, num_points);
} }
#endif /* LV_HAVE_GENERIC */ #endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_SSE2 #ifdef LV_HAVE_SSE2
@ -70,7 +70,7 @@ static inline void volk_gnsssdr_s32f_sincospuppet_32fc_a_sse2(lv_32fc_t* out, co
phase[0] = 3; phase[0] = 3;
volk_gnsssdr_s32f_sincos_32fc_a_sse2(out, phase_inc, phase, num_points); volk_gnsssdr_s32f_sincos_32fc_a_sse2(out, phase_inc, phase, num_points);
} }
#endif /* LV_HAVE_SSE2 */ #endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_SSE2 #ifdef LV_HAVE_SSE2
@ -80,7 +80,7 @@ static inline void volk_gnsssdr_s32f_sincospuppet_32fc_u_sse2(lv_32fc_t* out, co
phase[0] = 3; phase[0] = 3;
volk_gnsssdr_s32f_sincos_32fc_u_sse2(out, phase_inc, phase, num_points); volk_gnsssdr_s32f_sincos_32fc_u_sse2(out, phase_inc, phase, num_points);
} }
#endif /* LV_HAVE_SSE2 */ #endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_AVX2 #ifdef LV_HAVE_AVX2
@ -90,7 +90,7 @@ static inline void volk_gnsssdr_s32f_sincospuppet_32fc_a_avx2(lv_32fc_t* out, co
phase[0] = 3; phase[0] = 3;
volk_gnsssdr_s32f_sincos_32fc_a_avx2(out, phase_inc, phase, num_points); volk_gnsssdr_s32f_sincos_32fc_a_avx2(out, phase_inc, phase, num_points);
} }
#endif /* LV_HAVE_AVX2 */ #endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_AVX2 #ifdef LV_HAVE_AVX2
@ -100,7 +100,7 @@ static inline void volk_gnsssdr_s32f_sincospuppet_32fc_u_avx2(lv_32fc_t* out, co
phase[0] = 3; phase[0] = 3;
volk_gnsssdr_s32f_sincos_32fc_u_avx2(out, phase_inc, phase, num_points); volk_gnsssdr_s32f_sincos_32fc_u_avx2(out, phase_inc, phase, num_points);
} }
#endif /* LV_HAVE_AVX2 */ #endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_NEON #ifdef LV_HAVE_NEON
@ -110,6 +110,6 @@ static inline void volk_gnsssdr_s32f_sincospuppet_32fc_neon(lv_32fc_t* out, cons
phase[0] = 3; phase[0] = 3;
volk_gnsssdr_s32f_sincos_32fc_neon(out, phase_inc, phase, num_points); volk_gnsssdr_s32f_sincos_32fc_neon(out, phase_inc, phase, num_points);
} }
#endif /* LV_HAVE_NEON */ #endif /* LV_HAVE_NEON */
#endif /* INCLUDED_volk_gnsssdr_s32f_sincospuppet_32fc_H */ #endif /* INCLUDED_volk_gnsssdr_s32f_sincospuppet_32fc_H */
View File
@ -38,32 +38,31 @@
// for puppets we need to get all the func_variants for the puppet and just // for puppets we need to get all the func_variants for the puppet and just
// keep track of the actual function name to write to results // keep track of the actual function name to write to results
#define VOLK_INIT_PUPP(func, puppet_master_func, test_params)\ #define VOLK_INIT_PUPP(func, puppet_master_func, test_params) \
volk_gnsssdr_test_case_t(func##_get_func_desc(), (void(*)())func##_manual, std::string(#func),\ volk_gnsssdr_test_case_t(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), \
std::string(#puppet_master_func), test_params) std::string(#puppet_master_func), test_params)
#define VOLK_INIT_TEST(func, test_params)\ #define VOLK_INIT_TEST(func, test_params) \
volk_gnsssdr_test_case_t(func##_get_func_desc(), (void(*)())func##_manual, std::string(#func),\ volk_gnsssdr_test_case_t(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), \
test_params) test_params)
#define QA(test) test_cases.push_back(test); #define QA(test) test_cases.push_back(test);
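For readers new to the puppet machinery: VOLK_INIT_PUPP records the puppet's function descriptor together with the real kernel's name, so results are filed under the latter. A hand expansion of one QA line (approximate; the _manual and _get_func_desc symbols are generated by VOLK for every kernel):

// QA(VOLK_INIT_PUPP(volk_gnsssdr_s32f_sincospuppet_32fc, volk_gnsssdr_s32f_sincos_32fc, test_params))
// expands, roughly, to:
test_cases.push_back(
    volk_gnsssdr_test_case_t(
        volk_gnsssdr_s32f_sincospuppet_32fc_get_func_desc(),
        (void (*)())volk_gnsssdr_s32f_sincospuppet_32fc_manual,
        std::string("volk_gnsssdr_s32f_sincospuppet_32fc"),
        std::string("volk_gnsssdr_s32f_sincos_32fc"),
        test_params));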
std::vector<volk_gnsssdr_test_case_t> init_test_list(volk_gnsssdr_test_params_t test_params) std::vector<volk_gnsssdr_test_case_t> init_test_list(volk_gnsssdr_test_params_t test_params)
{ {
// Some kernels need a lower tolerance // Some kernels need a lower tolerance
volk_gnsssdr_test_params_t test_params_inacc = volk_gnsssdr_test_params_t(1e-3, test_params.scalar(), volk_gnsssdr_test_params_t test_params_inacc = volk_gnsssdr_test_params_t(1e-3, test_params.scalar(),
test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex()); test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex());
volk_gnsssdr_test_params_t test_params_int1 = volk_gnsssdr_test_params_t(1, test_params.scalar(), volk_gnsssdr_test_params_t test_params_int1 = volk_gnsssdr_test_params_t(1, test_params.scalar(),
test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex()); test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex());
// some others need more iterations ***** ADDED BY GNSS-SDR // some others need more iterations ***** ADDED BY GNSS-SDR
volk_gnsssdr_test_params_t test_params_more_iters = volk_gnsssdr_test_params_t(test_params.tol(), test_params.scalar(), volk_gnsssdr_test_params_t test_params_more_iters = volk_gnsssdr_test_params_t(test_params.tol(), test_params.scalar(),
test_params.vlen(), 100000, test_params.benchmark_mode(), test_params.kernel_regex()); test_params.vlen(), 100000, test_params.benchmark_mode(), test_params.kernel_regex());
// ... or more tolerance ***** ADDED BY GNSS-SDR // ... or more tolerance ***** ADDED BY GNSS-SDR
volk_gnsssdr_test_params_t test_params_int16 = volk_gnsssdr_test_params_t(16, test_params.scalar(), volk_gnsssdr_test_params_t test_params_int16 = volk_gnsssdr_test_params_t(16, test_params.scalar(),
test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex()); test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex());
volk_gnsssdr_test_params_t test_params_inacc2 = volk_gnsssdr_test_params_t(2e-1, test_params.scalar(), volk_gnsssdr_test_params_t test_params_inacc2 = volk_gnsssdr_test_params_t(2e-1, test_params.scalar(),
test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex()); test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex());
std::vector<volk_gnsssdr_test_case_t> test_cases; std::vector<volk_gnsssdr_test_case_t> test_cases;
@ -98,8 +97,7 @@ std::vector<volk_gnsssdr_test_case_t> init_test_list(volk_gnsssdr_test_params_t
QA(VOLK_INIT_PUPP(volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic, volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn, test_params_int16)) QA(VOLK_INIT_PUPP(volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic, volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn, test_params_int16))
QA(VOLK_INIT_PUPP(volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic, volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn, test_params_int16)) QA(VOLK_INIT_PUPP(volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic, volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn, test_params_int16))
QA(VOLK_INIT_PUPP(volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc, volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn, test_params_int1)) QA(VOLK_INIT_PUPP(volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc, volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn, test_params_int1))
QA(VOLK_INIT_PUPP(volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc, volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn, test_params_int1)) QA(VOLK_INIT_PUPP(volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc, volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn, test_params_int1));
;
return test_cases; return test_cases;
} }
View File
@@ -25,17 +25,18 @@
#include "volk_gnsssdr/volk_gnsssdr_complex.h"  // for lv_32fc_t
#include "volk_gnsssdr/volk_gnsssdr.h"          // for volk_gnsssdr_func_desc_t
#include <cstdbool>                             // for bool, false
#include <cstdlib>                              // for NULL
#include <map>                                  // for map
#include <string>                               // for string, basic_string
#include <vector>                               // for vector

/************************************************
 * VOLK QA type definitions                     *
 ************************************************/
struct volk_gnsssdr_type_t
{
    bool is_float;
    bool is_scalar;
    bool is_signed;
@@ -44,80 +45,78 @@ struct volk_gnsssdr_type_t {
    std::string str;
};

class volk_gnsssdr_test_time_t
{
public:
    std::string name;
    double time;
    std::string units;
    bool pass;
};

class volk_gnsssdr_test_results_t
{
public:
    std::string name;
    std::string config_name;
    unsigned int vlen;
    unsigned int iter;
    std::map<std::string, volk_gnsssdr_test_time_t> results;
    std::string best_arch_a;
    std::string best_arch_u;
};

class volk_gnsssdr_test_params_t
{
private:
    float _tol;
    lv_32fc_t _scalar;
    unsigned int _vlen;
    unsigned int _iter;
    bool _benchmark_mode;
    std::string _kernel_regex;

public:
    // ctor
    volk_gnsssdr_test_params_t(float tol, lv_32fc_t scalar, unsigned int vlen, unsigned int iter,
        bool benchmark_mode, std::string kernel_regex) : _tol(tol), _scalar(scalar), _vlen(vlen), _iter(iter), _benchmark_mode(benchmark_mode), _kernel_regex(kernel_regex){};
    // setters
    void set_tol(float tol) { _tol = tol; };
    void set_scalar(lv_32fc_t scalar) { _scalar = scalar; };
    void set_vlen(unsigned int vlen) { _vlen = vlen; };
    void set_iter(unsigned int iter) { _iter = iter; };
    void set_benchmark(bool benchmark) { _benchmark_mode = benchmark; };
    void set_regex(std::string regex) { _kernel_regex = regex; };
    // getters
    float tol() { return _tol; };
    lv_32fc_t scalar() { return _scalar; };
    unsigned int vlen() { return _vlen; };
    unsigned int iter() { return _iter; };
    bool benchmark_mode() { return _benchmark_mode; };
    std::string kernel_regex() { return _kernel_regex; };
};

class volk_gnsssdr_test_case_t
{
private:
    volk_gnsssdr_func_desc_t _desc;
    void (*_kernel_ptr)();
    std::string _name;
    volk_gnsssdr_test_params_t _test_parameters;
    std::string _puppet_master_name;

public:
    volk_gnsssdr_func_desc_t desc() { return _desc; };
    void (*kernel_ptr())() { return _kernel_ptr; };
    std::string name() { return _name; };
    std::string puppet_master_name() { return _puppet_master_name; };
    volk_gnsssdr_test_params_t test_parameters() { return _test_parameters; };
    // normal ctor
    volk_gnsssdr_test_case_t(volk_gnsssdr_func_desc_t desc, void (*kernel_ptr)(), std::string name,
        volk_gnsssdr_test_params_t test_parameters) : _desc(desc), _kernel_ptr(kernel_ptr), _name(name), _test_parameters(test_parameters), _puppet_master_name("NULL"){};
    // ctor for puppets
    volk_gnsssdr_test_case_t(volk_gnsssdr_func_desc_t desc, void (*kernel_ptr)(), std::string name,
        std::string puppet_master_name, volk_gnsssdr_test_params_t test_parameters) : _desc(desc), _kernel_ptr(kernel_ptr), _name(name), _test_parameters(test_parameters), _puppet_master_name(puppet_master_name){};
};

/************************************************
@@ -130,58 +129,57 @@ void random_floats(float *buf, unsigned n);
bool run_volk_gnsssdr_tests(
    volk_gnsssdr_func_desc_t,
    void (*)(),
    std::string,
    volk_gnsssdr_test_params_t,
    std::vector<volk_gnsssdr_test_results_t> *results = NULL,
    std::string puppet_master_name = "NULL");

bool run_volk_gnsssdr_tests(
    volk_gnsssdr_func_desc_t,
    void (*)(),
    std::string,
    float,
    lv_32fc_t,
    unsigned int,
    unsigned int,
    std::vector<volk_gnsssdr_test_results_t> *results = NULL,
    std::string puppet_master_name = "NULL",
    bool benchmark_mode = false);

#define VOLK_RUN_TESTS(func, tol, scalar, len, iter)                                  \
    BOOST_AUTO_TEST_CASE(func##_test)                                                 \
    {                                                                                 \
        BOOST_CHECK_EQUAL(run_volk_gnsssdr_tests(                                     \
                              func##_get_func_desc(), (void (*)())func##_manual,      \
                              std::string(#func), tol, scalar, len, iter, 0, "NULL"), \
            0);                                                                       \
    }
#define VOLK_PROFILE(func, test_params, results) run_volk_gnsssdr_tests(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), test_params, results, "NULL")
#define VOLK_PUPPET_PROFILE(func, puppet_master_func, test_params, results) run_volk_gnsssdr_tests(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), test_params, results, std::string(#puppet_master_func))

typedef void (*volk_gnsssdr_fn_1arg)(void *, unsigned int, const char *);  //one input, operate in place
typedef void (*volk_gnsssdr_fn_2arg)(void *, void *, unsigned int, const char *);
typedef void (*volk_gnsssdr_fn_3arg)(void *, void *, void *, unsigned int, const char *);
typedef void (*volk_gnsssdr_fn_4arg)(void *, void *, void *, void *, unsigned int, const char *);
typedef void (*volk_gnsssdr_fn_1arg_s32f)(void *, float, unsigned int, const char *);  //one input vector, one scalar float input
typedef void (*volk_gnsssdr_fn_2arg_s32f)(void *, void *, float, unsigned int, const char *);
typedef void (*volk_gnsssdr_fn_3arg_s32f)(void *, void *, void *, float, unsigned int, const char *);
typedef void (*volk_gnsssdr_fn_1arg_s32fc)(void *, lv_32fc_t, unsigned int, const char *);  //one input vector, one scalar float input
typedef void (*volk_gnsssdr_fn_2arg_s32fc)(void *, void *, lv_32fc_t, unsigned int, const char *);
typedef void (*volk_gnsssdr_fn_3arg_s32fc)(void *, void *, void *, lv_32fc_t, unsigned int, const char *);
//ADDED BY GNSS-SDR. START
typedef void (*volk_gnsssdr_fn_1arg_s8i)(void *, char, unsigned int, const char *);  //one input vector, one scalar char input
typedef void (*volk_gnsssdr_fn_2arg_s8i)(void *, void *, char, unsigned int, const char *);
typedef void (*volk_gnsssdr_fn_3arg_s8i)(void *, void *, void *, char, unsigned int, const char *);
typedef void (*volk_gnsssdr_fn_1arg_s8ic)(void *, lv_8sc_t, unsigned int, const char *);  //one input vector, one scalar lv_8sc_t vector input
typedef void (*volk_gnsssdr_fn_2arg_s8ic)(void *, void *, lv_8sc_t, unsigned int, const char *);
typedef void (*volk_gnsssdr_fn_3arg_s8ic)(void *, void *, void *, lv_8sc_t, unsigned int, const char *);
typedef void (*volk_gnsssdr_fn_1arg_s16ic)(void *, lv_16sc_t, unsigned int, const char *);  //one input vector, one scalar lv_16sc_t vector input
typedef void (*volk_gnsssdr_fn_2arg_s16ic)(void *, void *, lv_16sc_t, unsigned int, const char *);
typedef void (*volk_gnsssdr_fn_3arg_s16ic)(void *, void *, void *, lv_16sc_t, unsigned int, const char *);
//ADDED BY GNSS-SDR. END

#endif  // GNSS_SDR_VOLK_QA_UTILS_H
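The parameter and test-case classes above compose straightforwardly; a minimal construction sketch (argument values invented for illustration):

#include "qa_utils.h"

volk_gnsssdr_test_params_t make_default_params()
{
    return volk_gnsssdr_test_params_t(1e-6f,  // tolerance
        lv_32fc_t(327.0f, 0.0f),              // scalar for kernels that take one
        131071,                               // vector length
        1987,                                 // iterations
        false,                                // benchmark mode off
        ".*");                                // kernel regex: run everything
}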

View File

@@ -18,16 +18,16 @@
 */

#include "kernel_tests.h"                       // for init_test_list
#include "qa_utils.h"                           // for volk_gnsssdr_test_case_t, volk_gnsssdr_test_results_t
#include "volk_gnsssdr/volk_gnsssdr_complex.h"  // for lv_32fc_t
#include <cstdbool>                             // for bool, false, true
#include <iostream>                             // for operator<<, basic_ostream, endl, char...
#include <fstream>                              // IWYU pragma: keep
#include <map>                                  // for map, map<>::iterator, _Rb_tree_iterator
#include <string>                               // for string, operator<<
#include <utility>                              // for pair
#include <vector>                               // for vector

void print_qa_xml(std::vector<volk_gnsssdr_test_results_t> results, unsigned int nfails);

@@ -49,38 +49,44 @@ int main()
    std::vector<std::string> qa_failures;
    std::vector<volk_gnsssdr_test_results_t> results;

    // Test every kernel reporting failures when they occur
    for (unsigned int ii = 0; ii < test_cases.size(); ++ii)
        {
            bool qa_result = false;
            volk_gnsssdr_test_case_t test_case = test_cases[ii];
            try
                {
                    qa_result = run_volk_gnsssdr_tests(test_case.desc(), test_case.kernel_ptr(), test_case.name(),
                        test_case.test_parameters(), &results, test_case.puppet_master_name());
                }
            catch (...)
                {
                    // TODO: what exceptions might we need to catch and how do we handle them?
                    std::cerr << "Exception found on kernel: " << test_case.name() << std::endl;
                    qa_result = false;
                }

            if (qa_result)
                {
                    std::cerr << "Failure on " << test_case.name() << std::endl;
                    qa_failures.push_back(test_case.name());
                }
        }

    // Generate XML results
    print_qa_xml(results, qa_failures.size());

    // Summarize QA results
    std::cerr << "Kernel QA finished: " << qa_failures.size() << " failures out of "
              << test_cases.size() << " tests." << std::endl;
    if (qa_failures.size() > 0)
        {
            std::cerr << "The following kernels failed QA:" << std::endl;
            for (unsigned int ii = 0; ii < qa_failures.size(); ++ii)
                {
                    std::cerr << "    " << qa_failures[ii] << std::endl;
                }
            qa_ret_val = 1;
        }

    return qa_ret_val;
}

@@ -95,34 +101,34 @@ void print_qa_xml(std::vector<volk_gnsssdr_test_results_t> results, unsigned int
    qa_file.open(".unittest/kernels.xml");

    qa_file << "<?xml version=\"1.0\" encoding=\"UTF-8\"?>" << std::endl;
    qa_file << "<testsuites name=\"kernels\" "
            << "tests=\"" << results.size() << "\" "
            << "failures=\"" << nfails << "\" id=\"1\">" << std::endl;

    // Results are in a vector by kernel. Each element has a result
    // map containing time and arch name with test result
    for (unsigned int ii = 0; ii < results.size(); ++ii)
        {
            volk_gnsssdr_test_results_t result = results[ii];
            qa_file << "  <testsuite name=\"" << result.name << "\">" << std::endl;

            std::map<std::string, volk_gnsssdr_test_time_t>::iterator kernel_time_pair;
            for (kernel_time_pair = result.results.begin(); kernel_time_pair != result.results.end(); ++kernel_time_pair)
                {
                    volk_gnsssdr_test_time_t test_time = kernel_time_pair->second;
                    qa_file << "    <testcase name=\"" << test_time.name << "\" "
                            << "classname=\"" << result.name << "\" "
                            << "time=\"" << test_time.time << "\">" << std::endl;
                    if (!test_time.pass)
                        qa_file << "      <failure "
                                << "message=\"fail on arch " << test_time.name << "\">"
                                << "</failure>" << std::endl;
                    qa_file << "    </testcase>" << std::endl;
                }
            qa_file << "  </testsuite>" << std::endl;
        }
    qa_file << "</testsuites>" << std::endl;
    qa_file.close();
}
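For reference, the file written to .unittest/kernels.xml by the code above has this shape (kernel name, arch names, and timings below are made up for illustration):

<?xml version="1.0" encoding="UTF-8"?>
<testsuites name="kernels" tests="1" failures="1" id="1">
  <testsuite name="volk_gnsssdr_8i_x2_multiply_8i">
    <testcase name="generic" classname="volk_gnsssdr_8i_x2_multiply_8i" time="1.07">
    </testcase>
    <testcase name="u_sse3" classname="volk_gnsssdr_8i_x2_multiply_8i" time="0.42">
      <failure message="fail on arch u_sse3"></failure>
    </testcase>
  </testsuite>
</testsuites>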

View File

@@ -43,15 +43,16 @@ void *volk_gnsssdr_malloc(size_t size, size_t alignment)
        return malloc(size);

    int err = posix_memalign(&ptr, alignment, size);
    if (err == 0)
        {
            return ptr;
        }
    else
        {
            fprintf(stderr,
                "VOLK_GNSSSDR: Error allocating memory "
                "(posix_memalign: error %d: %s)\n",
                err, strerror(err));
            return NULL;
        }
}

@@ -68,7 +69,7 @@ void volk_gnsssdr_free(void *ptr)
void *volk_gnsssdr_malloc(size_t size, size_t alignment)
{
    void *ptr = _aligned_malloc(size, alignment);
    if (ptr == NULL)
        {
            fprintf(stderr, "VOLK_GNSSSDR: Error allocating memory (_aligned_malloc)\n");
        }

@@ -81,7 +82,7 @@ void volk_gnsssdr_free(void *ptr)
}

// No standard handlers; we'll do it ourselves.
#else  // _POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600 || HAVE_POSIX_MEMALIGN

struct block_info
{

@@ -102,7 +103,7 @@ volk_gnsssdr_malloc(size_t size, size_t alignment)
    real = malloc(size + (2 * alignment - 1));

    /* Get pointer to the various zones */
    user = (void *)((((uintptr_t)real) + sizeof(struct block_info) + alignment - 1) & ~(alignment - 1));
    info = (struct block_info *)(((uintptr_t)user) - sizeof(struct block_info));

    /* Store the info for the free */

@@ -112,8 +113,7 @@ volk_gnsssdr_malloc(size_t size, size_t alignment)
    return user;
}

void volk_gnsssdr_free(void *ptr)
{
    struct block_info *info;

@@ -124,6 +124,6 @@ volk_gnsssdr_free(void *ptr)
    free(info->real);
}

#endif  // _POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600 || HAVE_POSIX_MEMALIGN
//#endif // _ISOC11_SOURCE
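The fallback branch above over-allocates and rounds the user pointer up with the usual power-of-two mask trick; a self-contained sketch of just that arithmetic (names are mine, not the library's):

#include <cstdint>
#include <cstdio>
#include <cstdlib>

// Round p up to the next multiple of alignment (alignment must be a power of
// two), the same (x + align - 1) & ~(align - 1) arithmetic used above.
static void *align_up(void *p, std::uintptr_t alignment)
{
    return (void *)(((std::uintptr_t)p + alignment - 1) & ~(alignment - 1));
}

int main()
{
    void *real = std::malloc(64 + 31);  // over-allocate by alignment - 1
    void *user = align_up(real, 32);    // user is now 32-byte aligned
    std::printf("%p -> %p\n", real, user);
    std::free(real);  // always free the original pointer, as volk_gnsssdr_free does
    return 0;
}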

View File

@@ -26,16 +26,17 @@ void volk_gnsssdr_get_config_path(char *path)
{
    if (!path) return;
    const char *suffix = "/.volk_gnsssdr/volk_gnsssdr_config";
    const char *suffix2 = "/volk_gnsssdr/volk_gnsssdr_config";  // non-hidden
    char *home = NULL;

    //allows config redirection via env variable
    home = getenv("VOLK_CONFIGPATH");
    if (home != NULL)
        {
            strncpy(path, home, 512);
            strcat(path, suffix2);
            return;
        }

    if (home == NULL) home = getenv("HOME");
    if (home == NULL) home = getenv("APPDATA");

@@ -57,16 +58,16 @@ size_t volk_gnsssdr_load_preferences(volk_gnsssdr_arch_pref_t **prefs_res)
    //get the config path
    volk_gnsssdr_get_config_path(path);
    if (!path[0]) return n_arch_prefs;  //no prefs found

    config_file = fopen(path, "r");
    if (!config_file) return n_arch_prefs;  //no prefs found

    //reset the file pointer and write the prefs into volk_gnsssdr_arch_prefs
    while (fgets(line, sizeof(line), config_file) != NULL)
        {
            prefs = (volk_gnsssdr_arch_pref_t *)realloc(prefs, (n_arch_prefs + 1) * sizeof(*prefs));
            volk_gnsssdr_arch_pref_t *p = prefs + n_arch_prefs;
            if (sscanf(line, "%s %s %s", p->name, p->impl_a, p->impl_u) == 3 && !strncmp(p->name, "volk_gnsssdr_", 5))
                {
                    n_arch_prefs++;
                }
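Each preference line is parsed as three whitespace-separated tokens (kernel name, aligned implementation, unaligned implementation), so a hand-written volk_gnsssdr_config might look like this (implementation names illustrative):

volk_gnsssdr_8i_x2_multiply_8i a_sse3 u_sse3
volk_gnsssdr_32fc_x2_multiply_32fc generic generic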

View File

@@ -29,7 +29,7 @@
inline unsigned __popcnt(unsigned num)
{
    unsigned pop = 0;
    while (num)
        {
            if (num & 0x1) pop++;
            num >>= 1;

@@ -39,15 +39,15 @@ inline unsigned __popcnt(unsigned num)
#endif

int volk_gnsssdr_get_index(
    const char *impl_names[],  //list of implementations by name
    const size_t n_impls,      //number of implementations available
    const char *impl_name      //the implementation name to find
)
{
    unsigned int i;
    for (i = 0; i < n_impls; i++)
        {
            if (!strncmp(impl_names[i], impl_name, 20))
                {
                    return i;
                }

@@ -55,24 +55,24 @@ int volk_gnsssdr_get_index(
    //TODO return -1;
    //something terrible should happen here
    fprintf(stderr, "VOLK_GNSSSDR warning: no arch found, returning generic impl\n");
    return volk_gnsssdr_get_index(impl_names, n_impls, "generic");  //but we'll fake it for now
}

int volk_gnsssdr_rank_archs(
    const char *kern_name,     //name of the kernel to rank
    const char *impl_names[],  //list of implementations by name
    const int *impl_deps,      //requirement mask per implementation
    const bool *alignment,     //alignment status of each implementation
    size_t n_impls,            //number of implementations available
    const bool align           //if false, filter aligned implementations
)
{
    size_t i;
    static volk_gnsssdr_arch_pref_t *volk_gnsssdr_arch_prefs;
    static size_t n_arch_prefs = 0;
    static int prefs_loaded = 0;
    if (!prefs_loaded)
        {
            n_arch_prefs = volk_gnsssdr_load_preferences(&volk_gnsssdr_arch_prefs);
            prefs_loaded = 1;

@@ -81,17 +81,17 @@ int volk_gnsssdr_rank_archs(
    // If we've defined VOLK_GENERIC to be anything, always return the
    // 'generic' kernel. Used in GR's QA code.
    char *gen_env = getenv("VOLK_GENERIC");
    if (gen_env)
        {
            return volk_gnsssdr_get_index(impl_names, n_impls, "generic");
        }

    //now look for the function name in the prefs list
    for (i = 0; i < n_arch_prefs; i++)
        {
            if (!strncmp(kern_name, volk_gnsssdr_arch_prefs[i].name, sizeof(volk_gnsssdr_arch_prefs[i].name)))  //found it
                {
                    const char *impl_name = align ? volk_gnsssdr_arch_prefs[i].impl_a : volk_gnsssdr_arch_prefs[i].impl_u;
                    return volk_gnsssdr_get_index(impl_names, n_impls, impl_name);
                }
        }

@@ -101,7 +101,7 @@ int volk_gnsssdr_rank_archs(
    size_t best_index_u = 0;
    int best_value_a = -1;
    int best_value_u = -1;
    for (i = 0; i < n_impls; i++)
        {
            const signed val = __popcnt(impl_deps[i]);
            if (alignment[i] && val > best_value_a)
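The selection loop this hunk opens scores each implementation by the population count of its dependency mask, so the candidate requiring the most architecture bits wins; a toy version of that ranking (masks invented):

#include <cstdio>

static unsigned popcnt(unsigned num)  // same bit-counting loop as __popcnt above
{
    unsigned pop = 0;
    while (num)
        {
            if (num & 0x1) pop++;
            num >>= 1;
        }
    return pop;
}

int main()
{
    // Hypothetical dependency masks: generic, SSE-class, AVX-class
    const unsigned impl_deps[3] = {0x0, 0x3, 0x7};
    unsigned best = 0;
    for (unsigned i = 1; i < 3; i++)
        {
            if (popcnt(impl_deps[i]) > popcnt(impl_deps[best])) best = i;
        }
    std::printf("best impl index: %u\n", best);  // prints 2
    return 0;
}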

View File

@@ -23,23 +23,24 @@
#include <stdbool.h>

#ifdef __cplusplus
extern "C"
{
#endif

int volk_gnsssdr_get_index(
    const char *impl_names[],  //list of implementations by name
    const size_t n_impls,      //number of implementations available
    const char *impl_name      //the implementation name to find
);

int volk_gnsssdr_rank_archs(
    const char *kern_name,     //name of the kernel to rank
    const char *impl_names[],  //list of implementations by name
    const int *impl_deps,      //requirement mask per implementation
    const bool *alignment,     //alignment status of each implementation
    size_t n_impls,            //number of implementations available
    const bool align           //if false, filter aligned implementations
);

#ifdef __cplusplus
}

View File

@@ -31,80 +31,90 @@ static intptr_t __alignment_mask = 0;
struct volk_gnsssdr_machine *get_machine(void)
{
    extern struct volk_gnsssdr_machine *volk_gnsssdr_machines[];
    extern unsigned int n_volk_gnsssdr_machines;
    static struct volk_gnsssdr_machine *machine = NULL;

    if (machine != NULL)
        return machine;
    else
        {
            unsigned int max_score = 0;
            unsigned int i;
            struct volk_gnsssdr_machine *max_machine = NULL;
            for (i = 0; i < n_volk_gnsssdr_machines; i++)
                {
                    if (!(volk_gnsssdr_machines[i]->caps & (~volk_gnsssdr_get_lvarch())))
                        {
                            if (volk_gnsssdr_machines[i]->caps > max_score)
                                {
                                    max_score = volk_gnsssdr_machines[i]->caps;
                                    max_machine = volk_gnsssdr_machines[i];
                                }
                        }
                }
            machine = max_machine;
            //printf("Using Volk machine: %s\n", machine->name);
            __alignment = machine->alignment;
            __alignment_mask = (intptr_t)(__alignment - 1);
            return machine;
        }
}

void volk_gnsssdr_list_machines(void)
{
    extern struct volk_gnsssdr_machine *volk_gnsssdr_machines[];
    extern unsigned int n_volk_gnsssdr_machines;

    unsigned int i;
    for (i = 0; i < n_volk_gnsssdr_machines; i++)
        {
            if (!(volk_gnsssdr_machines[i]->caps & (~volk_gnsssdr_get_lvarch())))
                {
                    printf("%s;", volk_gnsssdr_machines[i]->name);
                }
        }

    printf("\n");
}

const char *volk_gnsssdr_get_machine(void)
{
    extern struct volk_gnsssdr_machine *volk_gnsssdr_machines[];
    extern unsigned int n_volk_gnsssdr_machines;
    static struct volk_gnsssdr_machine *machine = NULL;

    if (machine != NULL)
        return machine->name;
    else
        {
            unsigned int max_score = 0;
            unsigned int i;
            struct volk_gnsssdr_machine *max_machine = NULL;
            for (i = 0; i < n_volk_gnsssdr_machines; i++)
                {
                    if (!(volk_gnsssdr_machines[i]->caps & (~volk_gnsssdr_get_lvarch())))
                        {
                            if (volk_gnsssdr_machines[i]->caps > max_score)
                                {
                                    max_score = volk_gnsssdr_machines[i]->caps;
                                    max_machine = volk_gnsssdr_machines[i];
                                }
                        }
                }
            machine = max_machine;
            return machine->name;
        }
}

size_t volk_gnsssdr_get_alignment(void)
{
    get_machine();  //ensures alignment is set
    return __alignment;
}

bool volk_gnsssdr_is_aligned(const void *ptr)
{
    return ((intptr_t)(ptr)&__alignment_mask) == 0;
}

#define LV_HAVE_GENERIC

@@ -113,13 +123,12 @@ bool volk_gnsssdr_is_aligned(const void *ptr)
%for kern in kernels:

%if kern.has_dispatcher:
#include <volk_gnsssdr/${kern.name}.h>  //pulls in the dispatcher
%endif

static inline void __${kern.name}_d(${kern.arglist_full})
{
    % if kern.has_dispatcher : ${kern.name} _dispatcher(${kern.arglist_names});
    return;
    %endif

@@ -131,41 +140,41 @@ static inline void __${kern.name}_d(${kern.arglist_full})
    %endfor
    0<% end_open_parens = ')'*num_open_parens %>${end_open_parens}
    )){
        ${kern.name} _a(${kern.arglist_names});
    }
    else{
        ${kern.name} _u(${kern.arglist_names});
    }
}

static inline void __init_${kern.name}(void)
{
    const char *name = get_machine()->${kern.name} _name;
    const char **impl_names = get_machine()->${kern.name} _impl_names;
    const int *impl_deps = get_machine()->${kern.name} _impl_deps;
    const bool *alignment = get_machine()->${kern.name} _impl_alignment;
    const size_t n_impls = get_machine()->${kern.name} _n_impls;
    const size_t index_a = volk_gnsssdr_rank_archs(name, impl_names, impl_deps, alignment, n_impls, true /*aligned*/);
    const size_t index_u = volk_gnsssdr_rank_archs(name, impl_names, impl_deps, alignment, n_impls, false /*unaligned*/);
    ${kern.name} _a = get_machine()->${kern.name} _impls[index_a];
    ${kern.name} _u = get_machine()->${kern.name} _impls[index_u];
    assert(${kern.name} _a);
    assert(${kern.name} _u);
    ${kern.name} = &__${kern.name} _d;
}

static inline void __${kern.name} _a(${kern.arglist_full})
{
    __init_${kern.name}();
    ${kern.name} _a(${kern.arglist_names});
}

static inline void __${kern.name} _u(${kern.arglist_full})
{
    __init_${kern.name}();
    ${kern.name} _u(${kern.arglist_names});
}

static inline void __${kern.name}(${kern.arglist_full})

@@ -174,34 +183,32 @@ static inline void __${kern.name}(${kern.arglist_full})
    ${kern.name}(${kern.arglist_names});
}

${kern.pname} ${kern.name} _a = &__${kern.name} _a;
${kern.pname} ${kern.name} _u = &__${kern.name} _u;
${kern.pname} ${kern.name} = &__${kern.name};

void ${kern.name} _manual(${kern.arglist_full}, const char *impl_name)
{
    const int index = volk_gnsssdr_get_index(
        get_machine()->${kern.name} _impl_names,
        get_machine()->${kern.name} _n_impls,
        impl_name);
    get_machine()->${kern.name} _impls[index](
        ${kern.arglist_names});
}

volk_gnsssdr_func_desc_t ${kern.name} _get_func_desc(void)
{
    const char **impl_names = get_machine()->${kern.name} _impl_names;
    const int *impl_deps = get_machine()->${kern.name} _impl_deps;
    const bool *alignment = get_machine()->${kern.name} _impl_alignment;
    const size_t n_impls = get_machine()->${kern.name} _n_impls;

    volk_gnsssdr_func_desc_t desc = {
        impl_names,
        impl_deps,
        alignment,
        n_impls};
    return desc;
}

% endfor
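Stripped of the Mako templating, each generated kernel entry point follows a lazy-binding trampoline: the public function pointer initially targets a bootstrap that ranks the implementations once, rebinds the pointer, and forwards the call. A self-contained sketch of the pattern (all names invented):

#include <cstdio>

typedef void (*kernel_fn)(int);

static void impl_generic(int x) { std::printf("generic(%d)\n", x); }
static void impl_fast(int x) { std::printf("fast(%d)\n", x); }

static const bool machine_has_simd = true;  // stand-in for volk_gnsssdr_rank_archs()

static void my_kernel_bootstrap(int x);
kernel_fn my_kernel = &my_kernel_bootstrap;  // like ${kern.pname} ${kern.name} = &__${kern.name};

// First call lands here: choose an implementation once, rebind, forward.
static void my_kernel_bootstrap(int x)
{
    my_kernel = machine_has_simd ? impl_fast : impl_generic;
    my_kernel(x);
}

int main()
{
    my_kernel(1);  // bootstraps, then dispatches
    my_kernel(2);  // goes straight to the chosen implementation
    return 0;
}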

View File

@@ -42,7 +42,7 @@ typedef struct volk_gnsssdr_func_desc
VOLK_API void volk_gnsssdr_list_machines(void);

//! Returns the name of the machine this instance will use
VOLK_API const char *volk_gnsssdr_get_machine(void);

//! Get the machine alignment in bytes
VOLK_API size_t volk_gnsssdr_get_alignment(void);

@@ -74,19 +74,19 @@ VOLK_API bool volk_gnsssdr_is_aligned(const void *ptr);
extern VOLK_API ${kern.pname} ${kern.name};

//! A function pointer to the fastest aligned implementation
extern VOLK_API ${kern.pname} ${kern.name} _a;

//! A function pointer to the fastest unaligned implementation
extern VOLK_API ${kern.pname} ${kern.name} _u;

//! Call into a specific implementation given by name
extern VOLK_API void ${kern.name} _manual(${kern.arglist_full}, const char *impl_name);

//! Get description parameters for this kernel
extern VOLK_API volk_gnsssdr_func_desc_t ${kern.name} _get_func_desc(void);
% endfor

__VOLK_DECL_END

#endif /*INCLUDED_VOLK_GNSSSDR_RUNTIME*/
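In user code, the pointers declared above differ only in how they dispatch; a hedged usage sketch (volk_gnsssdr_32fc_x2_multiply_32fc is one of the library's kernels; buffer contents are trivial placeholders):

#include "volk_gnsssdr/volk_gnsssdr.h"

int main()
{
    const unsigned int n = 1024;
    const size_t al = volk_gnsssdr_get_alignment();
    lv_32fc_t *a = (lv_32fc_t *)volk_gnsssdr_malloc(n * sizeof(lv_32fc_t), al);
    lv_32fc_t *b = (lv_32fc_t *)volk_gnsssdr_malloc(n * sizeof(lv_32fc_t), al);
    lv_32fc_t *c = (lv_32fc_t *)volk_gnsssdr_malloc(n * sizeof(lv_32fc_t), al);
    for (unsigned int i = 0; i < n; i++)
        {
            a[i] = lv_cmake(1.0f, 0.0f);
            b[i] = lv_cmake(0.0f, 1.0f);
        }
    volk_gnsssdr_32fc_x2_multiply_32fc(c, a, b, n);  // dispatched entry point
    // the _a / _u pointers and _manual(..., "generic") call the same kernel explicitly
    volk_gnsssdr_free(a);
    volk_gnsssdr_free(b);
    volk_gnsssdr_free(c);
    return 0;
}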

View File

@@ -21,7 +21,8 @@
%for i, arch in enumerate(archs):
//#ifndef LV_${arch.name.upper()}
#define LV_$ \
    {arch.name.upper()} $ { i }
//#endif
%endfor
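For reference, before this reformatting the template expanded to one index macro per architecture, along the lines of (arch list illustrative):

#define LV_GENERIC 0
#define LV_SSE2 1
#define LV_AVX2 2

Note that the formatter has split the ${...} substitution across lines, so Mako would no longer recognize it as written; the template files were evidently not meant to go through the C formatter.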

View File

@@ -24,50 +24,54 @@
struct VOLK_CPU volk_gnsssdr_cpu;

#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
#define VOLK_CPU_x86
#endif

#if defined(VOLK_CPU_x86)

//implement get cpuid for gcc compilers using a system or local copy of cpuid.h
#if defined(__GNUC__)
#include <cpuid.h>
#define cpuid_x86(op, r) __get_cpuid(op, (unsigned int *)r + 0, (unsigned int *)r + 1, (unsigned int *)r + 2, (unsigned int *)r + 3)
#define cpuid_x86_count(op, count, regs) __cpuid_count(op, count, *((unsigned int *)regs), *((unsigned int *)regs + 1), *((unsigned int *)regs + 2), *((unsigned int *)regs + 3))

/* Return Intel AVX extended CPU capabilities register.
 * This function will bomb on non-AVX-capable machines, so
 * check for AVX capability before executing.
 */
#if ((__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 2) || (__clang_major__ >= 3)) && defined(HAVE_XGETBV)
static inline unsigned long long _xgetbv(unsigned int index)
{
    unsigned int eax, edx;
    __VOLK_ASM __VOLK_VOLATILE("xgetbv"
                               : "=a"(eax), "=d"(edx)
                               : "c"(index));
    return ((unsigned long long)edx << 32) | eax;
}
#define __xgetbv() _xgetbv(0)
#else
#define __xgetbv() 0
#endif

//implement get cpuid for MSVC compilers using __cpuid intrinsic
#elif defined(_MSC_VER) && defined(HAVE_INTRIN_H)
#include <intrin.h>
#define cpuid_x86(op, r) __cpuid(((int *)r), op)

#if defined(_XCR_XFEATURE_ENABLED_MASK)
#define __xgetbv() _xgetbv(_XCR_XFEATURE_ENABLED_MASK)
#else
#define __xgetbv() 0
#endif

#else
#error "A get cpuid for volk_gnsssdr is not available on this compiler..."
#endif  //defined(__GNUC__)

#endif  //defined(VOLK_CPU_x86)

static inline unsigned int cpuid_count_x86_bit(unsigned int level, unsigned int count, unsigned int reg, unsigned int bit)
{
#if defined(VOLK_CPU_x86)
    unsigned int regs[4] = {0};
    cpuid_x86_count(level, count, regs);

@@ -77,10 +81,11 @@ static inline unsigned int cpuid_count_x86_bit(unsigned int level, unsigned int
#endif
}

static inline unsigned int cpuid_x86_bit(unsigned int reg, unsigned int op, unsigned int bit)
{
#if defined(VOLK_CPU_x86)
    unsigned int regs[4];
    memset(regs, 0, sizeof(unsigned int) * 4);
    cpuid_x86(op, regs);
    return regs[reg] >> bit & 0x01;
#else

@@ -88,10 +93,11 @@ static inline unsigned int cpuid_x86_bit(unsigned int reg, unsigned int op, unsi
#endif
}

static inline unsigned int check_extended_cpuid(unsigned int val)
{
#if defined(VOLK_CPU_x86)
    unsigned int regs[4];
    memset(regs, 0, sizeof(unsigned int) * 4);
    cpuid_x86(0x80000000, regs);
    return regs[0] >= val;
#else

@@ -99,7 +105,8 @@ static inline unsigned int check_extended_cpuid(unsigned int val)
#endif
}

static inline unsigned int get_avx_enabled(void)
{
#if defined(VOLK_CPU_x86)
    return __xgetbv() & 0x6;
#else

@@ -107,7 +114,8 @@ static inline unsigned int get_avx_enabled(void)
#endif
}

static inline unsigned int get_avx2_enabled(void)
{
#if defined(VOLK_CPU_x86)
    return __xgetbv() & 0x6;
#else

@@ -117,28 +125,30 @@ static inline unsigned int get_avx2_enabled(void)
//neon detection is linux specific
#if defined(__arm__) && defined(__linux__)
#include <asm/hwcap.h>
#include <linux/auxvec.h>
#include <stdio.h>
#define VOLK_CPU_ARM
#endif

static int has_neon(void)
{
#if defined(VOLK_CPU_ARM)
    FILE *auxvec_f;
    unsigned long auxvec[2];
    unsigned int found_neon = 0;
    auxvec_f = fopen("/proc/self/auxv", "rb");
    if (!auxvec_f) return 0;

    size_t r = 1;
    //so auxv is basically 32b of ID and 32b of value
    //so it goes like this
    while (!found_neon && r)
        {
            r = fread(auxvec, sizeof(unsigned long), 2, auxvec_f);
            if ((auxvec[0] == AT_HWCAP) && (auxvec[1] & HWCAP_NEON))
                found_neon = 1;
        }

    fclose(auxvec_f);
    return found_neon;

@@ -148,50 +158,59 @@ static int has_neon(void){
}

%for arch in archs:
static int i_can_has_${arch.name} (void)
{
    %for check, params in arch.checks:
    if (${check}(<% joined_params = ', '.join(params)%>${joined_params}) == 0) return 0;
    % endfor return 1;
}
% endfor

#if defined(HAVE_FENV_H)
#if defined(FE_TONEAREST)
#include <fenv.h>
static inline void
set_float_rounding(void)
{
    fesetround(FE_TONEAREST);
}
#else
static inline void
set_float_rounding(void)
{
    //do nothing
}
#endif
#elif defined(_MSC_VER)
#include <float.h>
static inline void
set_float_rounding(void)
{
    unsigned int cwrd;
    _controlfp_s(&cwrd, 0, 0);
    _controlfp_s(&cwrd, _RC_NEAR, _MCW_RC);
}
#else
static inline void
set_float_rounding(void)
{
    //do nothing
}
#endif

void volk_gnsssdr_cpu_init()
{
    %for arch in archs:
    volk_gnsssdr_cpu.has_${arch.name} = &i_can_has_${arch.name};
    % endfor
    set_float_rounding();
}

unsigned int volk_gnsssdr_get_lvarch()
{
    unsigned int retval = 0;
    volk_gnsssdr_cpu_init();
    %for arch in archs:
    retval += volk_gnsssdr_cpu.has_${arch.name}() << LV_${arch.name.upper()};
    % endfor return retval;
}
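On newer ARM Linux toolchains the manual /proc/self/auxv walk in has_neon() can be expressed with a single libc call; a hedged alternative sketch (requires glibc >= 2.16, not part of this commit):

#if defined(__arm__) && defined(__linux__)
#include <asm/hwcap.h>
#include <sys/auxv.h>

// Same auxv lookup as has_neon(), but via getauxval(3).
static int has_neon_getauxval(void)
{
    return (getauxval(AT_HWCAP) & HWCAP_NEON) != 0;
}
#endif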

View File

@@ -23,16 +23,17 @@
__VOLK_DECL_BEGIN

struct VOLK_CPU
{
    %for arch in archs:
    int (*has_${arch.name}) ();
    % endfor
};

extern struct VOLK_CPU volk_gnsssdr_cpu;

void volk_gnsssdr_cpu_init();
unsigned int volk_gnsssdr_get_lvarch();

__VOLK_DECL_END

View File

@@ -20,7 +20,11 @@
<% arch_names = this_machine.arch_names %>

%for arch in this_machine.archs:
#define LV_HAVE_$ \
    {                 \
        arch.name.upper() \
    }                 \
    1
%endfor

#include <volk_gnsssdr/volk_gnsssdr_common.h>

@@ -35,7 +39,9 @@
#include <volk_gnsssdr/${kern.name}.h>
%endfor

struct volk_gnsssdr_machine volk_gnsssdr_machine_$
{
    this_machine.name} = {
    <% make_arch_have_list = (' | '.join(['(1 << LV_%s)'%a.name.upper() for a in this_machine.archs])) %> ${make_arch_have_list},
    <% this_machine_name = "\""+this_machine.name+"\"" %> ${this_machine_name},
    ${this_machine.alignment},

View File

@@ -22,10 +22,10 @@
struct volk_gnsssdr_machine *volk_gnsssdr_machines[] = {
%for machine in machines:
#ifdef LV_MACHINE_${machine.name.upper() }
    &volk_gnsssdr_machine_${machine.name},
#endif
%endfor
};

unsigned int n_volk_gnsssdr_machines = sizeof(volk_gnsssdr_machines) / sizeof(*volk_gnsssdr_machines);
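The element count on the last line is the standard static-array idiom; a standalone illustration (machine names hypothetical):

#include <cstdio>

int main()
{
    const char *machines[] = {"generic", "sse3_64", "avx2_64"};
    const unsigned int n = sizeof(machines) / sizeof(*machines);
    std::printf("%u machines\n", n);  // prints 3
    return 0;
}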

View File

@@ -27,26 +27,30 @@
__VOLK_DECL_BEGIN

struct volk_gnsssdr_machine
{
    const unsigned int caps;  //capabilities (i.e., archs compiled into this machine, in the volk_gnsssdr_get_lvarch format)
    const char *name;
    const size_t alignment;  //the maximum byte alignment required for functions in this library
    %for kern in kernels:
    const char *${kern.name}_name;
    const char *${kern.name} _impl_names[<% len_archs = len(archs) %> ${len_archs}];
    const int ${kern.name} _impl_deps[${len_archs}];
    const bool ${kern.name} _impl_alignment[${len_archs}];
    const ${kern.pname} ${kern.name} _impls[${len_archs}];
    const size_t ${kern.name} _n_impls;
    % endfor
};

%for machine in machines:
#ifdef LV_MACHINE_${machine.name.upper() }
extern struct volk_gnsssdr_machine volk_gnsssdr_machine_$
{
    machine.name
};
#endif
% endfor

__VOLK_DECL_END

#endif  //INCLUDED_LIBVOLK_GNSSSDR_MACHINES_H

View File

@@ -24,6 +24,6 @@
%for kern in kernels:
typedef void (*${kern.pname})(${kern.arglist_types});
% endfor

#endif /*INCLUDED_VOLK_GNSSSDR_TYPEDEFS*/