mirror of https://github.com/gnss-sdr/gnss-sdr synced 2024-06-24 05:53:16 +00:00

Apply automated code formatting to volk-gnsssdr

See http://gnss-sdr.org/coding-style/#use-tools-for-automated-code-formatting
Carles Fernandez 2018-03-03 12:09:45 +01:00
parent f924005733
commit 891478cf2c
75 changed files with 6642 additions and 6120 deletions
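Note: the style applied by this commit is the project's automated formatting described at the coding-style link above. As a minimal sketch (using a hypothetical function, not code from this commit), the conventions visible throughout the diffs below are braces on their own lines, consistent four-space indentation, and trailing comments padded to a uniform gap:

// before the formatting pass (hypothetical input)
static inline int scale2(int x){
    return x*2; // double the value
}

// after the formatting pass
static inline int scale2(int x)
{
    return x * 2;  // double the value
}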

View File

@@ -20,30 +20,30 @@
#include <config.h>
#endif
#include "volk_gnsssdr/volk_gnsssdr.h" // for volk_gnsssdr_get_alignment, volk_gnsssdr_get_machine
#include "volk_gnsssdr_option_helpers.h" // for option_list, option_t
#include <volk_gnsssdr/constants.h> // for volk_gnsssdr_available_machines, volk_gnsssdr_c_compiler ...
#include <iostream> // for operator<<, endl, cout, ostream
#include <string> // for string
#include "volk_gnsssdr/volk_gnsssdr.h" // for volk_gnsssdr_get_alignment, volk_gnsssdr_get_machine
#include "volk_gnsssdr_option_helpers.h" // for option_list, option_t
#include <volk_gnsssdr/constants.h> // for volk_gnsssdr_available_machines, volk_gnsssdr_c_compiler ...
#include <iostream> // for operator<<, endl, cout, ostream
#include <string> // for string
void print_alignment()
{
std::cout << "Alignment in bytes: " << volk_gnsssdr_get_alignment() << std::endl;
std::cout << "Alignment in bytes: " << volk_gnsssdr_get_alignment() << std::endl;
}
void print_malloc()
{
// You don't want to change the volk_malloc code, so just copy the if/else
// structure from there and give an explanation for the implementations
std::cout << "Used malloc implementation: ";
#if _POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600 || HAVE_POSIX_MEMALIGN
std::cout << "posix_memalign" << std::endl;
#elif _MSC_VER >= 1400
std::cout << "aligned_malloc" << std::endl;
#else
std::cout << "No standard handler available, using own implementation." << std::endl;
#endif
// You don't want to change the volk_malloc code, so just copy the if/else
// structure from there and give an explanation for the implementations
std::cout << "Used malloc implementation: ";
#if _POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600 || HAVE_POSIX_MEMALIGN
std::cout << "posix_memalign" << std::endl;
#elif _MSC_VER >= 1400
std::cout << "aligned_malloc" << std::endl;
#else
std::cout << "No standard handler available, using own implementation." << std::endl;
#endif
}
@@ -54,22 +54,24 @@ int main(int argc, char **argv)
our_options.add(option_t("cc", "", "print the VOLK_GNSSDR C compiler version", volk_gnsssdr_c_compiler()));
our_options.add(option_t("cflags", "", "print the VOLK_GNSSSDR CFLAGS", volk_gnsssdr_compiler_flags()));
our_options.add(option_t("all-machines", "", "print VOLK_GNSSSDR machines built", volk_gnsssdr_available_machines()));
our_options.add(option_t("avail-machines", "", "print VOLK_GNSSSDR machines on the current "
"platform", volk_gnsssdr_list_machines));
our_options.add(option_t("avail-machines", "",
"print VOLK_GNSSSDR machines on the current "
"platform",
volk_gnsssdr_list_machines));
our_options.add(option_t("machine", "", "print the current VOLK_GNSSSDR machine that will be used",
volk_gnsssdr_get_machine()));
volk_gnsssdr_get_machine()));
our_options.add(option_t("alignment", "", "print the memory alignment", print_alignment));
our_options.add(option_t("malloc", "", "print the malloc implementation used in volk_gnsssdr_malloc",
print_malloc));
print_malloc));
our_options.add(option_t("version", "v", "print the VOLK_GNSSSDR version", volk_gnsssdr_version()));
try
{
{
our_options.parse(argc, argv);
}
catch(...)
{
}
catch (...)
{
return 1;
}
}
return 0;
}

View File

@@ -17,157 +17,182 @@
*/
#include "volk_gnsssdr_option_helpers.h"
#include <climits> // IWYU pragma: keep
#include <cstdlib> // IWYU pragma: keep
#include <cstring> // IWYU pragma: keep
#include <exception> // for exception
#include <iostream> // for operator<<, endl, basic_ostream, cout, ostream
#include <utility> // for pair
#include <climits> // IWYU pragma: keep
#include <cstdlib> // IWYU pragma: keep
#include <cstring> // IWYU pragma: keep
#include <exception> // for exception
#include <iostream> // for operator<<, endl, basic_ostream, cout, ostream
#include <utility> // for pair
/*
* Option type
*/
option_t::option_t(std::string longform, std::string shortform, std::string msg, void (*callback)())
: longform("--" + longform),
shortform("-" + shortform),
msg(msg),
callback(callback) { option_type = VOID_CALLBACK; }
: longform("--" + longform),
shortform("-" + shortform),
msg(msg),
callback(callback) { option_type = VOID_CALLBACK; }
option_t::option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(int))
: longform("--" + longform),
shortform("-" + shortform),
msg(msg),
callback((void (*)()) callback) { option_type = INT_CALLBACK; }
: longform("--" + longform),
shortform("-" + shortform),
msg(msg),
callback((void (*)())callback) { option_type = INT_CALLBACK; }
option_t::option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(float))
: longform("--" + longform),
shortform("-" + shortform),
msg(msg),
callback((void (*)()) callback) { option_type = FLOAT_CALLBACK; }
: longform("--" + longform),
shortform("-" + shortform),
msg(msg),
callback((void (*)())callback) { option_type = FLOAT_CALLBACK; }
option_t::option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(bool))
: longform("--" + longform),
shortform("-" + shortform),
msg(msg),
callback((void (*)()) callback) { option_type = BOOL_CALLBACK; }
: longform("--" + longform),
shortform("-" + shortform),
msg(msg),
callback((void (*)())callback) { option_type = BOOL_CALLBACK; }
option_t::option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(std::string))
: longform("--" + longform),
shortform("-" + shortform),
msg(msg),
callback((void (*)()) callback) { option_type = STRING_CALLBACK; }
: longform("--" + longform),
shortform("-" + shortform),
msg(msg),
callback((void (*)())callback) { option_type = STRING_CALLBACK; }
option_t::option_t(std::string longform, std::string shortform, std::string msg, std::string printval)
: longform("--" + longform),
shortform("-" + shortform),
msg(msg),
printval(printval) { option_type = STRING; }
: longform("--" + longform),
shortform("-" + shortform),
msg(msg),
printval(printval) { option_type = STRING; }
/*
* Option List
*/
option_list::option_list(std::string program_name) :
program_name(program_name) {
{ internal_list = std::vector<option_t>(); }
}
void option_list::add(const option_t & opt) { internal_list.push_back(opt); }
void option_list::parse(int argc, char **argv) {
for (int arg_number = 0; arg_number < argc; ++arg_number) {
for (std::vector<option_t>::iterator this_option = internal_list.begin();
this_option != internal_list.end();
this_option++) {
if (this_option->longform == std::string(argv[arg_number]) ||
this_option->shortform == std::string(argv[arg_number])) {
switch (this_option->option_type) {
case VOID_CALLBACK:
this_option->callback();
break;
case INT_CALLBACK:
try {
int int_val = std::stoi(argv[++arg_number]);
((void (*)(int)) this_option->callback)(int_val);
} catch (std::exception &exc) {
std::cout << "An int option can only receive a number" << std::endl;
throw std::exception();
};
break;
case FLOAT_CALLBACK:
try {
int int_val = std::stof(argv[++arg_number]);
((void (*)(float)) this_option->callback)(int_val);
} catch (std::exception &exc) {
std::cout << "A float option can only receive a number" << std::endl;
throw std::exception();
};
break;
case BOOL_CALLBACK:
try {
bool int_val = (bool) std::stoi(argv[++arg_number]);
((void (*)(bool)) this_option->callback)(int_val);
} catch (std::exception &exc) {
std::cout << "A bool option can only receive 0 or 1" << std::endl;
throw std::exception();
};
break;
case STRING_CALLBACK:
try {
((void (*)(std::string)) this_option->callback)(argv[++arg_number]);
} catch (std::exception &exc) {
throw std::exception();
};
break;
case STRING:
std::cout << this_option->printval << std::endl;
break;
default:
this_option->callback();
break;
}
}
}
if (std::string("--help") == std::string(argv[arg_number]) ||
std::string("-h") == std::string(argv[arg_number])) {
help();
}
option_list::option_list(std::string program_name) : program_name(program_name)
{
{
internal_list = std::vector<option_t>();
}
}
void option_list::help() {
void option_list::add(const option_t &opt) { internal_list.push_back(opt); }
void option_list::parse(int argc, char **argv)
{
for (int arg_number = 0; arg_number < argc; ++arg_number)
{
for (std::vector<option_t>::iterator this_option = internal_list.begin();
this_option != internal_list.end();
this_option++)
{
if (this_option->longform == std::string(argv[arg_number]) ||
this_option->shortform == std::string(argv[arg_number]))
{
switch (this_option->option_type)
{
case VOID_CALLBACK:
this_option->callback();
break;
case INT_CALLBACK:
try
{
int int_val = std::stoi(argv[++arg_number]);
((void (*)(int))this_option->callback)(int_val);
}
catch (std::exception &exc)
{
std::cout << "An int option can only receive a number" << std::endl;
throw std::exception();
};
break;
case FLOAT_CALLBACK:
try
{
int int_val = std::stof(argv[++arg_number]);
((void (*)(float))this_option->callback)(int_val);
}
catch (std::exception &exc)
{
std::cout << "A float option can only receive a number" << std::endl;
throw std::exception();
};
break;
case BOOL_CALLBACK:
try
{
bool int_val = (bool)std::stoi(argv[++arg_number]);
((void (*)(bool))this_option->callback)(int_val);
}
catch (std::exception &exc)
{
std::cout << "A bool option can only receive 0 or 1" << std::endl;
throw std::exception();
};
break;
case STRING_CALLBACK:
try
{
((void (*)(std::string))this_option->callback)(argv[++arg_number]);
}
catch (std::exception &exc)
{
throw std::exception();
};
break;
case STRING:
std::cout << this_option->printval << std::endl;
break;
default:
this_option->callback();
break;
}
}
}
if (std::string("--help") == std::string(argv[arg_number]) ||
std::string("-h") == std::string(argv[arg_number]))
{
help();
}
}
}
void option_list::help()
{
std::cout << program_name << std::endl;
std::cout << " -h [ --help ] \t\tDisplay this help message" << std::endl;
for (std::vector<option_t>::iterator this_option = internal_list.begin();
this_option != internal_list.end();
this_option++) {
std::string help_line(" ");
if (this_option->shortform == "-") {
help_line += this_option->longform + " ";
} else {
help_line += this_option->shortform + " [ " + this_option->longform + " ]";
}
this_option++)
{
std::string help_line(" ");
if (this_option->shortform == "-")
{
help_line += this_option->longform + " ";
}
else
{
help_line += this_option->shortform + " [ " + this_option->longform + " ]";
}
switch (help_line.size() / 8) {
case 0:
help_line += "\t\t\t\t";
break;
case 1:
help_line += "\t\t\t";
break;
case 2:
help_line += "\t\t";
break;
case 3:
help_line += "\t";
break;
default:
break;
switch (help_line.size() / 8)
{
case 0:
help_line += "\t\t\t\t";
break;
case 1:
help_line += "\t\t\t";
break;
case 2:
help_line += "\t\t";
break;
case 3:
help_line += "\t";
break;
default:
break;
}
help_line += this_option->msg;
std::cout << help_line << std::endl;
}
help_line += this_option->msg;
std::cout << help_line << std::endl;
}
}

View File

@@ -36,7 +36,8 @@ typedef enum
STRING,
} VOLK_OPTYPE;
class option_t {
class option_t
{
public:
option_t(std::string longform, std::string shortform, std::string msg, void (*callback)());
option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(int));
@@ -51,7 +52,6 @@ public:
VOLK_OPTYPE option_type;
std::string printval;
void (*callback)();
};
class option_list
@@ -59,15 +59,16 @@ class option_list
public:
option_list(std::string program_name);
void add(const option_t & opt);
void add(const option_t &opt);
void parse(int argc, char **argv);
void help();
private:
std::string program_name;
std::vector<option_t> internal_list;
};
#endif //VOLK_VOLK_OPTION_HELPERS_H
#endif //VOLK_VOLK_OPTION_HELPERS_H
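Note: a minimal usage sketch of this option helper, assuming only the declarations above (the program name, option names, and callback are illustrative):

#include "volk_gnsssdr_option_helpers.h"
#include <iostream>
#include <string>

static void print_level(int level) { std::cout << "level = " << level << std::endl; }

int main(int argc, char **argv)
{
    option_list opts("demo");  // program name printed by help()
    // INT_CALLBACK: parse() reads the next argv token with std::stoi and calls the function
    opts.add(option_t("level", "l", "set verbosity level", print_level));
    // STRING: parse() prints the stored value when the option is matched
    opts.add(option_t("version", "v", "print the version", std::string("1.0")));
    try
    {
        opts.parse(argc, argv);  // also handles --help / -h by printing the option table
    }
    catch (...)
    {
        return 1;  // parse() rethrows when a numeric argument fails to parse
    }
    return 0;
}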

View File

@@ -16,23 +16,22 @@
* along with GNSS-SDR. If not, see <http://www.gnu.org/licenses/>.
*/
#include "kernel_tests.h" // for init_test_list
#include "qa_utils.h" // for volk_gnsssdr_test_results_t
#include "volk_gnsssdr/volk_gnsssdr_complex.h" // for lv_32fc_t
#include "volk_gnsssdr_option_helpers.h" // for option_list, option_t
#include "kernel_tests.h" // for init_test_list
#include "qa_utils.h" // for volk_gnsssdr_test_results_t
#include "volk_gnsssdr/volk_gnsssdr_complex.h" // for lv_32fc_t
#include "volk_gnsssdr_option_helpers.h" // for option_list, option_t
#include "volk_gnsssdr_profile.h"
#include "volk_gnsssdr/volk_gnsssdr_prefs.h" // for volk_gnsssdr_get_config_path
#include <boost/filesystem/operations.hpp> // for create_directories, exists
#include <boost/filesystem/path.hpp> // for path, operator<<
#include <boost/filesystem/path_traits.hpp> // for filesystem
#include <sys/stat.h> // for stat
#include <cstddef> // for size_t
#include <iostream> // for operator<<, basic_ostream
#include <fstream> // IWYU pragma: keep
#include <map> // for map, map<>::iterator
#include <utility> // for pair
#include <vector> // for vector, vector<>::const_..
#include "volk_gnsssdr/volk_gnsssdr_prefs.h" // for volk_gnsssdr_get_config_path
#include <boost/filesystem/operations.hpp> // for create_directories, exists
#include <boost/filesystem/path.hpp> // for path, operator<<
#include <boost/filesystem/path_traits.hpp> // for filesystem
#include <sys/stat.h> // for stat
#include <cstddef> // for size_t
#include <iostream> // for operator<<, basic_ostream
#include <fstream> // IWYU pragma: keep
#include <map> // for map, map<>::iterator
#include <utility> // for pair
#include <vector> // for vector, vector<>::const_..
namespace fs = boost::filesystem;
@@ -67,92 +66,112 @@ int main(int argc, char *argv[])
profile_options.add((option_t("path", "p", "Specify the volk_config path", set_volk_config)));
try
{
{
profile_options.parse(argc, argv);
}
catch(...)
{
return 1;
}
}
catch (...)
{
return 1;
}
for (int arg_number = 0; arg_number < argc; ++arg_number) {
for (int arg_number = 0; arg_number < argc; ++arg_number)
{
if (std::string("--help") == std::string(argv[arg_number]) ||
std::string("-h") == std::string(argv[arg_number])) {
std::string("-h") == std::string(argv[arg_number]))
{
return 0;
}
}
}
}
// Adding program options
std::ofstream json_file;
std::string config_file;
if ( json_filename != "" ) {
json_file.open( json_filename.c_str() );
}
if (json_filename != "")
{
json_file.open(json_filename.c_str());
}
if ( volk_config_path != "" ) {
config_file = volk_config_path + "/volk_config";
}
if (volk_config_path != "")
{
config_file = volk_config_path + "/volk_config";
}
// Run tests
std::vector<volk_gnsssdr_test_results_t> results;
if(update_mode) {
if( config_file != "" ) read_results(&results, config_file);
else read_results(&results);
}
if (update_mode)
{
if (config_file != "")
read_results(&results, config_file);
else
read_results(&results);
}
// Initialize the list of tests
std::vector<volk_gnsssdr_test_case_t> test_cases = init_test_list(test_params);
// Iterate through list of tests running each one
std::string substr_to_match(test_params.kernel_regex());
for(unsigned int ii = 0; ii < test_cases.size(); ++ii) {
bool regex_match = true;
for (unsigned int ii = 0; ii < test_cases.size(); ++ii)
{
bool regex_match = true;
volk_gnsssdr_test_case_t test_case = test_cases[ii];
// if the kernel name matches regex then do the test
std::string test_case_name = test_case.name();
if(test_case_name.find(substr_to_match) == std::string::npos) {
regex_match = false;
}
// if we are in update mode check if we've already got results
// if we have any, then no need to test that kernel
bool update = true;
if(update_mode) {
for(unsigned int jj=0; jj < results.size(); ++jj) {
if(results[jj].name == test_case.name() ||
results[jj].name == test_case.puppet_master_name()) {
update = false;
break;
volk_gnsssdr_test_case_t test_case = test_cases[ii];
// if the kernel name matches regex then do the test
std::string test_case_name = test_case.name();
if (test_case_name.find(substr_to_match) == std::string::npos)
{
regex_match = false;
}
}
}
if( regex_match && update ) {
try {
run_volk_gnsssdr_tests(test_case.desc(), test_case.kernel_ptr(), test_case.name(),
test_case.test_parameters(), &results, test_case.puppet_master_name());
}
catch (std::string &error) {
std::cerr << "Caught Exception in 'run_volk_gnssdr_tests': " << error << std::endl;
}
// if we are in update mode check if we've already got results
// if we have any, then no need to test that kernel
bool update = true;
if (update_mode)
{
for (unsigned int jj = 0; jj < results.size(); ++jj)
{
if (results[jj].name == test_case.name() ||
results[jj].name == test_case.puppet_master_name())
{
update = false;
break;
}
}
}
if (regex_match && update)
{
try
{
run_volk_gnsssdr_tests(test_case.desc(), test_case.kernel_ptr(), test_case.name(),
test_case.test_parameters(), &results, test_case.puppet_master_name());
}
catch (std::string &error)
{
std::cerr << "Caught Exception in 'run_volk_gnssdr_tests': " << error << std::endl;
}
}
}
}
// Output results according to provided options
if(json_filename != "") {
write_json(json_file, results);
json_file.close();
}
if (json_filename != "")
{
write_json(json_file, results);
json_file.close();
}
if(!dry_run) {
if(config_file != "") write_results(&results, false, config_file);
else write_results(&results, false);
}
else {
std::cout << "Warning: this was a dry-run. Config not generated" << std::endl;
}
if (!dry_run)
{
if (config_file != "")
write_results(&results, false, config_file);
else
write_results(&results, false);
}
else
{
std::cout << "Warning: this was a dry-run. Config not generated" << std::endl;
}
}
@@ -167,51 +186,55 @@ void read_results(std::vector<volk_gnsssdr_test_results_t> *results)
void read_results(std::vector<volk_gnsssdr_test_results_t> *results, std::string path)
{
struct stat buffer;
bool config_status = (stat (path.c_str(), &buffer) == 0);
bool config_status = (stat(path.c_str(), &buffer) == 0);
if( config_status ) {
// a config exists and we are reading results from it
std::ifstream config(path.c_str());
char config_line[256];
while(config.getline(config_line, 255)) {
// tokenize the input line by kernel_name unaligned aligned
// then push back in the results vector with fields filled in
if (config_status)
{
// a config exists and we are reading results from it
std::ifstream config(path.c_str());
char config_line[256];
while (config.getline(config_line, 255))
{
// tokenize the input line by kernel_name unaligned aligned
// then push back in the results vector with fields filled in
std::vector<std::string> single_kernel_result;
std::string config_str(config_line);
std::size_t str_size = config_str.size();
std::size_t found = 1;
std::vector<std::string> single_kernel_result;
std::string config_str(config_line);
std::size_t str_size = config_str.size();
std::size_t found = 1;
found = config_str.find(' ');
// Split line by spaces
while(found && found < str_size) {
found = config_str.find(' ');
// kernel names MUST be less than 128 chars, which is
// a length restricted by volk/volk_prefs.c
// on the last token in the parsed string we won't find a space
// so make sure we copy at most 128 chars.
if(found > 127) {
found = 127;
}
str_size = config_str.size();
char buffer[128] = {'\0'};
config_str.copy(buffer, found + 1, 0);
buffer[found] = '\0';
single_kernel_result.push_back(std::string(buffer));
config_str.erase(0, found+1);
}
// Split line by spaces
while (found && found < str_size)
{
found = config_str.find(' ');
// kernel names MUST be less than 128 chars, which is
// a length restricted by volk/volk_prefs.c
// on the last token in the parsed string we won't find a space
// so make sure we copy at most 128 chars.
if (found > 127)
{
found = 127;
}
str_size = config_str.size();
char buffer[128] = {'\0'};
config_str.copy(buffer, found + 1, 0);
buffer[found] = '\0';
single_kernel_result.push_back(std::string(buffer));
config_str.erase(0, found + 1);
}
if(single_kernel_result.size() == 3) {
volk_gnsssdr_test_results_t kernel_result;
kernel_result.name = std::string(single_kernel_result[0]);
kernel_result.config_name = std::string(single_kernel_result[0]);
kernel_result.best_arch_u = std::string(single_kernel_result[1]);
kernel_result.best_arch_a = std::string(single_kernel_result[2]);
results->push_back(kernel_result);
}
if (single_kernel_result.size() == 3)
{
volk_gnsssdr_test_results_t kernel_result;
kernel_result.name = std::string(single_kernel_result[0]);
kernel_result.config_name = std::string(single_kernel_result[0]);
kernel_result.best_arch_u = std::string(single_kernel_result[1]);
kernel_result.best_arch_a = std::string(single_kernel_result[2]);
results->push_back(kernel_result);
}
}
}
}
}
void write_results(const std::vector<volk_gnsssdr_test_results_t> *results, bool update_result)
@@ -219,7 +242,7 @@ void write_results(const std::vector<volk_gnsssdr_test_results_t> *results, bool
char path[1024];
volk_gnsssdr_get_config_path(path);
write_results( results, update_result, std::string(path));
write_results(results, update_result, std::string(path));
}
void write_results(const std::vector<volk_gnsssdr_test_results_t> *results, bool update_result, const std::string path)
@@ -227,39 +250,44 @@ void write_results(const std::vector<volk_gnsssdr_test_results_t> *results, bool
const fs::path config_path(path);
// Until we can update the config on a kernel by kernel basis
// do not overwrite volk_gnsssdr_config when using a regex.
if (! fs::exists(config_path.branch_path()))
{
std::cout << "Creating " << config_path.branch_path() << " ..." << std::endl;
fs::create_directories(config_path.branch_path());
}
if (!fs::exists(config_path.branch_path()))
{
std::cout << "Creating " << config_path.branch_path() << " ..." << std::endl;
fs::create_directories(config_path.branch_path());
}
std::ofstream config;
if(update_result) {
std::cout << "Updating " << path << " ..." << std::endl;
config.open(path.c_str(), std::ofstream::app);
if (!config.is_open()) { //either we don't have write access or we don't have the dir yet
std::cout << "Error opening file " << path << std::endl;
}
}
else {
std::cout << "Writing " << path << " ..." << std::endl;
config.open(path.c_str());
if (!config.is_open()) { //either we don't have write access or we don't have the dir yet
std::cout << "Error opening file " << path << std::endl;
if (update_result)
{
std::cout << "Updating " << path << " ..." << std::endl;
config.open(path.c_str(), std::ofstream::app);
if (!config.is_open())
{ //either we don't have write access or we don't have the dir yet
std::cout << "Error opening file " << path << std::endl;
}
}
else
{
std::cout << "Writing " << path << " ..." << std::endl;
config.open(path.c_str());
if (!config.is_open())
{ //either we don't have write access or we don't have the dir yet
std::cout << "Error opening file " << path << std::endl;
}
config << "\
config << "\
#this file is generated by volk_gnsssdr_profile.\n\
#the function name is followed by the preferred architecture.\n\
";
}
}
std::vector<volk_gnsssdr_test_results_t>::const_iterator profile_results;
for(profile_results = results->begin(); profile_results != results->end(); ++profile_results) {
config << profile_results->config_name << " "
<< profile_results->best_arch_a << " "
<< profile_results->best_arch_u << std::endl;
}
for (profile_results = results->begin(); profile_results != results->end(); ++profile_results)
{
config << profile_results->config_name << " "
<< profile_results->best_arch_a << " "
<< profile_results->best_arch_u << std::endl;
}
config.close();
}
@@ -270,43 +298,45 @@ void write_json(std::ofstream &json_file, std::vector<volk_gnsssdr_test_results_
size_t len = results.size();
size_t i = 0;
std::vector<volk_gnsssdr_test_results_t>::iterator result;
for(result = results.begin(); result != results.end(); ++result) {
json_file << " {" << std::endl;
json_file << " \"name\": \"" << result->name << "\"," << std::endl;
json_file << " \"vlen\": " << (int)(result->vlen) << "," << std::endl;
json_file << " \"iter\": " << result->iter << "," << std::endl;
json_file << " \"best_arch_a\": \"" << result->best_arch_a
<< "\"," << std::endl;
json_file << " \"best_arch_u\": \"" << result->best_arch_u
<< "\"," << std::endl;
json_file << " \"results\": {" << std::endl;
size_t results_len = result->results.size();
size_t ri = 0;
for (result = results.begin(); result != results.end(); ++result)
{
json_file << " {" << std::endl;
json_file << " \"name\": \"" << result->name << "\"," << std::endl;
json_file << " \"vlen\": " << (int)(result->vlen) << "," << std::endl;
json_file << " \"iter\": " << result->iter << "," << std::endl;
json_file << " \"best_arch_a\": \"" << result->best_arch_a
<< "\"," << std::endl;
json_file << " \"best_arch_u\": \"" << result->best_arch_u
<< "\"," << std::endl;
json_file << " \"results\": {" << std::endl;
size_t results_len = result->results.size();
size_t ri = 0;
std::map<std::string, volk_gnsssdr_test_time_t>::iterator kernel_time_pair;
for(kernel_time_pair = result->results.begin(); kernel_time_pair != result->results.end(); ++kernel_time_pair) {
volk_gnsssdr_test_time_t time = kernel_time_pair->second;
json_file << " \"" << time.name << "\": {" << std::endl;
json_file << " \"name\": \"" << time.name << "\"," << std::endl;
json_file << " \"time\": " << time.time << "," << std::endl;
json_file << " \"units\": \"" << time.units << "\"" << std::endl;
json_file << " }" ;
if(ri+1 != results_len) {
json_file << ",";
}
std::map<std::string, volk_gnsssdr_test_time_t>::iterator kernel_time_pair;
for (kernel_time_pair = result->results.begin(); kernel_time_pair != result->results.end(); ++kernel_time_pair)
{
volk_gnsssdr_test_time_t time = kernel_time_pair->second;
json_file << " \"" << time.name << "\": {" << std::endl;
json_file << " \"name\": \"" << time.name << "\"," << std::endl;
json_file << " \"time\": " << time.time << "," << std::endl;
json_file << " \"units\": \"" << time.units << "\"" << std::endl;
json_file << " }";
if (ri + 1 != results_len)
{
json_file << ",";
}
json_file << std::endl;
ri++;
}
json_file << " }" << std::endl;
json_file << " }";
if (i + 1 != len)
{
json_file << ",";
}
json_file << std::endl;
ri++;
i++;
}
json_file << " }" << std::endl;
json_file << " }";
if(i+1 != len) {
json_file << ",";
}
json_file << std::endl;
i++;
}
json_file << " ]" << std::endl;
json_file << "}" << std::endl;
}

View File

@@ -27,10 +27,10 @@
* -------------------------------------------------------------------------
*/
#include <cstdbool> // for bool
#include <iosfwd> // for ofstream
#include <string> // for string
#include <vector> // for vector
#include <cstdbool> // for bool
#include <iosfwd> // for ofstream
#include <string> // for string
#include <vector> // for vector
class volk_test_results_t;

View File

@@ -29,7 +29,7 @@
static inline int16_t sat_adds16i(int16_t x, int16_t y)
{
int32_t res = (int32_t) x + (int32_t) y;
int32_t res = (int32_t)x + (int32_t)y;
if (res < SHRT_MIN) res = SHRT_MIN;
if (res > SHRT_MAX) res = SHRT_MAX;
@@ -39,7 +39,7 @@ static inline int16_t sat_adds16i(int16_t x, int16_t y)
static inline int16_t sat_muls16i(int16_t x, int16_t y)
{
int32_t res = (int32_t) x * (int32_t) y;
int32_t res = (int32_t)x * (int32_t)y;
if (res < SHRT_MIN) res = SHRT_MIN;
if (res > SHRT_MAX) res = SHRT_MAX;
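Note: the point of these helpers is that the sum or product is computed at 32-bit width and only then clamped, so 16-bit overflow saturates instead of wrapping. A small illustrative check, assuming the header defining the two functions is included and that they return the clamped value as an int16_t (test values are arbitrary):

#include <cassert>
#include <climits>

int main()
{
    assert(sat_adds16i(30000, 10000) == SHRT_MAX);  // 40000 overflows int16_t, clamps to 32767
    assert(sat_muls16i(200, 300) == SHRT_MAX);      // 60000 also saturates
    assert(sat_adds16i(-5, 3) == -2);               // in-range values pass through unchanged
    return 0;
}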

View File

@@ -30,38 +30,42 @@
static inline __m256
_mm256_complexmul_ps(__m256 x, __m256 y)
{
__m256 yl, yh, tmp1, tmp2;
yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr ...
yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di ...
tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ...
x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br ...
tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
return _mm256_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
__m256 yl, yh, tmp1, tmp2;
yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr ...
yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di ...
tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ...
x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br ...
tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
return _mm256_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
}
static inline __m256
_mm256_conjugate_ps(__m256 x){
const __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
return _mm256_xor_ps(x, conjugator); // conjugate y
_mm256_conjugate_ps(__m256 x)
{
const __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
return _mm256_xor_ps(x, conjugator); // conjugate y
}
static inline __m256
_mm256_complexconjugatemul_ps(__m256 x, __m256 y){
y = _mm256_conjugate_ps(y);
return _mm256_complexmul_ps(x, y);
_mm256_complexconjugatemul_ps(__m256 x, __m256 y)
{
y = _mm256_conjugate_ps(y);
return _mm256_complexmul_ps(x, y);
}
static inline __m256
_mm256_magnitudesquared_ps(__m256 cplxValue1, __m256 cplxValue2){
__m256 complex1, complex2;
cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
return _mm256_hadd_ps(complex1, complex2); // Add the I2 and Q2 values
_mm256_magnitudesquared_ps(__m256 cplxValue1, __m256 cplxValue2)
{
__m256 complex1, complex2;
cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
return _mm256_hadd_ps(complex1, complex2); // Add the I2 and Q2 values
}
static inline __m256 _mm256_complexnormalise_ps( __m256 z ){
static inline __m256 _mm256_complexnormalise_ps(__m256 z)
{
__m256 tmp1 = _mm256_mul_ps(z, z);
__m256 tmp2 = _mm256_hadd_ps(tmp1, tmp1);
tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
@@ -70,8 +74,9 @@ static inline __m256 _mm256_complexnormalise_ps( __m256 z ){
}
static inline __m256
_mm256_magnitude_ps(__m256 cplxValue1, __m256 cplxValue2){
return _mm256_sqrt_ps(_mm256_magnitudesquared_ps(cplxValue1, cplxValue2));
_mm256_magnitude_ps(__m256 cplxValue1, __m256 cplxValue2)
{
return _mm256_sqrt_ps(_mm256_magnitudesquared_ps(cplxValue1, cplxValue2));
}
#endif /* INCLUDE_VOLK_VOLK_AVX_INTRINSICS_H_ */
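Note: for reference, the arithmetic these vector helpers implement, written out for a single complex pair; this is the textbook identity, not code from the library. The moveldup/movehdup/addsub sequence above evaluates it for eight float lanes (four complex values) at once, and the conjugation mask flips the sign bit of every imaginary lane:

#include <complex>

// (ar + i*ai) * (cr + i*ci) = (ar*cr - ai*ci) + i*(ai*cr + ar*ci)
static std::complex<float> complexmul_scalar(std::complex<float> x, std::complex<float> y)
{
    float re = x.real() * y.real() - x.imag() * y.imag();
    float im = x.imag() * y.real() + x.real() * y.imag();
    return std::complex<float>(re, im);
}

// conjugation just negates the imaginary part, which _mm256_conjugate_ps
// does by XORing the (0, -0.f, ...) sign mask into the register
static std::complex<float> conjugate_scalar(std::complex<float> x)
{
    return std::complex<float>(x.real(), -x.imag());
}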

View File

@@ -28,14 +28,14 @@
// Cross-platform attribute macros not included in VOLK
////////////////////////////////////////////////////////////////////////
#if defined __GNUC__
# define __VOLK_GNSSSDR_PREFETCH(addr) __builtin_prefetch(addr)
# define __VOLK_GNSSSDR_PREFETCH_LOCALITY(addr, rw, locality) __builtin_prefetch(addr, rw, locality)
#define __VOLK_GNSSSDR_PREFETCH(addr) __builtin_prefetch(addr)
#define __VOLK_GNSSSDR_PREFETCH_LOCALITY(addr, rw, locality) __builtin_prefetch(addr, rw, locality)
#elif _MSC_VER
# define __VOLK_GNSSSDR_PREFETCH(addr)
# define __VOLK_GNSSSDR_PREFETCH_LOCALITY(addr, rw, locality)
#define __VOLK_GNSSSDR_PREFETCH(addr)
#define __VOLK_GNSSSDR_PREFETCH_LOCALITY(addr, rw, locality)
#else
# define __VOLK_GNSSSDR_PREFETCH(addr)
# define __VOLK_GNSSSDR_PREFETCH_LOCALITY(addr, rw, locality)
#define __VOLK_GNSSSDR_PREFETCH(addr)
#define __VOLK_GNSSSDR_PREFETCH_LOCALITY(addr, rw, locality)
#endif
#ifndef INCLUDED_LIBVOLK_COMMON_H
@@ -45,45 +45,45 @@
// Cross-platform attribute macros
////////////////////////////////////////////////////////////////////////
#if defined __GNUC__
# define __VOLK_ATTR_ALIGNED(x) __attribute__((aligned(x)))
# define __VOLK_ATTR_UNUSED __attribute__((unused))
# define __VOLK_ATTR_INLINE __attribute__((always_inline))
# define __VOLK_ATTR_DEPRECATED __attribute__((deprecated))
# define __VOLK_ASM __asm__
# define __VOLK_VOLATILE __volatile__
# if __GNUC__ >= 4
# define __VOLK_ATTR_EXPORT __attribute__((visibility("default")))
# define __VOLK_ATTR_IMPORT __attribute__((visibility("default")))
# else
# define __VOLK_ATTR_EXPORT
# define __VOLK_ATTR_IMPORT
# endif
#elif _MSC_VER
# define __VOLK_ATTR_ALIGNED(x) __declspec(align(x))
# define __VOLK_ATTR_UNUSED
# define __VOLK_ATTR_INLINE __forceinline
# define __VOLK_ATTR_DEPRECATED __declspec(deprecated)
# define __VOLK_ATTR_EXPORT __declspec(dllexport)
# define __VOLK_ATTR_IMPORT __declspec(dllimport)
# define __VOLK_ASM __asm
# define __VOLK_VOLATILE
#define __VOLK_ATTR_ALIGNED(x) __attribute__((aligned(x)))
#define __VOLK_ATTR_UNUSED __attribute__((unused))
#define __VOLK_ATTR_INLINE __attribute__((always_inline))
#define __VOLK_ATTR_DEPRECATED __attribute__((deprecated))
#define __VOLK_ASM __asm__
#define __VOLK_VOLATILE __volatile__
#if __GNUC__ >= 4
#define __VOLK_ATTR_EXPORT __attribute__((visibility("default")))
#define __VOLK_ATTR_IMPORT __attribute__((visibility("default")))
#else
# define __VOLK_ATTR_ALIGNED(x)
# define __VOLK_ATTR_UNUSED
# define __VOLK_ATTR_INLINE
# define __VOLK_ATTR_DEPRECATED
# define __VOLK_ATTR_EXPORT
# define __VOLK_ATTR_IMPORT
# define __VOLK_ASM __asm__
# define __VOLK_VOLATILE __volatile__
#define __VOLK_ATTR_EXPORT
#define __VOLK_ATTR_IMPORT
#endif
#elif _MSC_VER
#define __VOLK_ATTR_ALIGNED(x) __declspec(align(x))
#define __VOLK_ATTR_UNUSED
#define __VOLK_ATTR_INLINE __forceinline
#define __VOLK_ATTR_DEPRECATED __declspec(deprecated)
#define __VOLK_ATTR_EXPORT __declspec(dllexport)
#define __VOLK_ATTR_IMPORT __declspec(dllimport)
#define __VOLK_ASM __asm
#define __VOLK_VOLATILE
#else
#define __VOLK_ATTR_ALIGNED(x)
#define __VOLK_ATTR_UNUSED
#define __VOLK_ATTR_INLINE
#define __VOLK_ATTR_DEPRECATED
#define __VOLK_ATTR_EXPORT
#define __VOLK_ATTR_IMPORT
#define __VOLK_ASM __asm__
#define __VOLK_VOLATILE __volatile__
#endif
////////////////////////////////////////////////////////////////////////
// Ignore annoying warnings in MSVC
////////////////////////////////////////////////////////////////////////
#if defined(_MSC_VER)
# pragma warning(disable: 4244) //'conversion' conversion from 'type1' to 'type2', possible loss of data
# pragma warning(disable: 4305) //'identifier' : truncation from 'type1' to 'type2'
#pragma warning(disable : 4244) //'conversion' conversion from 'type1' to 'type2', possible loss of data
#pragma warning(disable : 4305) //'identifier' : truncation from 'type1' to 'type2'
#endif
////////////////////////////////////////////////////////////////////////
@@ -91,11 +91,13 @@
// FIXME: due to the usage of complex.h, require gcc for c-linkage
////////////////////////////////////////////////////////////////////////
#if defined(__cplusplus) && (__GNUC__)
# define __VOLK_DECL_BEGIN extern "C" {
# define __VOLK_DECL_END }
#define __VOLK_DECL_BEGIN \
extern "C" \
{
#define __VOLK_DECL_END }
#else
# define __VOLK_DECL_BEGIN
# define __VOLK_DECL_END
#define __VOLK_DECL_BEGIN
#define __VOLK_DECL_END
#endif
////////////////////////////////////////////////////////////////////////
@@ -103,9 +105,9 @@
// http://gcc.gnu.org/wiki/Visibility
////////////////////////////////////////////////////////////////////////
#ifdef volk_gnsssdr_EXPORTS
# define VOLK_API __VOLK_ATTR_EXPORT
#define VOLK_API __VOLK_ATTR_EXPORT
#else
# define VOLK_API __VOLK_ATTR_IMPORT
#define VOLK_API __VOLK_ATTR_IMPORT
#endif
////////////////////////////////////////////////////////////////////////
@@ -121,35 +123,37 @@
#endif
#endif
union bit128{
uint8_t i8[16];
uint16_t i16[8];
uint32_t i[4];
float f[4];
double d[2];
union bit128
{
uint8_t i8[16];
uint16_t i16[8];
uint32_t i[4];
float f[4];
double d[2];
#ifdef LV_HAVE_SSE
__m128 float_vec;
#endif
#ifdef LV_HAVE_SSE
__m128 float_vec;
#endif
#ifdef LV_HAVE_SSE2
__m128i int_vec;
__m128d double_vec;
#endif
#ifdef LV_HAVE_SSE2
__m128i int_vec;
__m128d double_vec;
#endif
};
union bit256{
uint8_t i8[32];
uint16_t i16[16];
uint32_t i[8];
float f[8];
double d[4];
union bit256
{
uint8_t i8[32];
uint16_t i16[16];
uint32_t i[8];
float f[8];
double d[4];
#ifdef LV_HAVE_AVX
__m256 float_vec;
__m256i int_vec;
__m256d double_vec;
#endif
#ifdef LV_HAVE_AVX
__m256 float_vec;
__m256i int_vec;
__m256d double_vec;
#endif
};
#define bit128_p(x) ((union bit128 *)(x))

View File

@@ -48,26 +48,34 @@
#include <complex>
#include <stdint.h>
typedef std::complex<int8_t> lv_8sc_t;
typedef std::complex<int8_t> lv_8sc_t;
typedef std::complex<int16_t> lv_16sc_t;
typedef std::complex<int32_t> lv_32sc_t;
typedef std::complex<int64_t> lv_64sc_t;
typedef std::complex<float> lv_32fc_t;
typedef std::complex<double> lv_64fc_t;
typedef std::complex<float> lv_32fc_t;
typedef std::complex<double> lv_64fc_t;
template <typename T> inline std::complex<T> lv_cmake(const T &r, const T &i){
template <typename T>
inline std::complex<T> lv_cmake(const T &r, const T &i)
{
return std::complex<T>(r, i);
}
template <typename T> inline typename T::value_type lv_creal(const T &x){
template <typename T>
inline typename T::value_type lv_creal(const T &x)
{
return x.real();
}
template <typename T> inline typename T::value_type lv_cimag(const T &x){
template <typename T>
inline typename T::value_type lv_cimag(const T &x)
{
return x.imag();
}
template <typename T> inline T lv_conj(const T &x){
template <typename T>
inline T lv_conj(const T &x)
{
return std::conj(x);
}
@@ -80,14 +88,14 @@ template <typename T> inline T lv_conj(const T &x){
#include <complex.h>
typedef char complex lv_8sc_t;
typedef short complex lv_16sc_t;
typedef long complex lv_32sc_t;
typedef long long complex lv_64sc_t;
typedef float complex lv_32fc_t;
typedef double complex lv_64fc_t;
typedef char complex lv_8sc_t;
typedef short complex lv_16sc_t;
typedef long complex lv_32sc_t;
typedef long long complex lv_64sc_t;
typedef float complex lv_32fc_t;
typedef double complex lv_64fc_t;
#define lv_cmake(r, i) ((r) + _Complex_I*(i))
#define lv_cmake(r, i) ((r) + _Complex_I * (i))
// When GNUC is available, use the complex extensions.
// The extensions always return the correct value type.

View File

@@ -27,30 +27,30 @@
#include <arm_neon.h>
static inline float32x4_t vdivq_f32( float32x4_t num, float32x4_t den )
static inline float32x4_t vdivq_f32(float32x4_t num, float32x4_t den)
{
const float32x4_t q_inv0 = vrecpeq_f32( den );
const float32x4_t q_step0 = vrecpsq_f32( q_inv0, den );
const float32x4_t q_inv0 = vrecpeq_f32(den);
const float32x4_t q_step0 = vrecpsq_f32(q_inv0, den);
const float32x4_t q_inv1 = vmulq_f32( q_step0, q_inv0 );
return vmulq_f32( num, q_inv1 );
const float32x4_t q_inv1 = vmulq_f32(q_step0, q_inv0);
return vmulq_f32(num, q_inv1);
}
static inline float32x4_t vsqrtq_f32( float32x4_t q_x )
static inline float32x4_t vsqrtq_f32(float32x4_t q_x)
{
const float32x4_t q_step_0 = vrsqrteq_f32( q_x );
const float32x4_t q_step_0 = vrsqrteq_f32(q_x);
// step
const float32x4_t q_step_parm0 = vmulq_f32( q_x, q_step_0 );
const float32x4_t q_step_result0 = vrsqrtsq_f32( q_step_parm0, q_step_0 );
const float32x4_t q_step_parm0 = vmulq_f32(q_x, q_step_0);
const float32x4_t q_step_result0 = vrsqrtsq_f32(q_step_parm0, q_step_0);
// step
const float32x4_t q_step_1 = vmulq_f32( q_step_0, q_step_result0 );
const float32x4_t q_step_parm1 = vmulq_f32( q_x, q_step_1 );
const float32x4_t q_step_result1 = vrsqrtsq_f32( q_step_parm1, q_step_1 );
const float32x4_t q_step_1 = vmulq_f32(q_step_0, q_step_result0);
const float32x4_t q_step_parm1 = vmulq_f32(q_x, q_step_1);
const float32x4_t q_step_result1 = vrsqrtsq_f32(q_step_parm1, q_step_1);
// take the res
const float32x4_t q_step_2 = vmulq_f32( q_step_1, q_step_result1 );
const float32x4_t q_step_2 = vmulq_f32(q_step_1, q_step_result1);
// mul by x to get sqrt, not rsqrt
return vmulq_f32( q_x, q_step_2 );
return vmulq_f32(q_x, q_step_2);
}
#endif /* INCLUDED_VOLK_GNSSSDR_NEON_INTRINSICS_H_ */
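Note: both helpers follow the same pattern — a rough hardware estimate refined by Newton-Raphson steps — because ARMv7 NEON has no full-precision divide or square-root instruction. A scalar sketch of the reciprocal refinement used in vdivq_f32 (vrecpsq_f32(e, den) computes 2 - den*e, so each refinement roughly doubles the number of correct bits):

static float refine_reciprocal(float den, float estimate)
{
    float step = 2.0f - den * estimate;  // what vrecpsq_f32 returns per lane
    return estimate * step;             // one Newton-Raphson iteration
}

// num / den is then approximated as num * refined_estimate, mirroring the final
// vmulq_f32(num, q_inv1) above; vsqrtq_f32 applies the analogous reciprocal-square-root
// refinement twice and multiplies by x at the end to turn 1/sqrt(x) into sqrt(x).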

View File

@@ -32,9 +32,9 @@ __VOLK_DECL_BEGIN
typedef struct volk_gnsssdr_arch_pref
{
char name[128]; //name of the kernel
char impl_a[128]; //best aligned impl
char impl_u[128]; //best unaligned impl
char name[128]; //name of the kernel
char impl_a[128]; //best aligned impl
char impl_u[128]; //best unaligned impl
} volk_gnsssdr_arch_pref_t;
////////////////////////////////////////////////////////////////////////

View File

@@ -30,33 +30,35 @@
static inline __m128
_mm_complexmul_ps(__m128 x, __m128 y)
{
__m128 yl, yh, tmp1, tmp2;
yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
return _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
__m128 yl, yh, tmp1, tmp2;
yl = _mm_moveldup_ps(y); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(y); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
return _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
}
static inline __m128
_mm_complexconjugatemul_ps(__m128 x, __m128 y)
{
const __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f);
y = _mm_xor_ps(y, conjugator); // conjugate y
return _mm_complexmul_ps(x, y);
const __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f);
y = _mm_xor_ps(y, conjugator); // conjugate y
return _mm_complexmul_ps(x, y);
}
static inline __m128
_mm_magnitudesquared_ps_sse3(__m128 cplxValue1, __m128 cplxValue2){
cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
return _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
_mm_magnitudesquared_ps_sse3(__m128 cplxValue1, __m128 cplxValue2)
{
cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
return _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
}
static inline __m128
_mm_magnitude_ps_sse3(__m128 cplxValue1, __m128 cplxValue2){
return _mm_sqrt_ps(_mm_magnitudesquared_ps_sse3(cplxValue1, cplxValue2));
_mm_magnitude_ps_sse3(__m128 cplxValue1, __m128 cplxValue2)
{
return _mm_sqrt_ps(_mm_magnitudesquared_ps_sse3(cplxValue1, cplxValue2));
}
#endif /* INCLUDE_VOLK_VOLK_SSE3_INTRINSICS_H_ */

View File

@@ -27,20 +27,22 @@
#include <xmmintrin.h>
static inline __m128
_mm_magnitudesquared_ps(__m128 cplxValue1, __m128 cplxValue2){
__m128 iValue, qValue;
// Arrange in i1i2i3i4 format
iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2,0,2,0));
// Arrange in q1q2q3q4 format
qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
iValue = _mm_mul_ps(iValue, iValue); // Square the I values
qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values
return _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values
_mm_magnitudesquared_ps(__m128 cplxValue1, __m128 cplxValue2)
{
__m128 iValue, qValue;
// Arrange in i1i2i3i4 format
iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
// Arrange in q1q2q3q4 format
qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
iValue = _mm_mul_ps(iValue, iValue); // Square the I values
qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values
return _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values
}
static inline __m128
_mm_magnitude_ps(__m128 cplxValue1, __m128 cplxValue2){
return _mm_sqrt_ps(_mm_magnitudesquared_ps(cplxValue1, cplxValue2));
_mm_magnitude_ps(__m128 cplxValue1, __m128 cplxValue2)
{
return _mm_sqrt_ps(_mm_magnitudesquared_ps(cplxValue1, cplxValue2));
}
#endif /* INCLUDED_VOLK_VOLK_SSE_INTRINSICS_H_ */

View File

@@ -45,55 +45,55 @@
static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_generic(int16_t* result, const int16_t* local_code, unsigned int num_points)
{
int code_length_chips = 2046;
float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points );
float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
int num_out_vectors = 3;
unsigned int n;
float rem_code_phase_chips = -0.234;
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
float shifts_chips[3] = {-0.1, 0.0, 0.1};
int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
for(n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
}
for (n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
}
volk_gnsssdr_16i_xn_resampler_16i_xn_generic(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
memcpy((int16_t*)result, (int16_t*)result_aux[0], sizeof(int16_t) * num_points);
for(n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
for (n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
volk_gnsssdr_free(result_aux);
}
#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_SSE3
static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_a_sse3(int16_t* result, const int16_t* local_code, unsigned int num_points)
{
int code_length_chips = 2046;
float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points );
float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
int num_out_vectors = 3;
float rem_code_phase_chips = -0.234;
unsigned int n;
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
float shifts_chips[3] = {-0.1, 0.0, 0.1};
int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
for(n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
}
for (n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
}
volk_gnsssdr_16i_xn_resampler_16i_xn_a_sse3(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
memcpy((int16_t*)result, (int16_t*)result_aux[0], sizeof(int16_t) * num_points);
for(n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
for (n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
volk_gnsssdr_free(result_aux);
}
@@ -103,26 +103,26 @@ static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_a_sse3(int16_t* result
static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_u_sse3(int16_t* result, const int16_t* local_code, unsigned int num_points)
{
int code_length_chips = 2046;
float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points );
float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
int num_out_vectors = 3;
float rem_code_phase_chips = -0.234;
unsigned int n;
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
float shifts_chips[3] = {-0.1, 0.0, 0.1};
int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
for(n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
}
for (n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
}
volk_gnsssdr_16i_xn_resampler_16i_xn_u_sse3(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
memcpy((int16_t*)result, (int16_t*)result_aux[0], sizeof(int16_t) * num_points);
for(n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
for (n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
volk_gnsssdr_free(result_aux);
}
@@ -133,26 +133,26 @@ static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_u_sse3(int16_t* result
static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_u_sse4_1(int16_t* result, const int16_t* local_code, unsigned int num_points)
{
int code_length_chips = 2046;
float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points );
float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
int num_out_vectors = 3;
float rem_code_phase_chips = -0.234;
unsigned int n;
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
float shifts_chips[3] = {-0.1, 0.0, 0.1};
int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
for(n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
}
for (n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
}
volk_gnsssdr_16i_xn_resampler_16i_xn_u_sse4_1(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
memcpy((int16_t*)result, (int16_t*)result_aux[0], sizeof(int16_t) * num_points);
for(n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
for (n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
volk_gnsssdr_free(result_aux);
}
@@ -163,26 +163,26 @@ static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_u_sse4_1(int16_t* resu
static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_a_sse4_1(int16_t* result, const int16_t* local_code, unsigned int num_points)
{
int code_length_chips = 2046;
float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points );
float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
int num_out_vectors = 3;
float rem_code_phase_chips = -0.234;
unsigned int n;
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
float shifts_chips[3] = {-0.1, 0.0, 0.1};
int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
for(n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
}
for (n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
}
volk_gnsssdr_16i_xn_resampler_16i_xn_a_sse4_1(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
memcpy((int16_t*)result, (int16_t*)result_aux[0], sizeof(int16_t) * num_points);
for(n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
for (n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
volk_gnsssdr_free(result_aux);
}
@@ -193,26 +193,26 @@ static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_u_avx(int16_t* result,
static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_u_avx(int16_t* result, const int16_t* local_code, unsigned int num_points)
{
int code_length_chips = 2046;
float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points );
float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
int num_out_vectors = 3;
float rem_code_phase_chips = -0.234;
unsigned int n;
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
float shifts_chips[3] = {-0.1, 0.0, 0.1};
int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
for(n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
}
for (n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
}
volk_gnsssdr_16i_xn_resampler_16i_xn_u_avx(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
memcpy((int16_t*)result, (int16_t*)result_aux[0], sizeof(int16_t) * num_points);
for(n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
for (n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
volk_gnsssdr_free(result_aux);
}
@@ -223,26 +223,26 @@ static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_a_avx(int16_t* result,
static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_a_avx(int16_t* result, const int16_t* local_code, unsigned int num_points)
{
int code_length_chips = 2046;
float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points );
float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
int num_out_vectors = 3;
float rem_code_phase_chips = -0.234;
unsigned int n;
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
float shifts_chips[3] = {-0.1, 0.0, 0.1};
int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
for(n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
}
for (n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
}
volk_gnsssdr_16i_xn_resampler_16i_xn_a_avx(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
memcpy((int16_t*)result, (int16_t*)result_aux[0], sizeof(int16_t) * num_points);
for(n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
for (n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
volk_gnsssdr_free(result_aux);
}
@@ -253,30 +253,29 @@ static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_a_avx(int16_t* result,
static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_neon(int16_t* result, const int16_t* local_code, unsigned int num_points)
{
int code_length_chips = 2046;
float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points );
float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
int num_out_vectors = 3;
float rem_code_phase_chips = -0.234;
unsigned int n;
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
float shifts_chips[3] = {-0.1, 0.0, 0.1};
int16_t** result_aux = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
for(n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
}
for (n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
}
volk_gnsssdr_16i_xn_resampler_16i_xn_neon(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
memcpy((int16_t*)result, (int16_t*)result_aux[0], sizeof(int16_t) * num_points);
for(n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
for (n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
volk_gnsssdr_free(result_aux);
}
#endif
#endif // INCLUDED_volk_gnsssdr_16i_resamplerpuppet_16i_H
#endif // INCLUDED_volk_gnsssdr_16i_resamplerpuppet_16i_H
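Note: every puppet above exercises the same underlying kernel: for each output sample n and correlator tap, compute a fractional code phase and map it to a wrapped index into the local code. A hedged scalar sketch of that index computation, reconstructed from the tail loops visible in the next file (the exact wrapping sequence in the real SIMD kernels differs slightly per implementation):

#include <cmath>

static int resample_index(float code_phase_step_chips, unsigned int n,
    float shift_chips, float rem_code_phase_chips, int code_length_chips)
{
    // fractional chip position of output sample n for this correlator tap
    int idx = (int)std::floor(code_phase_step_chips * (float)n + shift_chips - rem_code_phase_chips);
    // wrap into [0, code_length_chips)
    idx %= code_length_chips;
    if (idx < 0) idx += code_length_chips;
    return idx;
}

// result[tap][n] = local_code[idx], which the SSE/AVX/NEON variants below compute
// four or eight indices at a time.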

View File

@@ -107,7 +107,8 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_a_sse4_1(int16_t** resul
const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
__VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4];
__VOLK_ATTR_ALIGNED(16)
int local_code_chip_index[4];
int local_code_chip_index_;
const __m128i zeros = _mm_setzero_si128();
@ -121,7 +122,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_a_sse4_1(int16_t** resul
shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]);
aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
__m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
for (n = 0; n < quarterPoints; n++)
{
aux = _mm_mul_ps(code_phase_step_chips_reg, indexn);
aux = _mm_add_ps(aux, aux2);
@ -139,13 +140,13 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_a_sse4_1(int16_t** resul
aux_i = _mm_and_si128(code_length_chips_reg_i, negatives);
local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i);
_mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg);
for (k = 0; k < 4; ++k)
{
_result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
}
indexn = _mm_add_ps(indexn, fours);
}
for (n = quarterPoints * 4; n < num_points; n++)
{
// resample code for current tap
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
@ -157,7 +158,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_a_sse4_1(int16_t** resul
}
}
#endif
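
/*
 * Illustrative sketch (not part of this commit): the cmp/and/add sequence in
 * the loops above is a branchless fix-up. The compare yields all-ones lanes
 * exactly where an index went negative, so masking code_length_chips with it
 * and adding corrects only those lanes.
 */
#include <emmintrin.h>
static inline __m128i wrap_negative_indices_sketch(__m128i idx, __m128i code_length, __m128i zeros)
{
    __m128i negatives = _mm_cmpgt_epi32(zeros, idx);        /* 0xFFFFFFFF where idx < 0, else 0 */
    __m128i fixup = _mm_and_si128(code_length, negatives);  /* code_length in negative lanes only */
    return _mm_add_epi32(idx, fixup);
}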
#ifdef LV_HAVE_SSE4_1
@ -173,7 +174,8 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_u_sse4_1(int16_t** resul
const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
__VOLK_ATTR_ALIGNED(16)
int local_code_chip_index[4];
int local_code_chip_index_;
const __m128i zeros = _mm_setzero_si128();
@ -187,7 +189,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_u_sse4_1(int16_t** resul
shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]);
aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
__m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
for (n = 0; n < quarterPoints; n++)
{
aux = _mm_mul_ps(code_phase_step_chips_reg, indexn);
aux = _mm_add_ps(aux, aux2);
@ -205,13 +207,13 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_u_sse4_1(int16_t** resul
aux_i = _mm_and_si128(code_length_chips_reg_i, negatives);
local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i);
_mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg);
for (k = 0; k < 4; ++k)
{
_result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
}
indexn = _mm_add_ps(indexn, fours);
}
for (n = quarterPoints * 4; n < num_points; n++)
{
// resample code for current tap
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
@ -240,7 +242,8 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_a_sse3(int16_t** result,
const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
__VOLK_ATTR_ALIGNED(16)
int local_code_chip_index[4];
int local_code_chip_index_;
const __m128i zeros = _mm_setzero_si128();
@ -254,7 +257,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_a_sse3(int16_t** result,
shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]);
aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
__m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
for (n = 0; n < quarterPoints; n++)
{
aux = _mm_mul_ps(code_phase_step_chips_reg, indexn);
aux = _mm_add_ps(aux, aux2);
@ -275,13 +278,13 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_a_sse3(int16_t** result,
aux_i = _mm_and_si128(code_length_chips_reg_i, negatives);
local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i);
_mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg);
for (k = 0; k < 4; ++k)
{
_result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
}
indexn = _mm_add_ps(indexn, fours);
}
for (n = quarterPoints * 4; n < num_points; n++)
{
// resample code for current tap
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
@ -310,7 +313,8 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_u_sse3(int16_t** result,
const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
__VOLK_ATTR_ALIGNED(16)
int local_code_chip_index[4];
int local_code_chip_index_;
const __m128i zeros = _mm_setzero_si128();
@ -324,7 +328,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_u_sse3(int16_t** result,
shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]);
aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
__m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
for (n = 0; n < quarterPoints; n++)
{
aux = _mm_mul_ps(code_phase_step_chips_reg, indexn);
aux = _mm_add_ps(aux, aux2);
@ -345,13 +349,13 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_u_sse3(int16_t** result,
aux_i = _mm_and_si128(code_length_chips_reg_i, negatives);
local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i);
_mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg);
for (k = 0; k < 4; ++k)
{
_result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
}
indexn = _mm_add_ps(indexn, fours);
}
for (n = quarterPoints * 4; n < num_points; n++)
{
// resample code for current tap
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
@ -379,7 +383,8 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_a_avx(int16_t** result,
const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips);
const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips);
__VOLK_ATTR_ALIGNED(32)
int local_code_chip_index[8];
int local_code_chip_index_;
const __m256 zeros = _mm256_setzero_ps();
@ -394,7 +399,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_a_avx(int16_t** result,
shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]);
aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
indexn = n0;
for (n = 0; n < avx_iters; n++)
{
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0);
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3);
@ -412,13 +417,13 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_a_avx(int16_t** result,
// no negatives
c = _mm256_cvtepi32_ps(local_code_chip_index_reg);
negatives = _mm256_cmp_ps(c, zeros, 0x01);
aux3 = _mm256_and_ps(code_length_chips_reg_f, negatives);
aux = _mm256_add_ps(c, aux3);
local_code_chip_index_reg = _mm256_cvttps_epi32(aux);
_mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg);
for (k = 0; k < 8; ++k)
{
_result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]];
}
@ -428,7 +433,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_a_avx(int16_t** result,
_mm256_zeroupper();
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
{
for (n = avx_iters * 8; n < num_points; n++)
{
// resample code for current tap
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
@ -456,7 +461,8 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_u_avx(int16_t** result,
const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips);
const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips);
__VOLK_ATTR_ALIGNED(32)
int local_code_chip_index[8];
int local_code_chip_index_;
const __m256 zeros = _mm256_setzero_ps();
@ -471,7 +477,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_u_avx(int16_t** result,
shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]);
aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
indexn = n0;
for (n = 0; n < avx_iters; n++)
{
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0);
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3);
@ -489,13 +495,13 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_u_avx(int16_t** result,
// no negatives
c = _mm256_cvtepi32_ps(local_code_chip_index_reg);
negatives = _mm256_cmp_ps(c, zeros, 0x01);
aux3 = _mm256_and_ps(code_length_chips_reg_f, negatives);
aux = _mm256_add_ps(c, aux3);
local_code_chip_index_reg = _mm256_cvttps_epi32(aux);
_mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg);
for (k = 0; k < 8; ++k)
{
_result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]];
}
@ -505,7 +511,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_u_avx(int16_t** result,
_mm256_zeroupper();
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
{
for (n = avx_iters * 8; n < num_points; n++)
{
// resample code for current tap
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
@ -531,7 +537,8 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_neon(int16_t** result, c
const float32x4_t rem_code_phase_chips_reg = vdupq_n_f32(rem_code_phase_chips);
const float32x4_t code_phase_step_chips_reg = vdupq_n_f32(code_phase_step_chips);
__VOLK_ATTR_ALIGNED(16)
int32_t local_code_chip_index[4];
int32_t local_code_chip_index_;
const int32x4_t zeros = vdupq_n_s32(0);
@ -539,11 +546,12 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_neon(int16_t** result, c
const int32x4_t code_length_chips_reg_i = vdupq_n_s32((int32_t)code_length_chips);
int32x4_t local_code_chip_index_reg, aux_i, negatives, i;
float32x4_t aux, aux2, shifts_chips_reg, fi, c, j, cTrunc, base, indexn, reciprocal;
__VOLK_ATTR_ALIGNED(16)
const float vec[4] = {0.0f, 1.0f, 2.0f, 3.0f};
uint32x4_t igx;
reciprocal = vrecpeq_f32(code_length_chips_reg_f);
reciprocal = vmulq_f32(vrecpsq_f32(code_length_chips_reg_f, reciprocal), reciprocal);
reciprocal = vmulq_f32(vrecpsq_f32(code_length_chips_reg_f, reciprocal), reciprocal);  // this refinement is required!
float32x4_t n0 = vld1q_f32((float*)vec);
int current_correlator_tap;
unsigned int n;
@ -553,7 +561,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_neon(int16_t** result, c
shifts_chips_reg = vdupq_n_f32((float)shifts_chips[current_correlator_tap]);
aux2 = vsubq_f32(shifts_chips_reg, rem_code_phase_chips_reg);
indexn = n0;
for (n = 0; n < neon_iters; n++)
{
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][4 * n + 3], 1, 0);
__VOLK_GNSSSDR_PREFETCH(&local_code_chip_index[4]);
@ -569,7 +577,7 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_neon(int16_t** result, c
// fmod
c = vmulq_f32(aux, reciprocal);
i = vcvtq_s32_f32(c);
cTrunc = vcvtq_f32_s32(i);
base = vmulq_f32(cTrunc, code_length_chips_reg_f);
aux = vsubq_f32(aux, base);
@ -581,13 +589,13 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_neon(int16_t** result, c
vst1q_s32((int32_t*)local_code_chip_index, local_code_chip_index_reg);
for (k = 0; k < 4; ++k)
{
_result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
}
indexn = vaddq_f32(indexn, fours);
}
for (n = neon_iters * 4; n < num_points; n++)
{
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][n], 1, 0);
// resample code for current tap
@ -605,4 +613,3 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_neon(int16_t** result, c
#endif /*INCLUDED_volk_gnsssdr_16i_xn_resampler_16i_xn_H*/
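
/*
 * Illustrative sketch (not part of this commit) of the "this refinement is
 * required!" comment in the NEON kernel above: vrecpeq_f32 returns only a
 * coarse reciprocal estimate, and each vrecpsq_f32/vmulq_f32 pair is one
 * Newton-Raphson iteration, x' = x * (2 - d * x).
 */
#include <arm_neon.h>
static inline float32x4_t reciprocal_nr_sketch(float32x4_t d)
{
    float32x4_t x = vrecpeq_f32(d);       /* coarse initial estimate of 1/d */
    x = vmulq_f32(vrecpsq_f32(d, x), x);  /* first refinement step */
    x = vmulq_f32(vrecpsq_f32(d, x), x);  /* second step, needed for the exact fmod-style index math */
    return x;
}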
@ -41,7 +41,7 @@
#include <string.h>
#ifdef LV_HAVE_GENERIC
static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_generic(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points)
{
// phases must be normalized. Phase rotator expects a complex exponential input!
float rem_carrier_phase_in_rad = 0.345;
@ -53,14 +53,14 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_generic(lv
unsigned int n;
int num_a_vectors = 3;
int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
for (n = 0; n < num_a_vectors; n++)
{
in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points);
}
volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_generic(result, local_code, phase_inc[0], phase, (const int16_t**)in_a, num_a_vectors, num_points);
for (n = 0; n < num_a_vectors; n++)
{
volk_gnsssdr_free(in_a[n]);
}
@ -71,7 +71,7 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_generic(lv
#ifdef LV_HAVE_GENERIC
static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_generic_reload(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points)
{
// phases must be normalized. Phase rotator expects a complex exponential input!
float rem_carrier_phase_in_rad = 0.345;
@ -83,14 +83,14 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_generic_re
unsigned int n;
int num_a_vectors = 3;
int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
for (n = 0; n < num_a_vectors; n++)
{
in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points);
}
volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_generic_reload(result, local_code, phase_inc[0], phase, (const int16_t**)in_a, num_a_vectors, num_points);
for (n = 0; n < num_a_vectors; n++)
{
volk_gnsssdr_free(in_a[n]);
}
@ -113,50 +113,50 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_a_sse3(lv_
unsigned int n;
int num_a_vectors = 3;
int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
for (n = 0; n < num_a_vectors; n++)
{
in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points);
}
volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_sse3(result, local_code, phase_inc[0], phase, (const int16_t**)in_a, num_a_vectors, num_points);
for (n = 0; n < num_a_vectors; n++)
{
volk_gnsssdr_free(in_a[n]);
}
volk_gnsssdr_free(in_a);
}
#endif  // SSE3
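
/*
 * Illustrative sketch (not part of this commit) of the "phases must be
 * normalized" requirement stated in the puppets above: the rotator kernels
 * expect phase and phase_inc to be unit-magnitude complex exponentials.
 * Assumes <math.h> and the volk_gnsssdr complex types.
 */
static inline void make_unit_phasors_sketch(lv_32fc_t* phase, lv_32fc_t* phase_inc,
    float rem_carrier_phase_in_rad, float phase_step_rad)
{
    /* |phase[0]| == |phase_inc[0]| == 1 by construction */
    phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad));
    phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad));
}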
//#ifdef LV_HAVE_SSE3
//static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_a_sse3_reload(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points)
//{
//// phases must be normalized. Phase rotator expects a complex exponential input!
//float rem_carrier_phase_in_rad = 0.345;
//float phase_step_rad = 0.1;
//lv_32fc_t phase[1];
//phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad));
//lv_32fc_t phase_inc[1];
//phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad));
//unsigned int n;
//int num_a_vectors = 3;
//int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
//for(n = 0; n < num_a_vectors; n++)
//{
//in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
//memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points);
//}
//volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_sse3_reload(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points);
//for(n = 0; n < num_a_vectors; n++)
//{
//volk_gnsssdr_free(in_a[n]);
//}
//volk_gnsssdr_free(in_a);
//}
//#endif // SSE3
@ -175,22 +175,22 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_u_sse3(lv_
unsigned int n;
int num_a_vectors = 3;
int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
for (n = 0; n < num_a_vectors; n++)
{
in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points);
}
volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_u_sse3(result, local_code, phase_inc[0], phase, (const int16_t**)in_a, num_a_vectors, num_points);
for (n = 0; n < num_a_vectors; n++)
{
volk_gnsssdr_free(in_a[n]);
}
volk_gnsssdr_free(in_a);
}
#endif  // SSE3
#ifdef LV_HAVE_AVX2
@ -206,50 +206,50 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_a_avx2(lv_
unsigned int n;
int num_a_vectors = 3;
int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
for (n = 0; n < num_a_vectors; n++)
{
in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points);
}
volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_avx2(result, local_code, phase_inc[0], phase, (const int16_t**)in_a, num_a_vectors, num_points);
for (n = 0; n < num_a_vectors; n++)
{
volk_gnsssdr_free(in_a[n]);
}
volk_gnsssdr_free(in_a);
}
#endif  // AVX2
//#ifdef LV_HAVE_AVX2
//static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_a_avx2_reload(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points)
//{
//// phases must be normalized. Phase rotator expects a complex exponential input!
//float rem_carrier_phase_in_rad = 0.345;
//float phase_step_rad = 0.1;
//lv_32fc_t phase[1];
//phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad));
//lv_32fc_t phase_inc[1];
//phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad));
//unsigned int n;
//int num_a_vectors = 3;
//int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
//for(n = 0; n < num_a_vectors; n++)
//{
//in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
//memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points);
//}
//volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_avx2_reload(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points);
//for(n = 0; n < num_a_vectors; n++)
//{
//volk_gnsssdr_free(in_a[n]);
//}
//volk_gnsssdr_free(in_a);
//}
//#endif // AVX2
@ -268,50 +268,50 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_u_avx2(lv_
unsigned int n;
int num_a_vectors = 3;
int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
for (n = 0; n < num_a_vectors; n++)
{
in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points);
}
volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_u_avx2(result, local_code, phase_inc[0], phase, (const int16_t**)in_a, num_a_vectors, num_points);
for (n = 0; n < num_a_vectors; n++)
{
volk_gnsssdr_free(in_a[n]);
}
volk_gnsssdr_free(in_a);
}
#endif  // AVX2
//#ifdef LV_HAVE_AVX2
//static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_u_avx2_reload(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points)
//{
//// phases must be normalized. Phase rotator expects a complex exponential input!
//float rem_carrier_phase_in_rad = 0.345;
//float phase_step_rad = 0.1;
//lv_32fc_t phase[1];
//phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad));
//lv_32fc_t phase_inc[1];
//phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad));
//unsigned int n;
//int num_a_vectors = 3;
//int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
//for(n = 0; n < num_a_vectors; n++)
//{
//in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
//memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points);
//}
//volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_avx2_reload(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points);
//for(n = 0; n < num_a_vectors; n++)
//{
//volk_gnsssdr_free(in_a[n]);
//}
//volk_gnsssdr_free(in_a);
//}
//#endif // AVX2
@ -320,29 +320,29 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_u_avx2(lv_
//#ifdef LV_HAVE_NEON
//static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_neon(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points)
//{
//// phases must be normalized. Phase rotator expects a complex exponential input!
//float rem_carrier_phase_in_rad = 0.345;
//float phase_step_rad = 0.1;
//lv_32fc_t phase[1];
//phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad));
//lv_32fc_t phase_inc[1];
//phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad));
//unsigned int n;
//int num_a_vectors = 3;
//int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
//for(n = 0; n < num_a_vectors; n++)
//{
//in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
//memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points);
//}
//volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_neon(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points);
//for(n = 0; n < num_a_vectors; n++)
//{
//volk_gnsssdr_free(in_a[n]);
//}
//volk_gnsssdr_free(in_a);
//}
//#endif // NEON
@ -351,34 +351,31 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_u_avx2(lv_
//#ifdef LV_HAVE_NEON
//static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_neon_vma(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points)
//{
//// phases must be normalized. Phase rotator expects a complex exponential input!
//float rem_carrier_phase_in_rad = 0.345;
//float phase_step_rad = 0.1;
//lv_32fc_t phase[1];
//phase[0] = lv_cmake(cos(rem_carrier_phase_in_rad), sin(rem_carrier_phase_in_rad));
//lv_32fc_t phase_inc[1];
//phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad));
//unsigned int n;
//int num_a_vectors = 3;
//int16_t** in_a = (int16_t**)volk_gnsssdr_malloc(sizeof(int16_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
//for(n = 0; n < num_a_vectors; n++)
//{
//in_a[n] = (int16_t*)volk_gnsssdr_malloc(sizeof(int16_t) * num_points, volk_gnsssdr_get_alignment());
//memcpy((int16_t*)in_a[n], (int16_t*)in, sizeof(int16_t) * num_points);
//}
//volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_neon_vma(result, local_code, phase_inc[0], phase, (const int16_t**) in_a, num_a_vectors, num_points);
//for(n = 0; n < num_a_vectors; n++)
//{
//volk_gnsssdr_free(in_a[n]);
//}
//volk_gnsssdr_free(in_a);
//}
//#endif // NEON
#endif // INCLUDED_volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_H
@ -68,7 +68,7 @@ static inline void volk_gnsssdr_16ic_conjugate_16ic_generic(lv_16sc_t* cVector,
const lv_16sc_t* aPtr = aVector;
unsigned int number;
for (number = 0; number < num_points; number++)
{
*cPtr++ = lv_conj(*aPtr++);
}
@ -231,4 +231,3 @@ static inline void volk_gnsssdr_16ic_conjugate_16ic_u_avx2(lv_16sc_t* cVector, c
//#endif /* LV_HAVE_NEON */
#endif /* INCLUDED_volk_gnsssdr_16ic_conjugate_16ic_H */
@ -63,7 +63,7 @@
static inline void volk_gnsssdr_16ic_convert_32fc_generic(lv_32fc_t* outputVector, const lv_16sc_t* inputVector, unsigned int num_points)
{
unsigned int i;
for (i = 0; i < num_points; i++)
{
outputVector[i] = lv_cmake((float)lv_creal(inputVector[i]), (float)lv_cimag(inputVector[i]));
}
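
/*
 * Usage sketch (not part of this commit; dispatcher signature assumed from
 * the kernel name): callers normally invoke the dispatcher, which selects
 * the best machine-specific implementation at runtime.
 */
static inline void convert_example_sketch(lv_32fc_t* out, const lv_16sc_t* in, unsigned int num_points)
{
    volk_gnsssdr_16ic_convert_32fc(out, in, num_points);
}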
@ -82,9 +82,9 @@ static inline void volk_gnsssdr_16ic_convert_32fc_a_sse2(lv_32fc_t* outputVector
lv_32fc_t* _out = outputVector;
__m128 a;
for (i = 0; i < sse_iters; i++)
{
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0])));  // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
_mm_store_ps((float*)_out, a);
_in += 2;
_out += 2;
@ -109,9 +109,9 @@ static inline void volk_gnsssdr_16ic_convert_32fc_u_sse2(lv_32fc_t* outputVector
lv_32fc_t* _out = outputVector;
__m128 a;
for (i = 0; i < sse_iters; i++)
{
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0])));  // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
_mm_storeu_ps((float*)_out, a);
_in += 2;
_out += 2;
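
/*
 * Note: the _a_ (aligned) and _u_ (unaligned) variants of this kernel differ
 * only in the final store (_mm_store_ps vs _mm_storeu_ps), since the input is
 * gathered with _mm_set_ps. Illustrative sketch (hypothetical helper) of an
 * output buffer that satisfies the aligned path:
 */
static inline lv_32fc_t* alloc_aligned_output_sketch(unsigned int num_points)
{
    return (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
}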
@ -136,15 +136,15 @@ static inline void volk_gnsssdr_16ic_convert_32fc_u_axv(lv_32fc_t* outputVector,
lv_32fc_t* _out = outputVector;
__m256 a;
for (i = 0; i < sse_iters; i++)
{
a = _mm256_set_ps((float)(lv_cimag(_in[3])), (float)(lv_creal(_in[3])), (float)(lv_cimag(_in[2])), (float)(lv_creal(_in[2])), (float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0])));  // load (2 byte imag, 2 byte real) x 4 into 256 bits reg
_mm256_storeu_ps((float*)_out, a);
_in += 4;
_out += 4;
}
_mm256_zeroupper();
for (i = 0; i < (num_points % 4); ++i)
{
*_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in));
_in++;
@ -163,15 +163,15 @@ static inline void volk_gnsssdr_16ic_convert_32fc_a_axv(lv_32fc_t* outputVector,
lv_32fc_t* _out = outputVector;
__m256 a;
for (i = 0; i < sse_iters; i++)
{
a = _mm256_set_ps((float)(lv_cimag(_in[3])), (float)(lv_creal(_in[3])), (float)(lv_cimag(_in[2])), (float)(lv_creal(_in[2])), (float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0])));  // load (2 byte imag, 2 byte real) x 4 into 256 bits reg
_mm256_store_ps((float*)_out, a);
_in += 4;
_out += 4;
}
_mm256_zeroupper();
for (i = 0; i < (num_points % 4); ++i)
{
*_out++ = lv_cmake((float)lv_creal(*_in), (float)lv_cimag(*_in));
_in++;
@ -194,7 +194,7 @@ static inline void volk_gnsssdr_16ic_convert_32fc_neon(lv_32fc_t* outputVector,
int32x4_t a32x4;
float32x4_t f32x4;
for (i = 0; i < sse_iters; i++)
{
a16x4 = vld1_s16((const int16_t*)_in);
__VOLK_GNSSSDR_PREFETCH(_in + 4);
@ -78,7 +78,7 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_generic(lv_16sc_t* resu
// resample code for current tap
local_code_chip_index = round(code_phase_step_chips * (float)n + rem_code_phase_chips - 0.5f);
if (local_code_chip_index < 0.0) local_code_chip_index += code_length_chips;
if (local_code_chip_index > (code_length_chips - 1)) local_code_chip_index -= code_length_chips;
result[n] = local_code[local_code_chip_index];
}
}
@ -89,61 +89,66 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_generic(lv_16sc_t* resu
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>
static inline void volk_gnsssdr_16ic_resampler_fast_16ic_a_sse2(lv_16sc_t* result, const lv_16sc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, int code_length_chips, unsigned int num_output_samples)  //, int* scratch_buffer, float* scratch_buffer_float)
{
_MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);  //_MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO
unsigned int number;
const unsigned int quarterPoints = num_output_samples / 4;
lv_16sc_t* _result = result;
__VOLK_ATTR_ALIGNED(16)
int local_code_chip_index[4];
__m128 _rem_code_phase, _code_phase_step_chips;
__m128i _code_length_chips, _code_length_chips_minus1;
__m128 _code_phase_out, _code_phase_out_with_offset;
rem_code_phase_chips = rem_code_phase_chips - 0.5f;
_rem_code_phase = _mm_load1_ps(&rem_code_phase_chips);          //load float to all four float values in m128 register
_code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips);  //load float to all four float values in m128 register
__VOLK_ATTR_ALIGNED(16)
int four_times_code_length_chips_minus1[4];
four_times_code_length_chips_minus1[0] = code_length_chips - 1;
four_times_code_length_chips_minus1[1] = code_length_chips - 1;
four_times_code_length_chips_minus1[2] = code_length_chips - 1;
four_times_code_length_chips_minus1[3] = code_length_chips - 1;
__VOLK_ATTR_ALIGNED(16)
int four_times_code_length_chips[4];
four_times_code_length_chips[0] = code_length_chips;
four_times_code_length_chips[1] = code_length_chips;
four_times_code_length_chips[2] = code_length_chips;
four_times_code_length_chips[3] = code_length_chips;
_code_length_chips = _mm_load_si128((__m128i*)&four_times_code_length_chips);                //load the code length into all four values of the m128i register
_code_length_chips_minus1 = _mm_load_si128((__m128i*)&four_times_code_length_chips_minus1);  //load code_length_chips - 1 into all four values of the m128i register
__m128i negative_indexes, overflow_indexes, _code_phase_out_int, _code_phase_out_int_neg, _code_phase_out_int_over;
__m128i zero = _mm_setzero_si128();
__VOLK_ATTR_ALIGNED(16)
float init_idx_float[4] = {0.0f, 1.0f, 2.0f, 3.0f};
__m128 _4output_index = _mm_load_ps(init_idx_float);
__VOLK_ATTR_ALIGNED(16)
float init_4constant_float[4] = {4.0f, 4.0f, 4.0f, 4.0f};
__m128 _4constant_float = _mm_load_ps(init_4constant_float);
for (number = 0; number < quarterPoints; number++)
{
_code_phase_out = _mm_mul_ps(_code_phase_step_chips, _4output_index);        //compute the code phase point with the phase step
_code_phase_out_with_offset = _mm_add_ps(_code_phase_out, _rem_code_phase);  //add the phase offset
_code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset);          //convert to integer
negative_indexes = _mm_cmplt_epi32(_code_phase_out_int, zero);                     //test for negative values
_code_phase_out_int_neg = _mm_add_epi32(_code_phase_out_int, _code_length_chips);  //the negative values branch
_code_phase_out_int_neg = _mm_xor_si128(_code_phase_out_int, _mm_and_si128(negative_indexes, _mm_xor_si128(_code_phase_out_int_neg, _code_phase_out_int)));
overflow_indexes = _mm_cmpgt_epi32(_code_phase_out_int_neg, _code_length_chips_minus1);  //test for overflow values
_code_phase_out_int_over = _mm_sub_epi32(_code_phase_out_int_neg, _code_length_chips);   //the overflow values branch
_code_phase_out_int_over = _mm_xor_si128(_code_phase_out_int_neg, _mm_and_si128(overflow_indexes, _mm_xor_si128(_code_phase_out_int_over, _code_phase_out_int_neg)));
_mm_store_si128((__m128i*)local_code_chip_index, _code_phase_out_int_over);  // Store the results back
//todo: optimize the local code lookup table with intrinsics, if possible
*_result++ = local_code[local_code_chip_index[0]];
@ -154,7 +159,7 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_a_sse2(lv_16sc_t* resul
_4output_index = _mm_add_ps(_4output_index, _4constant_float);
}
for (number = quarterPoints * 4; number < num_output_samples; number++)
{
local_code_chip_index[0] = (int)(code_phase_step_chips * (float)number + rem_code_phase_chips + 0.5f);
if (local_code_chip_index[0] < 0.0) local_code_chip_index[0] += code_length_chips - 1;
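
/*
 * Illustrative sketch (not part of this commit): the xor/and chains above
 * implement a branchless select, r = mask ? b : a, without SSE4.1 blend
 * instructions.
 */
#include <emmintrin.h>
static inline __m128i select_epi32_sketch(__m128i mask, __m128i b, __m128i a)
{
    /* a ^ (mask & (b ^ a)) yields b in all-ones lanes of mask, a elsewhere */
    return _mm_xor_si128(a, _mm_and_si128(mask, _mm_xor_si128(b, a)));
}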
@ -169,61 +174,66 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_a_sse2(lv_16sc_t* resul
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>
static inline void volk_gnsssdr_16ic_resampler_fast_16ic_u_sse2(lv_16sc_t* result, const lv_16sc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, int code_length_chips, unsigned int num_output_samples)  //, int* scratch_buffer, float* scratch_buffer_float)
{
_MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);  //_MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO
unsigned int number;
const unsigned int quarterPoints = num_output_samples / 4;
lv_16sc_t* _result = result;
__VOLK_ATTR_ALIGNED(16)
int local_code_chip_index[4];
__m128 _rem_code_phase, _code_phase_step_chips;
__m128i _code_length_chips, _code_length_chips_minus1;
__m128 _code_phase_out, _code_phase_out_with_offset;
rem_code_phase_chips = rem_code_phase_chips - 0.5f;
_rem_code_phase = _mm_load1_ps(&rem_code_phase_chips);          //load float to all four float values in m128 register
_code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips);  //load float to all four float values in m128 register
__VOLK_ATTR_ALIGNED(16)
int four_times_code_length_chips_minus1[4];
four_times_code_length_chips_minus1[0] = code_length_chips - 1;
four_times_code_length_chips_minus1[1] = code_length_chips - 1;
four_times_code_length_chips_minus1[2] = code_length_chips - 1;
four_times_code_length_chips_minus1[3] = code_length_chips - 1;
__VOLK_ATTR_ALIGNED(16)
int four_times_code_length_chips[4];
four_times_code_length_chips[0] = code_length_chips;
four_times_code_length_chips[1] = code_length_chips;
four_times_code_length_chips[2] = code_length_chips;
four_times_code_length_chips[3] = code_length_chips;
_code_length_chips = _mm_loadu_si128((__m128i*)&four_times_code_length_chips);                //load the code length into all four values of the m128i register
_code_length_chips_minus1 = _mm_loadu_si128((__m128i*)&four_times_code_length_chips_minus1);  //load code_length_chips - 1 into all four values of the m128i register
__m128i negative_indexes, overflow_indexes, _code_phase_out_int, _code_phase_out_int_neg, _code_phase_out_int_over;
__m128i zero = _mm_setzero_si128();
__VOLK_ATTR_ALIGNED(16)
float init_idx_float[4] = {0.0f, 1.0f, 2.0f, 3.0f};
__m128 _4output_index = _mm_loadu_ps(init_idx_float);
__VOLK_ATTR_ALIGNED(16)
float init_4constant_float[4] = {4.0f, 4.0f, 4.0f, 4.0f};
__m128 _4constant_float = _mm_loadu_ps(init_4constant_float);
for (number = 0; number < quarterPoints; number++)
{
_code_phase_out = _mm_mul_ps(_code_phase_step_chips, _4output_index);        //compute the code phase point with the phase step
_code_phase_out_with_offset = _mm_add_ps(_code_phase_out, _rem_code_phase);  //add the phase offset
_code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset);          //convert to integer
negative_indexes = _mm_cmplt_epi32(_code_phase_out_int, zero);                     //test for negative values
_code_phase_out_int_neg = _mm_add_epi32(_code_phase_out_int, _code_length_chips);  //the negative values branch
_code_phase_out_int_neg = _mm_xor_si128(_code_phase_out_int, _mm_and_si128(negative_indexes, _mm_xor_si128(_code_phase_out_int_neg, _code_phase_out_int)));
overflow_indexes = _mm_cmpgt_epi32(_code_phase_out_int_neg, _code_length_chips_minus1);  //test for overflow values
_code_phase_out_int_over = _mm_sub_epi32(_code_phase_out_int_neg, _code_length_chips);   //the overflow values branch
_code_phase_out_int_over = _mm_xor_si128(_code_phase_out_int_neg, _mm_and_si128(overflow_indexes, _mm_xor_si128(_code_phase_out_int_over, _code_phase_out_int_neg)));
_mm_storeu_si128((__m128i*)local_code_chip_index, _code_phase_out_int_over);  // Store the results back
//todo: optimize the local code lookup table with intrinsics, if possible
*_result++ = local_code[local_code_chip_index[0]];
@ -234,7 +244,7 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_u_sse2(lv_16sc_t* resul
_4output_index = _mm_add_ps(_4output_index, _4constant_float);
}
for (number = quarterPoints * 4; number < num_output_samples; number++)
{
local_code_chip_index[0] = (int)(code_phase_step_chips * (float)number + rem_code_phase_chips + 0.5f);
if (local_code_chip_index[0] < 0.0) local_code_chip_index[0] += code_length_chips - 1;
@ -249,7 +259,7 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_u_sse2(lv_16sc_t* resul
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
static inline void volk_gnsssdr_16ic_resampler_fast_16ic_neon(lv_16sc_t* result, const lv_16sc_t* local_code, float rem_code_phase_chips, float code_phase_step_chips, int code_length_chips, unsigned int num_output_samples)  //, int* scratch_buffer, float* scratch_buffer_float)
{
unsigned int number;
const unsigned int quarterPoints = num_output_samples / 4;
@ -257,57 +267,62 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_neon(lv_16sc_t* result,
lv_16sc_t* _result = result;
__VOLK_ATTR_ALIGNED(16)
int local_code_chip_index[4];
float32x4_t _rem_code_phase, _code_phase_step_chips;
int32x4_t _code_length_chips, _code_length_chips_minus1;
float32x4_t _code_phase_out, _code_phase_out_with_offset;
rem_code_phase_chips = rem_code_phase_chips - 0.5f;
float32x4_t sign, PlusHalf, Round;
_rem_code_phase = vld1q_dup_f32(&rem_code_phase_chips);          //load float to all four lanes of the NEON register
_code_phase_step_chips = vld1q_dup_f32(&code_phase_step_chips);  //load float to all four lanes of the NEON register
__VOLK_ATTR_ALIGNED(16)
int four_times_code_length_chips_minus1[4];
four_times_code_length_chips_minus1[0] = code_length_chips - 1;
four_times_code_length_chips_minus1[1] = code_length_chips - 1;
four_times_code_length_chips_minus1[2] = code_length_chips - 1;
four_times_code_length_chips_minus1[3] = code_length_chips - 1;
__VOLK_ATTR_ALIGNED(16)
int four_times_code_length_chips[4];
four_times_code_length_chips[0] = code_length_chips;
four_times_code_length_chips[1] = code_length_chips;
four_times_code_length_chips[2] = code_length_chips;
four_times_code_length_chips[3] = code_length_chips;
_code_length_chips = vld1q_s32((int32_t*)&four_times_code_length_chips);                //load the code length into all four lanes of the NEON register
_code_length_chips_minus1 = vld1q_s32((int32_t*)&four_times_code_length_chips_minus1);  //load code_length_chips - 1 into all four lanes of the NEON register
int32x4_t _code_phase_out_int, _code_phase_out_int_neg, _code_phase_out_int_over;
uint32x4_t negative_indexes, overflow_indexes;
int32x4_t zero = vmovq_n_s32(0);
__VOLK_ATTR_ALIGNED(16)
float init_idx_float[4] = {0.0f, 1.0f, 2.0f, 3.0f};
float32x4_t _4output_index = vld1q_f32(init_idx_float);
__VOLK_ATTR_ALIGNED(16)
float init_4constant_float[4] = {4.0f, 4.0f, 4.0f, 4.0f};
float32x4_t _4constant_float = vld1q_f32(init_4constant_float);
for (number = 0; number < quarterPoints; number++)
{
_code_phase_out = vmulq_f32(_code_phase_step_chips, _4output_index);        //compute the code phase point with the phase step
_code_phase_out_with_offset = vaddq_f32(_code_phase_out, _rem_code_phase);  //add the phase offset
sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(_code_phase_out_with_offset), 31)));
PlusHalf = vaddq_f32(_code_phase_out_with_offset, half);
Round = vsubq_f32(PlusHalf, sign);
_code_phase_out_int = vcvtq_s32_f32(Round);
negative_indexes = vcltq_s32(_code_phase_out_int, zero);                       //test for negative values
_code_phase_out_int_neg = vaddq_s32(_code_phase_out_int, _code_length_chips);  //the negative values branch
_code_phase_out_int_neg = veorq_s32(_code_phase_out_int, vandq_s32((int32x4_t)negative_indexes, veorq_s32(_code_phase_out_int_neg, _code_phase_out_int)));
overflow_indexes = vcgtq_s32(_code_phase_out_int_neg, _code_length_chips_minus1);   //test for overflow values
_code_phase_out_int_over = vsubq_s32(_code_phase_out_int_neg, _code_length_chips);  //the overflow values branch
_code_phase_out_int_over = veorq_s32(_code_phase_out_int_neg, vandq_s32((int32x4_t)overflow_indexes, veorq_s32(_code_phase_out_int_over, _code_phase_out_int_neg)));
vst1q_s32((int32_t*)local_code_chip_index, _code_phase_out_int_over);  // Store the results back
//todo: optimize the local code lookup table with intrinsics, if possible
*_result++ = local_code[local_code_chip_index[0]];
@ -318,7 +333,7 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_neon(lv_16sc_t* result,
_4output_index = vaddq_f32(_4output_index, _4constant_float);
}
for (number = quarterPoints * 4; number < num_output_samples; number++)
{
local_code_chip_index[0] = (int)(code_phase_step_chips * (float)number + rem_code_phase_chips + 0.5f);
if (local_code_chip_index[0] < 0.0) local_code_chip_index[0] += code_length_chips - 1;
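
/*
 * Illustrative sketch (not part of this commit): ARMv7 NEON has no
 * round-to-nearest float-to-int conversion, so the sign/PlusHalf/Round
 * sequence above emulates round-half-away-from-zero before the truncating
 * vcvtq_s32_f32. Scalar equivalent:
 */
static inline int round_half_away_sketch(float x)
{
    float sign = (x < 0.0f) ? 1.0f : 0.0f;  /* mirrors the sign-bit shift */
    return (int)(x + 0.5f - sign);          /* the C cast truncates toward zero */
}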
@@ -44,7 +44,7 @@ static inline void volk_gnsssdr_16ic_resamplerfastpuppet_16ic_generic(lv_16sc_t*
float rem_code_phase_chips = -0.123;
float code_phase_step_chips = 0.1;
int code_length_chips = 1023;
volk_gnsssdr_16ic_resampler_fast_16ic_generic(result, local_code, rem_code_phase_chips, code_phase_step_chips, code_length_chips, num_points);
volk_gnsssdr_16ic_resampler_fast_16ic_generic(result, local_code, rem_code_phase_chips, code_phase_step_chips, code_length_chips, num_points);
}
#endif /* LV_HAVE_GENERIC */
@@ -55,7 +55,7 @@ static inline void volk_gnsssdr_16ic_resamplerfastpuppet_16ic_a_sse2(lv_16sc_t*
float rem_code_phase_chips = -0.123;
float code_phase_step_chips = 0.1;
int code_length_chips = 1023;
volk_gnsssdr_16ic_resampler_fast_16ic_a_sse2(result, local_code, rem_code_phase_chips, code_phase_step_chips, code_length_chips, num_points );
volk_gnsssdr_16ic_resampler_fast_16ic_a_sse2(result, local_code, rem_code_phase_chips, code_phase_step_chips, code_length_chips, num_points);
}
#endif /* LV_HAVE_SSE2 */
@@ -67,7 +67,7 @@ static inline void volk_gnsssdr_16ic_resamplerfastpuppet_16ic_u_sse2(lv_16sc_t*
float rem_code_phase_chips = -0.123;
float code_phase_step_chips = 0.1;
int code_length_chips = 1023;
volk_gnsssdr_16ic_resampler_fast_16ic_u_sse2(result, local_code, rem_code_phase_chips, code_phase_step_chips, code_length_chips, num_points );
volk_gnsssdr_16ic_resampler_fast_16ic_u_sse2(result, local_code, rem_code_phase_chips, code_phase_step_chips, code_length_chips, num_points);
}
#endif /* LV_HAVE_SSE2 */
@@ -79,9 +79,9 @@ static inline void volk_gnsssdr_16ic_resamplerfastpuppet_16ic_neon(lv_16sc_t* re
float rem_code_phase_chips = -0.123;
float code_phase_step_chips = 0.1;
int code_length_chips = 1023;
volk_gnsssdr_16ic_resampler_fast_16ic_neon(result, local_code, rem_code_phase_chips, code_phase_step_chips, code_length_chips, num_points );
volk_gnsssdr_16ic_resampler_fast_16ic_neon(result, local_code, rem_code_phase_chips, code_phase_step_chips, code_length_chips, num_points);
}
#endif /* LV_HAVE_NEON */
#endif // INCLUDED_volk_gnsssdr_16ic_resamplerfastpuppet_16ic_H
#endif // INCLUDED_volk_gnsssdr_16ic_resamplerfastpuppet_16ic_H
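A note on the naming: the `*_puppet_*` kernels exist so that the generic VOLK_GNSSSDR test and profiling machinery, which drives every kernel through the same `(result, local_code, num_points)` signature, can exercise kernels that take extra arguments. The puppet pins representative values and forwards the call. A minimal sketch of the pattern (the wrapped kernel name is hypothetical):

#include <volk_gnsssdr/volk_gnsssdr.h>

/* Puppet wrapper: adapts a kernel with extra parameters to the fixed
   signature the test harness expects. my_resampler_kernel is illustrative,
   not a real library symbol. */
static inline void my_resampler_kernel_puppet(lv_16sc_t* result,
    const lv_16sc_t* local_code, unsigned int num_points)
{
    const float rem_code_phase_chips = -0.123f; /* pinned test values */
    const float code_phase_step_chips = 0.1f;
    const int code_length_chips = 1023;
    my_resampler_kernel(result, local_code, rem_code_phase_chips,
        code_phase_step_chips, code_length_chips, num_points);
}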
@@ -49,21 +49,21 @@ static inline void volk_gnsssdr_16ic_resamplerfastxnpuppet_16ic_generic(lv_16sc_
int num_out_vectors = 3;
unsigned int n;
float* rem_code_phase_chips = (float*)volk_gnsssdr_malloc(sizeof(float) * num_out_vectors, volk_gnsssdr_get_alignment());
lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
for(n = 0; n < num_out_vectors; n++)
{
rem_code_phase_chips[n] = -0.234;
result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
}
for (n = 0; n < num_out_vectors; n++)
{
rem_code_phase_chips[n] = -0.234;
result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
}
volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_generic(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, code_length_chips, num_out_vectors, num_points);
memcpy((lv_16sc_t*)result, (lv_16sc_t*)result_aux[0], sizeof(lv_16sc_t) * num_points);
volk_gnsssdr_free(rem_code_phase_chips);
for(n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
for (n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
volk_gnsssdr_free(result_aux);
}
@@ -77,22 +77,22 @@ static inline void volk_gnsssdr_16ic_resamplerfastxnpuppet_16ic_a_sse2(lv_16sc_t
int code_length_chips = 2046;
int num_out_vectors = 3;
unsigned int n;
float * rem_code_phase_chips = (float*)volk_gnsssdr_malloc(sizeof(float) * num_out_vectors, volk_gnsssdr_get_alignment());
lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
float* rem_code_phase_chips = (float*)volk_gnsssdr_malloc(sizeof(float) * num_out_vectors, volk_gnsssdr_get_alignment());
lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
for(n = 0; n < num_out_vectors; n++)
{
rem_code_phase_chips[n] = -0.234;
result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
}
for (n = 0; n < num_out_vectors; n++)
{
rem_code_phase_chips[n] = -0.234;
result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
}
volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_a_sse2(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, code_length_chips, num_out_vectors, num_points);
memcpy(result, result_aux[0], sizeof(lv_16sc_t) * num_points);
volk_gnsssdr_free(rem_code_phase_chips);
for(n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
for (n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
volk_gnsssdr_free(result_aux);
}
@@ -106,22 +106,22 @@ static inline void volk_gnsssdr_16ic_resamplerfastxnpuppet_16ic_u_sse2(lv_16sc_t
int code_length_chips = 2046;
int num_out_vectors = 3;
unsigned int n;
float * rem_code_phase_chips = (float*)volk_gnsssdr_malloc(sizeof(float) * num_out_vectors, volk_gnsssdr_get_alignment());
lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
float* rem_code_phase_chips = (float*)volk_gnsssdr_malloc(sizeof(float) * num_out_vectors, volk_gnsssdr_get_alignment());
lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
for(n = 0; n < num_out_vectors; n++)
{
rem_code_phase_chips[n] = -0.234;
result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
}
for (n = 0; n < num_out_vectors; n++)
{
rem_code_phase_chips[n] = -0.234;
result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
}
volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_u_sse2(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, code_length_chips, num_out_vectors, num_points);
memcpy(result, result_aux[0], sizeof(lv_16sc_t) * num_points);
volk_gnsssdr_free(rem_code_phase_chips);
for(n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
for (n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
volk_gnsssdr_free(result_aux);
}
@@ -135,26 +135,26 @@ static inline void volk_gnsssdr_16ic_resamplerfastxnpuppet_16ic_neon(lv_16sc_t*
int code_length_chips = 2046;
int num_out_vectors = 3;
unsigned int n;
float * rem_code_phase_chips = (float*)volk_gnsssdr_malloc(sizeof(float) * num_out_vectors, volk_gnsssdr_get_alignment());
lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
float* rem_code_phase_chips = (float*)volk_gnsssdr_malloc(sizeof(float) * num_out_vectors, volk_gnsssdr_get_alignment());
lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
for(n = 0; n < num_out_vectors; n++)
{
rem_code_phase_chips[n] = -0.234;
result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
}
for (n = 0; n < num_out_vectors; n++)
{
rem_code_phase_chips[n] = -0.234;
result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
}
volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_neon(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, code_length_chips, num_out_vectors, num_points);
memcpy(result, result_aux[0], sizeof(lv_16sc_t) * num_points);
volk_gnsssdr_free(rem_code_phase_chips);
for(n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
for (n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
volk_gnsssdr_free(result_aux);
}
#endif
#endif // INCLUDED_volk_gnsssdr_16ic_resamplerpuppet_16ic_H
#endif // INCLUDED_volk_gnsssdr_16ic_resamplerpuppet_16ic_H
@@ -45,56 +45,56 @@
static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_generic(lv_16sc_t* result, const lv_16sc_t* local_code, unsigned int num_points)
{
int code_length_chips = 2046;
float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points );
float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
int num_out_vectors = 3;
unsigned int n;
float rem_code_phase_chips = -0.234;
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
float shifts_chips[3] = {-0.1, 0.0, 0.1};
lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
for(n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
}
for (n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
}
volk_gnsssdr_16ic_xn_resampler_16ic_xn_generic(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
memcpy((lv_16sc_t*)result, (lv_16sc_t*)result_aux[0], sizeof(lv_16sc_t) * num_points);
for(n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
for (n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
volk_gnsssdr_free(result_aux);
}
#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_SSE3
static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_a_sse3(lv_16sc_t* result, const lv_16sc_t* local_code, unsigned int num_points)
{
int code_length_chips = 2046;
float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points );
float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
int num_out_vectors = 3;
float rem_code_phase_chips = -0.234;
unsigned int n;
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
float shifts_chips[3] = {-0.1, 0.0, 0.1};
lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
for(n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
}
for (n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
}
volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_sse3(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
memcpy((lv_16sc_t*)result, (lv_16sc_t*)result_aux[0], sizeof(lv_16sc_t) * num_points);
for(n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
for (n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
volk_gnsssdr_free(result_aux);
}
@@ -104,26 +104,26 @@ static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_a_sse3(lv_16sc_t* re
static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_u_sse3(lv_16sc_t* result, const lv_16sc_t* local_code, unsigned int num_points)
{
int code_length_chips = 2046;
float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points );
float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
int num_out_vectors = 3;
float rem_code_phase_chips = -0.234;
unsigned int n;
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
float shifts_chips[3] = {-0.1, 0.0, 0.1};
lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
for(n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
}
for (n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
}
volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_sse3(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
memcpy((lv_16sc_t*)result, (lv_16sc_t*)result_aux[0], sizeof(lv_16sc_t) * num_points);
for(n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
for (n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
volk_gnsssdr_free(result_aux);
}
@@ -134,26 +134,26 @@ static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_u_sse3(lv_16sc_t* re
static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_u_sse4_1(lv_16sc_t* result, const lv_16sc_t* local_code, unsigned int num_points)
{
int code_length_chips = 2046;
float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points );
float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
int num_out_vectors = 3;
float rem_code_phase_chips = -0.234;
unsigned int n;
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
float shifts_chips[3] = {-0.1, 0.0, 0.1};
lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
for(n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
}
for (n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
}
volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_sse4_1(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
memcpy((lv_16sc_t*)result, (lv_16sc_t*)result_aux[0], sizeof(lv_16sc_t) * num_points);
for(n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
for (n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
volk_gnsssdr_free(result_aux);
}
@@ -164,26 +164,26 @@ static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_u_sse4_1(lv_16sc_t*
static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_a_sse4_1(lv_16sc_t* result, const lv_16sc_t* local_code, unsigned int num_points)
{
int code_length_chips = 2046;
float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points );
float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
int num_out_vectors = 3;
float rem_code_phase_chips = -0.234;
unsigned int n;
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
float shifts_chips[3] = {-0.1, 0.0, 0.1};
lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
for(n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
}
for (n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
}
volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_sse4_1(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
memcpy((lv_16sc_t*)result, (lv_16sc_t*)result_aux[0], sizeof(lv_16sc_t) * num_points);
for(n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
for (n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
volk_gnsssdr_free(result_aux);
}
@@ -194,26 +194,26 @@ static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_a_sse4_1(lv_16sc_t*
static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_u_avx(lv_16sc_t* result, const lv_16sc_t* local_code, unsigned int num_points)
{
int code_length_chips = 2046;
float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points );
float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
int num_out_vectors = 3;
float rem_code_phase_chips = -0.234;
unsigned int n;
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
float shifts_chips[3] = {-0.1, 0.0, 0.1};
lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
for(n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
}
for (n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
}
volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_avx(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
memcpy((lv_16sc_t*)result, (lv_16sc_t*)result_aux[0], sizeof(lv_16sc_t) * num_points);
for(n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
for (n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
volk_gnsssdr_free(result_aux);
}
@@ -224,26 +224,26 @@ static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_u_avx(lv_16sc_t* res
static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_a_avx(lv_16sc_t* result, const lv_16sc_t* local_code, unsigned int num_points)
{
int code_length_chips = 2046;
float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points );
float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
int num_out_vectors = 3;
float rem_code_phase_chips = -0.234;
unsigned int n;
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
float shifts_chips[3] = {-0.1, 0.0, 0.1};
lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
for(n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
}
for (n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
}
volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_avx(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
memcpy((lv_16sc_t*)result, (lv_16sc_t*)result_aux[0], sizeof(lv_16sc_t) * num_points);
for(n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
for (n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
volk_gnsssdr_free(result_aux);
}
@@ -254,29 +254,29 @@ static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_a_avx(lv_16sc_t* res
static inline void volk_gnsssdr_16ic_resamplerxnpuppet_16ic_neon(lv_16sc_t* result, const lv_16sc_t* local_code, unsigned int num_points)
{
int code_length_chips = 2046;
float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points );
float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
int num_out_vectors = 3;
float rem_code_phase_chips = -0.234;
unsigned int n;
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
float shifts_chips[3] = {-0.1, 0.0, 0.1};
lv_16sc_t** result_aux = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
for(n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
}
for (n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
}
volk_gnsssdr_16ic_xn_resampler_16ic_xn_neon(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
memcpy((lv_16sc_t*)result, (lv_16sc_t*)result_aux[0], sizeof(lv_16sc_t) * num_points);
for(n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
for (n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
volk_gnsssdr_free(result_aux);
}
#endif
#endif // INCLUDED_volk_gnsssdr_16ic_resamplerpuppet_16ic_H
#endif // INCLUDED_volk_gnsssdr_16ic_resamplerpuppet_16ic_H
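The values these xn puppets pin down look like a small correlator bank: `shifts_chips = {-0.1, 0.0, 0.1}` requests three code replicas offset around the prompt, in the manner of early/prompt/late tracking taps, and `code_phase_step_chips` is chosen so the slightly stretched code span maps onto `num_points` output samples. As plain arithmetic, a sketch of what the puppet sets up (the `num_points` value here is an example of ours; the harness picks its own):

/* Sketch: the resampling ratio the xn puppets configure. Each output
   sample advances the code phase by roughly code_length/num_points chips. */
int code_length_chips = 2046;
unsigned int num_points = 4096; /* illustrative; set by the test harness */
float code_phase_step_chips = ((float)code_length_chips + 0.1f) / (float)num_points;
float shifts_chips[3] = {-0.1f, 0.0f, 0.1f}; /* early, prompt, late offsets */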
@@ -70,7 +70,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_generic(lv_16sc_t* ou
unsigned int i = 0;
lv_16sc_t tmp16;
lv_32fc_t tmp32;
for(i = 0; i < (unsigned int)(num_points); ++i)
for (i = 0; i < (unsigned int)(num_points); ++i)
{
tmp16 = *inVector++;
tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase);
@@ -111,8 +111,8 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_generic_reload(lv_16s
*outVector++ = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32)));
(*phase) *= phase_inc;
}
// Regenerate phase
//printf("Phase before regeneration %i: %f,%f Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase));
// Regenerate phase
//printf("Phase before regeneration %i: %f,%f Modulus: %f\n", n,lv_creal(*phase),lv_cimag(*phase), cabsf(*phase));
#ifdef __cplusplus
(*phase) /= std::abs((*phase));
#else
@@ -141,11 +141,13 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3(lv_16sc_t* out
unsigned int number;
__m128 a, b, two_phase_acc_reg, two_phase_inc_reg;
__m128i c1, c2, result;
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_inc[2];
__VOLK_ATTR_ALIGNED(16)
lv_32fc_t two_phase_inc[2];
two_phase_inc[0] = phase_inc * phase_inc;
two_phase_inc[1] = phase_inc * phase_inc;
two_phase_inc_reg = _mm_load_ps((float*) two_phase_inc);
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_acc[2];
two_phase_inc_reg = _mm_load_ps((float*)two_phase_inc);
__VOLK_ATTR_ALIGNED(16)
lv_32fc_t two_phase_acc[2];
two_phase_acc[0] = (*phase);
two_phase_acc[1] = (*phase) * phase_inc;
two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc);
@@ -157,49 +159,49 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3(lv_16sc_t* out
lv_16sc_t tmp16;
lv_32fc_t tmp32;
for(number = 0; number < sse_iters; number++)
for (number = 0; number < sse_iters; number++)
{
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0])));  // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0])));  // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
//complex 32fc multiplication b=a*two_phase_acc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
//next two samples
_in += 2;
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0])));  // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0])));  // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
__VOLK_GNSSSDR_PREFETCH(_in + 8);
//complex 32fc multiplication b=a*two_phase_acc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
// store four output samples
result = _mm_packs_epi32(c1, c2);// convert from 32ic to 16ic
result = _mm_packs_epi32(c1, c2); // convert from 32ic to 16ic
_mm_store_si128((__m128i*)_out, result);
// Regenerate phase
@@ -232,7 +234,6 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3(lv_16sc_t* out
#endif /* LV_HAVE_SSE3 */
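Every rotator variant in this file leans on the same SSE3 idiom for multiplying two interleaved complex floats per 128-bit register: `_mm_moveldup_ps` and `_mm_movehdup_ps` broadcast the real and imaginary parts of one operand, a shuffle swaps re/im in the other, and `_mm_addsub_ps` applies the minus/plus signs of (ar*cr - ai*ci, ai*cr + ar*ci) in a single instruction. Factored out as a self-contained sketch (ours, for illustration; the kernels keep it inlined):

#include <pmmintrin.h> /* SSE3 */

/* Multiply two complex-float pairs held as (re, im, re, im) in a __m128.
   This is exactly the moveldup/movehdup/shuffle/addsub sequence above. */
static inline __m128 complex_mul_sse3(__m128 x, __m128 y)
{
    __m128 yl = _mm_moveldup_ps(y);         /* cr, cr, dr, dr */
    __m128 yh = _mm_movehdup_ps(y);         /* ci, ci, di, di */
    __m128 t1 = _mm_mul_ps(x, yl);          /* ar*cr, ai*cr, br*dr, bi*dr */
    __m128 xs = _mm_shuffle_ps(x, x, 0xB1); /* ai, ar, bi, br */
    __m128 t2 = _mm_mul_ps(xs, yh);         /* ai*ci, ar*ci, bi*di, br*di */
    return _mm_addsub_ps(t1, t2);           /* ar*cr-ai*ci, ai*cr+ar*ci, ... */
}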
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
@@ -244,11 +245,13 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3_reload(lv_16sc
unsigned int j;
__m128 a, b, two_phase_acc_reg, two_phase_inc_reg;
__m128i c1, c2, result;
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_inc[2];
__VOLK_ATTR_ALIGNED(16)
lv_32fc_t two_phase_inc[2];
two_phase_inc[0] = phase_inc * phase_inc;
two_phase_inc[1] = phase_inc * phase_inc;
two_phase_inc_reg = _mm_load_ps((float*) two_phase_inc);
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_acc[2];
two_phase_inc_reg = _mm_load_ps((float*)two_phase_inc);
__VOLK_ATTR_ALIGNED(16)
lv_32fc_t two_phase_acc[2];
two_phase_acc[0] = (*phase);
two_phase_acc[1] = (*phase) * phase_inc;
two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc);
@@ -265,47 +268,47 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3_reload(lv_16sc
{
for (j = 0; j < ROTATOR_RELOAD; j++)
{
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0])));  // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0])));  // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
//complex 32fc multiplication b=a*two_phase_acc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
//next two samples
_in += 2;
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0])));  // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0])));  // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
__VOLK_GNSSSDR_PREFETCH(_in + 8);
//complex 32fc multiplication b=a*two_phase_acc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
// store four output samples
result = _mm_packs_epi32(c1, c2);// convert from 32ic to 16ic
result = _mm_packs_epi32(c1, c2); // convert from 32ic to 16ic
_mm_store_si128((__m128i*)_out, result);
//next two samples
@@ -322,47 +325,47 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3_reload(lv_16sc
for (j = 0; j < sse_iters % ROTATOR_RELOAD; j++)
{
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0])));  // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0])));  // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
//complex 32fc multiplication b=a*two_phase_acc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
//next two samples
_in += 2;
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0])));  // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0])));  // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
__VOLK_GNSSSDR_PREFETCH(_in + 8);
//complex 32fc multiplication b=a*two_phase_acc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
// store four output samples
result = _mm_packs_epi32(c1, c2);// convert from 32ic to 16ic
result = _mm_packs_epi32(c1, c2); // convert from 32ic to 16ic
_mm_store_si128((__m128i*)_out, result);
//next two samples
@@ -385,7 +388,6 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3_reload(lv_16sc
#endif /* LV_HAVE_SSE3 */
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
@@ -395,14 +397,16 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3(lv_16sc_t* out
unsigned int number;
__m128 a, b, two_phase_acc_reg, two_phase_inc_reg;
__m128i c1, c2, result;
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_inc[2];
__VOLK_ATTR_ALIGNED(16)
lv_32fc_t two_phase_inc[2];
two_phase_inc[0] = phase_inc * phase_inc;
two_phase_inc[1] = phase_inc * phase_inc;
two_phase_inc_reg = _mm_load_ps((float*) two_phase_inc);
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_acc[2];
two_phase_inc_reg = _mm_load_ps((float*)two_phase_inc);
__VOLK_ATTR_ALIGNED(16)
lv_32fc_t two_phase_acc[2];
two_phase_acc[0] = (*phase);
two_phase_acc[1] = (*phase) * phase_inc;
two_phase_acc_reg = _mm_load_ps((float*) two_phase_acc);
two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc);
const lv_16sc_t* _in = inVector;
@@ -412,49 +416,49 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3(lv_16sc_t* out
lv_16sc_t tmp16;
lv_32fc_t tmp32;
for(number = 0; number < sse_iters; number++)
for (number = 0; number < sse_iters; number++)
{
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0])));  // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0])));  // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
//complex 32fc multiplication b=a*two_phase_acc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
//next two samples
_in += 2;
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0])));  // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0])));  // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
__VOLK_GNSSSDR_PREFETCH(_in + 8);
//complex 32fc multiplication b=a*two_phase_acc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
// store four output samples
result = _mm_packs_epi32(c1, c2);// convert from 32ic to 16ic
result = _mm_packs_epi32(c1, c2); // convert from 32ic to 16ic
_mm_storeu_si128((__m128i*)_out, result);
// Regenerate phase
@@ -493,147 +497,149 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3(lv_16sc_t* out
static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3_reload(lv_16sc_t* outVector, const lv_16sc_t* inVector, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points)
{
const unsigned int sse_iters = num_points / 4;
unsigned int ROTATOR_RELOAD = 512;
unsigned int n;
unsigned int j;
__m128 a, b, two_phase_acc_reg, two_phase_inc_reg;
__m128i c1, c2, result;
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_inc[2];
two_phase_inc[0] = phase_inc * phase_inc;
two_phase_inc[1] = phase_inc * phase_inc;
two_phase_inc_reg = _mm_load_ps((float*) two_phase_inc);
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_acc[2];
two_phase_acc[0] = (*phase);
two_phase_acc[1] = (*phase) * phase_inc;
two_phase_acc_reg = _mm_load_ps((float*) two_phase_acc);
unsigned int ROTATOR_RELOAD = 512;
unsigned int n;
unsigned int j;
__m128 a, b, two_phase_acc_reg, two_phase_inc_reg;
__m128i c1, c2, result;
__VOLK_ATTR_ALIGNED(16)
lv_32fc_t two_phase_inc[2];
two_phase_inc[0] = phase_inc * phase_inc;
two_phase_inc[1] = phase_inc * phase_inc;
two_phase_inc_reg = _mm_load_ps((float*)two_phase_inc);
__VOLK_ATTR_ALIGNED(16)
lv_32fc_t two_phase_acc[2];
two_phase_acc[0] = (*phase);
two_phase_acc[1] = (*phase) * phase_inc;
two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc);
const lv_16sc_t* _in = inVector;
const lv_16sc_t* _in = inVector;
lv_16sc_t* _out = outVector;
lv_16sc_t* _out = outVector;
__m128 yl, yh, tmp1, tmp2, tmp3;
lv_16sc_t tmp16;
lv_32fc_t tmp32;
__m128 yl, yh, tmp1, tmp2, tmp3;
lv_16sc_t tmp16;
lv_32fc_t tmp32;
for (n = 0; n < sse_iters / ROTATOR_RELOAD; n++)
{
for (j = 0; j < ROTATOR_RELOAD; j++)
{
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0])));  // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
//complex 32fc multiplication b=a*two_phase_acc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
for (n = 0; n < sse_iters / ROTATOR_RELOAD; n++)
{
for (j = 0; j < ROTATOR_RELOAD; j++)
{
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0])));  // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
//complex 32fc multiplication b=a*two_phase_acc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
//next two samples
_in += 2;
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0])));  // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
__VOLK_GNSSSDR_PREFETCH(_in + 8);
//complex 32fc multiplication b=a*two_phase_acc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
//next two samples
_in += 2;
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0])));  // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
__VOLK_GNSSSDR_PREFETCH(_in + 8);
//complex 32fc multiplication b=a*two_phase_acc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
// store four output samples
result = _mm_packs_epi32(c1, c2);// convert from 32ic to 16ic
_mm_storeu_si128((__m128i*)_out, result);
// store four output samples
result = _mm_packs_epi32(c1, c2); // convert from 32ic to 16ic
_mm_storeu_si128((__m128i*)_out, result);
//next two samples
_in += 2;
_out += 4;
}
// Regenerate phase
tmp1 = _mm_mul_ps(two_phase_acc_reg, two_phase_acc_reg);
tmp2 = _mm_hadd_ps(tmp1, tmp1);
tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
tmp2 = _mm_sqrt_ps(tmp1);
two_phase_acc_reg = _mm_div_ps(two_phase_acc_reg, tmp2);
}
//next two samples
_in += 2;
_out += 4;
}
// Regenerate phase
tmp1 = _mm_mul_ps(two_phase_acc_reg, two_phase_acc_reg);
tmp2 = _mm_hadd_ps(tmp1, tmp1);
tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
tmp2 = _mm_sqrt_ps(tmp1);
two_phase_acc_reg = _mm_div_ps(two_phase_acc_reg, tmp2);
}
for (j = 0; j < sse_iters % ROTATOR_RELOAD; j++)
{
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0])));  // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
//complex 32fc multiplication b=a*two_phase_acc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
for (j = 0; j < sse_iters % ROTATOR_RELOAD; j++)
{
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0])));  // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
//complex 32fc multiplication b=a*two_phase_acc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
c1 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
//next two samples
_in += 2;
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0])));  // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
__VOLK_GNSSSDR_PREFETCH(_in + 8);
//complex 32fc multiplication b=a*two_phase_acc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
//next two samples
_in += 2;
a = _mm_set_ps((float)(lv_cimag(_in[1])), (float)(lv_creal(_in[1])), (float)(lv_cimag(_in[0])), (float)(lv_creal(_in[0])));  // load (2 byte imag, 2 byte real) x 2 into 128 bits reg
__VOLK_GNSSSDR_PREFETCH(_in + 8);
//complex 32fc multiplication b=a*two_phase_acc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(a, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
a = _mm_shuffle_ps(a, a, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(a, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
b = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
c2 = _mm_cvtps_epi32(b); // convert from 32fc to 32ic
//complex 32fc multiplication two_phase_acc_reg=two_phase_acc_reg*two_phase_inc_reg
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg); // Load yh with ci,ci,di,di
tmp1 = _mm_mul_ps(two_phase_inc_reg, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
tmp3 = _mm_shuffle_ps(two_phase_inc_reg, two_phase_inc_reg, 0xB1); // Re-arrange x to be ai,ar,bi,br
tmp2 = _mm_mul_ps(tmp3, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
two_phase_acc_reg = _mm_addsub_ps(tmp1, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
// store four output samples
result = _mm_packs_epi32(c1, c2); // convert from 32ic to 16ic
_mm_storeu_si128((__m128i*)_out, result);
//next two samples
_in += 2;
_out += 4;
}
_mm_store_ps((float*)two_phase_acc, two_phase_acc_reg);
(*phase) = two_phase_acc[0];
for (n = sse_iters * 4; n < num_points; ++n)
{
tmp16 = *_in++;
tmp32 = lv_cmake((float)lv_creal(tmp16), (float)lv_cimag(tmp16)) * (*phase);
*_out++ = lv_cmake((int16_t)rintf(lv_creal(tmp32)), (int16_t)rintf(lv_cimag(tmp32)));
(*phase) *= phase_inc;
}
}
#endif /* LV_HAVE_SSE3 */
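Both SSE3 kernels above unroll the same scalar recurrence four samples at a time: multiply each 16-bit complex sample by a unit-magnitude phasor, then advance the phasor by phase_inc. A minimal scalar sketch of that recurrence (the function name is illustrative; it assumes the lv_cmake/lv_creal/lv_cimag helpers from volk_gnsssdr_complex.h and mirrors the tail loop above):

#include <math.h>

/* Reference rotator: out[n] = round(in[n] * phase); phase *= phase_inc. */
static inline void rotator_16ic_scalar_sketch(lv_16sc_t* out, const lv_16sc_t* in, const lv_32fc_t phase_inc, lv_32fc_t* phase, unsigned int num_points)
{
    unsigned int n;
    for (n = 0; n < num_points; n++)
        {
            /* widen to 32fc, rotate, round back to 16ic */
            lv_32fc_t tmp = lv_cmake((float)lv_creal(in[n]), (float)lv_cimag(in[n])) * (*phase);
            out[n] = lv_cmake((int16_t)rintf(lv_creal(tmp)), (int16_t)rintf(lv_cimag(tmp)));
            (*phase) *= phase_inc;
        }
}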
@@ -657,8 +663,10 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_neon(lv_16sc_t* outVe
lv_16sc_t* _out = outVector;
lv_32fc_t ___phase4 = phase_inc * phase_inc * phase_inc * phase_inc;
__VOLK_ATTR_ALIGNED(16)
float32_t __phase4_real[4] = {lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4)};
__VOLK_ATTR_ALIGNED(16)
float32_t __phase4_imag[4] = {lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4)};
float32x4_t _phase4_real = vld1q_f32(__phase4_real);
float32x4_t _phase4_imag = vld1q_f32(__phase4_imag);
@@ -667,8 +675,10 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_neon(lv_16sc_t* outVe
lv_32fc_t phase3 = phase2 * phase_inc;
lv_32fc_t phase4 = phase3 * phase_inc;
__VOLK_ATTR_ALIGNED(16)
float32_t __phase_real[4] = {lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4)};
__VOLK_ATTR_ALIGNED(16)
float32_t __phase_imag[4] = {lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4)};
float32x4_t _phase_real = vld1q_f32(__phase_real);
float32x4_t _phase_imag = vld1q_f32(__phase_imag);
@@ -681,7 +691,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_neon(lv_16sc_t* outVe
if (neon_iters > 0)
{
for (; i < neon_iters; ++i)
{
/* load 4 complex numbers (int 16 bits each component) */
tmp16 = vld2_s16((int16_t*)_in);
@@ -745,8 +755,10 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_neon(lv_16sc_t* outVe
phase3 = phase2 * phase_inc;
phase4 = phase3 * phase_inc;
__VOLK_ATTR_ALIGNED(16)
float32_t ____phase_real[4] = {lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4)};
__VOLK_ATTR_ALIGNED(16)
float32_t ____phase_imag[4] = {lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4)};
_phase_real = vld1q_f32(____phase_real);
_phase_imag = vld1q_f32(____phase_imag);
@@ -757,7 +769,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_neon(lv_16sc_t* outVe
(*phase) = lv_cmake((float32_t)__phase_real[0], (float32_t)__phase_imag[0]);
}
for (i = 0; i < neon_iters % 4; ++i)
{
tmp16_ = *_in++;
tmp32_ = lv_cmake((float32_t)lv_creal(tmp16_), (float32_t)lv_cimag(tmp16_)) * (*phase);
@@ -791,8 +803,10 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_neon_reload(lv_16sc_t
lv_16sc_t* _out = outVector;
lv_32fc_t ___phase4 = phase_inc * phase_inc * phase_inc * phase_inc;
__VOLK_ATTR_ALIGNED(16)
float32_t __phase4_real[4] = {lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4)};
__VOLK_ATTR_ALIGNED(16)
float32_t __phase4_imag[4] = {lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4)};
float32x4_t _phase4_real = vld1q_f32(__phase4_real);
float32x4_t _phase4_imag = vld1q_f32(__phase4_imag);
@@ -801,8 +815,10 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_neon_reload(lv_16sc_t
lv_32fc_t phase3 = phase2 * phase_inc;
lv_32fc_t phase4 = phase3 * phase_inc;
__VOLK_ATTR_ALIGNED(16)
float32_t __phase_real[4] = {lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4)};
__VOLK_ATTR_ALIGNED(16)
float32_t __phase_imag[4] = {lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4)};
float32x4_t _phase_real = vld1q_f32(__phase_real);
float32x4_t _phase_imag = vld1q_f32(__phase_imag);
@@ -879,8 +895,10 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_neon_reload(lv_16sc_t
phase3 = phase2 * phase_inc;
phase4 = phase3 * phase_inc;
__VOLK_ATTR_ALIGNED(16)
float32_t ____phase_real[4] = {lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4)};
__VOLK_ATTR_ALIGNED(16)
float32_t ____phase_imag[4] = {lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4)};
_phase_real = vld1q_f32(____phase_real);
_phase_imag = vld1q_f32(____phase_imag);
@@ -945,7 +963,7 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_neon_reload(lv_16sc_t
(*phase) = lv_cmake((float32_t)__phase_real[0], (float32_t)__phase_imag[0]);
}
for (i = 0; i < neon_iters % 4; ++i)
{
tmp16_ = *_in++;
tmp32_ = lv_cmake((float32_t)lv_creal(tmp16_), (float32_t)lv_cimag(tmp16_)) * (*phase);
@@ -73,7 +73,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_generic(lv_16sc_t* result,
for (n = 0; n < num_points; n++)
{
lv_16sc_t tmp = in_a[n] * in_b[n];
result[0] = lv_cmake(sat_adds16i(lv_creal(result[0]), lv_creal(tmp)), sat_adds16i(lv_cimag(result[0]), lv_cimag(tmp)));
}
}
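The generic kernel accumulates with 16-bit saturating adds rather than wrap-around arithmetic, matching the _mm_adds_epi16-based accumulation in the SSE2/AVX2 versions below. A clamp-based sketch of what such a saturating add does (the library ships its own sat_adds16i; this version is only illustrative):

#include <stdint.h>

/* Illustrative 16-bit saturating add: widen, clamp to int16_t range, narrow. */
static inline int16_t sat_adds16i_sketch(int16_t x, int16_t y)
{
    int32_t acc = (int32_t)x + (int32_t)y;
    if (acc > INT16_MAX) return INT16_MAX;
    if (acc < INT16_MIN) return INT16_MIN;
    return (int16_t)acc;
}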
@@ -96,7 +96,8 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_a_sse2(lv_16sc_t* out, con
if (sse_iters > 0)
{
__m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, realcacc, imagcacc;
__VOLK_ATTR_ALIGNED(16)
lv_16sc_t dotProductVector[4];
realcacc = _mm_setzero_si128();
imagcacc = _mm_setzero_si128();
@@ -104,25 +105,25 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_a_sse2(lv_16sc_t* out, con
mask_imag = _mm_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
mask_real = _mm_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
for (number = 0; number < sse_iters; number++)
{
// a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
a = _mm_load_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
__VOLK_GNSSSDR_PREFETCH(_in_a + 8);
b = _mm_load_si128((__m128i*)_in_b);
__VOLK_GNSSSDR_PREFETCH(_in_b + 8);
c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....
c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
real = _mm_subs_epi16(c, c_sr);
b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....
imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
imag = _mm_adds_epi16(imag1, imag2); //with saturation arithmetic!
realcacc = _mm_adds_epi16(realcacc, real);
imagcacc = _mm_adds_epi16(imagcacc, imag);
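            /* How the lane arithmetic above works: with samples packed as
               [imag, real] 16-bit lane pairs, _mm_mullo_epi16(a, b) leaves
               a.i*b.i next to a.r*b.r; shifting the products one lane down
               (_mm_srli_si128(c, 2)) and subtracting puts a.r*b.r - a.i*b.i
               in the real lanes. Shifting a and b one lane up instead lines
               a.i up with b.r and b.i up with a.r, so the two products add
               (with saturation) to a.r*b.i + a.i*b.r in the imaginary lanes.
               The mask_real/mask_imag constants later select the valid lanes
               from each accumulator. */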
@@ -136,7 +137,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_a_sse2(lv_16sc_t* out, con
a = _mm_or_si128(realcacc, imagcacc);
_mm_store_si128((__m128i*)dotProductVector, a); // Store the results back into the dot product vector
for (number = 0; number < 4; ++number)
{
@@ -174,7 +175,8 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_u_sse2(lv_16sc_t* out, con
if (sse_iters > 0)
{
__m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, realcacc, imagcacc, result;
__VOLK_ATTR_ALIGNED(16)
lv_16sc_t dotProductVector[4];
realcacc = _mm_setzero_si128();
imagcacc = _mm_setzero_si128();
@@ -182,27 +184,27 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_u_sse2(lv_16sc_t* out, con
mask_imag = _mm_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
mask_real = _mm_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
for (number = 0; number < sse_iters; number++)
{
//std::complex<T> memory structure: real part -> reinterpret_cast<cv T*>(a)[2*i]
            //imaginary part -> reinterpret_cast<cv T*>(a)[2*i + 1]
// a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
a = _mm_loadu_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
__VOLK_GNSSSDR_PREFETCH(_in_a + 8);
b = _mm_loadu_si128((__m128i*)_in_b);
__VOLK_GNSSSDR_PREFETCH(_in_b + 8);
c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....
c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
real = _mm_subs_epi16(c, c_sr);
b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....
imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
imag = _mm_adds_epi16(imag1, imag2); //with saturation arithmetic!
realcacc = _mm_adds_epi16(realcacc, real);
imagcacc = _mm_adds_epi16(imagcacc, imag);
@@ -216,7 +218,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_u_sse2(lv_16sc_t* out, con
result = _mm_or_si128(realcacc, imagcacc);
_mm_storeu_si128((__m128i*)dotProductVector, result); // Store the results back into the dot product vector
for (i = 0; i < 4; ++i)
{
@@ -253,7 +255,8 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_u_axv2(lv_16sc_t* out, con
if (avx_iters > 0)
{
__m256i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, realcacc, imagcacc, result;
__VOLK_ATTR_ALIGNED(32)
lv_16sc_t dotProductVector[8];
realcacc = _mm256_setzero_si256();
imagcacc = _mm256_setzero_si256();
@@ -261,7 +264,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_u_axv2(lv_16sc_t* out, con
mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
for (number = 0; number < avx_iters; number++)
{
a = _mm256_loadu_si256((__m256i*)_in_a);
__VOLK_GNSSSDR_PREFETCH(_in_a + 16);
@@ -269,7 +272,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_u_axv2(lv_16sc_t* out, con
__VOLK_GNSSSDR_PREFETCH(_in_b + 16);
c = _mm256_mullo_epi16(a, b);
c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
real = _mm256_subs_epi16(c, c_sr);
b_sl = _mm256_slli_si256(b, 2);
@@ -278,7 +281,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_u_axv2(lv_16sc_t* out, con
imag1 = _mm256_mullo_epi16(a, b_sl);
imag2 = _mm256_mullo_epi16(b, a_sl);
imag = _mm256_adds_epi16(imag1, imag2); //with saturation arithmetic!
realcacc = _mm256_adds_epi16(realcacc, real);
imagcacc = _mm256_adds_epi16(imagcacc, imag);
@@ -292,7 +295,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_u_axv2(lv_16sc_t* out, con
result = _mm256_or_si256(realcacc, imagcacc);
_mm256_storeu_si256((__m256i*)dotProductVector, result); // Store the results back into the dot product vector
_mm256_zeroupper();
for (i = 0; i < 8; ++i)
@@ -330,7 +333,8 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_a_axv2(lv_16sc_t* out, con
if (avx_iters > 0)
{
__m256i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, realcacc, imagcacc, result;
__VOLK_ATTR_ALIGNED(32)
lv_16sc_t dotProductVector[8];
realcacc = _mm256_setzero_si256();
imagcacc = _mm256_setzero_si256();
@@ -338,7 +342,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_a_axv2(lv_16sc_t* out, con
mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
for (number = 0; number < avx_iters; number++)
{
a = _mm256_load_si256((__m256i*)_in_a);
__VOLK_GNSSSDR_PREFETCH(_in_a + 16);
@@ -346,7 +350,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_a_axv2(lv_16sc_t* out, con
__VOLK_GNSSSDR_PREFETCH(_in_b + 16);
c = _mm256_mullo_epi16(a, b);
c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
real = _mm256_subs_epi16(c, c_sr);
b_sl = _mm256_slli_si256(b, 2);
@@ -355,7 +359,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_a_axv2(lv_16sc_t* out, con
imag1 = _mm256_mullo_epi16(a, b_sl);
imag2 = _mm256_mullo_epi16(b, a_sl);
imag = _mm256_adds_epi16(imag1, imag2); //with saturation arithmetic!
realcacc = _mm256_adds_epi16(realcacc, real);
imagcacc = _mm256_adds_epi16(imagcacc, imag);
@@ -369,7 +373,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_a_axv2(lv_16sc_t* out, con
result = _mm256_or_si256(realcacc, imagcacc);
_mm256_store_si256((__m256i*)dotProductVector, result); // Store the results back into the dot product vector
_mm256_zeroupper();
for (i = 0; i < 8; ++i)
@@ -397,8 +401,8 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_neon(lv_16sc_t* out, const
unsigned int quarter_points = num_points / 4;
unsigned int number;
lv_16sc_t* a_ptr = (lv_16sc_t*)in_a;
lv_16sc_t* b_ptr = (lv_16sc_t*)in_b;
*out = lv_cmake((int16_t)0, (int16_t)0);
if (quarter_points > 0)
@@ -407,15 +411,16 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_neon(lv_16sc_t* out, const
// 2nd lane holds the imaginary part
int16x4x2_t a_val, b_val, c_val, accumulator;
int16x4x2_t tmp_real, tmp_imag;
__VOLK_ATTR_ALIGNED(16)
lv_16sc_t accum_result[4];
accumulator.val[0] = vdup_n_s16(0);
accumulator.val[1] = vdup_n_s16(0);
lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);
for (number = 0; number < quarter_points; ++number)
{
a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
__VOLK_GNSSSDR_PREFETCH(a_ptr + 8);
__VOLK_GNSSSDR_PREFETCH(b_ptr + 8);
@@ -451,7 +456,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_neon(lv_16sc_t* out, const
}
// tail case
for (number = quarter_points * 4; number < num_points; ++number)
{
*out += (*a_ptr++) * (*b_ptr++);
}
@@ -468,20 +473,21 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_neon_vma(lv_16sc_t* out, c
unsigned int quarter_points = num_points / 4;
unsigned int number;
lv_16sc_t* a_ptr = (lv_16sc_t*)in_a;
lv_16sc_t* b_ptr = (lv_16sc_t*)in_b;
// for 2-lane vectors, 1st lane holds the real part,
// 2nd lane holds the imaginary part
int16x4x2_t a_val, b_val, accumulator;
int16x4x2_t tmp;
__VOLK_ATTR_ALIGNED(16)
lv_16sc_t accum_result[4];
accumulator.val[0] = vdup_n_s16(0);
accumulator.val[1] = vdup_n_s16(0);
for (number = 0; number < quarter_points; ++number)
{
a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
__VOLK_GNSSSDR_PREFETCH(a_ptr + 8);
__VOLK_GNSSSDR_PREFETCH(b_ptr + 8);
@@ -503,7 +509,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_neon_vma(lv_16sc_t* out, c
*out = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];
// tail case
for (number = quarter_points * 4; number < num_points; ++number)
{
*out += (*a_ptr++) * (*b_ptr++);
}
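    /* Identity behind the _vma kernel: with val[0] holding the real lanes
       and val[1] the imaginary lanes of four complex samples,
           real_acc += a.r*b.r - a.i*b.i
           imag_acc += a.r*b.i + a.i*b.r
       which NEON expresses as a vmul_s16 followed by multiply-accumulate
       steps (vmla_s16/vmls_s16); the arithmetic itself sits in the lines
       this hunk elides, so treat this as a sketch of the identity only. */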
@@ -520,22 +526,23 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_neon_optvma(lv_16sc_t* out
unsigned int quarter_points = num_points / 4;
unsigned int number;
lv_16sc_t* a_ptr = (lv_16sc_t*)in_a;
lv_16sc_t* b_ptr = (lv_16sc_t*)in_b;
// for 2-lane vectors, 1st lane holds the real part,
// 2nd lane holds the imaginary part
int16x4x2_t a_val, b_val, accumulator1, accumulator2;
__VOLK_ATTR_ALIGNED(16)
lv_16sc_t accum_result[4];
accumulator1.val[0] = vdup_n_s16(0);
accumulator1.val[1] = vdup_n_s16(0);
accumulator2.val[0] = vdup_n_s16(0);
accumulator2.val[1] = vdup_n_s16(0);
for (number = 0; number < quarter_points; ++number)
{
a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
__VOLK_GNSSSDR_PREFETCH(a_ptr + 8);
__VOLK_GNSSSDR_PREFETCH(b_ptr + 8);
@@ -556,7 +563,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_neon_optvma(lv_16sc_t* out
*out = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];
// tail case
for (number = quarter_points * 4; number < num_points; ++number)
{
*out += (*a_ptr++) * (*b_ptr++);
}
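    /* Why _optvma carries accumulator1 and accumulator2: alternating the
       multiply-accumulate destinations between two independent register
       sets breaks the loop-carried dependency on a single accumulator, so
       back-to-back vmla_s16 operations can overlap in the pipeline; the two
       partial sums are folded together only once per vector after the loop
       (a sketch of the rationale, not of the elided arithmetic). */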
@@ -74,7 +74,7 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_generic(lv_16sc_t* resu
unsigned int n;
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
result[n_vec] = lv_cmake(0, 0);
for (n = 0; n < num_points; n++)
{
//r*a.r - i*a.i, i*a.r + r*a.i
@@ -96,11 +96,11 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_generic_sat(lv_16sc_t*
unsigned int n;
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
result[n_vec] = lv_cmake(0, 0);
for (n = 0; n < num_points; n++)
{
lv_16sc_t tmp = lv_cmake(sat_adds16i(sat_muls16i(lv_creal(in_common[n]), lv_creal(in_a[n_vec][n])), -sat_muls16i(lv_cimag(in_common[n]), lv_cimag(in_a[n_vec][n]))),
sat_adds16i(sat_muls16i(lv_creal(in_common[n]), lv_cimag(in_a[n_vec][n])), sat_muls16i(lv_cimag(in_common[n]), lv_creal(in_a[n_vec][n]))));
result[n_vec] = lv_cmake(sat_adds16i(lv_creal(result[n_vec]), lv_creal(tmp)), sat_adds16i(lv_cimag(result[n_vec]), lv_cimag(tmp)));
}
}
@@ -112,9 +112,9 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_generic_sat(lv_16sc_t*
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>
static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_sse2(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points)
{
lv_16sc_t dotProduct = lv_cmake(0, 0);
int n_vec;
unsigned int index;
const unsigned int sse_iters = num_points / 4;
@@ -125,7 +125,8 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_sse2(lv_16sc_t* resul
if (sse_iters > 0)
{
__VOLK_ATTR_ALIGNED(16)
lv_16sc_t dotProductVector[4];
__m128i* realcacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment());
__m128i* imagcacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment());
@@ -141,25 +142,25 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_sse2(lv_16sc_t* resul
mask_imag = _mm_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
mask_real = _mm_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
for (index = 0; index < sse_iters; index++)
{
// b[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
b = _mm_load_si128((__m128i*)_in_common); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
__VOLK_GNSSSDR_PREFETCH(_in_common + 8);
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
a = _mm_load_si128((__m128i*)&(_in_a[n_vec][index * 4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....
c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
real = _mm_subs_epi16(c, c_sr);
c_sr = _mm_slli_si128(b, 2); // b3.r, b2.i ....
c = _mm_mullo_epi16(a, c_sr); // a3.i*b3.r, ....
c_sr = _mm_slli_si128(a, 2); // a3.r, a2.i ....
imag = _mm_mullo_epi16(b, c_sr); // b3.i*a3.r, ....
imag = _mm_adds_epi16(c, imag);
@@ -176,12 +177,12 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_sse2(lv_16sc_t* resul
a = _mm_or_si128(realcacc[n_vec], imagcacc[n_vec]);
_mm_store_si128((__m128i*)dotProductVector, a); // Store the results back into the dot product vector
dotProduct = lv_cmake(0, 0);
for (index = 0; index < 4; ++index)
{
dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[index])),
sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[index])));
}
_out[n_vec] = dotProduct;
}
@@ -191,12 +192,12 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_sse2(lv_16sc_t* resul
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
for (index = sse_iters * 4; index < num_points; index++)
{
lv_16sc_t tmp = in_common[index] * in_a[n_vec][index];
_out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)),
sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp)));
}
}
}
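All the _xn kernels share one skeleton: each block of the common vector is loaded once and then accumulated against every local replica, which is what makes one _xn call cheaper than num_a_vectors separate dot products. Stripped of intrinsics the pattern is (a sketch; the real kernels saturate the accumulation as in the generic_sat version):

/* Skeleton of the _xn multi-correlator kernels. */
for (index = 0; index < num_points; index++)
    {
        const lv_16sc_t common = in_common[index]; /* loaded once per sample */
        for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
            {
                result[n_vec] += common * in_a[n_vec][index];
            }
    }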
@@ -206,9 +207,9 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_sse2(lv_16sc_t* resul
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>
static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_sse2(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points)
{
lv_16sc_t dotProduct = lv_cmake(0, 0);
int n_vec;
unsigned int index;
const unsigned int sse_iters = num_points / 4;
@@ -219,7 +220,8 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_sse2(lv_16sc_t* resul
if (sse_iters > 0)
{
__VOLK_ATTR_ALIGNED(16)
lv_16sc_t dotProductVector[4];
__m128i* realcacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment());
__m128i* imagcacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment());
@@ -235,25 +237,25 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_sse2(lv_16sc_t* resul
mask_imag = _mm_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
mask_real = _mm_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
for (index = 0; index < sse_iters; index++)
{
// b[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
b = _mm_loadu_si128((__m128i*)_in_common); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
__VOLK_GNSSSDR_PREFETCH(_in_common + 8);
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
a = _mm_loadu_si128((__m128i*)&(_in_a[n_vec][index * 4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....
c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
real = _mm_subs_epi16(c, c_sr);
c_sr = _mm_slli_si128(b, 2); // b3.r, b2.i ....
c = _mm_mullo_epi16(a, c_sr); // a3.i*b3.r, ....
c_sr = _mm_slli_si128(a, 2); // a3.r, a2.i ....
imag = _mm_mullo_epi16(b, c_sr); // b3.i*a3.r, ....
imag = _mm_adds_epi16(c, imag);
@@ -270,12 +272,12 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_sse2(lv_16sc_t* resul
a = _mm_or_si128(realcacc[n_vec], imagcacc[n_vec]);
_mm_store_si128((__m128i*)dotProductVector, a); // Store the results back into the dot product vector
dotProduct = lv_cmake(0, 0);
for (index = 0; index < 4; ++index)
{
dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[index])),
sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[index])));
}
_out[n_vec] = dotProduct;
}
@@ -285,12 +287,12 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_sse2(lv_16sc_t* resul
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
for (index = sse_iters * 4; index < num_points; index++)
{
lv_16sc_t tmp = in_common[index] * in_a[n_vec][index];
_out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)),
sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp)));
}
}
}
@@ -300,9 +302,9 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_sse2(lv_16sc_t* resul
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_avx2(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points)
{
lv_16sc_t dotProduct = lv_cmake(0, 0);
int n_vec;
unsigned int index;
const unsigned int sse_iters = num_points / 8;
@@ -313,7 +315,8 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_avx2(lv_16sc_t* resul
if (sse_iters > 0)
{
__VOLK_ATTR_ALIGNED(32)
lv_16sc_t dotProductVector[8];
__m256i* realcacc = (__m256i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256i), volk_gnsssdr_get_alignment());
__m256i* imagcacc = (__m256i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256i), volk_gnsssdr_get_alignment());
@@ -329,24 +332,24 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_avx2(lv_16sc_t* resul
mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
for (index = 0; index < sse_iters; index++)
{
b = _mm256_load_si256((__m256i*)_in_common);
__VOLK_GNSSSDR_PREFETCH(_in_common + 16);
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
a = _mm256_load_si256((__m256i*)&(_in_a[n_vec][index * 8]));
c = _mm256_mullo_epi16(a, b);
c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
real = _mm256_subs_epi16(c, c_sr);
c_sr = _mm256_slli_si256(b, 2); // b3.r, b2.i ....
c = _mm256_mullo_epi16(a, c_sr); // a3.i*b3.r, ....
c_sr = _mm256_slli_si256(a, 2); // a3.r, a2.i ....
imag = _mm256_mullo_epi16(b, c_sr); // b3.i*a3.r, ....
imag = _mm256_adds_epi16(c, imag);
@@ -363,12 +366,12 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_avx2(lv_16sc_t* resul
a = _mm256_or_si256(realcacc[n_vec], imagcacc[n_vec]);
_mm256_store_si256((__m256i*)dotProductVector, a); // Store the results back into the dot product vector
dotProduct = lv_cmake(0, 0);
for (index = 0; index < 8; ++index)
{
dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[index])),
sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[index])));
}
_out[n_vec] = dotProduct;
}
@@ -379,12 +382,12 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_avx2(lv_16sc_t* resul
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
for (index = sse_iters * 8; index < num_points; index++)
{
lv_16sc_t tmp = in_common[index] * in_a[n_vec][index];
_out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)),
sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp)));
}
}
}
@@ -394,9 +397,9 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_avx2(lv_16sc_t* resul
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_avx2(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points)
{
lv_16sc_t dotProduct = lv_cmake(0, 0);
const unsigned int sse_iters = num_points / 8;
int n_vec;
@@ -407,7 +410,8 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_avx2(lv_16sc_t* resul
if (sse_iters > 0)
{
__VOLK_ATTR_ALIGNED(32)
lv_16sc_t dotProductVector[8];
__m256i* realcacc = (__m256i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256i), volk_gnsssdr_get_alignment());
__m256i* imagcacc = (__m256i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256i), volk_gnsssdr_get_alignment());
@@ -423,24 +427,24 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_avx2(lv_16sc_t* resul
mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
for (index = 0; index < sse_iters; index++)
{
b = _mm256_loadu_si256((__m256i*)_in_common);
__VOLK_GNSSSDR_PREFETCH(_in_common + 16);
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
a = _mm256_loadu_si256((__m256i*)&(_in_a[n_vec][index * 8]));
c = _mm256_mullo_epi16(a, b);
c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
real = _mm256_subs_epi16(c, c_sr);
c_sr = _mm256_slli_si256(b, 2); // b3.r, b2.i ....
c = _mm256_mullo_epi16(a, c_sr); // a3.i*b3.r, ....
c_sr = _mm256_slli_si256(a, 2); // a3.r, a2.i ....
imag = _mm256_mullo_epi16(b, c_sr); // b3.i*a3.r, ....
imag = _mm256_adds_epi16(c, imag);
@@ -457,12 +461,12 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_avx2(lv_16sc_t* resul
a = _mm256_or_si256(realcacc[n_vec], imagcacc[n_vec]);
_mm256_store_si256((__m256i*)dotProductVector, a); // Store the results back into the dot product vector
dotProduct = lv_cmake(0, 0);
for (index = 0; index < 8; ++index)
{
dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[index])),
sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[index])));
}
_out[n_vec] = dotProduct;
}
@@ -473,12 +477,12 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_avx2(lv_16sc_t* resul
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
for (index = sse_iters * 8; index < num_points; index++)
{
lv_16sc_t tmp = in_common[index] * in_a[n_vec][index];
_out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)),
sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp)));
}
}
}
@@ -488,9 +492,9 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_avx2(lv_16sc_t* resul
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points)
{
lv_16sc_t dotProduct = lv_cmake(0, 0);
int n_vec;
unsigned int index;
const unsigned int neon_iters = num_points / 4;
@@ -501,7 +505,8 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon(lv_16sc_t* result,
if (neon_iters > 0)
{
__VOLK_ATTR_ALIGNED(16)
lv_16sc_t dotProductVector[4];
int16x4x2_t a_val, b_val, c_val;
@@ -509,19 +514,19 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon(lv_16sc_t* result,
int16x4x2_t tmp_real, tmp_imag;
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
accumulator[n_vec].val[0] = vdup_n_s16(0);
accumulator[n_vec].val[1] = vdup_n_s16(0);
}
for (index = 0; index < neon_iters; index++)
{
b_val = vld2_s16((int16_t*)_in_common); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
__VOLK_GNSSSDR_PREFETCH(_in_common + 8);
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
a_val = vld2_s16((int16_t*)&(_in_a[n_vec][index * 4])); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
//__VOLK_GNSSSDR_PREFETCH(&_in_a[n_vec][index*4] + 8);
// multiply the real*real and imag*imag to get real result
@@ -547,12 +552,12 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon(lv_16sc_t* result,
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
vst2_s16((int16_t*)dotProductVector, accumulator[n_vec]); // Store the results back into the dot product vector
dotProduct = lv_cmake(0, 0);
for (index = 0; index < 4; ++index)
{
dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[index])),
sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[index])));
}
_out[n_vec] = dotProduct;
}
@@ -561,12 +566,12 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon(lv_16sc_t* result,
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
for (index = neon_iters * 4; index < num_points; index++)
{
lv_16sc_t tmp = in_common[index] * in_a[n_vec][index];
_out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)),
sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp)));
}
}
}
@@ -576,9 +581,9 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon(lv_16sc_t* result,
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_vma(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points)
{
lv_16sc_t dotProduct = lv_cmake(0, 0);
const unsigned int neon_iters = num_points / 4;
int n_vec;
@@ -589,25 +594,26 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_vma(lv_16sc_t* res
if (neon_iters > 0)
{
__VOLK_ATTR_ALIGNED(16)
lv_16sc_t dotProductVector[4];
int16x4x2_t a_val, b_val, tmp;
int16x4x2_t* accumulator = (int16x4x2_t*)volk_gnsssdr_malloc(num_a_vectors * sizeof(int16x4x2_t), volk_gnsssdr_get_alignment());
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
accumulator[n_vec].val[0] = vdup_n_s16(0);
accumulator[n_vec].val[1] = vdup_n_s16(0);
}
for (index = 0; index < neon_iters; index++)
{
b_val = vld2_s16((int16_t*)_in_common); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
__VOLK_GNSSSDR_PREFETCH(_in_common + 8);
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
a_val = vld2_s16((int16_t*)&(_in_a[n_vec][index * 4]));
tmp.val[0] = vmul_s16(a_val.val[0], b_val.val[0]);
tmp.val[1] = vmul_s16(a_val.val[1], b_val.val[0]);
@@ -624,12 +630,12 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_vma(lv_16sc_t* res
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
vst2_s16((int16_t*)dotProductVector, accumulator[n_vec]); // Store the results back into the dot product vector
dotProduct = lv_cmake(0, 0);
for (index = 0; index < 4; ++index)
{
dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[index])),
sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[index])));
}
_out[n_vec] = dotProduct;
}
@@ -638,12 +644,12 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_vma(lv_16sc_t* res
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
for (index = neon_iters * 4; index < num_points; index++)
{
lv_16sc_t tmp = in_common[index] * in_a[n_vec][index];
_out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)),
sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp)));
}
}
}
@@ -653,9 +659,9 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_optvma(lv_16sc_t*
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_optvma(lv_16sc_t* result, const lv_16sc_t* in_common, const lv_16sc_t** in_a, int num_a_vectors, unsigned int num_points)
{
lv_16sc_t dotProduct = lv_cmake(0, 0);
const unsigned int neon_iters = num_points / 4;
int n_vec;
@@ -666,14 +672,15 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_optvma(lv_16sc_t*
if (neon_iters > 0)
{
__VOLK_ATTR_ALIGNED(16)
lv_16sc_t dotProductVector[4];
int16x4x2_t a_val, b_val;
int16x4x2_t* accumulator1 = (int16x4x2_t*)volk_gnsssdr_malloc(num_a_vectors * sizeof(int16x4x2_t), volk_gnsssdr_get_alignment());
int16x4x2_t* accumulator2 = (int16x4x2_t*)volk_gnsssdr_malloc(num_a_vectors * sizeof(int16x4x2_t), volk_gnsssdr_get_alignment());
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
accumulator1[n_vec].val[0] = vdup_n_s16(0);
accumulator1[n_vec].val[1] = vdup_n_s16(0);
@@ -681,13 +688,13 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_optvma(lv_16sc_t*
accumulator2[n_vec].val[1] = vdup_n_s16(0);
}
for (index = 0; index < neon_iters; index++)
{
b_val = vld2_s16((int16_t*)_in_common); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
__VOLK_GNSSSDR_PREFETCH(_in_common + 8);
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
a_val = vld2_s16((int16_t*)&(_in_a[n_vec][index * 4]));
accumulator1[n_vec].val[0] = vmla_s16(accumulator1[n_vec].val[0], a_val.val[0], b_val.val[0]);
accumulator1[n_vec].val[1] = vmla_s16(accumulator1[n_vec].val[1], a_val.val[0], b_val.val[1]);
@@ -705,12 +712,12 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_optvma(lv_16sc_t*
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
vst2_s16((int16_t*)dotProductVector, accumulator1[n_vec]); // Store the results back into the dot product vector
dotProduct = lv_cmake(0, 0);
for (index = 0; index < 4; ++index)
{
dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[index])),
sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[index])));
}
_out[n_vec] = dotProduct;
}
@@ -720,12 +727,12 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_optvma(lv_16sc_t*
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
for (index = neon_iters * 4; index < num_points; index++)
{
lv_16sc_t tmp = in_common[index] * in_a[n_vec][index];
_out[n_vec] = lv_cmake(sat_adds16i(lv_creal(_out[n_vec]), lv_creal(tmp)),
sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp)));
}
}
}
@@ -47,22 +47,22 @@ static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_generic(lv_16sc_t*
int num_a_vectors = 3;
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
unsigned int n;
for (n = 0; n < num_a_vectors; n++)
{
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
}
volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_generic(result, local_code, (const lv_16sc_t**)in_a, num_a_vectors, num_points);
for (n = 0; n < num_a_vectors; n++)
{
volk_gnsssdr_free(in_a[n]);
}
volk_gnsssdr_free(in_a);
}
#endif /* Generic */
#ifdef LV_HAVE_GENERIC
@@ -71,22 +71,22 @@ static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_generic_sat(lv_16sc
int num_a_vectors = 3;
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
unsigned int n;
for (n = 0; n < num_a_vectors; n++)
{
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
}
volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_generic_sat(result, local_code, (const lv_16sc_t**)in_a, num_a_vectors, num_points);
for (n = 0; n < num_a_vectors; n++)
{
volk_gnsssdr_free(in_a[n]);
}
volk_gnsssdr_free(in_a);
}
#endif /* Generic */
#ifdef LV_HAVE_SSE2
@@ -95,18 +95,18 @@ static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_a_sse2(lv_16sc_t* r
int num_a_vectors = 3;
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
unsigned int n;
for (n = 0; n < num_a_vectors; n++)
{
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
}
volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_sse2(result, local_code, (const lv_16sc_t**)in_a, num_a_vectors, num_points);
for (n = 0; n < num_a_vectors; n++)
{
volk_gnsssdr_free(in_a[n]);
}
volk_gnsssdr_free(in_a);
}
@@ -120,18 +120,18 @@ static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_u_sse2(lv_16sc_t* r
int num_a_vectors = 3;
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
unsigned int n;
for (n = 0; n < num_a_vectors; n++)
{
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
}
volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_sse2(result, local_code, (const lv_16sc_t**)in_a, num_a_vectors, num_points);
for (n = 0; n < num_a_vectors; n++)
{
volk_gnsssdr_free(in_a[n]);
}
volk_gnsssdr_free(in_a);
}
@@ -145,18 +145,18 @@ static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_a_avx2(lv_16sc_t* r
int num_a_vectors = 3;
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
unsigned int n;
for (n = 0; n < num_a_vectors; n++)
{
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
}
volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_avx2(result, local_code, (const lv_16sc_t**)in_a, num_a_vectors, num_points);
for (n = 0; n < num_a_vectors; n++)
{
volk_gnsssdr_free(in_a[n]);
}
volk_gnsssdr_free(in_a);
}
@@ -170,18 +170,18 @@ static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_u_avx2(lv_16sc_t* r
int num_a_vectors = 3;
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
unsigned int n;
for (n = 0; n < num_a_vectors; n++)
{
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
}
volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_avx2(result, local_code, (const lv_16sc_t**)in_a, num_a_vectors, num_points);
for(n = 0; n < num_a_vectors; n++)
{
volk_gnsssdr_free(in_a[n]);
}
for (n = 0; n < num_a_vectors; n++)
{
volk_gnsssdr_free(in_a[n]);
}
volk_gnsssdr_free(in_a);
}
@@ -195,22 +195,22 @@ static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_neon(lv_16sc_t* res
int num_a_vectors = 3;
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
unsigned int n;
for(n = 0; n < num_a_vectors; n++)
{
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t)*num_points, volk_gnsssdr_get_alignment());
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t)*num_points);
}
for (n = 0; n < num_a_vectors; n++)
{
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
}
volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon(result, local_code, (const lv_16sc_t**) in_a, num_a_vectors, num_points);
volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon(result, local_code, (const lv_16sc_t**)in_a, num_a_vectors, num_points);
for(n = 0; n < num_a_vectors; n++)
{
volk_gnsssdr_free(in_a[n]);
}
for (n = 0; n < num_a_vectors; n++)
{
volk_gnsssdr_free(in_a[n]);
}
volk_gnsssdr_free(in_a);
}
#endif // NEON
#endif // NEON
#ifdef LV_HAVE_NEON
@@ -220,22 +220,22 @@ static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_neon_vma(lv_16sc_t*
int num_a_vectors = 3;
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
unsigned int n;
for(n = 0; n < num_a_vectors; n++)
{
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t)*num_points, volk_gnsssdr_get_alignment());
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t)*num_points);
}
for (n = 0; n < num_a_vectors; n++)
{
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
}
volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_vma(result, local_code, (const lv_16sc_t**) in_a, num_a_vectors, num_points);
volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_vma(result, local_code, (const lv_16sc_t**)in_a, num_a_vectors, num_points);
for(n = 0; n < num_a_vectors; n++)
{
volk_gnsssdr_free(in_a[n]);
}
for (n = 0; n < num_a_vectors; n++)
{
volk_gnsssdr_free(in_a[n]);
}
volk_gnsssdr_free(in_a);
}
#endif // NEON
#endif // NEON
#ifdef LV_HAVE_NEON
@@ -244,23 +244,21 @@ static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_neon_optvma(lv_16sc
int num_a_vectors = 3;
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
unsigned int n;
for(n = 0; n < num_a_vectors; n++)
{
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t)*num_points, volk_gnsssdr_get_alignment());
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t)*num_points);
}
for (n = 0; n < num_a_vectors; n++)
{
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
}
volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_optvma(result, local_code, (const lv_16sc_t**) in_a, num_a_vectors, num_points);
volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_optvma(result, local_code, (const lv_16sc_t**)in_a, num_a_vectors, num_points);
for(n = 0; n < num_a_vectors; n++)
{
volk_gnsssdr_free(in_a[n]);
}
for (n = 0; n < num_a_vectors; n++)
{
volk_gnsssdr_free(in_a[n]);
}
volk_gnsssdr_free(in_a);
}
#endif // NEON
#endif // NEON
#endif // INCLUDED_volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_H
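Aside: every puppet in this file follows the same harness shape — replicate the single test input into num_a_vectors identical vectors, call the multi-vector kernel (which computes result[v] as the sum over n of in_a[v][n] * local_code[n]), then release the copies. A minimal sketch of that pattern in plain C, assuming the file's existing includes (string.h for memcpy) and a generic kernel name that follows the pattern of the SIMD calls above:
/* Sketch only: the shared shape of the puppet wrappers in this file. */
static inline void dotprodxnpuppet_pattern_sketch(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points)
{
    int num_a_vectors = 3; /* the puppets pin the vector count to 3 */
    lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
    unsigned int n;
    for (n = 0; n < num_a_vectors; n++)
        {
            /* identical copies of the test input for the xn kernel */
            in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
            memcpy(in_a[n], in, sizeof(lv_16sc_t) * num_points);
        }
    volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_generic(result, local_code, (const lv_16sc_t**)in_a, num_a_vectors, num_points);
    for (n = 0; n < num_a_vectors; n++)
        {
            volk_gnsssdr_free(in_a[n]);
        }
    volk_gnsssdr_free(in_a);
}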

View File

@@ -91,29 +91,29 @@ static inline void volk_gnsssdr_16ic_x2_multiply_16ic_a_sse2(lv_16sc_t* out, con
const lv_16sc_t* _in_a = in_a;
const lv_16sc_t* _in_b = in_b;
lv_16sc_t* _out = out;
for(number = 0; number < sse_iters; number++)
for (number = 0; number < sse_iters; number++)
{
//std::complex<T> memory structure: real part -> reinterpret_cast<cv T*>(a)[2*i]
//imaginary part -> reinterpret_cast<cv T*>(a)[2*i + 1]
// a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
a = _mm_load_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
a = _mm_load_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
b = _mm_load_si128((__m128i*)_in_b);
c = _mm_mullo_epi16 (a, b); // a3.i*b3.i, a3.r*b3.r, ....
c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....
c_sr = _mm_srli_si128 (c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
real = _mm_subs_epi16 (c, c_sr);
real = _mm_and_si128 (real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i
c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
real = _mm_subs_epi16(c, c_sr);
real = _mm_and_si128(real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i
b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....
b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....
imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
imag = _mm_adds_epi16(imag1, imag2);
imag = _mm_and_si128 (imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ...
imag = _mm_and_si128(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ...
result = _mm_or_si128 (real, imag);
result = _mm_or_si128(real, imag);
_mm_store_si128((__m128i*)_out, result);
@@ -137,7 +137,7 @@ static inline void volk_gnsssdr_16ic_x2_multiply_16ic_u_sse2(lv_16sc_t* out, con
{
const unsigned int sse_iters = num_points / 4;
unsigned int number;
__m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1,imag2, b_sl, a_sl, result;
__m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, result;
mask_imag = _mm_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
mask_real = _mm_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
@@ -145,29 +145,29 @@ static inline void volk_gnsssdr_16ic_x2_multiply_16ic_u_sse2(lv_16sc_t* out, con
const lv_16sc_t* _in_a = in_a;
const lv_16sc_t* _in_b = in_b;
lv_16sc_t* _out = out;
for(number = 0; number < sse_iters; number++)
for (number = 0; number < sse_iters; number++)
{
//std::complex<T> memory structure: real part -> reinterpret_cast<cv T*>(a)[2*i]
//imaginary part -> reinterpret_cast<cv T*>(a)[2*i + 1]
// a[127:0]=[a3.i,a3.r,a2.i,a2.r,a1.i,a1.r,a0.i,a0.r]
a = _mm_loadu_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
a = _mm_loadu_si128((__m128i*)_in_a); //load (2 byte imag, 2 byte real) x 4 into 128 bits reg
b = _mm_loadu_si128((__m128i*)_in_b);
c = _mm_mullo_epi16 (a, b); // a3.i*b3.i, a3.r*b3.r, ....
c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....
c_sr = _mm_srli_si128 (c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
real = _mm_subs_epi16 (c, c_sr);
real = _mm_and_si128 (real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i
c_sr = _mm_srli_si128(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
real = _mm_subs_epi16(c, c_sr);
real = _mm_and_si128(real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i
b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....
b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....
imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
imag = _mm_adds_epi16(imag1, imag2);
imag = _mm_and_si128 (imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ...
imag = _mm_and_si128(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ...
result = _mm_or_si128 (real, imag);
result = _mm_or_si128(real, imag);
_mm_storeu_si128((__m128i*)_out, result);
@@ -196,29 +196,29 @@ static inline void volk_gnsssdr_16ic_x2_multiply_16ic_u_avx2(lv_16sc_t* out, con
const lv_16sc_t* _in_b = in_b;
lv_16sc_t* _out = out;
__m256i a, b, c, c_sr, real, imag, imag1, imag2, b_sl, a_sl, result;
__m256i a, b, c, c_sr, real, imag, imag1, imag2, b_sl, a_sl, result;
const __m256i mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
const __m256i mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
for(;number < avx2_points; number++)
for (; number < avx2_points; number++)
{
a = _mm256_loadu_si256((__m256i*)_in_a); // Load the ar + ai, br + bi as ar,ai,br,bi
b = _mm256_loadu_si256((__m256i*)_in_b); // Load the cr + ci, dr + di as cr,ci,dr,di
a = _mm256_loadu_si256((__m256i*)_in_a); // Load the ar + ai, br + bi as ar,ai,br,bi
b = _mm256_loadu_si256((__m256i*)_in_b); // Load the cr + ci, dr + di as cr,ci,dr,di
c = _mm256_mullo_epi16(a, b);
c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
real = _mm256_subs_epi16(c, c_sr);
real = _mm256_and_si256(real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i
real = _mm256_and_si256(real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i
b_sl = _mm256_slli_si256(b, 2); // b3.r, b2.i ....
a_sl = _mm256_slli_si256(a, 2); // a3.r, a2.i ....
b_sl = _mm256_slli_si256(b, 2); // b3.r, b2.i ....
a_sl = _mm256_slli_si256(a, 2); // a3.r, a2.i ....
imag1 = _mm256_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
imag2 = _mm256_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
imag1 = _mm256_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
imag2 = _mm256_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
imag = _mm256_adds_epi16(imag1, imag2);
imag = _mm256_and_si256(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ...
imag = _mm256_and_si256(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ...
result = _mm256_or_si256(real, imag);
@@ -230,7 +230,7 @@ static inline void volk_gnsssdr_16ic_x2_multiply_16ic_u_avx2(lv_16sc_t* out, con
}
_mm256_zeroupper();
number = avx2_points * 8;
for(;number < num_points; number++)
for (; number < num_points; number++)
{
*_out++ = (*_in_a++) * (*_in_b++);
}
@@ -250,29 +250,29 @@ static inline void volk_gnsssdr_16ic_x2_multiply_16ic_a_avx2(lv_16sc_t* out, con
const lv_16sc_t* _in_b = in_b;
lv_16sc_t* _out = out;
__m256i a, b, c, c_sr, real, imag, imag1, imag2, b_sl, a_sl, result;
__m256i a, b, c, c_sr, real, imag, imag1, imag2, b_sl, a_sl, result;
const __m256i mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
const __m256i mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);
for(;number < avx2_points; number++)
for (; number < avx2_points; number++)
{
a = _mm256_load_si256((__m256i*)_in_a); // Load the ar + ai, br + bi as ar,ai,br,bi
b = _mm256_load_si256((__m256i*)_in_b); // Load the cr + ci, dr + di as cr,ci,dr,di
a = _mm256_load_si256((__m256i*)_in_a); // Load the ar + ai, br + bi as ar,ai,br,bi
b = _mm256_load_si256((__m256i*)_in_b); // Load the cr + ci, dr + di as cr,ci,dr,di
c = _mm256_mullo_epi16(a, b);
c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
c_sr = _mm256_srli_si256(c, 2); // Shift a right by imm8 bytes while shifting in zeros, and store the results in dst.
real = _mm256_subs_epi16(c, c_sr);
real = _mm256_and_si256(real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i
real = _mm256_and_si256(real, mask_real); // a3.r*b3.r-a3.i*b3.i , 0, a3.r*b3.r- a3.i*b3.i
b_sl = _mm256_slli_si256(b, 2); // b3.r, b2.i ....
a_sl = _mm256_slli_si256(a, 2); // a3.r, a2.i ....
b_sl = _mm256_slli_si256(b, 2); // b3.r, b2.i ....
a_sl = _mm256_slli_si256(a, 2); // a3.r, a2.i ....
imag1 = _mm256_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
imag2 = _mm256_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
imag1 = _mm256_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
imag2 = _mm256_mullo_epi16(b, a_sl); // b3.i*a3.r, ....
imag = _mm256_adds_epi16(imag1, imag2);
imag = _mm256_and_si256(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ...
imag = _mm256_and_si256(imag, mask_imag); // a3.i*b3.r+b3.i*a3.r, 0, ...
result = _mm256_or_si256(real, imag);
@@ -284,7 +284,7 @@ static inline void volk_gnsssdr_16ic_x2_multiply_16ic_a_avx2(lv_16sc_t* out, con
}
_mm256_zeroupper();
number = avx2_points * 8;
for(;number < num_points; number++)
for (; number < num_points; number++)
{
*_out++ = (*_in_a++) * (*_in_b++);
}
@@ -292,23 +292,22 @@ static inline void volk_gnsssdr_16ic_x2_multiply_16ic_a_avx2(lv_16sc_t* out, con
#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
static inline void volk_gnsssdr_16ic_x2_multiply_16ic_neon(lv_16sc_t* out, const lv_16sc_t* in_a, const lv_16sc_t* in_b, unsigned int num_points)
{
lv_16sc_t *a_ptr = (lv_16sc_t*) in_a;
lv_16sc_t *b_ptr = (lv_16sc_t*) in_b;
lv_16sc_t* a_ptr = (lv_16sc_t*)in_a;
lv_16sc_t* b_ptr = (lv_16sc_t*)in_b;
unsigned int quarter_points = num_points / 4;
int16x4x2_t a_val, b_val, c_val;
int16x4x2_t tmp_real, tmp_imag;
unsigned int number = 0;
for(number = 0; number < quarter_points; ++number)
for (number = 0; number < quarter_points; ++number)
{
a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
__VOLK_GNSSSDR_PREFETCH(a_ptr + 4);
__VOLK_GNSSSDR_PREFETCH(b_ptr + 4);
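Aside: unlike the interleaved SSE/AVX loads above, vld2_s16 deinterleaves as it loads, so the NEON branch gets planar real and imaginary halves and can skip the shift/mask dance entirely. A minimal sketch of what the load yields:
#include <arm_neon.h>
/* Sketch only: the deinterleaved layout produced by vld2_s16 above. */
int16x4x2_t a_val = vld2_s16((const int16_t*)a_ptr);
int16x4_t re = a_val.val[0]; /* a0r|a1r|a2r|a3r */
int16x4_t im = a_val.val[1]; /* a0i|a1i|a2i|a3i */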
@@ -334,7 +333,7 @@ static inline void volk_gnsssdr_16ic_x2_multiply_16ic_neon(lv_16sc_t* out, const
out += 4;
}
for(number = quarter_points * 4; number < num_points; number++)
for (number = quarter_points * 4; number < num_points; number++)
{
*out++ = (*a_ptr++) * (*b_ptr++);
}
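For reference, each SIMD branch of this multiply kernel assembles the same per-sample complex product that the scalar tail loop above spells out. A scalar sketch of what the mask_real/mask_imag halves select, using VOLK's lv_creal/lv_cimag/lv_cmake helpers (the saturating behaviour of _mm_subs_epi16/_mm_adds_epi16 is omitted here):
/* Sketch only: one lane of the 16-bit complex product computed above. */
static inline lv_16sc_t multiply_16ic_lane_sketch(lv_16sc_t a, lv_16sc_t b)
{
    int16_t re = lv_creal(a) * lv_creal(b) - lv_cimag(a) * lv_cimag(b); /* the mask_real half */
    int16_t im = lv_creal(a) * lv_cimag(b) + lv_cimag(a) * lv_creal(b); /* the mask_imag half */
    return lv_cmake(re, im);
}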

View File

@@ -41,7 +41,7 @@
#include <string.h>
#ifdef LV_HAVE_GENERIC
static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_generic(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points)
static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_generic(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points)
{
// phases must be normalized. Phase rotator expects a complex exponential input!
float rem_carrier_phase_in_rad = 0.345;
@@ -53,14 +53,14 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_generic(lv_
unsigned int n;
int num_a_vectors = 3;
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
for(n = 0; n < num_a_vectors; n++)
for (n = 0; n < num_a_vectors; n++)
{
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
}
volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_generic(result, local_code, phase_inc[0], phase,(const lv_16sc_t**) in_a, num_a_vectors, num_points);
volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_generic(result, local_code, phase_inc[0], phase, (const lv_16sc_t**)in_a, num_a_vectors, num_points);
for(n = 0; n < num_a_vectors; n++)
for (n = 0; n < num_a_vectors; n++)
{
volk_gnsssdr_free(in_a[n]);
}
@@ -71,7 +71,7 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_generic(lv_
#ifdef LV_HAVE_GENERIC
static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_generic_reload(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points)
static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_generic_reload(lv_16sc_t* result, const lv_16sc_t* local_code, const lv_16sc_t* in, unsigned int num_points)
{
// phases must be normalized. Phase rotator expects a complex exponential input!
float rem_carrier_phase_in_rad = 0.345;
@@ -83,14 +83,14 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_generic_rel
unsigned int n;
int num_a_vectors = 3;
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
for(n = 0; n < num_a_vectors; n++)
for (n = 0; n < num_a_vectors; n++)
{
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
}
volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_generic_reload(result, local_code, phase_inc[0], phase,(const lv_16sc_t**) in_a, num_a_vectors, num_points);
volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_generic_reload(result, local_code, phase_inc[0], phase, (const lv_16sc_t**)in_a, num_a_vectors, num_points);
for(n = 0; n < num_a_vectors; n++)
for (n = 0; n < num_a_vectors; n++)
{
volk_gnsssdr_free(in_a[n]);
}
@@ -113,22 +113,22 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_a_sse3(lv_1
unsigned int n;
int num_a_vectors = 3;
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
for(n = 0; n < num_a_vectors; n++)
for (n = 0; n < num_a_vectors; n++)
{
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
}
volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3(result, local_code, phase_inc[0], phase, (const lv_16sc_t**) in_a, num_a_vectors, num_points);
volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3(result, local_code, phase_inc[0], phase, (const lv_16sc_t**)in_a, num_a_vectors, num_points);
for(n = 0; n < num_a_vectors; n++)
for (n = 0; n < num_a_vectors; n++)
{
volk_gnsssdr_free(in_a[n]);
}
volk_gnsssdr_free(in_a);
}
#endif // SSE3
#endif // SSE3
#ifdef LV_HAVE_SSE3
@@ -144,22 +144,22 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_a_sse3_relo
unsigned int n;
int num_a_vectors = 3;
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
for(n = 0; n < num_a_vectors; n++)
for (n = 0; n < num_a_vectors; n++)
{
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
}
volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(result, local_code, phase_inc[0], phase, (const lv_16sc_t**) in_a, num_a_vectors, num_points);
volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(result, local_code, phase_inc[0], phase, (const lv_16sc_t**)in_a, num_a_vectors, num_points);
for(n = 0; n < num_a_vectors; n++)
for (n = 0; n < num_a_vectors; n++)
{
volk_gnsssdr_free(in_a[n]);
}
volk_gnsssdr_free(in_a);
}
#endif // SSE3
#endif // SSE3
#ifdef LV_HAVE_SSE3
@@ -175,22 +175,22 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_u_sse3(lv_1
unsigned int n;
int num_a_vectors = 3;
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
for(n = 0; n < num_a_vectors; n++)
for (n = 0; n < num_a_vectors; n++)
{
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
}
volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_u_sse3(result, local_code, phase_inc[0], phase, (const lv_16sc_t**) in_a, num_a_vectors, num_points);
volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_u_sse3(result, local_code, phase_inc[0], phase, (const lv_16sc_t**)in_a, num_a_vectors, num_points);
for(n = 0; n < num_a_vectors; n++)
for (n = 0; n < num_a_vectors; n++)
{
volk_gnsssdr_free(in_a[n]);
}
volk_gnsssdr_free(in_a);
}
#endif // SSE3
#endif // SSE3
#ifdef LV_HAVE_AVX2
@@ -206,22 +206,22 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_a_avx2(lv_1
unsigned int n;
int num_a_vectors = 3;
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
for(n = 0; n < num_a_vectors; n++)
for (n = 0; n < num_a_vectors; n++)
{
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
}
volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(result, local_code, phase_inc[0], phase, (const lv_16sc_t**) in_a, num_a_vectors, num_points);
volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(result, local_code, phase_inc[0], phase, (const lv_16sc_t**)in_a, num_a_vectors, num_points);
for(n = 0; n < num_a_vectors; n++)
for (n = 0; n < num_a_vectors; n++)
{
volk_gnsssdr_free(in_a[n]);
}
volk_gnsssdr_free(in_a);
}
#endif // AVX2
#endif // AVX2
#ifdef LV_HAVE_AVX2
@@ -237,22 +237,22 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_a_avx2_relo
unsigned int n;
int num_a_vectors = 3;
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
for(n = 0; n < num_a_vectors; n++)
for (n = 0; n < num_a_vectors; n++)
{
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
}
volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(result, local_code, phase_inc[0], phase, (const lv_16sc_t**) in_a, num_a_vectors, num_points);
volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(result, local_code, phase_inc[0], phase, (const lv_16sc_t**)in_a, num_a_vectors, num_points);
for(n = 0; n < num_a_vectors; n++)
for (n = 0; n < num_a_vectors; n++)
{
volk_gnsssdr_free(in_a[n]);
}
volk_gnsssdr_free(in_a);
}
#endif // AVX2
#endif // AVX2
#ifdef LV_HAVE_AVX2
@@ -268,22 +268,22 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_u_avx2(lv_1
unsigned int n;
int num_a_vectors = 3;
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
for(n = 0; n < num_a_vectors; n++)
for (n = 0; n < num_a_vectors; n++)
{
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
}
volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(result, local_code, phase_inc[0], phase, (const lv_16sc_t**) in_a, num_a_vectors, num_points);
volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(result, local_code, phase_inc[0], phase, (const lv_16sc_t**)in_a, num_a_vectors, num_points);
for(n = 0; n < num_a_vectors; n++)
for (n = 0; n < num_a_vectors; n++)
{
volk_gnsssdr_free(in_a[n]);
}
volk_gnsssdr_free(in_a);
}
#endif // AVX2
#endif // AVX2
#ifdef LV_HAVE_AVX2
@@ -299,22 +299,22 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_u_avx2_relo
unsigned int n;
int num_a_vectors = 3;
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
for(n = 0; n < num_a_vectors; n++)
for (n = 0; n < num_a_vectors; n++)
{
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
}
volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(result, local_code, phase_inc[0], phase, (const lv_16sc_t**) in_a, num_a_vectors, num_points);
volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(result, local_code, phase_inc[0], phase, (const lv_16sc_t**)in_a, num_a_vectors, num_points);
for(n = 0; n < num_a_vectors; n++)
for (n = 0; n < num_a_vectors; n++)
{
volk_gnsssdr_free(in_a[n]);
}
volk_gnsssdr_free(in_a);
}
#endif // AVX2
#endif // AVX2
#ifdef LV_HAVE_NEON
@@ -330,22 +330,22 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_neon(lv_16s
unsigned int n;
int num_a_vectors = 3;
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
for(n = 0; n < num_a_vectors; n++)
for (n = 0; n < num_a_vectors; n++)
{
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
}
volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon(result, local_code, phase_inc[0], phase, (const lv_16sc_t**) in_a, num_a_vectors, num_points);
volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon(result, local_code, phase_inc[0], phase, (const lv_16sc_t**)in_a, num_a_vectors, num_points);
for(n = 0; n < num_a_vectors; n++)
for (n = 0; n < num_a_vectors; n++)
{
volk_gnsssdr_free(in_a[n]);
}
volk_gnsssdr_free(in_a);
}
#endif // NEON
#endif // NEON
#ifdef LV_HAVE_NEON
@@ -361,23 +361,21 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_neon_vma(lv
unsigned int n;
int num_a_vectors = 3;
lv_16sc_t** in_a = (lv_16sc_t**)volk_gnsssdr_malloc(sizeof(lv_16sc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
for(n = 0; n < num_a_vectors; n++)
for (n = 0; n < num_a_vectors; n++)
{
in_a[n] = (lv_16sc_t*)volk_gnsssdr_malloc(sizeof(lv_16sc_t) * num_points, volk_gnsssdr_get_alignment());
memcpy((lv_16sc_t*)in_a[n], (lv_16sc_t*)in, sizeof(lv_16sc_t) * num_points);
}
volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_vma(result, local_code, phase_inc[0], phase, (const lv_16sc_t**) in_a, num_a_vectors, num_points);
volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_vma(result, local_code, phase_inc[0], phase, (const lv_16sc_t**)in_a, num_a_vectors, num_points);
for(n = 0; n < num_a_vectors; n++)
for (n = 0; n < num_a_vectors; n++)
{
volk_gnsssdr_free(in_a[n]);
}
volk_gnsssdr_free(in_a);
}
#endif // NEON
#endif // NEON
#endif // INCLUDED_volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_H
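Aside: the comment at the top of this file ("Phase rotator expects a complex exponential input!") is the one extra step these rotator puppets add over the plain dot-product ones. A hedged sketch of that setup, assuming the lv_32fc_t type used by the rotator kernels; only rem_carrier_phase_in_rad = 0.345 is visible in the hunks above, so phase_step_rad here is a hypothetical value:
#include <math.h>
/* Sketch only: turning angles into the unit complex exponentials the
   rotator kernels consume. phase_step_rad is hypothetical. */
static inline void rotator_phase_setup_sketch(void)
{
    float rem_carrier_phase_in_rad = 0.345f;
    float phase_step_rad = 0.1f; /* hypothetical step */
    lv_32fc_t phase[1], phase_inc[1];
    phase[0] = lv_cmake(cosf(rem_carrier_phase_in_rad), sinf(rem_carrier_phase_in_rad));
    phase_inc[0] = lv_cmake(cosf(phase_step_rad), sinf(phase_step_rad));
    /* then e.g.:
       volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_generic(result, local_code,
           phase_inc[0], phase, (const lv_16sc_t**)in_a, num_a_vectors, num_points); */
}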

View File

@@ -106,7 +106,8 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_sse4_1(lv_16sc_t** r
const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
__VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4];
__VOLK_ATTR_ALIGNED(16)
int local_code_chip_index[4];
int local_code_chip_index_;
const __m128i zeros = _mm_setzero_si128();
@@ -120,7 +121,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_sse4_1(lv_16sc_t** r
shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]);
aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
__m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
for(n = 0; n < quarterPoints; n++)
for (n = 0; n < quarterPoints; n++)
{
aux = _mm_mul_ps(code_phase_step_chips_reg, indexn);
aux = _mm_add_ps(aux, aux2);
@@ -138,13 +139,13 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_sse4_1(lv_16sc_t** r
aux_i = _mm_and_si128(code_length_chips_reg_i, negatives);
local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i);
_mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg);
for(k = 0; k < 4; ++k)
for (k = 0; k < 4; ++k)
{
_result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
}
indexn = _mm_add_ps(indexn, fours);
}
for(n = quarterPoints * 4; n < num_points; n++)
for (n = quarterPoints * 4; n < num_points; n++)
{
// resample code for current tap
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
@@ -156,7 +157,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_sse4_1(lv_16sc_t** r
}
}
#endif
#endif
#ifdef LV_HAVE_SSE4_1
@@ -172,7 +173,8 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_sse4_1(lv_16sc_t** r
const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
__VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4];
__VOLK_ATTR_ALIGNED(16)
int local_code_chip_index[4];
int local_code_chip_index_;
const __m128i zeros = _mm_setzero_si128();
@@ -186,7 +188,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_sse4_1(lv_16sc_t** r
shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]);
aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
__m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
for(n = 0; n < quarterPoints; n++)
for (n = 0; n < quarterPoints; n++)
{
aux = _mm_mul_ps(code_phase_step_chips_reg, indexn);
aux = _mm_add_ps(aux, aux2);
@@ -204,13 +206,13 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_sse4_1(lv_16sc_t** r
aux_i = _mm_and_si128(code_length_chips_reg_i, negatives);
local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i);
_mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg);
for(k = 0; k < 4; ++k)
for (k = 0; k < 4; ++k)
{
_result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
}
indexn = _mm_add_ps(indexn, fours);
}
for(n = quarterPoints * 4; n < num_points; n++)
for (n = quarterPoints * 4; n < num_points; n++)
{
// resample code for current tap
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
@@ -239,7 +241,8 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_sse3(lv_16sc_t** res
const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
__VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4];
__VOLK_ATTR_ALIGNED(16)
int local_code_chip_index[4];
int local_code_chip_index_;
const __m128i zeros = _mm_setzero_si128();
@@ -253,7 +256,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_sse3(lv_16sc_t** res
shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]);
aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
__m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
for(n = 0; n < quarterPoints; n++)
for (n = 0; n < quarterPoints; n++)
{
aux = _mm_mul_ps(code_phase_step_chips_reg, indexn);
aux = _mm_add_ps(aux, aux2);
@@ -274,13 +277,13 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_sse3(lv_16sc_t** res
aux_i = _mm_and_si128(code_length_chips_reg_i, negatives);
local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i);
_mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg);
for(k = 0; k < 4; ++k)
for (k = 0; k < 4; ++k)
{
_result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
}
indexn = _mm_add_ps(indexn, fours);
}
for(n = quarterPoints * 4; n < num_points; n++)
for (n = quarterPoints * 4; n < num_points; n++)
{
// resample code for current tap
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
@@ -309,7 +312,8 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_sse3(lv_16sc_t** res
const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
__VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4];
__VOLK_ATTR_ALIGNED(16)
int local_code_chip_index[4];
int local_code_chip_index_;
const __m128i zeros = _mm_setzero_si128();
@@ -323,7 +327,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_sse3(lv_16sc_t** res
shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]);
aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
__m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
for(n = 0; n < quarterPoints; n++)
for (n = 0; n < quarterPoints; n++)
{
aux = _mm_mul_ps(code_phase_step_chips_reg, indexn);
aux = _mm_add_ps(aux, aux2);
@@ -344,13 +348,13 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_sse3(lv_16sc_t** res
aux_i = _mm_and_si128(code_length_chips_reg_i, negatives);
local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i);
_mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg);
for(k = 0; k < 4; ++k)
for (k = 0; k < 4; ++k)
{
_result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
}
indexn = _mm_add_ps(indexn, fours);
}
for(n = quarterPoints * 4; n < num_points; n++)
for (n = quarterPoints * 4; n < num_points; n++)
{
// resample code for current tap
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
@@ -378,7 +382,8 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_avx(lv_16sc_t** resu
const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips);
const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips);
__VOLK_ATTR_ALIGNED(32) int local_code_chip_index[8];
__VOLK_ATTR_ALIGNED(32)
int local_code_chip_index[8];
int local_code_chip_index_;
const __m256 zeros = _mm256_setzero_ps();
@@ -393,7 +398,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_avx(lv_16sc_t** resu
shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]);
aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
indexn = n0;
for(n = 0; n < avx_iters; n++)
for (n = 0; n < avx_iters; n++)
{
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0);
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3);
@@ -411,13 +416,13 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_avx(lv_16sc_t** resu
// no negatives
c = _mm256_cvtepi32_ps(local_code_chip_index_reg);
negatives = _mm256_cmp_ps(c, zeros, 0x01 );
negatives = _mm256_cmp_ps(c, zeros, 0x01);
aux3 = _mm256_and_ps(code_length_chips_reg_f, negatives);
aux = _mm256_add_ps(c, aux3);
local_code_chip_index_reg = _mm256_cvttps_epi32(aux);
_mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg);
for(k = 0; k < 8; ++k)
for (k = 0; k < 8; ++k)
{
_result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]];
}
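A readability note on the comparison above: the raw immediate 0x01 handed to _mm256_cmp_ps is the predicate _CMP_LT_OS (ordered, signalling less-than), so the intent can be spelled out as:
negatives = _mm256_cmp_ps(c, zeros, _CMP_LT_OS); /* flag lanes with a negative chip index */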
@@ -427,7 +432,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_avx(lv_16sc_t** resu
_mm256_zeroupper();
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
{
for(n = avx_iters * 8; n < num_points; n++)
for (n = avx_iters * 8; n < num_points; n++)
{
// resample code for current tap
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
@@ -455,7 +460,8 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_avx(lv_16sc_t** resu
const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips);
const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips);
__VOLK_ATTR_ALIGNED(32) int local_code_chip_index[8];
__VOLK_ATTR_ALIGNED(32)
int local_code_chip_index[8];
int local_code_chip_index_;
const __m256 zeros = _mm256_setzero_ps();
@@ -470,7 +476,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_avx(lv_16sc_t** resu
shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]);
aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
indexn = n0;
for(n = 0; n < avx_iters; n++)
for (n = 0; n < avx_iters; n++)
{
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0);
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3);
@@ -488,13 +494,13 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_avx(lv_16sc_t** resu
// no negatives
c = _mm256_cvtepi32_ps(local_code_chip_index_reg);
negatives = _mm256_cmp_ps(c, zeros, 0x01 );
negatives = _mm256_cmp_ps(c, zeros, 0x01);
aux3 = _mm256_and_ps(code_length_chips_reg_f, negatives);
aux = _mm256_add_ps(c, aux3);
local_code_chip_index_reg = _mm256_cvttps_epi32(aux);
_mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg);
for(k = 0; k < 8; ++k)
for (k = 0; k < 8; ++k)
{
_result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]];
}
@@ -504,7 +510,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_avx(lv_16sc_t** resu
_mm256_zeroupper();
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
{
for(n = avx_iters * 8; n < num_points; n++)
for (n = avx_iters * 8; n < num_points; n++)
{
// resample code for current tap
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
@@ -530,7 +536,8 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_neon(lv_16sc_t** resul
const float32x4_t rem_code_phase_chips_reg = vdupq_n_f32(rem_code_phase_chips);
const float32x4_t code_phase_step_chips_reg = vdupq_n_f32(code_phase_step_chips);
__VOLK_ATTR_ALIGNED(16) int32_t local_code_chip_index[4];
__VOLK_ATTR_ALIGNED(16)
int32_t local_code_chip_index[4];
int32_t local_code_chip_index_;
const int32x4_t zeros = vdupq_n_s32(0);
@@ -538,11 +545,12 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_neon(lv_16sc_t** resul
const int32x4_t code_length_chips_reg_i = vdupq_n_s32((int32_t)code_length_chips);
int32x4_t local_code_chip_index_reg, aux_i, negatives, i;
float32x4_t aux, aux2, shifts_chips_reg, fi, c, j, cTrunc, base, indexn, reciprocal;
__VOLK_ATTR_ALIGNED(16) const float vec[4] = { 0.0f, 1.0f, 2.0f, 3.0f };
__VOLK_ATTR_ALIGNED(16)
const float vec[4] = {0.0f, 1.0f, 2.0f, 3.0f};
uint32x4_t igx;
reciprocal = vrecpeq_f32(code_length_chips_reg_f);
reciprocal = vmulq_f32(vrecpsq_f32(code_length_chips_reg_f, reciprocal), reciprocal);
reciprocal = vmulq_f32(vrecpsq_f32(code_length_chips_reg_f, reciprocal), reciprocal); // this refinement is required!
reciprocal = vmulq_f32(vrecpsq_f32(code_length_chips_reg_f, reciprocal), reciprocal); // this refinement is required!
float32x4_t n0 = vld1q_f32((float*)vec);
int current_correlator_tap;
unsigned int n;
@@ -552,7 +560,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_neon(lv_16sc_t** resul
shifts_chips_reg = vdupq_n_f32((float)shifts_chips[current_correlator_tap]);
aux2 = vsubq_f32(shifts_chips_reg, rem_code_phase_chips_reg);
indexn = n0;
for(n = 0; n < neon_iters; n++)
for (n = 0; n < neon_iters; n++)
{
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][4 * n + 3], 1, 0);
__VOLK_GNSSSDR_PREFETCH(&local_code_chip_index[4]);
@@ -568,7 +576,7 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_neon(lv_16sc_t** resul
// fmod
c = vmulq_f32(aux, reciprocal);
i = vcvtq_s32_f32(c);
i = vcvtq_s32_f32(c);
cTrunc = vcvtq_f32_s32(i);
base = vmulq_f32(cTrunc, code_length_chips_reg_f);
aux = vsubq_f32(aux, base);
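The "// fmod" block above is a division-free modulo: multiply by a Newton-refined reciprocal estimate, truncate to get the quotient, then subtract quotient times length. A scalar sketch of the same idea, assuming the reciprocal was refined as at the top of this function:
#include <math.h>
/* Sketch only: fmod(x, len) without a divide, mirroring the NEON sequence above. */
static inline float fmod_by_reciprocal_sketch(float x, float len, float reciprocal /* ~1.0f/len */)
{
    float quotient = truncf(x * reciprocal); /* vcvtq_s32_f32 + vcvtq_f32_s32 */
    return x - quotient * len;               /* vmulq_f32 + vsubq_f32 */
}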
@@ -580,13 +588,13 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_neon(lv_16sc_t** resul
vst1q_s32((int32_t*)local_code_chip_index, local_code_chip_index_reg);
for(k = 0; k < 4; ++k)
for (k = 0; k < 4; ++k)
{
_result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
}
indexn = vaddq_f32(indexn, fours);
}
for(n = neon_iters * 4; n < num_points; n++)
for (n = neon_iters * 4; n < num_points; n++)
{
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][n], 1, 0);
// resample code for current tap
@@ -604,4 +612,3 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_neon(lv_16sc_t** resul
#endif /*INCLUDED_volk_gnsssdr_16ic_xn_resampler_16ic_xn_H*/
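Aside: every branch of this resampler computes the same per-tap, per-sample chip index — floor(step * n + shift - remainder), wrapped into [0, code_length_chips). A scalar sketch matching the tail loops above; the negative fix-up mirrors the SIMD "negatives" mask-and-add:
#include <math.h>
/* Sketch only: the chip-index arithmetic behind each SIMD branch above. */
static inline int resampler_index_sketch(unsigned int n, float code_phase_step_chips,
    float shift_chips, float rem_code_phase_chips, int code_length_chips)
{
    int idx = (int)floor(code_phase_step_chips * (float)n + shift_chips - rem_code_phase_chips);
    idx = idx % code_length_chips;         /* the fmod block in the SIMD versions */
    if (idx < 0) idx += code_length_chips; /* the 'negatives' mask-and-add */
    return idx;
}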

View File

@@ -95,69 +95,74 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_generic(lv_16sc_t
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>
static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_a_sse2(lv_16sc_t** result, const lv_16sc_t* local_code, float* rem_code_phase_chips ,float code_phase_step_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_output_samples)
static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_a_sse2(lv_16sc_t** result, const lv_16sc_t* local_code, float* rem_code_phase_chips, float code_phase_step_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_output_samples)
{
_MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);//_MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO
_MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); //_MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO
unsigned int number;
const unsigned int quarterPoints = num_output_samples / 4;
lv_16sc_t** _result = result;
__VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4];
__VOLK_ATTR_ALIGNED(16)
int local_code_chip_index[4];
float tmp_rem_code_phase_chips;
__m128 _rem_code_phase,_code_phase_step_chips;
__m128i _code_length_chips,_code_length_chips_minus1;
__m128 _code_phase_out,_code_phase_out_with_offset;
__m128 _rem_code_phase, _code_phase_step_chips;
__m128i _code_length_chips, _code_length_chips_minus1;
__m128 _code_phase_out, _code_phase_out_with_offset;
_code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips); //load float to all four float values in m128 register
__VOLK_ATTR_ALIGNED(16) int four_times_code_length_chips_minus1[4];
_code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips); //load float to all four float values in m128 register
__VOLK_ATTR_ALIGNED(16)
int four_times_code_length_chips_minus1[4];
four_times_code_length_chips_minus1[0] = code_length_chips - 1;
four_times_code_length_chips_minus1[1] = code_length_chips - 1;
four_times_code_length_chips_minus1[2] = code_length_chips - 1;
four_times_code_length_chips_minus1[3] = code_length_chips - 1;
__VOLK_ATTR_ALIGNED(16) int four_times_code_length_chips[4];
__VOLK_ATTR_ALIGNED(16)
int four_times_code_length_chips[4];
four_times_code_length_chips[0] = code_length_chips;
four_times_code_length_chips[1] = code_length_chips;
four_times_code_length_chips[2] = code_length_chips;
four_times_code_length_chips[3] = code_length_chips;
_code_length_chips = _mm_load_si128((__m128i*)&four_times_code_length_chips); //load float to all four float values in m128 register
_code_length_chips_minus1 = _mm_load_si128((__m128i*)&four_times_code_length_chips_minus1); //load float to all four float values in m128 register
_code_length_chips = _mm_load_si128((__m128i*)&four_times_code_length_chips); //load float to all four float values in m128 register
_code_length_chips_minus1 = _mm_load_si128((__m128i*)&four_times_code_length_chips_minus1); //load float to all four float values in m128 register
__m128i negative_indexes, overflow_indexes,_code_phase_out_int, _code_phase_out_int_neg,_code_phase_out_int_over;
__m128i negative_indexes, overflow_indexes, _code_phase_out_int, _code_phase_out_int_neg, _code_phase_out_int_over;
__m128i zero = _mm_setzero_si128();
__VOLK_ATTR_ALIGNED(16) float init_idx_float[4] = { 0.0f, 1.0f, 2.0f, 3.0f };
__VOLK_ATTR_ALIGNED(16)
float init_idx_float[4] = {0.0f, 1.0f, 2.0f, 3.0f};
__m128 _4output_index = _mm_load_ps(init_idx_float);
__VOLK_ATTR_ALIGNED(16) float init_4constant_float[4] = { 4.0f, 4.0f, 4.0f, 4.0f };
__VOLK_ATTR_ALIGNED(16)
float init_4constant_float[4] = {4.0f, 4.0f, 4.0f, 4.0f};
__m128 _4constant_float = _mm_load_ps(init_4constant_float);
int current_vector = 0;
int sample_idx = 0;
for(number = 0; number < quarterPoints; number++)
for (number = 0; number < quarterPoints; number++)
{
//common to all outputs
_code_phase_out = _mm_mul_ps(_code_phase_step_chips, _4output_index); //compute the code phase point with the phase step
_code_phase_out = _mm_mul_ps(_code_phase_step_chips, _4output_index); //compute the code phase point with the phase step
//output vector dependent (different code phase offset)
for(current_vector = 0; current_vector < num_out_vectors; current_vector++)
for (current_vector = 0; current_vector < num_out_vectors; current_vector++)
{
tmp_rem_code_phase_chips = rem_code_phase_chips[current_vector] - 0.5f; // adjust offset to perform correct rounding (chip transition at 0)
_rem_code_phase = _mm_load1_ps(&tmp_rem_code_phase_chips); //load float to all four float values in m128 register
tmp_rem_code_phase_chips = rem_code_phase_chips[current_vector] - 0.5f; // adjust offset to perform correct rounding (chip transition at 0)
_rem_code_phase = _mm_load1_ps(&tmp_rem_code_phase_chips); //load float to all four float values in m128 register
_code_phase_out_with_offset = _mm_add_ps(_code_phase_out, _rem_code_phase); //add the phase offset
_code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset); //convert to integer
_code_phase_out_with_offset = _mm_add_ps(_code_phase_out, _rem_code_phase); //add the phase offset
_code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset); //convert to integer
negative_indexes = _mm_cmplt_epi32(_code_phase_out_int, zero); //test for negative values
_code_phase_out_int_neg = _mm_add_epi32(_code_phase_out_int, _code_length_chips); //the negative values branch
_code_phase_out_int_neg = _mm_xor_si128(_code_phase_out_int, _mm_and_si128( negative_indexes, _mm_xor_si128( _code_phase_out_int_neg, _code_phase_out_int )));
negative_indexes = _mm_cmplt_epi32(_code_phase_out_int, zero); //test for negative values
_code_phase_out_int_neg = _mm_add_epi32(_code_phase_out_int, _code_length_chips); //the negative values branch
_code_phase_out_int_neg = _mm_xor_si128(_code_phase_out_int, _mm_and_si128(negative_indexes, _mm_xor_si128(_code_phase_out_int_neg, _code_phase_out_int)));
overflow_indexes = _mm_cmpgt_epi32(_code_phase_out_int_neg, _code_length_chips_minus1); //test for overflow values
_code_phase_out_int_over = _mm_sub_epi32(_code_phase_out_int_neg, _code_length_chips); //the negative values branch
_code_phase_out_int_over = _mm_xor_si128(_code_phase_out_int_neg, _mm_and_si128( overflow_indexes, _mm_xor_si128( _code_phase_out_int_over, _code_phase_out_int_neg )));
overflow_indexes = _mm_cmpgt_epi32(_code_phase_out_int_neg, _code_length_chips_minus1); //test for overflow values
_code_phase_out_int_over = _mm_sub_epi32(_code_phase_out_int_neg, _code_length_chips); //the negative values branch
_code_phase_out_int_over = _mm_xor_si128(_code_phase_out_int_neg, _mm_and_si128(overflow_indexes, _mm_xor_si128(_code_phase_out_int_over, _code_phase_out_int_neg)));
_mm_store_si128((__m128i*)local_code_chip_index, _code_phase_out_int_over); // Store the results back
_mm_store_si128((__m128i*)local_code_chip_index, _code_phase_out_int_over); // Store the results back
//todo: optimize the local code lookup table with intrinsics, if possible
_result[current_vector][sample_idx] = local_code[local_code_chip_index[0]];
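A note on the xor/and/xor chains above: each is a branchless per-lane select, dst = mask ? alternative : src, built from the identity src ^ (mask & (alternative ^ src)) with mask all-ones or all-zeros. A one-line scalar sketch:
/* Sketch only: the SSE2 wrap-around select used above, one lane at a time. */
static inline int branchless_select_sketch(int src, int alternative, int mask /* 0 or ~0 */)
{
    return src ^ (mask & (alternative ^ src));
}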
@@ -169,9 +174,9 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_a_sse2(lv_16sc_t*
sample_idx += 4;
}
for(number = quarterPoints * 4; number < num_output_samples; number++)
for (number = quarterPoints * 4; number < num_output_samples; number++)
{
for(current_vector = 0; current_vector < num_out_vectors; current_vector++)
for (current_vector = 0; current_vector < num_out_vectors; current_vector++)
{
local_code_chip_index[0] = (int)(code_phase_step_chips * (float)(number) + rem_code_phase_chips[current_vector]);
if (local_code_chip_index[0] < 0.0) local_code_chip_index[0] += code_length_chips - 1;
@@ -186,69 +191,74 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_a_sse2(lv_16sc_t*
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>
static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_u_sse2(lv_16sc_t** result, const lv_16sc_t* local_code, float* rem_code_phase_chips ,float code_phase_step_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_output_samples)
static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_u_sse2(lv_16sc_t** result, const lv_16sc_t* local_code, float* rem_code_phase_chips, float code_phase_step_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_output_samples)
{
_MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);//_MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO
_MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); //_MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO
unsigned int number;
const unsigned int quarterPoints = num_output_samples / 4;
lv_16sc_t** _result = result;
__VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4];
__VOLK_ATTR_ALIGNED(16)
int local_code_chip_index[4];
float tmp_rem_code_phase_chips;
__m128 _rem_code_phase,_code_phase_step_chips;
__m128i _code_length_chips,_code_length_chips_minus1;
__m128 _code_phase_out,_code_phase_out_with_offset;
__m128 _rem_code_phase, _code_phase_step_chips;
__m128i _code_length_chips, _code_length_chips_minus1;
__m128 _code_phase_out, _code_phase_out_with_offset;
_code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips); //load float to all four float values in m128 register
__VOLK_ATTR_ALIGNED(16) int four_times_code_length_chips_minus1[4];
_code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips); //load float to all four float values in m128 register
__VOLK_ATTR_ALIGNED(16)
int four_times_code_length_chips_minus1[4];
four_times_code_length_chips_minus1[0] = code_length_chips - 1;
four_times_code_length_chips_minus1[1] = code_length_chips - 1;
four_times_code_length_chips_minus1[2] = code_length_chips - 1;
four_times_code_length_chips_minus1[3] = code_length_chips - 1;
__VOLK_ATTR_ALIGNED(16) int four_times_code_length_chips[4];
__VOLK_ATTR_ALIGNED(16)
int four_times_code_length_chips[4];
four_times_code_length_chips[0] = code_length_chips;
four_times_code_length_chips[1] = code_length_chips;
four_times_code_length_chips[2] = code_length_chips;
four_times_code_length_chips[3] = code_length_chips;
_code_length_chips = _mm_loadu_si128((__m128i*)&four_times_code_length_chips); //load float to all four float values in m128 register
_code_length_chips_minus1 = _mm_loadu_si128((__m128i*)&four_times_code_length_chips_minus1); //load float to all four float values in m128 register
_code_length_chips = _mm_loadu_si128((__m128i*)&four_times_code_length_chips); //load float to all four float values in m128 register
_code_length_chips_minus1 = _mm_loadu_si128((__m128i*)&four_times_code_length_chips_minus1); //load float to all four float values in m128 register
__m128i negative_indexes, overflow_indexes,_code_phase_out_int, _code_phase_out_int_neg,_code_phase_out_int_over;
__m128i negative_indexes, overflow_indexes, _code_phase_out_int, _code_phase_out_int_neg, _code_phase_out_int_over;
__m128i zero = _mm_setzero_si128();
__VOLK_ATTR_ALIGNED(16) float init_idx_float[4] = { 0.0f, 1.0f, 2.0f, 3.0f };
__VOLK_ATTR_ALIGNED(16)
float init_idx_float[4] = {0.0f, 1.0f, 2.0f, 3.0f};
__m128 _4output_index = _mm_loadu_ps(init_idx_float);
__VOLK_ATTR_ALIGNED(16) float init_4constant_float[4] = { 4.0f, 4.0f, 4.0f, 4.0f };
__VOLK_ATTR_ALIGNED(16)
float init_4constant_float[4] = {4.0f, 4.0f, 4.0f, 4.0f};
__m128 _4constant_float = _mm_loadu_ps(init_4constant_float);
int current_vector = 0;
int sample_idx = 0;
for(number = 0; number < quarterPoints; number++)
for (number = 0; number < quarterPoints; number++)
{
//common to all outputs
_code_phase_out = _mm_mul_ps(_code_phase_step_chips, _4output_index); //compute the code phase point with the phase step
_code_phase_out = _mm_mul_ps(_code_phase_step_chips, _4output_index); //compute the code phase point with the phase step
//output vector dependant (different code phase offset)
for(current_vector = 0; current_vector < num_out_vectors; current_vector++)
for (current_vector = 0; current_vector < num_out_vectors; current_vector++)
{
tmp_rem_code_phase_chips = rem_code_phase_chips[current_vector] - 0.5f; // adjust offset to perform correct rounding (chip transition at 0)
_rem_code_phase = _mm_load1_ps(&tmp_rem_code_phase_chips); //load float to all four float values in m128 register
tmp_rem_code_phase_chips = rem_code_phase_chips[current_vector] - 0.5f; // adjust offset to perform correct rounding (chip transition at 0)
_rem_code_phase = _mm_load1_ps(&tmp_rem_code_phase_chips); //load float to all four float values in m128 register
_code_phase_out_with_offset = _mm_add_ps(_code_phase_out, _rem_code_phase); //add the phase offset
_code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset); //convert to integer
_code_phase_out_with_offset = _mm_add_ps(_code_phase_out, _rem_code_phase); //add the phase offset
_code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset); //convert to integer
negative_indexes = _mm_cmplt_epi32(_code_phase_out_int, zero); //test for negative values
_code_phase_out_int_neg = _mm_add_epi32(_code_phase_out_int, _code_length_chips); //the negative values branch
_code_phase_out_int_neg = _mm_xor_si128(_code_phase_out_int, _mm_and_si128( negative_indexes, _mm_xor_si128( _code_phase_out_int_neg, _code_phase_out_int )));
negative_indexes = _mm_cmplt_epi32(_code_phase_out_int, zero); //test for negative values
_code_phase_out_int_neg = _mm_add_epi32(_code_phase_out_int, _code_length_chips); //the negative values branch
_code_phase_out_int_neg = _mm_xor_si128(_code_phase_out_int, _mm_and_si128(negative_indexes, _mm_xor_si128(_code_phase_out_int_neg, _code_phase_out_int)));
overflow_indexes = _mm_cmpgt_epi32(_code_phase_out_int_neg, _code_length_chips_minus1); //test for overflow values
_code_phase_out_int_over = _mm_sub_epi32(_code_phase_out_int_neg, _code_length_chips); //the negative values branch
_code_phase_out_int_over = _mm_xor_si128(_code_phase_out_int_neg, _mm_and_si128( overflow_indexes, _mm_xor_si128( _code_phase_out_int_over, _code_phase_out_int_neg )));
overflow_indexes = _mm_cmpgt_epi32(_code_phase_out_int_neg, _code_length_chips_minus1); //test for overflow values
_code_phase_out_int_over = _mm_sub_epi32(_code_phase_out_int_neg, _code_length_chips); //the negative values branch
_code_phase_out_int_over = _mm_xor_si128(_code_phase_out_int_neg, _mm_and_si128(overflow_indexes, _mm_xor_si128(_code_phase_out_int_over, _code_phase_out_int_neg)));
_mm_storeu_si128((__m128i*)local_code_chip_index, _code_phase_out_int_over); // Store the results back
_mm_storeu_si128((__m128i*)local_code_chip_index, _code_phase_out_int_over); // Store the results back
//todo: optimize the local code lookup table with intrinsics, if possible
_result[current_vector][sample_idx] = local_code[local_code_chip_index[0]];
@@ -260,9 +270,9 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_u_sse2(lv_16sc_t*
sample_idx += 4;
}
for(number = quarterPoints * 4; number < num_output_samples; number++)
for (number = quarterPoints * 4; number < num_output_samples; number++)
{
for(current_vector = 0; current_vector < num_out_vectors; current_vector++)
for (current_vector = 0; current_vector < num_out_vectors; current_vector++)
{
local_code_chip_index[0] = (int)(code_phase_step_chips * (float)(number) + rem_code_phase_chips[current_vector]);
if (local_code_chip_index[0] < 0.0) local_code_chip_index[0] += code_length_chips - 1;
@@ -278,74 +288,79 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_u_sse2(lv_16sc_t*
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_neon(lv_16sc_t** result, const lv_16sc_t* local_code, float* rem_code_phase_chips ,float code_phase_step_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_output_samples)
static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_neon(lv_16sc_t** result, const lv_16sc_t* local_code, float* rem_code_phase_chips, float code_phase_step_chips, unsigned int code_length_chips, int num_out_vectors, unsigned int num_output_samples)
{
unsigned int number;
const unsigned int quarterPoints = num_output_samples / 4;
float32x4_t half = vdupq_n_f32(0.5f);
lv_16sc_t** _result = result;
__VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4];
__VOLK_ATTR_ALIGNED(16)
int local_code_chip_index[4];
float tmp_rem_code_phase_chips;
float32x4_t _rem_code_phase, _code_phase_step_chips;
int32x4_t _code_length_chips, _code_length_chips_minus1;
float32x4_t _code_phase_out, _code_phase_out_with_offset;
float32x4_t sign, PlusHalf, Round;
_code_phase_step_chips = vld1q_dup_f32(&code_phase_step_chips); //load float to all four float values in float32x4_t register
__VOLK_ATTR_ALIGNED(16) int four_times_code_length_chips_minus1[4];
_code_phase_step_chips = vld1q_dup_f32(&code_phase_step_chips); //load float to all four float values in float32x4_t register
__VOLK_ATTR_ALIGNED(16)
int four_times_code_length_chips_minus1[4];
four_times_code_length_chips_minus1[0] = code_length_chips - 1;
four_times_code_length_chips_minus1[1] = code_length_chips - 1;
four_times_code_length_chips_minus1[2] = code_length_chips - 1;
four_times_code_length_chips_minus1[3] = code_length_chips - 1;
__VOLK_ATTR_ALIGNED(16) int four_times_code_length_chips[4];
__VOLK_ATTR_ALIGNED(16)
int four_times_code_length_chips[4];
four_times_code_length_chips[0] = code_length_chips;
four_times_code_length_chips[1] = code_length_chips;
four_times_code_length_chips[2] = code_length_chips;
four_times_code_length_chips[3] = code_length_chips;
_code_length_chips = vld1q_s32((int32_t*)&four_times_code_length_chips); //load float to all four float values in float32x4_t register
_code_length_chips_minus1 = vld1q_s32((int32_t*)&four_times_code_length_chips_minus1); //load float to all four float values in float32x4_t register
_code_length_chips = vld1q_s32((int32_t*)&four_times_code_length_chips); //load float to all four float values in float32x4_t register
_code_length_chips_minus1 = vld1q_s32((int32_t*)&four_times_code_length_chips_minus1); //load float to all four float values in float32x4_t register
int32x4_t _code_phase_out_int, _code_phase_out_int_neg, _code_phase_out_int_over;
int32x4_t _code_phase_out_int, _code_phase_out_int_neg, _code_phase_out_int_over;
uint32x4_t negative_indexes, overflow_indexes;
int32x4_t zero = vmovq_n_s32(0);
__VOLK_ATTR_ALIGNED(16) float init_idx_float[4] = { 0.0f, 1.0f, 2.0f, 3.0f };
__VOLK_ATTR_ALIGNED(16)
float init_idx_float[4] = {0.0f, 1.0f, 2.0f, 3.0f};
float32x4_t _4output_index = vld1q_f32(init_idx_float);
__VOLK_ATTR_ALIGNED(16) float init_4constant_float[4] = { 4.0f, 4.0f, 4.0f, 4.0f };
__VOLK_ATTR_ALIGNED(16)
float init_4constant_float[4] = {4.0f, 4.0f, 4.0f, 4.0f};
float32x4_t _4constant_float = vld1q_f32(init_4constant_float);
int current_vector = 0;
int sample_idx = 0;
for(number = 0; number < quarterPoints; number++)
for (number = 0; number < quarterPoints; number++)
{
//common to all outputs
_code_phase_out = vmulq_f32(_code_phase_step_chips, _4output_index); //compute the code phase point with the phase step
_code_phase_out = vmulq_f32(_code_phase_step_chips, _4output_index); //compute the code phase point with the phase step
//output vector dependent (different code phase offset)
for(current_vector = 0; current_vector < num_out_vectors; current_vector++)
for (current_vector = 0; current_vector < num_out_vectors; current_vector++)
{
tmp_rem_code_phase_chips = rem_code_phase_chips[current_vector] - 0.5f; // adjust offset to perform correct rounding (chip transition at 0)
_rem_code_phase = vld1q_dup_f32(&tmp_rem_code_phase_chips); //load float to all four float values in float32x4_t register
tmp_rem_code_phase_chips = rem_code_phase_chips[current_vector] - 0.5f; // adjust offset to perform correct rounding (chip transition at 0)
_rem_code_phase = vld1q_dup_f32(&tmp_rem_code_phase_chips); //load float to all four float values in float32x4_t register
_code_phase_out_with_offset = vaddq_f32(_code_phase_out, _rem_code_phase); //add the phase offset
_code_phase_out_with_offset = vaddq_f32(_code_phase_out, _rem_code_phase); //add the phase offset
//_code_phase_out_int = _mm_cvtps_epi32(_code_phase_out_with_offset); //convert to integer
sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(_code_phase_out_with_offset), 31)));
PlusHalf = vaddq_f32(_code_phase_out_with_offset, half);
Round = vsubq_f32(PlusHalf, sign);
_code_phase_out_int = vcvtq_s32_f32(Round);
negative_indexes = vcltq_s32(_code_phase_out_int, zero); //test for negative values
_code_phase_out_int_neg = vaddq_s32(_code_phase_out_int, _code_length_chips); //the negative values branch
_code_phase_out_int_neg = veorq_s32(_code_phase_out_int, vandq_s32( (int32x4_t)negative_indexes, veorq_s32( _code_phase_out_int_neg, _code_phase_out_int )));
negative_indexes = vcltq_s32(_code_phase_out_int, zero); //test for negative values
_code_phase_out_int_neg = vaddq_s32(_code_phase_out_int, _code_length_chips); //the negative values branch
_code_phase_out_int_neg = veorq_s32(_code_phase_out_int, vandq_s32((int32x4_t)negative_indexes, veorq_s32(_code_phase_out_int_neg, _code_phase_out_int)));
overflow_indexes = vcgtq_s32(_code_phase_out_int_neg, _code_length_chips_minus1); //test for overflow values
_code_phase_out_int_over = vsubq_s32(_code_phase_out_int_neg, _code_length_chips); //the overflow values branch
_code_phase_out_int_over = veorq_s32(_code_phase_out_int_neg, vandq_s32( (int32x4_t)overflow_indexes, veorq_s32( _code_phase_out_int_over, _code_phase_out_int_neg )));
overflow_indexes = vcgtq_s32(_code_phase_out_int_neg, _code_length_chips_minus1); //test for overflow values
_code_phase_out_int_over = vsubq_s32(_code_phase_out_int_neg, _code_length_chips); //the overflow values branch
_code_phase_out_int_over = veorq_s32(_code_phase_out_int_neg, vandq_s32((int32x4_t)overflow_indexes, veorq_s32(_code_phase_out_int_over, _code_phase_out_int_neg)));
vst1q_s32((int32_t*)local_code_chip_index, _code_phase_out_int_over); // Store the results back
vst1q_s32((int32_t*)local_code_chip_index, _code_phase_out_int_over); // Store the results back
//todo: optimize the local code lookup table with intrinsics, if possible
_result[current_vector][sample_idx] = local_code[local_code_chip_index[0]];
@ -357,9 +372,9 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_neon(lv_16sc_t**
sample_idx += 4;
}
for(number = quarterPoints * 4; number < num_output_samples; number++)
for (number = quarterPoints * 4; number < num_output_samples; number++)
{
for(current_vector = 0; current_vector < num_out_vectors; current_vector++)
for (current_vector = 0; current_vector < num_out_vectors; current_vector++)
{
local_code_chip_index[0] = (int)(code_phase_step_chips * (float)(number) + rem_code_phase_chips[current_vector]);
if (local_code_chip_index[0] < 0.0) local_code_chip_index[0] += code_length_chips - 1;
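ARMv7 NEON has no convert-with-round-to-nearest, so the kernel above emulates one: the sign bit is extracted with a 31-bit shift, 0.5 is added, the sign is subtracted again, and the truncating conversion then rounds half away from zero. A scalar sketch of that rounding step (illustrative only, not part of this commit):

#include <stdio.h>

/* Emulates the NEON sequence: sign = (x < 0), round = trunc(x + 0.5f - sign). */
static int round_half_away(float x)
{
    float sign = (x < 0.0f) ? 1.0f : 0.0f; /* mirrors vshrq_n_u32(..., 31) */
    return (int)(x + 0.5f - sign);         /* the (int) cast truncates toward zero */
}

int main(void)
{
    printf("%d %d %d %d\n", round_half_away(1.5f), round_half_away(-1.5f),
        round_half_away(2.4f), round_half_away(-2.4f));
    return 0;
}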


@ -29,7 +29,6 @@
*/
/*!
* \page volk_gnsssdr_32f_index_max_32u.h
*
@ -63,7 +62,7 @@
static inline void volk_gnsssdr_32f_index_max_32u_a_avx(uint32_t* target, const float* src0, uint32_t num_points)
{
if(num_points > 0)
if (num_points > 0)
{
uint32_t number = 0;
const uint32_t quarterPoints = num_points / 8;
@ -71,7 +70,7 @@ static inline void volk_gnsssdr_32f_index_max_32u_a_avx(uint32_t* target, const
float* inputPtr = (float*)src0;
__m256 indexIncrementValues = _mm256_set1_ps(8);
__m256 currentIndexes = _mm256_set_ps(-1,-2,-3,-4,-5,-6,-7,-8);
__m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
float max = src0[0];
float index = 0;
@ -80,25 +79,28 @@ static inline void volk_gnsssdr_32f_index_max_32u_a_avx(uint32_t* target, const
__m256 compareResults;
__m256 currentValues;
__VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8];
__VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8];
__VOLK_ATTR_ALIGNED(32)
float maxValuesBuffer[8];
__VOLK_ATTR_ALIGNED(32)
float maxIndexesBuffer[8];
for(;number < quarterPoints; number++)
for (; number < quarterPoints; number++)
{
currentValues = _mm256_load_ps(inputPtr); inputPtr += 8;
currentValues = _mm256_load_ps(inputPtr);
inputPtr += 8;
currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
compareResults = _mm256_cmp_ps(maxValues, currentValues, 0x1e);
maxValuesIndex = _mm256_blendv_ps(currentIndexes, maxValuesIndex, compareResults);
maxValues = _mm256_blendv_ps(currentValues, maxValues, compareResults);
maxValues = _mm256_blendv_ps(currentValues, maxValues, compareResults);
}
// Calculate the largest value from the remaining 8 points
_mm256_store_ps(maxValuesBuffer, maxValues);
_mm256_store_ps(maxIndexesBuffer, maxValuesIndex);
for(number = 0; number < 8; number++)
for (number = 0; number < 8; number++)
{
if(maxValuesBuffer[number] > max)
if (maxValuesBuffer[number] > max)
{
index = maxIndexesBuffer[number];
max = maxValuesBuffer[number];
@ -106,9 +108,9 @@ static inline void volk_gnsssdr_32f_index_max_32u_a_avx(uint32_t* target, const
}
number = quarterPoints * 8;
for(;number < num_points; number++)
for (; number < num_points; number++)
{
if(src0[number] > max)
if (src0[number] > max)
{
index = number;
max = src0[number];
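The kernel above keeps eight running maxima and their indexes in parallel: _mm256_cmp_ps with predicate 0x1e (_CMP_GT_OQ, greater-than ordered non-signaling) marks the lanes where the running maximum still wins, _mm256_blendv_ps keeps either the old or the new value and index per lane, and the eight lane winners are reduced to one result after the loop. The indexes start at -8..-1 so that the first in-loop increment turns them into 0..7. A scalar sketch of the per-lane update, with four lanes for brevity (illustrative only):

#include <stdio.h>

int main(void)
{
    float src[8] = {0.1f, 3.0f, -2.0f, 7.5f, 7.5f, 0.0f, 9.0f, 1.0f};
    float maxValues[4], maxIndexes[4];
    int lane, n;
    for (lane = 0; lane < 4; lane++)
        {
            maxValues[lane] = src[lane]; /* lane-wise running maxima */
            maxIndexes[lane] = (float)lane;
        }
    for (n = 4; n < 8; n += 4)
        {
            for (lane = 0; lane < 4; lane++)
                {
                    /* keep_old plays the role of the cmp mask feeding blendv */
                    int keep_old = (maxValues[lane] > src[n + lane]);
                    maxIndexes[lane] = keep_old ? maxIndexes[lane] : (float)(n + lane);
                    maxValues[lane] = keep_old ? maxValues[lane] : src[n + lane];
                }
        }
    /* horizontal reduction of the lane winners, as after the SIMD loop */
    float max = maxValues[0], index = maxIndexes[0];
    for (lane = 1; lane < 4; lane++)
        {
            if (maxValues[lane] > max)
                {
                    max = maxValues[lane];
                    index = maxIndexes[lane];
                }
        }
    printf("max = %f at index %d\n", max, (int)index);
    return 0;
}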
@ -126,7 +128,7 @@ static inline void volk_gnsssdr_32f_index_max_32u_a_avx(uint32_t* target, const
static inline void volk_gnsssdr_32f_index_max_32u_u_avx(uint32_t* target, const float* src0, uint32_t num_points)
{
if(num_points > 0)
if (num_points > 0)
{
uint32_t number = 0;
const uint32_t quarterPoints = num_points / 8;
@ -134,7 +136,7 @@ static inline void volk_gnsssdr_32f_index_max_32u_u_avx(uint32_t* target, const
float* inputPtr = (float*)src0;
__m256 indexIncrementValues = _mm256_set1_ps(8);
__m256 currentIndexes = _mm256_set_ps(-1,-2,-3,-4,-5,-6,-7,-8);
__m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
float max = src0[0];
float index = 0;
@ -143,25 +145,28 @@ static inline void volk_gnsssdr_32f_index_max_32u_u_avx(uint32_t* target, const
__m256 compareResults;
__m256 currentValues;
__VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8];
__VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8];
__VOLK_ATTR_ALIGNED(32)
float maxValuesBuffer[8];
__VOLK_ATTR_ALIGNED(32)
float maxIndexesBuffer[8];
for(;number < quarterPoints; number++)
for (; number < quarterPoints; number++)
{
currentValues = _mm256_loadu_ps(inputPtr); inputPtr += 8;
currentValues = _mm256_loadu_ps(inputPtr);
inputPtr += 8;
currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
compareResults = _mm256_cmp_ps(maxValues, currentValues, 0x1e);
maxValuesIndex = _mm256_blendv_ps(currentIndexes, maxValuesIndex, compareResults);
maxValues = _mm256_blendv_ps(currentValues, maxValues, compareResults);
maxValues = _mm256_blendv_ps(currentValues, maxValues, compareResults);
}
// Calculate the largest value from the remaining 8 points
_mm256_store_ps(maxValuesBuffer, maxValues);
_mm256_store_ps(maxIndexesBuffer, maxValuesIndex);
for(number = 0; number < 8; number++)
for (number = 0; number < 8; number++)
{
if(maxValuesBuffer[number] > max)
if (maxValuesBuffer[number] > max)
{
index = maxIndexesBuffer[number];
max = maxValuesBuffer[number];
@ -169,9 +174,9 @@ static inline void volk_gnsssdr_32f_index_max_32u_u_avx(uint32_t* target, const
}
number = quarterPoints * 8;
for(;number < num_points; number++)
for (; number < num_points; number++)
{
if(src0[number] > max)
if (src0[number] > max)
{
index = number;
max = src0[number];
@ -185,11 +190,11 @@ static inline void volk_gnsssdr_32f_index_max_32u_u_avx(uint32_t* target, const
#ifdef LV_HAVE_SSE4_1
#include<smmintrin.h>
#include <smmintrin.h>
static inline void volk_gnsssdr_32f_index_max_32u_a_sse4_1(uint32_t* target, const float* src0, uint32_t num_points)
{
if(num_points > 0)
if (num_points > 0)
{
uint32_t number = 0;
const uint32_t quarterPoints = num_points / 4;
@ -197,7 +202,7 @@ static inline void volk_gnsssdr_32f_index_max_32u_a_sse4_1(uint32_t* target, con
float* inputPtr = (float*)src0;
__m128 indexIncrementValues = _mm_set1_ps(4);
__m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
__m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
float max = src0[0];
float index = 0;
@ -206,25 +211,28 @@ static inline void volk_gnsssdr_32f_index_max_32u_a_sse4_1(uint32_t* target, con
__m128 compareResults;
__m128 currentValues;
__VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
__VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
__VOLK_ATTR_ALIGNED(16)
float maxValuesBuffer[4];
__VOLK_ATTR_ALIGNED(16)
float maxIndexesBuffer[4];
for(;number < quarterPoints; number++)
for (; number < quarterPoints; number++)
{
currentValues = _mm_load_ps(inputPtr); inputPtr += 4;
currentValues = _mm_load_ps(inputPtr);
inputPtr += 4;
currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
compareResults = _mm_cmpgt_ps(maxValues, currentValues);
maxValuesIndex = _mm_blendv_ps(currentIndexes, maxValuesIndex, compareResults);
maxValues = _mm_blendv_ps(currentValues, maxValues, compareResults);
maxValues = _mm_blendv_ps(currentValues, maxValues, compareResults);
}
// Calculate the largest value from the remaining 4 points
_mm_store_ps(maxValuesBuffer, maxValues);
_mm_store_ps(maxIndexesBuffer, maxValuesIndex);
for(number = 0; number < 4; number++)
for (number = 0; number < 4; number++)
{
if(maxValuesBuffer[number] > max)
if (maxValuesBuffer[number] > max)
{
index = maxIndexesBuffer[number];
max = maxValuesBuffer[number];
@ -232,9 +240,9 @@ static inline void volk_gnsssdr_32f_index_max_32u_a_sse4_1(uint32_t* target, con
}
number = quarterPoints * 4;
for(;number < num_points; number++)
for (; number < num_points; number++)
{
if(src0[number] > max)
if (src0[number] > max)
{
index = number;
max = src0[number];
@ -248,11 +256,11 @@ static inline void volk_gnsssdr_32f_index_max_32u_a_sse4_1(uint32_t* target, con
#ifdef LV_HAVE_SSE4_1
#include<smmintrin.h>
#include <smmintrin.h>
static inline void volk_gnsssdr_32f_index_max_32u_u_sse4_1(uint32_t* target, const float* src0, uint32_t num_points)
{
if(num_points > 0)
if (num_points > 0)
{
uint32_t number = 0;
const uint32_t quarterPoints = num_points / 4;
@ -260,7 +268,7 @@ static inline void volk_gnsssdr_32f_index_max_32u_u_sse4_1(uint32_t* target, con
float* inputPtr = (float*)src0;
__m128 indexIncrementValues = _mm_set1_ps(4);
__m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
__m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
float max = src0[0];
float index = 0;
@ -269,25 +277,28 @@ static inline void volk_gnsssdr_32f_index_max_32u_u_sse4_1(uint32_t* target, con
__m128 compareResults;
__m128 currentValues;
__VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
__VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
__VOLK_ATTR_ALIGNED(16)
float maxValuesBuffer[4];
__VOLK_ATTR_ALIGNED(16)
float maxIndexesBuffer[4];
for(;number < quarterPoints; number++)
for (; number < quarterPoints; number++)
{
currentValues = _mm_loadu_ps(inputPtr); inputPtr += 4;
currentValues = _mm_loadu_ps(inputPtr);
inputPtr += 4;
currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
compareResults = _mm_cmpgt_ps(maxValues, currentValues);
maxValuesIndex = _mm_blendv_ps(currentIndexes, maxValuesIndex, compareResults);
maxValues = _mm_blendv_ps(currentValues, maxValues, compareResults);
maxValues = _mm_blendv_ps(currentValues, maxValues, compareResults);
}
// Calculate the largest value from the remaining 4 points
_mm_store_ps(maxValuesBuffer, maxValues);
_mm_store_ps(maxIndexesBuffer, maxValuesIndex);
for(number = 0; number < 4; number++)
for (number = 0; number < 4; number++)
{
if(maxValuesBuffer[number] > max)
if (maxValuesBuffer[number] > max)
{
index = maxIndexesBuffer[number];
max = maxValuesBuffer[number];
@ -295,9 +306,9 @@ static inline void volk_gnsssdr_32f_index_max_32u_u_sse4_1(uint32_t* target, con
}
number = quarterPoints * 4;
for(;number < num_points; number++)
for (; number < num_points; number++)
{
if(src0[number] > max)
if (src0[number] > max)
{
index = number;
max = src0[number];
@ -312,11 +323,11 @@ static inline void volk_gnsssdr_32f_index_max_32u_u_sse4_1(uint32_t* target, con
#ifdef LV_HAVE_SSE
#include<xmmintrin.h>
#include <xmmintrin.h>
static inline void volk_gnsssdr_32f_index_max_32u_a_sse(uint32_t* target, const float* src0, uint32_t num_points)
{
if(num_points > 0)
if (num_points > 0)
{
uint32_t number = 0;
const uint32_t quarterPoints = num_points / 4;
@ -324,7 +335,7 @@ static inline void volk_gnsssdr_32f_index_max_32u_a_sse(uint32_t* target, const
float* inputPtr = (float*)src0;
__m128 indexIncrementValues = _mm_set1_ps(4);
__m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
__m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
float max = src0[0];
float index = 0;
@ -333,25 +344,28 @@ static inline void volk_gnsssdr_32f_index_max_32u_a_sse(uint32_t* target, const
__m128 compareResults;
__m128 currentValues;
__VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
__VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
__VOLK_ATTR_ALIGNED(16)
float maxValuesBuffer[4];
__VOLK_ATTR_ALIGNED(16)
float maxIndexesBuffer[4];
for(;number < quarterPoints; number++)
for (; number < quarterPoints; number++)
{
currentValues = _mm_load_ps(inputPtr); inputPtr += 4;
currentValues = _mm_load_ps(inputPtr);
inputPtr += 4;
currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
compareResults = _mm_cmpgt_ps(maxValues, currentValues);
maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, maxValuesIndex) , _mm_andnot_ps(compareResults, currentIndexes));
maxValues = _mm_or_ps(_mm_and_ps(compareResults, maxValues) , _mm_andnot_ps(compareResults, currentValues));
maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, maxValuesIndex), _mm_andnot_ps(compareResults, currentIndexes));
maxValues = _mm_or_ps(_mm_and_ps(compareResults, maxValues), _mm_andnot_ps(compareResults, currentValues));
}
// Calculate the largest value from the remaining 4 points
_mm_store_ps(maxValuesBuffer, maxValues);
_mm_store_ps(maxIndexesBuffer, maxValuesIndex);
for(number = 0; number < 4; number++)
for (number = 0; number < 4; number++)
{
if(maxValuesBuffer[number] > max)
if (maxValuesBuffer[number] > max)
{
index = maxIndexesBuffer[number];
max = maxValuesBuffer[number];
@ -359,9 +373,9 @@ static inline void volk_gnsssdr_32f_index_max_32u_a_sse(uint32_t* target, const
}
number = quarterPoints * 4;
for(;number < num_points; number++)
for (; number < num_points; number++)
{
if(src0[number] > max)
if (src0[number] > max)
{
index = number;
max = src0[number];
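_mm_blendv_ps is an SSE4.1 instruction, so this plain-SSE variant composes the same per-lane select from and/andnot/or: (mask & a) | (~mask & b). The idiom in scalar form (illustrative only, not part of this commit):

#include <stdint.h>
#include <stdio.h>

/* blendv replacement available on plain SSE: (mask & a) | (~mask & b). */
static uint32_t select_bits(uint32_t mask, uint32_t a, uint32_t b)
{
    return (mask & a) | (~mask & b);
}

int main(void)
{
    /* an all-ones mask keeps a, a zero mask keeps b */
    printf("%u %u\n", select_bits(0xFFFFFFFFu, 1u, 2u), select_bits(0u, 1u, 2u));
    return 0;
}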
@ -376,11 +390,11 @@ static inline void volk_gnsssdr_32f_index_max_32u_a_sse(uint32_t* target, const
#ifdef LV_HAVE_SSE
#include<xmmintrin.h>
#include <xmmintrin.h>
static inline void volk_gnsssdr_32f_index_max_32u_u_sse(uint32_t* target, const float* src0, uint32_t num_points)
{
if(num_points > 0)
if (num_points > 0)
{
uint32_t number = 0;
const uint32_t quarterPoints = num_points / 4;
@ -388,7 +402,7 @@ static inline void volk_gnsssdr_32f_index_max_32u_u_sse(uint32_t* target, const
float* inputPtr = (float*)src0;
__m128 indexIncrementValues = _mm_set1_ps(4);
__m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
__m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
float max = src0[0];
float index = 0;
@ -397,25 +411,28 @@ static inline void volk_gnsssdr_32f_index_max_32u_u_sse(uint32_t* target, const
__m128 compareResults;
__m128 currentValues;
__VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
__VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
__VOLK_ATTR_ALIGNED(16)
float maxValuesBuffer[4];
__VOLK_ATTR_ALIGNED(16)
float maxIndexesBuffer[4];
for(;number < quarterPoints; number++)
for (; number < quarterPoints; number++)
{
currentValues = _mm_loadu_ps(inputPtr); inputPtr += 4;
currentValues = _mm_loadu_ps(inputPtr);
inputPtr += 4;
currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
compareResults = _mm_cmpgt_ps(maxValues, currentValues);
maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, maxValuesIndex) , _mm_andnot_ps(compareResults, currentIndexes));
maxValues = _mm_or_ps(_mm_and_ps(compareResults, maxValues) , _mm_andnot_ps(compareResults, currentValues));
maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, maxValuesIndex), _mm_andnot_ps(compareResults, currentIndexes));
maxValues = _mm_or_ps(_mm_and_ps(compareResults, maxValues), _mm_andnot_ps(compareResults, currentValues));
}
// Calculate the largest value from the remaining 4 points
_mm_store_ps(maxValuesBuffer, maxValues);
_mm_store_ps(maxIndexesBuffer, maxValuesIndex);
for(number = 0; number < 4; number++)
for (number = 0; number < 4; number++)
{
if(maxValuesBuffer[number] > max)
if (maxValuesBuffer[number] > max)
{
index = maxIndexesBuffer[number];
max = maxValuesBuffer[number];
@ -423,9 +440,9 @@ static inline void volk_gnsssdr_32f_index_max_32u_u_sse(uint32_t* target, const
}
number = quarterPoints * 4;
for(;number < num_points; number++)
for (; number < num_points; number++)
{
if(src0[number] > max)
if (src0[number] > max)
{
index = number;
max = src0[number];
@ -442,16 +459,16 @@ static inline void volk_gnsssdr_32f_index_max_32u_u_sse(uint32_t* target, const
static inline void volk_gnsssdr_32f_index_max_32u_generic(uint32_t* target, const float* src0, uint32_t num_points)
{
if(num_points > 0)
if (num_points > 0)
{
float max = src0[0];
uint32_t index = 0;
uint32_t i = 1;
for(; i < num_points; ++i)
for (; i < num_points; ++i)
{
if(src0[i] > max)
if (src0[i] > max)
{
index = i;
max = src0[i];
@ -469,14 +486,15 @@ static inline void volk_gnsssdr_32f_index_max_32u_generic(uint32_t* target, cons
static inline void volk_gnsssdr_32f_index_max_32u_neon(uint32_t* target, const float* src0, uint32_t num_points)
{
if(num_points > 0)
if (num_points > 0)
{
uint32_t number = 0;
const uint32_t quarterPoints = num_points / 4;
float* inputPtr = (float*)src0;
float32x4_t indexIncrementValues = vdupq_n_f32(4);
__VOLK_ATTR_ALIGNED(16) float currentIndexes_float[4] = { -4.0f, -3.0f, -2.0f, -1.0f };
__VOLK_ATTR_ALIGNED(16)
float currentIndexes_float[4] = {-4.0f, -3.0f, -2.0f, -1.0f};
float32x4_t currentIndexes = vld1q_f32(currentIndexes_float);
float max = src0[0];
@ -487,25 +505,28 @@ static inline void volk_gnsssdr_32f_index_max_32u_neon(uint32_t* target, const f
uint32x4_t currentIndexes_u;
float32x4_t currentValues;
__VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
__VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
__VOLK_ATTR_ALIGNED(16)
float maxValuesBuffer[4];
__VOLK_ATTR_ALIGNED(16)
float maxIndexesBuffer[4];
for(;number < quarterPoints; number++)
for (; number < quarterPoints; number++)
{
currentValues = vld1q_f32(inputPtr); inputPtr += 4;
currentIndexes = vaddq_f32(currentIndexes, indexIncrementValues);
currentValues = vld1q_f32(inputPtr);
inputPtr += 4;
currentIndexes = vaddq_f32(currentIndexes, indexIncrementValues);
currentIndexes_u = vcvtq_u32_f32(currentIndexes);
compareResults = vcgtq_f32( maxValues, currentValues);
maxValuesIndex = vorrq_u32( vandq_u32( compareResults, maxValuesIndex ), vbicq_u32(currentIndexes_u, compareResults) );
maxValues = vmaxq_f32(currentValues, maxValues);
compareResults = vcgtq_f32(maxValues, currentValues);
maxValuesIndex = vorrq_u32(vandq_u32(compareResults, maxValuesIndex), vbicq_u32(currentIndexes_u, compareResults));
maxValues = vmaxq_f32(currentValues, maxValues);
}
// Calculate the largest value from the remaining 4 points
vst1q_f32(maxValuesBuffer, maxValues);
vst1q_f32(maxIndexesBuffer, vcvtq_f32_u32(maxValuesIndex));
for(number = 0; number < 4; number++)
for (number = 0; number < 4; number++)
{
if(maxValuesBuffer[number] > max)
if (maxValuesBuffer[number] > max)
{
index = maxIndexesBuffer[number];
max = maxValuesBuffer[number];
@ -513,9 +534,9 @@ static inline void volk_gnsssdr_32f_index_max_32u_neon(uint32_t* target, const f
}
number = quarterPoints * 4;
for(;number < num_points; number++)
for (; number < num_points; number++)
{
if(src0[number] > max)
if (src0[number] > max)
{
index = number;
max = src0[number];
@ -528,4 +549,3 @@ static inline void volk_gnsssdr_32f_index_max_32u_neon(uint32_t* target, const f
#endif /*LV_HAVE_NEON*/
#endif /*INCLUDED_volk_gnsssdr_32f_index_max_32u_H*/


@ -42,31 +42,30 @@
#include <string.h>
#ifdef LV_HAVE_GENERIC
static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_generic(float* result, const float* local_code, unsigned int num_points)
{
int code_length_chips = 2046;
float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points );
float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
int num_out_vectors = 3;
float rem_code_phase_chips = -0.234;
unsigned int n;
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
float shifts_chips[3] = {-0.1, 0.0, 0.1};
float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment());
for(n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment());
}
float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment());
for (n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment());
}
volk_gnsssdr_32f_xn_resampler_32f_xn_generic(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
memcpy((float*)result, (float*)result_aux[0], sizeof(float) * num_points);
for(n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
for (n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
volk_gnsssdr_free(result_aux);
}
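Each puppet allocates its scratch rows with volk_gnsssdr_malloc so every row meets the SIMD alignment reported by volk_gnsssdr_get_alignment(), and releases them with volk_gnsssdr_free. A self-contained sketch of the same allocate/use/free pattern, with posix_memalign and a fixed 32-byte alignment standing in for the library calls (an assumption made only so the sketch compiles outside the library):

#define _POSIX_C_SOURCE 200112L
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
    const unsigned int num_out_vectors = 3, num_points = 16;
    const size_t alignment = 32; /* stand-in for volk_gnsssdr_get_alignment() */
    unsigned int n;
    float** result_aux = (float**)malloc(sizeof(float*) * num_out_vectors);
    if (!result_aux) return 1;
    for (n = 0; n < num_out_vectors; n++)
        {
            /* posix_memalign stands in for volk_gnsssdr_malloc(size, alignment) */
            if (posix_memalign((void**)&result_aux[n], alignment, sizeof(float) * num_points) != 0) return 1;
            memset(result_aux[n], 0, sizeof(float) * num_points);
        }
    printf("allocated %u aligned rows of %u floats\n", num_out_vectors, num_points);
    for (n = 0; n < num_out_vectors; n++)
        {
            free(result_aux[n]); /* volk_gnsssdr_free in the real code */
        }
    free(result_aux);
    return 0;
}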
@ -77,26 +76,26 @@ static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_generic(float* result,
static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_a_sse3(float* result, const float* local_code, unsigned int num_points)
{
int code_length_chips = 2046;
float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points );
float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
int num_out_vectors = 3;
float rem_code_phase_chips = -0.234;
unsigned int n;
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
float shifts_chips[3] = {-0.1, 0.0, 0.1};
float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment());
for(n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment());
}
float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment());
for (n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment());
}
volk_gnsssdr_32f_xn_resampler_32f_xn_a_sse3(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
memcpy((float*)result, (float*)result_aux[0], sizeof(float) * num_points);
for(n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
for (n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
volk_gnsssdr_free(result_aux);
}
@ -106,26 +105,26 @@ static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_a_sse3(float* result,
static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_u_sse3(float* result, const float* local_code, unsigned int num_points)
{
int code_length_chips = 2046;
float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points );
float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
int num_out_vectors = 3;
float rem_code_phase_chips = -0.234;
unsigned int n;
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
float shifts_chips[3] = {-0.1, 0.0, 0.1};
float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment());
for(n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment());
}
float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment());
for (n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment());
}
volk_gnsssdr_32f_xn_resampler_32f_xn_u_sse3(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
memcpy((float*)result, (float*)result_aux[0], sizeof(float) * num_points);
for(n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
for (n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
volk_gnsssdr_free(result_aux);
}
@ -136,26 +135,26 @@ static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_u_sse3(float* result,
static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_u_sse4_1(float* result, const float* local_code, unsigned int num_points)
{
int code_length_chips = 2046;
float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points );
float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
int num_out_vectors = 3;
float rem_code_phase_chips = -0.234;
unsigned int n;
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
float shifts_chips[3] = {-0.1, 0.0, 0.1};
float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment());
for(n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment());
}
float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment());
for (n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment());
}
volk_gnsssdr_32f_xn_resampler_32f_xn_u_sse4_1(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
memcpy((float*)result, (float*)result_aux[0], sizeof(float) * num_points);
for(n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
for (n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
volk_gnsssdr_free(result_aux);
}
@ -165,26 +164,26 @@ static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_u_sse4_1(float* result
static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_a_sse4_1(float* result, const float* local_code, unsigned int num_points)
{
int code_length_chips = 2046;
float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points );
float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
int num_out_vectors = 3;
float rem_code_phase_chips = -0.234;
unsigned int n;
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
float shifts_chips[3] = {-0.1, 0.0, 0.1};
float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment());
for(n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment());
}
float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment());
for (n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment());
}
volk_gnsssdr_32f_xn_resampler_32f_xn_a_sse4_1(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
memcpy((float*)result, (float*)result_aux[0], sizeof(float) * num_points);
for(n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
for (n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
volk_gnsssdr_free(result_aux);
}
@ -194,26 +193,26 @@ static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_a_sse4_1(float* result
static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_a_avx(float* result, const float* local_code, unsigned int num_points)
{
int code_length_chips = 2046;
float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points );
float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
int num_out_vectors = 3;
float rem_code_phase_chips = -0.234;
unsigned int n;
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
float shifts_chips[3] = {-0.1, 0.0, 0.1};
float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment());
for(n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment());
}
float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment());
for (n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment());
}
volk_gnsssdr_32f_xn_resampler_32f_xn_a_avx(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
memcpy((float*)result, (float*)result_aux[0], sizeof(float) * num_points);
for(n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
for (n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
volk_gnsssdr_free(result_aux);
}
#endif
@ -223,26 +222,26 @@ static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_a_avx(float* result, c
static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_u_avx(float* result, const float* local_code, unsigned int num_points)
{
int code_length_chips = 2046;
float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points );
float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
int num_out_vectors = 3;
float rem_code_phase_chips = -0.234;
unsigned int n;
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
float shifts_chips[3] = {-0.1, 0.0, 0.1};
float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment());
for(n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment());
}
float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment());
for (n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment());
}
volk_gnsssdr_32f_xn_resampler_32f_xn_u_avx(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
memcpy((float*)result, (float*)result_aux[0], sizeof(float) * num_points);
for(n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
for (n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
volk_gnsssdr_free(result_aux);
}
#endif
@ -251,29 +250,28 @@ static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_u_avx(float* result, c
static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_neon(float* result, const float* local_code, unsigned int num_points)
{
int code_length_chips = 2046;
float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points );
float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
int num_out_vectors = 3;
float rem_code_phase_chips = -0.234;
unsigned int n;
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
float shifts_chips[3] = {-0.1, 0.0, 0.1};
float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment());
for(n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment());
}
float** result_aux = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_out_vectors, volk_gnsssdr_get_alignment());
for (n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment());
}
volk_gnsssdr_32f_xn_resampler_32f_xn_neon(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
memcpy((float*)result, (float*)result_aux[0], sizeof(float) * num_points);
for(n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
for (n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
volk_gnsssdr_free(result_aux);
}
#endif
#endif // INCLUDED_volk_gnsssdr_32f_resamplerpuppet_32f_H
#endif // INCLUDED_volk_gnsssdr_32f_resamplerpuppet_32f_H


@ -97,7 +97,7 @@ static inline void volk_gnsssdr_32f_sincos_32fc_u_sse4_1(lv_32fc_t* out, const f
cp4 = _mm_set1_ps(0.49603e-4);
cp5 = _mm_set1_ps(0.551e-6);
for(;number < quarterPoints; number++)
for (; number < quarterPoints; number++)
{
aVal = _mm_loadu_ps(aPtr);
__VOLK_GNSSSDR_PREFETCH(aPtr + 8);
@ -108,12 +108,12 @@ static inline void volk_gnsssdr_32f_sincos_32fc_u_sse4_1(lv_32fc_t* out, const f
s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
s = _mm_div_ps(s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
s = _mm_div_ps(s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
s = _mm_mul_ps(s, s);
// Evaluate Taylor series
s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s);
for(i = 0; i < 3; i++)
for (i = 0; i < 3; i++)
{
s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
}
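The division by 8.0 and the three passes of s = s * (4 - s) are the argument reduction the comment refers to: with s = 2*(1 - cos(x)), the identity s*(4 - s) = 2*(1 - cos(2x)) doubles the angle, so the Taylor polynomial is evaluated at x/8 and the recurrence is applied N = 3 times (2^N = 8) to get back to x. A short numerical check of the identity (illustrative only):

#include <math.h>
#include <stdio.h>

int main(void)
{
    double x = 0.7;
    /* the Taylor polynomial in the kernel approximates 2*(1 - cos(x/8)) */
    double s = 2.0 * (1.0 - cos(x / 8.0));
    int i;
    for (i = 0; i < 3; i++)
        {
            s = s * (4.0 - s); /* each pass doubles the angle */
        }
    printf("%.9f %.9f\n", s, 2.0 * (1.0 - cos(x))); /* the two values should match */
    return 0;
}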
@ -145,7 +145,7 @@ static inline void volk_gnsssdr_32f_sincos_32fc_u_sse4_1(lv_32fc_t* out, const f
}
number = quarterPoints * 4;
for(;number < num_points; number++)
for (; number < num_points; number++)
{
float _in = *aPtr++;
*bPtr++ = lv_cmake(cosf(_in), sinf(_in));
@ -191,7 +191,7 @@ static inline void volk_gnsssdr_32f_sincos_32fc_a_sse4_1(lv_32fc_t* out, const f
cp4 = _mm_set1_ps(0.49603e-4);
cp5 = _mm_set1_ps(0.551e-6);
for(;number < quarterPoints; number++)
for (; number < quarterPoints; number++)
{
aVal = _mm_load_ps(aPtr);
__VOLK_GNSSSDR_PREFETCH(aPtr + 8);
@ -202,12 +202,12 @@ static inline void volk_gnsssdr_32f_sincos_32fc_a_sse4_1(lv_32fc_t* out, const f
s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
s = _mm_div_ps(s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
s = _mm_div_ps(s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
s = _mm_mul_ps(s, s);
// Evaluate Taylor series
s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s), cp3), s), cp2), s), cp1), s);
for(i = 0; i < 3; i++)
for (i = 0; i < 3; i++)
{
s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
}
@ -239,7 +239,7 @@ static inline void volk_gnsssdr_32f_sincos_32fc_a_sse4_1(lv_32fc_t* out, const f
}
number = quarterPoints * 4;
for(;number < num_points; number++)
for (; number < num_points; number++)
{
float _in = *aPtr++;
*bPtr++ = lv_cmake(cosf(_in), sinf(_in));
@ -265,31 +265,49 @@ static inline void volk_gnsssdr_32f_sincos_32fc_a_sse2(lv_32fc_t* out, const flo
__m128 sine, cosine, aux, x;
__m128 xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
__m128i emm0, emm2, emm4;
__m128i emm0, emm2, emm4;
/* declare some SSE constants */
__VOLK_ATTR_ALIGNED(16) static const int _ps_inv_sign_mask[4] = { ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000 };
__VOLK_ATTR_ALIGNED(16) static const int _ps_sign_mask[4] = { (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000 };
__VOLK_ATTR_ALIGNED(16)
static const int _ps_inv_sign_mask[4] = {~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000};
__VOLK_ATTR_ALIGNED(16)
static const int _ps_sign_mask[4] = {(int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000};
__VOLK_ATTR_ALIGNED(16) static const float _ps_cephes_FOPI[4] = { 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516 };
__VOLK_ATTR_ALIGNED(16) static const int _pi32_1[4] = { 1, 1, 1, 1 };
__VOLK_ATTR_ALIGNED(16) static const int _pi32_inv1[4] = { ~1, ~1, ~1, ~1 };
__VOLK_ATTR_ALIGNED(16) static const int _pi32_2[4] = { 2, 2, 2, 2};
__VOLK_ATTR_ALIGNED(16) static const int _pi32_4[4] = { 4, 4, 4, 4};
__VOLK_ATTR_ALIGNED(16)
static const float _ps_cephes_FOPI[4] = {1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516};
__VOLK_ATTR_ALIGNED(16)
static const int _pi32_1[4] = {1, 1, 1, 1};
__VOLK_ATTR_ALIGNED(16)
static const int _pi32_inv1[4] = {~1, ~1, ~1, ~1};
__VOLK_ATTR_ALIGNED(16)
static const int _pi32_2[4] = {2, 2, 2, 2};
__VOLK_ATTR_ALIGNED(16)
static const int _pi32_4[4] = {4, 4, 4, 4};
__VOLK_ATTR_ALIGNED(16) static const float _ps_minus_cephes_DP1[4] = { -0.78515625, -0.78515625, -0.78515625, -0.78515625 };
__VOLK_ATTR_ALIGNED(16) static const float _ps_minus_cephes_DP2[4] = { -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4 };
__VOLK_ATTR_ALIGNED(16) static const float _ps_minus_cephes_DP3[4] = { -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8 };
__VOLK_ATTR_ALIGNED(16) static const float _ps_coscof_p0[4] = { 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005 };
__VOLK_ATTR_ALIGNED(16) static const float _ps_coscof_p1[4] = { -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003 };
__VOLK_ATTR_ALIGNED(16) static const float _ps_coscof_p2[4] = { 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002 };
__VOLK_ATTR_ALIGNED(16) static const float _ps_sincof_p0[4] = { -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4 };
__VOLK_ATTR_ALIGNED(16) static const float _ps_sincof_p1[4] = { 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3 };
__VOLK_ATTR_ALIGNED(16) static const float _ps_sincof_p2[4] = { -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1 };
__VOLK_ATTR_ALIGNED(16) static const float _ps_0p5[4] = { 0.5f, 0.5f, 0.5f, 0.5f };
__VOLK_ATTR_ALIGNED(16) static const float _ps_1[4] = { 1.0f, 1.0f, 1.0f, 1.0f };
__VOLK_ATTR_ALIGNED(16)
static const float _ps_minus_cephes_DP1[4] = {-0.78515625, -0.78515625, -0.78515625, -0.78515625};
__VOLK_ATTR_ALIGNED(16)
static const float _ps_minus_cephes_DP2[4] = {-2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4};
__VOLK_ATTR_ALIGNED(16)
static const float _ps_minus_cephes_DP3[4] = {-3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8};
__VOLK_ATTR_ALIGNED(16)
static const float _ps_coscof_p0[4] = {2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005};
__VOLK_ATTR_ALIGNED(16)
static const float _ps_coscof_p1[4] = {-1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003};
__VOLK_ATTR_ALIGNED(16)
static const float _ps_coscof_p2[4] = {4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002};
__VOLK_ATTR_ALIGNED(16)
static const float _ps_sincof_p0[4] = {-1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4};
__VOLK_ATTR_ALIGNED(16)
static const float _ps_sincof_p1[4] = {8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3};
__VOLK_ATTR_ALIGNED(16)
static const float _ps_sincof_p2[4] = {-1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1};
__VOLK_ATTR_ALIGNED(16)
static const float _ps_0p5[4] = {0.5f, 0.5f, 0.5f, 0.5f};
__VOLK_ATTR_ALIGNED(16)
static const float _ps_1[4] = {1.0f, 1.0f, 1.0f, 1.0f};
for(;number < sse_iters; number++)
for (; number < sse_iters; number++)
{
x = _mm_load_ps(aPtr);
__VOLK_GNSSSDR_PREFETCH(aPtr + 8);
@ -307,19 +325,19 @@ static inline void volk_gnsssdr_32f_sincos_32fc_a_sse2(lv_32fc_t* out, const flo
emm2 = _mm_cvttps_epi32(y);
/* j=(j+1) & (~1) (see the cephes sources) */
emm2 = _mm_add_epi32(emm2, *(__m128i *)_pi32_1);
emm2 = _mm_and_si128(emm2, *(__m128i *)_pi32_inv1);
emm2 = _mm_add_epi32(emm2, *(__m128i*)_pi32_1);
emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_inv1);
y = _mm_cvtepi32_ps(emm2);
emm4 = emm2;
/* get the swap sign flag for the sine */
emm0 = _mm_and_si128(emm2, *(__m128i *)_pi32_4);
emm0 = _mm_and_si128(emm2, *(__m128i*)_pi32_4);
emm0 = _mm_slli_epi32(emm0, 29);
__m128 swap_sign_bit_sin = _mm_castsi128_ps(emm0);
/* get the polynomial selection mask for the sine */
emm2 = _mm_and_si128(emm2, *(__m128i *)_pi32_2);
emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_2);
emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
__m128 poly_mask = _mm_castsi128_ps(emm2);
@ -335,15 +353,15 @@ static inline void volk_gnsssdr_32f_sincos_32fc_a_sse2(lv_32fc_t* out, const flo
x = _mm_add_ps(x, xmm2);
x = _mm_add_ps(x, xmm3);
emm4 = _mm_sub_epi32(emm4, *(__m128i *)_pi32_2);
emm4 = _mm_andnot_si128(emm4, *(__m128i *)_pi32_4);
emm4 = _mm_sub_epi32(emm4, *(__m128i*)_pi32_2);
emm4 = _mm_andnot_si128(emm4, *(__m128i*)_pi32_4);
emm4 = _mm_slli_epi32(emm4, 29);
__m128 sign_bit_cos = _mm_castsi128_ps(emm4);
sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);
/* Evaluate the first polynomial (0 <= x <= Pi/4) */
__m128 z = _mm_mul_ps(x,x);
__m128 z = _mm_mul_ps(x, x);
y = *(__m128*)_ps_coscof_p0;
y = _mm_mul_ps(y, z);
@ -371,11 +389,11 @@ static inline void volk_gnsssdr_32f_sincos_32fc_a_sse2(lv_32fc_t* out, const flo
xmm3 = poly_mask;
__m128 ysin2 = _mm_and_ps(xmm3, y2);
__m128 ysin1 = _mm_andnot_ps(xmm3, y);
y2 = _mm_sub_ps(y2,ysin2);
y2 = _mm_sub_ps(y2, ysin2);
y = _mm_sub_ps(y, ysin1);
xmm1 = _mm_add_ps(ysin1,ysin2);
xmm2 = _mm_add_ps(y,y2);
xmm1 = _mm_add_ps(ysin1, ysin2);
xmm2 = _mm_add_ps(y, y2);
/* update the sign */
sine = _mm_xor_ps(xmm1, sign_bit_sin);
@ -392,12 +410,11 @@ static inline void volk_gnsssdr_32f_sincos_32fc_a_sse2(lv_32fc_t* out, const flo
aPtr += 4;
}
for(number = sse_iters * 4; number < num_points; number++)
for (number = sse_iters * 4; number < num_points; number++)
{
_in = *aPtr++;
*bPtr++ = lv_cmake((float)cosf(_in), (float)sinf(_in) );
*bPtr++ = lv_cmake((float)cosf(_in), (float)sinf(_in));
}
}
#endif /* LV_HAVE_SSE2 */
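Both SSE2 variants follow the classic Cephes recipe: multiply by 4/pi, round the quotient up to an even integer j, subtract j*pi/4 in extended precision (the three DP constants), and then choose between the sine and cosine polynomials and their signs from the low bits of j. A scalar sketch of that quadrant bookkeeping, with libm sinf/cosf standing in for the two minimax polynomials (illustrative only, not part of this commit):

#include <math.h>
#include <stdio.h>

/* Quadrant bookkeeping of the Cephes-style kernels above. */
static void sincos_quadrant(float in, float* s, float* c)
{
    int j = (int)(fabsf(in) * 1.27323954473516f); /* 4/pi */
    j = (j + 1) & ~1;                             /* round up to an even quadrant count */
    float x = fabsf(in) - (float)j * 0.7853981633974483f; /* subtract j*pi/4 */
    float sx = sinf(x); /* stands in for the sine polynomial on [-pi/4, pi/4] */
    float cx = cosf(x); /* stands in for the cosine polynomial */
    switch ((j >> 1) & 3) /* quadrant selected by the low bits of j */
        {
        case 0: *s = sx; *c = cx; break;
        case 1: *s = cx; *c = -sx; break;
        case 2: *s = -sx; *c = -cx; break;
        default: *s = -cx; *c = sx; break;
        }
    if (in < 0.0f) *s = -*s; /* sine is odd, cosine is even */
}

int main(void)
{
    float s, c;
    sincos_quadrant(2.5f, &s, &c);
    printf("%f %f (libm: %f %f)\n", s, c, sinf(2.5f), cosf(2.5f));
    return 0;
}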
@ -418,31 +435,49 @@ static inline void volk_gnsssdr_32f_sincos_32fc_u_sse2(lv_32fc_t* out, const flo
__m128 sine, cosine, aux, x;
__m128 xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
__m128i emm0, emm2, emm4;
__m128i emm0, emm2, emm4;
/* declare some SSE constants */
__VOLK_ATTR_ALIGNED(16) static const int _ps_inv_sign_mask[4] = { ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000 };
__VOLK_ATTR_ALIGNED(16) static const int _ps_sign_mask[4] = { (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000 };
__VOLK_ATTR_ALIGNED(16)
static const int _ps_inv_sign_mask[4] = {~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000};
__VOLK_ATTR_ALIGNED(16)
static const int _ps_sign_mask[4] = {(int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000};
__VOLK_ATTR_ALIGNED(16) static const float _ps_cephes_FOPI[4] = { 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516 };
__VOLK_ATTR_ALIGNED(16) static const int _pi32_1[4] = { 1, 1, 1, 1 };
__VOLK_ATTR_ALIGNED(16) static const int _pi32_inv1[4] = { ~1, ~1, ~1, ~1 };
__VOLK_ATTR_ALIGNED(16) static const int _pi32_2[4] = { 2, 2, 2, 2};
__VOLK_ATTR_ALIGNED(16) static const int _pi32_4[4] = { 4, 4, 4, 4};
__VOLK_ATTR_ALIGNED(16)
static const float _ps_cephes_FOPI[4] = {1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516};
__VOLK_ATTR_ALIGNED(16)
static const int _pi32_1[4] = {1, 1, 1, 1};
__VOLK_ATTR_ALIGNED(16)
static const int _pi32_inv1[4] = {~1, ~1, ~1, ~1};
__VOLK_ATTR_ALIGNED(16)
static const int _pi32_2[4] = {2, 2, 2, 2};
__VOLK_ATTR_ALIGNED(16)
static const int _pi32_4[4] = {4, 4, 4, 4};
__VOLK_ATTR_ALIGNED(16) static const float _ps_minus_cephes_DP1[4] = { -0.78515625, -0.78515625, -0.78515625, -0.78515625 };
__VOLK_ATTR_ALIGNED(16) static const float _ps_minus_cephes_DP2[4] = { -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4 };
__VOLK_ATTR_ALIGNED(16) static const float _ps_minus_cephes_DP3[4] = { -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8 };
__VOLK_ATTR_ALIGNED(16) static const float _ps_coscof_p0[4] = { 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005 };
__VOLK_ATTR_ALIGNED(16) static const float _ps_coscof_p1[4] = { -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003 };
__VOLK_ATTR_ALIGNED(16) static const float _ps_coscof_p2[4] = { 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002 };
__VOLK_ATTR_ALIGNED(16) static const float _ps_sincof_p0[4] = { -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4 };
__VOLK_ATTR_ALIGNED(16) static const float _ps_sincof_p1[4] = { 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3 };
__VOLK_ATTR_ALIGNED(16) static const float _ps_sincof_p2[4] = { -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1 };
__VOLK_ATTR_ALIGNED(16) static const float _ps_0p5[4] = { 0.5f, 0.5f, 0.5f, 0.5f };
__VOLK_ATTR_ALIGNED(16) static const float _ps_1[4] = { 1.0f, 1.0f, 1.0f, 1.0f };
__VOLK_ATTR_ALIGNED(16)
static const float _ps_minus_cephes_DP1[4] = {-0.78515625, -0.78515625, -0.78515625, -0.78515625};
__VOLK_ATTR_ALIGNED(16)
static const float _ps_minus_cephes_DP2[4] = {-2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4};
__VOLK_ATTR_ALIGNED(16)
static const float _ps_minus_cephes_DP3[4] = {-3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8};
__VOLK_ATTR_ALIGNED(16)
static const float _ps_coscof_p0[4] = {2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005};
__VOLK_ATTR_ALIGNED(16)
static const float _ps_coscof_p1[4] = {-1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003};
__VOLK_ATTR_ALIGNED(16)
static const float _ps_coscof_p2[4] = {4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002};
__VOLK_ATTR_ALIGNED(16)
static const float _ps_sincof_p0[4] = {-1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4};
__VOLK_ATTR_ALIGNED(16)
static const float _ps_sincof_p1[4] = {8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3};
__VOLK_ATTR_ALIGNED(16)
static const float _ps_sincof_p2[4] = {-1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1};
__VOLK_ATTR_ALIGNED(16)
static const float _ps_0p5[4] = {0.5f, 0.5f, 0.5f, 0.5f};
__VOLK_ATTR_ALIGNED(16)
static const float _ps_1[4] = {1.0f, 1.0f, 1.0f, 1.0f};
for(;number < sse_iters; number++)
for (; number < sse_iters; number++)
{
x = _mm_loadu_ps(aPtr);
__VOLK_GNSSSDR_PREFETCH(aPtr + 8);
@ -460,19 +495,19 @@ static inline void volk_gnsssdr_32f_sincos_32fc_u_sse2(lv_32fc_t* out, const flo
emm2 = _mm_cvttps_epi32(y);
/* j=(j+1) & (~1) (see the cephes sources) */
emm2 = _mm_add_epi32(emm2, *(__m128i *)_pi32_1);
emm2 = _mm_and_si128(emm2, *(__m128i *)_pi32_inv1);
emm2 = _mm_add_epi32(emm2, *(__m128i*)_pi32_1);
emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_inv1);
y = _mm_cvtepi32_ps(emm2);
emm4 = emm2;
/* get the swap sign flag for the sine */
emm0 = _mm_and_si128(emm2, *(__m128i *)_pi32_4);
emm0 = _mm_and_si128(emm2, *(__m128i*)_pi32_4);
emm0 = _mm_slli_epi32(emm0, 29);
__m128 swap_sign_bit_sin = _mm_castsi128_ps(emm0);
/* get the polynomial selection mask for the sine */
emm2 = _mm_and_si128(emm2, *(__m128i *)_pi32_2);
emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_2);
emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
__m128 poly_mask = _mm_castsi128_ps(emm2);
@ -488,15 +523,15 @@ static inline void volk_gnsssdr_32f_sincos_32fc_u_sse2(lv_32fc_t* out, const flo
x = _mm_add_ps(x, xmm2);
x = _mm_add_ps(x, xmm3);
emm4 = _mm_sub_epi32(emm4, *(__m128i *)_pi32_2);
emm4 = _mm_andnot_si128(emm4, *(__m128i *)_pi32_4);
emm4 = _mm_sub_epi32(emm4, *(__m128i*)_pi32_2);
emm4 = _mm_andnot_si128(emm4, *(__m128i*)_pi32_4);
emm4 = _mm_slli_epi32(emm4, 29);
__m128 sign_bit_cos = _mm_castsi128_ps(emm4);
sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);
/* Evaluate the first polynomial (0 <= x <= Pi/4) */
__m128 z = _mm_mul_ps(x,x);
__m128 z = _mm_mul_ps(x, x);
y = *(__m128*)_ps_coscof_p0;
y = _mm_mul_ps(y, z);
@ -524,11 +559,11 @@ static inline void volk_gnsssdr_32f_sincos_32fc_u_sse2(lv_32fc_t* out, const flo
xmm3 = poly_mask;
__m128 ysin2 = _mm_and_ps(xmm3, y2);
__m128 ysin1 = _mm_andnot_ps(xmm3, y);
y2 = _mm_sub_ps(y2,ysin2);
y2 = _mm_sub_ps(y2, ysin2);
y = _mm_sub_ps(y, ysin1);
xmm1 = _mm_add_ps(ysin1,ysin2);
xmm2 = _mm_add_ps(y,y2);
xmm1 = _mm_add_ps(ysin1, ysin2);
xmm2 = _mm_add_ps(y, y2);
/* update the sign */
sine = _mm_xor_ps(xmm1, sign_bit_sin);
@ -545,12 +580,11 @@ static inline void volk_gnsssdr_32f_sincos_32fc_u_sse2(lv_32fc_t* out, const flo
aPtr += 4;
}
for(number = sse_iters * 4; number < num_points; number++)
for (number = sse_iters * 4; number < num_points; number++)
{
_in = *aPtr++;
*bPtr++ = lv_cmake((float)cosf(_in), (float)sinf(_in) );
*bPtr++ = lv_cmake((float)cosf(_in), (float)sinf(_in));
}
}
#endif /* LV_HAVE_SSE2 */
@ -561,10 +595,10 @@ static inline void volk_gnsssdr_32f_sincos_32fc_generic(lv_32fc_t* out, const fl
{
float _in;
unsigned int i;
for(i = 0; i < num_points; i++)
for (i = 0; i < num_points; i++)
{
_in = *in++;
*out++ = lv_cmake((float)cosf(_in), (float)sinf(_in) );
*out++ = lv_cmake((float)cosf(_in), (float)sinf(_in));
}
}
@ -586,12 +620,12 @@ static inline void volk_gnsssdr_32f_sincos_32fc_generic_fxpt(lv_32fc_t* out, con
const int32_t diffbits = bitlength - Nbits;
uint32_t ux;
unsigned int i;
for(i = 0; i < num_points; i++)
for (i = 0; i < num_points; i++)
{
_in = *in++;
d = (int32_t)floor(_in / TWO_PI + 0.5);
_in -= d * TWO_PI;
x = (int32_t) ((float)_in * TWO_TO_THE_31_DIV_PI);
x = (int32_t)((float)_in * TWO_TO_THE_31_DIV_PI);
ux = x;
sin_index = ux >> diffbits;
@ -601,7 +635,7 @@ static inline void volk_gnsssdr_32f_sincos_32fc_generic_fxpt(lv_32fc_t* out, con
cos_index = ux >> diffbits;
c = sine_table_10bits[cos_index][0] * (ux >> 1) + sine_table_10bits[cos_index][1];
*out++ = lv_cmake((float)c, (float)s );
*out++ = lv_cmake((float)c, (float)s);
}
}
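The fixed-point variant maps the reduced phase onto a signed 32-bit fraction of pi (TWO_TO_THE_31_DIV_PI), so phase wraparound comes free with integer overflow, and the top Nbits = 10 bits select a row of sine_table_10bits, each row holding a slope/intercept pair for linear interpolation. A sketch of the index computation, with the constants assumed from the surrounding code and the table itself elided:

#include <math.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    const float TWO_PI = 6.283185307179586f;
    const float TWO_TO_THE_31_DIV_PI = 2147483648.0f / 3.14159265358979f;
    const int32_t Nbits = 10;
    const int32_t diffbits = 32 - Nbits;
    float _in = 7.1f; /* arbitrary phase, more than one full turn */
    int32_t d = (int32_t)floor(_in / TWO_PI + 0.5);
    _in -= d * TWO_PI; /* reduce to the principal value */
    int32_t x = (int32_t)(_in * TWO_TO_THE_31_DIV_PI);
    uint32_t ux = (uint32_t)x;
    uint32_t sin_index = ux >> diffbits; /* top 10 bits select the table row */
    printf("sin_index = %u of %d\n", sin_index, 1 << Nbits);
    return 0;
}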
@ -637,7 +671,7 @@ static inline void volk_gnsssdr_32f_sincos_32fc_neon(lv_32fc_t* out, const float
uint32x4_t emm2, poly_mask, sign_mask_sin, sign_mask_cos;
for(;number < neon_iters; number++)
for (; number < neon_iters; number++)
{
x = vld1q_f32(aPtr);
__VOLK_GNSSSDR_PREFETCH(aPtr + 8);
@ -677,7 +711,7 @@ static inline void volk_gnsssdr_32f_sincos_32fc_neon(lv_32fc_t* out, const float
/* Evaluate the first polynomial (0 <= x <= Pi/4) in y1,
and the second polynomial (Pi/4 <= x <= 0) in y2 */
z = vmulq_f32(x,x);
z = vmulq_f32(x, x);
y1 = vmulq_n_f32(z, c_coscof_p0);
y2 = vmulq_n_f32(z, c_sincof_p0);
@ -706,10 +740,10 @@ static inline void volk_gnsssdr_32f_sincos_32fc_neon(lv_32fc_t* out, const float
aPtr += 4;
}
for(number = neon_iters * 4; number < num_points; number++)
for (number = neon_iters * 4; number < num_points; number++)
{
_in = *aPtr++;
*bPtr++ = lv_cmake((float)cosf(_in), (float)sinf(_in) );
*bPtr++ = lv_cmake((float)cosf(_in), (float)sinf(_in));
}
}


@ -110,7 +110,8 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_a_sse3(float** result, c
const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
__VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4];
__VOLK_ATTR_ALIGNED(16)
int local_code_chip_index[4];
int local_code_chip_index_;
const __m128i zeros = _mm_setzero_si128();
@ -124,7 +125,7 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_a_sse3(float** result, c
shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]);
aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
__m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
for(n = 0; n < quarterPoints; n++)
for (n = 0; n < quarterPoints; n++)
{
aux = _mm_mul_ps(code_phase_step_chips_reg, indexn);
aux = _mm_add_ps(aux, aux2);
@ -145,25 +146,25 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_a_sse3(float** result, c
aux_i = _mm_and_si128(code_length_chips_reg_i, negatives);
local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i);
_mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg);
for(k = 0; k < 4; ++k)
for (k = 0; k < 4; ++k)
{
_result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
}
indexn = _mm_add_ps(indexn, fours);
}
for(n = quarterPoints * 4; n < num_points; n++)
for (n = quarterPoints * 4; n < num_points; n++)
{
// resample code for current tap
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
//Take into account that in multitap correlators, the shifts can be negative!
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ;
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
_result[current_correlator_tap][n] = local_code[local_code_chip_index_];
}
}
}
#endif
#endif
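In the scalar tails above the early correlator taps can make the chip index negative, and C's % operator keeps the sign of the dividend, so the index is first lifted by enough whole code periods before taking the remainder. The idiom in isolation (illustrative only, not part of this commit):

#include <stdlib.h>
#include <stdio.h>

/* Wrap idx into [0, L) even when idx is negative, as in the scalar tails above. */
static int wrap_positive(int idx, int L)
{
    if (idx < 0) idx += L * (abs(idx) / L + 1);
    return idx % L;
}

int main(void)
{
    printf("%d %d %d\n", wrap_positive(-1, 2046), wrap_positive(-4093, 2046), wrap_positive(2047, 2046));
    return 0;
}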
#ifdef LV_HAVE_SSE3
@ -180,7 +181,8 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_u_sse3(float** result, c
const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
__VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4];
__VOLK_ATTR_ALIGNED(16)
int local_code_chip_index[4];
int local_code_chip_index_;
const __m128i zeros = _mm_setzero_si128();
@ -194,7 +196,7 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_u_sse3(float** result, c
shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]);
aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
__m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
for(n = 0; n < quarterPoints; n++)
for (n = 0; n < quarterPoints; n++)
{
aux = _mm_mul_ps(code_phase_step_chips_reg, indexn);
aux = _mm_add_ps(aux, aux2);
@ -215,18 +217,18 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_u_sse3(float** result, c
aux_i = _mm_and_si128(code_length_chips_reg_i, negatives);
local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i);
_mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg);
for(k = 0; k < 4; ++k)
for (k = 0; k < 4; ++k)
{
_result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
}
indexn = _mm_add_ps(indexn, fours);
}
for(n = quarterPoints * 4; n < num_points; n++)
for (n = quarterPoints * 4; n < num_points; n++)
{
// resample code for current tap
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
//Take into account that in multitap correlators, the shifts can be negative!
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ;
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
_result[current_correlator_tap][n] = local_code[local_code_chip_index_];
}
@ -248,7 +250,8 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_a_sse4_1(float** result,
const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
__VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4];
__VOLK_ATTR_ALIGNED(16)
int local_code_chip_index[4];
int local_code_chip_index_;
const __m128i zeros = _mm_setzero_si128();
@ -262,7 +265,7 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_a_sse4_1(float** result,
shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]);
aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
__m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
for(n = 0; n < quarterPoints; n++)
for (n = 0; n < quarterPoints; n++)
{
aux = _mm_mul_ps(code_phase_step_chips_reg, indexn);
aux = _mm_add_ps(aux, aux2);
@ -280,25 +283,25 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_a_sse4_1(float** result,
aux_i = _mm_and_si128(code_length_chips_reg_i, negatives);
local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i);
_mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg);
for(k = 0; k < 4; ++k)
for (k = 0; k < 4; ++k)
{
_result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
}
indexn = _mm_add_ps(indexn, fours);
}
for(n = quarterPoints * 4; n < num_points; n++)
for (n = quarterPoints * 4; n < num_points; n++)
{
// resample code for current tap
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
//Take into account that in multitap correlators, the shifts can be negative!
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ;
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
_result[current_correlator_tap][n] = local_code[local_code_chip_index_];
}
}
}
#endif
#endif
#ifdef LV_HAVE_SSE4_1
@ -314,7 +317,8 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_u_sse4_1(float** result,
const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
__VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4];
__VOLK_ATTR_ALIGNED(16)
int local_code_chip_index[4];
int local_code_chip_index_;
const __m128i zeros = _mm_setzero_si128();
@ -328,7 +332,7 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_u_sse4_1(float** result,
shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]);
aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
__m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
for(n = 0; n < quarterPoints; n++)
for (n = 0; n < quarterPoints; n++)
{
aux = _mm_mul_ps(code_phase_step_chips_reg, indexn);
aux = _mm_add_ps(aux, aux2);
@ -346,18 +350,18 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_u_sse4_1(float** result,
aux_i = _mm_and_si128(code_length_chips_reg_i, negatives);
local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i);
_mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg);
for(k = 0; k < 4; ++k)
for (k = 0; k < 4; ++k)
{
_result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
}
indexn = _mm_add_ps(indexn, fours);
}
for(n = quarterPoints * 4; n < num_points; n++)
for (n = quarterPoints * 4; n < num_points; n++)
{
// resample code for current tap
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
//Take into account that in multitap correlators, the shifts can be negative!
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ;
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
_result[current_correlator_tap][n] = local_code[local_code_chip_index_];
}
@ -380,7 +384,8 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_a_avx(float** result, co
const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips);
const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips);
__VOLK_ATTR_ALIGNED(32) int local_code_chip_index[8];
__VOLK_ATTR_ALIGNED(32)
int local_code_chip_index[8];
int local_code_chip_index_;
const __m256 zeros = _mm256_setzero_ps();
@ -395,7 +400,7 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_a_avx(float** result, co
shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]);
aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
indexn = n0;
for(n = 0; n < avx_iters; n++)
for (n = 0; n < avx_iters; n++)
{
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0);
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3);
@ -413,13 +418,13 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_a_avx(float** result, co
// no negatives
c = _mm256_cvtepi32_ps(local_code_chip_index_reg);
negatives = _mm256_cmp_ps(c, zeros, 0x01 );
negatives = _mm256_cmp_ps(c, zeros, 0x01);
aux3 = _mm256_and_ps(code_length_chips_reg_f, negatives);
aux = _mm256_add_ps(c, aux3);
local_code_chip_index_reg = _mm256_cvttps_epi32(aux);
_mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg);
for(k = 0; k < 8; ++k)
for (k = 0; k < 8; ++k)
{
_result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]];
}
@ -429,12 +434,12 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_a_avx(float** result, co
_mm256_zeroupper();
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
{
for(n = avx_iters * 8; n < num_points; n++)
for (n = avx_iters * 8; n < num_points; n++)
{
// resample code for current tap
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
//Take into account that in multitap correlators, the shifts can be negative!
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ;
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
_result[current_correlator_tap][n] = local_code[local_code_chip_index_];
}
@ -457,7 +462,8 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_u_avx(float** result, co
const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips);
const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips);
__VOLK_ATTR_ALIGNED(32) int local_code_chip_index[8];
__VOLK_ATTR_ALIGNED(32)
int local_code_chip_index[8];
int local_code_chip_index_;
const __m256 zeros = _mm256_setzero_ps();
@ -472,7 +478,7 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_u_avx(float** result, co
shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]);
aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
indexn = n0;
for(n = 0; n < avx_iters; n++)
for (n = 0; n < avx_iters; n++)
{
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0);
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3);
@ -490,13 +496,13 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_u_avx(float** result, co
// no negatives
c = _mm256_cvtepi32_ps(local_code_chip_index_reg);
negatives = _mm256_cmp_ps(c, zeros, 0x01 );
negatives = _mm256_cmp_ps(c, zeros, 0x01);
aux3 = _mm256_and_ps(code_length_chips_reg_f, negatives);
aux = _mm256_add_ps(c, aux3);
local_code_chip_index_reg = _mm256_cvttps_epi32(aux);
_mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg);
for(k = 0; k < 8; ++k)
for (k = 0; k < 8; ++k)
{
_result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]];
}
@ -506,12 +512,12 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_u_avx(float** result, co
_mm256_zeroupper();
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
{
for(n = avx_iters * 8; n < num_points; n++)
for (n = avx_iters * 8; n < num_points; n++)
{
// resample code for current tap
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
//Take into account that in multitap correlators, the shifts can be negative!
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ;
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
_result[current_correlator_tap][n] = local_code[local_code_chip_index_];
}
@ -536,19 +542,21 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_neon(float** result, con
const float32x4_t rem_code_phase_chips_reg = vdupq_n_f32(rem_code_phase_chips);
const float32x4_t code_phase_step_chips_reg = vdupq_n_f32(code_phase_step_chips);
__VOLK_ATTR_ALIGNED(16) int32_t local_code_chip_index[4];
__VOLK_ATTR_ALIGNED(16)
int32_t local_code_chip_index[4];
int32_t local_code_chip_index_;
const int32x4_t zeros = vdupq_n_s32(0);
const float32x4_t code_length_chips_reg_f = vdupq_n_f32((float)code_length_chips);
const int32x4_t code_length_chips_reg_i = vdupq_n_s32((int32_t)code_length_chips);
int32x4_t local_code_chip_index_reg, aux_i, negatives, i;
int32x4_t local_code_chip_index_reg, aux_i, negatives, i;
float32x4_t aux, aux2, shifts_chips_reg, fi, c, j, cTrunc, base, indexn, reciprocal;
__VOLK_ATTR_ALIGNED(16) const float vec[4] = { 0.0f, 1.0f, 2.0f, 3.0f };
__VOLK_ATTR_ALIGNED(16)
const float vec[4] = {0.0f, 1.0f, 2.0f, 3.0f};
uint32x4_t igx;
reciprocal = vrecpeq_f32(code_length_chips_reg_f);
reciprocal = vmulq_f32(vrecpsq_f32(code_length_chips_reg_f, reciprocal), reciprocal);
reciprocal = vmulq_f32(vrecpsq_f32(code_length_chips_reg_f, reciprocal), reciprocal); // this refinement is required!
reciprocal = vmulq_f32(vrecpsq_f32(code_length_chips_reg_f, reciprocal), reciprocal); // this refinement is required!
float32x4_t n0 = vld1q_f32((float*)vec);
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
@ -556,7 +564,7 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_neon(float** result, con
shifts_chips_reg = vdupq_n_f32((float)shifts_chips[current_correlator_tap]);
aux2 = vsubq_f32(shifts_chips_reg, rem_code_phase_chips_reg);
indexn = n0;
for(n = 0; n < neon_iters; n++)
for (n = 0; n < neon_iters; n++)
{
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][4 * n + 3], 1, 0);
__VOLK_GNSSSDR_PREFETCH(&local_code_chip_index[4]);
@ -572,7 +580,7 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_neon(float** result, con
// fmod
c = vmulq_f32(aux, reciprocal);
i = vcvtq_s32_f32(c);
i = vcvtq_s32_f32(c);
cTrunc = vcvtq_f32_s32(i);
base = vmulq_f32(cTrunc, code_length_chips_reg_f);
aux = vsubq_f32(aux, base);
@ -584,13 +592,13 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_neon(float** result, con
vst1q_s32((int32_t*)local_code_chip_index, local_code_chip_index_reg);
for(k = 0; k < 4; ++k)
for (k = 0; k < 4; ++k)
{
_result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
}
indexn = vaddq_f32(indexn, fours);
}
for(n = neon_iters * 4; n < num_points; n++)
for (n = neon_iters * 4; n < num_points; n++)
{
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][n], 1, 0);
// resample code for current tap
@ -606,5 +614,3 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_neon(float** result, con
#endif
#endif /*INCLUDED_volk_gnsssdr_32f_xn_resampler_32f_xn_H*/
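
Editor's note: every SIMD branch in this file falls back to the same scalar logic for the remainder loop, isolated below as a sketch (not commit code; the helper name is invented). The floored code phase can go negative when a correlator tap has a negative shift, so the index is first lifted by a whole number of code periods before the final modulo lands it in [0, code_length_chips).

#include <math.h>
#include <stdlib.h>

/* Illustrative sketch, not commit code: the scalar remainder logic shared
   by the SSE3, SSE4.1, AVX and NEON branches above. */
static int resample_chip_index_sketch(float code_phase_step_chips,
    unsigned int n, float shift_chips, float rem_code_phase_chips,
    int code_length_chips)
{
    int idx = (int)floor(code_phase_step_chips * (float)n + shift_chips - rem_code_phase_chips);
    /* Multitap shifts can be negative: push the index back into range first */
    if (idx < 0)
        idx += code_length_chips * (abs(idx) / code_length_chips + 1);
    return idx % code_length_chips;
}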

View File

@ -85,11 +85,11 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic(lv_32f
unsigned int n;
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
result[n_vec] = lv_cmake(0,0);
result[n_vec] = lv_cmake(0, 0);
}
for (n = 0; n < num_points; n++)
{
tmp32_1 = *in_common++ * (*phase);//if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase));
tmp32_1 = *in_common++ * (*phase); //if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase));
// Regenerate phase
if (n % 256 == 0)
@ -126,7 +126,7 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic_reload
unsigned int j;
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
result[n_vec] = lv_cmake(0,0);
result[n_vec] = lv_cmake(0, 0);
}
for (n = 0; n < num_points / ROTATOR_RELOAD; n++)
@ -141,7 +141,7 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic_reload
result[n_vec] += tmp32_2;
}
}
/* Regenerate phase */
/* Regenerate phase */
#ifdef __cplusplus
(*phase) /= std::abs((*phase));
#else
@ -175,8 +175,8 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_
const unsigned int sixteenthPoints = num_points / 16;
const float* aPtr = (float*)in_common;
const float* bPtr[ num_a_vectors];
for( vec_ind = 0; vec_ind < num_a_vectors; ++vec_ind )
const float* bPtr[num_a_vectors];
for (vec_ind = 0; vec_ind < num_a_vectors; ++vec_ind)
{
bPtr[vec_ind] = in_a[vec_ind];
}
@ -194,7 +194,7 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_
__m256 dotProdVal2[num_a_vectors];
__m256 dotProdVal3[num_a_vectors];
for( vec_ind = 0; vec_ind < num_a_vectors; vec_ind++ )
for (vec_ind = 0; vec_ind < num_a_vectors; vec_ind++)
{
dotProdVal0[vec_ind] = _mm256_setzero_ps();
dotProdVal1[vec_ind] = _mm256_setzero_ps();
@ -204,57 +204,62 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_
// Set up the complex rotator
__m256 z0, z1, z2, z3;
__VOLK_ATTR_ALIGNED(32) lv_32fc_t phase_vec[16];
for( vec_ind = 0; vec_ind < 16; ++vec_ind )
__VOLK_ATTR_ALIGNED(32)
lv_32fc_t phase_vec[16];
for (vec_ind = 0; vec_ind < 16; ++vec_ind)
{
phase_vec[vec_ind] = _phase;
_phase *= phase_inc;
}
z0 = _mm256_load_ps( (float *)phase_vec );
z1 = _mm256_load_ps( (float *)(phase_vec + 4) );
z2 = _mm256_load_ps( (float *)(phase_vec + 8) );
z3 = _mm256_load_ps( (float *)(phase_vec + 12) );
z0 = _mm256_load_ps((float*)phase_vec);
z1 = _mm256_load_ps((float*)(phase_vec + 4));
z2 = _mm256_load_ps((float*)(phase_vec + 8));
z3 = _mm256_load_ps((float*)(phase_vec + 12));
lv_32fc_t dz = phase_inc; dz *= dz; dz *= dz; dz *= dz; dz *= dz; // dz = phase_inc^16;
lv_32fc_t dz = phase_inc;
dz *= dz;
dz *= dz;
dz *= dz;
dz *= dz; // dz = phase_inc^16;
for( vec_ind = 0; vec_ind < 4; ++vec_ind )
for (vec_ind = 0; vec_ind < 4; ++vec_ind)
{
phase_vec[vec_ind] = dz;
}
__m256 dz_reg = _mm256_load_ps( (float *)phase_vec );
dz_reg = _mm256_complexnormalise_ps( dz_reg );
__m256 dz_reg = _mm256_load_ps((float*)phase_vec);
dz_reg = _mm256_complexnormalise_ps(dz_reg);
for(;number < sixteenthPoints; number++)
for (; number < sixteenthPoints; number++)
{
a0Val = _mm256_loadu_ps(aPtr);
a1Val = _mm256_loadu_ps(aPtr+8);
a2Val = _mm256_loadu_ps(aPtr+16);
a3Val = _mm256_loadu_ps(aPtr+24);
a1Val = _mm256_loadu_ps(aPtr + 8);
a2Val = _mm256_loadu_ps(aPtr + 16);
a3Val = _mm256_loadu_ps(aPtr + 24);
a0Val = _mm256_complexmul_ps( a0Val, z0 );
a1Val = _mm256_complexmul_ps( a1Val, z1 );
a2Val = _mm256_complexmul_ps( a2Val, z2 );
a3Val = _mm256_complexmul_ps( a3Val, z3 );
a0Val = _mm256_complexmul_ps(a0Val, z0);
a1Val = _mm256_complexmul_ps(a1Val, z1);
a2Val = _mm256_complexmul_ps(a2Val, z2);
a3Val = _mm256_complexmul_ps(a3Val, z3);
z0 = _mm256_complexmul_ps( z0, dz_reg );
z1 = _mm256_complexmul_ps( z1, dz_reg );
z2 = _mm256_complexmul_ps( z2, dz_reg );
z3 = _mm256_complexmul_ps( z3, dz_reg );
z0 = _mm256_complexmul_ps(z0, dz_reg);
z1 = _mm256_complexmul_ps(z1, dz_reg);
z2 = _mm256_complexmul_ps(z2, dz_reg);
z3 = _mm256_complexmul_ps(z3, dz_reg);
for( vec_ind = 0; vec_ind < num_a_vectors; ++vec_ind )
for (vec_ind = 0; vec_ind < num_a_vectors; ++vec_ind)
{
x0Val[vec_ind] = _mm256_loadu_ps(bPtr[vec_ind]); // t0|t1|t2|t3|t4|t5|t6|t7
x1Val[vec_ind] = _mm256_loadu_ps(bPtr[vec_ind]+8);
x0loVal[vec_ind] = _mm256_unpacklo_ps(x0Val[vec_ind], x0Val[vec_ind]); // t0|t0|t1|t1|t4|t4|t5|t5
x0hiVal[vec_ind] = _mm256_unpackhi_ps(x0Val[vec_ind], x0Val[vec_ind]); // t2|t2|t3|t3|t6|t6|t7|t7
x0Val[vec_ind] = _mm256_loadu_ps(bPtr[vec_ind]); // t0|t1|t2|t3|t4|t5|t6|t7
x1Val[vec_ind] = _mm256_loadu_ps(bPtr[vec_ind] + 8);
x0loVal[vec_ind] = _mm256_unpacklo_ps(x0Val[vec_ind], x0Val[vec_ind]); // t0|t0|t1|t1|t4|t4|t5|t5
x0hiVal[vec_ind] = _mm256_unpackhi_ps(x0Val[vec_ind], x0Val[vec_ind]); // t2|t2|t3|t3|t6|t6|t7|t7
x1loVal[vec_ind] = _mm256_unpacklo_ps(x1Val[vec_ind], x1Val[vec_ind]);
x1hiVal[vec_ind] = _mm256_unpackhi_ps(x1Val[vec_ind], x1Val[vec_ind]);
// TODO: it may be possible to rearrange swizzling to better pipeline data
b0Val[vec_ind] = _mm256_permute2f128_ps(x0loVal[vec_ind], x0hiVal[vec_ind], 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
b1Val[vec_ind] = _mm256_permute2f128_ps(x0loVal[vec_ind], x0hiVal[vec_ind], 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
b0Val[vec_ind] = _mm256_permute2f128_ps(x0loVal[vec_ind], x0hiVal[vec_ind], 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
b1Val[vec_ind] = _mm256_permute2f128_ps(x0loVal[vec_ind], x0hiVal[vec_ind], 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
b2Val[vec_ind] = _mm256_permute2f128_ps(x1loVal[vec_ind], x1hiVal[vec_ind], 0x20);
b3Val[vec_ind] = _mm256_permute2f128_ps(x1loVal[vec_ind], x1hiVal[vec_ind], 0x31);
@ -274,43 +279,44 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_
// Force the rotators back onto the unit circle
if ((number % 64) == 0)
{
z0 = _mm256_complexnormalise_ps( z0 );
z1 = _mm256_complexnormalise_ps( z1 );
z2 = _mm256_complexnormalise_ps( z2 );
z3 = _mm256_complexnormalise_ps( z3 );
z0 = _mm256_complexnormalise_ps(z0);
z1 = _mm256_complexnormalise_ps(z1);
z2 = _mm256_complexnormalise_ps(z2);
z3 = _mm256_complexnormalise_ps(z3);
}
aPtr += 32;
}
__VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4];
__VOLK_ATTR_ALIGNED(32)
lv_32fc_t dotProductVector[4];
for( vec_ind = 0; vec_ind < num_a_vectors; ++vec_ind )
for (vec_ind = 0; vec_ind < num_a_vectors; ++vec_ind)
{
dotProdVal0[vec_ind] = _mm256_add_ps(dotProdVal0[vec_ind], dotProdVal1[vec_ind]);
dotProdVal0[vec_ind] = _mm256_add_ps(dotProdVal0[vec_ind], dotProdVal2[vec_ind]);
dotProdVal0[vec_ind] = _mm256_add_ps(dotProdVal0[vec_ind], dotProdVal3[vec_ind]);
_mm256_store_ps((float *)dotProductVector, dotProdVal0[vec_ind]); // Store the results back into the dot product vector
_mm256_store_ps((float*)dotProductVector, dotProdVal0[vec_ind]); // Store the results back into the dot product vector
result[ vec_ind ] = lv_cmake( 0, 0 );
for( i = 0; i < 4; ++i )
result[vec_ind] = lv_cmake(0, 0);
for (i = 0; i < 4; ++i)
{
result[vec_ind] += dotProductVector[i];
}
}
z0 = _mm256_complexnormalise_ps( z0 );
z0 = _mm256_complexnormalise_ps(z0);
_mm256_store_ps((float*)phase_vec, z0);
_phase = phase_vec[0];
_mm256_zeroupper();
number = sixteenthPoints*16;
for(;number < num_points; number++)
number = sixteenthPoints * 16;
for (; number < num_points; number++)
{
wo = (*aPtr++)*_phase;
wo = (*aPtr++) * _phase;
_phase *= phase_inc;
for( vec_ind = 0; vec_ind < num_a_vectors; ++vec_ind )
for (vec_ind = 0; vec_ind < num_a_vectors; ++vec_ind)
{
result[vec_ind] += wo * in_a[vec_ind][number];
}
@ -333,8 +339,8 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_
const unsigned int sixteenthPoints = num_points / 16;
const float* aPtr = (float*)in_common;
const float* bPtr[ num_a_vectors];
for( vec_ind = 0; vec_ind < num_a_vectors; ++vec_ind )
const float* bPtr[num_a_vectors];
for (vec_ind = 0; vec_ind < num_a_vectors; ++vec_ind)
{
bPtr[vec_ind] = in_a[vec_ind];
}
@ -352,7 +358,7 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_
__m256 dotProdVal2[num_a_vectors];
__m256 dotProdVal3[num_a_vectors];
for( vec_ind = 0; vec_ind < num_a_vectors; vec_ind++ )
for (vec_ind = 0; vec_ind < num_a_vectors; vec_ind++)
{
dotProdVal0[vec_ind] = _mm256_setzero_ps();
dotProdVal1[vec_ind] = _mm256_setzero_ps();
@ -362,58 +368,62 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_
// Set up the complex rotator
__m256 z0, z1, z2, z3;
__VOLK_ATTR_ALIGNED(32) lv_32fc_t phase_vec[16];
for( vec_ind = 0; vec_ind < 16; ++vec_ind )
__VOLK_ATTR_ALIGNED(32)
lv_32fc_t phase_vec[16];
for (vec_ind = 0; vec_ind < 16; ++vec_ind)
{
phase_vec[vec_ind] = _phase;
_phase *= phase_inc;
}
z0 = _mm256_load_ps( (float *)phase_vec );
z1 = _mm256_load_ps( (float *)(phase_vec + 4) );
z2 = _mm256_load_ps( (float *)(phase_vec + 8) );
z3 = _mm256_load_ps( (float *)(phase_vec + 12) );
z0 = _mm256_load_ps((float*)phase_vec);
z1 = _mm256_load_ps((float*)(phase_vec + 4));
z2 = _mm256_load_ps((float*)(phase_vec + 8));
z3 = _mm256_load_ps((float*)(phase_vec + 12));
lv_32fc_t dz = phase_inc; dz *= dz; dz *= dz; dz *= dz; dz *= dz; // dz = phase_inc^16;
lv_32fc_t dz = phase_inc;
dz *= dz;
dz *= dz;
dz *= dz;
dz *= dz; // dz = phase_inc^16;
for( vec_ind = 0; vec_ind < 4; ++vec_ind )
for (vec_ind = 0; vec_ind < 4; ++vec_ind)
{
phase_vec[vec_ind] = dz;
}
__m256 dz_reg = _mm256_load_ps( (float *)phase_vec );
dz_reg = _mm256_complexnormalise_ps( dz_reg );
__m256 dz_reg = _mm256_load_ps((float*)phase_vec);
dz_reg = _mm256_complexnormalise_ps(dz_reg);
for(;number < sixteenthPoints; number++)
for (; number < sixteenthPoints; number++)
{
a0Val = _mm256_load_ps(aPtr);
a1Val = _mm256_load_ps(aPtr+8);
a2Val = _mm256_load_ps(aPtr+16);
a3Val = _mm256_load_ps(aPtr+24);
a1Val = _mm256_load_ps(aPtr + 8);
a2Val = _mm256_load_ps(aPtr + 16);
a3Val = _mm256_load_ps(aPtr + 24);
a0Val = _mm256_complexmul_ps( a0Val, z0 );
a1Val = _mm256_complexmul_ps( a1Val, z1 );
a2Val = _mm256_complexmul_ps( a2Val, z2 );
a3Val = _mm256_complexmul_ps( a3Val, z3 );
a0Val = _mm256_complexmul_ps(a0Val, z0);
a1Val = _mm256_complexmul_ps(a1Val, z1);
a2Val = _mm256_complexmul_ps(a2Val, z2);
a3Val = _mm256_complexmul_ps(a3Val, z3);
z0 = _mm256_complexmul_ps( z0, dz_reg );
z1 = _mm256_complexmul_ps( z1, dz_reg );
z2 = _mm256_complexmul_ps( z2, dz_reg );
z3 = _mm256_complexmul_ps( z3, dz_reg );
z0 = _mm256_complexmul_ps(z0, dz_reg);
z1 = _mm256_complexmul_ps(z1, dz_reg);
z2 = _mm256_complexmul_ps(z2, dz_reg);
z3 = _mm256_complexmul_ps(z3, dz_reg);
for( vec_ind = 0; vec_ind < num_a_vectors; ++vec_ind )
for (vec_ind = 0; vec_ind < num_a_vectors; ++vec_ind)
{
x0Val[vec_ind] = _mm256_loadu_ps(bPtr[vec_ind]); // t0|t1|t2|t3|t4|t5|t6|t7
x1Val[vec_ind] = _mm256_loadu_ps(bPtr[vec_ind]+8);
x0loVal[vec_ind] = _mm256_unpacklo_ps(x0Val[vec_ind], x0Val[vec_ind]); // t0|t0|t1|t1|t4|t4|t5|t5
x0hiVal[vec_ind] = _mm256_unpackhi_ps(x0Val[vec_ind], x0Val[vec_ind]); // t2|t2|t3|t3|t6|t6|t7|t7
x0Val[vec_ind] = _mm256_loadu_ps(bPtr[vec_ind]); // t0|t1|t2|t3|t4|t5|t6|t7
x1Val[vec_ind] = _mm256_loadu_ps(bPtr[vec_ind] + 8);
x0loVal[vec_ind] = _mm256_unpacklo_ps(x0Val[vec_ind], x0Val[vec_ind]); // t0|t0|t1|t1|t4|t4|t5|t5
x0hiVal[vec_ind] = _mm256_unpackhi_ps(x0Val[vec_ind], x0Val[vec_ind]); // t2|t2|t3|t3|t6|t6|t7|t7
x1loVal[vec_ind] = _mm256_unpacklo_ps(x1Val[vec_ind], x1Val[vec_ind]);
x1hiVal[vec_ind] = _mm256_unpackhi_ps(x1Val[vec_ind], x1Val[vec_ind]);
// TODO: it may be possible to rearrange swizzling to better pipeline data
b0Val[vec_ind] = _mm256_permute2f128_ps(x0loVal[vec_ind], x0hiVal[vec_ind], 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
b1Val[vec_ind] = _mm256_permute2f128_ps(x0loVal[vec_ind], x0hiVal[vec_ind], 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
b0Val[vec_ind] = _mm256_permute2f128_ps(x0loVal[vec_ind], x0hiVal[vec_ind], 0x20); // t0|t0|t1|t1|t2|t2|t3|t3
b1Val[vec_ind] = _mm256_permute2f128_ps(x0loVal[vec_ind], x0hiVal[vec_ind], 0x31); // t4|t4|t5|t5|t6|t6|t7|t7
b2Val[vec_ind] = _mm256_permute2f128_ps(x1loVal[vec_ind], x1hiVal[vec_ind], 0x20);
b3Val[vec_ind] = _mm256_permute2f128_ps(x1loVal[vec_ind], x1hiVal[vec_ind], 0x31);
@ -433,43 +443,44 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_
// Force the rotators back onto the unit circle
if ((number % 64) == 0)
{
z0 = _mm256_complexnormalise_ps( z0 );
z1 = _mm256_complexnormalise_ps( z1 );
z2 = _mm256_complexnormalise_ps( z2 );
z3 = _mm256_complexnormalise_ps( z3 );
z0 = _mm256_complexnormalise_ps(z0);
z1 = _mm256_complexnormalise_ps(z1);
z2 = _mm256_complexnormalise_ps(z2);
z3 = _mm256_complexnormalise_ps(z3);
}
aPtr += 32;
}
__VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4];
__VOLK_ATTR_ALIGNED(32)
lv_32fc_t dotProductVector[4];
for( vec_ind = 0; vec_ind < num_a_vectors; ++vec_ind )
for (vec_ind = 0; vec_ind < num_a_vectors; ++vec_ind)
{
dotProdVal0[vec_ind] = _mm256_add_ps(dotProdVal0[vec_ind], dotProdVal1[vec_ind]);
dotProdVal0[vec_ind] = _mm256_add_ps(dotProdVal0[vec_ind], dotProdVal2[vec_ind]);
dotProdVal0[vec_ind] = _mm256_add_ps(dotProdVal0[vec_ind], dotProdVal3[vec_ind]);
_mm256_store_ps((float *)dotProductVector, dotProdVal0[vec_ind]); // Store the results back into the dot product vector
_mm256_store_ps((float*)dotProductVector, dotProdVal0[vec_ind]); // Store the results back into the dot product vector
result[ vec_ind ] = lv_cmake( 0, 0 );
for( i = 0; i < 4; ++i )
result[vec_ind] = lv_cmake(0, 0);
for (i = 0; i < 4; ++i)
{
result[vec_ind] += dotProductVector[i];
}
}
z0 = _mm256_complexnormalise_ps( z0 );
z0 = _mm256_complexnormalise_ps(z0);
_mm256_store_ps((float*)phase_vec, z0);
_phase = phase_vec[0];
_mm256_zeroupper();
number = sixteenthPoints*16;
for(;number < num_points; number++)
number = sixteenthPoints * 16;
for (; number < num_points; number++)
{
wo = (*aPtr++)*_phase;
wo = (*aPtr++) * _phase;
_phase *= phase_inc;
for( vec_ind = 0; vec_ind < num_a_vectors; ++vec_ind )
for (vec_ind = 0; vec_ind < num_a_vectors; ++vec_ind)
{
result[vec_ind] += wo * in_a[vec_ind][number];
}
@ -482,5 +493,3 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_
#endif /* LV_HAVE_AVX */
#endif /* INCLUDED_volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_H */
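
Editor's note on two details in the rotator hunks above. First, the four chained `dz *= dz;` statements compute phase_inc^16 by repeated squaring, (((p^2)^2)^2)^2, which is the phasor stride of one 16-sample block. Second, both the scalar and AVX paths periodically divide the running phasor by its magnitude so floating-point drift cannot pull it off the unit circle; that is what the "Regenerate phase" and complexnormalise steps do. A scalar sketch of the whole operation (illustrative only, names invented):

#include <complex.h>
#include <math.h>

/* Illustrative sketch, not commit code: the operation the AVX kernels above
   vectorize. Rotate the common input by a running phasor, accumulate against
   every tap vector, and renormalise the phasor periodically. */
static void rotator_dot_prod_sketch(float _Complex* result,
    const float _Complex* in_common, float _Complex phase_inc,
    float _Complex* phase, const float** in_a, int num_a_vectors,
    unsigned int num_points)
{
    int v;
    unsigned int n;
    for (v = 0; v < num_a_vectors; v++)
        result[v] = 0;
    for (n = 0; n < num_points; n++)
        {
            float _Complex rotated = in_common[n] * (*phase);
            if (n % 256 == 0) /* regenerate phase: force it back onto the unit circle */
                (*phase) /= cabsf(*phase);
            (*phase) *= phase_inc;
            for (v = 0; v < num_a_vectors; v++)
                result[v] += rotated * in_a[v][n];
        }
}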

View File

@ -42,7 +42,7 @@
#ifdef LV_HAVE_GENERIC
static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_generic(lv_32fc_t* result, const lv_32fc_t* local_code, const float* in, unsigned int num_points)
static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_generic(lv_32fc_t* result, const lv_32fc_t* local_code, const float* in, unsigned int num_points)
{
// phases must be normalized. Phase rotator expects a complex exponential input!
float rem_carrier_phase_in_rad = 0.25;
@ -53,15 +53,15 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_generic(lv
phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad));
unsigned int n;
int num_a_vectors = 3;
float ** in_a = (float **)volk_gnsssdr_malloc(sizeof(float *) * num_a_vectors, volk_gnsssdr_get_alignment());
for(n = 0; n < num_a_vectors; n++)
float** in_a = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_a_vectors, volk_gnsssdr_get_alignment());
for (n = 0; n < num_a_vectors; n++)
{
in_a[n] = (float *)volk_gnsssdr_malloc(sizeof(float ) * num_points, volk_gnsssdr_get_alignment());
in_a[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment());
memcpy((float*)in_a[n], (float*)in, sizeof(float) * num_points);
}
volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic(result, local_code, phase_inc[0], phase, (const float**) in_a, num_a_vectors, num_points);
volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic(result, local_code, phase_inc[0], phase, (const float**)in_a, num_a_vectors, num_points);
for(n = 0; n < num_a_vectors; n++)
for (n = 0; n < num_a_vectors; n++)
{
volk_gnsssdr_free(in_a[n]);
}
@ -71,7 +71,7 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_generic(lv
#ifdef LV_HAVE_GENERIC
static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_generic_reload(lv_32fc_t* result, const lv_32fc_t* local_code, const float* in, unsigned int num_points)
static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_generic_reload(lv_32fc_t* result, const lv_32fc_t* local_code, const float* in, unsigned int num_points)
{
// phases must be normalized. Phase rotator expects a complex exponential input!
float rem_carrier_phase_in_rad = 0.25;
@ -82,15 +82,15 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_generic_re
phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad));
unsigned int n;
int num_a_vectors = 3;
float ** in_a = (float **)volk_gnsssdr_malloc(sizeof(float *) * num_a_vectors, volk_gnsssdr_get_alignment());
for(n = 0; n < num_a_vectors; n++)
float** in_a = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_a_vectors, volk_gnsssdr_get_alignment());
for (n = 0; n < num_a_vectors; n++)
{
in_a[n] = (float *)volk_gnsssdr_malloc(sizeof(float ) * num_points, volk_gnsssdr_get_alignment());
in_a[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment());
memcpy((float*)in_a[n], (float*)in, sizeof(float) * num_points);
}
volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic_reload(result, local_code, phase_inc[0], phase, (const float**) in_a, num_a_vectors, num_points);
volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_generic_reload(result, local_code, phase_inc[0], phase, (const float**)in_a, num_a_vectors, num_points);
for(n = 0; n < num_a_vectors; n++)
for (n = 0; n < num_a_vectors; n++)
{
volk_gnsssdr_free(in_a[n]);
}
@ -100,7 +100,7 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_generic_re
#endif // Generic
#ifdef LV_HAVE_AVX
static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_u_avx(lv_32fc_t* result, const lv_32fc_t* local_code, const float* in, unsigned int num_points)
static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_u_avx(lv_32fc_t* result, const lv_32fc_t* local_code, const float* in, unsigned int num_points)
{
// phases must be normalized. Phase rotator expects a complex exponential input!
float rem_carrier_phase_in_rad = 0.25;
@ -111,15 +111,15 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_u_avx(lv_3
phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad));
unsigned int n;
int num_a_vectors = 3;
float ** in_a = (float **)volk_gnsssdr_malloc(sizeof(float *) * num_a_vectors, volk_gnsssdr_get_alignment());
for(n = 0; n < num_a_vectors; n++)
float** in_a = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_a_vectors, volk_gnsssdr_get_alignment());
for (n = 0; n < num_a_vectors; n++)
{
in_a[n] = (float *)volk_gnsssdr_malloc(sizeof(float ) * num_points, volk_gnsssdr_get_alignment());
in_a[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment());
memcpy((float*)in_a[n], (float*)in, sizeof(float) * num_points);
}
volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_u_avx(result, local_code, phase_inc[0], phase, (const float**) in_a, num_a_vectors, num_points);
volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_u_avx(result, local_code, phase_inc[0], phase, (const float**)in_a, num_a_vectors, num_points);
for(n = 0; n < num_a_vectors; n++)
for (n = 0; n < num_a_vectors; n++)
{
volk_gnsssdr_free(in_a[n]);
}
@ -130,7 +130,7 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_u_avx(lv_3
#ifdef LV_HAVE_AVX
static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_a_avx(lv_32fc_t* result, const lv_32fc_t* local_code, const float* in, unsigned int num_points)
static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_a_avx(lv_32fc_t* result, const lv_32fc_t* local_code, const float* in, unsigned int num_points)
{
// phases must be normalized. Phase rotator expects a complex exponential input!
float rem_carrier_phase_in_rad = 0.25;
@ -141,15 +141,15 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_a_avx(lv_3
phase_inc[0] = lv_cmake(cos(phase_step_rad), sin(phase_step_rad));
unsigned int n;
int num_a_vectors = 3;
float ** in_a = (float **)volk_gnsssdr_malloc(sizeof(float *) * num_a_vectors, volk_gnsssdr_get_alignment());
for(n = 0; n < num_a_vectors; n++)
float** in_a = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_a_vectors, volk_gnsssdr_get_alignment());
for (n = 0; n < num_a_vectors; n++)
{
in_a[n] = (float *)volk_gnsssdr_malloc(sizeof(float ) * num_points, volk_gnsssdr_get_alignment());
in_a[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment());
memcpy((float*)in_a[n], (float*)in, sizeof(float) * num_points);
}
volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_a_avx(result, local_code, phase_inc[0], phase, (const float**) in_a, num_a_vectors, num_points);
volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_a_avx(result, local_code, phase_inc[0], phase, (const float**)in_a, num_a_vectors, num_points);
for(n = 0; n < num_a_vectors; n++)
for (n = 0; n < num_a_vectors; n++)
{
volk_gnsssdr_free(in_a[n]);
}
@ -159,4 +159,3 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_a_avx(lv_3
#endif // AVX
#endif // INCLUDED_volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_H
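
Editor's note: all four puppet wrappers above share the same scaffolding, which the reformatting makes easier to see: replicate the test input into num_a_vectors aligned buffers, call the kernel under test, then free everything. The sketch below factors out the setup half (not commit code; the helper name is invented, but volk_gnsssdr_malloc, volk_gnsssdr_get_alignment and volk_gnsssdr_free are the library calls used above). Each returned buffer, and the pointer array itself, must later be released with volk_gnsssdr_free().

#include <string.h>
#include <volk_gnsssdr/volk_gnsssdr.h>

/* Illustrative sketch, not commit code: the aligned allocate-and-copy
   pattern repeated by every puppet wrapper in this file. */
static float** make_aligned_inputs(const float* in, int num_a_vectors,
    unsigned int num_points)
{
    int n;
    float** in_a = (float**)volk_gnsssdr_malloc(sizeof(float*) * num_a_vectors, volk_gnsssdr_get_alignment());
    for (n = 0; n < num_a_vectors; n++)
        {
            in_a[n] = (float*)volk_gnsssdr_malloc(sizeof(float) * num_points, volk_gnsssdr_get_alignment());
            memcpy(in_a[n], in, sizeof(float) * num_points);
        }
    return in_a;
}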

View File

@ -80,10 +80,12 @@ static inline void volk_gnsssdr_32fc_convert_16ic_u_sse2(lv_16sc_t* outputVector
const __m128 vmin_val = _mm_set_ps1(min_val);
const __m128 vmax_val = _mm_set_ps1(max_val);
for(i = 0; i < sse_iters; i++)
for (i = 0; i < sse_iters; i++)
{
inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
inputVal1 = _mm_loadu_ps((float*)inputVectorPtr);
inputVectorPtr += 4;
inputVal2 = _mm_loadu_ps((float*)inputVectorPtr);
inputVectorPtr += 4;
__VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 8);
// Clip
@ -99,12 +101,12 @@ static inline void volk_gnsssdr_32fc_convert_16ic_u_sse2(lv_16sc_t* outputVector
outputVectorPtr += 8;
}
for(i = sse_iters * 8; i < num_points * 2; i++)
for (i = sse_iters * 8; i < num_points * 2; i++)
{
aux = *inputVectorPtr++;
if(aux > max_val)
if (aux > max_val)
aux = max_val;
else if(aux < min_val)
else if (aux < min_val)
aux = min_val;
*outputVectorPtr++ = (int16_t)rintf(aux);
}
@ -128,15 +130,17 @@ static inline void volk_gnsssdr_32fc_convert_16ic_u_sse(lv_16sc_t* outputVector,
const float max_val = (float)SHRT_MAX;
__m128 inputVal1, inputVal2;
__m128i intInputVal1, intInputVal2; // is __m128i defined in xmmintrin.h?
__m128i intInputVal1, intInputVal2; // is __m128i defined in xmmintrin.h?
__m128 ret1, ret2;
const __m128 vmin_val = _mm_set_ps1(min_val);
const __m128 vmax_val = _mm_set_ps1(max_val);
for(i = 0;i < sse_iters; i++)
for (i = 0; i < sse_iters; i++)
{
inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
inputVal1 = _mm_loadu_ps((float*)inputVectorPtr);
inputVectorPtr += 4;
inputVal2 = _mm_loadu_ps((float*)inputVectorPtr);
inputVectorPtr += 4;
__VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 8);
// Clip
@ -152,12 +156,12 @@ static inline void volk_gnsssdr_32fc_convert_16ic_u_sse(lv_16sc_t* outputVector,
outputVectorPtr += 8;
}
for(i = sse_iters * 8; i < num_points*2; i++)
for (i = sse_iters * 8; i < num_points * 2; i++)
{
aux = *inputVectorPtr++;
if(aux > max_val)
if (aux > max_val)
aux = max_val;
else if(aux < min_val)
else if (aux < min_val)
aux = min_val;
*outputVectorPtr++ = (int16_t)rintf(aux);
}
@ -175,7 +179,7 @@ static inline void volk_gnsssdr_32fc_convert_16ic_u_avx2(lv_16sc_t* outputVector
int16_t* outputVectorPtr = (int16_t*)outputVector;
float aux;
unsigned int i;
const float min_val = (float)SHRT_MIN; ///todo Something off here, compiler does not perform right cast
const float min_val = (float)SHRT_MIN; ///todo Something off here, compiler does not perform right cast
const float max_val = (float)SHRT_MAX;
__m256 inputVal1, inputVal2;
@ -184,10 +188,12 @@ static inline void volk_gnsssdr_32fc_convert_16ic_u_avx2(lv_16sc_t* outputVector
const __m256 vmin_val = _mm256_set1_ps(min_val);
const __m256 vmax_val = _mm256_set1_ps(max_val);
for(i = 0; i < avx2_iters; i++)
for (i = 0; i < avx2_iters; i++)
{
inputVal1 = _mm256_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 8;
inputVal2 = _mm256_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 8;
inputVal1 = _mm256_loadu_ps((float*)inputVectorPtr);
inputVectorPtr += 8;
inputVal2 = _mm256_loadu_ps((float*)inputVectorPtr);
inputVectorPtr += 8;
__VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 16);
// Clip
@ -204,12 +210,12 @@ static inline void volk_gnsssdr_32fc_convert_16ic_u_avx2(lv_16sc_t* outputVector
outputVectorPtr += 16;
}
for(i = avx2_iters * 16; i < num_points * 2; i++)
for (i = avx2_iters * 16; i < num_points * 2; i++)
{
aux = *inputVectorPtr++;
if(aux > max_val)
if (aux > max_val)
aux = max_val;
else if(aux < min_val)
else if (aux < min_val)
aux = min_val;
*outputVectorPtr++ = (int16_t)rintf(aux);
}
@ -238,10 +244,12 @@ static inline void volk_gnsssdr_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector
const __m128 vmin_val = _mm_set_ps1(min_val);
const __m128 vmax_val = _mm_set_ps1(max_val);
for(i = 0; i < sse_iters; i++)
for (i = 0; i < sse_iters; i++)
{
inputVal1 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
inputVal2 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
inputVal1 = _mm_load_ps((float*)inputVectorPtr);
inputVectorPtr += 4;
inputVal2 = _mm_load_ps((float*)inputVectorPtr);
inputVectorPtr += 4;
__VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 8);
// Clip
@ -257,12 +265,12 @@ static inline void volk_gnsssdr_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector
outputVectorPtr += 8;
}
for(i = sse_iters * 8; i < num_points * 2; i++)
for (i = sse_iters * 8; i < num_points * 2; i++)
{
aux = *inputVectorPtr++;
if(aux > max_val)
if (aux > max_val)
aux = max_val;
else if(aux < min_val)
else if (aux < min_val)
aux = min_val;
*outputVectorPtr++ = (int16_t)rintf(aux);
}
@ -289,10 +297,12 @@ static inline void volk_gnsssdr_32fc_convert_16ic_a_sse(lv_16sc_t* outputVector,
const __m128 vmin_val = _mm_set_ps1(min_val);
const __m128 vmax_val = _mm_set_ps1(max_val);
for(i = 0; i < sse_iters; i++)
for (i = 0; i < sse_iters; i++)
{
inputVal1 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
inputVal2 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
inputVal1 = _mm_load_ps((float*)inputVectorPtr);
inputVectorPtr += 4;
inputVal2 = _mm_load_ps((float*)inputVectorPtr);
inputVectorPtr += 4;
__VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 8);
// Clip
@ -308,12 +318,12 @@ static inline void volk_gnsssdr_32fc_convert_16ic_a_sse(lv_16sc_t* outputVector,
outputVectorPtr += 8;
}
for(i = sse_iters * 8; i < num_points * 2; i++)
for (i = sse_iters * 8; i < num_points * 2; i++)
{
aux = *inputVectorPtr++;
if(aux > max_val)
if (aux > max_val)
aux = max_val;
else if(aux < min_val)
else if (aux < min_val)
aux = min_val;
*outputVectorPtr++ = (int16_t)rintf(aux);
}
@ -332,7 +342,7 @@ static inline void volk_gnsssdr_32fc_convert_16ic_a_avx2(lv_16sc_t* outputVector
int16_t* outputVectorPtr = (int16_t*)outputVector;
float aux;
unsigned int i;
const float min_val = (float)SHRT_MIN; ///todo Something off here, compiler does not perform right cast
const float min_val = (float)SHRT_MIN; ///todo Something off here, compiler does not perform right cast
const float max_val = (float)SHRT_MAX;
__m256 inputVal1, inputVal2;
@ -341,10 +351,12 @@ static inline void volk_gnsssdr_32fc_convert_16ic_a_avx2(lv_16sc_t* outputVector
const __m256 vmin_val = _mm256_set1_ps(min_val);
const __m256 vmax_val = _mm256_set1_ps(max_val);
for(i = 0; i < avx2_iters; i++)
for (i = 0; i < avx2_iters; i++)
{
inputVal1 = _mm256_load_ps((float*)inputVectorPtr); inputVectorPtr += 8;
inputVal2 = _mm256_load_ps((float*)inputVectorPtr); inputVectorPtr += 8;
inputVal1 = _mm256_load_ps((float*)inputVectorPtr);
inputVectorPtr += 8;
inputVal2 = _mm256_load_ps((float*)inputVectorPtr);
inputVectorPtr += 8;
__VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 16);
// Clip
@ -361,12 +373,12 @@ static inline void volk_gnsssdr_32fc_convert_16ic_a_avx2(lv_16sc_t* outputVector
outputVectorPtr += 16;
}
for(i = avx2_iters * 16; i < num_points * 2; i++)
for (i = avx2_iters * 16; i < num_points * 2; i++)
{
aux = *inputVectorPtr++;
if(aux > max_val)
if (aux > max_val)
aux = max_val;
else if(aux < min_val)
else if (aux < min_val)
aux = min_val;
*outputVectorPtr++ = (int16_t)rintf(aux);
}
@ -397,10 +409,12 @@ static inline void volk_gnsssdr_32fc_convert_16ic_neon(lv_16sc_t* outputVector,
int16x4_t intInputVal1, intInputVal2;
int16x8_t res;
for(i = 0; i < neon_iters; i++)
for (i = 0; i < neon_iters; i++)
{
a = vld1q_f32((const float32_t*)(inputVectorPtr)); inputVectorPtr += 4;
b = vld1q_f32((const float32_t*)(inputVectorPtr)); inputVectorPtr += 4;
a = vld1q_f32((const float32_t*)(inputVectorPtr));
inputVectorPtr += 4;
b = vld1q_f32((const float32_t*)(inputVectorPtr));
inputVectorPtr += 4;
__VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 8);
ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val);
@ -425,12 +439,12 @@ static inline void volk_gnsssdr_32fc_convert_16ic_neon(lv_16sc_t* outputVector,
outputVectorPtr += 8;
}
for(i = neon_iters * 8; i < num_points * 2; i++)
for (i = neon_iters * 8; i < num_points * 2; i++)
{
aux = *inputVectorPtr++;
if(aux > max_val_f)
if (aux > max_val_f)
aux = max_val_f;
else if(aux < min_val_f)
else if (aux < min_val_f)
aux = min_val_f;
*outputVectorPtr++ = (int16_t)rintf(aux);
}
@ -449,14 +463,14 @@ static inline void volk_gnsssdr_32fc_convert_16ic_generic(lv_16sc_t* outputVecto
const float max_val = (float)SHRT_MAX;
float aux;
unsigned int i;
for(i = 0; i < num_points * 2; i++)
for (i = 0; i < num_points * 2; i++)
{
aux = *inputVectorPtr++;
if(aux > max_val)
if (aux > max_val)
aux = max_val;
else if(aux < min_val)
else if (aux < min_val)
aux = min_val;
*outputVectorPtr++ = (int16_t)rintf(aux);
*outputVectorPtr++ = (int16_t)rintf(aux);
}
}
#endif /* LV_HAVE_GENERIC */
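
Editor's note: every branch in the 32fc -> 16ic hunks above implements the same per-sample contract, spelled out by the generic version at the end: clamp into the int16_t range, then round to nearest with rintf(). The SIMD paths perform the clamp with vector min/max against vmin_val/vmax_val before converting. Isolated as a sketch (not commit code; the function name is invented):

#include <limits.h>
#include <math.h>
#include <stdint.h>

/* Illustrative sketch, not commit code: the per-sample clip-and-round
   contract of every 32fc -> 16ic branch above. */
static int16_t clip_round_16_sketch(float aux)
{
    const float min_val = (float)SHRT_MIN;
    const float max_val = (float)SHRT_MAX;
    if (aux > max_val)
        aux = max_val;
    else if (aux < min_val)
        aux = min_val;
    return (int16_t)rintf(aux);
}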

View File

@ -72,12 +72,12 @@ static inline void volk_gnsssdr_32fc_convert_8ic_generic(lv_8sc_t* outputVector,
const float max_val = (float)SCHAR_MAX;
float aux;
unsigned int i;
for(i = 0; i < num_points * 2; i++)
for (i = 0; i < num_points * 2; i++)
{
aux = *inputVectorPtr++ * max_val;
if(aux > max_val)
if (aux > max_val)
aux = max_val;
else if(aux < min_val)
else if (aux < min_val)
aux = min_val;
*outputVectorPtr++ = (int8_t)rintf(aux);
}
@ -107,12 +107,16 @@ static inline void volk_gnsssdr_32fc_convert_8ic_u_avx2(lv_8sc_t* outputVector,
const __m256 vmin_val = _mm256_set1_ps(min_val);
const __m256 vmax_val = _mm256_set1_ps(max_val);
for(i = 0; i < avx2_iters; i++)
for (i = 0; i < avx2_iters; i++)
{
inputVal1 = _mm256_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 8;
inputVal2 = _mm256_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 8;
inputVal3 = _mm256_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 8;
inputVal4 = _mm256_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 8;
inputVal1 = _mm256_loadu_ps((float*)inputVectorPtr);
inputVectorPtr += 8;
inputVal2 = _mm256_loadu_ps((float*)inputVectorPtr);
inputVectorPtr += 8;
inputVal3 = _mm256_loadu_ps((float*)inputVectorPtr);
inputVectorPtr += 8;
inputVal4 = _mm256_loadu_ps((float*)inputVectorPtr);
inputVectorPtr += 8;
__VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 32);
inputVal1 = _mm256_mul_ps(inputVal1, vmax_val);
@ -142,12 +146,12 @@ static inline void volk_gnsssdr_32fc_convert_8ic_u_avx2(lv_8sc_t* outputVector,
outputVectorPtr += 32;
}
for(i = avx2_iters * 32; i < num_points * 2; i++)
for (i = avx2_iters * 32; i < num_points * 2; i++)
{
aux = *inputVectorPtr++ * max_val;
if(aux > max_val)
if (aux > max_val)
aux = max_val;
else if(aux < min_val)
else if (aux < min_val)
aux = min_val;
*outputVectorPtr++ = (int8_t)rintf(aux);
}
@ -177,12 +181,16 @@ static inline void volk_gnsssdr_32fc_convert_8ic_a_avx2(lv_8sc_t* outputVector,
const __m256 vmin_val = _mm256_set1_ps(min_val);
const __m256 vmax_val = _mm256_set1_ps(max_val);
for(i = 0; i < avx2_iters; i++)
for (i = 0; i < avx2_iters; i++)
{
inputVal1 = _mm256_load_ps((float*)inputVectorPtr); inputVectorPtr += 8;
inputVal2 = _mm256_load_ps((float*)inputVectorPtr); inputVectorPtr += 8;
inputVal3 = _mm256_load_ps((float*)inputVectorPtr); inputVectorPtr += 8;
inputVal4 = _mm256_load_ps((float*)inputVectorPtr); inputVectorPtr += 8;
inputVal1 = _mm256_load_ps((float*)inputVectorPtr);
inputVectorPtr += 8;
inputVal2 = _mm256_load_ps((float*)inputVectorPtr);
inputVectorPtr += 8;
inputVal3 = _mm256_load_ps((float*)inputVectorPtr);
inputVectorPtr += 8;
inputVal4 = _mm256_load_ps((float*)inputVectorPtr);
inputVectorPtr += 8;
__VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 32);
inputVal1 = _mm256_mul_ps(inputVal1, vmax_val);
@ -212,12 +220,12 @@ static inline void volk_gnsssdr_32fc_convert_8ic_a_avx2(lv_8sc_t* outputVector,
outputVectorPtr += 32;
}
for(i = avx2_iters * 32; i < num_points * 2; i++)
for (i = avx2_iters * 32; i < num_points * 2; i++)
{
aux = *inputVectorPtr++ * max_val;
if(aux > max_val)
if (aux > max_val)
aux = max_val;
else if(aux < min_val)
else if (aux < min_val)
aux = min_val;
*outputVectorPtr++ = (int8_t)rintf(aux);
}
@ -247,12 +255,16 @@ static inline void volk_gnsssdr_32fc_convert_8ic_u_sse2(lv_8sc_t* outputVector,
const __m128 vmin_val = _mm_set_ps1(min_val);
const __m128 vmax_val = _mm_set_ps1(max_val);
for(i = 0; i < sse_iters; i++)
for (i = 0; i < sse_iters; i++)
{
inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
inputVal3 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
inputVal4 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4;
inputVal1 = _mm_loadu_ps((float*)inputVectorPtr);
inputVectorPtr += 4;
inputVal2 = _mm_loadu_ps((float*)inputVectorPtr);
inputVectorPtr += 4;
inputVal3 = _mm_loadu_ps((float*)inputVectorPtr);
inputVectorPtr += 4;
inputVal4 = _mm_loadu_ps((float*)inputVectorPtr);
inputVectorPtr += 4;
inputVal1 = _mm_mul_ps(inputVal1, vmax_val);
inputVal2 = _mm_mul_ps(inputVal2, vmax_val);
@ -278,12 +290,12 @@ static inline void volk_gnsssdr_32fc_convert_8ic_u_sse2(lv_8sc_t* outputVector,
outputVectorPtr += 16;
}
for(i = sse_iters * 16; i < num_points * 2; i++)
for (i = sse_iters * 16; i < num_points * 2; i++)
{
aux = *inputVectorPtr++ * max_val;
if(aux > max_val)
if (aux > max_val)
aux = max_val;
else if(aux < min_val)
else if (aux < min_val)
aux = min_val;
*outputVectorPtr++ = (int8_t)rintf(aux);
}
@ -313,12 +325,16 @@ static inline void volk_gnsssdr_32fc_convert_8ic_a_sse2(lv_8sc_t* outputVector,
const __m128 vmin_val = _mm_set_ps1(min_val);
const __m128 vmax_val = _mm_set_ps1(max_val);
for(i = 0; i < sse_iters; i++)
for (i = 0; i < sse_iters; i++)
{
inputVal1 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
inputVal2 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
inputVal3 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
inputVal4 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4;
inputVal1 = _mm_load_ps((float*)inputVectorPtr);
inputVectorPtr += 4;
inputVal2 = _mm_load_ps((float*)inputVectorPtr);
inputVectorPtr += 4;
inputVal3 = _mm_load_ps((float*)inputVectorPtr);
inputVectorPtr += 4;
inputVal4 = _mm_load_ps((float*)inputVectorPtr);
inputVectorPtr += 4;
inputVal1 = _mm_mul_ps(inputVal1, vmax_val);
inputVal2 = _mm_mul_ps(inputVal2, vmax_val);
@ -344,12 +360,12 @@ static inline void volk_gnsssdr_32fc_convert_8ic_a_sse2(lv_8sc_t* outputVector,
outputVectorPtr += 16;
}
for(i = sse_iters * 16; i < num_points * 2; i++)
for (i = sse_iters * 16; i < num_points * 2; i++)
{
aux = *inputVectorPtr++ * max_val;
if(aux > max_val)
if (aux > max_val)
aux = max_val;
else if(aux < min_val)
else if (aux < min_val)
aux = min_val;
*outputVectorPtr++ = (int8_t)rintf(aux);
}
@ -383,9 +399,10 @@ static inline void volk_gnsssdr_32fc_convert_8ic_neon(lv_8sc_t* outputVector, co
int8x8_t res8_1, res8_2;
int8x16_t outputVal;
for(i = 0; i < neon_iters; i++)
for (i = 0; i < neon_iters; i++)
{
a = vld1q_f32((const float32_t*)inputVectorPtr); inputVectorPtr += 4;
a = vld1q_f32((const float32_t*)inputVectorPtr);
inputVectorPtr += 4;
a = vmulq_f32(a, max_val);
ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val);
sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(ret1), 31)));
@ -394,7 +411,8 @@ static inline void volk_gnsssdr_32fc_convert_8ic_neon(lv_8sc_t* outputVector, co
toint_a = vcvtq_s32_f32(Round);
intInputVal1 = vqmovn_s32(toint_a);
a = vld1q_f32((const float32_t*)inputVectorPtr); inputVectorPtr += 4;
a = vld1q_f32((const float32_t*)inputVectorPtr);
inputVectorPtr += 4;
a = vmulq_f32(a, max_val);
ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val);
sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(ret1), 31)));
@ -406,7 +424,8 @@ static inline void volk_gnsssdr_32fc_convert_8ic_neon(lv_8sc_t* outputVector, co
pack16_8_1 = vcombine_s16(intInputVal1, intInputVal2);
res8_1 = vqmovn_s16(pack16_8_1);
a = vld1q_f32((const float32_t*)inputVectorPtr); inputVectorPtr += 4;
a = vld1q_f32((const float32_t*)inputVectorPtr);
inputVectorPtr += 4;
a = vmulq_f32(a, max_val);
ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val);
sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(ret1), 31)));
@ -415,7 +434,8 @@ static inline void volk_gnsssdr_32fc_convert_8ic_neon(lv_8sc_t* outputVector, co
toint_a = vcvtq_s32_f32(Round);
intInputVal1 = vqmovn_s32(toint_a);
a = vld1q_f32((const float32_t*)inputVectorPtr); inputVectorPtr += 4;
a = vld1q_f32((const float32_t*)inputVectorPtr);
inputVectorPtr += 4;
a = vmulq_f32(a, max_val);
ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val);
sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(ret1), 31)));
@ -433,12 +453,12 @@ static inline void volk_gnsssdr_32fc_convert_8ic_neon(lv_8sc_t* outputVector, co
outputVectorPtr += 16;
}
for(i = neon_iters * 16; i < num_points * 2; i++)
for (i = neon_iters * 16; i < num_points * 2; i++)
{
aux = *inputVectorPtr++ * max_val_f;
if(aux > max_val_f)
if (aux > max_val_f)
aux = max_val_f;
else if(aux < min_val_f)
else if (aux < min_val_f)
aux = min_val_f;
*outputVectorPtr++ = (int8_t)rintf(aux);
}

View File

@ -42,31 +42,30 @@
#include <string.h>
#ifdef LV_HAVE_GENERIC
static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_generic(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points)
{
int code_length_chips = 2046;
float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points );
float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
int num_out_vectors = 3;
float rem_code_phase_chips = -0.234;
unsigned int n;
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
float shifts_chips[3] = {-0.1, 0.0, 0.1};
lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
for(n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
}
lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
for (n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
}
volk_gnsssdr_32fc_xn_resampler_32fc_xn_generic(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points);
for(n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
for (n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
volk_gnsssdr_free(result_aux);
}
@ -78,26 +77,26 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_generic(lv_32fc_t* r
static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_a_sse3(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points)
{
int code_length_chips = 2046;
float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points );
float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
int num_out_vectors = 3;
float rem_code_phase_chips = -0.234;
unsigned int n;
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
float shifts_chips[3] = {-0.1, 0.0, 0.1};
lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
for(n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
}
lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
for (n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
}
volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_sse3(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points);
for(n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
for (n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
volk_gnsssdr_free(result_aux);
}
@ -107,26 +106,26 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_a_sse3(lv_32fc_t* re
static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_u_sse3(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points)
{
int code_length_chips = 2046;
float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points );
float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
int num_out_vectors = 3;
float rem_code_phase_chips = -0.234;
unsigned int n;
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
float shifts_chips[3] = {-0.1, 0.0, 0.1};
lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
for(n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
}
lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
for (n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
}
volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_sse3(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points);
for(n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
for (n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
volk_gnsssdr_free(result_aux);
}
@ -137,26 +136,26 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_u_sse3(lv_32fc_t* re
static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_u_sse4_1(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points)
{
int code_length_chips = 2046;
float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points );
float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
int num_out_vectors = 3;
float rem_code_phase_chips = -0.234;
unsigned int n;
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
float shifts_chips[3] = {-0.1, 0.0, 0.1};
lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
for(n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
}
lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
for (n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
}
volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_sse4_1(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points);
for(n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
for (n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
volk_gnsssdr_free(result_aux);
}
@ -166,26 +165,26 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_u_sse4_1(lv_32fc_t*
static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_a_sse4_1(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points)
{
int code_length_chips = 2046;
float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points );
float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
int num_out_vectors = 3;
float rem_code_phase_chips = -0.234;
unsigned int n;
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
float shifts_chips[3] = {-0.1, 0.0, 0.1};
lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
for(n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
}
lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
for (n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
}
volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_sse4_1(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points);
for(n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
for (n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
volk_gnsssdr_free(result_aux);
}
@ -195,26 +194,26 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_a_sse4_1(lv_32fc_t*
static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_a_avx(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points)
{
int code_length_chips = 2046;
float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points );
float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
int num_out_vectors = 3;
float rem_code_phase_chips = -0.234;
unsigned int n;
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
float shifts_chips[3] = {-0.1, 0.0, 0.1};
lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
for(n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
}
lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
for (n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
}
volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points);
for(n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
for (n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
volk_gnsssdr_free(result_aux);
}
#endif
@ -224,26 +223,26 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_a_avx(lv_32fc_t* res
static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_u_avx(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points)
{
int code_length_chips = 2046;
float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points );
float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
int num_out_vectors = 3;
float rem_code_phase_chips = -0.234;
unsigned int n;
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
float shifts_chips[3] = {-0.1, 0.0, 0.1};
lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
for(n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
}
lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
for (n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
}
volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points);
for(n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
for (n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
volk_gnsssdr_free(result_aux);
}
#endif
@ -253,26 +252,26 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_u_avx(lv_32fc_t* res
static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_a_avx2(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points)
{
int code_length_chips = 2046;
float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points );
float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
int num_out_vectors = 3;
float rem_code_phase_chips = -0.234;
unsigned int n;
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
float shifts_chips[3] = {-0.1, 0.0, 0.1};
lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
for(n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
}
lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
for (n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
}
volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx2(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points);
for(n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
for (n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
volk_gnsssdr_free(result_aux);
}
#endif
@ -282,26 +281,26 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_a_avx2(lv_32fc_t* re
static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_u_avx2(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points)
{
int code_length_chips = 2046;
float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points );
float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
int num_out_vectors = 3;
float rem_code_phase_chips = -0.234;
unsigned int n;
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
float shifts_chips[3] = {-0.1, 0.0, 0.1};
lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
for(n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
}
lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
for (n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
}
volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx2(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points);
for(n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
for (n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
volk_gnsssdr_free(result_aux);
}
#endif
@ -311,28 +310,28 @@ static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_u_avx2(lv_32fc_t* re
static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_neon(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points)
{
int code_length_chips = 2046;
float code_phase_step_chips = ((float)(code_length_chips) + 0.1 )/( (float) num_points );
float code_phase_step_chips = ((float)(code_length_chips) + 0.1) / ((float)num_points);
int num_out_vectors = 3;
float rem_code_phase_chips = -0.234;
unsigned int n;
float shifts_chips[3] = { -0.1, 0.0, 0.1 };
float shifts_chips[3] = {-0.1, 0.0, 0.1};
lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
for(n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
}
lv_32fc_t** result_aux = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_out_vectors, volk_gnsssdr_get_alignment());
for (n = 0; n < num_out_vectors; n++)
{
result_aux[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
}
volk_gnsssdr_32fc_xn_resampler_32fc_xn_neon(result_aux, local_code, rem_code_phase_chips, code_phase_step_chips, shifts_chips, code_length_chips, num_out_vectors, num_points);
memcpy((lv_32fc_t*)result, (lv_32fc_t*)result_aux[0], sizeof(lv_32fc_t) * num_points);
for(n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
for (n = 0; n < num_out_vectors; n++)
{
volk_gnsssdr_free(result_aux[n]);
}
volk_gnsssdr_free(result_aux);
}
#endif
#endif // INCLUDED_volk_gnsssdr_32fc_resamplerpuppet_32fc_H
#endif // INCLUDED_volk_gnsssdr_32fc_resamplerpuppet_32fc_H
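All of the scalar tail loops in this header reduce to the same index arithmetic: take floor(step * n + shift - rem), lift a possibly negative result into non-negative range, then reduce modulo the code length. A standalone sketch of that computation (hypothetical helper name, not library API):

#include <math.h>
#include <stdlib.h>

/* Sketch: nearest-neighbour resampling index for one correlator tap,
 * exactly as done sample-by-sample in the tail loops above. */
static inline int resample_index_sketch(unsigned int n, float code_phase_step_chips,
    float shift_chips, float rem_code_phase_chips, int code_length_chips)
{
    /* chip position of output sample n for this tap */
    int idx = (int)floor(code_phase_step_chips * (float)n + shift_chips - rem_code_phase_chips);
    /* multitap shifts can be negative: lift the index into non-negative range... */
    if (idx < 0) idx += code_length_chips * (abs(idx) / code_length_chips + 1);
    /* ...then reduce modulo the code length */
    return idx % code_length_chips;
}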


@ -85,11 +85,11 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_generic(lv_32fc
unsigned int n;
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
result[n_vec] = lv_cmake(0,0);
result[n_vec] = lv_cmake(0, 0);
}
for (n = 0; n < num_points; n++)
{
tmp32_1 = *in_common++ * (*phase);//if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase));
tmp32_1 = *in_common++ * (*phase); //if(n<10 || n >= 8108) printf("generic phase %i: %f,%f\n", n,lv_creal(*phase),lv_cimag(*phase));
// Regenerate phase
if (n % 256 == 0)
@ -126,7 +126,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_generic_reload(
unsigned int j;
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
result[n_vec] = lv_cmake(0,0);
result[n_vec] = lv_cmake(0, 0);
}
for (n = 0; n < num_points / ROTATOR_RELOAD; n++)
@ -141,7 +141,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_generic_reload(
result[n_vec] += tmp32_2;
}
}
/* Regenerate phase */
/* Regenerate phase */
#ifdef __cplusplus
(*phase) /= std::abs((*phase));
#else
@ -169,7 +169,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_generic_reload(
#include <pmmintrin.h>
static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_32fc_t** in_a, int num_a_vectors, unsigned int num_points)
{
lv_32fc_t dotProduct = lv_cmake(0,0);
lv_32fc_t dotProduct = lv_cmake(0, 0);
lv_32fc_t tmp32_1, tmp32_2;
const unsigned int sse_iters = num_points / 2;
int n_vec;
@ -179,7 +179,8 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(lv_32fc_
const lv_32fc_t** _in_a = in_a;
const lv_32fc_t* _in_common = in_common;
__VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2];
__VOLK_ATTR_ALIGNED(16)
lv_32fc_t dotProductVector[2];
__m128* acc = (__m128*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128), volk_gnsssdr_get_alignment());
@ -191,11 +192,13 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(lv_32fc_
// phase rotation registers
__m128 a, two_phase_acc_reg, two_phase_inc_reg, yl, yh, tmp1, tmp1p, tmp2, tmp2p, z1;
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_inc[2];
__VOLK_ATTR_ALIGNED(16)
lv_32fc_t two_phase_inc[2];
two_phase_inc[0] = phase_inc * phase_inc;
two_phase_inc[1] = phase_inc * phase_inc;
two_phase_inc_reg = _mm_load_ps((float*) two_phase_inc);
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_acc[2];
two_phase_inc_reg = _mm_load_ps((float*)two_phase_inc);
__VOLK_ATTR_ALIGNED(16)
lv_32fc_t two_phase_acc[2];
two_phase_acc[0] = (*phase);
two_phase_acc[1] = (*phase) * phase_inc;
two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc);
@ -203,12 +206,12 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(lv_32fc_
const __m128 ylp = _mm_moveldup_ps(two_phase_inc_reg);
const __m128 yhp = _mm_movehdup_ps(two_phase_inc_reg);
for(number = 0; number < sse_iters; number++)
for (number = 0; number < sse_iters; number++)
{
// Phase rotation on operand in_common starts here:
a = _mm_loadu_ps((float*)_in_common);
// __VOLK_GNSSSDR_PREFETCH(_in_common + 4);
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
// __VOLK_GNSSSDR_PREFETCH(_in_common + 4);
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg);
tmp1 = _mm_mul_ps(a, yl);
tmp1p = _mm_mul_ps(two_phase_acc_reg, ylp);
@ -219,7 +222,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(lv_32fc_
z1 = _mm_addsub_ps(tmp1, tmp2);
two_phase_acc_reg = _mm_addsub_ps(tmp1p, tmp2p);
yl = _mm_moveldup_ps(z1); // Load yl with cr,cr,dr,dr
yl = _mm_moveldup_ps(z1); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(z1);
//next two samples
@ -227,7 +230,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(lv_32fc_
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
a = _mm_loadu_ps((float*)&(_in_a[n_vec][number*2]));
a = _mm_loadu_ps((float*)&(_in_a[n_vec][number * 2]));
tmp1 = _mm_mul_ps(a, yl);
a = _mm_shuffle_ps(a, a, 0xB1);
tmp2 = _mm_mul_ps(a, yh);
@ -247,8 +250,8 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(lv_32fc_
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
_mm_store_ps((float*)dotProductVector, acc[n_vec]); // Store the results back into the dot product vector
dotProduct = lv_cmake(0,0);
_mm_store_ps((float*)dotProductVector, acc[n_vec]); // Store the results back into the dot product vector
dotProduct = lv_cmake(0, 0);
for (i = 0; i < 2; ++i)
{
dotProduct = dotProduct + dotProductVector[i];
@ -260,7 +263,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(lv_32fc_
_mm_store_ps((float*)two_phase_acc, two_phase_acc_reg);
(*phase) = two_phase_acc[0];
for(n = sse_iters * 2; n < num_points; n++)
for (n = sse_iters * 2; n < num_points; n++)
{
tmp32_1 = in_common[n] * (*phase);
(*phase) *= phase_inc;
@ -278,7 +281,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(lv_32fc_
#include <pmmintrin.h>
static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_32fc_t** in_a, int num_a_vectors, unsigned int num_points)
{
lv_32fc_t dotProduct = lv_cmake(0,0);
lv_32fc_t dotProduct = lv_cmake(0, 0);
lv_32fc_t tmp32_1, tmp32_2;
const unsigned int sse_iters = num_points / 2;
int n_vec;
@ -288,7 +291,8 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(lv_32fc_
const lv_32fc_t** _in_a = in_a;
const lv_32fc_t* _in_common = in_common;
__VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2];
__VOLK_ATTR_ALIGNED(16)
lv_32fc_t dotProductVector[2];
__m128* acc = (__m128*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128), volk_gnsssdr_get_alignment());
@ -300,11 +304,13 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(lv_32fc_
// phase rotation registers
__m128 a, two_phase_acc_reg, two_phase_inc_reg, yl, yh, tmp1, tmp1p, tmp2, tmp2p, z1;
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_inc[2];
__VOLK_ATTR_ALIGNED(16)
lv_32fc_t two_phase_inc[2];
two_phase_inc[0] = phase_inc * phase_inc;
two_phase_inc[1] = phase_inc * phase_inc;
two_phase_inc_reg = _mm_load_ps((float*) two_phase_inc);
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_acc[2];
two_phase_inc_reg = _mm_load_ps((float*)two_phase_inc);
__VOLK_ATTR_ALIGNED(16)
lv_32fc_t two_phase_acc[2];
two_phase_acc[0] = (*phase);
two_phase_acc[1] = (*phase) * phase_inc;
two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc);
@ -312,12 +318,12 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(lv_32fc_
const __m128 ylp = _mm_moveldup_ps(two_phase_inc_reg);
const __m128 yhp = _mm_movehdup_ps(two_phase_inc_reg);
for(number = 0; number < sse_iters; number++)
for (number = 0; number < sse_iters; number++)
{
// Phase rotation on operand in_common starts here:
a = _mm_load_ps((float*)_in_common);
// __VOLK_GNSSSDR_PREFETCH(_in_common + 4);
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
// __VOLK_GNSSSDR_PREFETCH(_in_common + 4);
yl = _mm_moveldup_ps(two_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(two_phase_acc_reg);
tmp1 = _mm_mul_ps(a, yl);
tmp1p = _mm_mul_ps(two_phase_acc_reg, ylp);
@ -328,7 +334,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(lv_32fc_
z1 = _mm_addsub_ps(tmp1, tmp2);
two_phase_acc_reg = _mm_addsub_ps(tmp1p, tmp2p);
yl = _mm_moveldup_ps(z1); // Load yl with cr,cr,dr,dr
yl = _mm_moveldup_ps(z1); // Load yl with cr,cr,dr,dr
yh = _mm_movehdup_ps(z1);
//next two samples
@ -336,7 +342,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(lv_32fc_
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
a = _mm_load_ps((float*)&(_in_a[n_vec][number*2]));
a = _mm_load_ps((float*)&(_in_a[n_vec][number * 2]));
tmp1 = _mm_mul_ps(a, yl);
a = _mm_shuffle_ps(a, a, 0xB1);
tmp2 = _mm_mul_ps(a, yh);
@ -356,8 +362,8 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(lv_32fc_
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
_mm_store_ps((float*)dotProductVector, acc[n_vec]); // Store the results back into the dot product vector
dotProduct = lv_cmake(0,0);
_mm_store_ps((float*)dotProductVector, acc[n_vec]); // Store the results back into the dot product vector
dotProduct = lv_cmake(0, 0);
for (i = 0; i < 2; ++i)
{
dotProduct = dotProduct + dotProductVector[i];
@ -369,7 +375,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(lv_32fc_
_mm_store_ps((float*)two_phase_acc, two_phase_acc_reg);
(*phase) = two_phase_acc[0];
for(n = sse_iters * 2; n < num_points; n++)
for (n = sse_iters * 2; n < num_points; n++)
{
tmp32_1 = in_common[n] * (*phase);
(*phase) *= phase_inc;
@ -387,7 +393,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(lv_32fc_
#include <immintrin.h>
static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_32fc_t** in_a, int num_a_vectors, unsigned int num_points)
{
lv_32fc_t dotProduct = lv_cmake(0,0);
lv_32fc_t dotProduct = lv_cmake(0, 0);
lv_32fc_t tmp32_1, tmp32_2;
const unsigned int avx_iters = num_points / 4;
int n_vec;
@ -398,7 +404,8 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t
const lv_32fc_t* _in_common = in_common;
lv_32fc_t _phase = (*phase);
__VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4];
__VOLK_ATTR_ALIGNED(32)
lv_32fc_t dotProductVector[4];
__m256* acc = (__m256*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256), volk_gnsssdr_get_alignment());
@ -431,12 +438,12 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t
const __m256 ylp = _mm256_moveldup_ps(four_phase_inc_reg);
const __m256 yhp = _mm256_movehdup_ps(four_phase_inc_reg);
for(number = 0; number < avx_iters; number++)
for (number = 0; number < avx_iters; number++)
{
// Phase rotation on operand in_common starts here:
a = _mm256_loadu_ps((float*)_in_common);
__VOLK_GNSSSDR_PREFETCH(_in_common + 16);
yl = _mm256_moveldup_ps(four_phase_acc_reg); // Load yl with cr,cr,dr,dr
yl = _mm256_moveldup_ps(four_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm256_movehdup_ps(four_phase_acc_reg);
tmp1 = _mm256_mul_ps(a, yl);
tmp1p = _mm256_mul_ps(four_phase_acc_reg, ylp);
@ -447,7 +454,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t
z = _mm256_addsub_ps(tmp1, tmp2);
four_phase_acc_reg = _mm256_addsub_ps(tmp1p, tmp2p);
yl = _mm256_moveldup_ps(z); // Load yl with cr,cr,dr,dr
yl = _mm256_moveldup_ps(z); // Load yl with cr,cr,dr,dr
yh = _mm256_movehdup_ps(z);
//next two samples
@ -475,8 +482,8 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
_mm256_store_ps((float*)dotProductVector, acc[n_vec]); // Store the results back into the dot product vector
dotProduct = lv_cmake(0,0);
_mm256_store_ps((float*)dotProductVector, acc[n_vec]); // Store the results back into the dot product vector
dotProduct = lv_cmake(0, 0);
for (i = 0; i < 4; ++i)
{
dotProduct = dotProduct + dotProductVector[i];
@ -492,10 +499,10 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t
four_phase_acc_reg = _mm256_div_ps(four_phase_acc_reg, tmp2);
_mm256_store_ps((float*)four_phase_acc, four_phase_acc_reg);
_phase = four_phase_acc[0];
_phase = four_phase_acc[0];
_mm256_zeroupper();
for(n = avx_iters * 4; n < num_points; n++)
for (n = avx_iters * 4; n < num_points; n++)
{
tmp32_1 = *_in_common++ * _phase;
_phase *= phase_inc;
@ -514,7 +521,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t
#include <immintrin.h>
static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t* result, const lv_32fc_t* in_common, const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_32fc_t** in_a, int num_a_vectors, unsigned int num_points)
{
lv_32fc_t dotProduct = lv_cmake(0,0);
lv_32fc_t dotProduct = lv_cmake(0, 0);
lv_32fc_t tmp32_1, tmp32_2;
const unsigned int avx_iters = num_points / 4;
int n_vec;
@ -525,7 +532,8 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t
const lv_32fc_t* _in_common = in_common;
lv_32fc_t _phase = (*phase);
__VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4];
__VOLK_ATTR_ALIGNED(32)
lv_32fc_t dotProductVector[4];
__m256* acc = (__m256*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256), volk_gnsssdr_get_alignment());
@ -538,7 +546,8 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t
// phase rotation registers
__m256 a, four_phase_acc_reg, yl, yh, tmp1, tmp1p, tmp2, tmp2p, z;
__VOLK_ATTR_ALIGNED(32) lv_32fc_t four_phase_inc[4];
__VOLK_ATTR_ALIGNED(32)
lv_32fc_t four_phase_inc[4];
const lv_32fc_t phase_inc2 = phase_inc * phase_inc;
const lv_32fc_t phase_inc3 = phase_inc2 * phase_inc;
const lv_32fc_t phase_inc4 = phase_inc3 * phase_inc;
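Worth spelling out what the constants above encode: lane k of the phase accumulator is seeded with phase * phase_inc^k, so a single register holds the phasors of four consecutive samples, and one multiply by phase_inc^4 advances all lanes a full SIMD stride. A scalar sketch of that schedule, where lv_32fc_t products are complex multiplies:

/* Sketch only: the four-lane schedule behind four_phase_inc / four_phase_acc. */
static inline void four_lane_phase_sketch(lv_32fc_t phase, lv_32fc_t phase_inc, lv_32fc_t lanes[4])
{
    const lv_32fc_t inc2 = phase_inc * phase_inc;
    const lv_32fc_t inc4 = inc2 * inc2;
    int k;
    lanes[0] = phase;                     /* sample n     */
    lanes[1] = phase * phase_inc;         /* sample n + 1 */
    lanes[2] = phase * inc2;              /* sample n + 2 */
    lanes[3] = phase * inc2 * phase_inc;  /* sample n + 3 */
    for (k = 0; k < 4; k++)
        {
            lanes[k] = lanes[k] * inc4;  /* one AVX iteration: jump four samples */
        }
}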
@ -548,7 +557,8 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t
four_phase_inc[3] = phase_inc4;
const __m256 four_phase_inc_reg = _mm256_load_ps((float*)four_phase_inc);
__VOLK_ATTR_ALIGNED(32) lv_32fc_t four_phase_acc[4];
__VOLK_ATTR_ALIGNED(32)
lv_32fc_t four_phase_acc[4];
four_phase_acc[0] = _phase;
four_phase_acc[1] = _phase * phase_inc;
four_phase_acc[2] = _phase * phase_inc2;
@ -558,12 +568,12 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t
const __m256 ylp = _mm256_moveldup_ps(four_phase_inc_reg);
const __m256 yhp = _mm256_movehdup_ps(four_phase_inc_reg);
for(number = 0; number < avx_iters; number++)
for (number = 0; number < avx_iters; number++)
{
// Phase rotation on operand in_common starts here:
a = _mm256_load_ps((float*)_in_common);
__VOLK_GNSSSDR_PREFETCH(_in_common + 16);
yl = _mm256_moveldup_ps(four_phase_acc_reg); // Load yl with cr,cr,dr,dr
yl = _mm256_moveldup_ps(four_phase_acc_reg); // Load yl with cr,cr,dr,dr
yh = _mm256_movehdup_ps(four_phase_acc_reg);
tmp1 = _mm256_mul_ps(a, yl);
tmp1p = _mm256_mul_ps(four_phase_acc_reg, ylp);
@ -574,7 +584,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t
z = _mm256_addsub_ps(tmp1, tmp2);
four_phase_acc_reg = _mm256_addsub_ps(tmp1p, tmp2p);
yl = _mm256_moveldup_ps(z); // Load yl with cr,cr,dr,dr
yl = _mm256_moveldup_ps(z); // Load yl with cr,cr,dr,dr
yh = _mm256_movehdup_ps(z);
//next two samples
@ -602,8 +612,8 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
_mm256_store_ps((float*)dotProductVector, acc[n_vec]); // Store the results back into the dot product vector
dotProduct = lv_cmake(0,0);
_mm256_store_ps((float*)dotProductVector, acc[n_vec]); // Store the results back into the dot product vector
dotProduct = lv_cmake(0, 0);
for (i = 0; i < 4; ++i)
{
dotProduct = dotProduct + dotProductVector[i];
@ -619,10 +629,10 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t
four_phase_acc_reg = _mm256_div_ps(four_phase_acc_reg, tmp2);
_mm256_store_ps((float*)four_phase_acc, four_phase_acc_reg);
_phase = four_phase_acc[0];
_phase = four_phase_acc[0];
_mm256_zeroupper();
for(n = avx_iters * 4; n < num_points; n++)
for (n = avx_iters * 4; n < num_points; n++)
{
tmp32_1 = *_in_common++ * _phase;
_phase *= phase_inc;
@ -646,7 +656,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_neon(lv_32fc_t*
int n_vec;
int i;
unsigned int number;
unsigned int n ;
unsigned int n;
const lv_32fc_t** _in_a = in_a;
const lv_32fc_t* _in_common = in_common;
lv_32fc_t* _out = result;
@ -656,36 +666,41 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_neon(lv_32fc_t*
if (neon_iters > 0)
{
lv_32fc_t dotProduct = lv_cmake(0,0);
lv_32fc_t dotProduct = lv_cmake(0, 0);
float32_t arg_phase0 = cargf(_phase);
float32_t arg_phase_inc = cargf(phase_inc);
float32_t phase_est;
lv_32fc_t ___phase4 = phase_inc * phase_inc * phase_inc * phase_inc;
__VOLK_ATTR_ALIGNED(16) float32_t __phase4_real[4] = { lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4) };
__VOLK_ATTR_ALIGNED(16) float32_t __phase4_imag[4] = { lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4) };
__VOLK_ATTR_ALIGNED(16)
float32_t __phase4_real[4] = {lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4)};
__VOLK_ATTR_ALIGNED(16)
float32_t __phase4_imag[4] = {lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4)};
float32x4_t _phase4_real = vld1q_f32(__phase4_real);
float32x4_t _phase4_imag = vld1q_f32(__phase4_imag);
lv_32fc_t phase2 = (lv_32fc_t)(_phase) * phase_inc;
lv_32fc_t phase2 = (lv_32fc_t)(_phase)*phase_inc;
lv_32fc_t phase3 = phase2 * phase_inc;
lv_32fc_t phase4 = phase3 * phase_inc;
__VOLK_ATTR_ALIGNED(16) float32_t __phase_real[4] = { lv_creal((_phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) };
__VOLK_ATTR_ALIGNED(16) float32_t __phase_imag[4] = { lv_cimag((_phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) };
__VOLK_ATTR_ALIGNED(16)
float32_t __phase_real[4] = {lv_creal((_phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4)};
__VOLK_ATTR_ALIGNED(16)
float32_t __phase_imag[4] = {lv_cimag((_phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4)};
float32x4_t _phase_real = vld1q_f32(__phase_real);
float32x4_t _phase_imag = vld1q_f32(__phase_imag);
__VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4];
__VOLK_ATTR_ALIGNED(32)
lv_32fc_t dotProductVector[4];
float32x4x2_t a_val, b_val, tmp32_real, tmp32_imag;
float32x4x2_t* accumulator1 = (float32x4x2_t*)volk_gnsssdr_malloc(num_a_vectors * sizeof(float32x4x2_t), volk_gnsssdr_get_alignment());
float32x4x2_t* accumulator2 = (float32x4x2_t*)volk_gnsssdr_malloc(num_a_vectors * sizeof(float32x4x2_t), volk_gnsssdr_get_alignment());
for(n_vec = 0; n_vec < num_a_vectors; n_vec++)
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
accumulator1[n_vec].val[0] = vdupq_n_f32(0.0f);
accumulator1[n_vec].val[1] = vdupq_n_f32(0.0f);
@ -693,7 +708,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_neon(lv_32fc_t*
accumulator2[n_vec].val[1] = vdupq_n_f32(0.0f);
}
for(number = 0; number < neon_iters; number++)
for (number = 0; number < neon_iters; number++)
{
/* load 4 complex numbers (float 32 bits each component) */
b_val = vld2q_f32((float32_t*)_in_common);
@ -728,8 +743,10 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_neon(lv_32fc_t*
phase3 = phase2 * phase_inc;
phase4 = phase3 * phase_inc;
__VOLK_ATTR_ALIGNED(16) float32_t ____phase_real[4] = { lv_creal((_phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) };
__VOLK_ATTR_ALIGNED(16) float32_t ____phase_imag[4] = { lv_cimag((_phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) };
__VOLK_ATTR_ALIGNED(16)
float32_t ____phase_real[4] = {lv_creal((_phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4)};
__VOLK_ATTR_ALIGNED(16)
float32_t ____phase_imag[4] = {lv_cimag((_phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4)};
_phase_real = vld1q_f32(____phase_real);
_phase_imag = vld1q_f32(____phase_imag);
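Because vld2q_f32 deinterleaves on load, the NEON kernel keeps real and imaginary parts in separate float32x4_t planes, and the complex product (ar + j*ai)(br + j*bi) becomes two multiply-accumulate chains per component. A sketch of that core under the same float32x4x2_t layout:

#include <arm_neon.h>

/* Sketch: 4-wide complex multiply on deinterleaved NEON registers,
 * matching the tmp32_real/tmp32_imag pattern used by the kernel above. */
static inline float32x4x2_t cmul4_sketch(float32x4x2_t a, float32x4x2_t b)
{
    float32x4x2_t r;
    r.val[0] = vmlsq_f32(vmulq_f32(a.val[0], b.val[0]), a.val[1], b.val[1]); /* ar*br - ai*bi */
    r.val[1] = vmlaq_f32(vmulq_f32(a.val[0], b.val[1]), a.val[1], b.val[0]); /* ar*bi + ai*br */
    return r;
}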
@ -753,8 +770,8 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_neon(lv_32fc_t*
}
for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{
vst2q_f32((float32_t*)dotProductVector, accumulator1[n_vec]); // Store the results back into the dot product vector
dotProduct = lv_cmake(0,0);
vst2q_f32((float32_t*)dotProductVector, accumulator1[n_vec]); // Store the results back into the dot product vector
dotProduct = lv_cmake(0, 0);
for (i = 0; i < 4; ++i)
{
dotProduct = dotProduct + dotProductVector[i];
@ -770,7 +787,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_neon(lv_32fc_t*
_phase = lv_cmake((float32_t)__phase_real[0], (float32_t)__phase_imag[0]);
}
for(n = neon_iters * 4; n < num_points; n++)
for (n = neon_iters * 4; n < num_points; n++)
{
tmp32_1 = in_common[n] * _phase;
_phase *= phase_inc;
@ -786,4 +803,3 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_neon(lv_32fc_t*
#endif /* LV_HAVE_NEON */
#endif /* INCLUDED_volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_H */
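Taken together, every branch in this header computes result[v] = sum over n of in_a[v][n] * (in_common[n] * phase), with the running phasor advanced by phase_inc per sample and renormalized every 256 samples so rounding cannot pull its magnitude off 1. A generic sketch, assuming cabsf from <complex.h> and lv_cmake from the volk_gnsssdr complex header:

#include <complex.h>

/* Sketch (no SIMD) of the rotator dot product implemented above. */
static inline void rotator_dot_prod_sketch(lv_32fc_t* result, const lv_32fc_t* in_common,
    const lv_32fc_t phase_inc, lv_32fc_t* phase, const lv_32fc_t** in_a,
    int num_a_vectors, unsigned int num_points)
{
    int v;
    unsigned int n;
    for (v = 0; v < num_a_vectors; v++)
        {
            result[v] = lv_cmake(0, 0);
        }
    for (n = 0; n < num_points; n++)
        {
            const lv_32fc_t rotated = in_common[n] * (*phase);
            (*phase) *= phase_inc;
            if (n % 256 == 0)
                {
                    (*phase) /= cabsf(*phase);  /* periodic magnitude correction */
                }
            for (v = 0; v < num_a_vectors; v++)
                {
                    result[v] += rotated * in_a[v][n];
                }
        }
}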


@ -41,7 +41,7 @@
#include <string.h>
#ifdef LV_HAVE_GENERIC
static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_generic(lv_32fc_t* result, const lv_32fc_t* local_code, const lv_32fc_t* in, unsigned int num_points)
static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_generic(lv_32fc_t* result, const lv_32fc_t* local_code, const lv_32fc_t* in, unsigned int num_points)
{
// phases must be normalized. Phase rotator expects a complex exponential input!
float rem_carrier_phase_in_rad = 0.25;
@ -53,14 +53,14 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_generic(lv_
unsigned int n;
int num_a_vectors = 3;
lv_32fc_t** in_a = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
for(n = 0; n < num_a_vectors; n++)
for (n = 0; n < num_a_vectors; n++)
{
in_a[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
memcpy((lv_32fc_t*)in_a[n], (lv_32fc_t*)in, sizeof(lv_32fc_t) * num_points);
}
volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_generic_reload(result, local_code, phase_inc[0], phase, (const lv_32fc_t**) in_a, num_a_vectors, num_points);
volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_generic_reload(result, local_code, phase_inc[0], phase, (const lv_32fc_t**)in_a, num_a_vectors, num_points);
for(n = 0; n < num_a_vectors; n++)
for (n = 0; n < num_a_vectors; n++)
{
volk_gnsssdr_free(in_a[n]);
}
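The comment repeated by these puppets is the key precondition: the rotator kernels expect unit-magnitude complex exponentials, so the scalar angles must be mapped through e^(j*theta) before the call. A sketch of that setup (the phase_step value here is illustrative; the real value is elided between hunks):

#include <math.h>

/* Sketch: building the normalized phasors the kernels consume. */
static inline void phase_setup_sketch(lv_32fc_t* phase, lv_32fc_t* phase_inc)
{
    const float rem_carrier_phase_in_rad = 0.25;  /* value used by the puppets above */
    const float phase_step_rad = 0.125;           /* illustrative only */
    phase[0] = lv_cmake(cosf(rem_carrier_phase_in_rad), sinf(rem_carrier_phase_in_rad));
    phase_inc[0] = lv_cmake(cosf(phase_step_rad), sinf(phase_step_rad));
}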
@ -71,7 +71,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_generic(lv_
#ifdef LV_HAVE_GENERIC
static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_generic_reload(lv_32fc_t* result, const lv_32fc_t* local_code, const lv_32fc_t* in, unsigned int num_points)
static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_generic_reload(lv_32fc_t* result, const lv_32fc_t* local_code, const lv_32fc_t* in, unsigned int num_points)
{
// phases must be normalized. Phase rotator expects a complex exponential input!
float rem_carrier_phase_in_rad = 0.25;
@ -83,14 +83,14 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_generic_rel
unsigned int n;
int num_a_vectors = 3;
lv_32fc_t** in_a = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
for(n = 0; n < num_a_vectors; n++)
for (n = 0; n < num_a_vectors; n++)
{
in_a[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
memcpy((lv_32fc_t*)in_a[n], (lv_32fc_t*)in, sizeof(lv_32fc_t) * num_points);
}
volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_generic_reload(result, local_code, phase_inc[0], phase, (const lv_32fc_t**) in_a, num_a_vectors, num_points);
volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_generic_reload(result, local_code, phase_inc[0], phase, (const lv_32fc_t**)in_a, num_a_vectors, num_points);
for(n = 0; n < num_a_vectors; n++)
for (n = 0; n < num_a_vectors; n++)
{
volk_gnsssdr_free(in_a[n]);
}
@ -101,7 +101,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_generic_rel
#ifdef LV_HAVE_SSE3
static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_u_sse3(lv_32fc_t* result, const lv_32fc_t* local_code, const lv_32fc_t* in, unsigned int num_points)
static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_u_sse3(lv_32fc_t* result, const lv_32fc_t* local_code, const lv_32fc_t* in, unsigned int num_points)
{
// phases must be normalized. Phase rotator expects a complex exponential input!
float rem_carrier_phase_in_rad = 0.25;
@ -113,14 +113,14 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_u_sse3(lv_3
unsigned int n;
int num_a_vectors = 3;
lv_32fc_t** in_a = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
for(n = 0; n < num_a_vectors; n++)
for (n = 0; n < num_a_vectors; n++)
{
in_a[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
memcpy((lv_32fc_t*)in_a[n], (lv_32fc_t*)in, sizeof(lv_32fc_t) * num_points);
}
volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(result, local_code, phase_inc[0], phase, (const lv_32fc_t**) in_a, num_a_vectors, num_points);
volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(result, local_code, phase_inc[0], phase, (const lv_32fc_t**)in_a, num_a_vectors, num_points);
for(n = 0; n < num_a_vectors; n++)
for (n = 0; n < num_a_vectors; n++)
{
volk_gnsssdr_free(in_a[n]);
}
@ -131,7 +131,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_u_sse3(lv_3
#ifdef LV_HAVE_SSE3
static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_a_sse3(lv_32fc_t* result, const lv_32fc_t* local_code, const lv_32fc_t* in, unsigned int num_points)
static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_a_sse3(lv_32fc_t* result, const lv_32fc_t* local_code, const lv_32fc_t* in, unsigned int num_points)
{
// phases must be normalized. Phase rotator expects a complex exponential input!
float rem_carrier_phase_in_rad = 0.25;
@ -143,14 +143,14 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_a_sse3(lv_3
unsigned int n;
int num_a_vectors = 3;
lv_32fc_t** in_a = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
for(n = 0; n < num_a_vectors; n++)
for (n = 0; n < num_a_vectors; n++)
{
in_a[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
memcpy((lv_32fc_t*)in_a[n], (lv_32fc_t*)in, sizeof(lv_32fc_t) * num_points);
}
volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(result, local_code, phase_inc[0], phase, (const lv_32fc_t**) in_a, num_a_vectors, num_points);
volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(result, local_code, phase_inc[0], phase, (const lv_32fc_t**)in_a, num_a_vectors, num_points);
for(n = 0; n < num_a_vectors; n++)
for (n = 0; n < num_a_vectors; n++)
{
volk_gnsssdr_free(in_a[n]);
}
@ -161,7 +161,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_a_sse3(lv_3
#ifdef LV_HAVE_AVX
static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_u_avx(lv_32fc_t* result, const lv_32fc_t* local_code, const lv_32fc_t* in, unsigned int num_points)
static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_u_avx(lv_32fc_t* result, const lv_32fc_t* local_code, const lv_32fc_t* in, unsigned int num_points)
{
// phases must be normalized. Phase rotator expects a complex exponential input!
float rem_carrier_phase_in_rad = 0.25;
@ -173,14 +173,14 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_u_avx(lv_32
unsigned int n;
int num_a_vectors = 3;
lv_32fc_t** in_a = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
for(n = 0; n < num_a_vectors; n++)
for (n = 0; n < num_a_vectors; n++)
{
in_a[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
memcpy((lv_32fc_t*)in_a[n], (lv_32fc_t*)in, sizeof(lv_32fc_t) * num_points);
}
volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(result, local_code, phase_inc[0], phase, (const lv_32fc_t**) in_a, num_a_vectors, num_points);
volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(result, local_code, phase_inc[0], phase, (const lv_32fc_t**)in_a, num_a_vectors, num_points);
for(n = 0; n < num_a_vectors; n++)
for (n = 0; n < num_a_vectors; n++)
{
volk_gnsssdr_free(in_a[n]);
}
@ -191,7 +191,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_u_avx(lv_32
#ifdef LV_HAVE_AVX
static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_a_avx(lv_32fc_t* result, const lv_32fc_t* local_code, const lv_32fc_t* in, unsigned int num_points)
static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_a_avx(lv_32fc_t* result, const lv_32fc_t* local_code, const lv_32fc_t* in, unsigned int num_points)
{
// phases must be normalized. Phase rotator expects a complex exponential input!
float rem_carrier_phase_in_rad = 0.25;
@ -203,14 +203,14 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_a_avx(lv_32
unsigned int n;
int num_a_vectors = 3;
lv_32fc_t** in_a = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
for(n = 0; n < num_a_vectors; n++)
for (n = 0; n < num_a_vectors; n++)
{
in_a[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
memcpy((lv_32fc_t*)in_a[n], (lv_32fc_t*)in, sizeof(lv_32fc_t) * num_points);
}
volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(result, local_code, phase_inc[0], phase, (const lv_32fc_t**) in_a, num_a_vectors, num_points);
volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(result, local_code, phase_inc[0], phase, (const lv_32fc_t**)in_a, num_a_vectors, num_points);
for(n = 0; n < num_a_vectors; n++)
for (n = 0; n < num_a_vectors; n++)
{
volk_gnsssdr_free(in_a[n]);
}
@ -221,7 +221,7 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_a_avx(lv_32
#ifdef LV_HAVE_NEON
static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_neon(lv_32fc_t* result, const lv_32fc_t* local_code, const lv_32fc_t* in, unsigned int num_points)
static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_neon(lv_32fc_t* result, const lv_32fc_t* local_code, const lv_32fc_t* in, unsigned int num_points)
{
// phases must be normalized. Phase rotator expects a complex exponential input!
float rem_carrier_phase_in_rad = 0.25;
@ -233,14 +233,14 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc_neon(lv_32f
unsigned int n;
int num_a_vectors = 3;
lv_32fc_t** in_a = (lv_32fc_t**)volk_gnsssdr_malloc(sizeof(lv_32fc_t*) * num_a_vectors, volk_gnsssdr_get_alignment());
for(n = 0; n < num_a_vectors; n++)
for (n = 0; n < num_a_vectors; n++)
{
in_a[n] = (lv_32fc_t*)volk_gnsssdr_malloc(sizeof(lv_32fc_t) * num_points, volk_gnsssdr_get_alignment());
memcpy((lv_32fc_t*)in_a[n], (lv_32fc_t*)in, sizeof(lv_32fc_t) * num_points);
}
volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_neon(result, local_code, phase_inc[0], phase, (const lv_32fc_t**) in_a, num_a_vectors, num_points);
volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_neon(result, local_code, phase_inc[0], phase, (const lv_32fc_t**)in_a, num_a_vectors, num_points);
for(n = 0; n < num_a_vectors; n++)
for (n = 0; n < num_a_vectors; n++)
{
volk_gnsssdr_free(in_a[n]);
}


@ -107,7 +107,8 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_sse3(lv_32fc_t** res
const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
__VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4];
__VOLK_ATTR_ALIGNED(16)
int local_code_chip_index[4];
int local_code_chip_index_;
const __m128i zeros = _mm_setzero_si128();
@ -121,7 +122,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_sse3(lv_32fc_t** res
shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]);
aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
__m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
for(n = 0; n < quarterPoints; n++)
for (n = 0; n < quarterPoints; n++)
{
aux = _mm_mul_ps(code_phase_step_chips_reg, indexn);
aux = _mm_add_ps(aux, aux2);
@ -142,18 +143,18 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_sse3(lv_32fc_t** res
aux_i = _mm_and_si128(code_length_chips_reg_i, negatives);
local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i);
_mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg);
for(k = 0; k < 4; ++k)
for (k = 0; k < 4; ++k)
{
_result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
}
indexn = _mm_add_ps(indexn, fours);
}
for(n = quarterPoints * 4; n < num_points; n++)
for (n = quarterPoints * 4; n < num_points; n++)
{
// resample code for current tap
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
//Take into account that in multitap correlators, the shifts can be negative!
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ;
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
_result[current_correlator_tap][n] = local_code[local_code_chip_index_];
}
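A note on the branchless wrap above: `negatives` is an all-ones mask exactly where an index went below zero, so ANDing it with the code length adds L only to those lanes. The scalar equivalent, valid when -L <= idx < L:

/* Sketch: branchless wrap used per SIMD lane above. The mask is
 * all ones where idx < 0, so (mask & L) adds L only to negative lanes. */
static inline int branchless_wrap_sketch(int idx, int code_length_chips)
{
    const int mask = -(idx < 0);              /* -1 (all ones) if negative, else 0 */
    return idx + (mask & code_length_chips);  /* assumes -L <= idx < L */
}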
@ -177,7 +178,8 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_sse3(lv_32fc_t** res
const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
__VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4];
__VOLK_ATTR_ALIGNED(16)
int local_code_chip_index[4];
int local_code_chip_index_;
const __m128i zeros = _mm_setzero_si128();
@ -191,7 +193,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_sse3(lv_32fc_t** res
shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]);
aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
__m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
for(n = 0; n < quarterPoints; n++)
for (n = 0; n < quarterPoints; n++)
{
aux = _mm_mul_ps(code_phase_step_chips_reg, indexn);
aux = _mm_add_ps(aux, aux2);
@ -212,18 +214,18 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_sse3(lv_32fc_t** res
aux_i = _mm_and_si128(code_length_chips_reg_i, negatives);
local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i);
_mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg);
for(k = 0; k < 4; ++k)
for (k = 0; k < 4; ++k)
{
_result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
}
indexn = _mm_add_ps(indexn, fours);
}
for(n = quarterPoints * 4; n < num_points; n++)
for (n = quarterPoints * 4; n < num_points; n++)
{
// resample code for current tap
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
//Take into account that in multitap correlators, the shifts can be negative!
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ;
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
_result[current_correlator_tap][n] = local_code[local_code_chip_index_];
}
@ -245,7 +247,8 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_sse4_1(lv_32fc_t** r
const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
__VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4];
__VOLK_ATTR_ALIGNED(16)
int local_code_chip_index[4];
int local_code_chip_index_;
const __m128i zeros = _mm_setzero_si128();
@ -259,7 +262,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_sse4_1(lv_32fc_t** r
shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]);
aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
__m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
for(n = 0; n < quarterPoints; n++)
for (n = 0; n < quarterPoints; n++)
{
aux = _mm_mul_ps(code_phase_step_chips_reg, indexn);
aux = _mm_add_ps(aux, aux2);
@ -277,18 +280,18 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_sse4_1(lv_32fc_t** r
aux_i = _mm_and_si128(code_length_chips_reg_i, negatives);
local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i);
_mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg);
for(k = 0; k < 4; ++k)
for (k = 0; k < 4; ++k)
{
_result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
}
indexn = _mm_add_ps(indexn, fours);
}
for(n = quarterPoints * 4; n < num_points; n++)
for (n = quarterPoints * 4; n < num_points; n++)
{
// resample code for current tap
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
//Take into account that in multitap correlators, the shifts can be negative!
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ;
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
_result[current_correlator_tap][n] = local_code[local_code_chip_index_];
}
@ -311,7 +314,8 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_sse4_1(lv_32fc_t** r
const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
__VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4];
__VOLK_ATTR_ALIGNED(16)
int local_code_chip_index[4];
int local_code_chip_index_;
const __m128i zeros = _mm_setzero_si128();
@ -325,7 +329,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_sse4_1(lv_32fc_t** r
shifts_chips_reg = _mm_set_ps1((float)shifts_chips[current_correlator_tap]);
aux2 = _mm_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
__m128 indexn = _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f);
for(n = 0; n < quarterPoints; n++)
for (n = 0; n < quarterPoints; n++)
{
aux = _mm_mul_ps(code_phase_step_chips_reg, indexn);
aux = _mm_add_ps(aux, aux2);
@ -343,18 +347,18 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_sse4_1(lv_32fc_t** r
aux_i = _mm_and_si128(code_length_chips_reg_i, negatives);
local_code_chip_index_reg = _mm_add_epi32(local_code_chip_index_reg, aux_i);
_mm_store_si128((__m128i*)local_code_chip_index, local_code_chip_index_reg);
for(k = 0; k < 4; ++k)
for (k = 0; k < 4; ++k)
{
_result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
}
indexn = _mm_add_ps(indexn, fours);
}
for(n = quarterPoints * 4; n < num_points; n++)
for (n = quarterPoints * 4; n < num_points; n++)
{
// resample code for current tap
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
//Take into account that in multitap correlators, the shifts can be negative!
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ;
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
_result[current_correlator_tap][n] = local_code[local_code_chip_index_];
}
@ -377,7 +381,8 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx(lv_32fc_t** resu
const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips);
const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips);
__VOLK_ATTR_ALIGNED(32) int local_code_chip_index[8];
__VOLK_ATTR_ALIGNED(32)
int local_code_chip_index[8];
int local_code_chip_index_;
const __m256 zeros = _mm256_setzero_ps();
@ -392,7 +397,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx(lv_32fc_t** resu
shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]);
aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
indexn = n0;
for(n = 0; n < avx_iters; n++)
for (n = 0; n < avx_iters; n++)
{
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0);
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3);
@ -410,13 +415,13 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx(lv_32fc_t** resu
// no negatives
c = _mm256_cvtepi32_ps(local_code_chip_index_reg);
negatives = _mm256_cmp_ps(c, zeros, 0x01 );
negatives = _mm256_cmp_ps(c, zeros, 0x01);
aux3 = _mm256_and_ps(code_length_chips_reg_f, negatives);
aux = _mm256_add_ps(c, aux3);
local_code_chip_index_reg = _mm256_cvttps_epi32(aux);
_mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg);
for(k = 0; k < 8; ++k)
for (k = 0; k < 8; ++k)
{
_result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]];
}
@ -426,12 +431,12 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx(lv_32fc_t** resu
_mm256_zeroupper();
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
{
for(n = avx_iters * 8; n < num_points; n++)
for (n = avx_iters * 8; n < num_points; n++)
{
// resample code for current tap
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
//Take into account that in multitap correlators, the shifts can be negative!
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ;
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
_result[current_correlator_tap][n] = local_code[local_code_chip_index_];
}
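The AVX branches perform the same wrap in the float domain: predicate 0x01 in _mm256_cmp_ps is _CMP_LT_OS (ordered less-than), whose mask selects the float code length before the add and the truncating convert back to integers. A sketch with the same intrinsics:

#include <immintrin.h>

/* Sketch: wrap eight float indices into [0, L) exactly as above.
 * 0x01 == _CMP_LT_OS, i.e. lanes where c < 0. */
static inline __m256i wrap_indices_avx_sketch(__m256 c, __m256 code_length_f)
{
    const __m256 zeros = _mm256_setzero_ps();
    const __m256 negatives = _mm256_cmp_ps(c, zeros, 0x01); /* mask: c < 0 */
    const __m256 add_l = _mm256_and_ps(code_length_f, negatives);
    return _mm256_cvttps_epi32(_mm256_add_ps(c, add_l));    /* truncate to int */
}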
@ -454,7 +459,8 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx(lv_32fc_t** resu
const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips);
const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips);
__VOLK_ATTR_ALIGNED(32) int local_code_chip_index[8];
__VOLK_ATTR_ALIGNED(32)
int local_code_chip_index[8];
int local_code_chip_index_;
const __m256 zeros = _mm256_setzero_ps();
@ -469,7 +475,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx(lv_32fc_t** resu
shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]);
aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
indexn = n0;
for(n = 0; n < avx_iters; n++)
for (n = 0; n < avx_iters; n++)
{
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0);
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3);
@ -487,13 +493,13 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx(lv_32fc_t** resu
// no negatives
c = _mm256_cvtepi32_ps(local_code_chip_index_reg);
negatives = _mm256_cmp_ps(c, zeros, 0x01 );
negatives = _mm256_cmp_ps(c, zeros, 0x01);
aux3 = _mm256_and_ps(code_length_chips_reg_f, negatives);
aux = _mm256_add_ps(c, aux3);
local_code_chip_index_reg = _mm256_cvttps_epi32(aux);
_mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg);
for(k = 0; k < 8; ++k)
for (k = 0; k < 8; ++k)
{
_result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]];
}
@ -503,12 +509,12 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx(lv_32fc_t** resu
_mm256_zeroupper();
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
{
for(n = avx_iters * 8; n < num_points; n++)
for (n = avx_iters * 8; n < num_points; n++)
{
// resample code for current tap
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
//Take into account that in multitap correlators, the shifts can be negative!
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ;
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
_result[current_correlator_tap][n] = local_code[local_code_chip_index_];
}
@ -531,7 +537,8 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx2(lv_32fc_t** res
const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips);
const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips);
__VOLK_ATTR_ALIGNED(32) int local_code_chip_index[8];
__VOLK_ATTR_ALIGNED(32)
int local_code_chip_index[8];
int local_code_chip_index_;
const __m256 zeros = _mm256_setzero_ps();
@ -546,7 +553,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx2(lv_32fc_t** res
shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]);
aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
indexn = n0;
for(n = 0; n < avx_iters; n++)
for (n = 0; n < avx_iters; n++)
{
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0);
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3);
@ -565,13 +572,13 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx2(lv_32fc_t** res
// no negatives
c = _mm256_cvtepi32_ps(local_code_chip_index_reg);
negatives = _mm256_cmp_ps(c, zeros, 0x01 );
negatives = _mm256_cmp_ps(c, zeros, 0x01);
aux3 = _mm256_and_ps(code_length_chips_reg_f, negatives);
aux = _mm256_add_ps(c, aux3);
local_code_chip_index_reg = _mm256_cvttps_epi32(aux);
_mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg);
for(k = 0; k < 8; ++k)
for (k = 0; k < 8; ++k)
{
_result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]];
}
@ -581,12 +588,12 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx2(lv_32fc_t** res
_mm256_zeroupper();
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
{
for(n = avx_iters * 8; n < num_points; n++)
for (n = avx_iters * 8; n < num_points; n++)
{
// resample code for current tap
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
//Take into account that in multitap correlators, the shifts can be negative!
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ;
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
_result[current_correlator_tap][n] = local_code[local_code_chip_index_];
}
@ -609,7 +616,8 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx2(lv_32fc_t** res
const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips);
const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips);
__VOLK_ATTR_ALIGNED(32) int local_code_chip_index[8];
__VOLK_ATTR_ALIGNED(32)
int local_code_chip_index[8];
int local_code_chip_index_;
const __m256 zeros = _mm256_setzero_ps();
@ -624,7 +632,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx2(lv_32fc_t** res
shifts_chips_reg = _mm256_set1_ps((float)shifts_chips[current_correlator_tap]);
aux2 = _mm256_sub_ps(shifts_chips_reg, rem_code_phase_chips_reg);
indexn = n0;
for(n = 0; n < avx_iters; n++)
for (n = 0; n < avx_iters; n++)
{
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][8 * n + 7], 1, 0);
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&local_code_chip_index[8], 1, 3);
@ -643,13 +651,13 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx2(lv_32fc_t** res
// no negatives
c = _mm256_cvtepi32_ps(local_code_chip_index_reg);
negatives = _mm256_cmp_ps(c, zeros, 0x01 );
negatives = _mm256_cmp_ps(c, zeros, 0x01);
aux3 = _mm256_and_ps(code_length_chips_reg_f, negatives);
aux = _mm256_add_ps(c, aux3);
local_code_chip_index_reg = _mm256_cvttps_epi32(aux);
_mm256_store_si256((__m256i*)local_code_chip_index, local_code_chip_index_reg);
for(k = 0; k < 8; ++k)
for (k = 0; k < 8; ++k)
{
_result[current_correlator_tap][n * 8 + k] = local_code[local_code_chip_index[k]];
}
@ -659,12 +667,12 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx2(lv_32fc_t** res
_mm256_zeroupper();
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
{
for(n = avx_iters * 8; n < num_points; n++)
for (n = avx_iters * 8; n < num_points; n++)
{
// resample code for current tap
local_code_chip_index_ = (int)floor(code_phase_step_chips * (float)n + shifts_chips[current_correlator_tap] - rem_code_phase_chips);
//Take into account that in multitap correlators, the shifts can be negative!
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1) ;
if (local_code_chip_index_ < 0) local_code_chip_index_ += (int)code_length_chips * (abs(local_code_chip_index_) / code_length_chips + 1);
local_code_chip_index_ = local_code_chip_index_ % code_length_chips;
_result[current_correlator_tap][n] = local_code[local_code_chip_index_];
}
@ -689,19 +697,21 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_neon(lv_32fc_t** resul
const float32x4_t rem_code_phase_chips_reg = vdupq_n_f32(rem_code_phase_chips);
const float32x4_t code_phase_step_chips_reg = vdupq_n_f32(code_phase_step_chips);
__VOLK_ATTR_ALIGNED(16) int32_t local_code_chip_index[4];
__VOLK_ATTR_ALIGNED(16)
int32_t local_code_chip_index[4];
int32_t local_code_chip_index_;
const int32x4_t zeros = vdupq_n_s32(0);
const float32x4_t code_length_chips_reg_f = vdupq_n_f32((float)code_length_chips);
const int32x4_t code_length_chips_reg_i = vdupq_n_s32((int32_t)code_length_chips);
int32x4_t local_code_chip_index_reg, aux_i, negatives, i;
int32x4_t local_code_chip_index_reg, aux_i, negatives, i;
float32x4_t aux, aux2, shifts_chips_reg, fi, c, j, cTrunc, base, indexn, reciprocal;
__VOLK_ATTR_ALIGNED(16) const float vec[4] = { 0.0f, 1.0f, 2.0f, 3.0f };
__VOLK_ATTR_ALIGNED(16)
const float vec[4] = {0.0f, 1.0f, 2.0f, 3.0f};
uint32x4_t igx;
reciprocal = vrecpeq_f32(code_length_chips_reg_f);
reciprocal = vmulq_f32(vrecpsq_f32(code_length_chips_reg_f, reciprocal), reciprocal);
reciprocal = vmulq_f32(vrecpsq_f32(code_length_chips_reg_f, reciprocal), reciprocal); // this refinement is required!
reciprocal = vmulq_f32(vrecpsq_f32(code_length_chips_reg_f, reciprocal), reciprocal); // this refinement is required!
float32x4_t n0 = vld1q_f32((float*)vec);
for (current_correlator_tap = 0; current_correlator_tap < num_out_vectors; current_correlator_tap++)
@ -709,7 +719,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_neon(lv_32fc_t** resul
shifts_chips_reg = vdupq_n_f32((float)shifts_chips[current_correlator_tap]);
aux2 = vsubq_f32(shifts_chips_reg, rem_code_phase_chips_reg);
indexn = n0;
for(n = 0; n < neon_iters; n++)
for (n = 0; n < neon_iters; n++)
{
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][4 * n + 3], 1, 0);
__VOLK_GNSSSDR_PREFETCH(&local_code_chip_index[4]);
@ -725,7 +735,7 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_neon(lv_32fc_t** resul
// fmod
c = vmulq_f32(aux, reciprocal);
i = vcvtq_s32_f32(c);
i = vcvtq_s32_f32(c);
cTrunc = vcvtq_f32_s32(i);
base = vmulq_f32(cTrunc, code_length_chips_reg_f);
aux = vsubq_f32(aux, base);
@ -737,13 +747,13 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_neon(lv_32fc_t** resul
vst1q_s32((int32_t*)local_code_chip_index, local_code_chip_index_reg);
for(k = 0; k < 4; ++k)
for (k = 0; k < 4; ++k)
{
_result[current_correlator_tap][n * 4 + k] = local_code[local_code_chip_index[k]];
}
indexn = vaddq_f32(indexn, fours);
}
for(n = neon_iters * 4; n < num_points; n++)
for (n = neon_iters * 4; n < num_points; n++)
{
__VOLK_GNSSSDR_PREFETCH_LOCALITY(&_result[current_correlator_tap][n], 1, 0);
// resample code for current tap
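
A gloss on the NEON kernel's "this refinement is required!" comment above: vrecpeq_f32 returns only a coarse reciprocal estimate (roughly 8 bits), and each vrecpsq_f32 Newton-Raphson step about doubles the precision, which the fmod-by-multiplication depends on to land on the right chip. A self-contained sketch of the idiom (function name illustrative):

#include <arm_neon.h>

/* x mod len computed as x - trunc(x * (1/len)) * len, with the
   reciprocal estimate refined twice, mirroring the kernel above. */
static inline float32x4_t fmod_pos_f32(float32x4_t x, float32x4_t len)
{
    float32x4_t r = vrecpeq_f32(len);       /* coarse 1/len estimate */
    r = vmulq_f32(vrecpsq_f32(len, r), r);  /* Newton-Raphson step 1 */
    r = vmulq_f32(vrecpsq_f32(len, r), r);  /* Newton-Raphson step 2 */
    int32x4_t q = vcvtq_s32_f32(vmulq_f32(x, r)); /* trunc(x / len) */
    return vsubq_f32(x, vmulq_f32(vcvtq_f32_s32(q), len));
}
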

View File

@ -69,11 +69,12 @@ static inline void volk_gnsssdr_64f_accumulator_64f_u_avx(double* result, const
unsigned int i;
const double* aPtr = inputBuffer;
__VOLK_ATTR_ALIGNED(32) double tempBuffer[4];
__VOLK_ATTR_ALIGNED(32)
double tempBuffer[4];
__m256d accumulator = _mm256_setzero_pd();
__m256d aVal = _mm256_setzero_pd();
for(number = 0; number < sse_iters; number++)
for (number = 0; number < sse_iters; number++)
{
aVal = _mm256_loadu_pd(aPtr);
accumulator = _mm256_add_pd(accumulator, aVal);
@ -82,12 +83,12 @@ static inline void volk_gnsssdr_64f_accumulator_64f_u_avx(double* result, const
_mm256_storeu_pd((double*)tempBuffer, accumulator);
for(i = 0; i < 4; ++i)
for (i = 0; i < 4; ++i)
{
returnValue += tempBuffer[i];
}
for(i = 0; i < (num_points % 4); ++i)
for (i = 0; i < (num_points % 4); ++i)
{
returnValue += (*aPtr++);
}
@ -100,7 +101,7 @@ static inline void volk_gnsssdr_64f_accumulator_64f_u_avx(double* result, const
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
static inline void volk_gnsssdr_64f_accumulator_64f_u_sse3(double* result,const double* inputBuffer, unsigned int num_points)
static inline void volk_gnsssdr_64f_accumulator_64f_u_sse3(double* result, const double* inputBuffer, unsigned int num_points)
{
double returnValue = 0;
const unsigned int sse_iters = num_points / 2;
@ -108,11 +109,12 @@ static inline void volk_gnsssdr_64f_accumulator_64f_u_sse3(double* result,const
unsigned int i;
const double* aPtr = inputBuffer;
__VOLK_ATTR_ALIGNED(16) double tempBuffer[2];
__VOLK_ATTR_ALIGNED(16)
double tempBuffer[2];
__m128d accumulator = _mm_setzero_pd();
__m128d aVal = _mm_setzero_pd();
for(number = 0; number < sse_iters; number++)
for (number = 0; number < sse_iters; number++)
{
aVal = _mm_loadu_pd(aPtr);
accumulator = _mm_add_pd(accumulator, aVal);
@ -121,12 +123,12 @@ static inline void volk_gnsssdr_64f_accumulator_64f_u_sse3(double* result,const
_mm_storeu_pd((double*)tempBuffer, accumulator);
for(i = 0; i < 2; ++i)
for (i = 0; i < 2; ++i)
{
returnValue += tempBuffer[i];
}
for(i = 0; i < (num_points % 2); ++i)
for (i = 0; i < (num_points % 2); ++i)
{
returnValue += (*aPtr++);
}
@ -138,13 +140,13 @@ static inline void volk_gnsssdr_64f_accumulator_64f_u_sse3(double* result,const
#ifdef LV_HAVE_GENERIC
static inline void volk_gnsssdr_64f_accumulator_64f_generic(double* result,const double* inputBuffer, unsigned int num_points)
static inline void volk_gnsssdr_64f_accumulator_64f_generic(double* result, const double* inputBuffer, unsigned int num_points)
{
const double* aPtr = inputBuffer;
double returnValue = 0;
unsigned int number;
for(number = 0; number < num_points; number++)
for (number = 0; number < num_points; number++)
{
returnValue += (*aPtr++);
}
@ -156,7 +158,7 @@ static inline void volk_gnsssdr_64f_accumulator_64f_generic(double* result,const
#ifdef LV_HAVE_AVX
#include <immintrin.h>
static inline void volk_gnsssdr_64f_accumulator_64f_a_avx(double* result,const double* inputBuffer, unsigned int num_points)
static inline void volk_gnsssdr_64f_accumulator_64f_a_avx(double* result, const double* inputBuffer, unsigned int num_points)
{
double returnValue = 0;
const unsigned int sse_iters = num_points / 4;
@ -164,11 +166,12 @@ static inline void volk_gnsssdr_64f_accumulator_64f_a_avx(double* result,const d
unsigned int i;
const double* aPtr = inputBuffer;
__VOLK_ATTR_ALIGNED(32) double tempBuffer[4];
__VOLK_ATTR_ALIGNED(32)
double tempBuffer[4];
__m256d accumulator = _mm256_setzero_pd();
__m256d aVal = _mm256_setzero_pd();
for(number = 0; number < sse_iters; number++)
for (number = 0; number < sse_iters; number++)
{
aVal = _mm256_load_pd(aPtr);
accumulator = _mm256_add_pd(accumulator, aVal);
@ -177,12 +180,12 @@ static inline void volk_gnsssdr_64f_accumulator_64f_a_avx(double* result,const d
_mm256_store_pd((double*)tempBuffer, accumulator);
for(i = 0; i < 4; ++i)
for (i = 0; i < 4; ++i)
{
returnValue += tempBuffer[i];
}
for(i = 0; i < (num_points % 4); ++i)
for (i = 0; i < (num_points % 4); ++i)
{
returnValue += (*aPtr++);
}
@ -195,7 +198,7 @@ static inline void volk_gnsssdr_64f_accumulator_64f_a_avx(double* result,const d
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
static inline void volk_gnsssdr_64f_accumulator_64f_a_sse3(double* result,const double* inputBuffer, unsigned int num_points)
static inline void volk_gnsssdr_64f_accumulator_64f_a_sse3(double* result, const double* inputBuffer, unsigned int num_points)
{
double returnValue = 0;
const unsigned int sse_iters = num_points / 2;
@ -203,11 +206,12 @@ static inline void volk_gnsssdr_64f_accumulator_64f_a_sse3(double* result,const
unsigned int i;
const double* aPtr = inputBuffer;
__VOLK_ATTR_ALIGNED(16) double tempBuffer[2];
__VOLK_ATTR_ALIGNED(16)
double tempBuffer[2];
__m128d accumulator = _mm_setzero_pd();
__m128d aVal = _mm_setzero_pd();
for(number = 0; number < sse_iters; number++)
for (number = 0; number < sse_iters; number++)
{
aVal = _mm_load_pd(aPtr);
accumulator = _mm_add_pd(accumulator, aVal);
@ -216,12 +220,12 @@ static inline void volk_gnsssdr_64f_accumulator_64f_a_sse3(double* result,const
_mm_store_pd((double*)tempBuffer, accumulator);
for(i = 0; i < 2; ++i)
for (i = 0; i < 2; ++i)
{
returnValue += tempBuffer[i];
}
for(i = 0; i < (num_points % 2); ++i)
for (i = 0; i < (num_points % 2); ++i)
{
returnValue += (*aPtr++);
}
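
All four accumulator kernels in this file share one shape, untouched by the reformatting: accumulate SIMD partial sums, spill them to an aligned temporary buffer, reduce that buffer in scalar code, then sweep the scalar tail. A compact AVX sketch of the pattern (names are illustrative, and it assumes it is compiled inside the library so the __VOLK_ATTR_ALIGNED macro is available):

#include <immintrin.h>

static double accumulate_f64_sketch(const double* in, unsigned int num_points)
{
    __VOLK_ATTR_ALIGNED(32)
    double tempBuffer[4];
    double returnValue = 0;
    unsigned int number, i;
    __m256d accumulator = _mm256_setzero_pd();
    for (number = 0; number < num_points / 4; number++)
        {
            accumulator = _mm256_add_pd(accumulator, _mm256_loadu_pd(in + 4 * number));
        }
    _mm256_store_pd(tempBuffer, accumulator);
    for (i = 0; i < 4; ++i)
        {
            returnValue += tempBuffer[i]; /* horizontal reduction */
        }
    for (i = num_points - (num_points % 4); i < num_points; ++i)
        {
            returnValue += in[i]; /* scalar tail */
        }
    return returnValue;
}
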

View File

@ -70,11 +70,12 @@ static inline void volk_gnsssdr_8i_accumulator_s8i_u_sse3(char* result, const ch
unsigned int i;
const char* aPtr = inputBuffer;
__VOLK_ATTR_ALIGNED(16) char tempBuffer[16];
__VOLK_ATTR_ALIGNED(16)
char tempBuffer[16];
__m128i accumulator = _mm_setzero_si128();
__m128i aVal = _mm_setzero_si128();
for(number = 0; number < sse_iters; number++)
for (number = 0; number < sse_iters; number++)
{
aVal = _mm_lddqu_si128((__m128i*)aPtr);
accumulator = _mm_add_epi8(accumulator, aVal);
@ -82,12 +83,12 @@ static inline void volk_gnsssdr_8i_accumulator_s8i_u_sse3(char* result, const ch
}
_mm_storeu_si128((__m128i*)tempBuffer, accumulator);
for(i = 0; i < 16; ++i)
for (i = 0; i < 16; ++i)
{
returnValue += tempBuffer[i];
}
for(i = 0; i < (num_points % 16); ++i)
for (i = 0; i < (num_points % 16); ++i)
{
returnValue += (*aPtr++);
}
@ -104,7 +105,7 @@ static inline void volk_gnsssdr_8i_accumulator_s8i_generic(char* result, const c
const char* aPtr = inputBuffer;
char returnValue = 0;
unsigned int number;
for(number = 0;number < num_points; number++)
for (number = 0; number < num_points; number++)
{
returnValue += (*aPtr++);
}
@ -125,24 +126,25 @@ static inline void volk_gnsssdr_8i_accumulator_s8i_a_sse3(char* result, const ch
const char* aPtr = inputBuffer;
__VOLK_ATTR_ALIGNED(16) char tempBuffer[16];
__VOLK_ATTR_ALIGNED(16)
char tempBuffer[16];
__m128i accumulator = _mm_setzero_si128();
__m128i aVal = _mm_setzero_si128();
for(number = 0; number < sse_iters; number++)
for (number = 0; number < sse_iters; number++)
{
aVal = _mm_load_si128((__m128i*)aPtr);
accumulator = _mm_add_epi8(accumulator, aVal);
aPtr += 16;
}
_mm_store_si128((__m128i*)tempBuffer,accumulator);
_mm_store_si128((__m128i*)tempBuffer, accumulator);
for(i = 0; i < 16; ++i)
for (i = 0; i < 16; ++i)
{
returnValue += tempBuffer[i];
}
for(i = 0; i < (num_points % 16); ++i)
for (i = 0; i < (num_points % 16); ++i)
{
returnValue += (*aPtr++);
}
@ -164,24 +166,25 @@ static inline void volk_gnsssdr_8i_accumulator_s8i_a_avx2(char* result, const ch
const char* aPtr = inputBuffer;
__VOLK_ATTR_ALIGNED(32) char tempBuffer[32];
__VOLK_ATTR_ALIGNED(32)
char tempBuffer[32];
__m256i accumulator = _mm256_setzero_si256();
__m256i aVal = _mm256_setzero_si256();
for(number = 0; number < sse_iters; number++)
for (number = 0; number < sse_iters; number++)
{
aVal = _mm256_load_si256((__m256i*)aPtr);
accumulator = _mm256_add_epi8(accumulator, aVal);
aPtr += 32;
}
_mm256_store_si256((__m256i*)tempBuffer,accumulator);
_mm256_store_si256((__m256i*)tempBuffer, accumulator);
for(i = 0; i < 32; ++i)
for (i = 0; i < 32; ++i)
{
returnValue += tempBuffer[i];
}
for(i = 0; i < (num_points % 32); ++i)
for (i = 0; i < (num_points % 32); ++i)
{
returnValue += (*aPtr++);
}
@ -202,11 +205,12 @@ static inline void volk_gnsssdr_8i_accumulator_s8i_u_avx2(char* result, const ch
unsigned int i;
const char* aPtr = inputBuffer;
__VOLK_ATTR_ALIGNED(32) char tempBuffer[32];
__VOLK_ATTR_ALIGNED(32)
char tempBuffer[32];
__m256i accumulator = _mm256_setzero_si256();
__m256i aVal = _mm256_setzero_si256();
for(number = 0; number < sse_iters; number++)
for (number = 0; number < sse_iters; number++)
{
aVal = _mm256_lddqu_si256((__m256i*)aPtr);
accumulator = _mm256_add_epi8(accumulator, aVal);
@ -214,12 +218,12 @@ static inline void volk_gnsssdr_8i_accumulator_s8i_u_avx2(char* result, const ch
}
_mm256_storeu_si256((__m256i*)tempBuffer, accumulator);
for(i = 0; i < 32; ++i)
for (i = 0; i < 32; ++i)
{
returnValue += tempBuffer[i];
}
for(i = 0; i < (num_points % 32); ++i)
for (i = 0; i < (num_points % 32); ++i)
{
returnValue += (*aPtr++);
}

View File

@ -60,11 +60,11 @@
#ifdef LV_HAVE_AVX2
#include<immintrin.h>
#include <immintrin.h>
static inline void volk_gnsssdr_8i_index_max_16u_u_avx2(unsigned int* target, const char* src0, unsigned int num_points)
{
if(num_points > 0)
if (num_points > 0)
{
const unsigned int avx2_iters = num_points / 32;
unsigned int number;
@ -74,14 +74,15 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_avx2(unsigned int* target, co
char max = src0[0];
unsigned int index = 0;
unsigned int mask;
__VOLK_ATTR_ALIGNED(32) char currentValuesBuffer[32];
__VOLK_ATTR_ALIGNED(32)
char currentValuesBuffer[32];
__m256i maxValues, compareResults, currentValues;
maxValues = _mm256_set1_epi8(max);
for(number = 0; number < avx2_iters; number++)
for (number = 0; number < avx2_iters; number++)
{
currentValues = _mm256_loadu_si256((__m256i*)inputPtr);
currentValues = _mm256_loadu_si256((__m256i*)inputPtr);
compareResults = _mm256_cmpgt_epi8(maxValues, currentValues);
mask = _mm256_movemask_epi8(compareResults);
@ -94,7 +95,7 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_avx2(unsigned int* target, co
{
if ((mask & 1) == 1)
{
if(currentValuesBuffer[i] > max)
if (currentValuesBuffer[i] > max)
{
index = inputPtr - basePtr + i;
max = currentValuesBuffer[i];
@ -108,9 +109,9 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_avx2(unsigned int* target, co
inputPtr += 32;
}
for(i = 0; i<(num_points % 32); ++i)
for (i = 0; i < (num_points % 32); ++i)
{
if(src0[i] > max)
if (src0[i] > max)
{
index = i;
max = src0[i];
@ -128,7 +129,7 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_avx2(unsigned int* target, co
static inline void volk_gnsssdr_8i_index_max_16u_u_avx(unsigned int* target, const char* src0, unsigned int num_points)
{
if(num_points > 0)
if (num_points > 0)
{
const unsigned int sse_iters = num_points / 32;
unsigned int number;
@ -137,33 +138,34 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_avx(unsigned int* target, con
char* inputPtr = (char*)src0;
char max = src0[0];
unsigned int index = 0;
__VOLK_ATTR_ALIGNED(32) char currentValuesBuffer[32];
__VOLK_ATTR_ALIGNED(32)
char currentValuesBuffer[32];
__m256i ones, compareResults, currentValues;
__m128i compareResultslo, compareResultshi, maxValues, lo, hi;
ones = _mm256_set1_epi8(0xFF);
maxValues = _mm_set1_epi8(max);
for(number = 0; number < sse_iters; number++)
for (number = 0; number < sse_iters; number++)
{
currentValues = _mm256_lddqu_si256((__m256i*)inputPtr);
currentValues = _mm256_lddqu_si256((__m256i*)inputPtr);
lo = _mm256_castsi256_si128(currentValues);
hi = _mm256_extractf128_si256(currentValues,1);
hi = _mm256_extractf128_si256(currentValues, 1);
compareResultslo = _mm_cmpgt_epi8(maxValues, lo);
compareResultshi = _mm_cmpgt_epi8(maxValues, hi);
//compareResults = _mm256_set_m128i(compareResultshi , compareResultslo); //not defined in some versions of immintrin.h
compareResults = _mm256_insertf128_si256(_mm256_castsi128_si256(compareResultslo),(compareResultshi),1);
compareResults = _mm256_insertf128_si256(_mm256_castsi128_si256(compareResultslo), (compareResultshi), 1);
if (!_mm256_testc_si256(compareResults, ones))
{
_mm256_storeu_si256((__m256i*)&currentValuesBuffer, currentValues);
for(i = 0; i < 32; i++)
for (i = 0; i < 32; i++)
{
if(currentValuesBuffer[i] > max)
if (currentValuesBuffer[i] > max)
{
index = inputPtr - basePtr + i;
max = currentValuesBuffer[i];
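
The "//not defined in some versions of immintrin.h" comment above marks a real portability workaround: _mm256_set_m128i was missing from older headers, so the two 128-bit compare halves are joined with a cast plus insert instead. Isolated, the idiom is (helper name illustrative):

#include <immintrin.h>

/* Equivalent of _mm256_set_m128i(hi, lo) for headers that lack it. */
static inline __m256i join_m128i(__m128i hi, __m128i lo)
{
    return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
}
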
@ -175,9 +177,9 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_avx(unsigned int* target, con
inputPtr += 32;
}
for(i = 0; i<(num_points % 32); ++i)
for (i = 0; i < (num_points % 32); ++i)
{
if(src0[i] > max)
if (src0[i] > max)
{
index = i;
max = src0[i];
@ -195,7 +197,7 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_avx(unsigned int* target, con
static inline void volk_gnsssdr_8i_index_max_16u_u_sse4_1(unsigned int* target, const char* src0, unsigned int num_points)
{
if(num_points > 0)
if (num_points > 0)
{
const unsigned int sse_iters = num_points / 16;
unsigned int number;
@ -204,14 +206,15 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_sse4_1(unsigned int* target,
char* inputPtr = (char*)src0;
char max = src0[0];
unsigned int index = 0;
__VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16];
__VOLK_ATTR_ALIGNED(16)
char currentValuesBuffer[16];
__m128i maxValues, compareResults, currentValues;
maxValues = _mm_set1_epi8(max);
for(number = 0; number < sse_iters; number++)
for (number = 0; number < sse_iters; number++)
{
currentValues = _mm_lddqu_si128((__m128i*)inputPtr);
currentValues = _mm_lddqu_si128((__m128i*)inputPtr);
compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
@ -219,9 +222,9 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_sse4_1(unsigned int* target,
{
_mm_storeu_si128((__m128i*)&currentValuesBuffer, currentValues);
for(i = 0; i < 16; i++)
for (i = 0; i < 16; i++)
{
if(currentValuesBuffer[i] > max)
if (currentValuesBuffer[i] > max)
{
index = inputPtr - basePtr + i;
max = currentValuesBuffer[i];
@ -233,9 +236,9 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_sse4_1(unsigned int* target,
inputPtr += 16;
}
for(i = 0; i<(num_points % 16); ++i)
for (i = 0; i < (num_points % 16); ++i)
{
if(src0[i] > max)
if (src0[i] > max)
{
index = i;
max = src0[i];
@ -249,11 +252,11 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_sse4_1(unsigned int* target,
#ifdef LV_HAVE_SSE2
#include<emmintrin.h>
#include <emmintrin.h>
static inline void volk_gnsssdr_8i_index_max_16u_u_sse2(unsigned int* target, const char* src0, unsigned int num_points)
{
if(num_points > 0)
if (num_points > 0)
{
const unsigned int sse_iters = num_points / 16;
unsigned int number;
@ -263,14 +266,15 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_sse2(unsigned int* target, co
char max = src0[0];
unsigned int index = 0;
unsigned short mask;
__VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16];
__VOLK_ATTR_ALIGNED(16)
char currentValuesBuffer[16];
__m128i maxValues, compareResults, currentValues;
maxValues = _mm_set1_epi8(max);
for(number = 0; number < sse_iters; number++)
for (number = 0; number < sse_iters; number++)
{
currentValues = _mm_loadu_si128((__m128i*)inputPtr);
currentValues = _mm_loadu_si128((__m128i*)inputPtr);
compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
mask = _mm_movemask_epi8(compareResults);
@ -283,7 +287,7 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_sse2(unsigned int* target, co
{
if ((mask & 1) == 1)
{
if(currentValuesBuffer[i] > max)
if (currentValuesBuffer[i] > max)
{
index = inputPtr - basePtr + i;
max = currentValuesBuffer[i];
@ -297,9 +301,9 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_sse2(unsigned int* target, co
inputPtr += 16;
}
for(i = 0; i<(num_points % 16); ++i)
for (i = 0; i < (num_points % 16); ++i)
{
if(src0[i] > max)
if (src0[i] > max)
{
index = i;
max = src0[i];
@ -316,14 +320,14 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_sse2(unsigned int* target, co
static inline void volk_gnsssdr_8i_index_max_16u_generic(unsigned int* target, const char* src0, unsigned int num_points)
{
if(num_points > 0)
if (num_points > 0)
{
char max = src0[0];
unsigned int index = 0;
unsigned int i;
for(i = 1; i < num_points; ++i)
for (i = 1; i < num_points; ++i)
{
if(src0[i] > max)
if (src0[i] > max)
{
index = i;
max = src0[i];
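
The generic scan above is what the SIMD variants in this file vectorize: compare a block against the running maximum, use a movemask to detect whether any lane beats it, and only then rescan that block in scalar code (the kernels walk the mask bit by bit; the early-out below is a simplification). A trimmed SSE2 sketch of the idea, assuming num_points is a nonzero multiple of 16 and the library's __VOLK_ATTR_ALIGNED macro is in scope (names illustrative):

#include <emmintrin.h>

static unsigned int index_max_i8_sketch(const char* src0, unsigned int num_points)
{
    char max = src0[0];
    unsigned int index = 0, number, i;
    __VOLK_ATTR_ALIGNED(16)
    char buffer[16];
    __m128i maxValues = _mm_set1_epi8(max);
    for (number = 0; number < num_points / 16; number++)
        {
            __m128i currentValues = _mm_loadu_si128((const __m128i*)(src0 + number * 16));
            __m128i compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
            if (_mm_movemask_epi8(compareResults) != 0xFFFF) /* some lane >= max */
                {
                    _mm_store_si128((__m128i*)buffer, currentValues);
                    for (i = 0; i < 16; i++)
                        {
                            if (buffer[i] > max)
                                {
                                    max = buffer[i];
                                    index = number * 16 + i;
                                }
                        }
                    maxValues = _mm_set1_epi8(max);
                }
        }
    return index;
}
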
@ -337,11 +341,11 @@ static inline void volk_gnsssdr_8i_index_max_16u_generic(unsigned int* target, c
#ifdef LV_HAVE_AVX2
#include<immintrin.h>
#include <immintrin.h>
static inline void volk_gnsssdr_8i_index_max_16u_a_avx2(unsigned int* target, const char* src0, unsigned int num_points)
{
if(num_points > 0)
if (num_points > 0)
{
const unsigned int avx2_iters = num_points / 32;
unsigned int number;
@ -351,14 +355,15 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_avx2(unsigned int* target, co
char max = src0[0];
unsigned int index = 0;
unsigned int mask;
__VOLK_ATTR_ALIGNED(32) char currentValuesBuffer[32];
__VOLK_ATTR_ALIGNED(32)
char currentValuesBuffer[32];
__m256i maxValues, compareResults, currentValues;
maxValues = _mm256_set1_epi8(max);
for(number = 0; number < avx2_iters; number++)
for (number = 0; number < avx2_iters; number++)
{
currentValues = _mm256_load_si256((__m256i*)inputPtr);
currentValues = _mm256_load_si256((__m256i*)inputPtr);
compareResults = _mm256_cmpgt_epi8(maxValues, currentValues);
mask = _mm256_movemask_epi8(compareResults);
@ -371,7 +376,7 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_avx2(unsigned int* target, co
{
if ((mask & 1) == 1)
{
if(currentValuesBuffer[i] > max)
if (currentValuesBuffer[i] > max)
{
index = inputPtr - basePtr + i;
max = currentValuesBuffer[i];
@ -385,9 +390,9 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_avx2(unsigned int* target, co
inputPtr += 32;
}
for(i = 0; i<(num_points % 32); ++i)
for (i = 0; i < (num_points % 32); ++i)
{
if(src0[i] > max)
if (src0[i] > max)
{
index = i;
max = src0[i];
@ -405,7 +410,7 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_avx2(unsigned int* target, co
static inline void volk_gnsssdr_8i_index_max_16u_a_avx(unsigned int* target, const char* src0, unsigned int num_points)
{
if(num_points > 0)
if (num_points > 0)
{
const unsigned int sse_iters = num_points / 32;
unsigned int number;
@ -414,19 +419,20 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_avx(unsigned int* target, con
char* inputPtr = (char*)src0;
char max = src0[0];
unsigned int index = 0;
__VOLK_ATTR_ALIGNED(32) char currentValuesBuffer[32];
__VOLK_ATTR_ALIGNED(32)
char currentValuesBuffer[32];
__m256i ones, compareResults, currentValues;
__m128i compareResultslo, compareResultshi, maxValues, lo, hi;
ones = _mm256_set1_epi8(0xFF);
maxValues = _mm_set1_epi8(max);
for(number = 0; number < sse_iters; number++)
for (number = 0; number < sse_iters; number++)
{
currentValues = _mm256_load_si256((__m256i*)inputPtr);
currentValues = _mm256_load_si256((__m256i*)inputPtr);
lo = _mm256_castsi256_si128(currentValues);
hi = _mm256_extractf128_si256(currentValues,1);
hi = _mm256_extractf128_si256(currentValues, 1);
compareResultslo = _mm_cmpgt_epi8(maxValues, lo);
compareResultshi = _mm_cmpgt_epi8(maxValues, hi);
@ -438,9 +444,9 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_avx(unsigned int* target, con
{
_mm256_store_si256((__m256i*)&currentValuesBuffer, currentValues);
for(i = 0; i < 32; i++)
for (i = 0; i < 32; i++)
{
if(currentValuesBuffer[i] > max)
if (currentValuesBuffer[i] > max)
{
index = inputPtr - basePtr + i;
max = currentValuesBuffer[i];
@ -452,9 +458,9 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_avx(unsigned int* target, con
inputPtr += 32;
}
for(i = 0; i<(num_points % 32); ++i)
for (i = 0; i < (num_points % 32); ++i)
{
if(src0[i] > max)
if (src0[i] > max)
{
index = i;
max = src0[i];
@ -472,7 +478,7 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_avx(unsigned int* target, con
static inline void volk_gnsssdr_8i_index_max_16u_a_sse4_1(unsigned int* target, const char* src0, unsigned int num_points)
{
if(num_points > 0)
if (num_points > 0)
{
const unsigned int sse_iters = num_points / 16;
unsigned int number;
@ -481,14 +487,15 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_sse4_1(unsigned int* target,
char* inputPtr = (char*)src0;
char max = src0[0];
unsigned int index = 0;
__VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16];
__VOLK_ATTR_ALIGNED(16)
char currentValuesBuffer[16];
__m128i maxValues, compareResults, currentValues;
maxValues = _mm_set1_epi8(max);
for(number = 0; number < sse_iters; number++)
for (number = 0; number < sse_iters; number++)
{
currentValues = _mm_load_si128((__m128i*)inputPtr);
currentValues = _mm_load_si128((__m128i*)inputPtr);
compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
@ -496,9 +503,9 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_sse4_1(unsigned int* target,
{
_mm_store_si128((__m128i*)&currentValuesBuffer, currentValues);
for(i = 0; i < 16; i++)
for (i = 0; i < 16; i++)
{
if(currentValuesBuffer[i] > max)
if (currentValuesBuffer[i] > max)
{
index = inputPtr - basePtr + i;
max = currentValuesBuffer[i];
@ -510,9 +517,9 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_sse4_1(unsigned int* target,
inputPtr += 16;
}
for(i = 0; i<(num_points % 16); ++i)
for (i = 0; i < (num_points % 16); ++i)
{
if(src0[i] > max)
if (src0[i] > max)
{
index = i;
max = src0[i];
@ -530,7 +537,7 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_sse4_1(unsigned int* target,
static inline void volk_gnsssdr_8i_index_max_16u_a_sse2(unsigned int* target, const char* src0, unsigned int num_points)
{
if(num_points > 0)
if (num_points > 0)
{
const unsigned int sse_iters = num_points / 16;
unsigned int number;
@ -540,14 +547,15 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_sse2(unsigned int* target, co
char max = src0[0];
unsigned int index = 0;
unsigned short mask;
__VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16];
__VOLK_ATTR_ALIGNED(16)
char currentValuesBuffer[16];
__m128i maxValues, compareResults, currentValues;
maxValues = _mm_set1_epi8(max);
for(number = 0; number < sse_iters; number++)
for (number = 0; number < sse_iters; number++)
{
currentValues = _mm_load_si128((__m128i*)inputPtr);
currentValues = _mm_load_si128((__m128i*)inputPtr);
compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
mask = _mm_movemask_epi8(compareResults);
@ -560,7 +568,7 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_sse2(unsigned int* target, co
{
if ((mask & 1) == 1)
{
if(currentValuesBuffer[i] > max)
if (currentValuesBuffer[i] > max)
{
index = inputPtr - basePtr + i;
max = currentValuesBuffer[i];
@ -574,9 +582,9 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_sse2(unsigned int* target, co
inputPtr += 16;
}
for(i = 0; i<(num_points % 16); ++i)
for (i = 0; i < (num_points % 16); ++i)
{
if(src0[i] > max)
if (src0[i] > max)
{
index = i;
max = src0[i];

View File

@ -63,21 +63,22 @@
static inline void volk_gnsssdr_8i_max_s8i_u_avx2(char* target, const char* src0, unsigned int num_points)
{
if(num_points > 0)
if (num_points > 0)
{
const unsigned int avx_iters = num_points / 32;
unsigned int number;
unsigned int i;
char* inputPtr = (char*)src0;
char max = src0[0];
__VOLK_ATTR_ALIGNED(32) char maxValuesBuffer[32];
__VOLK_ATTR_ALIGNED(32)
char maxValuesBuffer[32];
__m256i maxValues, compareResults, currentValues;
maxValues = _mm256_set1_epi8(max);
for(number = 0; number < avx_iters; number++)
for (number = 0; number < avx_iters; number++)
{
currentValues = _mm256_loadu_si256((__m256i*)inputPtr);
currentValues = _mm256_loadu_si256((__m256i*)inputPtr);
compareResults = _mm256_max_epi8(maxValues, currentValues);
maxValues = compareResults;
inputPtr += 32;
@ -85,17 +86,17 @@ static inline void volk_gnsssdr_8i_max_s8i_u_avx2(char* target, const char* src0
_mm256_storeu_si256((__m256i*)maxValuesBuffer, maxValues);
for(i = 0; i < 32; ++i)
for (i = 0; i < 32; ++i)
{
if(maxValuesBuffer[i] > max)
if (maxValuesBuffer[i] > max)
{
max = maxValuesBuffer[i];
}
}
for(i = avx_iters * 32; i < num_points; ++i)
for (i = avx_iters * 32; i < num_points; ++i)
{
if(src0[i] > max)
if (src0[i] > max)
{
max = src0[i];
}
@ -112,21 +113,22 @@ static inline void volk_gnsssdr_8i_max_s8i_u_avx2(char* target, const char* src0
static inline void volk_gnsssdr_8i_max_s8i_u_sse4_1(char* target, const char* src0, unsigned int num_points)
{
if(num_points > 0)
if (num_points > 0)
{
const unsigned int sse_iters = num_points / 16;
unsigned int number;
unsigned int i;
char* inputPtr = (char*)src0;
char max = src0[0];
__VOLK_ATTR_ALIGNED(16) char maxValuesBuffer[16];
__VOLK_ATTR_ALIGNED(16)
char maxValuesBuffer[16];
__m128i maxValues, compareResults, currentValues;
maxValues = _mm_set1_epi8(max);
for(number = 0; number < sse_iters; number++)
for (number = 0; number < sse_iters; number++)
{
currentValues = _mm_loadu_si128((__m128i*)inputPtr);
currentValues = _mm_loadu_si128((__m128i*)inputPtr);
compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
maxValues = _mm_blendv_epi8(currentValues, maxValues, compareResults);
inputPtr += 16;
@ -134,17 +136,17 @@ static inline void volk_gnsssdr_8i_max_s8i_u_sse4_1(char* target, const char* sr
_mm_storeu_si128((__m128i*)maxValuesBuffer, maxValues);
for(i = 0; i < 16; ++i)
for (i = 0; i < 16; ++i)
{
if(maxValuesBuffer[i] > max)
if (maxValuesBuffer[i] > max)
{
max = maxValuesBuffer[i];
}
}
for(i = sse_iters * 16; i < num_points; ++i)
for (i = sse_iters * 16; i < num_points; ++i)
{
if(src0[i] > max)
if (src0[i] > max)
{
max = src0[i];
}
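
Here the SSE4.1 path builds a signed-byte maximum from cmpgt plus blendv, while the AVX2 path below (see its commented-out blendv line) shows the same select collapsing into a single _mm256_max_epi8. The equivalence, isolated (helper name illustrative):

#include <smmintrin.h>

/* max(a, b) for signed bytes via compare + select, as in the SSE4.1
   kernels; with AVX2 this whole helper is one _mm256_max_epi8. */
static inline __m128i max_epi8_via_blendv(__m128i a, __m128i b)
{
    __m128i a_greater = _mm_cmpgt_epi8(a, b); /* 0xFF where a > b */
    return _mm_blendv_epi8(b, a, a_greater);  /* keep a where a > b */
}
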
@ -157,11 +159,11 @@ static inline void volk_gnsssdr_8i_max_s8i_u_sse4_1(char* target, const char* sr
#ifdef LV_HAVE_SSE2
#include<emmintrin.h>
#include <emmintrin.h>
static inline void volk_gnsssdr_8i_max_s8i_u_sse2(char* target, const char* src0, unsigned int num_points)
{
if(num_points > 0)
if (num_points > 0)
{
const unsigned int sse_iters = num_points / 16;
unsigned int number;
@ -169,14 +171,15 @@ static inline void volk_gnsssdr_8i_max_s8i_u_sse2(char* target, const char* src0
char* inputPtr = (char*)src0;
char max = src0[0];
unsigned short mask;
__VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16];
__VOLK_ATTR_ALIGNED(16)
char currentValuesBuffer[16];
__m128i maxValues, compareResults, currentValues;
maxValues = _mm_set1_epi8(max);
for(number = 0; number < sse_iters; number++)
for (number = 0; number < sse_iters; number++)
{
currentValues = _mm_loadu_si128((__m128i*)inputPtr);
currentValues = _mm_loadu_si128((__m128i*)inputPtr);
compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
mask = _mm_movemask_epi8(compareResults);
@ -189,7 +192,7 @@ static inline void volk_gnsssdr_8i_max_s8i_u_sse2(char* target, const char* src0
{
if ((mask & 1) == 1)
{
if(currentValuesBuffer[i] > max)
if (currentValuesBuffer[i] > max)
{
max = currentValuesBuffer[i];
}
@ -202,9 +205,9 @@ static inline void volk_gnsssdr_8i_max_s8i_u_sse2(char* target, const char* src0
inputPtr += 16;
}
for(i = sse_iters * 16; i < num_points; ++i)
for (i = sse_iters * 16; i < num_points; ++i)
{
if(src0[i] > max)
if (src0[i] > max)
{
max = src0[i];
}
@ -220,13 +223,13 @@ static inline void volk_gnsssdr_8i_max_s8i_u_sse2(char* target, const char* src0
static inline void volk_gnsssdr_8i_max_s8i_generic(char* target, const char* src0, unsigned int num_points)
{
if(num_points > 0)
if (num_points > 0)
{
char max = src0[0];
unsigned int i;
for(i = 1; i < num_points; ++i)
for (i = 1; i < num_points; ++i)
{
if(src0[i] > max)
if (src0[i] > max)
{
max = src0[i];
}
@ -243,21 +246,22 @@ static inline void volk_gnsssdr_8i_max_s8i_generic(char* target, const char* src
static inline void volk_gnsssdr_8i_max_s8i_a_sse4_1(char* target, const char* src0, unsigned int num_points)
{
if(num_points > 0)
if (num_points > 0)
{
const unsigned int sse_iters = num_points / 16;
unsigned int number;
unsigned int i;
char* inputPtr = (char*)src0;
char max = src0[0];
__VOLK_ATTR_ALIGNED(16) char maxValuesBuffer[16];
__VOLK_ATTR_ALIGNED(16)
char maxValuesBuffer[16];
__m128i maxValues, compareResults, currentValues;
maxValues = _mm_set1_epi8(max);
for(number = 0; number < sse_iters; number++)
for (number = 0; number < sse_iters; number++)
{
currentValues = _mm_load_si128((__m128i*)inputPtr);
currentValues = _mm_load_si128((__m128i*)inputPtr);
compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
maxValues = _mm_blendv_epi8(currentValues, maxValues, compareResults);
inputPtr += 16;
@ -265,17 +269,17 @@ static inline void volk_gnsssdr_8i_max_s8i_a_sse4_1(char* target, const char* sr
_mm_store_si128((__m128i*)maxValuesBuffer, maxValues);
for(i = 0; i < 16; ++i)
for (i = 0; i < 16; ++i)
{
if(maxValuesBuffer[i] > max)
if (maxValuesBuffer[i] > max)
{
max = maxValuesBuffer[i];
}
}
for(i = sse_iters * 16; i < num_points; ++i)
for (i = sse_iters * 16; i < num_points; ++i)
{
if(src0[i] > max)
if (src0[i] > max)
{
max = src0[i];
}
@ -292,39 +296,40 @@ static inline void volk_gnsssdr_8i_max_s8i_a_sse4_1(char* target, const char* sr
static inline void volk_gnsssdr_8i_max_s8i_a_avx2(char* target, const char* src0, unsigned int num_points)
{
if(num_points > 0)
if (num_points > 0)
{
const unsigned int avx_iters = num_points / 32;
unsigned int number;
unsigned int i;
char* inputPtr = (char*)src0;
char max = src0[0];
__VOLK_ATTR_ALIGNED(32) char maxValuesBuffer[32];
__VOLK_ATTR_ALIGNED(32)
char maxValuesBuffer[32];
__m256i maxValues, compareResults, currentValues;
maxValues = _mm256_set1_epi8(max);
for(number = 0; number < avx_iters; number++)
for (number = 0; number < avx_iters; number++)
{
currentValues = _mm256_load_si256((__m256i*)inputPtr);
currentValues = _mm256_load_si256((__m256i*)inputPtr);
compareResults = _mm256_max_epi8(maxValues, currentValues);
maxValues = compareResults; //_mm256_blendv_epi8(currentValues, maxValues, compareResults);
maxValues = compareResults; //_mm256_blendv_epi8(currentValues, maxValues, compareResults);
inputPtr += 32;
}
_mm256_store_si256((__m256i*)maxValuesBuffer, maxValues);
for(i = 0; i < 32; ++i)
for (i = 0; i < 32; ++i)
{
if(maxValuesBuffer[i] > max)
if (maxValuesBuffer[i] > max)
{
max = maxValuesBuffer[i];
}
}
for(i = avx_iters * 32; i < num_points; ++i)
for (i = avx_iters * 32; i < num_points; ++i)
{
if(src0[i] > max)
if (src0[i] > max)
{
max = src0[i];
}
@ -341,7 +346,7 @@ static inline void volk_gnsssdr_8i_max_s8i_a_avx2(char* target, const char* src0
static inline void volk_gnsssdr_8i_max_s8i_a_sse2(char* target, const char* src0, unsigned int num_points)
{
if(num_points > 0)
if (num_points > 0)
{
const unsigned int sse_iters = num_points / 16;
unsigned int number;
@ -349,14 +354,15 @@ static inline void volk_gnsssdr_8i_max_s8i_a_sse2(char* target, const char* src0
char* inputPtr = (char*)src0;
char max = src0[0];
unsigned short mask;
__VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16];
__VOLK_ATTR_ALIGNED(16)
char currentValuesBuffer[16];
__m128i maxValues, compareResults, currentValues;
maxValues = _mm_set1_epi8(max);
for(number = 0; number < sse_iters; number++)
for (number = 0; number < sse_iters; number++)
{
currentValues = _mm_load_si128((__m128i*)inputPtr);
currentValues = _mm_load_si128((__m128i*)inputPtr);
compareResults = _mm_cmpgt_epi8(maxValues, currentValues);
mask = _mm_movemask_epi8(compareResults);
@ -369,7 +375,7 @@ static inline void volk_gnsssdr_8i_max_s8i_a_sse2(char* target, const char* src0
{
if ((mask & 1) == 1)
{
if(currentValuesBuffer[i] > max)
if (currentValuesBuffer[i] > max)
{
max = currentValuesBuffer[i];
}
@ -382,9 +388,9 @@ static inline void volk_gnsssdr_8i_max_s8i_a_sse2(char* target, const char* src0
inputPtr += 16;
}
for(i = sse_iters * 16; i < num_points; ++i)
for (i = sse_iters * 16; i < num_points; ++i)
{
if(src0[i] > max)
if (src0[i] > max)
{
max = src0[i];
}

View File

@ -72,21 +72,21 @@ static inline void volk_gnsssdr_8i_x2_add_8i_u_sse2(char* cVector, const char* a
__m128i aVal, bVal, cVal;
for(number = 0; number < sse_iters; number++)
for (number = 0; number < sse_iters; number++)
{
aVal = _mm_loadu_si128((__m128i*)aPtr);
bVal = _mm_loadu_si128((__m128i*)bPtr);
cVal = _mm_add_epi8(aVal, bVal);
_mm_storeu_si128((__m128i*)cPtr, cVal); // Store the results back into the C container
_mm_storeu_si128((__m128i*)cPtr, cVal); // Store the results back into the C container
aPtr += 16;
bPtr += 16;
cPtr += 16;
}
for(i = sse_iters * 16; i < num_points; ++i)
for (i = sse_iters * 16; i < num_points; ++i)
{
*cPtr++ = (*aPtr++) + (*bPtr++);
}
@ -108,21 +108,21 @@ static inline void volk_gnsssdr_8i_x2_add_8i_u_avx2(char* cVector, const char* a
__m256i aVal, bVal, cVal;
for(number = 0; number < avx_iters; number++)
for (number = 0; number < avx_iters; number++)
{
aVal = _mm256_loadu_si256((__m256i*)aPtr);
bVal = _mm256_loadu_si256((__m256i*)bPtr);
cVal = _mm256_add_epi8(aVal, bVal);
_mm256_storeu_si256((__m256i*)cPtr, cVal); // Store the results back into the C container
_mm256_storeu_si256((__m256i*)cPtr, cVal); // Store the results back into the C container
aPtr += 32;
bPtr += 32;
cPtr += 32;
}
for(i = avx_iters * 32; i < num_points; ++i)
for (i = avx_iters * 32; i < num_points; ++i)
{
*cPtr++ = (*aPtr++) + (*bPtr++);
}
@ -139,7 +139,7 @@ static inline void volk_gnsssdr_8i_x2_add_8i_generic(char* cVector, const char*
const char* bPtr = bVector;
unsigned int number;
for(number = 0; number < num_points; number++)
for (number = 0; number < num_points; number++)
{
*cPtr++ = (*aPtr++) + (*bPtr++);
}
@ -161,21 +161,21 @@ static inline void volk_gnsssdr_8i_x2_add_8i_a_sse2(char* cVector, const char* a
__m128i aVal, bVal, cVal;
for(number = 0; number < sse_iters; number++)
for (number = 0; number < sse_iters; number++)
{
aVal = _mm_load_si128((__m128i*)aPtr);
bVal = _mm_load_si128((__m128i*)bPtr);
cVal = _mm_add_epi8(aVal, bVal);
_mm_store_si128((__m128i*)cPtr, cVal); // Store the results back into the C container
_mm_store_si128((__m128i*)cPtr, cVal); // Store the results back into the C container
aPtr += 16;
bPtr += 16;
cPtr += 16;
}
for(i = sse_iters * 16; i < num_points; ++i)
for (i = sse_iters * 16; i < num_points; ++i)
{
*cPtr++ = (*aPtr++) + (*bPtr++);
}
@ -197,21 +197,21 @@ static inline void volk_gnsssdr_8i_x2_add_8i_a_avx2(char* cVector, const char* a
__m256i aVal, bVal, cVal;
for(number = 0; number < avx_iters; number++)
for (number = 0; number < avx_iters; number++)
{
aVal = _mm256_load_si256((__m256i*)aPtr);
bVal = _mm256_load_si256((__m256i*)bPtr);
cVal = _mm256_add_epi8(aVal, bVal);
_mm256_store_si256((__m256i*)cPtr, cVal); // Store the results back into the C container
_mm256_store_si256((__m256i*)cPtr, cVal); // Store the results back into the C container
aPtr += 32;
bPtr += 32;
cPtr += 32;
}
for(i = avx_iters * 32; i < num_points; ++i)
for (i = avx_iters * 32; i < num_points; ++i)
{
*cPtr++ = (*aPtr++) + (*bPtr++);
}

View File

@ -111,10 +111,10 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_u_avx(lv_8sc_t* cVector, const
tmp = _mm256_xor_ps(tmp, conjugator1);
tmp128lo = _mm256_castsi256_si128(_mm256_castps_si256(tmp));
tmp128lo = _mm_add_epi8(tmp128lo, conjugator2);
tmp128hi = _mm256_extractf128_si256(_mm256_castps_si256(tmp),1);
tmp128hi = _mm256_extractf128_si256(_mm256_castps_si256(tmp), 1);
tmp128hi = _mm_add_epi8(tmp128hi, conjugator2);
//tmp = _mm256_set_m128i(tmp128hi , tmp128lo); //not defined in some versions of immintrin.h
tmp = _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(tmp128lo),(tmp128hi),1));
tmp = _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(tmp128lo), (tmp128hi), 1));
_mm256_storeu_ps((float*)c, tmp);
a += 16;
@ -155,7 +155,6 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_u_ssse3(lv_8sc_t* cVector, con
{
*c++ = lv_conj(*a++);
}
}
#endif /* LV_HAVE_SSSE3 */
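
The AVX conjugate kernel above negates only the imaginary bytes, and it does so in two's complement: conjugator1 XOR-flips the odd (imaginary) lanes and conjugator2 adds 1 to them, since -x == (x ^ 0xFF) + 1 for an int8 value. The scalar equivalent (function name illustrative):

/* Conjugate an interleaved I/Q int8 stream: flip all bits of each
   imaginary byte, then add one, i.e. two's complement negation. */
static inline void conjugate_iq_i8(signed char* iq, unsigned int num_points)
{
    unsigned int n;
    for (n = 0; n < num_points; n++)
        {
            iq[2 * n + 1] = (signed char)((iq[2 * n + 1] ^ 0xFF) + 1);
        }
}
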
@ -188,7 +187,6 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_u_sse3(lv_8sc_t* cVector, cons
{
*c++ = lv_conj(*a++);
}
}
#endif /* LV_HAVE_SSE3 */
@ -201,7 +199,7 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_generic(lv_8sc_t* cVector, con
const lv_8sc_t* aPtr = aVector;
unsigned int number;
for(number = 0; number < num_points; number++)
for (number = 0; number < num_points; number++)
{
*cPtr++ = lv_conj(*aPtr++);
}
@ -230,10 +228,10 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_a_avx(lv_8sc_t* cVector, const
tmp = _mm256_xor_ps(tmp, conjugator1);
tmp128lo = _mm256_castsi256_si128(_mm256_castps_si256(tmp));
tmp128lo = _mm_add_epi8(tmp128lo, conjugator2);
tmp128hi = _mm256_extractf128_si256(_mm256_castps_si256(tmp),1);
tmp128hi = _mm256_extractf128_si256(_mm256_castps_si256(tmp), 1);
tmp128hi = _mm_add_epi8(tmp128hi, conjugator2);
//tmp = _mm256_set_m128i(tmp128hi , tmp128lo); //not defined in some versions of immintrin.h
tmp = _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(tmp128lo),(tmp128hi),1));
tmp = _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(tmp128lo), (tmp128hi), 1));
_mm256_store_ps((float*)c, tmp);
a += 16;
@ -336,7 +334,6 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_a_sse3(lv_8sc_t* cVector, cons
{
*c++ = lv_conj(*a++);
}
}
#endif /* LV_HAVE_SSE3 */

View File

@ -78,23 +78,23 @@ static inline void volk_gnsssdr_8ic_magnitude_squared_8i_u_sse3(char* magnitudeV
maska = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
maskb = _mm_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
for(number = 0; number < sse_iters; number++)
for (number = 0; number < sse_iters; number++)
{
avector = _mm_lddqu_si128((__m128i*)complexVectorPtr);
avectorlo = _mm_unpacklo_epi8 (avector, zero);
avectorhi = _mm_unpackhi_epi8 (avector, zero);
avectorlomult = _mm_mullo_epi16 (avectorlo, avectorlo);
avectorhimult = _mm_mullo_epi16 (avectorhi, avectorhi);
aadded = _mm_hadd_epi16 (avectorlomult, avectorhimult);
avectorlo = _mm_unpacklo_epi8(avector, zero);
avectorhi = _mm_unpackhi_epi8(avector, zero);
avectorlomult = _mm_mullo_epi16(avectorlo, avectorlo);
avectorhimult = _mm_mullo_epi16(avectorhi, avectorhi);
aadded = _mm_hadd_epi16(avectorlomult, avectorhimult);
complexVectorPtr += 16;
bvector = _mm_lddqu_si128((__m128i*)complexVectorPtr);
bvectorlo = _mm_unpacklo_epi8 (bvector, zero);
bvectorhi = _mm_unpackhi_epi8 (bvector, zero);
bvectorlomult = _mm_mullo_epi16 (bvectorlo, bvectorlo);
bvectorhimult = _mm_mullo_epi16 (bvectorhi, bvectorhi);
badded = _mm_hadd_epi16 (bvectorlomult, bvectorhimult);
bvectorlo = _mm_unpacklo_epi8(bvector, zero);
bvectorhi = _mm_unpackhi_epi8(bvector, zero);
bvectorlomult = _mm_mullo_epi16(bvectorlo, bvectorlo);
bvectorhimult = _mm_mullo_epi16(bvectorhi, bvectorhi);
badded = _mm_hadd_epi16(bvectorlomult, bvectorhimult);
complexVectorPtr += 16;
@ -162,11 +162,11 @@ static inline void volk_gnsssdr_8ic_magnitude_squared_8i_generic(char* magnitude
const char* complexVectorPtr = (char*)complexVector;
char* magnitudeVectorPtr = magnitudeVector;
unsigned int number;
for(number = 0; number < num_points; number++)
for (number = 0; number < num_points; number++)
{
const char real = *complexVectorPtr++;
const char imag = *complexVectorPtr++;
*magnitudeVectorPtr++ = (real*real) + (imag*imag);
*magnitudeVectorPtr++ = (real * real) + (imag * imag);
}
}
#endif /* LV_HAVE_GENERIC */
@ -192,23 +192,23 @@ static inline void volk_gnsssdr_8ic_magnitude_squared_8i_a_sse3(char* magnitudeV
maska = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
maskb = _mm_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
for(number = 0; number < sse_iters; number++)
for (number = 0; number < sse_iters; number++)
{
avector = _mm_load_si128((__m128i*)complexVectorPtr);
avectorlo = _mm_unpacklo_epi8 (avector, zero);
avectorhi = _mm_unpackhi_epi8 (avector, zero);
avectorlomult = _mm_mullo_epi16 (avectorlo, avectorlo);
avectorhimult = _mm_mullo_epi16 (avectorhi, avectorhi);
aadded = _mm_hadd_epi16 (avectorlomult, avectorhimult);
avectorlo = _mm_unpacklo_epi8(avector, zero);
avectorhi = _mm_unpackhi_epi8(avector, zero);
avectorlomult = _mm_mullo_epi16(avectorlo, avectorlo);
avectorhimult = _mm_mullo_epi16(avectorhi, avectorhi);
aadded = _mm_hadd_epi16(avectorlomult, avectorhimult);
complexVectorPtr += 16;
bvector = _mm_load_si128((__m128i*)complexVectorPtr);
bvectorlo = _mm_unpacklo_epi8 (bvector, zero);
bvectorhi = _mm_unpackhi_epi8 (bvector, zero);
bvectorlomult = _mm_mullo_epi16 (bvectorlo, bvectorlo);
bvectorhimult = _mm_mullo_epi16 (bvectorhi, bvectorhi);
badded = _mm_hadd_epi16 (bvectorlomult, bvectorhimult);
bvectorlo = _mm_unpacklo_epi8(bvector, zero);
bvectorhi = _mm_unpackhi_epi8(bvector, zero);
bvectorlomult = _mm_mullo_epi16(bvectorlo, bvectorlo);
bvectorhimult = _mm_mullo_epi16(bvectorhi, bvectorhi);
badded = _mm_hadd_epi16(bvectorlomult, bvectorhimult);
complexVectorPtr += 16;

View File

@ -80,7 +80,7 @@ static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_u_sse3(lv_8sc_t* cVector,
imagy = _mm_and_si128(imagy, mult1);
realy = _mm_and_si128(y, mult1);
for(; number < sse_iters; number++)
for (; number < sse_iters; number++)
{
x = _mm_lddqu_si128((__m128i*)a);
@ -111,7 +111,6 @@ static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_u_sse3(lv_8sc_t* cVector,
{
*c++ = (*a++) * scalar;
}
}
#endif /* LV_HAVE_SSE3 */
@ -173,7 +172,7 @@ static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_a_sse3(lv_8sc_t* cVector,
imagy = _mm_and_si128(imagy, mult1);
realy = _mm_and_si128(y, mult1);
for(; number < sse_iters; number++)
for (; number < sse_iters; number++)
{
x = _mm_load_si128((__m128i*)a);
@ -204,7 +203,6 @@ static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_a_sse3(lv_8sc_t* cVector,
{
*c++ = (*a++) * scalar;
}
}
#endif /* LV_HAVE_SSE3 */

View File

@ -75,17 +75,17 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_generic(lv_8sc_t* result, co
*cPtr += (*aPtr++) * (*bPtr++);
}*/
char * res = (char*) result;
char * in = (char*) in_a;
char * tp = (char*) in_b;
unsigned int n_2_ccomplex_blocks = num_points/2;
char* res = (char*)result;
char* in = (char*)in_a;
char* tp = (char*)in_b;
unsigned int n_2_ccomplex_blocks = num_points / 2;
unsigned int isodd = num_points & 1;
char sum0[2] = {0,0};
char sum1[2] = {0,0};
char sum0[2] = {0, 0};
char sum1[2] = {0, 0};
unsigned int i = 0;
for(i = 0; i < n_2_ccomplex_blocks; ++i)
for (i = 0; i < n_2_ccomplex_blocks; ++i)
{
sum0[0] += in[0] * tp[0] - in[1] * tp[1];
sum0[1] += in[0] * tp[1] + in[1] * tp[0];
@ -100,7 +100,7 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_generic(lv_8sc_t* result, co
res[1] = sum0[1] + sum1[1];
// Cleanup if we had an odd number of points
for(i = 0; i < isodd; ++i)
for (i = 0; i < isodd; ++i)
{
*result += in_a[num_points - 1] * in_b[num_points - 1];
}
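
The generic dot product above is a two-way unrolled complex multiply-accumulate: each point contributes (a + bi)(c + di) = (ac - bd) + (ad + bc)i, split across the independent partial sums sum0 and sum1 and merged at the end. The un-unrolled form, for reference (name illustrative):

/* One accumulator, no unrolling: the arithmetic the sum0/sum1 pair
   above distributes over two points per iteration. */
static void cdot_i8_sketch(signed char res[2], const signed char* in,
    const signed char* tp, unsigned int num_points)
{
    signed char re = 0, im = 0;
    unsigned int i;
    for (i = 0; i < num_points; ++i)
        {
            re += in[2 * i] * tp[2 * i] - in[2 * i + 1] * tp[2 * i + 1];
            im += in[2 * i] * tp[2 * i + 1] + in[2 * i + 1] * tp[2 * i];
        }
    res[0] = re;
    res[1] = im;
}
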
@ -115,13 +115,13 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_generic(lv_8sc_t* result, co
static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse2(lv_8sc_t* result, const lv_8sc_t* in_a, const lv_8sc_t* in_b, unsigned int num_points)
{
lv_8sc_t dotProduct;
memset(&dotProduct, 0x0, 2*sizeof(char));
memset(&dotProduct, 0x0, 2 * sizeof(char));
unsigned int number;
unsigned int i;
const lv_8sc_t* a = in_a;
const lv_8sc_t* b = in_b;
const unsigned int sse_iters = num_points/8;
const unsigned int sse_iters = num_points / 8;
if (sse_iters > 0)
{
@ -131,7 +131,7 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse2(lv_8sc_t* result, con
realcacc = _mm_setzero_si128();
imagcacc = _mm_setzero_si128();
for(number = 0; number < sse_iters; number++)
for (number = 0; number < sse_iters; number++)
{
x = _mm_loadu_si128((__m128i*)a);
y = _mm_loadu_si128((__m128i*)b);
@ -165,9 +165,10 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse2(lv_8sc_t* result, con
totalc = _mm_or_si128(realcacc, imagcacc);
__VOLK_ATTR_ALIGNED(16) lv_8sc_t dotProductVector[8];
__VOLK_ATTR_ALIGNED(16)
lv_8sc_t dotProductVector[8];
_mm_storeu_si128((__m128i*)dotProductVector, totalc); // Store the results back into the dot product vector
_mm_storeu_si128((__m128i*)dotProductVector, totalc); // Store the results back into the dot product vector
for (i = 0; i < 8; ++i)
{
@ -192,13 +193,13 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse2(lv_8sc_t* result, con
static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse4_1(lv_8sc_t* result, const lv_8sc_t* in_a, const lv_8sc_t* in_b, unsigned int num_points)
{
lv_8sc_t dotProduct;
memset(&dotProduct, 0x0, 2*sizeof(char));
memset(&dotProduct, 0x0, 2 * sizeof(char));
unsigned int number;
unsigned int i;
const lv_8sc_t* a = in_a;
const lv_8sc_t* b = in_b;
const unsigned int sse_iters = num_points/8;
const unsigned int sse_iters = num_points / 8;
if (sse_iters > 0)
{
@ -208,7 +209,7 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse4_1(lv_8sc_t* result, c
realcacc = _mm_setzero_si128();
imagcacc = _mm_setzero_si128();
for(number = 0; number < sse_iters; number++)
for (number = 0; number < sse_iters; number++)
{
x = _mm_lddqu_si128((__m128i*)a);
y = _mm_lddqu_si128((__m128i*)b);
@ -236,13 +237,14 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse4_1(lv_8sc_t* result, c
b += 8;
}
imagcacc = _mm_slli_si128 (imagcacc, 1);
imagcacc = _mm_slli_si128(imagcacc, 1);
totalc = _mm_blendv_epi8 (imagcacc, realcacc, mult1);
totalc = _mm_blendv_epi8(imagcacc, realcacc, mult1);
__VOLK_ATTR_ALIGNED(16) lv_8sc_t dotProductVector[8];
__VOLK_ATTR_ALIGNED(16)
lv_8sc_t dotProductVector[8];
_mm_storeu_si128((__m128i*)dotProductVector, totalc); // Store the results back into the dot product vector
_mm_storeu_si128((__m128i*)dotProductVector, totalc); // Store the results back into the dot product vector
for (i = 0; i < 8; ++i)
{
@ -267,13 +269,13 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse4_1(lv_8sc_t* result, c
static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_sse2(lv_8sc_t* result, const lv_8sc_t* in_a, const lv_8sc_t* in_b, unsigned int num_points)
{
lv_8sc_t dotProduct;
memset(&dotProduct, 0x0, 2*sizeof(char));
memset(&dotProduct, 0x0, 2 * sizeof(char));
unsigned int number;
unsigned int i;
const lv_8sc_t* a = in_a;
const lv_8sc_t* b = in_b;
const unsigned int sse_iters = num_points/8;
const unsigned int sse_iters = num_points / 8;
if (sse_iters > 0)
{
@ -283,7 +285,7 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_sse2(lv_8sc_t* result, con
realcacc = _mm_setzero_si128();
imagcacc = _mm_setzero_si128();
for(number = 0; number < sse_iters; number++)
for (number = 0; number < sse_iters; number++)
{
x = _mm_load_si128((__m128i*)a);
y = _mm_load_si128((__m128i*)b);
@ -317,9 +319,10 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_sse2(lv_8sc_t* result, con
totalc = _mm_or_si128(realcacc, imagcacc);
__VOLK_ATTR_ALIGNED(16) lv_8sc_t dotProductVector[8];
__VOLK_ATTR_ALIGNED(16)
lv_8sc_t dotProductVector[8];
_mm_store_si128((__m128i*)dotProductVector, totalc); // Store the results back into the dot product vector
_mm_store_si128((__m128i*)dotProductVector, totalc); // Store the results back into the dot product vector
for (i = 0; i < 8; ++i)
{
@ -343,7 +346,7 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_sse2(lv_8sc_t* result, con
static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_sse4_1(lv_8sc_t* result, const lv_8sc_t* in_a, const lv_8sc_t* in_b, unsigned int num_points)
{
lv_8sc_t dotProduct;
memset(&dotProduct, 0x0, 2*sizeof(char));
memset(&dotProduct, 0x0, 2 * sizeof(char));
unsigned int number;
unsigned int i;
const lv_8sc_t* a = in_a;
@ -359,7 +362,7 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_sse4_1(lv_8sc_t* result, c
realcacc = _mm_setzero_si128();
imagcacc = _mm_setzero_si128();
for(number = 0; number < sse_iters; number++)
for (number = 0; number < sse_iters; number++)
{
x = _mm_load_si128((__m128i*)a);
y = _mm_load_si128((__m128i*)b);
@ -387,13 +390,14 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_sse4_1(lv_8sc_t* result, c
b += 8;
}
imagcacc = _mm_slli_si128 (imagcacc, 1);
imagcacc = _mm_slli_si128(imagcacc, 1);
totalc = _mm_blendv_epi8 (imagcacc, realcacc, mult1);
totalc = _mm_blendv_epi8(imagcacc, realcacc, mult1);
__VOLK_ATTR_ALIGNED(16) lv_8sc_t dotProductVector[8];
__VOLK_ATTR_ALIGNED(16)
lv_8sc_t dotProductVector[8];
_mm_store_si128((__m128i*)dotProductVector, totalc); // Store the results back into the dot product vector
_mm_store_si128((__m128i*)dotProductVector, totalc); // Store the results back into the dot product vector
for (i = 0; i < 8; ++i)
{
@ -438,22 +442,23 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_orc(lv_8sc_t* result, cons
static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_neon(lv_8sc_t* result, const lv_8sc_t* in_a, const lv_8sc_t* in_b, unsigned int num_points)
{
lv_8sc_t dotProduct;
dotProduct = lv_cmake(0,0);
*result = lv_cmake(0,0);
dotProduct = lv_cmake(0, 0);
*result = lv_cmake(0, 0);
const lv_8sc_t* a = in_a;
const lv_8sc_t* b = in_b;
// for 2-lane vectors, 1st lane holds the real part,
// 2nd lane holds the imaginary part
int8x8x2_t a_val, b_val, c_val, accumulator, tmp_real, tmp_imag;
__VOLK_ATTR_ALIGNED(16) lv_8sc_t accum_result[8] = { lv_cmake(0,0) };
__VOLK_ATTR_ALIGNED(16)
lv_8sc_t accum_result[8] = {lv_cmake(0, 0)};
accumulator.val[0] = vdup_n_s8(0);
accumulator.val[1] = vdup_n_s8(0);
unsigned int number;
const unsigned int neon_iters = num_points / 8;
for(number = 0; number < neon_iters; ++number)
for (number = 0; number < neon_iters; ++number)
{
a_val = vld2_s8((const int8_t*)a);
b_val = vld2_s8((const int8_t*)b);
@ -478,7 +483,7 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_neon(lv_8sc_t* result, const
b += 8;
}
vst2_s8((int8_t*)accum_result, accumulator);
for(number = 0; number < 8; ++number)
for (number = 0; number < 8; ++number)
{
*result += accum_result[number];
}
@ -490,6 +495,6 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_neon(lv_8sc_t* result, const
*result += dotProduct;
}
#endif /* LV_HAVE_NEON */
#endif /* LV_HAVE_NEON */
#endif /*INCLUDED_volk_gnsssdr_8ic_x2_dot_prod_8ic_H*/
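
Worth noting for the NEON variant just above: vld2_s8 deinterleaves on load, so val[0] carries eight real parts and val[1] eight imaginary parts, and the complex product reduces to plane-wise multiplies. A sketch of that core (helper name illustrative):

#include <arm_neon.h>

/* Complex multiply on deinterleaved planes, as produced by vld2_s8:
   val[0] = real parts, val[1] = imaginary parts. */
static inline int8x8x2_t cmul_planes_s8(int8x8x2_t a, int8x8x2_t b)
{
    int8x8x2_t c;
    c.val[0] = vsub_s8(vmul_s8(a.val[0], b.val[0]), vmul_s8(a.val[1], b.val[1]));
    c.val[1] = vadd_s8(vmul_s8(a.val[0], b.val[1]), vmul_s8(a.val[1], b.val[0]));
    return c;
}
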

View File

@ -75,7 +75,7 @@ static inline void volk_gnsssdr_8ic_x2_multiply_8ic_u_sse2(lv_8sc_t* cVector, co
mult1 = _mm_set_epi8(0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF);
for(number = 0; number < sse_iters; number++)
for (number = 0; number < sse_iters; number++)
{
x = _mm_loadu_si128((__m128i*)a);
y = _mm_loadu_si128((__m128i*)b);
@ -133,7 +133,7 @@ static inline void volk_gnsssdr_8ic_x2_multiply_8ic_u_sse4_1(lv_8sc_t* cVector,
_mm_setzero_si128();
mult1 = _mm_set_epi8(0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF);
for(number = 0; number < sse_iters; number++)
for (number = 0; number < sse_iters; number++)
{
x = _mm_lddqu_si128((__m128i*)a);
y = _mm_lddqu_si128((__m128i*)b);
@ -181,7 +181,7 @@ static inline void volk_gnsssdr_8ic_x2_multiply_8ic_generic(lv_8sc_t* cVector, c
const lv_8sc_t* bPtr = bVector;
unsigned int number;
for(number = 0; number < num_points; number++)
for (number = 0; number < num_points; number++)
{
*cPtr++ = (*aPtr++) * (*bPtr++);
}
@ -204,7 +204,7 @@ static inline void volk_gnsssdr_8ic_x2_multiply_8ic_a_sse2(lv_8sc_t* cVector, co
mult1 = _mm_set_epi8(0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF);
for(number = 0; number < sse_iters; number++)
for (number = 0; number < sse_iters; number++)
{
x = _mm_load_si128((__m128i*)a);
y = _mm_load_si128((__m128i*)b);
@ -228,7 +228,7 @@ static inline void volk_gnsssdr_8ic_x2_multiply_8ic_a_sse2(lv_8sc_t* cVector, co
imagc = _mm_and_si128(imagc, mult1);
imagc = _mm_slli_si128(imagc, 1);
totalc = _mm_or_si128 (realc, imagc);
totalc = _mm_or_si128(realc, imagc);
_mm_store_si128((__m128i*)c, totalc);
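A byte-lane sketch of the recombination step in the hunk above (plain C stand-ins for _mm_and_si128 with mult1, _mm_slli_si128(imagc, 1) and _mm_or_si128; the toy values are illustrative): a 128-bit register holds eight interleaved complex chars [r0 i0 r1 i1 ... r7 i7], both product vectors are masked into the even lanes, and the imaginary vector is then shifted up one byte so it fills the odd lanes before the OR.

#include <stdio.h>

int main(void)
{
    unsigned char realc[16] = {0}, imagc[16] = {0}, out[16];
    int k;
    for (k = 0; k < 16; k += 2)
        {
            realc[k] = 0x10 + k; /* toy real products, even lanes only */
            imagc[k] = 0x20 + k; /* toy imaginary products, even lanes only */
        }
    for (k = 15; k > 0; k--) /* _mm_slli_si128(imagc, 1): bytes move up */
        {
            imagc[k] = imagc[k - 1];
        }
    imagc[0] = 0;
    for (k = 0; k < 16; k++) /* _mm_or_si128(realc, imagc) */
        {
            out[k] = realc[k] | imagc[k];
        }
    for (k = 0; k < 16; k++)
        {
            printf("%02x ", out[k]); /* 10 20 12 22 ... interleaved again */
        }
    printf("\n");
    return 0;
}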
@ -262,7 +262,7 @@ static inline void volk_gnsssdr_8ic_x2_multiply_8ic_a_sse4_1(lv_8sc_t* cVector,
_mm_setzero_si128();
mult1 = _mm_set_epi8(0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF);
for(number = 0; number < sse_iters; number++)
for (number = 0; number < sse_iters; number++)
{
x = _mm_load_si128((__m128i*)a);
y = _mm_load_si128((__m128i*)b);

View File

@ -72,7 +72,7 @@ static inline void volk_gnsssdr_8u_x2_multiply_8u_u_avx2(unsigned char* cChar, c
const unsigned char* a = aChar;
const unsigned char* b = bChar;
for(number = 0; number < avx2_iters; number++)
for (number = 0; number < avx2_iters; number++)
{
x = _mm256_loadu_si256((__m256i*)a);
y = _mm256_loadu_si256((__m256i*)b);
@ -101,7 +101,7 @@ static inline void volk_gnsssdr_8u_x2_multiply_8u_u_avx2(unsigned char* cChar, c
c += 32;
}
for (i = avx2_iters * 32; i < num_points ; ++i)
for (i = avx2_iters * 32; i < num_points; ++i)
{
*c++ = (*a++) * (*b++);
}
@ -123,7 +123,7 @@ static inline void volk_gnsssdr_8u_x2_multiply_8u_u_sse3(unsigned char* cChar, c
const unsigned char* a = aChar;
const unsigned char* b = bChar;
for(number = 0; number < sse_iters; number++)
for (number = 0; number < sse_iters; number++)
{
x = _mm_lddqu_si128((__m128i*)a);
y = _mm_lddqu_si128((__m128i*)b);
@ -152,7 +152,7 @@ static inline void volk_gnsssdr_8u_x2_multiply_8u_u_sse3(unsigned char* cChar, c
c += 16;
}
for (i = sse_iters * 16; i < num_points ; ++i)
for (i = sse_iters * 16; i < num_points; ++i)
{
*c++ = (*a++) * (*b++);
}
@ -168,7 +168,7 @@ static inline void volk_gnsssdr_8u_x2_multiply_8u_generic(unsigned char* cChar,
const unsigned char* bPtr = bChar;
unsigned int number;
for(number = 0; number < num_points; number++)
for (number = 0; number < num_points; number++)
{
*cPtr++ = (*aPtr++) * (*bPtr++);
}
@ -189,7 +189,7 @@ static inline void volk_gnsssdr_8u_x2_multiply_8u_a_sse3(unsigned char* cChar, c
const unsigned char* a = aChar;
const unsigned char* b = bChar;
for(number = 0; number < sse_iters; number++)
for (number = 0; number < sse_iters; number++)
{
x = _mm_load_si128((__m128i*)a);
y = _mm_load_si128((__m128i*)b);
@ -240,7 +240,7 @@ static inline void volk_gnsssdr_8u_x2_multiply_8u_a_avx2(unsigned char* cChar, c
const unsigned char* a = aChar;
const unsigned char* b = bChar;
for(number = 0; number < avx2_iters; number++)
for (number = 0; number < avx2_iters; number++)
{
x = _mm256_load_si256((__m256i*)a);
y = _mm256_load_si256((__m256i*)b);
@ -269,7 +269,7 @@ static inline void volk_gnsssdr_8u_x2_multiply_8u_a_avx2(unsigned char* cChar, c
c += 32;
}
for (i = avx2_iters * 32; i < num_points ; ++i)
for (i = avx2_iters * 32; i < num_points; ++i)
{
*c++ = (*a++) * (*b++);
}
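Every protokernel in this file shares the structure visible in these hunks: a vector body that consumes floor(num_points / W) full W-wide blocks, then a scalar tail for the remainder. A self-contained sketch of that pattern (multiply_u8_blocked is an illustrative name; W = 32 matches one AVX2 register of unsigned chars):

#include <stdio.h>

static void multiply_u8_blocked(unsigned char *c, const unsigned char *a,
    const unsigned char *b, unsigned int num_points)
{
    const unsigned int W = 32;
    const unsigned int iters = num_points / W;
    unsigned int n, i;
    for (n = 0; n < iters; n++)
        {
            for (i = 0; i < W; i++) /* stand-in for one intrinsic iteration */
                {
                    c[n * W + i] = (unsigned char)(a[n * W + i] * b[n * W + i]);
                }
        }
    for (i = iters * W; i < num_points; ++i) /* scalar tail, as in the hunks */
        {
            c[i] = (unsigned char)(a[i] * b[i]);
        }
}

int main(void)
{
    unsigned char a[35], b[35], c[35];
    unsigned int i;
    for (i = 0; i < 35; i++)
        {
            a[i] = (unsigned char)i;
            b[i] = 2;
        }
    multiply_u8_blocked(c, a, b, 35);
    printf("c[34] = %u\n", c[34]); /* 68: indices 32..34 came from the tail */
    return 0;
}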

View File

@ -71,9 +71,9 @@
#include <emmintrin.h>
/* Adapted from http://gruntthepeon.free.fr/ssemath/sse_mathfun.h, original code from Julien Pommier */
/* Based on algorithms from the cephes library http://www.netlib.org/cephes/ */
static inline void volk_gnsssdr_s32f_sincos_32fc_a_sse2(lv_32fc_t* out, const float phase_inc, float* phase, unsigned int num_points)
static inline void volk_gnsssdr_s32f_sincos_32fc_a_sse2(lv_32fc_t *out, const float phase_inc, float *phase, unsigned int num_points)
{
lv_32fc_t* bPtr = out;
lv_32fc_t *bPtr = out;
const unsigned int sse_iters = num_points / 4;
unsigned int number = 0;
@ -84,44 +84,44 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_a_sse2(lv_32fc_t* out, const fl
__m128i emm0, emm2, emm4;
/* declare some SSE constants */
static const int _ps_inv_sign_mask[4] = { ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000 };
static const int _ps_sign_mask[4] = { (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000 };
static const int _ps_inv_sign_mask[4] = {~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000};
static const int _ps_sign_mask[4] = {(int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000};
static const float _ps_cephes_FOPI[4] = { 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516 };
static const int _pi32_1[4] = { 1, 1, 1, 1 };
static const int _pi32_inv1[4] = { ~1, ~1, ~1, ~1 };
static const int _pi32_2[4] = { 2, 2, 2, 2};
static const int _pi32_4[4] = { 4, 4, 4, 4};
static const float _ps_cephes_FOPI[4] = {1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516};
static const int _pi32_1[4] = {1, 1, 1, 1};
static const int _pi32_inv1[4] = {~1, ~1, ~1, ~1};
static const int _pi32_2[4] = {2, 2, 2, 2};
static const int _pi32_4[4] = {4, 4, 4, 4};
static const float _ps_minus_cephes_DP1[4] = { -0.78515625, -0.78515625, -0.78515625, -0.78515625 };
static const float _ps_minus_cephes_DP2[4] = { -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4 };
static const float _ps_minus_cephes_DP3[4] = { -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8 };
static const float _ps_coscof_p0[4] = { 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005 };
static const float _ps_coscof_p1[4] = { -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003 };
static const float _ps_coscof_p2[4] = { 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002 };
static const float _ps_sincof_p0[4] = { -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4 };
static const float _ps_sincof_p1[4] = { 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3 };
static const float _ps_sincof_p2[4] = { -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1 };
static const float _ps_0p5[4] = { 0.5f, 0.5f, 0.5f, 0.5f };
static const float _ps_1[4] = { 1.0f, 1.0f, 1.0f, 1.0f };
static const float _ps_minus_cephes_DP1[4] = {-0.78515625, -0.78515625, -0.78515625, -0.78515625};
static const float _ps_minus_cephes_DP2[4] = {-2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4};
static const float _ps_minus_cephes_DP3[4] = {-3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8};
static const float _ps_coscof_p0[4] = {2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005};
static const float _ps_coscof_p1[4] = {-1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003};
static const float _ps_coscof_p2[4] = {4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002};
static const float _ps_sincof_p0[4] = {-1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4};
static const float _ps_sincof_p1[4] = {8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3};
static const float _ps_sincof_p2[4] = {-1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1};
static const float _ps_0p5[4] = {0.5f, 0.5f, 0.5f, 0.5f};
static const float _ps_1[4] = {1.0f, 1.0f, 1.0f, 1.0f};
float four_phases[4] = { _phase, _phase + phase_inc, _phase + 2 * phase_inc, _phase + 3 * phase_inc };
float four_phases_inc[4] = { 4 * phase_inc, 4 * phase_inc, 4 * phase_inc, 4 * phase_inc };
float four_phases[4] = {_phase, _phase + phase_inc, _phase + 2 * phase_inc, _phase + 3 * phase_inc};
float four_phases_inc[4] = {4 * phase_inc, 4 * phase_inc, 4 * phase_inc, 4 * phase_inc};
four_phases_reg = _mm_load_ps(four_phases);
const __m128 four_phases_inc_reg = _mm_load_ps(four_phases_inc);
for(;number < sse_iters; number++)
for (; number < sse_iters; number++)
{
x = four_phases_reg;
sign_bit_sin = x;
/* take the absolute value */
x = _mm_and_ps(x, *(__m128*)_ps_inv_sign_mask);
x = _mm_and_ps(x, *(__m128 *)_ps_inv_sign_mask);
/* extract the sign bit (upper one) */
sign_bit_sin = _mm_and_ps(sign_bit_sin, *(__m128*)_ps_sign_mask);
sign_bit_sin = _mm_and_ps(sign_bit_sin, *(__m128 *)_ps_sign_mask);
/* scale by 4/Pi */
y = _mm_mul_ps(x, *(__m128*)_ps_cephes_FOPI);
y = _mm_mul_ps(x, *(__m128 *)_ps_cephes_FOPI);
/* store the integer part of y in emm2 */
emm2 = _mm_cvttps_epi32(y);
@ -145,9 +145,9 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_a_sse2(lv_32fc_t* out, const fl
/* The magic pass: "Extended precision modular arithmetic"
x = ((x - y * DP1) - y * DP2) - y * DP3; */
xmm1 = *(__m128*)_ps_minus_cephes_DP1;
xmm2 = *(__m128*)_ps_minus_cephes_DP2;
xmm3 = *(__m128*)_ps_minus_cephes_DP3;
xmm1 = *(__m128 *)_ps_minus_cephes_DP1;
xmm2 = *(__m128 *)_ps_minus_cephes_DP2;
xmm3 = *(__m128 *)_ps_minus_cephes_DP3;
xmm1 = _mm_mul_ps(y, xmm1);
xmm2 = _mm_mul_ps(y, xmm2);
xmm3 = _mm_mul_ps(y, xmm3);
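The three DP constants in this hunk implement Cody-Waite style range reduction: DP1 is exactly representable in single precision and DP1 + DP2 + DP3 equals -pi/4 to well beyond working precision, so subtracting the quadrant multiple in three steps,

$$x' = ((x + y\,\mathrm{DP1}) + y\,\mathrm{DP2}) + y\,\mathrm{DP3} \approx x - y\cdot\frac{\pi}{4},$$

keeps the reduced argument accurate even when the accumulated phase is large.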
@ -163,25 +163,25 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_a_sse2(lv_32fc_t* out, const fl
sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);
/* Evaluate the first polynomial (0 <= x <= Pi/4) */
__m128 z = _mm_mul_ps(x,x);
y = *(__m128*)_ps_coscof_p0;
__m128 z = _mm_mul_ps(x, x);
y = *(__m128 *)_ps_coscof_p0;
y = _mm_mul_ps(y, z);
y = _mm_add_ps(y, *(__m128*)_ps_coscof_p1);
y = _mm_add_ps(y, *(__m128 *)_ps_coscof_p1);
y = _mm_mul_ps(y, z);
y = _mm_add_ps(y, *(__m128*)_ps_coscof_p2);
y = _mm_add_ps(y, *(__m128 *)_ps_coscof_p2);
y = _mm_mul_ps(y, z);
y = _mm_mul_ps(y, z);
__m128 tmp = _mm_mul_ps(z, *(__m128*)_ps_0p5);
__m128 tmp = _mm_mul_ps(z, *(__m128 *)_ps_0p5);
y = _mm_sub_ps(y, tmp);
y = _mm_add_ps(y, *(__m128*)_ps_1);
y = _mm_add_ps(y, *(__m128 *)_ps_1);
/* Evaluate the second polynomial (Pi/4 <= x <= 0) */
__m128 y2 = *(__m128*)_ps_sincof_p0;
__m128 y2 = *(__m128 *)_ps_sincof_p0;
y2 = _mm_mul_ps(y2, z);
y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p1);
y2 = _mm_add_ps(y2, *(__m128 *)_ps_sincof_p1);
y2 = _mm_mul_ps(y2, z);
y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p2);
y2 = _mm_add_ps(y2, *(__m128 *)_ps_sincof_p2);
y2 = _mm_mul_ps(y2, z);
y2 = _mm_mul_ps(y2, x);
y2 = _mm_add_ps(y2, x);
@ -190,11 +190,11 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_a_sse2(lv_32fc_t* out, const fl
xmm3 = poly_mask;
__m128 ysin2 = _mm_and_ps(xmm3, y2);
__m128 ysin1 = _mm_andnot_ps(xmm3, y);
y2 = _mm_sub_ps(y2,ysin2);
y2 = _mm_sub_ps(y2, ysin2);
y = _mm_sub_ps(y, ysin1);
xmm1 = _mm_add_ps(ysin1,ysin2);
xmm2 = _mm_add_ps(y,y2);
xmm1 = _mm_add_ps(ysin1, ysin2);
xmm2 = _mm_add_ps(y, y2);
/* update the sign */
sine = _mm_xor_ps(xmm1, sign_bit_sin);
@ -202,19 +202,19 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_a_sse2(lv_32fc_t* out, const fl
/* write the output */
aux = _mm_unpacklo_ps(cosine, sine);
_mm_store_ps((float*)bPtr, aux);
_mm_store_ps((float *)bPtr, aux);
bPtr += 2;
aux = _mm_unpackhi_ps(cosine, sine);
_mm_store_ps((float*)bPtr, aux);
_mm_store_ps((float *)bPtr, aux);
bPtr += 2;
four_phases_reg = _mm_add_ps(four_phases_reg, four_phases_inc_reg);
}
_phase = _phase + phase_inc * (sse_iters * 4);
for(number = sse_iters * 4; number < num_points; number++)
for (number = sse_iters * 4; number < num_points; number++)
{
*bPtr++ = lv_cmake((float)cosf((_phase)), (float)sinf((_phase)) );
*bPtr++ = lv_cmake((float)cosf((_phase)), (float)sinf((_phase)));
_phase += phase_inc;
}
(*phase) = _phase;
@ -227,9 +227,9 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_a_sse2(lv_32fc_t* out, const fl
#include <emmintrin.h>
/* Adapted from http://gruntthepeon.free.fr/ssemath/sse_mathfun.h, original code from Julien Pommier */
/* Based on algorithms from the cephes library http://www.netlib.org/cephes/ */
static inline void volk_gnsssdr_s32f_sincos_32fc_u_sse2(lv_32fc_t* out, const float phase_inc, float* phase, unsigned int num_points)
static inline void volk_gnsssdr_s32f_sincos_32fc_u_sse2(lv_32fc_t *out, const float phase_inc, float *phase, unsigned int num_points)
{
lv_32fc_t* bPtr = out;
lv_32fc_t *bPtr = out;
const unsigned int sse_iters = num_points / 4;
unsigned int number = 0;
@ -241,44 +241,64 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_u_sse2(lv_32fc_t* out, const fl
__m128i emm0, emm2, emm4;
/* declare some SSE constants */
__VOLK_ATTR_ALIGNED(16) static const int _ps_inv_sign_mask[4] = { ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000 };
__VOLK_ATTR_ALIGNED(16) static const int _ps_sign_mask[4] = { (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000 };
__VOLK_ATTR_ALIGNED(16)
static const int _ps_inv_sign_mask[4] = {~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000};
__VOLK_ATTR_ALIGNED(16)
static const int _ps_sign_mask[4] = {(int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000};
__VOLK_ATTR_ALIGNED(16) static const float _ps_cephes_FOPI[4] = { 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516 };
__VOLK_ATTR_ALIGNED(16) static const int _pi32_1[4] = { 1, 1, 1, 1 };
__VOLK_ATTR_ALIGNED(16) static const int _pi32_inv1[4] = { ~1, ~1, ~1, ~1 };
__VOLK_ATTR_ALIGNED(16) static const int _pi32_2[4] = { 2, 2, 2, 2};
__VOLK_ATTR_ALIGNED(16) static const int _pi32_4[4] = { 4, 4, 4, 4};
__VOLK_ATTR_ALIGNED(16)
static const float _ps_cephes_FOPI[4] = {1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516};
__VOLK_ATTR_ALIGNED(16)
static const int _pi32_1[4] = {1, 1, 1, 1};
__VOLK_ATTR_ALIGNED(16)
static const int _pi32_inv1[4] = {~1, ~1, ~1, ~1};
__VOLK_ATTR_ALIGNED(16)
static const int _pi32_2[4] = {2, 2, 2, 2};
__VOLK_ATTR_ALIGNED(16)
static const int _pi32_4[4] = {4, 4, 4, 4};
__VOLK_ATTR_ALIGNED(16) static const float _ps_minus_cephes_DP1[4] = { -0.78515625, -0.78515625, -0.78515625, -0.78515625 };
__VOLK_ATTR_ALIGNED(16) static const float _ps_minus_cephes_DP2[4] = { -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4 };
__VOLK_ATTR_ALIGNED(16) static const float _ps_minus_cephes_DP3[4] = { -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8 };
__VOLK_ATTR_ALIGNED(16) static const float _ps_coscof_p0[4] = { 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005 };
__VOLK_ATTR_ALIGNED(16) static const float _ps_coscof_p1[4] = { -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003 };
__VOLK_ATTR_ALIGNED(16) static const float _ps_coscof_p2[4] = { 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002 };
__VOLK_ATTR_ALIGNED(16) static const float _ps_sincof_p0[4] = { -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4 };
__VOLK_ATTR_ALIGNED(16) static const float _ps_sincof_p1[4] = { 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3 };
__VOLK_ATTR_ALIGNED(16) static const float _ps_sincof_p2[4] = { -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1 };
__VOLK_ATTR_ALIGNED(16) static const float _ps_0p5[4] = { 0.5f, 0.5f, 0.5f, 0.5f };
__VOLK_ATTR_ALIGNED(16) static const float _ps_1[4] = { 1.0f, 1.0f, 1.0f, 1.0f };
__VOLK_ATTR_ALIGNED(16)
static const float _ps_minus_cephes_DP1[4] = {-0.78515625, -0.78515625, -0.78515625, -0.78515625};
__VOLK_ATTR_ALIGNED(16)
static const float _ps_minus_cephes_DP2[4] = {-2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4};
__VOLK_ATTR_ALIGNED(16)
static const float _ps_minus_cephes_DP3[4] = {-3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8};
__VOLK_ATTR_ALIGNED(16)
static const float _ps_coscof_p0[4] = {2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005};
__VOLK_ATTR_ALIGNED(16)
static const float _ps_coscof_p1[4] = {-1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003};
__VOLK_ATTR_ALIGNED(16)
static const float _ps_coscof_p2[4] = {4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002};
__VOLK_ATTR_ALIGNED(16)
static const float _ps_sincof_p0[4] = {-1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4};
__VOLK_ATTR_ALIGNED(16)
static const float _ps_sincof_p1[4] = {8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3};
__VOLK_ATTR_ALIGNED(16)
static const float _ps_sincof_p2[4] = {-1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1};
__VOLK_ATTR_ALIGNED(16)
static const float _ps_0p5[4] = {0.5f, 0.5f, 0.5f, 0.5f};
__VOLK_ATTR_ALIGNED(16)
static const float _ps_1[4] = {1.0f, 1.0f, 1.0f, 1.0f};
__VOLK_ATTR_ALIGNED(16) float four_phases[4] = { _phase, _phase + phase_inc, _phase + 2 * phase_inc, _phase + 3 * phase_inc };
__VOLK_ATTR_ALIGNED(16) float four_phases_inc[4] = { 4 * phase_inc, 4 * phase_inc, 4 * phase_inc, 4 * phase_inc };
__VOLK_ATTR_ALIGNED(16)
float four_phases[4] = {_phase, _phase + phase_inc, _phase + 2 * phase_inc, _phase + 3 * phase_inc};
__VOLK_ATTR_ALIGNED(16)
float four_phases_inc[4] = {4 * phase_inc, 4 * phase_inc, 4 * phase_inc, 4 * phase_inc};
four_phases_reg = _mm_load_ps(four_phases);
const __m128 four_phases_inc_reg = _mm_load_ps(four_phases_inc);
for(;number < sse_iters; number++)
for (; number < sse_iters; number++)
{
x = four_phases_reg;
sign_bit_sin = x;
/* take the absolute value */
x = _mm_and_ps(x, *(__m128*)_ps_inv_sign_mask);
x = _mm_and_ps(x, *(__m128 *)_ps_inv_sign_mask);
/* extract the sign bit (upper one) */
sign_bit_sin = _mm_and_ps(sign_bit_sin, *(__m128*)_ps_sign_mask);
sign_bit_sin = _mm_and_ps(sign_bit_sin, *(__m128 *)_ps_sign_mask);
/* scale by 4/Pi */
y = _mm_mul_ps(x, *(__m128*)_ps_cephes_FOPI);
y = _mm_mul_ps(x, *(__m128 *)_ps_cephes_FOPI);
/* store the integer part of y in emm2 */
emm2 = _mm_cvttps_epi32(y);
@ -302,9 +322,9 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_u_sse2(lv_32fc_t* out, const fl
/* The magic pass: "Extended precision modular arithmetic"
x = ((x - y * DP1) - y * DP2) - y * DP3; */
xmm1 = *(__m128*)_ps_minus_cephes_DP1;
xmm2 = *(__m128*)_ps_minus_cephes_DP2;
xmm3 = *(__m128*)_ps_minus_cephes_DP3;
xmm1 = *(__m128 *)_ps_minus_cephes_DP1;
xmm2 = *(__m128 *)_ps_minus_cephes_DP2;
xmm3 = *(__m128 *)_ps_minus_cephes_DP3;
xmm1 = _mm_mul_ps(y, xmm1);
xmm2 = _mm_mul_ps(y, xmm2);
xmm3 = _mm_mul_ps(y, xmm3);
@ -320,25 +340,25 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_u_sse2(lv_32fc_t* out, const fl
sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);
/* Evaluate the first polynomial (0 <= x <= Pi/4) */
__m128 z = _mm_mul_ps(x,x);
y = *(__m128*)_ps_coscof_p0;
__m128 z = _mm_mul_ps(x, x);
y = *(__m128 *)_ps_coscof_p0;
y = _mm_mul_ps(y, z);
y = _mm_add_ps(y, *(__m128*)_ps_coscof_p1);
y = _mm_add_ps(y, *(__m128 *)_ps_coscof_p1);
y = _mm_mul_ps(y, z);
y = _mm_add_ps(y, *(__m128*)_ps_coscof_p2);
y = _mm_add_ps(y, *(__m128 *)_ps_coscof_p2);
y = _mm_mul_ps(y, z);
y = _mm_mul_ps(y, z);
__m128 tmp = _mm_mul_ps(z, *(__m128*)_ps_0p5);
__m128 tmp = _mm_mul_ps(z, *(__m128 *)_ps_0p5);
y = _mm_sub_ps(y, tmp);
y = _mm_add_ps(y, *(__m128*)_ps_1);
y = _mm_add_ps(y, *(__m128 *)_ps_1);
/* Evaluate the second polynomial (Pi/4 <= x <= 0) */
__m128 y2 = *(__m128*)_ps_sincof_p0;
__m128 y2 = *(__m128 *)_ps_sincof_p0;
y2 = _mm_mul_ps(y2, z);
y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p1);
y2 = _mm_add_ps(y2, *(__m128 *)_ps_sincof_p1);
y2 = _mm_mul_ps(y2, z);
y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p2);
y2 = _mm_add_ps(y2, *(__m128 *)_ps_sincof_p2);
y2 = _mm_mul_ps(y2, z);
y2 = _mm_mul_ps(y2, x);
y2 = _mm_add_ps(y2, x);
@ -347,11 +367,11 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_u_sse2(lv_32fc_t* out, const fl
xmm3 = poly_mask;
__m128 ysin2 = _mm_and_ps(xmm3, y2);
__m128 ysin1 = _mm_andnot_ps(xmm3, y);
y2 = _mm_sub_ps(y2,ysin2);
y2 = _mm_sub_ps(y2, ysin2);
y = _mm_sub_ps(y, ysin1);
xmm1 = _mm_add_ps(ysin1,ysin2);
xmm2 = _mm_add_ps(y,y2);
xmm1 = _mm_add_ps(ysin1, ysin2);
xmm2 = _mm_add_ps(y, y2);
/* update the sign */
sine = _mm_xor_ps(xmm1, sign_bit_sin);
@ -359,19 +379,19 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_u_sse2(lv_32fc_t* out, const fl
/* write the output */
aux = _mm_unpacklo_ps(cosine, sine);
_mm_storeu_ps((float*)bPtr, aux);
_mm_storeu_ps((float *)bPtr, aux);
bPtr += 2;
aux = _mm_unpackhi_ps(cosine, sine);
_mm_storeu_ps((float*)bPtr, aux);
_mm_storeu_ps((float *)bPtr, aux);
bPtr += 2;
four_phases_reg = _mm_add_ps(four_phases_reg, four_phases_inc_reg);
}
_phase = _phase + phase_inc * (sse_iters * 4);
for(number = sse_iters * 4; number < num_points; number++)
for (number = sse_iters * 4; number < num_points; number++)
{
*bPtr++ = lv_cmake((float)cosf(_phase), (float)sinf(_phase) );
*bPtr++ = lv_cmake((float)cosf(_phase), (float)sinf(_phase));
_phase += phase_inc;
}
(*phase) = _phase;
@ -382,13 +402,13 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_u_sse2(lv_32fc_t* out, const fl
#ifdef LV_HAVE_GENERIC
static inline void volk_gnsssdr_s32f_sincos_32fc_generic(lv_32fc_t* out, const float phase_inc, float* phase, unsigned int num_points)
static inline void volk_gnsssdr_s32f_sincos_32fc_generic(lv_32fc_t *out, const float phase_inc, float *phase, unsigned int num_points)
{
float _phase = (*phase);
unsigned int i;
for(i = 0; i < num_points; i++)
for (i = 0; i < num_points; i++)
{
*out++ = lv_cmake((float)cosf(_phase), (float)sinf(_phase) );
*out++ = lv_cmake((float)cosf(_phase), (float)sinf(_phase));
_phase += phase_inc;
}
(*phase) = _phase;
@ -400,7 +420,7 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_generic(lv_32fc_t* out, const f
#ifdef LV_HAVE_GENERIC
#include <volk_gnsssdr/volk_gnsssdr_sine_table.h>
#include <stdint.h>
static inline void volk_gnsssdr_s32f_sincos_32fc_generic_fxpt(lv_32fc_t* out, const float phase_inc, float* phase, unsigned int num_points)
static inline void volk_gnsssdr_s32f_sincos_32fc_generic_fxpt(lv_32fc_t *out, const float phase_inc, float *phase, unsigned int num_points)
{
float _in, s, c;
unsigned int i;
@ -413,12 +433,12 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_generic_fxpt(lv_32fc_t* out, co
const int32_t diffbits = bitlength - Nbits;
uint32_t ux;
float _phase = (*phase);
for(i = 0; i < num_points; i++)
for (i = 0; i < num_points; i++)
{
_in = _phase;
d = (int32_t)floor(_in / TWO_PI + 0.5);
_in -= d * TWO_PI;
x = (int32_t) ((float)_in * TWO_TO_THE_31_DIV_PI);
x = (int32_t)((float)_in * TWO_TO_THE_31_DIV_PI);
ux = x;
sin_index = ux >> diffbits;
@ -428,7 +448,7 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_generic_fxpt(lv_32fc_t* out, co
cos_index = ux >> diffbits;
c = sine_table_10bits[cos_index][0] * (ux >> 1) + sine_table_10bits[cos_index][1];
*out++ = lv_cmake((float)c, (float)s );
*out++ = lv_cmake((float)c, (float)s);
_phase += phase_inc;
}
(*phase) = _phase;
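A restatement of the fixed-point indexing in this hunk (assuming bitlength is 32, consistent with diffbits = bitlength - Nbits and the uint32_t phase word u):

$$\phi' = \phi - 2\pi\left\lfloor \frac{\phi}{2\pi} + \frac{1}{2} \right\rfloor, \qquad u = \left\lfloor \phi'\,\frac{2^{31}}{\pi} \right\rfloor \bmod 2^{32}, \qquad \mathrm{sin\_index} = u \gg (32 - N_{\mathrm{bits}})$$

The top Nbits of the fixed-point phase select a table row holding a slope/offset pair, so s = sine_table_10bits[i][0] * (u >> 1) + sine_table_10bits[i][1] is a first-order interpolation between table entries.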
@ -441,9 +461,9 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_generic_fxpt(lv_32fc_t* out, co
#include <immintrin.h>
/* Based on algorithms from the cephes library http://www.netlib.org/cephes/
* Adapted to AVX2 by Carles Fernandez, based on original SSE2 code by Julien Pommier*/
static inline void volk_gnsssdr_s32f_sincos_32fc_a_avx2(lv_32fc_t* out, const float phase_inc, float* phase, unsigned int num_points)
static inline void volk_gnsssdr_s32f_sincos_32fc_a_avx2(lv_32fc_t *out, const float phase_inc, float *phase, unsigned int num_points)
{
lv_32fc_t* bPtr = out;
lv_32fc_t *bPtr = out;
const unsigned int avx_iters = num_points / 8;
unsigned int number = 0;
@ -456,44 +476,64 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_a_avx2(lv_32fc_t* out, const fl
__m128 aux, c1, s1;
/* declare some AVX2 constants */
__VOLK_ATTR_ALIGNED(32) static const int _ps_inv_sign_mask[8] = { ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000 };
__VOLK_ATTR_ALIGNED(32) static const int _ps_sign_mask[8] = { (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000 };
__VOLK_ATTR_ALIGNED(32)
static const int _ps_inv_sign_mask[8] = {~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000};
__VOLK_ATTR_ALIGNED(32)
static const int _ps_sign_mask[8] = {(int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000};
__VOLK_ATTR_ALIGNED(32) static const float _ps_cephes_FOPI[8] = { 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516 };
__VOLK_ATTR_ALIGNED(32) static const int _pi32_1[8] = { 1, 1, 1, 1, 1, 1, 1, 1 };
__VOLK_ATTR_ALIGNED(32) static const int _pi32_inv1[8] = { ~1, ~1, ~1, ~1, ~1, ~1, ~1, ~1 };
__VOLK_ATTR_ALIGNED(32) static const int _pi32_2[8] = { 2, 2, 2, 2, 2, 2, 2, 2 };
__VOLK_ATTR_ALIGNED(32) static const int _pi32_4[8] = { 4, 4, 4, 4, 4, 4, 4, 4 };
__VOLK_ATTR_ALIGNED(32)
static const float _ps_cephes_FOPI[8] = {1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516};
__VOLK_ATTR_ALIGNED(32)
static const int _pi32_1[8] = {1, 1, 1, 1, 1, 1, 1, 1};
__VOLK_ATTR_ALIGNED(32)
static const int _pi32_inv1[8] = {~1, ~1, ~1, ~1, ~1, ~1, ~1, ~1};
__VOLK_ATTR_ALIGNED(32)
static const int _pi32_2[8] = {2, 2, 2, 2, 2, 2, 2, 2};
__VOLK_ATTR_ALIGNED(32)
static const int _pi32_4[8] = {4, 4, 4, 4, 4, 4, 4, 4};
__VOLK_ATTR_ALIGNED(32) static const float _ps_minus_cephes_DP1[8] = { -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625 };
__VOLK_ATTR_ALIGNED(32) static const float _ps_minus_cephes_DP2[8] = { -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4 };
__VOLK_ATTR_ALIGNED(32) static const float _ps_minus_cephes_DP3[8] = { -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8 };
__VOLK_ATTR_ALIGNED(32) static const float _ps_coscof_p0[8] = { 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005 };
__VOLK_ATTR_ALIGNED(32) static const float _ps_coscof_p1[8] = { -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003 };
__VOLK_ATTR_ALIGNED(32) static const float _ps_coscof_p2[8] = { 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002 };
__VOLK_ATTR_ALIGNED(32) static const float _ps_sincof_p0[8] = { -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4 };
__VOLK_ATTR_ALIGNED(32) static const float _ps_sincof_p1[8] = { 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3 };
__VOLK_ATTR_ALIGNED(32) static const float _ps_sincof_p2[8] = { -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1 };
__VOLK_ATTR_ALIGNED(32) static const float _ps_0p5[8] = { 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f };
__VOLK_ATTR_ALIGNED(32) static const float _ps_1[8] = { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f };
__VOLK_ATTR_ALIGNED(32)
static const float _ps_minus_cephes_DP1[8] = {-0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625};
__VOLK_ATTR_ALIGNED(32)
static const float _ps_minus_cephes_DP2[8] = {-2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4};
__VOLK_ATTR_ALIGNED(32)
static const float _ps_minus_cephes_DP3[8] = {-3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8};
__VOLK_ATTR_ALIGNED(32)
static const float _ps_coscof_p0[8] = {2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005};
__VOLK_ATTR_ALIGNED(32)
static const float _ps_coscof_p1[8] = {-1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003};
__VOLK_ATTR_ALIGNED(32)
static const float _ps_coscof_p2[8] = {4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002};
__VOLK_ATTR_ALIGNED(32)
static const float _ps_sincof_p0[8] = {-1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4};
__VOLK_ATTR_ALIGNED(32)
static const float _ps_sincof_p1[8] = {8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3};
__VOLK_ATTR_ALIGNED(32)
static const float _ps_sincof_p2[8] = {-1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1};
__VOLK_ATTR_ALIGNED(32)
static const float _ps_0p5[8] = {0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f};
__VOLK_ATTR_ALIGNED(32)
static const float _ps_1[8] = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
__VOLK_ATTR_ALIGNED(32) float eight_phases[8] = { _phase, _phase + phase_inc, _phase + 2 * phase_inc, _phase + 3 * phase_inc, _phase + 4 * phase_inc, _phase + 5 * phase_inc, _phase + 6 * phase_inc, _phase + 7 * phase_inc };
__VOLK_ATTR_ALIGNED(32) float eight_phases_inc[8] = { 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc };
__VOLK_ATTR_ALIGNED(32)
float eight_phases[8] = {_phase, _phase + phase_inc, _phase + 2 * phase_inc, _phase + 3 * phase_inc, _phase + 4 * phase_inc, _phase + 5 * phase_inc, _phase + 6 * phase_inc, _phase + 7 * phase_inc};
__VOLK_ATTR_ALIGNED(32)
float eight_phases_inc[8] = {8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc};
eight_phases_reg = _mm256_load_ps(eight_phases);
const __m256 eight_phases_inc_reg = _mm256_load_ps(eight_phases_inc);
for(;number < avx_iters; number++)
for (; number < avx_iters; number++)
{
x = eight_phases_reg;
sign_bit_sin = x;
/* take the absolute value */
x = _mm256_and_ps(x, *(__m256*)_ps_inv_sign_mask);
x = _mm256_and_ps(x, *(__m256 *)_ps_inv_sign_mask);
/* extract the sign bit (upper one) */
sign_bit_sin = _mm256_and_ps(sign_bit_sin, *(__m256*)_ps_sign_mask);
sign_bit_sin = _mm256_and_ps(sign_bit_sin, *(__m256 *)_ps_sign_mask);
/* scale by 4/Pi */
y = _mm256_mul_ps(x, *(__m256*)_ps_cephes_FOPI);
y = _mm256_mul_ps(x, *(__m256 *)_ps_cephes_FOPI);
/* store the integer part of y in emm2 */
emm2 = _mm256_cvttps_epi32(y);
@ -517,9 +557,9 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_a_avx2(lv_32fc_t* out, const fl
/* The magic pass: "Extended precision modular arithmetic"
x = ((x - y * DP1) - y * DP2) - y * DP3; */
xmm1 = *(__m256*)_ps_minus_cephes_DP1;
xmm2 = *(__m256*)_ps_minus_cephes_DP2;
xmm3 = *(__m256*)_ps_minus_cephes_DP3;
xmm1 = *(__m256 *)_ps_minus_cephes_DP1;
xmm2 = *(__m256 *)_ps_minus_cephes_DP2;
xmm3 = *(__m256 *)_ps_minus_cephes_DP3;
xmm1 = _mm256_mul_ps(y, xmm1);
xmm2 = _mm256_mul_ps(y, xmm2);
xmm3 = _mm256_mul_ps(y, xmm3);
@ -536,24 +576,24 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_a_avx2(lv_32fc_t* out, const fl
/* Evaluate the first polynomial (0 <= x <= Pi/4) */
__m256 z = _mm256_mul_ps(x, x);
y = *(__m256*)_ps_coscof_p0;
y = *(__m256 *)_ps_coscof_p0;
y = _mm256_mul_ps(y, z);
y = _mm256_add_ps(y, *(__m256*)_ps_coscof_p1);
y = _mm256_add_ps(y, *(__m256 *)_ps_coscof_p1);
y = _mm256_mul_ps(y, z);
y = _mm256_add_ps(y, *(__m256*)_ps_coscof_p2);
y = _mm256_add_ps(y, *(__m256 *)_ps_coscof_p2);
y = _mm256_mul_ps(y, z);
y = _mm256_mul_ps(y, z);
__m256 tmp = _mm256_mul_ps(z, *(__m256*)_ps_0p5);
__m256 tmp = _mm256_mul_ps(z, *(__m256 *)_ps_0p5);
y = _mm256_sub_ps(y, tmp);
y = _mm256_add_ps(y, *(__m256*)_ps_1);
y = _mm256_add_ps(y, *(__m256 *)_ps_1);
/* Evaluate the second polynomial (Pi/4 <= x <= 0) */
__m256 y2 = *(__m256*)_ps_sincof_p0;
__m256 y2 = *(__m256 *)_ps_sincof_p0;
y2 = _mm256_mul_ps(y2, z);
y2 = _mm256_add_ps(y2, *(__m256*)_ps_sincof_p1);
y2 = _mm256_add_ps(y2, *(__m256 *)_ps_sincof_p1);
y2 = _mm256_mul_ps(y2, z);
y2 = _mm256_add_ps(y2, *(__m256*)_ps_sincof_p2);
y2 = _mm256_add_ps(y2, *(__m256 *)_ps_sincof_p2);
y2 = _mm256_mul_ps(y2, z);
y2 = _mm256_mul_ps(y2, x);
y2 = _mm256_add_ps(y2, x);
@ -576,27 +616,27 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_a_avx2(lv_32fc_t* out, const fl
s1 = _mm256_extractf128_ps(sine, 0);
c1 = _mm256_extractf128_ps(cosine, 0);
aux = _mm_unpacklo_ps(c1, s1);
_mm_store_ps((float*)bPtr, aux);
_mm_store_ps((float *)bPtr, aux);
bPtr += 2;
aux = _mm_unpackhi_ps(c1, s1);
_mm_store_ps((float*)bPtr, aux);
_mm_store_ps((float *)bPtr, aux);
bPtr += 2;
s1 = _mm256_extractf128_ps(sine, 1);
c1 = _mm256_extractf128_ps(cosine, 1);
aux = _mm_unpacklo_ps(c1, s1);
_mm_store_ps((float*)bPtr, aux);
_mm_store_ps((float *)bPtr, aux);
bPtr += 2;
aux = _mm_unpackhi_ps(c1, s1);
_mm_store_ps((float*)bPtr, aux);
_mm_store_ps((float *)bPtr, aux);
bPtr += 2;
eight_phases_reg = _mm256_add_ps(eight_phases_reg, eight_phases_inc_reg);
}
_mm256_zeroupper();
_phase = _phase + phase_inc * (avx_iters * 8);
for(number = avx_iters * 8; number < num_points; number++)
for (number = avx_iters * 8; number < num_points; number++)
{
out[number] = lv_cmake((float)cosf(_phase), (float)sinf(_phase) );
out[number] = lv_cmake((float)cosf(_phase), (float)sinf(_phase));
_phase += phase_inc;
}
(*phase) = _phase;
@ -609,9 +649,9 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_a_avx2(lv_32fc_t* out, const fl
#include <immintrin.h>
/* Based on algorithms from the cephes library http://www.netlib.org/cephes/
* Adapted to AVX2 by Carles Fernandez, based on original SSE2 code by Julien Pommier*/
static inline void volk_gnsssdr_s32f_sincos_32fc_u_avx2(lv_32fc_t* out, const float phase_inc, float* phase, unsigned int num_points)
static inline void volk_gnsssdr_s32f_sincos_32fc_u_avx2(lv_32fc_t *out, const float phase_inc, float *phase, unsigned int num_points)
{
lv_32fc_t* bPtr = out;
lv_32fc_t *bPtr = out;
const unsigned int avx_iters = num_points / 8;
unsigned int number = 0;
@ -624,44 +664,64 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_u_avx2(lv_32fc_t* out, const fl
__m128 aux, c1, s1;
/* declare some AVX2 constants */
__VOLK_ATTR_ALIGNED(32) static const int _ps_inv_sign_mask[8] = { ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000 };
__VOLK_ATTR_ALIGNED(32) static const int _ps_sign_mask[8] = { (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000 };
__VOLK_ATTR_ALIGNED(32)
static const int _ps_inv_sign_mask[8] = {~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000};
__VOLK_ATTR_ALIGNED(32)
static const int _ps_sign_mask[8] = {(int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000};
__VOLK_ATTR_ALIGNED(32) static const float _ps_cephes_FOPI[8] = { 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516 };
__VOLK_ATTR_ALIGNED(32) static const int _pi32_1[8] = { 1, 1, 1, 1, 1, 1, 1, 1 };
__VOLK_ATTR_ALIGNED(32) static const int _pi32_inv1[8] = { ~1, ~1, ~1, ~1, ~1, ~1, ~1, ~1 };
__VOLK_ATTR_ALIGNED(32) static const int _pi32_2[8] = { 2, 2, 2, 2, 2, 2, 2, 2 };
__VOLK_ATTR_ALIGNED(32) static const int _pi32_4[8] = { 4, 4, 4, 4, 4, 4, 4, 4 };
__VOLK_ATTR_ALIGNED(32)
static const float _ps_cephes_FOPI[8] = {1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516};
__VOLK_ATTR_ALIGNED(32)
static const int _pi32_1[8] = {1, 1, 1, 1, 1, 1, 1, 1};
__VOLK_ATTR_ALIGNED(32)
static const int _pi32_inv1[8] = {~1, ~1, ~1, ~1, ~1, ~1, ~1, ~1};
__VOLK_ATTR_ALIGNED(32)
static const int _pi32_2[8] = {2, 2, 2, 2, 2, 2, 2, 2};
__VOLK_ATTR_ALIGNED(32)
static const int _pi32_4[8] = {4, 4, 4, 4, 4, 4, 4, 4};
__VOLK_ATTR_ALIGNED(32) static const float _ps_minus_cephes_DP1[8] = { -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625 };
__VOLK_ATTR_ALIGNED(32) static const float _ps_minus_cephes_DP2[8] = { -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4 };
__VOLK_ATTR_ALIGNED(32) static const float _ps_minus_cephes_DP3[8] = { -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8 };
__VOLK_ATTR_ALIGNED(32) static const float _ps_coscof_p0[8] = { 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005 };
__VOLK_ATTR_ALIGNED(32) static const float _ps_coscof_p1[8] = { -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003 };
__VOLK_ATTR_ALIGNED(32) static const float _ps_coscof_p2[8] = { 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002 };
__VOLK_ATTR_ALIGNED(32) static const float _ps_sincof_p0[8] = { -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4 };
__VOLK_ATTR_ALIGNED(32) static const float _ps_sincof_p1[8] = { 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3 };
__VOLK_ATTR_ALIGNED(32) static const float _ps_sincof_p2[8] = { -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1 };
__VOLK_ATTR_ALIGNED(32) static const float _ps_0p5[8] = { 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f };
__VOLK_ATTR_ALIGNED(32) static const float _ps_1[8] = { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f };
__VOLK_ATTR_ALIGNED(32)
static const float _ps_minus_cephes_DP1[8] = {-0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625};
__VOLK_ATTR_ALIGNED(32)
static const float _ps_minus_cephes_DP2[8] = {-2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4};
__VOLK_ATTR_ALIGNED(32)
static const float _ps_minus_cephes_DP3[8] = {-3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8};
__VOLK_ATTR_ALIGNED(32)
static const float _ps_coscof_p0[8] = {2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005};
__VOLK_ATTR_ALIGNED(32)
static const float _ps_coscof_p1[8] = {-1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003};
__VOLK_ATTR_ALIGNED(32)
static const float _ps_coscof_p2[8] = {4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002};
__VOLK_ATTR_ALIGNED(32)
static const float _ps_sincof_p0[8] = {-1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4};
__VOLK_ATTR_ALIGNED(32)
static const float _ps_sincof_p1[8] = {8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3};
__VOLK_ATTR_ALIGNED(32)
static const float _ps_sincof_p2[8] = {-1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1};
__VOLK_ATTR_ALIGNED(32)
static const float _ps_0p5[8] = {0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f};
__VOLK_ATTR_ALIGNED(32)
static const float _ps_1[8] = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
__VOLK_ATTR_ALIGNED(32) float eight_phases[8] = { _phase, _phase + phase_inc, _phase + 2 * phase_inc, _phase + 3 * phase_inc, _phase + 4 * phase_inc, _phase + 5 * phase_inc, _phase + 6 * phase_inc, _phase + 7 * phase_inc };
__VOLK_ATTR_ALIGNED(32) float eight_phases_inc[8] = { 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc };
__VOLK_ATTR_ALIGNED(32)
float eight_phases[8] = {_phase, _phase + phase_inc, _phase + 2 * phase_inc, _phase + 3 * phase_inc, _phase + 4 * phase_inc, _phase + 5 * phase_inc, _phase + 6 * phase_inc, _phase + 7 * phase_inc};
__VOLK_ATTR_ALIGNED(32)
float eight_phases_inc[8] = {8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc};
eight_phases_reg = _mm256_load_ps(eight_phases);
const __m256 eight_phases_inc_reg = _mm256_load_ps(eight_phases_inc);
for(;number < avx_iters; number++)
for (; number < avx_iters; number++)
{
x = eight_phases_reg;
sign_bit_sin = x;
/* take the absolute value */
x = _mm256_and_ps(x, *(__m256*)_ps_inv_sign_mask);
x = _mm256_and_ps(x, *(__m256 *)_ps_inv_sign_mask);
/* extract the sign bit (upper one) */
sign_bit_sin = _mm256_and_ps(sign_bit_sin, *(__m256*)_ps_sign_mask);
sign_bit_sin = _mm256_and_ps(sign_bit_sin, *(__m256 *)_ps_sign_mask);
/* scale by 4/Pi */
y = _mm256_mul_ps(x, *(__m256*)_ps_cephes_FOPI);
y = _mm256_mul_ps(x, *(__m256 *)_ps_cephes_FOPI);
/* store the integer part of y in emm2 */
emm2 = _mm256_cvttps_epi32(y);
@ -685,9 +745,9 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_u_avx2(lv_32fc_t* out, const fl
/* The magic pass: "Extended precision modular arithmetic"
x = ((x - y * DP1) - y * DP2) - y * DP3; */
xmm1 = *(__m256*)_ps_minus_cephes_DP1;
xmm2 = *(__m256*)_ps_minus_cephes_DP2;
xmm3 = *(__m256*)_ps_minus_cephes_DP3;
xmm1 = *(__m256 *)_ps_minus_cephes_DP1;
xmm2 = *(__m256 *)_ps_minus_cephes_DP2;
xmm3 = *(__m256 *)_ps_minus_cephes_DP3;
xmm1 = _mm256_mul_ps(y, xmm1);
xmm2 = _mm256_mul_ps(y, xmm2);
xmm3 = _mm256_mul_ps(y, xmm3);
@ -704,24 +764,24 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_u_avx2(lv_32fc_t* out, const fl
/* Evaluate the first polynomial (0 <= x <= Pi/4) */
__m256 z = _mm256_mul_ps(x, x);
y = *(__m256*)_ps_coscof_p0;
y = *(__m256 *)_ps_coscof_p0;
y = _mm256_mul_ps(y, z);
y = _mm256_add_ps(y, *(__m256*)_ps_coscof_p1);
y = _mm256_add_ps(y, *(__m256 *)_ps_coscof_p1);
y = _mm256_mul_ps(y, z);
y = _mm256_add_ps(y, *(__m256*)_ps_coscof_p2);
y = _mm256_add_ps(y, *(__m256 *)_ps_coscof_p2);
y = _mm256_mul_ps(y, z);
y = _mm256_mul_ps(y, z);
__m256 tmp = _mm256_mul_ps(z, *(__m256*)_ps_0p5);
__m256 tmp = _mm256_mul_ps(z, *(__m256 *)_ps_0p5);
y = _mm256_sub_ps(y, tmp);
y = _mm256_add_ps(y, *(__m256*)_ps_1);
y = _mm256_add_ps(y, *(__m256 *)_ps_1);
/* Evaluate the second polynomial (Pi/4 <= x <= 0) */
__m256 y2 = *(__m256*)_ps_sincof_p0;
__m256 y2 = *(__m256 *)_ps_sincof_p0;
y2 = _mm256_mul_ps(y2, z);
y2 = _mm256_add_ps(y2, *(__m256*)_ps_sincof_p1);
y2 = _mm256_add_ps(y2, *(__m256 *)_ps_sincof_p1);
y2 = _mm256_mul_ps(y2, z);
y2 = _mm256_add_ps(y2, *(__m256*)_ps_sincof_p2);
y2 = _mm256_add_ps(y2, *(__m256 *)_ps_sincof_p2);
y2 = _mm256_mul_ps(y2, z);
y2 = _mm256_mul_ps(y2, x);
y2 = _mm256_add_ps(y2, x);
@ -744,27 +804,27 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_u_avx2(lv_32fc_t* out, const fl
s1 = _mm256_extractf128_ps(sine, 0);
c1 = _mm256_extractf128_ps(cosine, 0);
aux = _mm_unpacklo_ps(c1, s1);
_mm_storeu_ps((float*)bPtr, aux);
_mm_storeu_ps((float *)bPtr, aux);
bPtr += 2;
aux = _mm_unpackhi_ps(c1, s1);
_mm_storeu_ps((float*)bPtr, aux);
_mm_storeu_ps((float *)bPtr, aux);
bPtr += 2;
s1 = _mm256_extractf128_ps(sine, 1);
c1 = _mm256_extractf128_ps(cosine, 1);
aux = _mm_unpacklo_ps(c1, s1);
_mm_storeu_ps((float*)bPtr, aux);
_mm_storeu_ps((float *)bPtr, aux);
bPtr += 2;
aux = _mm_unpackhi_ps(c1, s1);
_mm_storeu_ps((float*)bPtr, aux);
_mm_storeu_ps((float *)bPtr, aux);
bPtr += 2;
eight_phases_reg = _mm256_add_ps(eight_phases_reg, eight_phases_inc_reg);
}
_mm256_zeroupper();
_phase = _phase + phase_inc * (avx_iters * 8);
for(number = avx_iters * 8; number < num_points; number++)
for (number = avx_iters * 8; number < num_points; number++)
{
out[number] = lv_cmake((float)cosf(_phase), (float)sinf(_phase) );
out[number] = lv_cmake((float)cosf(_phase), (float)sinf(_phase));
_phase += phase_inc;
}
(*phase) = _phase;
@ -777,15 +837,17 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_u_avx2(lv_32fc_t* out, const fl
#include <arm_neon.h>
/* Adapted from http://gruntthepeon.free.fr/ssemath/neon_mathfun.h, original code from Julien Pommier */
/* Based on algorithms from the cephes library http://www.netlib.org/cephes/ */
static inline void volk_gnsssdr_s32f_sincos_32fc_neon(lv_32fc_t* out, const float phase_inc, float* phase, unsigned int num_points)
static inline void volk_gnsssdr_s32f_sincos_32fc_neon(lv_32fc_t *out, const float phase_inc, float *phase, unsigned int num_points)
{
lv_32fc_t* bPtr = out;
lv_32fc_t *bPtr = out;
const unsigned int neon_iters = num_points / 4;
float _phase = (*phase);
__VOLK_ATTR_ALIGNED(16) float32_t four_phases[4] = { _phase, _phase + phase_inc, _phase + 2 * phase_inc, _phase + 3 * phase_inc };
__VOLK_ATTR_ALIGNED(16)
float32_t four_phases[4] = {_phase, _phase + phase_inc, _phase + 2 * phase_inc, _phase + 3 * phase_inc};
float four_inc = 4 * phase_inc;
__VOLK_ATTR_ALIGNED(16) float32_t four_phases_inc[4] = { four_inc, four_inc, four_inc, four_inc };
__VOLK_ATTR_ALIGNED(16)
float32_t four_phases_inc[4] = {four_inc, four_inc, four_inc, four_inc};
float32x4_t four_phases_reg = vld1q_f32(four_phases);
float32x4_t four_phases_inc_reg = vld1q_f32(four_phases_inc);
@ -808,7 +870,7 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_neon(lv_32fc_t* out, const floa
uint32x4_t emm2, poly_mask, sign_mask_sin, sign_mask_cos;
for(;number < neon_iters; number++)
for (; number < neon_iters; number++)
{
x = four_phases_reg;
@ -847,7 +909,7 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_neon(lv_32fc_t* out, const floa
/* Evaluate the first polynomial (0 <= x <= Pi/4) in y1,
and the second polynomial (Pi/4 <= x <= 0) in y2 */
z = vmulq_f32(x,x);
z = vmulq_f32(x, x);
y1 = vmulq_n_f32(z, c_coscof_p0);
y2 = vmulq_n_f32(z, c_sincof_p0);
@ -871,16 +933,16 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_neon(lv_32fc_t* out, const floa
result.val[1] = vbslq_f32(sign_mask_sin, vnegq_f32(ys), ys);
result.val[0] = vbslq_f32(sign_mask_cos, yc, vnegq_f32(yc));
vst2q_f32((float32_t*)bPtr, result);
vst2q_f32((float32_t *)bPtr, result);
bPtr += 4;
four_phases_reg = vaddq_f32(four_phases_reg, four_phases_inc_reg);
}
_phase = _phase + phase_inc * (neon_iters * 4);
for(number = neon_iters * 4; number < num_points; number++)
for (number = neon_iters * 4; number < num_points; number++)
{
*bPtr++ = lv_cmake((float)cosf(_phase), (float)sinf(_phase) );
*bPtr++ = lv_cmake((float)cosf(_phase), (float)sinf(_phase));
_phase += phase_inc;
}
(*phase) = _phase;
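A scalar restatement of the contract every protokernel in this file honors (sincos_ref is an illustrative name, not the volk_gnsssdr API): out[n] = cos(phi_n) + j*sin(phi_n) with phi_n = *phase + n*phase_inc, matching lv_cmake(cos, sin) above, and *phase is advanced past the last sample so streaming calls chain.

#include <complex.h>
#include <math.h>
#include <stdio.h>

static void sincos_ref(float complex *out, const float phase_inc,
    float *phase, unsigned int num_points)
{
    float p = *phase;
    unsigned int n;
    for (n = 0; n < num_points; n++)
        {
            out[n] = cosf(p) + I * sinf(p); /* real = cos, imag = sin */
            p += phase_inc;
        }
    *phase = p; /* carry the phase into the next call */
}

int main(void)
{
    float complex nco[4];
    float phase = 0.0f;
    sincos_ref(nco, 0.25f, &phase, 4);
    printf("phase after call: %f\n", phase); /* 1.000000 */
    return 0;
}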

View File

@ -49,7 +49,7 @@ static inline void volk_gnsssdr_s32f_sincospuppet_32fc_generic(lv_32fc_t* out, c
volk_gnsssdr_s32f_sincos_32fc_generic(out, phase_inc, phase, num_points);
}
#endif /* LV_HAVE_GENERIC */
#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_GENERIC
@ -60,7 +60,7 @@ static inline void volk_gnsssdr_s32f_sincospuppet_32fc_generic_fxpt(lv_32fc_t* o
volk_gnsssdr_s32f_sincos_32fc_generic_fxpt(out, phase_inc, phase, num_points);
}
#endif /* LV_HAVE_GENERIC */
#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_SSE2
@ -70,7 +70,7 @@ static inline void volk_gnsssdr_s32f_sincospuppet_32fc_a_sse2(lv_32fc_t* out, co
phase[0] = 3;
volk_gnsssdr_s32f_sincos_32fc_a_sse2(out, phase_inc, phase, num_points);
}
#endif /* LV_HAVE_SSE2 */
#endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_SSE2
@ -80,7 +80,7 @@ static inline void volk_gnsssdr_s32f_sincospuppet_32fc_u_sse2(lv_32fc_t* out, co
phase[0] = 3;
volk_gnsssdr_s32f_sincos_32fc_u_sse2(out, phase_inc, phase, num_points);
}
#endif /* LV_HAVE_SSE2 */
#endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_AVX2
@ -90,7 +90,7 @@ static inline void volk_gnsssdr_s32f_sincospuppet_32fc_a_avx2(lv_32fc_t* out, co
phase[0] = 3;
volk_gnsssdr_s32f_sincos_32fc_a_avx2(out, phase_inc, phase, num_points);
}
#endif /* LV_HAVE_AVX2 */
#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_AVX2
@ -100,7 +100,7 @@ static inline void volk_gnsssdr_s32f_sincospuppet_32fc_u_avx2(lv_32fc_t* out, co
phase[0] = 3;
volk_gnsssdr_s32f_sincos_32fc_u_avx2(out, phase_inc, phase, num_points);
}
#endif /* LV_HAVE_AVX2 */
#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_NEON
@ -110,6 +110,6 @@ static inline void volk_gnsssdr_s32f_sincospuppet_32fc_neon(lv_32fc_t* out, cons
phase[0] = 3;
volk_gnsssdr_s32f_sincos_32fc_neon(out, phase_inc, phase, num_points);
}
#endif /* LV_HAVE_NEON */
#endif /* LV_HAVE_NEON */
#endif /* INCLUDED_volk_gnsssdr_s32f_sincospuppet_32fc_H */
#endif /* INCLUDED_volk_gnsssdr_s32f_sincospuppet_32fc_H */
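The puppet pattern in miniature (kernel and kernel_puppet are illustrative names, not volk_gnsssdr symbols): the QA harness drives every kernel through a uniform signature, so a thin puppet pins awkward state -- here the phase seed, exactly as phase[0] = 3 does in the wrappers above -- and forwards, so every architecture variant starts from identical inputs.

#include <math.h>
#include <stdio.h>

static void kernel(float *out, const float inc, float *phase, unsigned int n)
{
    unsigned int i;
    for (i = 0; i < n; i++)
        {
            out[i] = sinf(*phase);
            *phase += inc;
        }
}

static void kernel_puppet(float *out, const float inc, float *phase, unsigned int n)
{
    phase[0] = 3; /* fixed seed, mirroring the wrappers above */
    kernel(out, inc, phase, n);
}

int main(void)
{
    float out[4];
    float phase = 0.0f; /* whatever the harness passes in gets overwritten */
    kernel_puppet(out, 0.5f, &phase, 4);
    printf("out[0] = %f\n", out[0]); /* sinf(3.0f) */
    return 0;
}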

View File

@ -38,32 +38,31 @@
// for puppets we need to get all the func_variants for the puppet and just
// keep track of the actual function name to write to results
#define VOLK_INIT_PUPP(func, puppet_master_func, test_params)\
volk_gnsssdr_test_case_t(func##_get_func_desc(), (void(*)())func##_manual, std::string(#func),\
std::string(#puppet_master_func), test_params)
#define VOLK_INIT_PUPP(func, puppet_master_func, test_params) \
volk_gnsssdr_test_case_t(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), \
std::string(#puppet_master_func), test_params)
#define VOLK_INIT_TEST(func, test_params)\
volk_gnsssdr_test_case_t(func##_get_func_desc(), (void(*)())func##_manual, std::string(#func),\
test_params)
#define VOLK_INIT_TEST(func, test_params) \
volk_gnsssdr_test_case_t(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), \
test_params)
#define QA(test) test_cases.push_back(test);
std::vector<volk_gnsssdr_test_case_t> init_test_list(volk_gnsssdr_test_params_t test_params)
{
// Some kernels need a lower tolerance
volk_gnsssdr_test_params_t test_params_inacc = volk_gnsssdr_test_params_t(1e-3, test_params.scalar(),
test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex());
test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex());
volk_gnsssdr_test_params_t test_params_int1 = volk_gnsssdr_test_params_t(1, test_params.scalar(),
test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex());
test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex());
// some others need more iterations ***** ADDED BY GNSS-SDR
volk_gnsssdr_test_params_t test_params_more_iters = volk_gnsssdr_test_params_t(test_params.tol(), test_params.scalar(),
test_params.vlen(), 100000, test_params.benchmark_mode(), test_params.kernel_regex());
test_params.vlen(), 100000, test_params.benchmark_mode(), test_params.kernel_regex());
// ... or more tolerance ***** ADDED BY GNSS-SDR
volk_gnsssdr_test_params_t test_params_int16 = volk_gnsssdr_test_params_t(16, test_params.scalar(),
test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex());
test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex());
volk_gnsssdr_test_params_t test_params_inacc2 = volk_gnsssdr_test_params_t(2e-1, test_params.scalar(),
test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex());
test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex());
std::vector<volk_gnsssdr_test_case_t> test_cases;
@ -98,8 +97,7 @@ std::vector<volk_gnsssdr_test_case_t> init_test_list(volk_gnsssdr_test_params_t
QA(VOLK_INIT_PUPP(volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic, volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn, test_params_int16))
QA(VOLK_INIT_PUPP(volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic, volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn, test_params_int16))
QA(VOLK_INIT_PUPP(volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc, volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn, test_params_int1))
QA(VOLK_INIT_PUPP(volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc, volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn, test_params_int1))
;
QA(VOLK_INIT_PUPP(volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc, volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn, test_params_int1));
return test_cases;
}
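A minimal model of the VOLK_INIT_TEST mechanics reformatted above (test_case_t and INIT_TEST are toy stand-ins): ## pastes the per-kernel "_manual" symbol onto the name, and # stringizes the kernel name that gets written to the results.

#include <stdio.h>

typedef struct
{
    const char *name;
    void (*fn)(void);
} test_case_t;

static void demo_kernel_manual(void) { puts("ran demo_kernel_manual"); }

#define INIT_TEST(func) \
    (test_case_t){#func, func##_manual}

int main(void)
{
    test_case_t tc = INIT_TEST(demo_kernel); /* {"demo_kernel", demo_kernel_manual} */
    printf("%s\n", tc.name);
    tc.fn();
    return 0;
}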

View File

@ -25,17 +25,18 @@
#include "volk_gnsssdr/volk_gnsssdr_complex.h" // for lv_32fc_t
#include "volk_gnsssdr/volk_gnsssdr.h" // for volk_gnsssdr_func_desc_t
#include <cstdbool> // for bool, false
#include <cstdlib> // for NULL
#include <map> // for map
#include <string> // for string, basic_string
#include <vector> // for vector
#include <cstdbool> // for bool, false
#include <cstdlib> // for NULL
#include <map> // for map
#include <string> // for string, basic_string
#include <vector> // for vector
/************************************************
* VOLK QA type definitions *
************************************************/
struct volk_gnsssdr_type_t {
struct volk_gnsssdr_type_t
{
bool is_float;
bool is_scalar;
bool is_signed;
@ -44,80 +45,78 @@ struct volk_gnsssdr_type_t {
std::string str;
};
class volk_gnsssdr_test_time_t {
public:
std::string name;
double time;
std::string units;
bool pass;
class volk_gnsssdr_test_time_t
{
public:
std::string name;
double time;
std::string units;
bool pass;
};
class volk_gnsssdr_test_results_t {
public:
std::string name;
std::string config_name;
unsigned int vlen;
unsigned int iter;
std::map<std::string, volk_gnsssdr_test_time_t> results;
std::string best_arch_a;
std::string best_arch_u;
class volk_gnsssdr_test_results_t
{
public:
std::string name;
std::string config_name;
unsigned int vlen;
unsigned int iter;
std::map<std::string, volk_gnsssdr_test_time_t> results;
std::string best_arch_a;
std::string best_arch_u;
};
class volk_gnsssdr_test_params_t {
private:
float _tol;
lv_32fc_t _scalar;
unsigned int _vlen;
unsigned int _iter;
bool _benchmark_mode;
std::string _kernel_regex;
public:
// ctor
volk_gnsssdr_test_params_t(float tol, lv_32fc_t scalar, unsigned int vlen, unsigned int iter,
bool benchmark_mode, std::string kernel_regex) :
_tol(tol), _scalar(scalar), _vlen(vlen), _iter(iter),
_benchmark_mode(benchmark_mode), _kernel_regex(kernel_regex) {};
// setters
void set_tol(float tol) {_tol=tol;};
void set_scalar(lv_32fc_t scalar) {_scalar=scalar;};
void set_vlen(unsigned int vlen) {_vlen=vlen;};
void set_iter(unsigned int iter) {_iter=iter;};
void set_benchmark(bool benchmark) {_benchmark_mode=benchmark;};
void set_regex(std::string regex) {_kernel_regex=regex;};
// getters
float tol() {return _tol;};
lv_32fc_t scalar() {return _scalar;};
unsigned int vlen() {return _vlen;};
unsigned int iter() {return _iter;};
bool benchmark_mode() {return _benchmark_mode;};
std::string kernel_regex() {return _kernel_regex;};
class volk_gnsssdr_test_params_t
{
private:
float _tol;
lv_32fc_t _scalar;
unsigned int _vlen;
unsigned int _iter;
bool _benchmark_mode;
std::string _kernel_regex;
public:
// ctor
volk_gnsssdr_test_params_t(float tol, lv_32fc_t scalar, unsigned int vlen, unsigned int iter,
bool benchmark_mode, std::string kernel_regex) : _tol(tol), _scalar(scalar), _vlen(vlen), _iter(iter), _benchmark_mode(benchmark_mode), _kernel_regex(kernel_regex){};
// setters
void set_tol(float tol) { _tol = tol; };
void set_scalar(lv_32fc_t scalar) { _scalar = scalar; };
void set_vlen(unsigned int vlen) { _vlen = vlen; };
void set_iter(unsigned int iter) { _iter = iter; };
void set_benchmark(bool benchmark) { _benchmark_mode = benchmark; };
void set_regex(std::string regex) { _kernel_regex = regex; };
// getters
float tol() { return _tol; };
lv_32fc_t scalar() { return _scalar; };
unsigned int vlen() { return _vlen; };
unsigned int iter() { return _iter; };
bool benchmark_mode() { return _benchmark_mode; };
std::string kernel_regex() { return _kernel_regex; };
};
class volk_gnsssdr_test_case_t {
private:
volk_gnsssdr_func_desc_t _desc;
void(*_kernel_ptr)();
std::string _name;
volk_gnsssdr_test_params_t _test_parameters;
std::string _puppet_master_name;
public:
volk_gnsssdr_func_desc_t desc() {return _desc;};
void (*kernel_ptr()) () {return _kernel_ptr;};
std::string name() {return _name;};
std::string puppet_master_name() {return _puppet_master_name;};
volk_gnsssdr_test_params_t test_parameters() {return _test_parameters;};
// normal ctor
volk_gnsssdr_test_case_t(volk_gnsssdr_func_desc_t desc, void(*kernel_ptr)(), std::string name,
volk_gnsssdr_test_params_t test_parameters) :
_desc(desc), _kernel_ptr(kernel_ptr), _name(name), _test_parameters(test_parameters),
_puppet_master_name("NULL")
{};
// ctor for puppets
volk_gnsssdr_test_case_t(volk_gnsssdr_func_desc_t desc, void(*kernel_ptr)(), std::string name,
std::string puppet_master_name, volk_gnsssdr_test_params_t test_parameters) :
_desc(desc), _kernel_ptr(kernel_ptr), _name(name), _test_parameters(test_parameters),
_puppet_master_name(puppet_master_name)
{};
class volk_gnsssdr_test_case_t
{
private:
volk_gnsssdr_func_desc_t _desc;
void (*_kernel_ptr)();
std::string _name;
volk_gnsssdr_test_params_t _test_parameters;
std::string _puppet_master_name;
public:
volk_gnsssdr_func_desc_t desc() { return _desc; };
void (*kernel_ptr())() { return _kernel_ptr; };
std::string name() { return _name; };
std::string puppet_master_name() { return _puppet_master_name; };
volk_gnsssdr_test_params_t test_parameters() { return _test_parameters; };
// normal ctor
volk_gnsssdr_test_case_t(volk_gnsssdr_func_desc_t desc, void (*kernel_ptr)(), std::string name,
volk_gnsssdr_test_params_t test_parameters) : _desc(desc), _kernel_ptr(kernel_ptr), _name(name), _test_parameters(test_parameters), _puppet_master_name("NULL"){};
// ctor for puppets
volk_gnsssdr_test_case_t(volk_gnsssdr_func_desc_t desc, void (*kernel_ptr)(), std::string name,
std::string puppet_master_name, volk_gnsssdr_test_params_t test_parameters) : _desc(desc), _kernel_ptr(kernel_ptr), _name(name), _test_parameters(test_parameters), _puppet_master_name(puppet_master_name){};
};
/************************************************
@ -130,58 +129,57 @@ void random_floats(float *buf, unsigned n);
bool run_volk_gnsssdr_tests(
volk_gnsssdr_func_desc_t,
void(*)(),
void (*)(),
std::string,
volk_gnsssdr_test_params_t,
std::vector<volk_gnsssdr_test_results_t> *results = NULL,
std::string puppet_master_name = "NULL"
);
std::string puppet_master_name = "NULL");
bool run_volk_gnsssdr_tests(
volk_gnsssdr_func_desc_t,
void(*)(),
std::string,
float,
lv_32fc_t,
unsigned int,
unsigned int,
std::vector<volk_gnsssdr_test_results_t> *results = NULL,
std::string puppet_master_name = "NULL",
bool benchmark_mode = false
);
volk_gnsssdr_func_desc_t,
void (*)(),
std::string,
float,
lv_32fc_t,
unsigned int,
unsigned int,
std::vector<volk_gnsssdr_test_results_t> *results = NULL,
std::string puppet_master_name = "NULL",
bool benchmark_mode = false);
#define VOLK_RUN_TESTS(func, tol, scalar, len, iter) \
BOOST_AUTO_TEST_CASE(func##_test) { \
BOOST_CHECK_EQUAL(run_volk_gnsssdr_tests( \
func##_get_func_desc(), (void (*)())func##_manual, \
std::string(#func), tol, scalar, len, iter, 0, "NULL"), \
0); \
#define VOLK_RUN_TESTS(func, tol, scalar, len, iter) \
BOOST_AUTO_TEST_CASE(func##_test) \
{ \
BOOST_CHECK_EQUAL(run_volk_gnsssdr_tests( \
func##_get_func_desc(), (void (*)())func##_manual, \
std::string(#func), tol, scalar, len, iter, 0, "NULL"), \
0); \
}
#define VOLK_PROFILE(func, test_params, results) run_volk_gnsssdr_tests(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), test_params, results, "NULL")
#define VOLK_PUPPET_PROFILE(func, puppet_master_func, test_params, results) run_volk_gnsssdr_tests(func##_get_func_desc(), (void (*)())func##_manual, std::string(#func), test_params, results, std::string(#puppet_master_func))
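
For orientation, a hedged illustration of what the test macro above expands to; the kernel name volk_gnsssdr_foo is a placeholder:

// VOLK_RUN_TESTS(volk_gnsssdr_foo, 1e-4, 0, 2046, 1) becomes:
//
// BOOST_AUTO_TEST_CASE(volk_gnsssdr_foo_test)
// {
//     BOOST_CHECK_EQUAL(run_volk_gnsssdr_tests(
//                           volk_gnsssdr_foo_get_func_desc(), (void (*)())volk_gnsssdr_foo_manual,
//                           std::string("volk_gnsssdr_foo"), 1e-4, 0, 2046, 1, 0, "NULL"),
//         0);
// }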
typedef void (*volk_gnsssdr_fn_1arg)(void *, unsigned int, const char*); //one input, operate in place
typedef void (*volk_gnsssdr_fn_2arg)(void *, void *, unsigned int, const char*);
typedef void (*volk_gnsssdr_fn_3arg)(void *, void *, void *, unsigned int, const char*);
typedef void (*volk_gnsssdr_fn_4arg)(void *, void *, void *, void *, unsigned int, const char*);
typedef void (*volk_gnsssdr_fn_1arg_s32f)(void *, float, unsigned int, const char*); //one input vector, one scalar float input
typedef void (*volk_gnsssdr_fn_2arg_s32f)(void *, void *, float, unsigned int, const char*);
typedef void (*volk_gnsssdr_fn_3arg_s32f)(void *, void *, void *, float, unsigned int, const char*);
typedef void (*volk_gnsssdr_fn_1arg_s32fc)(void *, lv_32fc_t, unsigned int, const char*); //one input vector, one scalar float input
typedef void (*volk_gnsssdr_fn_2arg_s32fc)(void *, void *, lv_32fc_t, unsigned int, const char*);
typedef void (*volk_gnsssdr_fn_3arg_s32fc)(void *, void *, void *, lv_32fc_t, unsigned int, const char*);
typedef void (*volk_gnsssdr_fn_1arg)(void *, unsigned int, const char *); //one input, operate in place
typedef void (*volk_gnsssdr_fn_2arg)(void *, void *, unsigned int, const char *);
typedef void (*volk_gnsssdr_fn_3arg)(void *, void *, void *, unsigned int, const char *);
typedef void (*volk_gnsssdr_fn_4arg)(void *, void *, void *, void *, unsigned int, const char *);
typedef void (*volk_gnsssdr_fn_1arg_s32f)(void *, float, unsigned int, const char *); //one input vector, one scalar float input
typedef void (*volk_gnsssdr_fn_2arg_s32f)(void *, void *, float, unsigned int, const char *);
typedef void (*volk_gnsssdr_fn_3arg_s32f)(void *, void *, void *, float, unsigned int, const char *);
typedef void (*volk_gnsssdr_fn_1arg_s32fc)(void *, lv_32fc_t, unsigned int, const char *); //one input vector, one scalar float input
typedef void (*volk_gnsssdr_fn_2arg_s32fc)(void *, void *, lv_32fc_t, unsigned int, const char *);
typedef void (*volk_gnsssdr_fn_3arg_s32fc)(void *, void *, void *, lv_32fc_t, unsigned int, const char *);
//ADDED BY GNSS-SDR. START
typedef void (*volk_gnsssdr_fn_1arg_s8i)(void *, char, unsigned int, const char*); //one input vector, one scalar char input
typedef void (*volk_gnsssdr_fn_2arg_s8i)(void *, void *, char, unsigned int, const char*);
typedef void (*volk_gnsssdr_fn_3arg_s8i)(void *, void *, void *, char, unsigned int, const char*);
typedef void (*volk_gnsssdr_fn_1arg_s8ic)(void *, lv_8sc_t, unsigned int, const char*); //one input vector, one scalar lv_8sc_t vector input
typedef void (*volk_gnsssdr_fn_2arg_s8ic)(void *, void *, lv_8sc_t, unsigned int, const char*);
typedef void (*volk_gnsssdr_fn_3arg_s8ic)(void *, void *, void *, lv_8sc_t, unsigned int, const char*);
typedef void (*volk_gnsssdr_fn_1arg_s16ic)(void *, lv_16sc_t, unsigned int, const char*); //one input vector, one scalar lv_16sc_t vector input
typedef void (*volk_gnsssdr_fn_2arg_s16ic)(void *, void *, lv_16sc_t, unsigned int, const char*);
typedef void (*volk_gnsssdr_fn_3arg_s16ic)(void *, void *, void *, lv_16sc_t, unsigned int, const char*);
typedef void (*volk_gnsssdr_fn_1arg_s8i)(void *, char, unsigned int, const char *); //one input vector, one scalar char input
typedef void (*volk_gnsssdr_fn_2arg_s8i)(void *, void *, char, unsigned int, const char *);
typedef void (*volk_gnsssdr_fn_3arg_s8i)(void *, void *, void *, char, unsigned int, const char *);
typedef void (*volk_gnsssdr_fn_1arg_s8ic)(void *, lv_8sc_t, unsigned int, const char *); //one input vector, one scalar lv_8sc_t vector input
typedef void (*volk_gnsssdr_fn_2arg_s8ic)(void *, void *, lv_8sc_t, unsigned int, const char *);
typedef void (*volk_gnsssdr_fn_3arg_s8ic)(void *, void *, void *, lv_8sc_t, unsigned int, const char *);
typedef void (*volk_gnsssdr_fn_1arg_s16ic)(void *, lv_16sc_t, unsigned int, const char *); //one input vector, one scalar lv_16sc_t vector input
typedef void (*volk_gnsssdr_fn_2arg_s16ic)(void *, void *, lv_16sc_t, unsigned int, const char *);
typedef void (*volk_gnsssdr_fn_3arg_s16ic)(void *, void *, void *, lv_16sc_t, unsigned int, const char *);
//ADDED BY GNSS-SDR. END
#endif // GNSS_SDR_VOLK_QA_UTILS_H
#endif // GNSS_SDR_VOLK_QA_UTILS_H
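
As a hedged illustration of how these typedefs are used (the kernel below is hypothetical, named only to make the shapes concrete): a two-input, one-output kernel fits volk_gnsssdr_fn_3arg, and its _manual entry point carries the trailing implementation-name selector:

// void volk_gnsssdr_example_x2_multiply_example_manual(void *result,
//     void *in0, void *in1, unsigned int num_points, const char *impl_name);
// ...is callable through a volk_gnsssdr_fn_3arg pointer.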


@ -18,16 +18,16 @@
*/
#include "kernel_tests.h" // for init_test_list
#include "qa_utils.h" // for volk_gnsssdr_test_case_t, volk_gnsssdr_test_results_t
#include "kernel_tests.h" // for init_test_list
#include "qa_utils.h" // for volk_gnsssdr_test_case_t, volk_gnsssdr_test_results_t
#include "volk_gnsssdr/volk_gnsssdr_complex.h" // for lv_32fc_t
#include <cstdbool> // for bool, false, true
#include <iostream> // for operator<<, basic_ostream, endl, char...
#include <fstream> // IWYU pragma: keep
#include <map> // for map, map<>::iterator, _Rb_tree_iterator
#include <string> // for string, operator<<
#include <utility> // for pair
#include <vector> // for vector
#include <cstdbool> // for bool, false, true
#include <iostream> // for operator<<, basic_ostream, endl, char...
#include <fstream> // IWYU pragma: keep
#include <map> // for map, map<>::iterator, _Rb_tree_iterator
#include <string> // for string, operator<<
#include <utility> // for pair
#include <vector> // for vector
void print_qa_xml(std::vector<volk_gnsssdr_test_results_t> results, unsigned int nfails);
@ -49,38 +49,44 @@ int main()
std::vector<std::string> qa_failures;
std::vector<volk_gnsssdr_test_results_t> results;
// Test every kernel reporting failures when they occur
for(unsigned int ii = 0; ii < test_cases.size(); ++ii) {
bool qa_result = false;
volk_gnsssdr_test_case_t test_case = test_cases[ii];
try {
qa_result = run_volk_gnsssdr_tests(test_case.desc(), test_case.kernel_ptr(), test_case.name(),
test_case.test_parameters(), &results, test_case.puppet_master_name());
}
catch(...) {
// TODO: what exceptions might we need to catch and how do we handle them?
std::cerr << "Exception found on kernel: " << test_case.name() << std::endl;
qa_result = false;
}
for (unsigned int ii = 0; ii < test_cases.size(); ++ii)
{
bool qa_result = false;
volk_gnsssdr_test_case_t test_case = test_cases[ii];
try
{
qa_result = run_volk_gnsssdr_tests(test_case.desc(), test_case.kernel_ptr(), test_case.name(),
test_case.test_parameters(), &results, test_case.puppet_master_name());
}
catch (...)
{
// TODO: what exceptions might we need to catch and how do we handle them?
std::cerr << "Exception found on kernel: " << test_case.name() << std::endl;
qa_result = false;
}
if(qa_result) {
std::cerr << "Failure on " << test_case.name() << std::endl;
qa_failures.push_back(test_case.name());
if (qa_result)
{
std::cerr << "Failure on " << test_case.name() << std::endl;
qa_failures.push_back(test_case.name());
}
}
}
// Generate XML results
print_qa_xml(results, qa_failures.size());
// Summarize QA results
std::cerr << "Kernel QA finished: " << qa_failures.size() << " failures out of "
<< test_cases.size() << " tests." << std::endl;
if(qa_failures.size() > 0) {
std::cerr << "The following kernels failed QA:" << std::endl;
for(unsigned int ii = 0; ii < qa_failures.size(); ++ii) {
std::cerr << " " << qa_failures[ii] << std::endl;
<< test_cases.size() << " tests." << std::endl;
if (qa_failures.size() > 0)
{
std::cerr << "The following kernels failed QA:" << std::endl;
for (unsigned int ii = 0; ii < qa_failures.size(); ++ii)
{
std::cerr << " " << qa_failures[ii] << std::endl;
}
qa_ret_val = 1;
}
qa_ret_val = 1;
}
return qa_ret_val;
}
@ -95,34 +101,34 @@ void print_qa_xml(std::vector<volk_gnsssdr_test_results_t> results, unsigned int
qa_file.open(".unittest/kernels.xml");
qa_file << "<?xml version=\"1.0\" encoding=\"UTF-8\"?>" << std::endl;
qa_file << "<testsuites name=\"kernels\" " <<
"tests=\"" << results.size() << "\" " <<
"failures=\"" << nfails << "\" id=\"1\">" << std::endl;
qa_file << "<testsuites name=\"kernels\" "
<< "tests=\"" << results.size() << "\" "
<< "failures=\"" << nfails << "\" id=\"1\">" << std::endl;
// Results are in a vector by kernel. Each element has a result
// map containing time and arch name with test result
for(unsigned int ii=0; ii < results.size(); ++ii) {
volk_gnsssdr_test_results_t result = results[ii];
qa_file << " <testsuite name=\"" << result.name << "\">" << std::endl;
for (unsigned int ii = 0; ii < results.size(); ++ii)
{
volk_gnsssdr_test_results_t result = results[ii];
qa_file << " <testsuite name=\"" << result.name << "\">" << std::endl;
std::map<std::string, volk_gnsssdr_test_time_t>::iterator kernel_time_pair;
for(kernel_time_pair = result.results.begin(); kernel_time_pair != result.results.end(); ++kernel_time_pair) {
volk_gnsssdr_test_time_t test_time = kernel_time_pair->second;
qa_file << " <testcase name=\"" << test_time.name << "\" " <<
"classname=\"" << result.name << "\" " <<
"time=\"" << test_time.time << "\">" << std::endl;
if(!test_time.pass)
qa_file << " <failure " <<
"message=\"fail on arch " << test_time.name << "\">" <<
"</failure>" << std::endl;
qa_file << " </testcase>" << std::endl;
std::map<std::string, volk_gnsssdr_test_time_t>::iterator kernel_time_pair;
for (kernel_time_pair = result.results.begin(); kernel_time_pair != result.results.end(); ++kernel_time_pair)
{
volk_gnsssdr_test_time_t test_time = kernel_time_pair->second;
qa_file << " <testcase name=\"" << test_time.name << "\" "
<< "classname=\"" << result.name << "\" "
<< "time=\"" << test_time.time << "\">" << std::endl;
if (!test_time.pass)
qa_file << " <failure "
<< "message=\"fail on arch " << test_time.name << "\">"
<< "</failure>" << std::endl;
qa_file << " </testcase>" << std::endl;
}
qa_file << " </testsuite>" << std::endl;
}
qa_file << " </testsuite>" << std::endl;
}
qa_file << "</testsuites>" << std::endl;
qa_file.close();
}
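
A hedged illustration of the JUnit-style layout this routine writes to .unittest/kernels.xml; kernel and arch names are placeholders, not output from the commit:

// <?xml version="1.0" encoding="UTF-8"?>
// <testsuites name="kernels" tests="1" failures="1" id="1">
//  <testsuite name="volk_gnsssdr_example_kernel">
//   <testcase name="generic" classname="volk_gnsssdr_example_kernel" time="1.5">
//   </testcase>
//   <testcase name="a_sse2" classname="volk_gnsssdr_example_kernel" time="0.4">
//    <failure message="fail on arch a_sse2"></failure>
//   </testcase>
//  </testsuite>
// </testsuites>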


@ -43,15 +43,16 @@ void *volk_gnsssdr_malloc(size_t size, size_t alignment)
return malloc(size);
int err = posix_memalign(&ptr, alignment, size);
if(err == 0)
if (err == 0)
{
return ptr;
}
else
{
fprintf(stderr,
"VOLK_GNSSSDR: Error allocating memory "
"(posix_memalign: error %d: %s)\n", err, strerror(err));
"VOLK_GNSSSDR: Error allocating memory "
"(posix_memalign: error %d: %s)\n",
err, strerror(err));
return NULL;
}
}
@ -68,7 +69,7 @@ void volk_gnsssdr_free(void *ptr)
void *volk_gnsssdr_malloc(size_t size, size_t alignment)
{
void *ptr = _aligned_malloc(size, alignment);
if(ptr == NULL)
if (ptr == NULL)
{
fprintf(stderr, "VOLK_GNSSSDR: Error allocating memory (_aligned_malloc)\n");
}
@ -81,7 +82,7 @@ void volk_gnsssdr_free(void *ptr)
}
// No standard handlers; we'll do it ourselves.
#else // _POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600 || HAVE_POSIX_MEMALIGN
#else // _POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600 || HAVE_POSIX_MEMALIGN
struct block_info
{
@ -102,7 +103,7 @@ volk_gnsssdr_malloc(size_t size, size_t alignment)
real = malloc(size + (2 * alignment - 1));
/* Get pointer to the various zones */
user = (void *)((((uintptr_t) real) + sizeof(struct block_info) + alignment - 1) & ~(alignment - 1));
user = (void *)((((uintptr_t)real) + sizeof(struct block_info) + alignment - 1) & ~(alignment - 1));
info = (struct block_info *)(((uintptr_t)user) - sizeof(struct block_info));
/* Store the info for the free */
@ -112,8 +113,7 @@ volk_gnsssdr_malloc(size_t size, size_t alignment)
return user;
}
void
volk_gnsssdr_free(void *ptr)
void volk_gnsssdr_free(void *ptr)
{
struct block_info *info;
@ -124,6 +124,6 @@ volk_gnsssdr_free(void *ptr)
free(info->real);
}
#endif // _POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600 || HAVE_POSIX_MEMALIGN
#endif // _POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600 || HAVE_POSIX_MEMALIGN
//#endif // _ISOC11_SOURCE
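
A short sketch of the align-up arithmetic the fallback path relies on; the helper name and example values are illustrative, not from the commit. Rounding an address up to a power-of-two alignment A is (addr + A - 1) & ~(A - 1), and the extra sizeof(struct block_info) bytes reserved above leave room to stash the pointer that free() really needs:

#include <cassert>
#include <cstdint>

static std::uintptr_t align_up(std::uintptr_t addr, std::uintptr_t alignment)
{
    // the mask trick only works for power-of-two alignments
    assert((alignment & (alignment - 1)) == 0);
    return (addr + alignment - 1) & ~(alignment - 1);
}

// e.g. align_up(0x1003, 16) == 0x1010, while align_up(0x1010, 16) == 0x1010.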


@ -26,16 +26,17 @@ void volk_gnsssdr_get_config_path(char *path)
{
if (!path) return;
const char *suffix = "/.volk_gnsssdr/volk_gnsssdr_config";
const char *suffix2 = "/volk_gnsssdr/volk_gnsssdr_config"; //non-hidden
const char *suffix2 = "/volk_gnsssdr/volk_gnsssdr_config"; // non-hidden
char *home = NULL;
//allows config redirection via env variable
home = getenv("VOLK_CONFIGPATH");
if(home!=NULL){
strncpy(path,home,512);
strcat(path,suffix2);
return;
}
if (home != NULL)
{
strncpy(path, home, 512);
strcat(path, suffix2);
return;
}
if (home == NULL) home = getenv("HOME");
if (home == NULL) home = getenv("APPDATA");
@ -57,16 +58,16 @@ size_t volk_gnsssdr_load_preferences(volk_gnsssdr_arch_pref_t **prefs_res)
//get the config path
volk_gnsssdr_get_config_path(path);
if (!path[0]) return n_arch_prefs; //no prefs found
if (!path[0]) return n_arch_prefs; //no prefs found
config_file = fopen(path, "r");
if(!config_file) return n_arch_prefs; //no prefs found
if (!config_file) return n_arch_prefs; //no prefs found
//reset the file pointer and write the prefs into volk_gnsssdr_arch_prefs
while(fgets(line, sizeof(line), config_file) != NULL)
while (fgets(line, sizeof(line), config_file) != NULL)
{
prefs = (volk_gnsssdr_arch_pref_t *) realloc(prefs, (n_arch_prefs+1) * sizeof(*prefs));
prefs = (volk_gnsssdr_arch_pref_t *)realloc(prefs, (n_arch_prefs + 1) * sizeof(*prefs));
volk_gnsssdr_arch_pref_t *p = prefs + n_arch_prefs;
if(sscanf(line, "%s %s %s", p->name, p->impl_a, p->impl_u) == 3 && !strncmp(p->name, "volk_gnsssdr_", 5))
if (sscanf(line, "%s %s %s", p->name, p->impl_a, p->impl_u) == 3 && !strncmp(p->name, "volk_gnsssdr_", 5))
{
n_arch_prefs++;
}
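
For reference, a hedged example of the preferences file this loop accepts; kernel and implementation names are placeholders. Each line is three whitespace-separated tokens, matched by sscanf("%s %s %s"): a kernel name, the preferred aligned implementation, and the preferred unaligned implementation:

// ~/.volk_gnsssdr/volk_gnsssdr_config
// volk_gnsssdr_example_kernel_one   a_sse3   u_sse3
// volk_gnsssdr_example_kernel_two   a_avx    generic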


@ -29,7 +29,7 @@
inline unsigned __popcnt(unsigned num)
{
unsigned pop = 0;
while(num)
while (num)
{
if (num & 0x1) pop++;
num >>= 1;
@ -39,15 +39,15 @@ inline unsigned __popcnt(unsigned num)
#endif
int volk_gnsssdr_get_index(
const char *impl_names[], //list of implementations by name
const size_t n_impls, //number of implementations available
const char *impl_name //the implementation name to find
)
const char *impl_names[], //list of implementations by name
const size_t n_impls, //number of implementations available
const char *impl_name //the implementation name to find
)
{
unsigned int i;
for (i = 0; i < n_impls; i++)
{
if(!strncmp(impl_names[i], impl_name, 20))
if (!strncmp(impl_names[i], impl_name, 20))
{
return i;
}
@ -55,24 +55,24 @@ int volk_gnsssdr_get_index(
//TODO return -1;
//something terrible should happen here
fprintf(stderr, "VOLK_GNSSSDR warning: no arch found, returning generic impl\n");
return volk_gnsssdr_get_index(impl_names, n_impls, "generic"); //but we'll fake it for now
return volk_gnsssdr_get_index(impl_names, n_impls, "generic"); //but we'll fake it for now
}
int volk_gnsssdr_rank_archs(
const char *kern_name, //name of the kernel to rank
const char *impl_names[], //list of implementations by name
const int* impl_deps, //requirement mask per implementation
const bool* alignment, //alignment status of each implementation
size_t n_impls, //number of implementations available
const bool align //if false, filter aligned implementations
const char *kern_name, //name of the kernel to rank
const char *impl_names[], //list of implementations by name
const int *impl_deps, //requirement mask per implementation
const bool *alignment, //alignment status of each implementation
size_t n_impls, //number of implementations available
const bool align //if false, filter aligned implementations
)
{
size_t i;
static volk_gnsssdr_arch_pref_t *volk_gnsssdr_arch_prefs;
static size_t n_arch_prefs = 0;
static int prefs_loaded = 0;
if(!prefs_loaded)
if (!prefs_loaded)
{
n_arch_prefs = volk_gnsssdr_load_preferences(&volk_gnsssdr_arch_prefs);
prefs_loaded = 1;
@ -81,17 +81,17 @@ int volk_gnsssdr_rank_archs(
// If we've defined VOLK_GENERIC to be anything, always return the
// 'generic' kernel. Used in GR's QA code.
char *gen_env = getenv("VOLK_GENERIC");
if(gen_env)
if (gen_env)
{
return volk_gnsssdr_get_index(impl_names, n_impls, "generic");
}
//now look for the function name in the prefs list
for(i = 0; i < n_arch_prefs; i++)
for (i = 0; i < n_arch_prefs; i++)
{
if(!strncmp(kern_name, volk_gnsssdr_arch_prefs[i].name, sizeof(volk_gnsssdr_arch_prefs[i].name))) //found it
if (!strncmp(kern_name, volk_gnsssdr_arch_prefs[i].name, sizeof(volk_gnsssdr_arch_prefs[i].name))) //found it
{
const char *impl_name = align? volk_gnsssdr_arch_prefs[i].impl_a : volk_gnsssdr_arch_prefs[i].impl_u;
const char *impl_name = align ? volk_gnsssdr_arch_prefs[i].impl_a : volk_gnsssdr_arch_prefs[i].impl_u;
return volk_gnsssdr_get_index(impl_names, n_impls, impl_name);
}
}
@ -101,7 +101,7 @@ int volk_gnsssdr_rank_archs(
size_t best_index_u = 0;
int best_value_a = -1;
int best_value_u = -1;
for(i = 0; i < n_impls; i++)
for (i = 0; i < n_impls; i++)
{
const signed val = __popcnt(impl_deps[i]);
if (alignment[i] && val > best_value_a)
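
The ranking pass is cut off above, but the visible lines show the rule: each implementation is scored by the population count of its dependency mask, and the best aligned and unaligned candidates are tracked. A simplified, self-contained restatement under that assumption (helper names are not from the commit; the real loop tracks both winners in one pass):

#include <cstddef>

static int popcount(unsigned int v)
{
    // same bit-counting loop as the __popcnt fallback above
    int n = 0;
    for (; v; v >>= 1) n += v & 1;
    return n;
}

static std::size_t pick_best(const int *impl_deps, const bool *alignment,
    std::size_t n_impls, bool want_aligned)
{
    std::size_t best_index = 0;
    int best_value = -1;
    for (std::size_t i = 0; i < n_impls; i++)
        {
            if (alignment[i] != want_aligned) continue;
            const int val = popcount((unsigned int)impl_deps[i]);
            if (val > best_value)
                {
                    best_value = val;
                    best_index = i;
                }
        }
    return best_index;
}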


@ -23,23 +23,24 @@
#include <stdbool.h>
#ifdef __cplusplus
extern "C" {
extern "C"
{
#endif
int volk_gnsssdr_get_index(
const char *impl_names[], //list of implementations by name
const size_t n_impls, //number of implementations available
const char *impl_name //the implementation name to find
);
int volk_gnsssdr_get_index(
const char *impl_names[], //list of implementations by name
const size_t n_impls, //number of implementations available
const char *impl_name //the implementation name to find
);
int volk_gnsssdr_rank_archs(
const char *kern_name, //name of the kernel to rank
const char *impl_names[], //list of implementations by name
const int* impl_deps, //requirement mask per implementation
const bool* alignment, //alignment status of each implementation
size_t n_impls, //number of implementations available
const bool align //if false, filter aligned implementations
);
int volk_gnsssdr_rank_archs(
const char *kern_name, //name of the kernel to rank
const char *impl_names[], //list of implementations by name
const int *impl_deps, //requirement mask per implementation
const bool *alignment, //alignment status of each implementation
size_t n_impls, //number of implementations available
const bool align //if false, filter aligned implementations
);
#ifdef __cplusplus
}


@ -31,80 +31,90 @@ static intptr_t __alignment_mask = 0;
struct volk_gnsssdr_machine *get_machine(void)
{
extern struct volk_gnsssdr_machine *volk_gnsssdr_machines[];
extern unsigned int n_volk_gnsssdr_machines;
static struct volk_gnsssdr_machine *machine = NULL;
extern struct volk_gnsssdr_machine *volk_gnsssdr_machines[];
extern unsigned int n_volk_gnsssdr_machines;
static struct volk_gnsssdr_machine *machine = NULL;
if(machine != NULL)
return machine;
else {
unsigned int max_score = 0;
unsigned int i;
struct volk_gnsssdr_machine *max_machine = NULL;
for(i=0; i<n_volk_gnsssdr_machines; i++) {
if(!(volk_gnsssdr_machines[i]->caps & (~volk_gnsssdr_get_lvarch()))) {
if(volk_gnsssdr_machines[i]->caps > max_score) {
max_score = volk_gnsssdr_machines[i]->caps;
max_machine = volk_gnsssdr_machines[i];
if (machine != NULL)
return machine;
else
{
unsigned int max_score = 0;
unsigned int i;
struct volk_gnsssdr_machine *max_machine = NULL;
for (i = 0; i < n_volk_gnsssdr_machines; i++)
{
if (!(volk_gnsssdr_machines[i]->caps & (~volk_gnsssdr_get_lvarch())))
{
if (volk_gnsssdr_machines[i]->caps > max_score)
{
max_score = volk_gnsssdr_machines[i]->caps;
max_machine = volk_gnsssdr_machines[i];
}
}
}
machine = max_machine;
//printf("Using Volk machine: %s\n", machine->name);
__alignment = machine->alignment;
__alignment_mask = (intptr_t)(__alignment - 1);
return machine;
}
}
}
machine = max_machine;
//printf("Using Volk machine: %s\n", machine->name);
__alignment = machine->alignment;
__alignment_mask = (intptr_t)(__alignment-1);
return machine;
}
}
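
The capability filter above reads: a compiled-in machine is usable only when every arch bit in its caps mask is also present in the runtime mask from volk_gnsssdr_get_lvarch(), and the usable machine with the highest caps value wins. A one-line restatement (helper name illustrative):

static bool machine_usable(unsigned int caps, unsigned int lvarch)
{
    // caps & ~lvarch leaves exactly the required-but-missing arch bits
    return (caps & ~lvarch) == 0;
}

// e.g. caps = 0b0011 with lvarch = 0b0111 is usable, while caps = 0b1001
// with lvarch = 0b0111 is rejected (bit 3 is unavailable at runtime).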
void volk_gnsssdr_list_machines(void)
{
extern struct volk_gnsssdr_machine *volk_gnsssdr_machines[];
extern unsigned int n_volk_gnsssdr_machines;
extern struct volk_gnsssdr_machine *volk_gnsssdr_machines[];
extern unsigned int n_volk_gnsssdr_machines;
unsigned int i;
for(i=0; i<n_volk_gnsssdr_machines; i++) {
if(!(volk_gnsssdr_machines[i]->caps & (~volk_gnsssdr_get_lvarch()))) {
printf("%s;", volk_gnsssdr_machines[i]->name);
}
}
printf("\n");
unsigned int i;
for (i = 0; i < n_volk_gnsssdr_machines; i++)
{
if (!(volk_gnsssdr_machines[i]->caps & (~volk_gnsssdr_get_lvarch())))
{
printf("%s;", volk_gnsssdr_machines[i]->name);
}
}
printf("\n");
}
const char* volk_gnsssdr_get_machine(void)
const char *volk_gnsssdr_get_machine(void)
{
extern struct volk_gnsssdr_machine *volk_gnsssdr_machines[];
extern unsigned int n_volk_gnsssdr_machines;
static struct volk_gnsssdr_machine *machine = NULL;
extern struct volk_gnsssdr_machine *volk_gnsssdr_machines[];
extern unsigned int n_volk_gnsssdr_machines;
static struct volk_gnsssdr_machine *machine = NULL;
if(machine != NULL)
return machine->name;
else {
unsigned int max_score = 0;
unsigned int i;
struct volk_gnsssdr_machine *max_machine = NULL;
for(i=0; i<n_volk_gnsssdr_machines; i++) {
if(!(volk_gnsssdr_machines[i]->caps & (~volk_gnsssdr_get_lvarch()))) {
if(volk_gnsssdr_machines[i]->caps > max_score) {
max_score = volk_gnsssdr_machines[i]->caps;
max_machine = volk_gnsssdr_machines[i];
if (machine != NULL)
return machine->name;
else
{
unsigned int max_score = 0;
unsigned int i;
struct volk_gnsssdr_machine *max_machine = NULL;
for (i = 0; i < n_volk_gnsssdr_machines; i++)
{
if (!(volk_gnsssdr_machines[i]->caps & (~volk_gnsssdr_get_lvarch())))
{
if (volk_gnsssdr_machines[i]->caps > max_score)
{
max_score = volk_gnsssdr_machines[i]->caps;
max_machine = volk_gnsssdr_machines[i];
}
}
}
machine = max_machine;
return machine->name;
}
}
}
machine = max_machine;
return machine->name;
}
}
size_t volk_gnsssdr_get_alignment(void)
{
get_machine(); //ensures alignment is set
get_machine(); //ensures alignment is set
return __alignment;
}
bool volk_gnsssdr_is_aligned(const void *ptr)
{
return ((intptr_t)(ptr) & __alignment_mask) == 0;
return ((intptr_t)(ptr)&__alignment_mask) == 0;
}
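
A worked example of the mask test, with illustrative values: for a 32-byte machine alignment, __alignment_mask is 0x1f, so a pointer is aligned exactly when its low five bits are zero:

#include <cstdint>

static bool is_aligned_to(std::intptr_t addr, std::intptr_t alignment)
{
    // alignment must be a power of two; alignment - 1 keeps the low bits
    return (addr & (alignment - 1)) == 0;
}

// is_aligned_to(0x7f0020, 32) -> true;  is_aligned_to(0x7f0024, 32) -> false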
#define LV_HAVE_GENERIC
@ -113,13 +123,12 @@ bool volk_gnsssdr_is_aligned(const void *ptr)
%for kern in kernels:
%if kern.has_dispatcher:
#include <volk_gnsssdr/${kern.name}.h> //pulls in the dispatcher
#include <volk_gnsssdr/${kern.name}.h> //pulls in the dispatcher
%endif
static inline void __${kern.name}_d(${kern.arglist_full})
{
%if kern.has_dispatcher:
${kern.name}_dispatcher(${kern.arglist_names});
% if kern.has_dispatcher : ${kern.name} _dispatcher(${kern.arglist_names});
return;
%endif
@ -131,41 +140,41 @@ static inline void __${kern.name}_d(${kern.arglist_full})
%endfor
0<% end_open_parens = ')'*num_open_parens %>${end_open_parens}
)){
${kern.name}_a(${kern.arglist_names});
${kern.name} _a(${kern.arglist_names});
}
else{
${kern.name}_u(${kern.arglist_names});
${kern.name} _u(${kern.arglist_names});
}
}
static inline void __init_${kern.name}(void)
{
const char *name = get_machine()->${kern.name}_name;
const char **impl_names = get_machine()->${kern.name}_impl_names;
const int *impl_deps = get_machine()->${kern.name}_impl_deps;
const bool *alignment = get_machine()->${kern.name}_impl_alignment;
const size_t n_impls = get_machine()->${kern.name}_n_impls;
const size_t index_a = volk_gnsssdr_rank_archs(name, impl_names, impl_deps, alignment, n_impls, true/*aligned*/);
const size_t index_u = volk_gnsssdr_rank_archs(name, impl_names, impl_deps, alignment, n_impls, false/*unaligned*/);
${kern.name}_a = get_machine()->${kern.name}_impls[index_a];
${kern.name}_u = get_machine()->${kern.name}_impls[index_u];
const char *name = get_machine()->${kern.name} _name;
const char **impl_names = get_machine()->${kern.name} _impl_names;
const int *impl_deps = get_machine()->${kern.name} _impl_deps;
const bool *alignment = get_machine()->${kern.name} _impl_alignment;
const size_t n_impls = get_machine()->${kern.name} _n_impls;
const size_t index_a = volk_gnsssdr_rank_archs(name, impl_names, impl_deps, alignment, n_impls, true /*aligned*/);
const size_t index_u = volk_gnsssdr_rank_archs(name, impl_names, impl_deps, alignment, n_impls, false /*unaligned*/);
${kern.name} _a = get_machine()->${kern.name} _impls[index_a];
${kern.name} _u = get_machine()->${kern.name} _impls[index_u];
assert(${kern.name}_a);
assert(${kern.name}_u);
assert(${kern.name} _a);
assert(${kern.name} _u);
${kern.name} = &__${kern.name}_d;
${kern.name} = &__${kern.name} _d;
}
static inline void __${kern.name}_a(${kern.arglist_full})
static inline void __${kern.name} _a(${kern.arglist_full})
{
__init_${kern.name}();
${kern.name}_a(${kern.arglist_names});
${kern.name} _a(${kern.arglist_names});
}
static inline void __${kern.name}_u(${kern.arglist_full})
static inline void __${kern.name} _u(${kern.arglist_full})
{
__init_${kern.name}();
${kern.name}_u(${kern.arglist_names});
${kern.name} _u(${kern.arglist_names});
}
static inline void __${kern.name}(${kern.arglist_full})
@ -174,34 +183,32 @@ static inline void __${kern.name}(${kern.arglist_full})
${kern.name}(${kern.arglist_names});
}
${kern.pname} ${kern.name}_a = &__${kern.name}_a;
${kern.pname} ${kern.name}_u = &__${kern.name}_u;
${kern.pname} ${kern.name} = &__${kern.name};
${kern.pname} ${kern.name} _a = &__${kern.name} _a;
${kern.pname} ${kern.name} _u = &__${kern.name} _u;
${kern.pname} ${kern.name} = &__${kern.name};
void ${kern.name}_manual(${kern.arglist_full}, const char* impl_name)
void ${kern.name} _manual(${kern.arglist_full}, const char *impl_name)
{
const int index = volk_gnsssdr_get_index(
get_machine()->${kern.name}_impl_names,
get_machine()->${kern.name}_n_impls,
impl_name
);
get_machine()->${kern.name}_impls[index](
${kern.arglist_names}
);
get_machine()->${kern.name} _impl_names,
get_machine()->${kern.name} _n_impls,
impl_name);
get_machine()->${kern.name} _impls[index](
${kern.arglist_names});
}
volk_gnsssdr_func_desc_t ${kern.name}_get_func_desc(void) {
const char **impl_names = get_machine()->${kern.name}_impl_names;
const int *impl_deps = get_machine()->${kern.name}_impl_deps;
const bool *alignment = get_machine()->${kern.name}_impl_alignment;
const size_t n_impls = get_machine()->${kern.name}_n_impls;
volk_gnsssdr_func_desc_t ${kern.name} _get_func_desc(void)
{
const char **impl_names = get_machine()->${kern.name} _impl_names;
const int *impl_deps = get_machine()->${kern.name} _impl_deps;
const bool *alignment = get_machine()->${kern.name} _impl_alignment;
const size_t n_impls = get_machine()->${kern.name} _n_impls;
volk_gnsssdr_func_desc_t desc = {
impl_names,
impl_deps,
alignment,
n_impls
};
n_impls};
return desc;
}
%endfor
% endfor


@ -42,7 +42,7 @@ typedef struct volk_gnsssdr_func_desc
VOLK_API void volk_gnsssdr_list_machines(void);
//! Returns the name of the machine this instance will use
VOLK_API const char* volk_gnsssdr_get_machine(void);
VOLK_API const char *volk_gnsssdr_get_machine(void);
//! Get the machine alignment in bytes
VOLK_API size_t volk_gnsssdr_get_alignment(void);
@ -74,19 +74,19 @@ VOLK_API bool volk_gnsssdr_is_aligned(const void *ptr);
extern VOLK_API ${kern.pname} ${kern.name};
//! A function pointer to the fastest aligned implementation
extern VOLK_API ${kern.pname} ${kern.name}_a;
extern VOLK_API ${kern.pname} ${kern.name} _a;
//! A function pointer to the fastest unaligned implementation
extern VOLK_API ${kern.pname} ${kern.name}_u;
extern VOLK_API ${kern.pname} ${kern.name} _u;
//! Call into a specific implementation given by name
extern VOLK_API void ${kern.name}_manual(${kern.arglist_full}, const char* impl_name);
extern VOLK_API void ${kern.name} _manual(${kern.arglist_full}, const char *impl_name);
//! Get description parameters for this kernel
extern VOLK_API volk_gnsssdr_func_desc_t ${kern.name}_get_func_desc(void);
%endfor
extern VOLK_API volk_gnsssdr_func_desc_t ${kern.name} _get_func_desc(void);
% endfor
__VOLK_DECL_END
__VOLK_DECL_END
#endif /*INCLUDED_VOLK_GNSSSDR_RUNTIME*/


@ -21,7 +21,8 @@
%for i, arch in enumerate(archs):
//#ifndef LV_${arch.name.upper()}
#define LV_${arch.name.upper()} ${i}
#define LV_$ \
{arch.name.upper()} $ { i }
//#endif
%endfor


@ -24,50 +24,54 @@
struct VOLK_CPU volk_gnsssdr_cpu;
#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
#define VOLK_CPU_x86
#define VOLK_CPU_x86
#endif
#if defined(VOLK_CPU_x86)
//implement get cpuid for gcc compilers using a system or local copy of cpuid.h
#if defined(__GNUC__)
#include <cpuid.h>
#define cpuid_x86(op, r) __get_cpuid(op, (unsigned int *)r+0, (unsigned int *)r+1, (unsigned int *)r+2, (unsigned int *)r+3)
#define cpuid_x86_count(op, count, regs) __cpuid_count(op, count, *((unsigned int*)regs), *((unsigned int*)regs+1), *((unsigned int*)regs+2), *((unsigned int*)regs+3))
#include <cpuid.h>
#define cpuid_x86(op, r) __get_cpuid(op, (unsigned int *)r + 0, (unsigned int *)r + 1, (unsigned int *)r + 2, (unsigned int *)r + 3)
#define cpuid_x86_count(op, count, regs) __cpuid_count(op, count, *((unsigned int *)regs), *((unsigned int *)regs + 1), *((unsigned int *)regs + 2), *((unsigned int *)regs + 3))
/* Return Intel AVX extended CPU capabilities register.
/* Return Intel AVX extended CPU capabilities register.
* This function will bomb on non-AVX-capable machines, so
* check for AVX capability before executing.
*/
#if ((__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 2) || (__clang_major__ >= 3)) && defined(HAVE_XGETBV)
static inline unsigned long long _xgetbv(unsigned int index){
unsigned int eax, edx;
__VOLK_ASM __VOLK_VOLATILE ("xgetbv" : "=a"(eax), "=d"(edx) : "c"(index));
return ((unsigned long long)edx << 32) | eax;
}
#define __xgetbv() _xgetbv(0)
#else
#define __xgetbv() 0
#endif
#if ((__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 2) || (__clang_major__ >= 3)) && defined(HAVE_XGETBV)
static inline unsigned long long _xgetbv(unsigned int index)
{
unsigned int eax, edx;
__VOLK_ASM __VOLK_VOLATILE("xgetbv"
: "=a"(eax), "=d"(edx)
: "c"(index));
return ((unsigned long long)edx << 32) | eax;
}
#define __xgetbv() _xgetbv(0)
#else
#define __xgetbv() 0
#endif
//implement get cpuid for MSVC compilers using __cpuid intrinsic
#elif defined(_MSC_VER) && defined(HAVE_INTRIN_H)
#include <intrin.h>
#define cpuid_x86(op, r) __cpuid(((int*)r), op)
#include <intrin.h>
#define cpuid_x86(op, r) __cpuid(((int *)r), op)
#if defined(_XCR_XFEATURE_ENABLED_MASK)
#define __xgetbv() _xgetbv(_XCR_XFEATURE_ENABLED_MASK)
#else
#define __xgetbv() 0
#endif
#if defined(_XCR_XFEATURE_ENABLED_MASK)
#define __xgetbv() _xgetbv(_XCR_XFEATURE_ENABLED_MASK)
#else
#define __xgetbv() 0
#endif
#else
#error "A get cpuid for volk_gnsssdr is not available on this compiler..."
#endif //defined(__GNUC__)
#error "A get cpuid for volk_gnsssdr is not available on this compiler..."
#endif //defined(__GNUC__)
#endif //defined(VOLK_CPU_x86)
#endif //defined(VOLK_CPU_x86)
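
For context, a hedged sketch of how the __xgetbv() result is interpreted by get_avx_enabled()/get_avx2_enabled() further down. XCR0 bit 1 is XMM state and bit 2 is YMM state; the strict form of the check (requiring both bits, alongside the CPUID feature bit) is an assumption here, while the code below tests the mask with & only:

static bool os_enabled_avx_state(unsigned long long xcr0)
{
    // bit 1 = XMM state, bit 2 = YMM state; AVX needs the OS to enable both
    return (xcr0 & 0x6) == 0x6;
}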
static inline unsigned int cpuid_count_x86_bit(unsigned int level, unsigned int count, unsigned int reg, unsigned int bit) {
static inline unsigned int cpuid_count_x86_bit(unsigned int level, unsigned int count, unsigned int reg, unsigned int bit)
{
#if defined(VOLK_CPU_x86)
unsigned int regs[4] = {0};
cpuid_x86_count(level, count, regs);
@ -77,10 +81,11 @@ static inline unsigned int cpuid_count_x86_bit(unsigned int level, unsigned int
#endif
}
static inline unsigned int cpuid_x86_bit(unsigned int reg, unsigned int op, unsigned int bit) {
static inline unsigned int cpuid_x86_bit(unsigned int reg, unsigned int op, unsigned int bit)
{
#if defined(VOLK_CPU_x86)
unsigned int regs[4];
memset(regs, 0, sizeof(unsigned int)*4);
memset(regs, 0, sizeof(unsigned int) * 4);
cpuid_x86(op, regs);
return regs[reg] >> bit & 0x01;
#else
@ -88,10 +93,11 @@ static inline unsigned int cpuid_x86_bit(unsigned int reg, unsigned int op, unsi
#endif
}
static inline unsigned int check_extended_cpuid(unsigned int val) {
static inline unsigned int check_extended_cpuid(unsigned int val)
{
#if defined(VOLK_CPU_x86)
unsigned int regs[4];
memset(regs, 0, sizeof(unsigned int)*4);
memset(regs, 0, sizeof(unsigned int) * 4);
cpuid_x86(0x80000000, regs);
return regs[0] >= val;
#else
@ -99,7 +105,8 @@ static inline unsigned int check_extended_cpuid(unsigned int val) {
#endif
}
static inline unsigned int get_avx_enabled(void) {
static inline unsigned int get_avx_enabled(void)
{
#if defined(VOLK_CPU_x86)
return __xgetbv() & 0x6;
#else
@ -107,7 +114,8 @@ static inline unsigned int get_avx_enabled(void) {
#endif
}
static inline unsigned int get_avx2_enabled(void) {
static inline unsigned int get_avx2_enabled(void)
{
#if defined(VOLK_CPU_x86)
return __xgetbv() & 0x6;
#else
@ -117,28 +125,30 @@ static inline unsigned int get_avx2_enabled(void) {
//neon detection is linux specific
#if defined(__arm__) && defined(__linux__)
#include <asm/hwcap.h>
#include <linux/auxvec.h>
#include <stdio.h>
#define VOLK_CPU_ARM
#include <asm/hwcap.h>
#include <linux/auxvec.h>
#include <stdio.h>
#define VOLK_CPU_ARM
#endif
static int has_neon(void){
static int has_neon(void)
{
#if defined(VOLK_CPU_ARM)
FILE *auxvec_f;
unsigned long auxvec[2];
unsigned int found_neon = 0;
auxvec_f = fopen("/proc/self/auxv", "rb");
if(!auxvec_f) return 0;
if (!auxvec_f) return 0;
size_t r = 1;
//so auxv is basically 32b of ID and 32b of value
//so it goes like this
while(!found_neon && r) {
r = fread(auxvec, sizeof(unsigned long), 2, auxvec_f);
if((auxvec[0] == AT_HWCAP) && (auxvec[1] & HWCAP_NEON))
found_neon = 1;
}
while (!found_neon && r)
{
r = fread(auxvec, sizeof(unsigned long), 2, auxvec_f);
if ((auxvec[0] == AT_HWCAP) && (auxvec[1] & HWCAP_NEON))
found_neon = 1;
}
fclose(auxvec_f);
return found_neon;
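
The auxv walk above relies on /proc/self/auxv being a flat array of { id, value } unsigned-long pairs, terminated by an AT_NULL entry. A self-contained restatement of the same scan (function name illustrative):

#include <cstdio>

static int scan_auxv_for_bit(const char *path, unsigned long want_id,
    unsigned long want_bit)
{
    std::FILE *f = std::fopen(path, "rb");
    if (!f) return 0;
    unsigned long pair[2];
    int found = 0;
    // stop at EOF (short read) or on the first matching id with the bit set
    while (!found && std::fread(pair, sizeof(unsigned long), 2, f) == 2)
        {
            if (pair[0] == want_id && (pair[1] & want_bit)) found = 1;
        }
    std::fclose(f);
    return found;
}

// usage mirroring has_neon(): scan_auxv_for_bit("/proc/self/auxv", AT_HWCAP, HWCAP_NEON)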
@ -148,50 +158,59 @@ static int has_neon(void){
}
%for arch in archs:
static int i_can_has_${arch.name} (void) {
static int i_can_has_${arch.name} (void)
{
%for check, params in arch.checks:
if (${check}(<% joined_params = ', '.join(params)%>${joined_params}) == 0) return 0;
%endfor
return 1;
% endfor return 1;
}
%endfor
% endfor
#if defined(HAVE_FENV_H)
#if defined(FE_TONEAREST)
#include <fenv.h>
static inline void set_float_rounding(void){
fesetround(FE_TONEAREST);
}
#else
static inline void set_float_rounding(void){
//do nothing
}
#endif
#elif defined(_MSC_VER)
#include <float.h>
static inline void set_float_rounding(void){
unsigned int cwrd;
_controlfp_s(&cwrd, 0, 0);
_controlfp_s(&cwrd, _RC_NEAR, _MCW_RC);
}
#if defined(FE_TONEAREST)
#include <fenv.h>
static inline void
set_float_rounding(void)
{
fesetround(FE_TONEAREST);
}
#else
static inline void set_float_rounding(void){
//do nothing
}
static inline void
set_float_rounding(void)
{
//do nothing
}
#endif
#elif defined(_MSC_VER)
#include <float.h>
static inline void
set_float_rounding(void)
{
unsigned int cwrd;
_controlfp_s(&cwrd, 0, 0);
_controlfp_s(&cwrd, _RC_NEAR, _MCW_RC);
}
#else
static inline void
set_float_rounding(void)
{
//do nothing
}
#endif
void volk_gnsssdr_cpu_init() {
void volk_gnsssdr_cpu_init()
{
%for arch in archs:
volk_gnsssdr_cpu.has_${arch.name} = &i_can_has_${arch.name};
%endfor
set_float_rounding();
% endfor
set_float_rounding();
}
unsigned int volk_gnsssdr_get_lvarch() {
unsigned int volk_gnsssdr_get_lvarch()
{
unsigned int retval = 0;
volk_gnsssdr_cpu_init();
%for arch in archs:
retval += volk_gnsssdr_cpu.has_${arch.name}() << LV_${arch.name.upper()};
%endfor
return retval;
% endfor return retval;
}


@ -23,16 +23,17 @@
__VOLK_DECL_BEGIN
struct VOLK_CPU {
struct VOLK_CPU
{
%for arch in archs:
int (*has_${arch.name}) ();
%endfor
% endfor
};
extern struct VOLK_CPU volk_gnsssdr_cpu;
void volk_gnsssdr_cpu_init ();
unsigned int volk_gnsssdr_get_lvarch ();
void volk_gnsssdr_cpu_init();
unsigned int volk_gnsssdr_get_lvarch();
__VOLK_DECL_END


@ -20,7 +20,11 @@
<% arch_names = this_machine.arch_names %>
%for arch in this_machine.archs:
#define LV_HAVE_${arch.name.upper()} 1
#define LV_HAVE_$ \
{ \
arch.name.upper() \
} \
1
%endfor
#include <volk_gnsssdr/volk_gnsssdr_common.h>
@ -35,7 +39,9 @@
#include <volk_gnsssdr/${kern.name}.h>
%endfor
struct volk_gnsssdr_machine volk_gnsssdr_machine_${this_machine.name} = {
struct volk_gnsssdr_machine volk_gnsssdr_machine_$
{
this_machine.name} = {
<% make_arch_have_list = (' | '.join(['(1 << LV_%s)'%a.name.upper() for a in this_machine.archs])) %> ${make_arch_have_list},
<% this_machine_name = "\""+this_machine.name+"\"" %> ${this_machine_name},
${this_machine.alignment},


@ -22,10 +22,10 @@
struct volk_gnsssdr_machine *volk_gnsssdr_machines[] = {
%for machine in machines:
#ifdef LV_MACHINE_${machine.name.upper()}
#ifdef LV_MACHINE_${machine.name.upper() }
&volk_gnsssdr_machine_${machine.name},
#endif
%endfor
};
unsigned int n_volk_gnsssdr_machines = sizeof(volk_gnsssdr_machines)/sizeof(*volk_gnsssdr_machines);
unsigned int n_volk_gnsssdr_machines = sizeof(volk_gnsssdr_machines) / sizeof(*volk_gnsssdr_machines);
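
The quotient on the last line is the standard static-array-length idiom; a worked example with an illustrative count:

// with three machines compiled in:
//   sizeof(volk_gnsssdr_machines) == 3 * sizeof(struct volk_gnsssdr_machine *)
//   => n_volk_gnsssdr_machines == 3
// so adding a machine to the list updates the count automatically.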


@ -27,26 +27,30 @@
__VOLK_DECL_BEGIN
struct volk_gnsssdr_machine {
const unsigned int caps; //capabilities (i.e., archs compiled into this machine, in the volk_gnsssdr_get_lvarch format)
struct volk_gnsssdr_machine
{
const unsigned int caps; //capabilities (i.e., archs compiled into this machine, in the volk_gnsssdr_get_lvarch format)
const char *name;
const size_t alignment; //the maximum byte alignment required for functions in this library
const size_t alignment; //the maximum byte alignment required for functions in this library
%for kern in kernels:
const char *${kern.name}_name;
const char *${kern.name}_impl_names[<%len_archs=len(archs)%>${len_archs}];
const int ${kern.name}_impl_deps[${len_archs}];
const bool ${kern.name}_impl_alignment[${len_archs}];
const ${kern.pname} ${kern.name}_impls[${len_archs}];
const size_t ${kern.name}_n_impls;
%endfor
const char *${kern.name} _impl_names[<% len_archs = len(archs) %> ${len_archs}];
const int ${kern.name} _impl_deps[${len_archs}];
const bool ${kern.name} _impl_alignment[${len_archs}];
const ${kern.pname} ${kern.name} _impls[${len_archs}];
const size_t ${kern.name} _n_impls;
% endfor
};
%for machine in machines:
#ifdef LV_MACHINE_${machine.name.upper()}
extern struct volk_gnsssdr_machine volk_gnsssdr_machine_${machine.name};
#ifdef LV_MACHINE_${machine.name.upper() }
extern struct volk_gnsssdr_machine volk_gnsssdr_machine_$
{
machine.name
};
#endif
%endfor
% endfor
__VOLK_DECL_END
__VOLK_DECL_END
#endif //INCLUDED_LIBVOLK_GNSSSDR_MACHINES_H
#endif //INCLUDED_LIBVOLK_GNSSSDR_MACHINES_H


@ -24,6 +24,6 @@
%for kern in kernels:
typedef void (*${kern.pname})(${kern.arglist_types});
%endfor
% endfor
#endif /*INCLUDED_VOLK_GNSSSDR_TYPEDEFS*/