1
0
mirror of https://github.com/gnss-sdr/gnss-sdr synced 2024-12-15 20:50:33 +00:00

Merge branch 'next' of https://github.com/gnss-sdr/gnss-sdr into next

This commit is contained in:
Carles Fernandez 2018-03-03 12:32:35 +01:00
commit b2492c8ed2
77 changed files with 6648 additions and 6122 deletions

View File

@ -128,6 +128,9 @@ $ git pull --rebase upstream next
### How to submit a pull request ### How to submit a pull request
Before submitting your code, please be sure to apply clang-format
(see http://gnss-sdr.org/coding-style/#use-tools-for-automated-code-formatting).
When the contribution is ready, you can [submit a pull When the contribution is ready, you can [submit a pull
request](https://github.com/gnss-sdr/gnss-sdr/compare/). Head to your request](https://github.com/gnss-sdr/gnss-sdr/compare/). Head to your
GitHub repository, switch to your `my_feature` branch, and click the GitHub repository, switch to your `my_feature` branch, and click the

View File

@ -5,7 +5,8 @@ Before submitting your pull request, please make sure the following is done:
2. If you are a first-time contributor, after your pull request you will be asked to sign an Individual Contributor License Agreement ([CLA](https://en.wikipedia.org/wiki/Contributor_License_Agreement)) before your code gets accepted into `master`. This license is for your protection as a Contributor as well as for the protection of [CTTC](http://www.cttc.es/); it does not change your rights to use your own contributions for any other purpose. Except for the license granted therein to CTTC and recipients of software distributed by CTTC, you reserve all right, title, and interest in and to your contributions. The information you provide in that CLA will be maintained in accordance with [CTTC's privacy policy](http://www.cttc.es/privacy/). 2. If you are a first-time contributor, after your pull request you will be asked to sign an Individual Contributor License Agreement ([CLA](https://en.wikipedia.org/wiki/Contributor_License_Agreement)) before your code gets accepted into `master`. This license is for your protection as a Contributor as well as for the protection of [CTTC](http://www.cttc.es/); it does not change your rights to use your own contributions for any other purpose. Except for the license granted therein to CTTC and recipients of software distributed by CTTC, you reserve all right, title, and interest in and to your contributions. The information you provide in that CLA will be maintained in accordance with [CTTC's privacy policy](http://www.cttc.es/privacy/).
3. You have read the [Contributing Guidelines](https://github.com/gnss-sdr/gnss-sdr/blob/master/CONTRIBUTING.md). 3. You have read the [Contributing Guidelines](https://github.com/gnss-sdr/gnss-sdr/blob/master/CONTRIBUTING.md).
4. You have read the [coding style guide](http://gnss-sdr.org/coding-style/). 4. You have read the [coding style guide](http://gnss-sdr.org/coding-style/).
5. You have forked the [gnss-sdr upstream repository](https://github.com/gnss-sdr/gnss-sdr) and have created your branch from `next` (or any other currently living branch in the upstream repository). 5. Specifically, you have read [about clang-format](http://gnss-sdr.org/coding-style/#use-tools-for-automated-code-formatting) and you have applied it.
6. Please include a description of your changes here. 6. You have forked the [gnss-sdr upstream repository](https://github.com/gnss-sdr/gnss-sdr) and have created your branch from `next` (or any other currently living branch in the upstream repository).
7. Please include a description of your changes here.
**Please feel free to delete this line and the above text once you have read it and in case you want to go on with your pull request.** **Please feel free to delete this line and the above text once you have read it and in case you want to go on with your pull request.**

View File

@ -54,8 +54,10 @@ int main(int argc, char **argv)
our_options.add(option_t("cc", "", "print the VOLK_GNSSDR C compiler version", volk_gnsssdr_c_compiler())); our_options.add(option_t("cc", "", "print the VOLK_GNSSDR C compiler version", volk_gnsssdr_c_compiler()));
our_options.add(option_t("cflags", "", "print the VOLK_GNSSSDR CFLAGS", volk_gnsssdr_compiler_flags())); our_options.add(option_t("cflags", "", "print the VOLK_GNSSSDR CFLAGS", volk_gnsssdr_compiler_flags()));
our_options.add(option_t("all-machines", "", "print VOLK_GNSSSDR machines built", volk_gnsssdr_available_machines())); our_options.add(option_t("all-machines", "", "print VOLK_GNSSSDR machines built", volk_gnsssdr_available_machines()));
our_options.add(option_t("avail-machines", "", "print VOLK_GNSSSDR machines on the current " our_options.add(option_t("avail-machines", "",
"platform", volk_gnsssdr_list_machines)); "print VOLK_GNSSSDR machines on the current "
"platform",
volk_gnsssdr_list_machines));
our_options.add(option_t("machine", "", "print the current VOLK_GNSSSDR machine that will be used", our_options.add(option_t("machine", "", "print the current VOLK_GNSSSDR machine that will be used",
volk_gnsssdr_get_machine())); volk_gnsssdr_get_machine()));
our_options.add(option_t("alignment", "", "print the memory alignment", print_alignment)); our_options.add(option_t("alignment", "", "print the memory alignment", print_alignment));

View File

@ -25,7 +25,6 @@
#include <utility> // for pair #include <utility> // for pair
/* /*
* Option type * Option type
*/ */
@ -70,55 +69,74 @@ option_t::option_t(std::string longform, std::string shortform, std::string msg,
* Option List * Option List
*/ */
option_list::option_list(std::string program_name) : option_list::option_list(std::string program_name) : program_name(program_name)
program_name(program_name) { {
{ internal_list = std::vector<option_t>(); } {
internal_list = std::vector<option_t>();
}
} }
void option_list::add(const option_t &opt) { internal_list.push_back(opt); } void option_list::add(const option_t &opt) { internal_list.push_back(opt); }
void option_list::parse(int argc, char **argv) { void option_list::parse(int argc, char **argv)
for (int arg_number = 0; arg_number < argc; ++arg_number) { {
for (int arg_number = 0; arg_number < argc; ++arg_number)
{
for (std::vector<option_t>::iterator this_option = internal_list.begin(); for (std::vector<option_t>::iterator this_option = internal_list.begin();
this_option != internal_list.end(); this_option != internal_list.end();
this_option++) { this_option++)
{
if (this_option->longform == std::string(argv[arg_number]) || if (this_option->longform == std::string(argv[arg_number]) ||
this_option->shortform == std::string(argv[arg_number])) { this_option->shortform == std::string(argv[arg_number]))
switch (this_option->option_type) { {
switch (this_option->option_type)
{
case VOID_CALLBACK: case VOID_CALLBACK:
this_option->callback(); this_option->callback();
break; break;
case INT_CALLBACK: case INT_CALLBACK:
try { try
{
int int_val = std::stoi(argv[++arg_number]); int int_val = std::stoi(argv[++arg_number]);
((void (*)(int))this_option->callback)(int_val); ((void (*)(int))this_option->callback)(int_val);
} catch (std::exception &exc) { }
catch (std::exception &exc)
{
std::cout << "An int option can only receive a number" << std::endl; std::cout << "An int option can only receive a number" << std::endl;
throw std::exception(); throw std::exception();
}; };
break; break;
case FLOAT_CALLBACK: case FLOAT_CALLBACK:
try { try
{
int int_val = std::stof(argv[++arg_number]); int int_val = std::stof(argv[++arg_number]);
((void (*)(float))this_option->callback)(int_val); ((void (*)(float))this_option->callback)(int_val);
} catch (std::exception &exc) { }
catch (std::exception &exc)
{
std::cout << "A float option can only receive a number" << std::endl; std::cout << "A float option can only receive a number" << std::endl;
throw std::exception(); throw std::exception();
}; };
break; break;
case BOOL_CALLBACK: case BOOL_CALLBACK:
try { try
{
bool int_val = (bool)std::stoi(argv[++arg_number]); bool int_val = (bool)std::stoi(argv[++arg_number]);
((void (*)(bool))this_option->callback)(int_val); ((void (*)(bool))this_option->callback)(int_val);
} catch (std::exception &exc) { }
catch (std::exception &exc)
{
std::cout << "A bool option can only receive 0 or 1" << std::endl; std::cout << "A bool option can only receive 0 or 1" << std::endl;
throw std::exception(); throw std::exception();
}; };
break; break;
case STRING_CALLBACK: case STRING_CALLBACK:
try { try
{
((void (*)(std::string))this_option->callback)(argv[++arg_number]); ((void (*)(std::string))this_option->callback)(argv[++arg_number]);
} catch (std::exception &exc) { }
catch (std::exception &exc)
{
throw std::exception(); throw std::exception();
}; };
break; break;
@ -132,26 +150,33 @@ void option_list::parse(int argc, char **argv) {
} }
} }
if (std::string("--help") == std::string(argv[arg_number]) || if (std::string("--help") == std::string(argv[arg_number]) ||
std::string("-h") == std::string(argv[arg_number])) { std::string("-h") == std::string(argv[arg_number]))
{
help(); help();
} }
} }
} }
void option_list::help() { void option_list::help()
{
std::cout << program_name << std::endl; std::cout << program_name << std::endl;
std::cout << " -h [ --help ] \t\tDisplay this help message" << std::endl; std::cout << " -h [ --help ] \t\tDisplay this help message" << std::endl;
for (std::vector<option_t>::iterator this_option = internal_list.begin(); for (std::vector<option_t>::iterator this_option = internal_list.begin();
this_option != internal_list.end(); this_option != internal_list.end();
this_option++) { this_option++)
{
std::string help_line(" "); std::string help_line(" ");
if (this_option->shortform == "-") { if (this_option->shortform == "-")
{
help_line += this_option->longform + " "; help_line += this_option->longform + " ";
} else { }
else
{
help_line += this_option->shortform + " [ " + this_option->longform + " ]"; help_line += this_option->shortform + " [ " + this_option->longform + " ]";
} }
switch (help_line.size() / 8) { switch (help_line.size() / 8)
{
case 0: case 0:
help_line += "\t\t\t\t"; help_line += "\t\t\t\t";
break; break;

View File

@ -36,7 +36,8 @@ typedef enum
STRING, STRING,
} VOLK_OPTYPE; } VOLK_OPTYPE;
class option_t { class option_t
{
public: public:
option_t(std::string longform, std::string shortform, std::string msg, void (*callback)()); option_t(std::string longform, std::string shortform, std::string msg, void (*callback)());
option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(int)); option_t(std::string longform, std::string shortform, std::string msg, void (*callback)(int));
@ -51,7 +52,6 @@ public:
VOLK_OPTYPE option_type; VOLK_OPTYPE option_type;
std::string printval; std::string printval;
void (*callback)(); void (*callback)();
}; };
class option_list class option_list
@ -64,6 +64,7 @@ public:
void parse(int argc, char **argv); void parse(int argc, char **argv);
void help(); void help();
private: private:
std::string program_name; std::string program_name;
std::vector<option_t> internal_list; std::vector<option_t> internal_list;

View File

@ -34,7 +34,6 @@
#include <vector> // for vector, vector<>::const_.. #include <vector> // for vector, vector<>::const_..
namespace fs = boost::filesystem; namespace fs = boost::filesystem;
volk_gnsssdr_test_params_t test_params(1e-6f, 327.f, 8111, 1987, false, ""); volk_gnsssdr_test_params_t test_params(1e-6f, 327.f, 8111, 1987, false, "");
@ -75,9 +74,11 @@ int main(int argc, char *argv[])
return 1; return 1;
} }
for (int arg_number = 0; arg_number < argc; ++arg_number) { for (int arg_number = 0; arg_number < argc; ++arg_number)
{
if (std::string("--help") == std::string(argv[arg_number]) || if (std::string("--help") == std::string(argv[arg_number]) ||
std::string("-h") == std::string(argv[arg_number])) { std::string("-h") == std::string(argv[arg_number]))
{
return 0; return 0;
} }
} }
@ -85,19 +86,24 @@ int main(int argc, char *argv[])
std::ofstream json_file; std::ofstream json_file;
std::string config_file; std::string config_file;
if ( json_filename != "" ) { if (json_filename != "")
{
json_file.open(json_filename.c_str()); json_file.open(json_filename.c_str());
} }
if ( volk_config_path != "" ) { if (volk_config_path != "")
{
config_file = volk_config_path + "/volk_config"; config_file = volk_config_path + "/volk_config";
} }
// Run tests // Run tests
std::vector<volk_gnsssdr_test_results_t> results; std::vector<volk_gnsssdr_test_results_t> results;
if(update_mode) { if (update_mode)
if( config_file != "" ) read_results(&results, config_file); {
else read_results(&results); if (config_file != "")
read_results(&results, config_file);
else
read_results(&results);
} }
// Initialize the list of tests // Initialize the list of tests
@ -105,35 +111,43 @@ int main(int argc, char *argv[])
// Iterate through list of tests running each one // Iterate through list of tests running each one
std::string substr_to_match(test_params.kernel_regex()); std::string substr_to_match(test_params.kernel_regex());
for(unsigned int ii = 0; ii < test_cases.size(); ++ii) { for (unsigned int ii = 0; ii < test_cases.size(); ++ii)
{
bool regex_match = true; bool regex_match = true;
volk_gnsssdr_test_case_t test_case = test_cases[ii]; volk_gnsssdr_test_case_t test_case = test_cases[ii];
// if the kernel name matches regex then do the test // if the kernel name matches regex then do the test
std::string test_case_name = test_case.name(); std::string test_case_name = test_case.name();
if(test_case_name.find(substr_to_match) == std::string::npos) { if (test_case_name.find(substr_to_match) == std::string::npos)
{
regex_match = false; regex_match = false;
} }
// if we are in update mode check if we've already got results // if we are in update mode check if we've already got results
// if we have any, then no need to test that kernel // if we have any, then no need to test that kernel
bool update = true; bool update = true;
if(update_mode) { if (update_mode)
for(unsigned int jj=0; jj < results.size(); ++jj) { {
for (unsigned int jj = 0; jj < results.size(); ++jj)
{
if (results[jj].name == test_case.name() || if (results[jj].name == test_case.name() ||
results[jj].name == test_case.puppet_master_name()) { results[jj].name == test_case.puppet_master_name())
{
update = false; update = false;
break; break;
} }
} }
} }
if( regex_match && update ) { if (regex_match && update)
try { {
try
{
run_volk_gnsssdr_tests(test_case.desc(), test_case.kernel_ptr(), test_case.name(), run_volk_gnsssdr_tests(test_case.desc(), test_case.kernel_ptr(), test_case.name(),
test_case.test_parameters(), &results, test_case.puppet_master_name()); test_case.test_parameters(), &results, test_case.puppet_master_name());
} }
catch (std::string &error) { catch (std::string &error)
{
std::cerr << "Caught Exception in 'run_volk_gnssdr_tests': " << error << std::endl; std::cerr << "Caught Exception in 'run_volk_gnssdr_tests': " << error << std::endl;
} }
} }
@ -141,16 +155,21 @@ int main(int argc, char *argv[])
// Output results according to provided options // Output results according to provided options
if(json_filename != "") { if (json_filename != "")
{
write_json(json_file, results); write_json(json_file, results);
json_file.close(); json_file.close();
} }
if(!dry_run) { if (!dry_run)
if(config_file != "") write_results(&results, false, config_file); {
else write_results(&results, false); if (config_file != "")
write_results(&results, false, config_file);
else
write_results(&results, false);
} }
else { else
{
std::cout << "Warning: this was a dry-run. Config not generated" << std::endl; std::cout << "Warning: this was a dry-run. Config not generated" << std::endl;
} }
} }
@ -169,11 +188,13 @@ void read_results(std::vector<volk_gnsssdr_test_results_t> *results, std::string
struct stat buffer; struct stat buffer;
bool config_status = (stat(path.c_str(), &buffer) == 0); bool config_status = (stat(path.c_str(), &buffer) == 0);
if( config_status ) { if (config_status)
{
// a config exists and we are reading results from it // a config exists and we are reading results from it
std::ifstream config(path.c_str()); std::ifstream config(path.c_str());
char config_line[256]; char config_line[256];
while(config.getline(config_line, 255)) { while (config.getline(config_line, 255))
{
// tokenize the input line by kernel_name unaligned aligned // tokenize the input line by kernel_name unaligned aligned
// then push back in the results vector with fields filled in // then push back in the results vector with fields filled in
@ -184,13 +205,15 @@ void read_results(std::vector<volk_gnsssdr_test_results_t> *results, std::string
found = config_str.find(' '); found = config_str.find(' ');
// Split line by spaces // Split line by spaces
while(found && found < str_size) { while (found && found < str_size)
{
found = config_str.find(' '); found = config_str.find(' ');
// kernel names MUST be less than 128 chars, which is // kernel names MUST be less than 128 chars, which is
// a length restricted by volk/volk_prefs.c // a length restricted by volk/volk_prefs.c
// on the last token in the parsed string we won't find a space // on the last token in the parsed string we won't find a space
// so make sure we copy at most 128 chars. // so make sure we copy at most 128 chars.
if(found > 127) { if (found > 127)
{
found = 127; found = 127;
} }
str_size = config_str.size(); str_size = config_str.size();
@ -201,7 +224,8 @@ void read_results(std::vector<volk_gnsssdr_test_results_t> *results, std::string
config_str.erase(0, found + 1); config_str.erase(0, found + 1);
} }
if(single_kernel_result.size() == 3) { if (single_kernel_result.size() == 3)
{
volk_gnsssdr_test_results_t kernel_result; volk_gnsssdr_test_results_t kernel_result;
kernel_result.name = std::string(single_kernel_result[0]); kernel_result.name = std::string(single_kernel_result[0]);
kernel_result.config_name = std::string(single_kernel_result[0]); kernel_result.config_name = std::string(single_kernel_result[0]);
@ -211,7 +235,6 @@ void read_results(std::vector<volk_gnsssdr_test_results_t> *results, std::string
} }
} }
} }
} }
void write_results(const std::vector<volk_gnsssdr_test_results_t> *results, bool update_result) void write_results(const std::vector<volk_gnsssdr_test_results_t> *results, bool update_result)
@ -234,17 +257,21 @@ void write_results(const std::vector<volk_gnsssdr_test_results_t> *results, bool
} }
std::ofstream config; std::ofstream config;
if(update_result) { if (update_result)
{
std::cout << "Updating " << path << " ..." << std::endl; std::cout << "Updating " << path << " ..." << std::endl;
config.open(path.c_str(), std::ofstream::app); config.open(path.c_str(), std::ofstream::app);
if (!config.is_open()) { //either we don't have write access or we don't have the dir yet if (!config.is_open())
{ //either we don't have write access or we don't have the dir yet
std::cout << "Error opening file " << path << std::endl; std::cout << "Error opening file " << path << std::endl;
} }
} }
else { else
{
std::cout << "Writing " << path << " ..." << std::endl; std::cout << "Writing " << path << " ..." << std::endl;
config.open(path.c_str()); config.open(path.c_str());
if (!config.is_open()) { //either we don't have write access or we don't have the dir yet if (!config.is_open())
{ //either we don't have write access or we don't have the dir yet
std::cout << "Error opening file " << path << std::endl; std::cout << "Error opening file " << path << std::endl;
} }
@ -255,7 +282,8 @@ void write_results(const std::vector<volk_gnsssdr_test_results_t> *results, bool
} }
std::vector<volk_gnsssdr_test_results_t>::const_iterator profile_results; std::vector<volk_gnsssdr_test_results_t>::const_iterator profile_results;
for(profile_results = results->begin(); profile_results != results->end(); ++profile_results) { for (profile_results = results->begin(); profile_results != results->end(); ++profile_results)
{
config << profile_results->config_name << " " config << profile_results->config_name << " "
<< profile_results->best_arch_a << " " << profile_results->best_arch_a << " "
<< profile_results->best_arch_u << std::endl; << profile_results->best_arch_u << std::endl;
@ -270,7 +298,8 @@ void write_json(std::ofstream &json_file, std::vector<volk_gnsssdr_test_results_
size_t len = results.size(); size_t len = results.size();
size_t i = 0; size_t i = 0;
std::vector<volk_gnsssdr_test_results_t>::iterator result; std::vector<volk_gnsssdr_test_results_t>::iterator result;
for(result = results.begin(); result != results.end(); ++result) { for (result = results.begin(); result != results.end(); ++result)
{
json_file << " {" << std::endl; json_file << " {" << std::endl;
json_file << " \"name\": \"" << result->name << "\"," << std::endl; json_file << " \"name\": \"" << result->name << "\"," << std::endl;
json_file << " \"vlen\": " << (int)(result->vlen) << "," << std::endl; json_file << " \"vlen\": " << (int)(result->vlen) << "," << std::endl;
@ -284,14 +313,16 @@ void write_json(std::ofstream &json_file, std::vector<volk_gnsssdr_test_results_
size_t ri = 0; size_t ri = 0;
std::map<std::string, volk_gnsssdr_test_time_t>::iterator kernel_time_pair; std::map<std::string, volk_gnsssdr_test_time_t>::iterator kernel_time_pair;
for(kernel_time_pair = result->results.begin(); kernel_time_pair != result->results.end(); ++kernel_time_pair) { for (kernel_time_pair = result->results.begin(); kernel_time_pair != result->results.end(); ++kernel_time_pair)
{
volk_gnsssdr_test_time_t time = kernel_time_pair->second; volk_gnsssdr_test_time_t time = kernel_time_pair->second;
json_file << " \"" << time.name << "\": {" << std::endl; json_file << " \"" << time.name << "\": {" << std::endl;
json_file << " \"name\": \"" << time.name << "\"," << std::endl; json_file << " \"name\": \"" << time.name << "\"," << std::endl;
json_file << " \"time\": " << time.time << "," << std::endl; json_file << " \"time\": " << time.time << "," << std::endl;
json_file << " \"units\": \"" << time.units << "\"" << std::endl; json_file << " \"units\": \"" << time.units << "\"" << std::endl;
json_file << " }"; json_file << " }";
if(ri+1 != results_len) { if (ri + 1 != results_len)
{
json_file << ","; json_file << ",";
} }
json_file << std::endl; json_file << std::endl;
@ -299,7 +330,8 @@ void write_json(std::ofstream &json_file, std::vector<volk_gnsssdr_test_results_
} }
json_file << " }" << std::endl; json_file << " }" << std::endl;
json_file << " }"; json_file << " }";
if(i+1 != len) { if (i + 1 != len)
{
json_file << ","; json_file << ",";
} }
json_file << std::endl; json_file << std::endl;
@ -308,5 +340,3 @@ void write_json(std::ofstream &json_file, std::vector<volk_gnsssdr_test_results_
json_file << " ]" << std::endl; json_file << " ]" << std::endl;
json_file << "}" << std::endl; json_file << "}" << std::endl;
} }

View File

@ -40,19 +40,22 @@ _mm256_complexmul_ps(__m256 x, __m256 y)
} }
static inline __m256 static inline __m256
_mm256_conjugate_ps(__m256 x){ _mm256_conjugate_ps(__m256 x)
{
const __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f); const __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
return _mm256_xor_ps(x, conjugator); // conjugate y return _mm256_xor_ps(x, conjugator); // conjugate y
} }
static inline __m256 static inline __m256
_mm256_complexconjugatemul_ps(__m256 x, __m256 y){ _mm256_complexconjugatemul_ps(__m256 x, __m256 y)
{
y = _mm256_conjugate_ps(y); y = _mm256_conjugate_ps(y);
return _mm256_complexmul_ps(x, y); return _mm256_complexmul_ps(x, y);
} }
static inline __m256 static inline __m256
_mm256_magnitudesquared_ps(__m256 cplxValue1, __m256 cplxValue2){ _mm256_magnitudesquared_ps(__m256 cplxValue1, __m256 cplxValue2)
{
__m256 complex1, complex2; __m256 complex1, complex2;
cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
@ -61,7 +64,8 @@ _mm256_magnitudesquared_ps(__m256 cplxValue1, __m256 cplxValue2){
return _mm256_hadd_ps(complex1, complex2); // Add the I2 and Q2 values return _mm256_hadd_ps(complex1, complex2); // Add the I2 and Q2 values
} }
static inline __m256 _mm256_complexnormalise_ps( __m256 z ){ static inline __m256 _mm256_complexnormalise_ps(__m256 z)
{
__m256 tmp1 = _mm256_mul_ps(z, z); __m256 tmp1 = _mm256_mul_ps(z, z);
__m256 tmp2 = _mm256_hadd_ps(tmp1, tmp1); __m256 tmp2 = _mm256_hadd_ps(tmp1, tmp1);
tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8); tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
@ -70,7 +74,8 @@ static inline __m256 _mm256_complexnormalise_ps( __m256 z ){
} }
static inline __m256 static inline __m256
_mm256_magnitude_ps(__m256 cplxValue1, __m256 cplxValue2){ _mm256_magnitude_ps(__m256 cplxValue1, __m256 cplxValue2)
{
return _mm256_sqrt_ps(_mm256_magnitudesquared_ps(cplxValue1, cplxValue2)); return _mm256_sqrt_ps(_mm256_magnitudesquared_ps(cplxValue1, cplxValue2));
} }

View File

@ -91,7 +91,9 @@
// FIXME: due to the usage of complex.h, require gcc for c-linkage // FIXME: due to the usage of complex.h, require gcc for c-linkage
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
#if defined(__cplusplus) && (__GNUC__) #if defined(__cplusplus) && (__GNUC__)
# define __VOLK_DECL_BEGIN extern "C" { #define __VOLK_DECL_BEGIN \
extern "C" \
{
#define __VOLK_DECL_END } #define __VOLK_DECL_END }
#else #else
#define __VOLK_DECL_BEGIN #define __VOLK_DECL_BEGIN
@ -121,7 +123,8 @@
#endif #endif
#endif #endif
union bit128{ union bit128
{
uint8_t i8[16]; uint8_t i8[16];
uint16_t i16[8]; uint16_t i16[8];
uint32_t i[4]; uint32_t i[4];
@ -138,7 +141,8 @@ union bit128{
#endif #endif
}; };
union bit256{ union bit256
{
uint8_t i8[32]; uint8_t i8[32];
uint16_t i16[16]; uint16_t i16[16];
uint32_t i[8]; uint32_t i[8];

View File

@ -55,19 +55,27 @@ typedef std::complex<int64_t> lv_64sc_t;
typedef std::complex<float> lv_32fc_t; typedef std::complex<float> lv_32fc_t;
typedef std::complex<double> lv_64fc_t; typedef std::complex<double> lv_64fc_t;
template <typename T> inline std::complex<T> lv_cmake(const T &r, const T &i){ template <typename T>
inline std::complex<T> lv_cmake(const T &r, const T &i)
{
return std::complex<T>(r, i); return std::complex<T>(r, i);
} }
template <typename T> inline typename T::value_type lv_creal(const T &x){ template <typename T>
inline typename T::value_type lv_creal(const T &x)
{
return x.real(); return x.real();
} }
template <typename T> inline typename T::value_type lv_cimag(const T &x){ template <typename T>
inline typename T::value_type lv_cimag(const T &x)
{
return x.imag(); return x.imag();
} }
template <typename T> inline T lv_conj(const T &x){ template <typename T>
inline T lv_conj(const T &x)
{
return std::conj(x); return std::conj(x);
} }

View File

@ -48,14 +48,16 @@ _mm_complexconjugatemul_ps(__m128 x, __m128 y)
} }
static inline __m128 static inline __m128
_mm_magnitudesquared_ps_sse3(__m128 cplxValue1, __m128 cplxValue2){ _mm_magnitudesquared_ps_sse3(__m128 cplxValue1, __m128 cplxValue2)
{
cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
return _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values return _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
} }
static inline __m128 static inline __m128
_mm_magnitude_ps_sse3(__m128 cplxValue1, __m128 cplxValue2){ _mm_magnitude_ps_sse3(__m128 cplxValue1, __m128 cplxValue2)
{
return _mm_sqrt_ps(_mm_magnitudesquared_ps_sse3(cplxValue1, cplxValue2)); return _mm_sqrt_ps(_mm_magnitudesquared_ps_sse3(cplxValue1, cplxValue2));
} }

View File

@ -27,7 +27,8 @@
#include <xmmintrin.h> #include <xmmintrin.h>
static inline __m128 static inline __m128
_mm_magnitudesquared_ps(__m128 cplxValue1, __m128 cplxValue2){ _mm_magnitudesquared_ps(__m128 cplxValue1, __m128 cplxValue2)
{
__m128 iValue, qValue; __m128 iValue, qValue;
// Arrange in i1i2i3i4 format // Arrange in i1i2i3i4 format
iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0)); iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
@ -39,7 +40,8 @@ _mm_magnitudesquared_ps(__m128 cplxValue1, __m128 cplxValue2){
} }
static inline __m128 static inline __m128
_mm_magnitude_ps(__m128 cplxValue1, __m128 cplxValue2){ _mm_magnitude_ps(__m128 cplxValue1, __m128 cplxValue2)
{
return _mm_sqrt_ps(_mm_magnitudesquared_ps(cplxValue1, cplxValue2)); return _mm_sqrt_ps(_mm_magnitudesquared_ps(cplxValue1, cplxValue2));
} }

View File

@ -279,4 +279,3 @@ static inline void volk_gnsssdr_16i_resamplerxnpuppet_16i_neon(int16_t* result,
#endif #endif
#endif // INCLUDED_volk_gnsssdr_16i_resamplerpuppet_16i_H #endif // INCLUDED_volk_gnsssdr_16i_resamplerpuppet_16i_H

View File

@ -107,7 +107,8 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_a_sse4_1(int16_t** resul
const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips); const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips); const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
__VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; __VOLK_ATTR_ALIGNED(16)
int local_code_chip_index[4];
int local_code_chip_index_; int local_code_chip_index_;
const __m128i zeros = _mm_setzero_si128(); const __m128i zeros = _mm_setzero_si128();
@ -173,7 +174,8 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_u_sse4_1(int16_t** resul
const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips); const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips); const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
__VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; __VOLK_ATTR_ALIGNED(16)
int local_code_chip_index[4];
int local_code_chip_index_; int local_code_chip_index_;
const __m128i zeros = _mm_setzero_si128(); const __m128i zeros = _mm_setzero_si128();
@ -240,7 +242,8 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_a_sse3(int16_t** result,
const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips); const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips); const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
__VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; __VOLK_ATTR_ALIGNED(16)
int local_code_chip_index[4];
int local_code_chip_index_; int local_code_chip_index_;
const __m128i zeros = _mm_setzero_si128(); const __m128i zeros = _mm_setzero_si128();
@ -310,7 +313,8 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_u_sse3(int16_t** result,
const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips); const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips); const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
__VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; __VOLK_ATTR_ALIGNED(16)
int local_code_chip_index[4];
int local_code_chip_index_; int local_code_chip_index_;
const __m128i zeros = _mm_setzero_si128(); const __m128i zeros = _mm_setzero_si128();
@ -379,7 +383,8 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_a_avx(int16_t** result,
const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips); const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips);
const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips); const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips);
__VOLK_ATTR_ALIGNED(32) int local_code_chip_index[8]; __VOLK_ATTR_ALIGNED(32)
int local_code_chip_index[8];
int local_code_chip_index_; int local_code_chip_index_;
const __m256 zeros = _mm256_setzero_ps(); const __m256 zeros = _mm256_setzero_ps();
@ -456,7 +461,8 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_u_avx(int16_t** result,
const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips); const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips);
const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips); const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips);
__VOLK_ATTR_ALIGNED(32) int local_code_chip_index[8]; __VOLK_ATTR_ALIGNED(32)
int local_code_chip_index[8];
int local_code_chip_index_; int local_code_chip_index_;
const __m256 zeros = _mm256_setzero_ps(); const __m256 zeros = _mm256_setzero_ps();
@ -531,7 +537,8 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_neon(int16_t** result, c
const float32x4_t rem_code_phase_chips_reg = vdupq_n_f32(rem_code_phase_chips); const float32x4_t rem_code_phase_chips_reg = vdupq_n_f32(rem_code_phase_chips);
const float32x4_t code_phase_step_chips_reg = vdupq_n_f32(code_phase_step_chips); const float32x4_t code_phase_step_chips_reg = vdupq_n_f32(code_phase_step_chips);
__VOLK_ATTR_ALIGNED(16) int32_t local_code_chip_index[4]; __VOLK_ATTR_ALIGNED(16)
int32_t local_code_chip_index[4];
int32_t local_code_chip_index_; int32_t local_code_chip_index_;
const int32x4_t zeros = vdupq_n_s32(0); const int32x4_t zeros = vdupq_n_s32(0);
@ -539,7 +546,8 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_neon(int16_t** result, c
const int32x4_t code_length_chips_reg_i = vdupq_n_s32((int32_t)code_length_chips); const int32x4_t code_length_chips_reg_i = vdupq_n_s32((int32_t)code_length_chips);
int32x4_t local_code_chip_index_reg, aux_i, negatives, i; int32x4_t local_code_chip_index_reg, aux_i, negatives, i;
float32x4_t aux, aux2, shifts_chips_reg, fi, c, j, cTrunc, base, indexn, reciprocal; float32x4_t aux, aux2, shifts_chips_reg, fi, c, j, cTrunc, base, indexn, reciprocal;
__VOLK_ATTR_ALIGNED(16) const float vec[4] = { 0.0f, 1.0f, 2.0f, 3.0f }; __VOLK_ATTR_ALIGNED(16)
const float vec[4] = {0.0f, 1.0f, 2.0f, 3.0f};
uint32x4_t igx; uint32x4_t igx;
reciprocal = vrecpeq_f32(code_length_chips_reg_f); reciprocal = vrecpeq_f32(code_length_chips_reg_f);
reciprocal = vmulq_f32(vrecpsq_f32(code_length_chips_reg_f, reciprocal), reciprocal); reciprocal = vmulq_f32(vrecpsq_f32(code_length_chips_reg_f, reciprocal), reciprocal);
@ -605,4 +613,3 @@ static inline void volk_gnsssdr_16i_xn_resampler_16i_xn_neon(int16_t** result, c
#endif /*INCLUDED_volk_gnsssdr_16i_xn_resampler_16i_xn_H*/ #endif /*INCLUDED_volk_gnsssdr_16i_xn_resampler_16i_xn_H*/

View File

@ -192,7 +192,8 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc
const lv_16sc_t* _in_common = in_common; const lv_16sc_t* _in_common = in_common;
lv_16sc_t* _out = result; lv_16sc_t* _out = result;
__VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; __VOLK_ATTR_ALIGNED(16)
lv_16sc_t dotProductVector[4];
__m128i* cacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment()); __m128i* cacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment());
@ -206,11 +207,13 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc
// phase rotation registers // phase rotation registers
__m128 pa, pb, two_phase_acc_reg, two_phase_inc_reg; __m128 pa, pb, two_phase_acc_reg, two_phase_inc_reg;
__m128i pc1, pc2; __m128i pc1, pc2;
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_inc[2]; __VOLK_ATTR_ALIGNED(16)
lv_32fc_t two_phase_inc[2];
two_phase_inc[0] = phase_inc * phase_inc; two_phase_inc[0] = phase_inc * phase_inc;
two_phase_inc[1] = phase_inc * phase_inc; two_phase_inc[1] = phase_inc * phase_inc;
two_phase_inc_reg = _mm_load_ps((float*)two_phase_inc); two_phase_inc_reg = _mm_load_ps((float*)two_phase_inc);
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_acc[2]; __VOLK_ATTR_ALIGNED(16)
lv_32fc_t two_phase_acc[2];
two_phase_acc[0] = (*phase); two_phase_acc[0] = (*phase);
two_phase_acc[1] = (*phase) * phase_inc; two_phase_acc[1] = (*phase) * phase_inc;
two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc); two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc);
@ -290,7 +293,6 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc
for (n_vec = 0; n_vec < num_a_vectors; n_vec++) for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{ {
a = cacc[n_vec]; a = cacc[n_vec];
_mm_store_si128((__m128i*)dotProductVector, a); // Store the results back into the dot product vector _mm_store_si128((__m128i*)dotProductVector, a); // Store the results back into the dot product vector
dotProduct = lv_cmake(0, 0); dotProduct = lv_cmake(0, 0);
@ -597,7 +599,8 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc
const lv_16sc_t* _in_common = in_common; const lv_16sc_t* _in_common = in_common;
lv_16sc_t* _out = result; lv_16sc_t* _out = result;
__VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; __VOLK_ATTR_ALIGNED(16)
lv_16sc_t dotProductVector[4];
__m128i* cacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment()); __m128i* cacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment());
@ -611,11 +614,13 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc
// phase rotation registers // phase rotation registers
__m128 pa, pb, two_phase_acc_reg, two_phase_inc_reg; __m128 pa, pb, two_phase_acc_reg, two_phase_inc_reg;
__m128i pc1, pc2; __m128i pc1, pc2;
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_inc[2]; __VOLK_ATTR_ALIGNED(16)
lv_32fc_t two_phase_inc[2];
two_phase_inc[0] = phase_inc * phase_inc; two_phase_inc[0] = phase_inc * phase_inc;
two_phase_inc[1] = phase_inc * phase_inc; two_phase_inc[1] = phase_inc * phase_inc;
two_phase_inc_reg = _mm_load_ps((float*)two_phase_inc); two_phase_inc_reg = _mm_load_ps((float*)two_phase_inc);
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_acc[2]; __VOLK_ATTR_ALIGNED(16)
lv_32fc_t two_phase_acc[2];
two_phase_acc[0] = (*phase); two_phase_acc[0] = (*phase);
two_phase_acc[1] = (*phase) * phase_inc; two_phase_acc[1] = (*phase) * phase_inc;
two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc); two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc);
@ -695,7 +700,6 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc
for (n_vec = 0; n_vec < num_a_vectors; n_vec++) for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
{ {
a = cacc[n_vec]; a = cacc[n_vec];
_mm_store_si128((__m128i*)dotProductVector, a); // Store the results back into the dot product vector _mm_store_si128((__m128i*)dotProductVector, a); // Store the results back into the dot product vector
dotProduct = lv_cmake(0, 0); dotProduct = lv_cmake(0, 0);
@ -755,7 +759,8 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc
lv_16sc_t tmp16; lv_16sc_t tmp16;
lv_32fc_t tmp32; lv_32fc_t tmp32;
__VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8]; __VOLK_ATTR_ALIGNED(32)
lv_16sc_t dotProductVector[8];
lv_16sc_t dotProduct = lv_cmake(0, 0); lv_16sc_t dotProduct = lv_cmake(0, 0);
__m256i* cacc = (__m256i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256i), volk_gnsssdr_get_alignment()); __m256i* cacc = (__m256i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256i), volk_gnsssdr_get_alignment());
@ -780,8 +785,10 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc
_phase_inc /= hypotf(lv_creal(_phase_inc), lv_cimag(_phase_inc)); _phase_inc /= hypotf(lv_creal(_phase_inc), lv_cimag(_phase_inc));
#endif #endif
__VOLK_ATTR_ALIGNED(32) lv_32fc_t four_phase_inc[4]; __VOLK_ATTR_ALIGNED(32)
__VOLK_ATTR_ALIGNED(32) lv_32fc_t four_phase_acc[4]; lv_32fc_t four_phase_inc[4];
__VOLK_ATTR_ALIGNED(32)
lv_32fc_t four_phase_acc[4];
for (n = 0; n < 4; ++n) for (n = 0; n < 4; ++n)
{ {
four_phase_inc[n] = _phase_inc; four_phase_inc[n] = _phase_inc;
@ -885,7 +892,6 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc
sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp)));
} }
} }
} }
#endif /* LV_HAVE_AVX2 */ #endif /* LV_HAVE_AVX2 */
@ -907,7 +913,8 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_u_avx2(lv_16sc
lv_16sc_t tmp16; lv_16sc_t tmp16;
lv_32fc_t tmp32; lv_32fc_t tmp32;
__VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8]; __VOLK_ATTR_ALIGNED(32)
lv_16sc_t dotProductVector[8];
lv_16sc_t dotProduct = lv_cmake(0, 0); lv_16sc_t dotProduct = lv_cmake(0, 0);
__m256i* cacc = (__m256i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256i), volk_gnsssdr_get_alignment()); __m256i* cacc = (__m256i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256i), volk_gnsssdr_get_alignment());
@ -932,8 +939,10 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_u_avx2(lv_16sc
_phase_inc /= hypotf(lv_creal(_phase_inc), lv_cimag(_phase_inc)); _phase_inc /= hypotf(lv_creal(_phase_inc), lv_cimag(_phase_inc));
#endif #endif
__VOLK_ATTR_ALIGNED(32) lv_32fc_t four_phase_inc[4]; __VOLK_ATTR_ALIGNED(32)
__VOLK_ATTR_ALIGNED(32) lv_32fc_t four_phase_acc[4]; lv_32fc_t four_phase_inc[4];
__VOLK_ATTR_ALIGNED(32)
lv_32fc_t four_phase_acc[4];
for (n = 0; n < 4; ++n) for (n = 0; n < 4; ++n)
{ {
four_phase_inc[n] = _phase_inc; four_phase_inc[n] = _phase_inc;
@ -1037,7 +1046,6 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_u_avx2(lv_16sc
sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp)));
} }
} }
} }
#endif /* LV_HAVE_AVX2 */ #endif /* LV_HAVE_AVX2 */
@ -1596,5 +1604,3 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn_u_avx2(lv_16sc
//#endif [> LV_HAVE_NEON <] //#endif [> LV_HAVE_NEON <]
#endif /*INCLUDED_volk_gnsssdr_16ic_16i_dot_prod_16ic_xn_H*/ #endif /*INCLUDED_volk_gnsssdr_16ic_16i_dot_prod_16ic_xn_H*/

View File

@ -379,6 +379,3 @@ static inline void volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_u_avx2(lv_
//#endif // NEON //#endif // NEON
#endif // INCLUDED_volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_H #endif // INCLUDED_volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic_H

View File

@ -231,4 +231,3 @@ static inline void volk_gnsssdr_16ic_conjugate_16ic_u_avx2(lv_16sc_t* cVector, c
//#endif /* LV_HAVE_NEON */ //#endif /* LV_HAVE_NEON */
#endif /* INCLUDED_volk_gnsssdr_16ic_conjugate_16ic_H */ #endif /* INCLUDED_volk_gnsssdr_16ic_conjugate_16ic_H */

View File

@ -97,7 +97,8 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_a_sse2(lv_16sc_t* resul
lv_16sc_t* _result = result; lv_16sc_t* _result = result;
__VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; __VOLK_ATTR_ALIGNED(16)
int local_code_chip_index[4];
__m128 _rem_code_phase, _code_phase_step_chips; __m128 _rem_code_phase, _code_phase_step_chips;
__m128i _code_length_chips, _code_length_chips_minus1; __m128i _code_length_chips, _code_length_chips_minus1;
__m128 _code_phase_out, _code_phase_out_with_offset; __m128 _code_phase_out, _code_phase_out_with_offset;
@ -105,13 +106,15 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_a_sse2(lv_16sc_t* resul
_rem_code_phase = _mm_load1_ps(&rem_code_phase_chips); //load float to all four float values in m128 register _rem_code_phase = _mm_load1_ps(&rem_code_phase_chips); //load float to all four float values in m128 register
_code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips); //load float to all four float values in m128 register _code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips); //load float to all four float values in m128 register
__VOLK_ATTR_ALIGNED(16) int four_times_code_length_chips_minus1[4]; __VOLK_ATTR_ALIGNED(16)
int four_times_code_length_chips_minus1[4];
four_times_code_length_chips_minus1[0] = code_length_chips - 1; four_times_code_length_chips_minus1[0] = code_length_chips - 1;
four_times_code_length_chips_minus1[1] = code_length_chips - 1; four_times_code_length_chips_minus1[1] = code_length_chips - 1;
four_times_code_length_chips_minus1[2] = code_length_chips - 1; four_times_code_length_chips_minus1[2] = code_length_chips - 1;
four_times_code_length_chips_minus1[3] = code_length_chips - 1; four_times_code_length_chips_minus1[3] = code_length_chips - 1;
__VOLK_ATTR_ALIGNED(16) int four_times_code_length_chips[4]; __VOLK_ATTR_ALIGNED(16)
int four_times_code_length_chips[4];
four_times_code_length_chips[0] = code_length_chips; four_times_code_length_chips[0] = code_length_chips;
four_times_code_length_chips[1] = code_length_chips; four_times_code_length_chips[1] = code_length_chips;
four_times_code_length_chips[2] = code_length_chips; four_times_code_length_chips[2] = code_length_chips;
@ -124,9 +127,11 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_a_sse2(lv_16sc_t* resul
__m128i zero = _mm_setzero_si128(); __m128i zero = _mm_setzero_si128();
__VOLK_ATTR_ALIGNED(16) float init_idx_float[4] = { 0.0f, 1.0f, 2.0f, 3.0f }; __VOLK_ATTR_ALIGNED(16)
float init_idx_float[4] = {0.0f, 1.0f, 2.0f, 3.0f};
__m128 _4output_index = _mm_load_ps(init_idx_float); __m128 _4output_index = _mm_load_ps(init_idx_float);
__VOLK_ATTR_ALIGNED(16) float init_4constant_float[4] = { 4.0f, 4.0f, 4.0f, 4.0f }; __VOLK_ATTR_ALIGNED(16)
float init_4constant_float[4] = {4.0f, 4.0f, 4.0f, 4.0f};
__m128 _4constant_float = _mm_load_ps(init_4constant_float); __m128 _4constant_float = _mm_load_ps(init_4constant_float);
for (number = 0; number < quarterPoints; number++) for (number = 0; number < quarterPoints; number++)
@ -177,7 +182,8 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_u_sse2(lv_16sc_t* resul
lv_16sc_t* _result = result; lv_16sc_t* _result = result;
__VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; __VOLK_ATTR_ALIGNED(16)
int local_code_chip_index[4];
__m128 _rem_code_phase, _code_phase_step_chips; __m128 _rem_code_phase, _code_phase_step_chips;
__m128i _code_length_chips, _code_length_chips_minus1; __m128i _code_length_chips, _code_length_chips_minus1;
__m128 _code_phase_out, _code_phase_out_with_offset; __m128 _code_phase_out, _code_phase_out_with_offset;
@ -185,13 +191,15 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_u_sse2(lv_16sc_t* resul
_rem_code_phase = _mm_load1_ps(&rem_code_phase_chips); //load float to all four float values in m128 register _rem_code_phase = _mm_load1_ps(&rem_code_phase_chips); //load float to all four float values in m128 register
_code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips); //load float to all four float values in m128 register _code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips); //load float to all four float values in m128 register
__VOLK_ATTR_ALIGNED(16) int four_times_code_length_chips_minus1[4]; __VOLK_ATTR_ALIGNED(16)
int four_times_code_length_chips_minus1[4];
four_times_code_length_chips_minus1[0] = code_length_chips - 1; four_times_code_length_chips_minus1[0] = code_length_chips - 1;
four_times_code_length_chips_minus1[1] = code_length_chips - 1; four_times_code_length_chips_minus1[1] = code_length_chips - 1;
four_times_code_length_chips_minus1[2] = code_length_chips - 1; four_times_code_length_chips_minus1[2] = code_length_chips - 1;
four_times_code_length_chips_minus1[3] = code_length_chips - 1; four_times_code_length_chips_minus1[3] = code_length_chips - 1;
__VOLK_ATTR_ALIGNED(16) int four_times_code_length_chips[4]; __VOLK_ATTR_ALIGNED(16)
int four_times_code_length_chips[4];
four_times_code_length_chips[0] = code_length_chips; four_times_code_length_chips[0] = code_length_chips;
four_times_code_length_chips[1] = code_length_chips; four_times_code_length_chips[1] = code_length_chips;
four_times_code_length_chips[2] = code_length_chips; four_times_code_length_chips[2] = code_length_chips;
@ -204,9 +212,11 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_u_sse2(lv_16sc_t* resul
__m128i zero = _mm_setzero_si128(); __m128i zero = _mm_setzero_si128();
__VOLK_ATTR_ALIGNED(16) float init_idx_float[4] = { 0.0f, 1.0f, 2.0f, 3.0f }; __VOLK_ATTR_ALIGNED(16)
float init_idx_float[4] = {0.0f, 1.0f, 2.0f, 3.0f};
__m128 _4output_index = _mm_loadu_ps(init_idx_float); __m128 _4output_index = _mm_loadu_ps(init_idx_float);
__VOLK_ATTR_ALIGNED(16) float init_4constant_float[4] = { 4.0f, 4.0f, 4.0f, 4.0f }; __VOLK_ATTR_ALIGNED(16)
float init_4constant_float[4] = {4.0f, 4.0f, 4.0f, 4.0f};
__m128 _4constant_float = _mm_loadu_ps(init_4constant_float); __m128 _4constant_float = _mm_loadu_ps(init_4constant_float);
for (number = 0; number < quarterPoints; number++) for (number = 0; number < quarterPoints; number++)
@ -257,7 +267,8 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_neon(lv_16sc_t* result,
lv_16sc_t* _result = result; lv_16sc_t* _result = result;
__VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; __VOLK_ATTR_ALIGNED(16)
int local_code_chip_index[4];
float32x4_t _rem_code_phase, _code_phase_step_chips; float32x4_t _rem_code_phase, _code_phase_step_chips;
int32x4_t _code_length_chips, _code_length_chips_minus1; int32x4_t _code_length_chips, _code_length_chips_minus1;
float32x4_t _code_phase_out, _code_phase_out_with_offset; float32x4_t _code_phase_out, _code_phase_out_with_offset;
@ -266,13 +277,15 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_neon(lv_16sc_t* result,
_rem_code_phase = vld1q_dup_f32(&rem_code_phase_chips); //load float to all four float values in m128 register _rem_code_phase = vld1q_dup_f32(&rem_code_phase_chips); //load float to all four float values in m128 register
_code_phase_step_chips = vld1q_dup_f32(&code_phase_step_chips); //load float to all four float values in m128 register _code_phase_step_chips = vld1q_dup_f32(&code_phase_step_chips); //load float to all four float values in m128 register
__VOLK_ATTR_ALIGNED(16) int four_times_code_length_chips_minus1[4]; __VOLK_ATTR_ALIGNED(16)
int four_times_code_length_chips_minus1[4];
four_times_code_length_chips_minus1[0] = code_length_chips - 1; four_times_code_length_chips_minus1[0] = code_length_chips - 1;
four_times_code_length_chips_minus1[1] = code_length_chips - 1; four_times_code_length_chips_minus1[1] = code_length_chips - 1;
four_times_code_length_chips_minus1[2] = code_length_chips - 1; four_times_code_length_chips_minus1[2] = code_length_chips - 1;
four_times_code_length_chips_minus1[3] = code_length_chips - 1; four_times_code_length_chips_minus1[3] = code_length_chips - 1;
__VOLK_ATTR_ALIGNED(16) int four_times_code_length_chips[4]; __VOLK_ATTR_ALIGNED(16)
int four_times_code_length_chips[4];
four_times_code_length_chips[0] = code_length_chips; four_times_code_length_chips[0] = code_length_chips;
four_times_code_length_chips[1] = code_length_chips; four_times_code_length_chips[1] = code_length_chips;
four_times_code_length_chips[2] = code_length_chips; four_times_code_length_chips[2] = code_length_chips;
@ -285,9 +298,11 @@ static inline void volk_gnsssdr_16ic_resampler_fast_16ic_neon(lv_16sc_t* result,
uint32x4_t negative_indexes, overflow_indexes; uint32x4_t negative_indexes, overflow_indexes;
int32x4_t zero = vmovq_n_s32(0); int32x4_t zero = vmovq_n_s32(0);
__VOLK_ATTR_ALIGNED(16) float init_idx_float[4] = { 0.0f, 1.0f, 2.0f, 3.0f }; __VOLK_ATTR_ALIGNED(16)
float init_idx_float[4] = {0.0f, 1.0f, 2.0f, 3.0f};
float32x4_t _4output_index = vld1q_f32(init_idx_float); float32x4_t _4output_index = vld1q_f32(init_idx_float);
__VOLK_ATTR_ALIGNED(16) float init_4constant_float[4] = { 4.0f, 4.0f, 4.0f, 4.0f }; __VOLK_ATTR_ALIGNED(16)
float init_4constant_float[4] = {4.0f, 4.0f, 4.0f, 4.0f};
float32x4_t _4constant_float = vld1q_f32(init_4constant_float); float32x4_t _4constant_float = vld1q_f32(init_4constant_float);
for (number = 0; number < quarterPoints; number++) for (number = 0; number < quarterPoints; number++)

View File

@ -141,11 +141,13 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3(lv_16sc_t* out
unsigned int number; unsigned int number;
__m128 a, b, two_phase_acc_reg, two_phase_inc_reg; __m128 a, b, two_phase_acc_reg, two_phase_inc_reg;
__m128i c1, c2, result; __m128i c1, c2, result;
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_inc[2]; __VOLK_ATTR_ALIGNED(16)
lv_32fc_t two_phase_inc[2];
two_phase_inc[0] = phase_inc * phase_inc; two_phase_inc[0] = phase_inc * phase_inc;
two_phase_inc[1] = phase_inc * phase_inc; two_phase_inc[1] = phase_inc * phase_inc;
two_phase_inc_reg = _mm_load_ps((float*)two_phase_inc); two_phase_inc_reg = _mm_load_ps((float*)two_phase_inc);
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_acc[2]; __VOLK_ATTR_ALIGNED(16)
lv_32fc_t two_phase_acc[2];
two_phase_acc[0] = (*phase); two_phase_acc[0] = (*phase);
two_phase_acc[1] = (*phase) * phase_inc; two_phase_acc[1] = (*phase) * phase_inc;
two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc); two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc);
@ -232,7 +234,6 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3(lv_16sc_t* out
#endif /* LV_HAVE_SSE3 */ #endif /* LV_HAVE_SSE3 */
#ifdef LV_HAVE_SSE3 #ifdef LV_HAVE_SSE3
#include <pmmintrin.h> #include <pmmintrin.h>
@ -244,11 +245,13 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3_reload(lv_16sc
unsigned int j; unsigned int j;
__m128 a, b, two_phase_acc_reg, two_phase_inc_reg; __m128 a, b, two_phase_acc_reg, two_phase_inc_reg;
__m128i c1, c2, result; __m128i c1, c2, result;
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_inc[2]; __VOLK_ATTR_ALIGNED(16)
lv_32fc_t two_phase_inc[2];
two_phase_inc[0] = phase_inc * phase_inc; two_phase_inc[0] = phase_inc * phase_inc;
two_phase_inc[1] = phase_inc * phase_inc; two_phase_inc[1] = phase_inc * phase_inc;
two_phase_inc_reg = _mm_load_ps((float*)two_phase_inc); two_phase_inc_reg = _mm_load_ps((float*)two_phase_inc);
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_acc[2]; __VOLK_ATTR_ALIGNED(16)
lv_32fc_t two_phase_acc[2];
two_phase_acc[0] = (*phase); two_phase_acc[0] = (*phase);
two_phase_acc[1] = (*phase) * phase_inc; two_phase_acc[1] = (*phase) * phase_inc;
two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc); two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc);
@ -385,7 +388,6 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_a_sse3_reload(lv_16sc
#endif /* LV_HAVE_SSE3 */ #endif /* LV_HAVE_SSE3 */
#ifdef LV_HAVE_SSE3 #ifdef LV_HAVE_SSE3
#include <pmmintrin.h> #include <pmmintrin.h>
@ -395,11 +397,13 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3(lv_16sc_t* out
unsigned int number; unsigned int number;
__m128 a, b, two_phase_acc_reg, two_phase_inc_reg; __m128 a, b, two_phase_acc_reg, two_phase_inc_reg;
__m128i c1, c2, result; __m128i c1, c2, result;
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_inc[2]; __VOLK_ATTR_ALIGNED(16)
lv_32fc_t two_phase_inc[2];
two_phase_inc[0] = phase_inc * phase_inc; two_phase_inc[0] = phase_inc * phase_inc;
two_phase_inc[1] = phase_inc * phase_inc; two_phase_inc[1] = phase_inc * phase_inc;
two_phase_inc_reg = _mm_load_ps((float*)two_phase_inc); two_phase_inc_reg = _mm_load_ps((float*)two_phase_inc);
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_acc[2]; __VOLK_ATTR_ALIGNED(16)
lv_32fc_t two_phase_acc[2];
two_phase_acc[0] = (*phase); two_phase_acc[0] = (*phase);
two_phase_acc[1] = (*phase) * phase_inc; two_phase_acc[1] = (*phase) * phase_inc;
two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc); two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc);
@ -498,11 +502,13 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_u_sse3_reload(lv_16sc
unsigned int j; unsigned int j;
__m128 a, b, two_phase_acc_reg, two_phase_inc_reg; __m128 a, b, two_phase_acc_reg, two_phase_inc_reg;
__m128i c1, c2, result; __m128i c1, c2, result;
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_inc[2]; __VOLK_ATTR_ALIGNED(16)
lv_32fc_t two_phase_inc[2];
two_phase_inc[0] = phase_inc * phase_inc; two_phase_inc[0] = phase_inc * phase_inc;
two_phase_inc[1] = phase_inc * phase_inc; two_phase_inc[1] = phase_inc * phase_inc;
two_phase_inc_reg = _mm_load_ps((float*)two_phase_inc); two_phase_inc_reg = _mm_load_ps((float*)two_phase_inc);
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_acc[2]; __VOLK_ATTR_ALIGNED(16)
lv_32fc_t two_phase_acc[2];
two_phase_acc[0] = (*phase); two_phase_acc[0] = (*phase);
two_phase_acc[1] = (*phase) * phase_inc; two_phase_acc[1] = (*phase) * phase_inc;
two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc); two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc);
@ -657,8 +663,10 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_neon(lv_16sc_t* outVe
lv_16sc_t* _out = outVector; lv_16sc_t* _out = outVector;
lv_32fc_t ___phase4 = phase_inc * phase_inc * phase_inc * phase_inc; lv_32fc_t ___phase4 = phase_inc * phase_inc * phase_inc * phase_inc;
__VOLK_ATTR_ALIGNED(16) float32_t __phase4_real[4] = { lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4) }; __VOLK_ATTR_ALIGNED(16)
__VOLK_ATTR_ALIGNED(16) float32_t __phase4_imag[4] = { lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4) }; float32_t __phase4_real[4] = {lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4)};
__VOLK_ATTR_ALIGNED(16)
float32_t __phase4_imag[4] = {lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4)};
float32x4_t _phase4_real = vld1q_f32(__phase4_real); float32x4_t _phase4_real = vld1q_f32(__phase4_real);
float32x4_t _phase4_imag = vld1q_f32(__phase4_imag); float32x4_t _phase4_imag = vld1q_f32(__phase4_imag);
@ -667,8 +675,10 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_neon(lv_16sc_t* outVe
lv_32fc_t phase3 = phase2 * phase_inc; lv_32fc_t phase3 = phase2 * phase_inc;
lv_32fc_t phase4 = phase3 * phase_inc; lv_32fc_t phase4 = phase3 * phase_inc;
__VOLK_ATTR_ALIGNED(16) float32_t __phase_real[4] = { lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) }; __VOLK_ATTR_ALIGNED(16)
__VOLK_ATTR_ALIGNED(16) float32_t __phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; float32_t __phase_real[4] = {lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4)};
__VOLK_ATTR_ALIGNED(16)
float32_t __phase_imag[4] = {lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4)};
float32x4_t _phase_real = vld1q_f32(__phase_real); float32x4_t _phase_real = vld1q_f32(__phase_real);
float32x4_t _phase_imag = vld1q_f32(__phase_imag); float32x4_t _phase_imag = vld1q_f32(__phase_imag);
@ -745,8 +755,10 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_neon(lv_16sc_t* outVe
phase3 = phase2 * phase_inc; phase3 = phase2 * phase_inc;
phase4 = phase3 * phase_inc; phase4 = phase3 * phase_inc;
__VOLK_ATTR_ALIGNED(16) float32_t ____phase_real[4] = { lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) }; __VOLK_ATTR_ALIGNED(16)
__VOLK_ATTR_ALIGNED(16) float32_t ____phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; float32_t ____phase_real[4] = {lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4)};
__VOLK_ATTR_ALIGNED(16)
float32_t ____phase_imag[4] = {lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4)};
_phase_real = vld1q_f32(____phase_real); _phase_real = vld1q_f32(____phase_real);
_phase_imag = vld1q_f32(____phase_imag); _phase_imag = vld1q_f32(____phase_imag);
@ -791,8 +803,10 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_neon_reload(lv_16sc_t
lv_16sc_t* _out = outVector; lv_16sc_t* _out = outVector;
lv_32fc_t ___phase4 = phase_inc * phase_inc * phase_inc * phase_inc; lv_32fc_t ___phase4 = phase_inc * phase_inc * phase_inc * phase_inc;
__VOLK_ATTR_ALIGNED(16) float32_t __phase4_real[4] = { lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4) }; __VOLK_ATTR_ALIGNED(16)
__VOLK_ATTR_ALIGNED(16) float32_t __phase4_imag[4] = { lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4) }; float32_t __phase4_real[4] = {lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4)};
__VOLK_ATTR_ALIGNED(16)
float32_t __phase4_imag[4] = {lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4)};
float32x4_t _phase4_real = vld1q_f32(__phase4_real); float32x4_t _phase4_real = vld1q_f32(__phase4_real);
float32x4_t _phase4_imag = vld1q_f32(__phase4_imag); float32x4_t _phase4_imag = vld1q_f32(__phase4_imag);
@ -801,8 +815,10 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_neon_reload(lv_16sc_t
lv_32fc_t phase3 = phase2 * phase_inc; lv_32fc_t phase3 = phase2 * phase_inc;
lv_32fc_t phase4 = phase3 * phase_inc; lv_32fc_t phase4 = phase3 * phase_inc;
__VOLK_ATTR_ALIGNED(16) float32_t __phase_real[4] = { lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) }; __VOLK_ATTR_ALIGNED(16)
__VOLK_ATTR_ALIGNED(16) float32_t __phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; float32_t __phase_real[4] = {lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4)};
__VOLK_ATTR_ALIGNED(16)
float32_t __phase_imag[4] = {lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4)};
float32x4_t _phase_real = vld1q_f32(__phase_real); float32x4_t _phase_real = vld1q_f32(__phase_real);
float32x4_t _phase_imag = vld1q_f32(__phase_imag); float32x4_t _phase_imag = vld1q_f32(__phase_imag);
@ -879,8 +895,10 @@ static inline void volk_gnsssdr_16ic_s32fc_x2_rotator_16ic_neon_reload(lv_16sc_t
phase3 = phase2 * phase_inc; phase3 = phase2 * phase_inc;
phase4 = phase3 * phase_inc; phase4 = phase3 * phase_inc;
__VOLK_ATTR_ALIGNED(16) float32_t ____phase_real[4] = { lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) }; __VOLK_ATTR_ALIGNED(16)
__VOLK_ATTR_ALIGNED(16) float32_t ____phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; float32_t ____phase_real[4] = {lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4)};
__VOLK_ATTR_ALIGNED(16)
float32_t ____phase_imag[4] = {lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4)};
_phase_real = vld1q_f32(____phase_real); _phase_real = vld1q_f32(____phase_real);
_phase_imag = vld1q_f32(____phase_imag); _phase_imag = vld1q_f32(____phase_imag);

View File

@ -96,7 +96,8 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_a_sse2(lv_16sc_t* out, con
if (sse_iters > 0) if (sse_iters > 0)
{ {
__m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, realcacc, imagcacc; __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, realcacc, imagcacc;
__VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; __VOLK_ATTR_ALIGNED(16)
lv_16sc_t dotProductVector[4];
realcacc = _mm_setzero_si128(); realcacc = _mm_setzero_si128();
imagcacc = _mm_setzero_si128(); imagcacc = _mm_setzero_si128();
@ -174,7 +175,8 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_u_sse2(lv_16sc_t* out, con
if (sse_iters > 0) if (sse_iters > 0)
{ {
__m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, realcacc, imagcacc, result; __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, realcacc, imagcacc, result;
__VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; __VOLK_ATTR_ALIGNED(16)
lv_16sc_t dotProductVector[4];
realcacc = _mm_setzero_si128(); realcacc = _mm_setzero_si128();
imagcacc = _mm_setzero_si128(); imagcacc = _mm_setzero_si128();
@ -253,7 +255,8 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_u_axv2(lv_16sc_t* out, con
if (avx_iters > 0) if (avx_iters > 0)
{ {
__m256i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, realcacc, imagcacc, result; __m256i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, realcacc, imagcacc, result;
__VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8]; __VOLK_ATTR_ALIGNED(32)
lv_16sc_t dotProductVector[8];
realcacc = _mm256_setzero_si256(); realcacc = _mm256_setzero_si256();
imagcacc = _mm256_setzero_si256(); imagcacc = _mm256_setzero_si256();
@ -330,7 +333,8 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_a_axv2(lv_16sc_t* out, con
if (avx_iters > 0) if (avx_iters > 0)
{ {
__m256i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, realcacc, imagcacc, result; __m256i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl, realcacc, imagcacc, result;
__VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8]; __VOLK_ATTR_ALIGNED(32)
lv_16sc_t dotProductVector[8];
realcacc = _mm256_setzero_si256(); realcacc = _mm256_setzero_si256();
imagcacc = _mm256_setzero_si256(); imagcacc = _mm256_setzero_si256();
@ -407,7 +411,8 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_neon(lv_16sc_t* out, const
// 2nd lane holds the imaginary part // 2nd lane holds the imaginary part
int16x4x2_t a_val, b_val, c_val, accumulator; int16x4x2_t a_val, b_val, c_val, accumulator;
int16x4x2_t tmp_real, tmp_imag; int16x4x2_t tmp_real, tmp_imag;
__VOLK_ATTR_ALIGNED(16) lv_16sc_t accum_result[4]; __VOLK_ATTR_ALIGNED(16)
lv_16sc_t accum_result[4];
accumulator.val[0] = vdup_n_s16(0); accumulator.val[0] = vdup_n_s16(0);
accumulator.val[1] = vdup_n_s16(0); accumulator.val[1] = vdup_n_s16(0);
lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0); lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);
@ -474,7 +479,8 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_neon_vma(lv_16sc_t* out, c
// 2nd lane holds the imaginary part // 2nd lane holds the imaginary part
int16x4x2_t a_val, b_val, accumulator; int16x4x2_t a_val, b_val, accumulator;
int16x4x2_t tmp; int16x4x2_t tmp;
__VOLK_ATTR_ALIGNED(16) lv_16sc_t accum_result[4]; __VOLK_ATTR_ALIGNED(16)
lv_16sc_t accum_result[4];
accumulator.val[0] = vdup_n_s16(0); accumulator.val[0] = vdup_n_s16(0);
accumulator.val[1] = vdup_n_s16(0); accumulator.val[1] = vdup_n_s16(0);
@ -526,7 +532,8 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_neon_optvma(lv_16sc_t* out
// 2nd lane holds the imaginary part // 2nd lane holds the imaginary part
int16x4x2_t a_val, b_val, accumulator1, accumulator2; int16x4x2_t a_val, b_val, accumulator1, accumulator2;
__VOLK_ATTR_ALIGNED(16) lv_16sc_t accum_result[4]; __VOLK_ATTR_ALIGNED(16)
lv_16sc_t accum_result[4];
accumulator1.val[0] = vdup_n_s16(0); accumulator1.val[0] = vdup_n_s16(0);
accumulator1.val[1] = vdup_n_s16(0); accumulator1.val[1] = vdup_n_s16(0);
accumulator2.val[0] = vdup_n_s16(0); accumulator2.val[0] = vdup_n_s16(0);

View File

@ -125,7 +125,8 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_sse2(lv_16sc_t* resul
if (sse_iters > 0) if (sse_iters > 0)
{ {
__VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; __VOLK_ATTR_ALIGNED(16)
lv_16sc_t dotProductVector[4];
__m128i* realcacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment()); __m128i* realcacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment());
__m128i* imagcacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment()); __m128i* imagcacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment());
@ -219,7 +220,8 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_sse2(lv_16sc_t* resul
if (sse_iters > 0) if (sse_iters > 0)
{ {
__VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; __VOLK_ATTR_ALIGNED(16)
lv_16sc_t dotProductVector[4];
__m128i* realcacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment()); __m128i* realcacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment());
__m128i* imagcacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment()); __m128i* imagcacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment());
@ -313,7 +315,8 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_a_avx2(lv_16sc_t* resul
if (sse_iters > 0) if (sse_iters > 0)
{ {
__VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8]; __VOLK_ATTR_ALIGNED(32)
lv_16sc_t dotProductVector[8];
__m256i* realcacc = (__m256i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256i), volk_gnsssdr_get_alignment()); __m256i* realcacc = (__m256i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256i), volk_gnsssdr_get_alignment());
__m256i* imagcacc = (__m256i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256i), volk_gnsssdr_get_alignment()); __m256i* imagcacc = (__m256i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256i), volk_gnsssdr_get_alignment());
@ -407,7 +410,8 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_u_avx2(lv_16sc_t* resul
if (sse_iters > 0) if (sse_iters > 0)
{ {
__VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8]; __VOLK_ATTR_ALIGNED(32)
lv_16sc_t dotProductVector[8];
__m256i* realcacc = (__m256i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256i), volk_gnsssdr_get_alignment()); __m256i* realcacc = (__m256i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256i), volk_gnsssdr_get_alignment());
__m256i* imagcacc = (__m256i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256i), volk_gnsssdr_get_alignment()); __m256i* imagcacc = (__m256i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256i), volk_gnsssdr_get_alignment());
@ -501,7 +505,8 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon(lv_16sc_t* result,
if (neon_iters > 0) if (neon_iters > 0)
{ {
__VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; __VOLK_ATTR_ALIGNED(16)
lv_16sc_t dotProductVector[4];
int16x4x2_t a_val, b_val, c_val; int16x4x2_t a_val, b_val, c_val;
@ -589,7 +594,8 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_vma(lv_16sc_t* res
if (neon_iters > 0) if (neon_iters > 0)
{ {
__VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; __VOLK_ATTR_ALIGNED(16)
lv_16sc_t dotProductVector[4];
int16x4x2_t a_val, b_val, tmp; int16x4x2_t a_val, b_val, tmp;
@ -666,7 +672,8 @@ static inline void volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_neon_optvma(lv_16sc_t*
if (neon_iters > 0) if (neon_iters > 0)
{ {
__VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; __VOLK_ATTR_ALIGNED(16)
lv_16sc_t dotProductVector[4];
int16x4x2_t a_val, b_val; int16x4x2_t a_val, b_val;

View File

@ -262,5 +262,3 @@ static inline void volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_neon_optvma(lv_16sc
#endif // NEON #endif // NEON
#endif // INCLUDED_volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_H #endif // INCLUDED_volk_gnsssdr_16ic_x2_dotprodxnpuppet_16ic_H

View File

@ -292,7 +292,6 @@ static inline void volk_gnsssdr_16ic_x2_multiply_16ic_a_avx2(lv_16sc_t* out, con
#endif /* LV_HAVE_AVX2 */ #endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_NEON #ifdef LV_HAVE_NEON
#include <arm_neon.h> #include <arm_neon.h>

View File

@ -191,7 +191,8 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc_
const lv_16sc_t* _in_common = in_common; const lv_16sc_t* _in_common = in_common;
lv_16sc_t* _out = result; lv_16sc_t* _out = result;
__VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; __VOLK_ATTR_ALIGNED(16)
lv_16sc_t dotProductVector[4];
__m128i* realcacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment()); __m128i* realcacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment());
__m128i* imagcacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment()); __m128i* imagcacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment());
@ -210,11 +211,13 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3(lv_16sc_
// phase rotation registers // phase rotation registers
__m128 pa, pb, two_phase_acc_reg, two_phase_inc_reg; __m128 pa, pb, two_phase_acc_reg, two_phase_inc_reg;
__m128i pc1, pc2; __m128i pc1, pc2;
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_inc[2]; __VOLK_ATTR_ALIGNED(16)
lv_32fc_t two_phase_inc[2];
two_phase_inc[0] = phase_inc * phase_inc; two_phase_inc[0] = phase_inc * phase_inc;
two_phase_inc[1] = phase_inc * phase_inc; two_phase_inc[1] = phase_inc * phase_inc;
two_phase_inc_reg = _mm_load_ps((float*)two_phase_inc); two_phase_inc_reg = _mm_load_ps((float*)two_phase_inc);
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_acc[2]; __VOLK_ATTR_ALIGNED(16)
lv_32fc_t two_phase_acc[2];
two_phase_acc[0] = (*phase); two_phase_acc[0] = (*phase);
two_phase_acc[1] = (*phase) * phase_inc; two_phase_acc[1] = (*phase) * phase_inc;
two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc); two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc);
@ -369,7 +372,8 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(l
const lv_16sc_t* _in_common = in_common; const lv_16sc_t* _in_common = in_common;
lv_16sc_t* _out = result; lv_16sc_t* _out = result;
__VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; __VOLK_ATTR_ALIGNED(16)
lv_16sc_t dotProductVector[4];
__m128i* realcacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment()); __m128i* realcacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment());
__m128i* imagcacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment()); __m128i* imagcacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment());
@ -388,11 +392,13 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(l
// phase rotation registers // phase rotation registers
__m128 pa, pb, two_phase_acc_reg, two_phase_inc_reg; __m128 pa, pb, two_phase_acc_reg, two_phase_inc_reg;
__m128i pc1, pc2; __m128i pc1, pc2;
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_inc[2]; __VOLK_ATTR_ALIGNED(16)
lv_32fc_t two_phase_inc[2];
two_phase_inc[0] = phase_inc * phase_inc; two_phase_inc[0] = phase_inc * phase_inc;
two_phase_inc[1] = phase_inc * phase_inc; two_phase_inc[1] = phase_inc * phase_inc;
two_phase_inc_reg = _mm_load_ps((float*)two_phase_inc); two_phase_inc_reg = _mm_load_ps((float*)two_phase_inc);
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_acc[2]; __VOLK_ATTR_ALIGNED(16)
lv_32fc_t two_phase_acc[2];
two_phase_acc[0] = (*phase); two_phase_acc[0] = (*phase);
two_phase_acc[1] = (*phase) * phase_inc; two_phase_acc[1] = (*phase) * phase_inc;
two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc); two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc);
@ -594,7 +600,6 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_sse3_reload(l
sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp)));
} }
} }
} }
#endif /* LV_HAVE_SSE3 */ #endif /* LV_HAVE_SSE3 */
@ -615,7 +620,8 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc_
const lv_16sc_t* _in_common = in_common; const lv_16sc_t* _in_common = in_common;
lv_16sc_t* _out = result; lv_16sc_t* _out = result;
__VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; __VOLK_ATTR_ALIGNED(16)
lv_16sc_t dotProductVector[4];
__m128i* realcacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment()); __m128i* realcacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment());
__m128i* imagcacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment()); __m128i* imagcacc = (__m128i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128i), volk_gnsssdr_get_alignment());
@ -634,11 +640,13 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_u_sse3(lv_16sc_
// phase rotation registers // phase rotation registers
__m128 pa, pb, two_phase_acc_reg, two_phase_inc_reg; __m128 pa, pb, two_phase_acc_reg, two_phase_inc_reg;
__m128i pc1, pc2; __m128i pc1, pc2;
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_inc[2]; __VOLK_ATTR_ALIGNED(16)
lv_32fc_t two_phase_inc[2];
two_phase_inc[0] = phase_inc * phase_inc; two_phase_inc[0] = phase_inc * phase_inc;
two_phase_inc[1] = phase_inc * phase_inc; two_phase_inc[1] = phase_inc * phase_inc;
two_phase_inc_reg = _mm_loadu_ps((float*)two_phase_inc); two_phase_inc_reg = _mm_loadu_ps((float*)two_phase_inc);
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_acc[2]; __VOLK_ATTR_ALIGNED(16)
lv_32fc_t two_phase_acc[2];
two_phase_acc[0] = (*phase); two_phase_acc[0] = (*phase);
two_phase_acc[1] = (*phase) * phase_inc; two_phase_acc[1] = (*phase) * phase_inc;
two_phase_acc_reg = _mm_loadu_ps((float*)two_phase_acc); two_phase_acc_reg = _mm_loadu_ps((float*)two_phase_acc);
@ -781,7 +789,8 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc_
lv_16sc_t tmp16; lv_16sc_t tmp16;
lv_32fc_t tmp32; lv_32fc_t tmp32;
__VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8]; __VOLK_ATTR_ALIGNED(32)
lv_16sc_t dotProductVector[8];
lv_16sc_t dotProduct = lv_cmake(0, 0); lv_16sc_t dotProduct = lv_cmake(0, 0);
__m256i* realcacc = (__m256i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256i), volk_gnsssdr_get_alignment()); __m256i* realcacc = (__m256i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256i), volk_gnsssdr_get_alignment());
@ -798,11 +807,13 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc_
__m128 a, b, two_phase_acc_reg, two_phase_inc_reg; __m128 a, b, two_phase_acc_reg, two_phase_inc_reg;
__m128i c1, c2, result1, result2; __m128i c1, c2, result1, result2;
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_inc[2]; __VOLK_ATTR_ALIGNED(16)
lv_32fc_t two_phase_inc[2];
two_phase_inc[0] = phase_inc * phase_inc; two_phase_inc[0] = phase_inc * phase_inc;
two_phase_inc[1] = phase_inc * phase_inc; two_phase_inc[1] = phase_inc * phase_inc;
two_phase_inc_reg = _mm_load_ps((float*)two_phase_inc); two_phase_inc_reg = _mm_load_ps((float*)two_phase_inc);
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_acc[2]; __VOLK_ATTR_ALIGNED(16)
lv_32fc_t two_phase_acc[2];
two_phase_acc[0] = (*phase); two_phase_acc[0] = (*phase);
two_phase_acc[1] = (*phase) * phase_inc; two_phase_acc[1] = (*phase) * phase_inc;
two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc); two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc);
@ -966,7 +977,6 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2(lv_16sc_
sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp))); sat_adds16i(lv_cimag(_out[n_vec]), lv_cimag(tmp)));
} }
} }
} }
#endif /* LV_HAVE_AVX2 */ #endif /* LV_HAVE_AVX2 */
@ -989,7 +999,8 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l
lv_16sc_t tmp16; lv_16sc_t tmp16;
lv_32fc_t tmp32; lv_32fc_t tmp32;
__VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8]; __VOLK_ATTR_ALIGNED(32)
lv_16sc_t dotProductVector[8];
lv_16sc_t dotProduct = lv_cmake(0, 0); lv_16sc_t dotProduct = lv_cmake(0, 0);
__m256i* realcacc = (__m256i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256i), volk_gnsssdr_get_alignment()); __m256i* realcacc = (__m256i*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256i), volk_gnsssdr_get_alignment());
@ -1006,11 +1017,13 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_a_avx2_reload(l
__m128 a, b, two_phase_acc_reg, two_phase_inc_reg; __m128 a, b, two_phase_acc_reg, two_phase_inc_reg;
__m128i c1, c2, result1, result2; __m128i c1, c2, result1, result2;
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_inc[2]; __VOLK_ATTR_ALIGNED(16)
lv_32fc_t two_phase_inc[2];
two_phase_inc[0] = phase_inc * phase_inc; two_phase_inc[0] = phase_inc * phase_inc;
two_phase_inc[1] = phase_inc * phase_inc; two_phase_inc[1] = phase_inc * phase_inc;
two_phase_inc_reg = _mm_load_ps((float*)two_phase_inc); two_phase_inc_reg = _mm_load_ps((float*)two_phase_inc);
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_acc[2]; __VOLK_ATTR_ALIGNED(16)
lv_32fc_t two_phase_acc[2];
two_phase_acc[0] = (*phase); two_phase_acc[0] = (*phase);
two_phase_acc[1] = (*phase) * phase_inc; two_phase_acc[1] = (*phase) * phase_inc;
two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc); two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc);
@ -1312,8 +1325,10 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon(lv_16sc_t*
float phase_est; float phase_est;
lv_32fc_t ___phase4 = phase_inc * phase_inc * phase_inc * phase_inc; lv_32fc_t ___phase4 = phase_inc * phase_inc * phase_inc * phase_inc;
__VOLK_ATTR_ALIGNED(16) float32_t __phase4_real[4] = { lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4) }; __VOLK_ATTR_ALIGNED(16)
__VOLK_ATTR_ALIGNED(16) float32_t __phase4_imag[4] = { lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4) }; float32_t __phase4_real[4] = {lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4)};
__VOLK_ATTR_ALIGNED(16)
float32_t __phase4_imag[4] = {lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4)};
float32x4_t _phase4_real = vld1q_f32(__phase4_real); float32x4_t _phase4_real = vld1q_f32(__phase4_real);
float32x4_t _phase4_imag = vld1q_f32(__phase4_imag); float32x4_t _phase4_imag = vld1q_f32(__phase4_imag);
@ -1322,14 +1337,17 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon(lv_16sc_t*
lv_32fc_t phase3 = phase2 * phase_inc; lv_32fc_t phase3 = phase2 * phase_inc;
lv_32fc_t phase4 = phase3 * phase_inc; lv_32fc_t phase4 = phase3 * phase_inc;
__VOLK_ATTR_ALIGNED(16) float32_t __phase_real[4] = { lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) }; __VOLK_ATTR_ALIGNED(16)
__VOLK_ATTR_ALIGNED(16) float32_t __phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; float32_t __phase_real[4] = {lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4)};
__VOLK_ATTR_ALIGNED(16)
float32_t __phase_imag[4] = {lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4)};
float32x4_t _phase_real = vld1q_f32(__phase_real); float32x4_t _phase_real = vld1q_f32(__phase_real);
float32x4_t _phase_imag = vld1q_f32(__phase_imag); float32x4_t _phase_imag = vld1q_f32(__phase_imag);
int16x4x2_t a_val, b_val, c_val; int16x4x2_t a_val, b_val, c_val;
__VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; __VOLK_ATTR_ALIGNED(16)
lv_16sc_t dotProductVector[4];
float32x4_t half = vdupq_n_f32(0.5f); float32x4_t half = vdupq_n_f32(0.5f);
int16x4x2_t tmp16; int16x4x2_t tmp16;
int32x4x2_t tmp32i; int32x4x2_t tmp32i;
@ -1426,8 +1444,10 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon(lv_16sc_t*
phase3 = phase2 * phase_inc; phase3 = phase2 * phase_inc;
phase4 = phase3 * phase_inc; phase4 = phase3 * phase_inc;
__VOLK_ATTR_ALIGNED(16) float32_t ____phase_real[4] = { lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) }; __VOLK_ATTR_ALIGNED(16)
__VOLK_ATTR_ALIGNED(16) float32_t ____phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; float32_t ____phase_real[4] = {lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4)};
__VOLK_ATTR_ALIGNED(16)
float32_t ____phase_imag[4] = {lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4)};
_phase_real = vld1q_f32(____phase_real); _phase_real = vld1q_f32(____phase_real);
_phase_imag = vld1q_f32(____phase_imag); _phase_imag = vld1q_f32(____phase_imag);
@ -1495,8 +1515,10 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_vma(lv_16s
float phase_est; float phase_est;
//printf("arg phase0: %f", arg_phase0); //printf("arg phase0: %f", arg_phase0);
lv_32fc_t ___phase4 = phase_inc * phase_inc * phase_inc * phase_inc; lv_32fc_t ___phase4 = phase_inc * phase_inc * phase_inc * phase_inc;
__VOLK_ATTR_ALIGNED(16) float32_t __phase4_real[4] = { lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4) }; __VOLK_ATTR_ALIGNED(16)
__VOLK_ATTR_ALIGNED(16) float32_t __phase4_imag[4] = { lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4) }; float32_t __phase4_real[4] = {lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4)};
__VOLK_ATTR_ALIGNED(16)
float32_t __phase4_imag[4] = {lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4)};
float32x4_t _phase4_real = vld1q_f32(__phase4_real); float32x4_t _phase4_real = vld1q_f32(__phase4_real);
float32x4_t _phase4_imag = vld1q_f32(__phase4_imag); float32x4_t _phase4_imag = vld1q_f32(__phase4_imag);
@ -1505,14 +1527,17 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_vma(lv_16s
lv_32fc_t phase3 = phase2 * phase_inc; lv_32fc_t phase3 = phase2 * phase_inc;
lv_32fc_t phase4 = phase3 * phase_inc; lv_32fc_t phase4 = phase3 * phase_inc;
__VOLK_ATTR_ALIGNED(16) float32_t __phase_real[4] = { lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) }; __VOLK_ATTR_ALIGNED(16)
__VOLK_ATTR_ALIGNED(16) float32_t __phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; float32_t __phase_real[4] = {lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4)};
__VOLK_ATTR_ALIGNED(16)
float32_t __phase_imag[4] = {lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4)};
float32x4_t _phase_real = vld1q_f32(__phase_real); float32x4_t _phase_real = vld1q_f32(__phase_real);
float32x4_t _phase_imag = vld1q_f32(__phase_imag); float32x4_t _phase_imag = vld1q_f32(__phase_imag);
int16x4x2_t a_val, b_val; int16x4x2_t a_val, b_val;
__VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; __VOLK_ATTR_ALIGNED(16)
lv_16sc_t dotProductVector[4];
float32x4_t half = vdupq_n_f32(0.5f); float32x4_t half = vdupq_n_f32(0.5f);
int16x4x2_t tmp16; int16x4x2_t tmp16;
int32x4x2_t tmp32i; int32x4x2_t tmp32i;
@ -1589,8 +1614,10 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_vma(lv_16s
phase3 = phase2 * phase_inc; phase3 = phase2 * phase_inc;
phase4 = phase3 * phase_inc; phase4 = phase3 * phase_inc;
__VOLK_ATTR_ALIGNED(16) float32_t ____phase_real[4] = { lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) }; __VOLK_ATTR_ALIGNED(16)
__VOLK_ATTR_ALIGNED(16) float32_t ____phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; float32_t ____phase_real[4] = {lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4)};
__VOLK_ATTR_ALIGNED(16)
float32_t ____phase_imag[4] = {lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4)};
_phase_real = vld1q_f32(____phase_real); _phase_real = vld1q_f32(____phase_real);
_phase_imag = vld1q_f32(____phase_imag); _phase_imag = vld1q_f32(____phase_imag);
@ -1605,7 +1632,6 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_vma(lv_16s
//_phase_real = vmulq_f32(_phase_real, Round); //_phase_real = vmulq_f32(_phase_real, Round);
//_phase_imag = vmulq_f32(_phase_imag, Round); //_phase_imag = vmulq_f32(_phase_imag, Round);
//printf("After %i: %f,%f, %f\n\n", number, _phase_real[0], _phase_imag[0], sqrt(_phase_real[0]*_phase_real[0]+_phase_imag[0]*_phase_imag[0])); //printf("After %i: %f,%f, %f\n\n", number, _phase_real[0], _phase_imag[0], sqrt(_phase_real[0]*_phase_real[0]+_phase_imag[0]*_phase_imag[0]));
} }
for (n_vec = 0; n_vec < num_a_vectors; n_vec++) for (n_vec = 0; n_vec < num_a_vectors; n_vec++)
@ -1686,8 +1712,10 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_optvma(lv_
float phase_est; float phase_est;
lv_32fc_t ___phase4 = phase_inc * phase_inc * phase_inc * phase_inc; lv_32fc_t ___phase4 = phase_inc * phase_inc * phase_inc * phase_inc;
__VOLK_ATTR_ALIGNED(16) float32_t __phase4_real[4] = { lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4) }; __VOLK_ATTR_ALIGNED(16)
__VOLK_ATTR_ALIGNED(16) float32_t __phase4_imag[4] = { lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4) }; float32_t __phase4_real[4] = {lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4)};
__VOLK_ATTR_ALIGNED(16)
float32_t __phase4_imag[4] = {lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4)};
float32x4_t _phase4_real = vld1q_f32(__phase4_real); float32x4_t _phase4_real = vld1q_f32(__phase4_real);
float32x4_t _phase4_imag = vld1q_f32(__phase4_imag); float32x4_t _phase4_imag = vld1q_f32(__phase4_imag);
@ -1696,14 +1724,17 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_optvma(lv_
lv_32fc_t phase3 = phase2 * phase_inc; lv_32fc_t phase3 = phase2 * phase_inc;
lv_32fc_t phase4 = phase3 * phase_inc; lv_32fc_t phase4 = phase3 * phase_inc;
__VOLK_ATTR_ALIGNED(16) float32_t __phase_real[4] = { lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) }; __VOLK_ATTR_ALIGNED(16)
__VOLK_ATTR_ALIGNED(16) float32_t __phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; float32_t __phase_real[4] = {lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4)};
__VOLK_ATTR_ALIGNED(16)
float32_t __phase_imag[4] = {lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4)};
float32x4_t _phase_real = vld1q_f32(__phase_real); float32x4_t _phase_real = vld1q_f32(__phase_real);
float32x4_t _phase_imag = vld1q_f32(__phase_imag); float32x4_t _phase_imag = vld1q_f32(__phase_imag);
int16x4x2_t a_val, b_val; int16x4x2_t a_val, b_val;
__VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4]; __VOLK_ATTR_ALIGNED(16)
lv_16sc_t dotProductVector[4];
float32x4_t half = vdupq_n_f32(0.5f); float32x4_t half = vdupq_n_f32(0.5f);
int32x4x2_t tmp32i; int32x4x2_t tmp32i;
@ -1782,8 +1813,10 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_optvma(lv_
phase3 = phase2 * phase_inc; phase3 = phase2 * phase_inc;
phase4 = phase3 * phase_inc; phase4 = phase3 * phase_inc;
__VOLK_ATTR_ALIGNED(16) float32_t ____phase_real[4] = { lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) }; __VOLK_ATTR_ALIGNED(16)
__VOLK_ATTR_ALIGNED(16) float32_t ____phase_imag[4] = { lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; float32_t ____phase_real[4] = {lv_creal((*phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4)};
__VOLK_ATTR_ALIGNED(16)
float32_t ____phase_imag[4] = {lv_cimag((*phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4)};
_phase_real = vld1q_f32(____phase_real); _phase_real = vld1q_f32(____phase_real);
_phase_imag = vld1q_f32(____phase_imag); _phase_imag = vld1q_f32(____phase_imag);
@ -1842,4 +1875,3 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn_neon_optvma(lv_
#endif /* LV_HAVE_NEON */ #endif /* LV_HAVE_NEON */
#endif /*INCLUDED_volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_H*/ #endif /*INCLUDED_volk_gnsssdr_16ic_x2_dot_prod_16ic_xn_H*/

View File

@ -379,5 +379,3 @@ static inline void volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_neon_vma(lv
#endif // NEON #endif // NEON
#endif // INCLUDED_volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_H #endif // INCLUDED_volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic_H

View File

@ -106,7 +106,8 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_sse4_1(lv_16sc_t** r
const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips); const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips); const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
__VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; __VOLK_ATTR_ALIGNED(16)
int local_code_chip_index[4];
int local_code_chip_index_; int local_code_chip_index_;
const __m128i zeros = _mm_setzero_si128(); const __m128i zeros = _mm_setzero_si128();
@ -172,7 +173,8 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_sse4_1(lv_16sc_t** r
const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips); const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips); const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
__VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; __VOLK_ATTR_ALIGNED(16)
int local_code_chip_index[4];
int local_code_chip_index_; int local_code_chip_index_;
const __m128i zeros = _mm_setzero_si128(); const __m128i zeros = _mm_setzero_si128();
@ -239,7 +241,8 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_sse3(lv_16sc_t** res
const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips); const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips); const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
__VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; __VOLK_ATTR_ALIGNED(16)
int local_code_chip_index[4];
int local_code_chip_index_; int local_code_chip_index_;
const __m128i zeros = _mm_setzero_si128(); const __m128i zeros = _mm_setzero_si128();
@ -309,7 +312,8 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_sse3(lv_16sc_t** res
const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips); const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips); const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
__VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; __VOLK_ATTR_ALIGNED(16)
int local_code_chip_index[4];
int local_code_chip_index_; int local_code_chip_index_;
const __m128i zeros = _mm_setzero_si128(); const __m128i zeros = _mm_setzero_si128();
@ -378,7 +382,8 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_a_avx(lv_16sc_t** resu
const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips); const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips);
const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips); const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips);
__VOLK_ATTR_ALIGNED(32) int local_code_chip_index[8]; __VOLK_ATTR_ALIGNED(32)
int local_code_chip_index[8];
int local_code_chip_index_; int local_code_chip_index_;
const __m256 zeros = _mm256_setzero_ps(); const __m256 zeros = _mm256_setzero_ps();
@ -455,7 +460,8 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_u_avx(lv_16sc_t** resu
const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips); const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips);
const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips); const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips);
__VOLK_ATTR_ALIGNED(32) int local_code_chip_index[8]; __VOLK_ATTR_ALIGNED(32)
int local_code_chip_index[8];
int local_code_chip_index_; int local_code_chip_index_;
const __m256 zeros = _mm256_setzero_ps(); const __m256 zeros = _mm256_setzero_ps();
@ -530,7 +536,8 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_neon(lv_16sc_t** resul
const float32x4_t rem_code_phase_chips_reg = vdupq_n_f32(rem_code_phase_chips); const float32x4_t rem_code_phase_chips_reg = vdupq_n_f32(rem_code_phase_chips);
const float32x4_t code_phase_step_chips_reg = vdupq_n_f32(code_phase_step_chips); const float32x4_t code_phase_step_chips_reg = vdupq_n_f32(code_phase_step_chips);
__VOLK_ATTR_ALIGNED(16) int32_t local_code_chip_index[4]; __VOLK_ATTR_ALIGNED(16)
int32_t local_code_chip_index[4];
int32_t local_code_chip_index_; int32_t local_code_chip_index_;
const int32x4_t zeros = vdupq_n_s32(0); const int32x4_t zeros = vdupq_n_s32(0);
@ -538,7 +545,8 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_neon(lv_16sc_t** resul
const int32x4_t code_length_chips_reg_i = vdupq_n_s32((int32_t)code_length_chips); const int32x4_t code_length_chips_reg_i = vdupq_n_s32((int32_t)code_length_chips);
int32x4_t local_code_chip_index_reg, aux_i, negatives, i; int32x4_t local_code_chip_index_reg, aux_i, negatives, i;
float32x4_t aux, aux2, shifts_chips_reg, fi, c, j, cTrunc, base, indexn, reciprocal; float32x4_t aux, aux2, shifts_chips_reg, fi, c, j, cTrunc, base, indexn, reciprocal;
__VOLK_ATTR_ALIGNED(16) const float vec[4] = { 0.0f, 1.0f, 2.0f, 3.0f }; __VOLK_ATTR_ALIGNED(16)
const float vec[4] = {0.0f, 1.0f, 2.0f, 3.0f};
uint32x4_t igx; uint32x4_t igx;
reciprocal = vrecpeq_f32(code_length_chips_reg_f); reciprocal = vrecpeq_f32(code_length_chips_reg_f);
reciprocal = vmulq_f32(vrecpsq_f32(code_length_chips_reg_f, reciprocal), reciprocal); reciprocal = vmulq_f32(vrecpsq_f32(code_length_chips_reg_f, reciprocal), reciprocal);
@ -604,4 +612,3 @@ static inline void volk_gnsssdr_16ic_xn_resampler_16ic_xn_neon(lv_16sc_t** resul
#endif /*INCLUDED_volk_gnsssdr_16ic_xn_resampler_16ic_xn_H*/ #endif /*INCLUDED_volk_gnsssdr_16ic_xn_resampler_16ic_xn_H*/

View File

@ -102,20 +102,23 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_a_sse2(lv_16sc_t*
const unsigned int quarterPoints = num_output_samples / 4; const unsigned int quarterPoints = num_output_samples / 4;
lv_16sc_t** _result = result; lv_16sc_t** _result = result;
__VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; __VOLK_ATTR_ALIGNED(16)
int local_code_chip_index[4];
float tmp_rem_code_phase_chips; float tmp_rem_code_phase_chips;
__m128 _rem_code_phase, _code_phase_step_chips; __m128 _rem_code_phase, _code_phase_step_chips;
__m128i _code_length_chips, _code_length_chips_minus1; __m128i _code_length_chips, _code_length_chips_minus1;
__m128 _code_phase_out, _code_phase_out_with_offset; __m128 _code_phase_out, _code_phase_out_with_offset;
_code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips); //load float to all four float values in m128 register _code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips); //load float to all four float values in m128 register
__VOLK_ATTR_ALIGNED(16) int four_times_code_length_chips_minus1[4]; __VOLK_ATTR_ALIGNED(16)
int four_times_code_length_chips_minus1[4];
four_times_code_length_chips_minus1[0] = code_length_chips - 1; four_times_code_length_chips_minus1[0] = code_length_chips - 1;
four_times_code_length_chips_minus1[1] = code_length_chips - 1; four_times_code_length_chips_minus1[1] = code_length_chips - 1;
four_times_code_length_chips_minus1[2] = code_length_chips - 1; four_times_code_length_chips_minus1[2] = code_length_chips - 1;
four_times_code_length_chips_minus1[3] = code_length_chips - 1; four_times_code_length_chips_minus1[3] = code_length_chips - 1;
__VOLK_ATTR_ALIGNED(16) int four_times_code_length_chips[4]; __VOLK_ATTR_ALIGNED(16)
int four_times_code_length_chips[4];
four_times_code_length_chips[0] = code_length_chips; four_times_code_length_chips[0] = code_length_chips;
four_times_code_length_chips[1] = code_length_chips; four_times_code_length_chips[1] = code_length_chips;
four_times_code_length_chips[2] = code_length_chips; four_times_code_length_chips[2] = code_length_chips;
@ -128,9 +131,11 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_a_sse2(lv_16sc_t*
__m128i zero = _mm_setzero_si128(); __m128i zero = _mm_setzero_si128();
__VOLK_ATTR_ALIGNED(16) float init_idx_float[4] = { 0.0f, 1.0f, 2.0f, 3.0f }; __VOLK_ATTR_ALIGNED(16)
float init_idx_float[4] = {0.0f, 1.0f, 2.0f, 3.0f};
__m128 _4output_index = _mm_load_ps(init_idx_float); __m128 _4output_index = _mm_load_ps(init_idx_float);
__VOLK_ATTR_ALIGNED(16) float init_4constant_float[4] = { 4.0f, 4.0f, 4.0f, 4.0f }; __VOLK_ATTR_ALIGNED(16)
float init_4constant_float[4] = {4.0f, 4.0f, 4.0f, 4.0f};
__m128 _4constant_float = _mm_load_ps(init_4constant_float); __m128 _4constant_float = _mm_load_ps(init_4constant_float);
int current_vector = 0; int current_vector = 0;
@ -193,20 +198,23 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_u_sse2(lv_16sc_t*
const unsigned int quarterPoints = num_output_samples / 4; const unsigned int quarterPoints = num_output_samples / 4;
lv_16sc_t** _result = result; lv_16sc_t** _result = result;
__VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; __VOLK_ATTR_ALIGNED(16)
int local_code_chip_index[4];
float tmp_rem_code_phase_chips; float tmp_rem_code_phase_chips;
__m128 _rem_code_phase, _code_phase_step_chips; __m128 _rem_code_phase, _code_phase_step_chips;
__m128i _code_length_chips, _code_length_chips_minus1; __m128i _code_length_chips, _code_length_chips_minus1;
__m128 _code_phase_out, _code_phase_out_with_offset; __m128 _code_phase_out, _code_phase_out_with_offset;
_code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips); //load float to all four float values in m128 register _code_phase_step_chips = _mm_load1_ps(&code_phase_step_chips); //load float to all four float values in m128 register
__VOLK_ATTR_ALIGNED(16) int four_times_code_length_chips_minus1[4]; __VOLK_ATTR_ALIGNED(16)
int four_times_code_length_chips_minus1[4];
four_times_code_length_chips_minus1[0] = code_length_chips - 1; four_times_code_length_chips_minus1[0] = code_length_chips - 1;
four_times_code_length_chips_minus1[1] = code_length_chips - 1; four_times_code_length_chips_minus1[1] = code_length_chips - 1;
four_times_code_length_chips_minus1[2] = code_length_chips - 1; four_times_code_length_chips_minus1[2] = code_length_chips - 1;
four_times_code_length_chips_minus1[3] = code_length_chips - 1; four_times_code_length_chips_minus1[3] = code_length_chips - 1;
__VOLK_ATTR_ALIGNED(16) int four_times_code_length_chips[4]; __VOLK_ATTR_ALIGNED(16)
int four_times_code_length_chips[4];
four_times_code_length_chips[0] = code_length_chips; four_times_code_length_chips[0] = code_length_chips;
four_times_code_length_chips[1] = code_length_chips; four_times_code_length_chips[1] = code_length_chips;
four_times_code_length_chips[2] = code_length_chips; four_times_code_length_chips[2] = code_length_chips;
@ -219,9 +227,11 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_u_sse2(lv_16sc_t*
__m128i zero = _mm_setzero_si128(); __m128i zero = _mm_setzero_si128();
__VOLK_ATTR_ALIGNED(16) float init_idx_float[4] = { 0.0f, 1.0f, 2.0f, 3.0f }; __VOLK_ATTR_ALIGNED(16)
float init_idx_float[4] = {0.0f, 1.0f, 2.0f, 3.0f};
__m128 _4output_index = _mm_loadu_ps(init_idx_float); __m128 _4output_index = _mm_loadu_ps(init_idx_float);
__VOLK_ATTR_ALIGNED(16) float init_4constant_float[4] = { 4.0f, 4.0f, 4.0f, 4.0f }; __VOLK_ATTR_ALIGNED(16)
float init_4constant_float[4] = {4.0f, 4.0f, 4.0f, 4.0f};
__m128 _4constant_float = _mm_loadu_ps(init_4constant_float); __m128 _4constant_float = _mm_loadu_ps(init_4constant_float);
int current_vector = 0; int current_vector = 0;
@ -285,7 +295,8 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_neon(lv_16sc_t**
float32x4_t half = vdupq_n_f32(0.5f); float32x4_t half = vdupq_n_f32(0.5f);
lv_16sc_t** _result = result; lv_16sc_t** _result = result;
__VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; __VOLK_ATTR_ALIGNED(16)
int local_code_chip_index[4];
float tmp_rem_code_phase_chips; float tmp_rem_code_phase_chips;
float32x4_t _rem_code_phase, _code_phase_step_chips; float32x4_t _rem_code_phase, _code_phase_step_chips;
int32x4_t _code_length_chips, _code_length_chips_minus1; int32x4_t _code_length_chips, _code_length_chips_minus1;
@ -293,13 +304,15 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_neon(lv_16sc_t**
float32x4_t sign, PlusHalf, Round; float32x4_t sign, PlusHalf, Round;
_code_phase_step_chips = vld1q_dup_f32(&code_phase_step_chips); //load float to all four float values in float32x4_t register _code_phase_step_chips = vld1q_dup_f32(&code_phase_step_chips); //load float to all four float values in float32x4_t register
__VOLK_ATTR_ALIGNED(16) int four_times_code_length_chips_minus1[4]; __VOLK_ATTR_ALIGNED(16)
int four_times_code_length_chips_minus1[4];
four_times_code_length_chips_minus1[0] = code_length_chips - 1; four_times_code_length_chips_minus1[0] = code_length_chips - 1;
four_times_code_length_chips_minus1[1] = code_length_chips - 1; four_times_code_length_chips_minus1[1] = code_length_chips - 1;
four_times_code_length_chips_minus1[2] = code_length_chips - 1; four_times_code_length_chips_minus1[2] = code_length_chips - 1;
four_times_code_length_chips_minus1[3] = code_length_chips - 1; four_times_code_length_chips_minus1[3] = code_length_chips - 1;
__VOLK_ATTR_ALIGNED(16) int four_times_code_length_chips[4]; __VOLK_ATTR_ALIGNED(16)
int four_times_code_length_chips[4];
four_times_code_length_chips[0] = code_length_chips; four_times_code_length_chips[0] = code_length_chips;
four_times_code_length_chips[1] = code_length_chips; four_times_code_length_chips[1] = code_length_chips;
four_times_code_length_chips[2] = code_length_chips; four_times_code_length_chips[2] = code_length_chips;
@ -312,9 +325,11 @@ static inline void volk_gnsssdr_16ic_xn_resampler_fast_16ic_xn_neon(lv_16sc_t**
uint32x4_t negative_indexes, overflow_indexes; uint32x4_t negative_indexes, overflow_indexes;
int32x4_t zero = vmovq_n_s32(0); int32x4_t zero = vmovq_n_s32(0);
__VOLK_ATTR_ALIGNED(16) float init_idx_float[4] = { 0.0f, 1.0f, 2.0f, 3.0f }; __VOLK_ATTR_ALIGNED(16)
float init_idx_float[4] = {0.0f, 1.0f, 2.0f, 3.0f};
float32x4_t _4output_index = vld1q_f32(init_idx_float); float32x4_t _4output_index = vld1q_f32(init_idx_float);
__VOLK_ATTR_ALIGNED(16) float init_4constant_float[4] = { 4.0f, 4.0f, 4.0f, 4.0f }; __VOLK_ATTR_ALIGNED(16)
float init_4constant_float[4] = {4.0f, 4.0f, 4.0f, 4.0f};
float32x4_t _4constant_float = vld1q_f32(init_4constant_float); float32x4_t _4constant_float = vld1q_f32(init_4constant_float);
int current_vector = 0; int current_vector = 0;

View File

@ -29,7 +29,6 @@
*/ */
/*! /*!
* \page volk_gnsssdr_32f_index_max_32u.h * \page volk_gnsssdr_32f_index_max_32u.h
* *
@ -80,12 +79,15 @@ static inline void volk_gnsssdr_32f_index_max_32u_a_avx(uint32_t* target, const
__m256 compareResults; __m256 compareResults;
__m256 currentValues; __m256 currentValues;
__VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8]; __VOLK_ATTR_ALIGNED(32)
__VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8]; float maxValuesBuffer[8];
__VOLK_ATTR_ALIGNED(32)
float maxIndexesBuffer[8];
for (; number < quarterPoints; number++) for (; number < quarterPoints; number++)
{ {
currentValues = _mm256_load_ps(inputPtr); inputPtr += 8; currentValues = _mm256_load_ps(inputPtr);
inputPtr += 8;
currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues); currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
compareResults = _mm256_cmp_ps(maxValues, currentValues, 0x1e); compareResults = _mm256_cmp_ps(maxValues, currentValues, 0x1e);
maxValuesIndex = _mm256_blendv_ps(currentIndexes, maxValuesIndex, compareResults); maxValuesIndex = _mm256_blendv_ps(currentIndexes, maxValuesIndex, compareResults);
@ -143,12 +145,15 @@ static inline void volk_gnsssdr_32f_index_max_32u_u_avx(uint32_t* target, const
__m256 compareResults; __m256 compareResults;
__m256 currentValues; __m256 currentValues;
__VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8]; __VOLK_ATTR_ALIGNED(32)
__VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8]; float maxValuesBuffer[8];
__VOLK_ATTR_ALIGNED(32)
float maxIndexesBuffer[8];
for (; number < quarterPoints; number++) for (; number < quarterPoints; number++)
{ {
currentValues = _mm256_loadu_ps(inputPtr); inputPtr += 8; currentValues = _mm256_loadu_ps(inputPtr);
inputPtr += 8;
currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues); currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
compareResults = _mm256_cmp_ps(maxValues, currentValues, 0x1e); compareResults = _mm256_cmp_ps(maxValues, currentValues, 0x1e);
maxValuesIndex = _mm256_blendv_ps(currentIndexes, maxValuesIndex, compareResults); maxValuesIndex = _mm256_blendv_ps(currentIndexes, maxValuesIndex, compareResults);
@ -206,12 +211,15 @@ static inline void volk_gnsssdr_32f_index_max_32u_a_sse4_1(uint32_t* target, con
__m128 compareResults; __m128 compareResults;
__m128 currentValues; __m128 currentValues;
__VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; __VOLK_ATTR_ALIGNED(16)
__VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; float maxValuesBuffer[4];
__VOLK_ATTR_ALIGNED(16)
float maxIndexesBuffer[4];
for (; number < quarterPoints; number++) for (; number < quarterPoints; number++)
{ {
currentValues = _mm_load_ps(inputPtr); inputPtr += 4; currentValues = _mm_load_ps(inputPtr);
inputPtr += 4;
currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
compareResults = _mm_cmpgt_ps(maxValues, currentValues); compareResults = _mm_cmpgt_ps(maxValues, currentValues);
maxValuesIndex = _mm_blendv_ps(currentIndexes, maxValuesIndex, compareResults); maxValuesIndex = _mm_blendv_ps(currentIndexes, maxValuesIndex, compareResults);
@ -269,12 +277,15 @@ static inline void volk_gnsssdr_32f_index_max_32u_u_sse4_1(uint32_t* target, con
__m128 compareResults; __m128 compareResults;
__m128 currentValues; __m128 currentValues;
__VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; __VOLK_ATTR_ALIGNED(16)
__VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; float maxValuesBuffer[4];
__VOLK_ATTR_ALIGNED(16)
float maxIndexesBuffer[4];
for (; number < quarterPoints; number++) for (; number < quarterPoints; number++)
{ {
currentValues = _mm_loadu_ps(inputPtr); inputPtr += 4; currentValues = _mm_loadu_ps(inputPtr);
inputPtr += 4;
currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
compareResults = _mm_cmpgt_ps(maxValues, currentValues); compareResults = _mm_cmpgt_ps(maxValues, currentValues);
maxValuesIndex = _mm_blendv_ps(currentIndexes, maxValuesIndex, compareResults); maxValuesIndex = _mm_blendv_ps(currentIndexes, maxValuesIndex, compareResults);
@ -333,12 +344,15 @@ static inline void volk_gnsssdr_32f_index_max_32u_a_sse(uint32_t* target, const
__m128 compareResults; __m128 compareResults;
__m128 currentValues; __m128 currentValues;
__VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; __VOLK_ATTR_ALIGNED(16)
__VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; float maxValuesBuffer[4];
__VOLK_ATTR_ALIGNED(16)
float maxIndexesBuffer[4];
for (; number < quarterPoints; number++) for (; number < quarterPoints; number++)
{ {
currentValues = _mm_load_ps(inputPtr); inputPtr += 4; currentValues = _mm_load_ps(inputPtr);
inputPtr += 4;
currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
compareResults = _mm_cmpgt_ps(maxValues, currentValues); compareResults = _mm_cmpgt_ps(maxValues, currentValues);
maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, maxValuesIndex), _mm_andnot_ps(compareResults, currentIndexes)); maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, maxValuesIndex), _mm_andnot_ps(compareResults, currentIndexes));
@ -397,12 +411,15 @@ static inline void volk_gnsssdr_32f_index_max_32u_u_sse(uint32_t* target, const
__m128 compareResults; __m128 compareResults;
__m128 currentValues; __m128 currentValues;
__VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; __VOLK_ATTR_ALIGNED(16)
__VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; float maxValuesBuffer[4];
__VOLK_ATTR_ALIGNED(16)
float maxIndexesBuffer[4];
for (; number < quarterPoints; number++) for (; number < quarterPoints; number++)
{ {
currentValues = _mm_loadu_ps(inputPtr); inputPtr += 4; currentValues = _mm_loadu_ps(inputPtr);
inputPtr += 4;
currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues); currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
compareResults = _mm_cmpgt_ps(maxValues, currentValues); compareResults = _mm_cmpgt_ps(maxValues, currentValues);
maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, maxValuesIndex), _mm_andnot_ps(compareResults, currentIndexes)); maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, maxValuesIndex), _mm_andnot_ps(compareResults, currentIndexes));
@ -476,7 +493,8 @@ static inline void volk_gnsssdr_32f_index_max_32u_neon(uint32_t* target, const f
float* inputPtr = (float*)src0; float* inputPtr = (float*)src0;
float32x4_t indexIncrementValues = vdupq_n_f32(4); float32x4_t indexIncrementValues = vdupq_n_f32(4);
__VOLK_ATTR_ALIGNED(16) float currentIndexes_float[4] = { -4.0f, -3.0f, -2.0f, -1.0f }; __VOLK_ATTR_ALIGNED(16)
float currentIndexes_float[4] = {-4.0f, -3.0f, -2.0f, -1.0f};
float32x4_t currentIndexes = vld1q_f32(currentIndexes_float); float32x4_t currentIndexes = vld1q_f32(currentIndexes_float);
float max = src0[0]; float max = src0[0];
@ -487,12 +505,15 @@ static inline void volk_gnsssdr_32f_index_max_32u_neon(uint32_t* target, const f
uint32x4_t currentIndexes_u; uint32x4_t currentIndexes_u;
float32x4_t currentValues; float32x4_t currentValues;
__VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4]; __VOLK_ATTR_ALIGNED(16)
__VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4]; float maxValuesBuffer[4];
__VOLK_ATTR_ALIGNED(16)
float maxIndexesBuffer[4];
for (; number < quarterPoints; number++) for (; number < quarterPoints; number++)
{ {
currentValues = vld1q_f32(inputPtr); inputPtr += 4; currentValues = vld1q_f32(inputPtr);
inputPtr += 4;
currentIndexes = vaddq_f32(currentIndexes, indexIncrementValues); currentIndexes = vaddq_f32(currentIndexes, indexIncrementValues);
currentIndexes_u = vcvtq_u32_f32(currentIndexes); currentIndexes_u = vcvtq_u32_f32(currentIndexes);
compareResults = vcgtq_f32(maxValues, currentValues); compareResults = vcgtq_f32(maxValues, currentValues);
@ -528,4 +549,3 @@ static inline void volk_gnsssdr_32f_index_max_32u_neon(uint32_t* target, const f
#endif /*LV_HAVE_NEON*/ #endif /*LV_HAVE_NEON*/
#endif /*INCLUDED_volk_gnsssdr_32f_index_max_32u_H*/ #endif /*INCLUDED_volk_gnsssdr_32f_index_max_32u_H*/

View File

@ -42,7 +42,6 @@
#include <string.h> #include <string.h>
#ifdef LV_HAVE_GENERIC #ifdef LV_HAVE_GENERIC
static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_generic(float* result, const float* local_code, unsigned int num_points) static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_generic(float* result, const float* local_code, unsigned int num_points)
{ {
@ -276,4 +275,3 @@ static inline void volk_gnsssdr_32f_resamplerxnpuppet_32f_neon(float* result, co
#endif #endif
#endif // INCLUDED_volk_gnsssdr_32f_resamplerpuppet_32f_H #endif // INCLUDED_volk_gnsssdr_32f_resamplerpuppet_32f_H

View File

@ -268,26 +268,44 @@ static inline void volk_gnsssdr_32f_sincos_32fc_a_sse2(lv_32fc_t* out, const flo
__m128i emm0, emm2, emm4; __m128i emm0, emm2, emm4;
/* declare some SSE constants */ /* declare some SSE constants */
__VOLK_ATTR_ALIGNED(16) static const int _ps_inv_sign_mask[4] = { ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000 }; __VOLK_ATTR_ALIGNED(16)
__VOLK_ATTR_ALIGNED(16) static const int _ps_sign_mask[4] = { (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000 }; static const int _ps_inv_sign_mask[4] = {~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000};
__VOLK_ATTR_ALIGNED(16)
static const int _ps_sign_mask[4] = {(int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000};
__VOLK_ATTR_ALIGNED(16) static const float _ps_cephes_FOPI[4] = { 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516 }; __VOLK_ATTR_ALIGNED(16)
__VOLK_ATTR_ALIGNED(16) static const int _pi32_1[4] = { 1, 1, 1, 1 }; static const float _ps_cephes_FOPI[4] = {1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516};
__VOLK_ATTR_ALIGNED(16) static const int _pi32_inv1[4] = { ~1, ~1, ~1, ~1 }; __VOLK_ATTR_ALIGNED(16)
__VOLK_ATTR_ALIGNED(16) static const int _pi32_2[4] = { 2, 2, 2, 2}; static const int _pi32_1[4] = {1, 1, 1, 1};
__VOLK_ATTR_ALIGNED(16) static const int _pi32_4[4] = { 4, 4, 4, 4}; __VOLK_ATTR_ALIGNED(16)
static const int _pi32_inv1[4] = {~1, ~1, ~1, ~1};
__VOLK_ATTR_ALIGNED(16)
static const int _pi32_2[4] = {2, 2, 2, 2};
__VOLK_ATTR_ALIGNED(16)
static const int _pi32_4[4] = {4, 4, 4, 4};
__VOLK_ATTR_ALIGNED(16) static const float _ps_minus_cephes_DP1[4] = { -0.78515625, -0.78515625, -0.78515625, -0.78515625 }; __VOLK_ATTR_ALIGNED(16)
__VOLK_ATTR_ALIGNED(16) static const float _ps_minus_cephes_DP2[4] = { -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4 }; static const float _ps_minus_cephes_DP1[4] = {-0.78515625, -0.78515625, -0.78515625, -0.78515625};
__VOLK_ATTR_ALIGNED(16) static const float _ps_minus_cephes_DP3[4] = { -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8 }; __VOLK_ATTR_ALIGNED(16)
__VOLK_ATTR_ALIGNED(16) static const float _ps_coscof_p0[4] = { 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005 }; static const float _ps_minus_cephes_DP2[4] = {-2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4};
__VOLK_ATTR_ALIGNED(16) static const float _ps_coscof_p1[4] = { -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003 }; __VOLK_ATTR_ALIGNED(16)
__VOLK_ATTR_ALIGNED(16) static const float _ps_coscof_p2[4] = { 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002 }; static const float _ps_minus_cephes_DP3[4] = {-3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8};
__VOLK_ATTR_ALIGNED(16) static const float _ps_sincof_p0[4] = { -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4 }; __VOLK_ATTR_ALIGNED(16)
__VOLK_ATTR_ALIGNED(16) static const float _ps_sincof_p1[4] = { 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3 }; static const float _ps_coscof_p0[4] = {2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005};
__VOLK_ATTR_ALIGNED(16) static const float _ps_sincof_p2[4] = { -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1 }; __VOLK_ATTR_ALIGNED(16)
__VOLK_ATTR_ALIGNED(16) static const float _ps_0p5[4] = { 0.5f, 0.5f, 0.5f, 0.5f }; static const float _ps_coscof_p1[4] = {-1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003};
__VOLK_ATTR_ALIGNED(16) static const float _ps_1[4] = { 1.0f, 1.0f, 1.0f, 1.0f }; __VOLK_ATTR_ALIGNED(16)
static const float _ps_coscof_p2[4] = {4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002};
__VOLK_ATTR_ALIGNED(16)
static const float _ps_sincof_p0[4] = {-1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4};
__VOLK_ATTR_ALIGNED(16)
static const float _ps_sincof_p1[4] = {8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3};
__VOLK_ATTR_ALIGNED(16)
static const float _ps_sincof_p2[4] = {-1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1};
__VOLK_ATTR_ALIGNED(16)
static const float _ps_0p5[4] = {0.5f, 0.5f, 0.5f, 0.5f};
__VOLK_ATTR_ALIGNED(16)
static const float _ps_1[4] = {1.0f, 1.0f, 1.0f, 1.0f};
for (; number < sse_iters; number++) for (; number < sse_iters; number++)
{ {
@ -397,7 +415,6 @@ static inline void volk_gnsssdr_32f_sincos_32fc_a_sse2(lv_32fc_t* out, const flo
_in = *aPtr++; _in = *aPtr++;
*bPtr++ = lv_cmake((float)cosf(_in), (float)sinf(_in)); *bPtr++ = lv_cmake((float)cosf(_in), (float)sinf(_in));
} }
} }
#endif /* LV_HAVE_SSE2 */ #endif /* LV_HAVE_SSE2 */
@ -421,26 +438,44 @@ static inline void volk_gnsssdr_32f_sincos_32fc_u_sse2(lv_32fc_t* out, const flo
__m128i emm0, emm2, emm4; __m128i emm0, emm2, emm4;
/* declare some SSE constants */ /* declare some SSE constants */
__VOLK_ATTR_ALIGNED(16) static const int _ps_inv_sign_mask[4] = { ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000 }; __VOLK_ATTR_ALIGNED(16)
__VOLK_ATTR_ALIGNED(16) static const int _ps_sign_mask[4] = { (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000 }; static const int _ps_inv_sign_mask[4] = {~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000};
__VOLK_ATTR_ALIGNED(16)
static const int _ps_sign_mask[4] = {(int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000};
__VOLK_ATTR_ALIGNED(16) static const float _ps_cephes_FOPI[4] = { 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516 }; __VOLK_ATTR_ALIGNED(16)
__VOLK_ATTR_ALIGNED(16) static const int _pi32_1[4] = { 1, 1, 1, 1 }; static const float _ps_cephes_FOPI[4] = {1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516};
__VOLK_ATTR_ALIGNED(16) static const int _pi32_inv1[4] = { ~1, ~1, ~1, ~1 }; __VOLK_ATTR_ALIGNED(16)
__VOLK_ATTR_ALIGNED(16) static const int _pi32_2[4] = { 2, 2, 2, 2}; static const int _pi32_1[4] = {1, 1, 1, 1};
__VOLK_ATTR_ALIGNED(16) static const int _pi32_4[4] = { 4, 4, 4, 4}; __VOLK_ATTR_ALIGNED(16)
static const int _pi32_inv1[4] = {~1, ~1, ~1, ~1};
__VOLK_ATTR_ALIGNED(16)
static const int _pi32_2[4] = {2, 2, 2, 2};
__VOLK_ATTR_ALIGNED(16)
static const int _pi32_4[4] = {4, 4, 4, 4};
__VOLK_ATTR_ALIGNED(16) static const float _ps_minus_cephes_DP1[4] = { -0.78515625, -0.78515625, -0.78515625, -0.78515625 }; __VOLK_ATTR_ALIGNED(16)
__VOLK_ATTR_ALIGNED(16) static const float _ps_minus_cephes_DP2[4] = { -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4 }; static const float _ps_minus_cephes_DP1[4] = {-0.78515625, -0.78515625, -0.78515625, -0.78515625};
__VOLK_ATTR_ALIGNED(16) static const float _ps_minus_cephes_DP3[4] = { -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8 }; __VOLK_ATTR_ALIGNED(16)
__VOLK_ATTR_ALIGNED(16) static const float _ps_coscof_p0[4] = { 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005 }; static const float _ps_minus_cephes_DP2[4] = {-2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4};
__VOLK_ATTR_ALIGNED(16) static const float _ps_coscof_p1[4] = { -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003 }; __VOLK_ATTR_ALIGNED(16)
__VOLK_ATTR_ALIGNED(16) static const float _ps_coscof_p2[4] = { 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002 }; static const float _ps_minus_cephes_DP3[4] = {-3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8};
__VOLK_ATTR_ALIGNED(16) static const float _ps_sincof_p0[4] = { -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4 }; __VOLK_ATTR_ALIGNED(16)
__VOLK_ATTR_ALIGNED(16) static const float _ps_sincof_p1[4] = { 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3 }; static const float _ps_coscof_p0[4] = {2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005};
__VOLK_ATTR_ALIGNED(16) static const float _ps_sincof_p2[4] = { -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1 }; __VOLK_ATTR_ALIGNED(16)
__VOLK_ATTR_ALIGNED(16) static const float _ps_0p5[4] = { 0.5f, 0.5f, 0.5f, 0.5f }; static const float _ps_coscof_p1[4] = {-1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003};
__VOLK_ATTR_ALIGNED(16) static const float _ps_1[4] = { 1.0f, 1.0f, 1.0f, 1.0f }; __VOLK_ATTR_ALIGNED(16)
static const float _ps_coscof_p2[4] = {4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002};
__VOLK_ATTR_ALIGNED(16)
static const float _ps_sincof_p0[4] = {-1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4};
__VOLK_ATTR_ALIGNED(16)
static const float _ps_sincof_p1[4] = {8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3};
__VOLK_ATTR_ALIGNED(16)
static const float _ps_sincof_p2[4] = {-1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1};
__VOLK_ATTR_ALIGNED(16)
static const float _ps_0p5[4] = {0.5f, 0.5f, 0.5f, 0.5f};
__VOLK_ATTR_ALIGNED(16)
static const float _ps_1[4] = {1.0f, 1.0f, 1.0f, 1.0f};
for (; number < sse_iters; number++) for (; number < sse_iters; number++)
{ {
@ -550,7 +585,6 @@ static inline void volk_gnsssdr_32f_sincos_32fc_u_sse2(lv_32fc_t* out, const flo
_in = *aPtr++; _in = *aPtr++;
*bPtr++ = lv_cmake((float)cosf(_in), (float)sinf(_in)); *bPtr++ = lv_cmake((float)cosf(_in), (float)sinf(_in));
} }
} }
#endif /* LV_HAVE_SSE2 */ #endif /* LV_HAVE_SSE2 */

View File

@ -110,7 +110,8 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_a_sse3(float** result, c
const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips); const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips); const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
__VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; __VOLK_ATTR_ALIGNED(16)
int local_code_chip_index[4];
int local_code_chip_index_; int local_code_chip_index_;
const __m128i zeros = _mm_setzero_si128(); const __m128i zeros = _mm_setzero_si128();
@ -180,7 +181,8 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_u_sse3(float** result, c
const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips); const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips); const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
__VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; __VOLK_ATTR_ALIGNED(16)
int local_code_chip_index[4];
int local_code_chip_index_; int local_code_chip_index_;
const __m128i zeros = _mm_setzero_si128(); const __m128i zeros = _mm_setzero_si128();
@ -248,7 +250,8 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_a_sse4_1(float** result,
const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips); const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips); const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
__VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; __VOLK_ATTR_ALIGNED(16)
int local_code_chip_index[4];
int local_code_chip_index_; int local_code_chip_index_;
const __m128i zeros = _mm_setzero_si128(); const __m128i zeros = _mm_setzero_si128();
@ -314,7 +317,8 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_u_sse4_1(float** result,
const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips); const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips); const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
__VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; __VOLK_ATTR_ALIGNED(16)
int local_code_chip_index[4];
int local_code_chip_index_; int local_code_chip_index_;
const __m128i zeros = _mm_setzero_si128(); const __m128i zeros = _mm_setzero_si128();
@ -380,7 +384,8 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_a_avx(float** result, co
const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips); const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips);
const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips); const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips);
__VOLK_ATTR_ALIGNED(32) int local_code_chip_index[8]; __VOLK_ATTR_ALIGNED(32)
int local_code_chip_index[8];
int local_code_chip_index_; int local_code_chip_index_;
const __m256 zeros = _mm256_setzero_ps(); const __m256 zeros = _mm256_setzero_ps();
@ -457,7 +462,8 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_u_avx(float** result, co
const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips); const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips);
const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips); const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips);
__VOLK_ATTR_ALIGNED(32) int local_code_chip_index[8]; __VOLK_ATTR_ALIGNED(32)
int local_code_chip_index[8];
int local_code_chip_index_; int local_code_chip_index_;
const __m256 zeros = _mm256_setzero_ps(); const __m256 zeros = _mm256_setzero_ps();
@ -536,7 +542,8 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_neon(float** result, con
const float32x4_t rem_code_phase_chips_reg = vdupq_n_f32(rem_code_phase_chips); const float32x4_t rem_code_phase_chips_reg = vdupq_n_f32(rem_code_phase_chips);
const float32x4_t code_phase_step_chips_reg = vdupq_n_f32(code_phase_step_chips); const float32x4_t code_phase_step_chips_reg = vdupq_n_f32(code_phase_step_chips);
__VOLK_ATTR_ALIGNED(16) int32_t local_code_chip_index[4]; __VOLK_ATTR_ALIGNED(16)
int32_t local_code_chip_index[4];
int32_t local_code_chip_index_; int32_t local_code_chip_index_;
const int32x4_t zeros = vdupq_n_s32(0); const int32x4_t zeros = vdupq_n_s32(0);
@ -544,7 +551,8 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_neon(float** result, con
const int32x4_t code_length_chips_reg_i = vdupq_n_s32((int32_t)code_length_chips); const int32x4_t code_length_chips_reg_i = vdupq_n_s32((int32_t)code_length_chips);
int32x4_t local_code_chip_index_reg, aux_i, negatives, i; int32x4_t local_code_chip_index_reg, aux_i, negatives, i;
float32x4_t aux, aux2, shifts_chips_reg, fi, c, j, cTrunc, base, indexn, reciprocal; float32x4_t aux, aux2, shifts_chips_reg, fi, c, j, cTrunc, base, indexn, reciprocal;
__VOLK_ATTR_ALIGNED(16) const float vec[4] = { 0.0f, 1.0f, 2.0f, 3.0f }; __VOLK_ATTR_ALIGNED(16)
const float vec[4] = {0.0f, 1.0f, 2.0f, 3.0f};
uint32x4_t igx; uint32x4_t igx;
reciprocal = vrecpeq_f32(code_length_chips_reg_f); reciprocal = vrecpeq_f32(code_length_chips_reg_f);
reciprocal = vmulq_f32(vrecpsq_f32(code_length_chips_reg_f, reciprocal), reciprocal); reciprocal = vmulq_f32(vrecpsq_f32(code_length_chips_reg_f, reciprocal), reciprocal);
@ -606,5 +614,3 @@ static inline void volk_gnsssdr_32f_xn_resampler_32f_xn_neon(float** result, con
#endif #endif
#endif /*INCLUDED_volk_gnsssdr_32f_xn_resampler_32f_xn_H*/ #endif /*INCLUDED_volk_gnsssdr_32f_xn_resampler_32f_xn_H*/

View File

@ -204,7 +204,8 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_
// Set up the complex rotator // Set up the complex rotator
__m256 z0, z1, z2, z3; __m256 z0, z1, z2, z3;
__VOLK_ATTR_ALIGNED(32) lv_32fc_t phase_vec[16]; __VOLK_ATTR_ALIGNED(32)
lv_32fc_t phase_vec[16];
for (vec_ind = 0; vec_ind < 16; ++vec_ind) for (vec_ind = 0; vec_ind < 16; ++vec_ind)
{ {
phase_vec[vec_ind] = _phase; phase_vec[vec_ind] = _phase;
@ -216,7 +217,11 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_
z2 = _mm256_load_ps((float*)(phase_vec + 8)); z2 = _mm256_load_ps((float*)(phase_vec + 8));
z3 = _mm256_load_ps((float*)(phase_vec + 12)); z3 = _mm256_load_ps((float*)(phase_vec + 12));
lv_32fc_t dz = phase_inc; dz *= dz; dz *= dz; dz *= dz; dz *= dz; // dz = phase_inc^16; lv_32fc_t dz = phase_inc;
dz *= dz;
dz *= dz;
dz *= dz;
dz *= dz; // dz = phase_inc^16;
for (vec_ind = 0; vec_ind < 4; ++vec_ind) for (vec_ind = 0; vec_ind < 4; ++vec_ind)
{ {
@ -282,7 +287,8 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_
aPtr += 32; aPtr += 32;
} }
__VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4]; __VOLK_ATTR_ALIGNED(32)
lv_32fc_t dotProductVector[4];
for (vec_ind = 0; vec_ind < num_a_vectors; ++vec_ind) for (vec_ind = 0; vec_ind < num_a_vectors; ++vec_ind)
{ {
@ -362,7 +368,8 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_
// Set up the complex rotator // Set up the complex rotator
__m256 z0, z1, z2, z3; __m256 z0, z1, z2, z3;
__VOLK_ATTR_ALIGNED(32) lv_32fc_t phase_vec[16]; __VOLK_ATTR_ALIGNED(32)
lv_32fc_t phase_vec[16];
for (vec_ind = 0; vec_ind < 16; ++vec_ind) for (vec_ind = 0; vec_ind < 16; ++vec_ind)
{ {
phase_vec[vec_ind] = _phase; phase_vec[vec_ind] = _phase;
@ -374,7 +381,11 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_
z2 = _mm256_load_ps((float*)(phase_vec + 8)); z2 = _mm256_load_ps((float*)(phase_vec + 8));
z3 = _mm256_load_ps((float*)(phase_vec + 12)); z3 = _mm256_load_ps((float*)(phase_vec + 12));
lv_32fc_t dz = phase_inc; dz *= dz; dz *= dz; dz *= dz; dz *= dz; // dz = phase_inc^16; lv_32fc_t dz = phase_inc;
dz *= dz;
dz *= dz;
dz *= dz;
dz *= dz; // dz = phase_inc^16;
for (vec_ind = 0; vec_ind < 4; ++vec_ind) for (vec_ind = 0; vec_ind < 4; ++vec_ind)
{ {
@ -386,7 +397,6 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_
for (; number < sixteenthPoints; number++) for (; number < sixteenthPoints; number++)
{ {
a0Val = _mm256_load_ps(aPtr); a0Val = _mm256_load_ps(aPtr);
a1Val = _mm256_load_ps(aPtr + 8); a1Val = _mm256_load_ps(aPtr + 8);
a2Val = _mm256_load_ps(aPtr + 16); a2Val = _mm256_load_ps(aPtr + 16);
@ -441,7 +451,8 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_
aPtr += 32; aPtr += 32;
} }
__VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4]; __VOLK_ATTR_ALIGNED(32)
lv_32fc_t dotProductVector[4];
for (vec_ind = 0; vec_ind < num_a_vectors; ++vec_ind) for (vec_ind = 0; vec_ind < num_a_vectors; ++vec_ind)
{ {
@ -482,5 +493,3 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_
#endif /* LV_HAVE_AVX */ #endif /* LV_HAVE_AVX */
#endif /* INCLUDED_volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_H */ #endif /* INCLUDED_volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn_H */

View File

@ -159,4 +159,3 @@ static inline void volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_a_avx(lv_3
#endif // AVX #endif // AVX
#endif // INCLUDED_volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_H #endif // INCLUDED_volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc_H

View File

@ -82,8 +82,10 @@ static inline void volk_gnsssdr_32fc_convert_16ic_u_sse2(lv_16sc_t* outputVector
for (i = 0; i < sse_iters; i++) for (i = 0; i < sse_iters; i++)
{ {
inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; inputVal1 = _mm_loadu_ps((float*)inputVectorPtr);
inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; inputVectorPtr += 4;
inputVal2 = _mm_loadu_ps((float*)inputVectorPtr);
inputVectorPtr += 4;
__VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 8); __VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 8);
// Clip // Clip
@ -135,8 +137,10 @@ static inline void volk_gnsssdr_32fc_convert_16ic_u_sse(lv_16sc_t* outputVector,
for (i = 0; i < sse_iters; i++) for (i = 0; i < sse_iters; i++)
{ {
inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; inputVal1 = _mm_loadu_ps((float*)inputVectorPtr);
inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; inputVectorPtr += 4;
inputVal2 = _mm_loadu_ps((float*)inputVectorPtr);
inputVectorPtr += 4;
__VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 8); __VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 8);
// Clip // Clip
@ -186,8 +190,10 @@ static inline void volk_gnsssdr_32fc_convert_16ic_u_avx2(lv_16sc_t* outputVector
for (i = 0; i < avx2_iters; i++) for (i = 0; i < avx2_iters; i++)
{ {
inputVal1 = _mm256_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 8; inputVal1 = _mm256_loadu_ps((float*)inputVectorPtr);
inputVal2 = _mm256_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 8; inputVectorPtr += 8;
inputVal2 = _mm256_loadu_ps((float*)inputVectorPtr);
inputVectorPtr += 8;
__VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 16); __VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 16);
// Clip // Clip
@ -240,8 +246,10 @@ static inline void volk_gnsssdr_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector
for (i = 0; i < sse_iters; i++) for (i = 0; i < sse_iters; i++)
{ {
inputVal1 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; inputVal1 = _mm_load_ps((float*)inputVectorPtr);
inputVal2 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; inputVectorPtr += 4;
inputVal2 = _mm_load_ps((float*)inputVectorPtr);
inputVectorPtr += 4;
__VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 8); __VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 8);
// Clip // Clip
@ -291,8 +299,10 @@ static inline void volk_gnsssdr_32fc_convert_16ic_a_sse(lv_16sc_t* outputVector,
for (i = 0; i < sse_iters; i++) for (i = 0; i < sse_iters; i++)
{ {
inputVal1 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; inputVal1 = _mm_load_ps((float*)inputVectorPtr);
inputVal2 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; inputVectorPtr += 4;
inputVal2 = _mm_load_ps((float*)inputVectorPtr);
inputVectorPtr += 4;
__VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 8); __VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 8);
// Clip // Clip
@ -343,8 +353,10 @@ static inline void volk_gnsssdr_32fc_convert_16ic_a_avx2(lv_16sc_t* outputVector
for (i = 0; i < avx2_iters; i++) for (i = 0; i < avx2_iters; i++)
{ {
inputVal1 = _mm256_load_ps((float*)inputVectorPtr); inputVectorPtr += 8; inputVal1 = _mm256_load_ps((float*)inputVectorPtr);
inputVal2 = _mm256_load_ps((float*)inputVectorPtr); inputVectorPtr += 8; inputVectorPtr += 8;
inputVal2 = _mm256_load_ps((float*)inputVectorPtr);
inputVectorPtr += 8;
__VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 16); __VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 16);
// Clip // Clip
@ -399,8 +411,10 @@ static inline void volk_gnsssdr_32fc_convert_16ic_neon(lv_16sc_t* outputVector,
for (i = 0; i < neon_iters; i++) for (i = 0; i < neon_iters; i++)
{ {
a = vld1q_f32((const float32_t*)(inputVectorPtr)); inputVectorPtr += 4; a = vld1q_f32((const float32_t*)(inputVectorPtr));
b = vld1q_f32((const float32_t*)(inputVectorPtr)); inputVectorPtr += 4; inputVectorPtr += 4;
b = vld1q_f32((const float32_t*)(inputVectorPtr));
inputVectorPtr += 4;
__VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 8); __VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 8);
ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val); ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val);

View File

@ -109,10 +109,14 @@ static inline void volk_gnsssdr_32fc_convert_8ic_u_avx2(lv_8sc_t* outputVector,
for (i = 0; i < avx2_iters; i++) for (i = 0; i < avx2_iters; i++)
{ {
inputVal1 = _mm256_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 8; inputVal1 = _mm256_loadu_ps((float*)inputVectorPtr);
inputVal2 = _mm256_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 8; inputVectorPtr += 8;
inputVal3 = _mm256_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 8; inputVal2 = _mm256_loadu_ps((float*)inputVectorPtr);
inputVal4 = _mm256_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 8; inputVectorPtr += 8;
inputVal3 = _mm256_loadu_ps((float*)inputVectorPtr);
inputVectorPtr += 8;
inputVal4 = _mm256_loadu_ps((float*)inputVectorPtr);
inputVectorPtr += 8;
__VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 32); __VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 32);
inputVal1 = _mm256_mul_ps(inputVal1, vmax_val); inputVal1 = _mm256_mul_ps(inputVal1, vmax_val);
@ -179,10 +183,14 @@ static inline void volk_gnsssdr_32fc_convert_8ic_a_avx2(lv_8sc_t* outputVector,
for (i = 0; i < avx2_iters; i++) for (i = 0; i < avx2_iters; i++)
{ {
inputVal1 = _mm256_load_ps((float*)inputVectorPtr); inputVectorPtr += 8; inputVal1 = _mm256_load_ps((float*)inputVectorPtr);
inputVal2 = _mm256_load_ps((float*)inputVectorPtr); inputVectorPtr += 8; inputVectorPtr += 8;
inputVal3 = _mm256_load_ps((float*)inputVectorPtr); inputVectorPtr += 8; inputVal2 = _mm256_load_ps((float*)inputVectorPtr);
inputVal4 = _mm256_load_ps((float*)inputVectorPtr); inputVectorPtr += 8; inputVectorPtr += 8;
inputVal3 = _mm256_load_ps((float*)inputVectorPtr);
inputVectorPtr += 8;
inputVal4 = _mm256_load_ps((float*)inputVectorPtr);
inputVectorPtr += 8;
__VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 32); __VOLK_GNSSSDR_PREFETCH(inputVectorPtr + 32);
inputVal1 = _mm256_mul_ps(inputVal1, vmax_val); inputVal1 = _mm256_mul_ps(inputVal1, vmax_val);
@ -249,10 +257,14 @@ static inline void volk_gnsssdr_32fc_convert_8ic_u_sse2(lv_8sc_t* outputVector,
for (i = 0; i < sse_iters; i++) for (i = 0; i < sse_iters; i++)
{ {
inputVal1 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; inputVal1 = _mm_loadu_ps((float*)inputVectorPtr);
inputVal2 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; inputVectorPtr += 4;
inputVal3 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; inputVal2 = _mm_loadu_ps((float*)inputVectorPtr);
inputVal4 = _mm_loadu_ps((float*)inputVectorPtr); inputVectorPtr += 4; inputVectorPtr += 4;
inputVal3 = _mm_loadu_ps((float*)inputVectorPtr);
inputVectorPtr += 4;
inputVal4 = _mm_loadu_ps((float*)inputVectorPtr);
inputVectorPtr += 4;
inputVal1 = _mm_mul_ps(inputVal1, vmax_val); inputVal1 = _mm_mul_ps(inputVal1, vmax_val);
inputVal2 = _mm_mul_ps(inputVal2, vmax_val); inputVal2 = _mm_mul_ps(inputVal2, vmax_val);
@ -315,10 +327,14 @@ static inline void volk_gnsssdr_32fc_convert_8ic_a_sse2(lv_8sc_t* outputVector,
for (i = 0; i < sse_iters; i++) for (i = 0; i < sse_iters; i++)
{ {
inputVal1 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; inputVal1 = _mm_load_ps((float*)inputVectorPtr);
inputVal2 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; inputVectorPtr += 4;
inputVal3 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; inputVal2 = _mm_load_ps((float*)inputVectorPtr);
inputVal4 = _mm_load_ps((float*)inputVectorPtr); inputVectorPtr += 4; inputVectorPtr += 4;
inputVal3 = _mm_load_ps((float*)inputVectorPtr);
inputVectorPtr += 4;
inputVal4 = _mm_load_ps((float*)inputVectorPtr);
inputVectorPtr += 4;
inputVal1 = _mm_mul_ps(inputVal1, vmax_val); inputVal1 = _mm_mul_ps(inputVal1, vmax_val);
inputVal2 = _mm_mul_ps(inputVal2, vmax_val); inputVal2 = _mm_mul_ps(inputVal2, vmax_val);
@ -385,7 +401,8 @@ static inline void volk_gnsssdr_32fc_convert_8ic_neon(lv_8sc_t* outputVector, co
for (i = 0; i < neon_iters; i++) for (i = 0; i < neon_iters; i++)
{ {
a = vld1q_f32((const float32_t*)inputVectorPtr); inputVectorPtr += 4; a = vld1q_f32((const float32_t*)inputVectorPtr);
inputVectorPtr += 4;
a = vmulq_f32(a, max_val); a = vmulq_f32(a, max_val);
ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val); ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val);
sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(ret1), 31))); sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(ret1), 31)));
@ -394,7 +411,8 @@ static inline void volk_gnsssdr_32fc_convert_8ic_neon(lv_8sc_t* outputVector, co
toint_a = vcvtq_s32_f32(Round); toint_a = vcvtq_s32_f32(Round);
intInputVal1 = vqmovn_s32(toint_a); intInputVal1 = vqmovn_s32(toint_a);
a = vld1q_f32((const float32_t*)inputVectorPtr); inputVectorPtr += 4; a = vld1q_f32((const float32_t*)inputVectorPtr);
inputVectorPtr += 4;
a = vmulq_f32(a, max_val); a = vmulq_f32(a, max_val);
ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val); ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val);
sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(ret1), 31))); sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(ret1), 31)));
@ -406,7 +424,8 @@ static inline void volk_gnsssdr_32fc_convert_8ic_neon(lv_8sc_t* outputVector, co
pack16_8_1 = vcombine_s16(intInputVal1, intInputVal2); pack16_8_1 = vcombine_s16(intInputVal1, intInputVal2);
res8_1 = vqmovn_s16(pack16_8_1); res8_1 = vqmovn_s16(pack16_8_1);
a = vld1q_f32((const float32_t*)inputVectorPtr); inputVectorPtr += 4; a = vld1q_f32((const float32_t*)inputVectorPtr);
inputVectorPtr += 4;
a = vmulq_f32(a, max_val); a = vmulq_f32(a, max_val);
ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val); ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val);
sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(ret1), 31))); sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(ret1), 31)));
@ -415,7 +434,8 @@ static inline void volk_gnsssdr_32fc_convert_8ic_neon(lv_8sc_t* outputVector, co
toint_a = vcvtq_s32_f32(Round); toint_a = vcvtq_s32_f32(Round);
intInputVal1 = vqmovn_s32(toint_a); intInputVal1 = vqmovn_s32(toint_a);
a = vld1q_f32((const float32_t*)inputVectorPtr); inputVectorPtr += 4; a = vld1q_f32((const float32_t*)inputVectorPtr);
inputVectorPtr += 4;
a = vmulq_f32(a, max_val); a = vmulq_f32(a, max_val);
ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val); ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val);
sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(ret1), 31))); sign = vcvtq_f32_u32((vshrq_n_u32(vreinterpretq_u32_f32(ret1), 31)));

View File

@ -42,7 +42,6 @@
#include <string.h> #include <string.h>
#ifdef LV_HAVE_GENERIC #ifdef LV_HAVE_GENERIC
static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_generic(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points) static inline void volk_gnsssdr_32fc_resamplerxnpuppet_32fc_generic(lv_32fc_t* result, const lv_32fc_t* local_code, unsigned int num_points)
{ {

View File

@ -179,7 +179,8 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(lv_32fc_
const lv_32fc_t** _in_a = in_a; const lv_32fc_t** _in_a = in_a;
const lv_32fc_t* _in_common = in_common; const lv_32fc_t* _in_common = in_common;
__VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2]; __VOLK_ATTR_ALIGNED(16)
lv_32fc_t dotProductVector[2];
__m128* acc = (__m128*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128), volk_gnsssdr_get_alignment()); __m128* acc = (__m128*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128), volk_gnsssdr_get_alignment());
@ -191,11 +192,13 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_sse3(lv_32fc_
// phase rotation registers // phase rotation registers
__m128 a, two_phase_acc_reg, two_phase_inc_reg, yl, yh, tmp1, tmp1p, tmp2, tmp2p, z1; __m128 a, two_phase_acc_reg, two_phase_inc_reg, yl, yh, tmp1, tmp1p, tmp2, tmp2p, z1;
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_inc[2]; __VOLK_ATTR_ALIGNED(16)
lv_32fc_t two_phase_inc[2];
two_phase_inc[0] = phase_inc * phase_inc; two_phase_inc[0] = phase_inc * phase_inc;
two_phase_inc[1] = phase_inc * phase_inc; two_phase_inc[1] = phase_inc * phase_inc;
two_phase_inc_reg = _mm_load_ps((float*)two_phase_inc); two_phase_inc_reg = _mm_load_ps((float*)two_phase_inc);
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_acc[2]; __VOLK_ATTR_ALIGNED(16)
lv_32fc_t two_phase_acc[2];
two_phase_acc[0] = (*phase); two_phase_acc[0] = (*phase);
two_phase_acc[1] = (*phase) * phase_inc; two_phase_acc[1] = (*phase) * phase_inc;
two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc); two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc);
@ -288,7 +291,8 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(lv_32fc_
const lv_32fc_t** _in_a = in_a; const lv_32fc_t** _in_a = in_a;
const lv_32fc_t* _in_common = in_common; const lv_32fc_t* _in_common = in_common;
__VOLK_ATTR_ALIGNED(16) lv_32fc_t dotProductVector[2]; __VOLK_ATTR_ALIGNED(16)
lv_32fc_t dotProductVector[2];
__m128* acc = (__m128*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128), volk_gnsssdr_get_alignment()); __m128* acc = (__m128*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m128), volk_gnsssdr_get_alignment());
@ -300,11 +304,13 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_sse3(lv_32fc_
// phase rotation registers // phase rotation registers
__m128 a, two_phase_acc_reg, two_phase_inc_reg, yl, yh, tmp1, tmp1p, tmp2, tmp2p, z1; __m128 a, two_phase_acc_reg, two_phase_inc_reg, yl, yh, tmp1, tmp1p, tmp2, tmp2p, z1;
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_inc[2]; __VOLK_ATTR_ALIGNED(16)
lv_32fc_t two_phase_inc[2];
two_phase_inc[0] = phase_inc * phase_inc; two_phase_inc[0] = phase_inc * phase_inc;
two_phase_inc[1] = phase_inc * phase_inc; two_phase_inc[1] = phase_inc * phase_inc;
two_phase_inc_reg = _mm_load_ps((float*)two_phase_inc); two_phase_inc_reg = _mm_load_ps((float*)two_phase_inc);
__VOLK_ATTR_ALIGNED(16) lv_32fc_t two_phase_acc[2]; __VOLK_ATTR_ALIGNED(16)
lv_32fc_t two_phase_acc[2];
two_phase_acc[0] = (*phase); two_phase_acc[0] = (*phase);
two_phase_acc[1] = (*phase) * phase_inc; two_phase_acc[1] = (*phase) * phase_inc;
two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc); two_phase_acc_reg = _mm_load_ps((float*)two_phase_acc);
@ -398,7 +404,8 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_u_avx(lv_32fc_t
const lv_32fc_t* _in_common = in_common; const lv_32fc_t* _in_common = in_common;
lv_32fc_t _phase = (*phase); lv_32fc_t _phase = (*phase);
__VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4]; __VOLK_ATTR_ALIGNED(32)
lv_32fc_t dotProductVector[4];
__m256* acc = (__m256*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256), volk_gnsssdr_get_alignment()); __m256* acc = (__m256*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256), volk_gnsssdr_get_alignment());
@ -525,7 +532,8 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t
const lv_32fc_t* _in_common = in_common; const lv_32fc_t* _in_common = in_common;
lv_32fc_t _phase = (*phase); lv_32fc_t _phase = (*phase);
__VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4]; __VOLK_ATTR_ALIGNED(32)
lv_32fc_t dotProductVector[4];
__m256* acc = (__m256*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256), volk_gnsssdr_get_alignment()); __m256* acc = (__m256*)volk_gnsssdr_malloc(num_a_vectors * sizeof(__m256), volk_gnsssdr_get_alignment());
@ -538,7 +546,8 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t
// phase rotation registers // phase rotation registers
__m256 a, four_phase_acc_reg, yl, yh, tmp1, tmp1p, tmp2, tmp2p, z; __m256 a, four_phase_acc_reg, yl, yh, tmp1, tmp1p, tmp2, tmp2p, z;
__VOLK_ATTR_ALIGNED(32) lv_32fc_t four_phase_inc[4]; __VOLK_ATTR_ALIGNED(32)
lv_32fc_t four_phase_inc[4];
const lv_32fc_t phase_inc2 = phase_inc * phase_inc; const lv_32fc_t phase_inc2 = phase_inc * phase_inc;
const lv_32fc_t phase_inc3 = phase_inc2 * phase_inc; const lv_32fc_t phase_inc3 = phase_inc2 * phase_inc;
const lv_32fc_t phase_inc4 = phase_inc3 * phase_inc; const lv_32fc_t phase_inc4 = phase_inc3 * phase_inc;
@ -548,7 +557,8 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_a_avx(lv_32fc_t
four_phase_inc[3] = phase_inc4; four_phase_inc[3] = phase_inc4;
const __m256 four_phase_inc_reg = _mm256_load_ps((float*)four_phase_inc); const __m256 four_phase_inc_reg = _mm256_load_ps((float*)four_phase_inc);
__VOLK_ATTR_ALIGNED(32) lv_32fc_t four_phase_acc[4]; __VOLK_ATTR_ALIGNED(32)
lv_32fc_t four_phase_acc[4];
four_phase_acc[0] = _phase; four_phase_acc[0] = _phase;
four_phase_acc[1] = _phase * phase_inc; four_phase_acc[1] = _phase * phase_inc;
four_phase_acc[2] = _phase * phase_inc2; four_phase_acc[2] = _phase * phase_inc2;
@ -662,8 +672,10 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_neon(lv_32fc_t*
float32_t phase_est; float32_t phase_est;
lv_32fc_t ___phase4 = phase_inc * phase_inc * phase_inc * phase_inc; lv_32fc_t ___phase4 = phase_inc * phase_inc * phase_inc * phase_inc;
__VOLK_ATTR_ALIGNED(16) float32_t __phase4_real[4] = { lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4) }; __VOLK_ATTR_ALIGNED(16)
__VOLK_ATTR_ALIGNED(16) float32_t __phase4_imag[4] = { lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4) }; float32_t __phase4_real[4] = {lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4), lv_creal(___phase4)};
__VOLK_ATTR_ALIGNED(16)
float32_t __phase4_imag[4] = {lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4), lv_cimag(___phase4)};
float32x4_t _phase4_real = vld1q_f32(__phase4_real); float32x4_t _phase4_real = vld1q_f32(__phase4_real);
float32x4_t _phase4_imag = vld1q_f32(__phase4_imag); float32x4_t _phase4_imag = vld1q_f32(__phase4_imag);
@ -672,13 +684,16 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_neon(lv_32fc_t*
lv_32fc_t phase3 = phase2 * phase_inc; lv_32fc_t phase3 = phase2 * phase_inc;
lv_32fc_t phase4 = phase3 * phase_inc; lv_32fc_t phase4 = phase3 * phase_inc;
__VOLK_ATTR_ALIGNED(16) float32_t __phase_real[4] = { lv_creal((_phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) }; __VOLK_ATTR_ALIGNED(16)
__VOLK_ATTR_ALIGNED(16) float32_t __phase_imag[4] = { lv_cimag((_phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; float32_t __phase_real[4] = {lv_creal((_phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4)};
__VOLK_ATTR_ALIGNED(16)
float32_t __phase_imag[4] = {lv_cimag((_phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4)};
float32x4_t _phase_real = vld1q_f32(__phase_real); float32x4_t _phase_real = vld1q_f32(__phase_real);
float32x4_t _phase_imag = vld1q_f32(__phase_imag); float32x4_t _phase_imag = vld1q_f32(__phase_imag);
__VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4]; __VOLK_ATTR_ALIGNED(32)
lv_32fc_t dotProductVector[4];
float32x4x2_t a_val, b_val, tmp32_real, tmp32_imag; float32x4x2_t a_val, b_val, tmp32_real, tmp32_imag;
@ -728,8 +743,10 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_neon(lv_32fc_t*
phase3 = phase2 * phase_inc; phase3 = phase2 * phase_inc;
phase4 = phase3 * phase_inc; phase4 = phase3 * phase_inc;
__VOLK_ATTR_ALIGNED(16) float32_t ____phase_real[4] = { lv_creal((_phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4) }; __VOLK_ATTR_ALIGNED(16)
__VOLK_ATTR_ALIGNED(16) float32_t ____phase_imag[4] = { lv_cimag((_phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4) }; float32_t ____phase_real[4] = {lv_creal((_phase)), lv_creal(phase2), lv_creal(phase3), lv_creal(phase4)};
__VOLK_ATTR_ALIGNED(16)
float32_t ____phase_imag[4] = {lv_cimag((_phase)), lv_cimag(phase2), lv_cimag(phase3), lv_cimag(phase4)};
_phase_real = vld1q_f32(____phase_real); _phase_real = vld1q_f32(____phase_real);
_phase_imag = vld1q_f32(____phase_imag); _phase_imag = vld1q_f32(____phase_imag);
@ -786,4 +803,3 @@ static inline void volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_neon(lv_32fc_t*
#endif /* LV_HAVE_NEON */ #endif /* LV_HAVE_NEON */
#endif /* INCLUDED_volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_H */ #endif /* INCLUDED_volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn_H */

View File

@ -107,7 +107,8 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_sse3(lv_32fc_t** res
const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips); const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips); const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
__VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; __VOLK_ATTR_ALIGNED(16)
int local_code_chip_index[4];
int local_code_chip_index_; int local_code_chip_index_;
const __m128i zeros = _mm_setzero_si128(); const __m128i zeros = _mm_setzero_si128();
@ -177,7 +178,8 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_sse3(lv_32fc_t** res
const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips); const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips); const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
__VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; __VOLK_ATTR_ALIGNED(16)
int local_code_chip_index[4];
int local_code_chip_index_; int local_code_chip_index_;
const __m128i zeros = _mm_setzero_si128(); const __m128i zeros = _mm_setzero_si128();
@ -245,7 +247,8 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_sse4_1(lv_32fc_t** r
const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips); const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips); const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
__VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; __VOLK_ATTR_ALIGNED(16)
int local_code_chip_index[4];
int local_code_chip_index_; int local_code_chip_index_;
const __m128i zeros = _mm_setzero_si128(); const __m128i zeros = _mm_setzero_si128();
@ -311,7 +314,8 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_sse4_1(lv_32fc_t** r
const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips); const __m128 rem_code_phase_chips_reg = _mm_set_ps1(rem_code_phase_chips);
const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips); const __m128 code_phase_step_chips_reg = _mm_set_ps1(code_phase_step_chips);
__VOLK_ATTR_ALIGNED(16) int local_code_chip_index[4]; __VOLK_ATTR_ALIGNED(16)
int local_code_chip_index[4];
int local_code_chip_index_; int local_code_chip_index_;
const __m128i zeros = _mm_setzero_si128(); const __m128i zeros = _mm_setzero_si128();
@ -377,7 +381,8 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx(lv_32fc_t** resu
const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips); const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips);
const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips); const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips);
__VOLK_ATTR_ALIGNED(32) int local_code_chip_index[8]; __VOLK_ATTR_ALIGNED(32)
int local_code_chip_index[8];
int local_code_chip_index_; int local_code_chip_index_;
const __m256 zeros = _mm256_setzero_ps(); const __m256 zeros = _mm256_setzero_ps();
@ -454,7 +459,8 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx(lv_32fc_t** resu
const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips); const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips);
const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips); const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips);
__VOLK_ATTR_ALIGNED(32) int local_code_chip_index[8]; __VOLK_ATTR_ALIGNED(32)
int local_code_chip_index[8];
int local_code_chip_index_; int local_code_chip_index_;
const __m256 zeros = _mm256_setzero_ps(); const __m256 zeros = _mm256_setzero_ps();
@ -531,7 +537,8 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_u_avx2(lv_32fc_t** res
const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips); const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips);
const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips); const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips);
__VOLK_ATTR_ALIGNED(32) int local_code_chip_index[8]; __VOLK_ATTR_ALIGNED(32)
int local_code_chip_index[8];
int local_code_chip_index_; int local_code_chip_index_;
const __m256 zeros = _mm256_setzero_ps(); const __m256 zeros = _mm256_setzero_ps();
@ -609,7 +616,8 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_a_avx2(lv_32fc_t** res
const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips); const __m256 rem_code_phase_chips_reg = _mm256_set1_ps(rem_code_phase_chips);
const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips); const __m256 code_phase_step_chips_reg = _mm256_set1_ps(code_phase_step_chips);
__VOLK_ATTR_ALIGNED(32) int local_code_chip_index[8]; __VOLK_ATTR_ALIGNED(32)
int local_code_chip_index[8];
int local_code_chip_index_; int local_code_chip_index_;
const __m256 zeros = _mm256_setzero_ps(); const __m256 zeros = _mm256_setzero_ps();
@ -689,7 +697,8 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_neon(lv_32fc_t** resul
const float32x4_t rem_code_phase_chips_reg = vdupq_n_f32(rem_code_phase_chips); const float32x4_t rem_code_phase_chips_reg = vdupq_n_f32(rem_code_phase_chips);
const float32x4_t code_phase_step_chips_reg = vdupq_n_f32(code_phase_step_chips); const float32x4_t code_phase_step_chips_reg = vdupq_n_f32(code_phase_step_chips);
__VOLK_ATTR_ALIGNED(16) int32_t local_code_chip_index[4]; __VOLK_ATTR_ALIGNED(16)
int32_t local_code_chip_index[4];
int32_t local_code_chip_index_; int32_t local_code_chip_index_;
const int32x4_t zeros = vdupq_n_s32(0); const int32x4_t zeros = vdupq_n_s32(0);
@ -697,7 +706,8 @@ static inline void volk_gnsssdr_32fc_xn_resampler_32fc_xn_neon(lv_32fc_t** resul
const int32x4_t code_length_chips_reg_i = vdupq_n_s32((int32_t)code_length_chips); const int32x4_t code_length_chips_reg_i = vdupq_n_s32((int32_t)code_length_chips);
int32x4_t local_code_chip_index_reg, aux_i, negatives, i; int32x4_t local_code_chip_index_reg, aux_i, negatives, i;
float32x4_t aux, aux2, shifts_chips_reg, fi, c, j, cTrunc, base, indexn, reciprocal; float32x4_t aux, aux2, shifts_chips_reg, fi, c, j, cTrunc, base, indexn, reciprocal;
__VOLK_ATTR_ALIGNED(16) const float vec[4] = { 0.0f, 1.0f, 2.0f, 3.0f }; __VOLK_ATTR_ALIGNED(16)
const float vec[4] = {0.0f, 1.0f, 2.0f, 3.0f};
uint32x4_t igx; uint32x4_t igx;
reciprocal = vrecpeq_f32(code_length_chips_reg_f); reciprocal = vrecpeq_f32(code_length_chips_reg_f);
reciprocal = vmulq_f32(vrecpsq_f32(code_length_chips_reg_f, reciprocal), reciprocal); reciprocal = vmulq_f32(vrecpsq_f32(code_length_chips_reg_f, reciprocal), reciprocal);

View File

@ -69,7 +69,8 @@ static inline void volk_gnsssdr_64f_accumulator_64f_u_avx(double* result, const
unsigned int i; unsigned int i;
const double* aPtr = inputBuffer; const double* aPtr = inputBuffer;
__VOLK_ATTR_ALIGNED(32) double tempBuffer[4]; __VOLK_ATTR_ALIGNED(32)
double tempBuffer[4];
__m256d accumulator = _mm256_setzero_pd(); __m256d accumulator = _mm256_setzero_pd();
__m256d aVal = _mm256_setzero_pd(); __m256d aVal = _mm256_setzero_pd();
@ -108,7 +109,8 @@ static inline void volk_gnsssdr_64f_accumulator_64f_u_sse3(double* result,const
unsigned int i; unsigned int i;
const double* aPtr = inputBuffer; const double* aPtr = inputBuffer;
__VOLK_ATTR_ALIGNED(16) double tempBuffer[2]; __VOLK_ATTR_ALIGNED(16)
double tempBuffer[2];
__m128d accumulator = _mm_setzero_pd(); __m128d accumulator = _mm_setzero_pd();
__m128d aVal = _mm_setzero_pd(); __m128d aVal = _mm_setzero_pd();
@ -164,7 +166,8 @@ static inline void volk_gnsssdr_64f_accumulator_64f_a_avx(double* result,const d
unsigned int i; unsigned int i;
const double* aPtr = inputBuffer; const double* aPtr = inputBuffer;
__VOLK_ATTR_ALIGNED(32) double tempBuffer[4]; __VOLK_ATTR_ALIGNED(32)
double tempBuffer[4];
__m256d accumulator = _mm256_setzero_pd(); __m256d accumulator = _mm256_setzero_pd();
__m256d aVal = _mm256_setzero_pd(); __m256d aVal = _mm256_setzero_pd();
@ -203,7 +206,8 @@ static inline void volk_gnsssdr_64f_accumulator_64f_a_sse3(double* result,const
unsigned int i; unsigned int i;
const double* aPtr = inputBuffer; const double* aPtr = inputBuffer;
__VOLK_ATTR_ALIGNED(16) double tempBuffer[2]; __VOLK_ATTR_ALIGNED(16)
double tempBuffer[2];
__m128d accumulator = _mm_setzero_pd(); __m128d accumulator = _mm_setzero_pd();
__m128d aVal = _mm_setzero_pd(); __m128d aVal = _mm_setzero_pd();

View File

@ -70,7 +70,8 @@ static inline void volk_gnsssdr_8i_accumulator_s8i_u_sse3(char* result, const ch
unsigned int i; unsigned int i;
const char* aPtr = inputBuffer; const char* aPtr = inputBuffer;
__VOLK_ATTR_ALIGNED(16) char tempBuffer[16]; __VOLK_ATTR_ALIGNED(16)
char tempBuffer[16];
__m128i accumulator = _mm_setzero_si128(); __m128i accumulator = _mm_setzero_si128();
__m128i aVal = _mm_setzero_si128(); __m128i aVal = _mm_setzero_si128();
@ -125,7 +126,8 @@ static inline void volk_gnsssdr_8i_accumulator_s8i_a_sse3(char* result, const ch
const char* aPtr = inputBuffer; const char* aPtr = inputBuffer;
__VOLK_ATTR_ALIGNED(16) char tempBuffer[16]; __VOLK_ATTR_ALIGNED(16)
char tempBuffer[16];
__m128i accumulator = _mm_setzero_si128(); __m128i accumulator = _mm_setzero_si128();
__m128i aVal = _mm_setzero_si128(); __m128i aVal = _mm_setzero_si128();
@ -164,7 +166,8 @@ static inline void volk_gnsssdr_8i_accumulator_s8i_a_avx2(char* result, const ch
const char* aPtr = inputBuffer; const char* aPtr = inputBuffer;
__VOLK_ATTR_ALIGNED(32) char tempBuffer[32]; __VOLK_ATTR_ALIGNED(32)
char tempBuffer[32];
__m256i accumulator = _mm256_setzero_si256(); __m256i accumulator = _mm256_setzero_si256();
__m256i aVal = _mm256_setzero_si256(); __m256i aVal = _mm256_setzero_si256();
@ -202,7 +205,8 @@ static inline void volk_gnsssdr_8i_accumulator_s8i_u_avx2(char* result, const ch
unsigned int i; unsigned int i;
const char* aPtr = inputBuffer; const char* aPtr = inputBuffer;
__VOLK_ATTR_ALIGNED(32) char tempBuffer[32]; __VOLK_ATTR_ALIGNED(32)
char tempBuffer[32];
__m256i accumulator = _mm256_setzero_si256(); __m256i accumulator = _mm256_setzero_si256();
__m256i aVal = _mm256_setzero_si256(); __m256i aVal = _mm256_setzero_si256();

View File

@ -74,7 +74,8 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_avx2(unsigned int* target, co
char max = src0[0]; char max = src0[0];
unsigned int index = 0; unsigned int index = 0;
unsigned int mask; unsigned int mask;
__VOLK_ATTR_ALIGNED(32) char currentValuesBuffer[32]; __VOLK_ATTR_ALIGNED(32)
char currentValuesBuffer[32];
__m256i maxValues, compareResults, currentValues; __m256i maxValues, compareResults, currentValues;
maxValues = _mm256_set1_epi8(max); maxValues = _mm256_set1_epi8(max);
@ -137,7 +138,8 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_avx(unsigned int* target, con
char* inputPtr = (char*)src0; char* inputPtr = (char*)src0;
char max = src0[0]; char max = src0[0];
unsigned int index = 0; unsigned int index = 0;
__VOLK_ATTR_ALIGNED(32) char currentValuesBuffer[32]; __VOLK_ATTR_ALIGNED(32)
char currentValuesBuffer[32];
__m256i ones, compareResults, currentValues; __m256i ones, compareResults, currentValues;
__m128i compareResultslo, compareResultshi, maxValues, lo, hi; __m128i compareResultslo, compareResultshi, maxValues, lo, hi;
@ -204,7 +206,8 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_sse4_1(unsigned int* target,
char* inputPtr = (char*)src0; char* inputPtr = (char*)src0;
char max = src0[0]; char max = src0[0];
unsigned int index = 0; unsigned int index = 0;
__VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16]; __VOLK_ATTR_ALIGNED(16)
char currentValuesBuffer[16];
__m128i maxValues, compareResults, currentValues; __m128i maxValues, compareResults, currentValues;
maxValues = _mm_set1_epi8(max); maxValues = _mm_set1_epi8(max);
@ -263,7 +266,8 @@ static inline void volk_gnsssdr_8i_index_max_16u_u_sse2(unsigned int* target, co
char max = src0[0]; char max = src0[0];
unsigned int index = 0; unsigned int index = 0;
unsigned short mask; unsigned short mask;
__VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16]; __VOLK_ATTR_ALIGNED(16)
char currentValuesBuffer[16];
__m128i maxValues, compareResults, currentValues; __m128i maxValues, compareResults, currentValues;
maxValues = _mm_set1_epi8(max); maxValues = _mm_set1_epi8(max);
@ -351,7 +355,8 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_avx2(unsigned int* target, co
char max = src0[0]; char max = src0[0];
unsigned int index = 0; unsigned int index = 0;
unsigned int mask; unsigned int mask;
__VOLK_ATTR_ALIGNED(32) char currentValuesBuffer[32]; __VOLK_ATTR_ALIGNED(32)
char currentValuesBuffer[32];
__m256i maxValues, compareResults, currentValues; __m256i maxValues, compareResults, currentValues;
maxValues = _mm256_set1_epi8(max); maxValues = _mm256_set1_epi8(max);
@ -414,7 +419,8 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_avx(unsigned int* target, con
char* inputPtr = (char*)src0; char* inputPtr = (char*)src0;
char max = src0[0]; char max = src0[0];
unsigned int index = 0; unsigned int index = 0;
__VOLK_ATTR_ALIGNED(32) char currentValuesBuffer[32]; __VOLK_ATTR_ALIGNED(32)
char currentValuesBuffer[32];
__m256i ones, compareResults, currentValues; __m256i ones, compareResults, currentValues;
__m128i compareResultslo, compareResultshi, maxValues, lo, hi; __m128i compareResultslo, compareResultshi, maxValues, lo, hi;
@ -481,7 +487,8 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_sse4_1(unsigned int* target,
char* inputPtr = (char*)src0; char* inputPtr = (char*)src0;
char max = src0[0]; char max = src0[0];
unsigned int index = 0; unsigned int index = 0;
__VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16]; __VOLK_ATTR_ALIGNED(16)
char currentValuesBuffer[16];
__m128i maxValues, compareResults, currentValues; __m128i maxValues, compareResults, currentValues;
maxValues = _mm_set1_epi8(max); maxValues = _mm_set1_epi8(max);
@ -540,7 +547,8 @@ static inline void volk_gnsssdr_8i_index_max_16u_a_sse2(unsigned int* target, co
char max = src0[0]; char max = src0[0];
unsigned int index = 0; unsigned int index = 0;
unsigned short mask; unsigned short mask;
__VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16]; __VOLK_ATTR_ALIGNED(16)
char currentValuesBuffer[16];
__m128i maxValues, compareResults, currentValues; __m128i maxValues, compareResults, currentValues;
maxValues = _mm_set1_epi8(max); maxValues = _mm_set1_epi8(max);

View File

@ -70,7 +70,8 @@ static inline void volk_gnsssdr_8i_max_s8i_u_avx2(char* target, const char* src0
unsigned int i; unsigned int i;
char* inputPtr = (char*)src0; char* inputPtr = (char*)src0;
char max = src0[0]; char max = src0[0];
__VOLK_ATTR_ALIGNED(32) char maxValuesBuffer[32]; __VOLK_ATTR_ALIGNED(32)
char maxValuesBuffer[32];
__m256i maxValues, compareResults, currentValues; __m256i maxValues, compareResults, currentValues;
maxValues = _mm256_set1_epi8(max); maxValues = _mm256_set1_epi8(max);
@ -119,7 +120,8 @@ static inline void volk_gnsssdr_8i_max_s8i_u_sse4_1(char* target, const char* sr
unsigned int i; unsigned int i;
char* inputPtr = (char*)src0; char* inputPtr = (char*)src0;
char max = src0[0]; char max = src0[0];
__VOLK_ATTR_ALIGNED(16) char maxValuesBuffer[16]; __VOLK_ATTR_ALIGNED(16)
char maxValuesBuffer[16];
__m128i maxValues, compareResults, currentValues; __m128i maxValues, compareResults, currentValues;
maxValues = _mm_set1_epi8(max); maxValues = _mm_set1_epi8(max);
@ -169,7 +171,8 @@ static inline void volk_gnsssdr_8i_max_s8i_u_sse2(char* target, const char* src0
char* inputPtr = (char*)src0; char* inputPtr = (char*)src0;
char max = src0[0]; char max = src0[0];
unsigned short mask; unsigned short mask;
__VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16]; __VOLK_ATTR_ALIGNED(16)
char currentValuesBuffer[16];
__m128i maxValues, compareResults, currentValues; __m128i maxValues, compareResults, currentValues;
maxValues = _mm_set1_epi8(max); maxValues = _mm_set1_epi8(max);
@ -250,7 +253,8 @@ static inline void volk_gnsssdr_8i_max_s8i_a_sse4_1(char* target, const char* sr
unsigned int i; unsigned int i;
char* inputPtr = (char*)src0; char* inputPtr = (char*)src0;
char max = src0[0]; char max = src0[0];
__VOLK_ATTR_ALIGNED(16) char maxValuesBuffer[16]; __VOLK_ATTR_ALIGNED(16)
char maxValuesBuffer[16];
__m128i maxValues, compareResults, currentValues; __m128i maxValues, compareResults, currentValues;
maxValues = _mm_set1_epi8(max); maxValues = _mm_set1_epi8(max);
@ -299,7 +303,8 @@ static inline void volk_gnsssdr_8i_max_s8i_a_avx2(char* target, const char* src0
unsigned int i; unsigned int i;
char* inputPtr = (char*)src0; char* inputPtr = (char*)src0;
char max = src0[0]; char max = src0[0];
__VOLK_ATTR_ALIGNED(32) char maxValuesBuffer[32]; __VOLK_ATTR_ALIGNED(32)
char maxValuesBuffer[32];
__m256i maxValues, compareResults, currentValues; __m256i maxValues, compareResults, currentValues;
maxValues = _mm256_set1_epi8(max); maxValues = _mm256_set1_epi8(max);
@ -349,7 +354,8 @@ static inline void volk_gnsssdr_8i_max_s8i_a_sse2(char* target, const char* src0
char* inputPtr = (char*)src0; char* inputPtr = (char*)src0;
char max = src0[0]; char max = src0[0];
unsigned short mask; unsigned short mask;
__VOLK_ATTR_ALIGNED(16) char currentValuesBuffer[16]; __VOLK_ATTR_ALIGNED(16)
char currentValuesBuffer[16];
__m128i maxValues, compareResults, currentValues; __m128i maxValues, compareResults, currentValues;
maxValues = _mm_set1_epi8(max); maxValues = _mm_set1_epi8(max);

View File

@ -155,7 +155,6 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_u_ssse3(lv_8sc_t* cVector, con
{ {
*c++ = lv_conj(*a++); *c++ = lv_conj(*a++);
} }
} }
#endif /* LV_HAVE_SSSE3 */ #endif /* LV_HAVE_SSSE3 */
@ -188,7 +187,6 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_u_sse3(lv_8sc_t* cVector, cons
{ {
*c++ = lv_conj(*a++); *c++ = lv_conj(*a++);
} }
} }
#endif /* LV_HAVE_SSE3 */ #endif /* LV_HAVE_SSE3 */
@ -336,7 +334,6 @@ static inline void volk_gnsssdr_8ic_conjugate_8ic_a_sse3(lv_8sc_t* cVector, cons
{ {
*c++ = lv_conj(*a++); *c++ = lv_conj(*a++);
} }
} }
#endif /* LV_HAVE_SSE3 */ #endif /* LV_HAVE_SSE3 */

View File

@ -111,7 +111,6 @@ static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_u_sse3(lv_8sc_t* cVector,
{ {
*c++ = (*a++) * scalar; *c++ = (*a++) * scalar;
} }
} }
#endif /* LV_HAVE_SSE3 */ #endif /* LV_HAVE_SSE3 */
@ -204,7 +203,6 @@ static inline void volk_gnsssdr_8ic_s8ic_multiply_8ic_a_sse3(lv_8sc_t* cVector,
{ {
*c++ = (*a++) * scalar; *c++ = (*a++) * scalar;
} }
} }
#endif /* LV_HAVE_SSE3 */ #endif /* LV_HAVE_SSE3 */

View File

@ -165,7 +165,8 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse2(lv_8sc_t* result, con
totalc = _mm_or_si128(realcacc, imagcacc); totalc = _mm_or_si128(realcacc, imagcacc);
__VOLK_ATTR_ALIGNED(16) lv_8sc_t dotProductVector[8]; __VOLK_ATTR_ALIGNED(16)
lv_8sc_t dotProductVector[8];
_mm_storeu_si128((__m128i*)dotProductVector, totalc); // Store the results back into the dot product vector _mm_storeu_si128((__m128i*)dotProductVector, totalc); // Store the results back into the dot product vector
@ -240,7 +241,8 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_u_sse4_1(lv_8sc_t* result, c
totalc = _mm_blendv_epi8(imagcacc, realcacc, mult1); totalc = _mm_blendv_epi8(imagcacc, realcacc, mult1);
__VOLK_ATTR_ALIGNED(16) lv_8sc_t dotProductVector[8]; __VOLK_ATTR_ALIGNED(16)
lv_8sc_t dotProductVector[8];
_mm_storeu_si128((__m128i*)dotProductVector, totalc); // Store the results back into the dot product vector _mm_storeu_si128((__m128i*)dotProductVector, totalc); // Store the results back into the dot product vector
@ -317,7 +319,8 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_sse2(lv_8sc_t* result, con
totalc = _mm_or_si128(realcacc, imagcacc); totalc = _mm_or_si128(realcacc, imagcacc);
__VOLK_ATTR_ALIGNED(16) lv_8sc_t dotProductVector[8]; __VOLK_ATTR_ALIGNED(16)
lv_8sc_t dotProductVector[8];
_mm_store_si128((__m128i*)dotProductVector, totalc); // Store the results back into the dot product vector _mm_store_si128((__m128i*)dotProductVector, totalc); // Store the results back into the dot product vector
@ -391,7 +394,8 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_a_sse4_1(lv_8sc_t* result, c
totalc = _mm_blendv_epi8(imagcacc, realcacc, mult1); totalc = _mm_blendv_epi8(imagcacc, realcacc, mult1);
__VOLK_ATTR_ALIGNED(16) lv_8sc_t dotProductVector[8]; __VOLK_ATTR_ALIGNED(16)
lv_8sc_t dotProductVector[8];
_mm_store_si128((__m128i*)dotProductVector, totalc); // Store the results back into the dot product vector _mm_store_si128((__m128i*)dotProductVector, totalc); // Store the results back into the dot product vector
@ -446,7 +450,8 @@ static inline void volk_gnsssdr_8ic_x2_dot_prod_8ic_neon(lv_8sc_t* result, const
// for 2-lane vectors, 1st lane holds the real part, // for 2-lane vectors, 1st lane holds the real part,
// 2nd lane holds the imaginary part // 2nd lane holds the imaginary part
int8x8x2_t a_val, b_val, c_val, accumulator, tmp_real, tmp_imag; int8x8x2_t a_val, b_val, c_val, accumulator, tmp_real, tmp_imag;
__VOLK_ATTR_ALIGNED(16) lv_8sc_t accum_result[8] = { lv_cmake(0,0) }; __VOLK_ATTR_ALIGNED(16)
lv_8sc_t accum_result[8] = {lv_cmake(0, 0)};
accumulator.val[0] = vdup_n_s8(0); accumulator.val[0] = vdup_n_s8(0);
accumulator.val[1] = vdup_n_s8(0); accumulator.val[1] = vdup_n_s8(0);
unsigned int number; unsigned int number;

View File

@ -241,29 +241,49 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_u_sse2(lv_32fc_t* out, const fl
__m128i emm0, emm2, emm4; __m128i emm0, emm2, emm4;
/* declare some SSE constants */ /* declare some SSE constants */
__VOLK_ATTR_ALIGNED(16) static const int _ps_inv_sign_mask[4] = { ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000 }; __VOLK_ATTR_ALIGNED(16)
__VOLK_ATTR_ALIGNED(16) static const int _ps_sign_mask[4] = { (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000 }; static const int _ps_inv_sign_mask[4] = {~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000};
__VOLK_ATTR_ALIGNED(16)
static const int _ps_sign_mask[4] = {(int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000};
__VOLK_ATTR_ALIGNED(16) static const float _ps_cephes_FOPI[4] = { 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516 }; __VOLK_ATTR_ALIGNED(16)
__VOLK_ATTR_ALIGNED(16) static const int _pi32_1[4] = { 1, 1, 1, 1 }; static const float _ps_cephes_FOPI[4] = {1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516};
__VOLK_ATTR_ALIGNED(16) static const int _pi32_inv1[4] = { ~1, ~1, ~1, ~1 }; __VOLK_ATTR_ALIGNED(16)
__VOLK_ATTR_ALIGNED(16) static const int _pi32_2[4] = { 2, 2, 2, 2}; static const int _pi32_1[4] = {1, 1, 1, 1};
__VOLK_ATTR_ALIGNED(16) static const int _pi32_4[4] = { 4, 4, 4, 4}; __VOLK_ATTR_ALIGNED(16)
static const int _pi32_inv1[4] = {~1, ~1, ~1, ~1};
__VOLK_ATTR_ALIGNED(16)
static const int _pi32_2[4] = {2, 2, 2, 2};
__VOLK_ATTR_ALIGNED(16)
static const int _pi32_4[4] = {4, 4, 4, 4};
__VOLK_ATTR_ALIGNED(16) static const float _ps_minus_cephes_DP1[4] = { -0.78515625, -0.78515625, -0.78515625, -0.78515625 }; __VOLK_ATTR_ALIGNED(16)
__VOLK_ATTR_ALIGNED(16) static const float _ps_minus_cephes_DP2[4] = { -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4 }; static const float _ps_minus_cephes_DP1[4] = {-0.78515625, -0.78515625, -0.78515625, -0.78515625};
__VOLK_ATTR_ALIGNED(16) static const float _ps_minus_cephes_DP3[4] = { -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8 }; __VOLK_ATTR_ALIGNED(16)
__VOLK_ATTR_ALIGNED(16) static const float _ps_coscof_p0[4] = { 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005 }; static const float _ps_minus_cephes_DP2[4] = {-2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4};
__VOLK_ATTR_ALIGNED(16) static const float _ps_coscof_p1[4] = { -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003 }; __VOLK_ATTR_ALIGNED(16)
__VOLK_ATTR_ALIGNED(16) static const float _ps_coscof_p2[4] = { 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002 }; static const float _ps_minus_cephes_DP3[4] = {-3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8};
__VOLK_ATTR_ALIGNED(16) static const float _ps_sincof_p0[4] = { -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4 }; __VOLK_ATTR_ALIGNED(16)
__VOLK_ATTR_ALIGNED(16) static const float _ps_sincof_p1[4] = { 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3 }; static const float _ps_coscof_p0[4] = {2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005};
__VOLK_ATTR_ALIGNED(16) static const float _ps_sincof_p2[4] = { -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1 }; __VOLK_ATTR_ALIGNED(16)
__VOLK_ATTR_ALIGNED(16) static const float _ps_0p5[4] = { 0.5f, 0.5f, 0.5f, 0.5f }; static const float _ps_coscof_p1[4] = {-1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003};
__VOLK_ATTR_ALIGNED(16) static const float _ps_1[4] = { 1.0f, 1.0f, 1.0f, 1.0f }; __VOLK_ATTR_ALIGNED(16)
static const float _ps_coscof_p2[4] = {4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002};
__VOLK_ATTR_ALIGNED(16)
static const float _ps_sincof_p0[4] = {-1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4};
__VOLK_ATTR_ALIGNED(16)
static const float _ps_sincof_p1[4] = {8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3};
__VOLK_ATTR_ALIGNED(16)
static const float _ps_sincof_p2[4] = {-1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1};
__VOLK_ATTR_ALIGNED(16)
static const float _ps_0p5[4] = {0.5f, 0.5f, 0.5f, 0.5f};
__VOLK_ATTR_ALIGNED(16)
static const float _ps_1[4] = {1.0f, 1.0f, 1.0f, 1.0f};
__VOLK_ATTR_ALIGNED(16) float four_phases[4] = { _phase, _phase + phase_inc, _phase + 2 * phase_inc, _phase + 3 * phase_inc }; __VOLK_ATTR_ALIGNED(16)
__VOLK_ATTR_ALIGNED(16) float four_phases_inc[4] = { 4 * phase_inc, 4 * phase_inc, 4 * phase_inc, 4 * phase_inc }; float four_phases[4] = {_phase, _phase + phase_inc, _phase + 2 * phase_inc, _phase + 3 * phase_inc};
__VOLK_ATTR_ALIGNED(16)
float four_phases_inc[4] = {4 * phase_inc, 4 * phase_inc, 4 * phase_inc, 4 * phase_inc};
four_phases_reg = _mm_load_ps(four_phases); four_phases_reg = _mm_load_ps(four_phases);
const __m128 four_phases_inc_reg = _mm_load_ps(four_phases_inc); const __m128 four_phases_inc_reg = _mm_load_ps(four_phases_inc);
@ -456,29 +476,49 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_a_avx2(lv_32fc_t* out, const fl
__m128 aux, c1, s1; __m128 aux, c1, s1;
/* declare some AVX2 constants */ /* declare some AVX2 constants */
__VOLK_ATTR_ALIGNED(32) static const int _ps_inv_sign_mask[8] = { ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000 }; __VOLK_ATTR_ALIGNED(32)
__VOLK_ATTR_ALIGNED(32) static const int _ps_sign_mask[8] = { (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000 }; static const int _ps_inv_sign_mask[8] = {~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000};
__VOLK_ATTR_ALIGNED(32)
static const int _ps_sign_mask[8] = {(int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000};
__VOLK_ATTR_ALIGNED(32) static const float _ps_cephes_FOPI[8] = { 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516 }; __VOLK_ATTR_ALIGNED(32)
__VOLK_ATTR_ALIGNED(32) static const int _pi32_1[8] = { 1, 1, 1, 1, 1, 1, 1, 1 }; static const float _ps_cephes_FOPI[8] = {1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516};
__VOLK_ATTR_ALIGNED(32) static const int _pi32_inv1[8] = { ~1, ~1, ~1, ~1, ~1, ~1, ~1, ~1 }; __VOLK_ATTR_ALIGNED(32)
__VOLK_ATTR_ALIGNED(32) static const int _pi32_2[8] = { 2, 2, 2, 2, 2, 2, 2, 2 }; static const int _pi32_1[8] = {1, 1, 1, 1, 1, 1, 1, 1};
__VOLK_ATTR_ALIGNED(32) static const int _pi32_4[8] = { 4, 4, 4, 4, 4, 4, 4, 4 }; __VOLK_ATTR_ALIGNED(32)
static const int _pi32_inv1[8] = {~1, ~1, ~1, ~1, ~1, ~1, ~1, ~1};
__VOLK_ATTR_ALIGNED(32)
static const int _pi32_2[8] = {2, 2, 2, 2, 2, 2, 2, 2};
__VOLK_ATTR_ALIGNED(32)
static const int _pi32_4[8] = {4, 4, 4, 4, 4, 4, 4, 4};
__VOLK_ATTR_ALIGNED(32) static const float _ps_minus_cephes_DP1[8] = { -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625 }; __VOLK_ATTR_ALIGNED(32)
__VOLK_ATTR_ALIGNED(32) static const float _ps_minus_cephes_DP2[8] = { -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4 }; static const float _ps_minus_cephes_DP1[8] = {-0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625};
__VOLK_ATTR_ALIGNED(32) static const float _ps_minus_cephes_DP3[8] = { -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8 }; __VOLK_ATTR_ALIGNED(32)
__VOLK_ATTR_ALIGNED(32) static const float _ps_coscof_p0[8] = { 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005 }; static const float _ps_minus_cephes_DP2[8] = {-2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4};
__VOLK_ATTR_ALIGNED(32) static const float _ps_coscof_p1[8] = { -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003 }; __VOLK_ATTR_ALIGNED(32)
__VOLK_ATTR_ALIGNED(32) static const float _ps_coscof_p2[8] = { 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002 }; static const float _ps_minus_cephes_DP3[8] = {-3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8};
__VOLK_ATTR_ALIGNED(32) static const float _ps_sincof_p0[8] = { -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4 }; __VOLK_ATTR_ALIGNED(32)
__VOLK_ATTR_ALIGNED(32) static const float _ps_sincof_p1[8] = { 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3 }; static const float _ps_coscof_p0[8] = {2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005};
__VOLK_ATTR_ALIGNED(32) static const float _ps_sincof_p2[8] = { -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1 }; __VOLK_ATTR_ALIGNED(32)
__VOLK_ATTR_ALIGNED(32) static const float _ps_0p5[8] = { 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f }; static const float _ps_coscof_p1[8] = {-1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003};
__VOLK_ATTR_ALIGNED(32) static const float _ps_1[8] = { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f }; __VOLK_ATTR_ALIGNED(32)
static const float _ps_coscof_p2[8] = {4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002};
__VOLK_ATTR_ALIGNED(32)
static const float _ps_sincof_p0[8] = {-1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4};
__VOLK_ATTR_ALIGNED(32)
static const float _ps_sincof_p1[8] = {8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3};
__VOLK_ATTR_ALIGNED(32)
static const float _ps_sincof_p2[8] = {-1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1};
__VOLK_ATTR_ALIGNED(32)
static const float _ps_0p5[8] = {0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f};
__VOLK_ATTR_ALIGNED(32)
static const float _ps_1[8] = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
__VOLK_ATTR_ALIGNED(32) float eight_phases[8] = { _phase, _phase + phase_inc, _phase + 2 * phase_inc, _phase + 3 * phase_inc, _phase + 4 * phase_inc, _phase + 5 * phase_inc, _phase + 6 * phase_inc, _phase + 7 * phase_inc }; __VOLK_ATTR_ALIGNED(32)
__VOLK_ATTR_ALIGNED(32) float eight_phases_inc[8] = { 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc }; float eight_phases[8] = {_phase, _phase + phase_inc, _phase + 2 * phase_inc, _phase + 3 * phase_inc, _phase + 4 * phase_inc, _phase + 5 * phase_inc, _phase + 6 * phase_inc, _phase + 7 * phase_inc};
__VOLK_ATTR_ALIGNED(32)
float eight_phases_inc[8] = {8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc};
eight_phases_reg = _mm256_load_ps(eight_phases); eight_phases_reg = _mm256_load_ps(eight_phases);
const __m256 eight_phases_inc_reg = _mm256_load_ps(eight_phases_inc); const __m256 eight_phases_inc_reg = _mm256_load_ps(eight_phases_inc);
@ -624,29 +664,49 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_u_avx2(lv_32fc_t* out, const fl
__m128 aux, c1, s1; __m128 aux, c1, s1;
/* declare some AVX2 constants */ /* declare some AVX2 constants */
__VOLK_ATTR_ALIGNED(32) static const int _ps_inv_sign_mask[8] = { ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000 }; __VOLK_ATTR_ALIGNED(32)
__VOLK_ATTR_ALIGNED(32) static const int _ps_sign_mask[8] = { (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000 }; static const int _ps_inv_sign_mask[8] = {~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000, ~0x80000000};
__VOLK_ATTR_ALIGNED(32)
static const int _ps_sign_mask[8] = {(int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000, (int)0x80000000};
__VOLK_ATTR_ALIGNED(32) static const float _ps_cephes_FOPI[8] = { 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516 }; __VOLK_ATTR_ALIGNED(32)
__VOLK_ATTR_ALIGNED(32) static const int _pi32_1[8] = { 1, 1, 1, 1, 1, 1, 1, 1 }; static const float _ps_cephes_FOPI[8] = {1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516, 1.27323954473516};
__VOLK_ATTR_ALIGNED(32) static const int _pi32_inv1[8] = { ~1, ~1, ~1, ~1, ~1, ~1, ~1, ~1 }; __VOLK_ATTR_ALIGNED(32)
__VOLK_ATTR_ALIGNED(32) static const int _pi32_2[8] = { 2, 2, 2, 2, 2, 2, 2, 2 }; static const int _pi32_1[8] = {1, 1, 1, 1, 1, 1, 1, 1};
__VOLK_ATTR_ALIGNED(32) static const int _pi32_4[8] = { 4, 4, 4, 4, 4, 4, 4, 4 }; __VOLK_ATTR_ALIGNED(32)
static const int _pi32_inv1[8] = {~1, ~1, ~1, ~1, ~1, ~1, ~1, ~1};
__VOLK_ATTR_ALIGNED(32)
static const int _pi32_2[8] = {2, 2, 2, 2, 2, 2, 2, 2};
__VOLK_ATTR_ALIGNED(32)
static const int _pi32_4[8] = {4, 4, 4, 4, 4, 4, 4, 4};
__VOLK_ATTR_ALIGNED(32) static const float _ps_minus_cephes_DP1[8] = { -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625 }; __VOLK_ATTR_ALIGNED(32)
__VOLK_ATTR_ALIGNED(32) static const float _ps_minus_cephes_DP2[8] = { -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4 }; static const float _ps_minus_cephes_DP1[8] = {-0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625, -0.78515625};
__VOLK_ATTR_ALIGNED(32) static const float _ps_minus_cephes_DP3[8] = { -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8 }; __VOLK_ATTR_ALIGNED(32)
__VOLK_ATTR_ALIGNED(32) static const float _ps_coscof_p0[8] = { 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005 }; static const float _ps_minus_cephes_DP2[8] = {-2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4, -2.4187564849853515625e-4};
__VOLK_ATTR_ALIGNED(32) static const float _ps_coscof_p1[8] = { -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003 }; __VOLK_ATTR_ALIGNED(32)
__VOLK_ATTR_ALIGNED(32) static const float _ps_coscof_p2[8] = { 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002 }; static const float _ps_minus_cephes_DP3[8] = {-3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8, -3.77489497744594108e-8};
__VOLK_ATTR_ALIGNED(32) static const float _ps_sincof_p0[8] = { -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4 }; __VOLK_ATTR_ALIGNED(32)
__VOLK_ATTR_ALIGNED(32) static const float _ps_sincof_p1[8] = { 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3 }; static const float _ps_coscof_p0[8] = {2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005, 2.443315711809948E-005};
__VOLK_ATTR_ALIGNED(32) static const float _ps_sincof_p2[8] = { -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1 }; __VOLK_ATTR_ALIGNED(32)
__VOLK_ATTR_ALIGNED(32) static const float _ps_0p5[8] = { 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f }; static const float _ps_coscof_p1[8] = {-1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003, -1.388731625493765E-003};
__VOLK_ATTR_ALIGNED(32) static const float _ps_1[8] = { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f }; __VOLK_ATTR_ALIGNED(32)
static const float _ps_coscof_p2[8] = {4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002, 4.166664568298827E-002};
__VOLK_ATTR_ALIGNED(32)
static const float _ps_sincof_p0[8] = {-1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4, -1.9515295891E-4};
__VOLK_ATTR_ALIGNED(32)
static const float _ps_sincof_p1[8] = {8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3, 8.3321608736E-3};
__VOLK_ATTR_ALIGNED(32)
static const float _ps_sincof_p2[8] = {-1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1, -1.6666654611E-1};
__VOLK_ATTR_ALIGNED(32)
static const float _ps_0p5[8] = {0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f};
__VOLK_ATTR_ALIGNED(32)
static const float _ps_1[8] = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
__VOLK_ATTR_ALIGNED(32) float eight_phases[8] = { _phase, _phase + phase_inc, _phase + 2 * phase_inc, _phase + 3 * phase_inc, _phase + 4 * phase_inc, _phase + 5 * phase_inc, _phase + 6 * phase_inc, _phase + 7 * phase_inc }; __VOLK_ATTR_ALIGNED(32)
__VOLK_ATTR_ALIGNED(32) float eight_phases_inc[8] = { 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc }; float eight_phases[8] = {_phase, _phase + phase_inc, _phase + 2 * phase_inc, _phase + 3 * phase_inc, _phase + 4 * phase_inc, _phase + 5 * phase_inc, _phase + 6 * phase_inc, _phase + 7 * phase_inc};
__VOLK_ATTR_ALIGNED(32)
float eight_phases_inc[8] = {8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc, 8 * phase_inc};
eight_phases_reg = _mm256_load_ps(eight_phases); eight_phases_reg = _mm256_load_ps(eight_phases);
const __m256 eight_phases_inc_reg = _mm256_load_ps(eight_phases_inc); const __m256 eight_phases_inc_reg = _mm256_load_ps(eight_phases_inc);
@ -783,9 +843,11 @@ static inline void volk_gnsssdr_s32f_sincos_32fc_neon(lv_32fc_t* out, const floa
const unsigned int neon_iters = num_points / 4; const unsigned int neon_iters = num_points / 4;
float _phase = (*phase); float _phase = (*phase);
__VOLK_ATTR_ALIGNED(16) float32_t four_phases[4] = { _phase, _phase + phase_inc, _phase + 2 * phase_inc, _phase + 3 * phase_inc }; __VOLK_ATTR_ALIGNED(16)
float32_t four_phases[4] = {_phase, _phase + phase_inc, _phase + 2 * phase_inc, _phase + 3 * phase_inc};
float four_inc = 4 * phase_inc; float four_inc = 4 * phase_inc;
__VOLK_ATTR_ALIGNED(16) float32_t four_phases_inc[4] = { four_inc, four_inc, four_inc, four_inc }; __VOLK_ATTR_ALIGNED(16)
float32_t four_phases_inc[4] = {four_inc, four_inc, four_inc, four_inc};
float32x4_t four_phases_reg = vld1q_f32(four_phases); float32x4_t four_phases_reg = vld1q_f32(four_phases);
float32x4_t four_phases_inc_reg = vld1q_f32(four_phases_inc); float32x4_t four_phases_inc_reg = vld1q_f32(four_phases_inc);

View File

@ -50,7 +50,6 @@
std::vector<volk_gnsssdr_test_case_t> init_test_list(volk_gnsssdr_test_params_t test_params) std::vector<volk_gnsssdr_test_case_t> init_test_list(volk_gnsssdr_test_params_t test_params)
{ {
// Some kernels need a lower tolerance // Some kernels need a lower tolerance
volk_gnsssdr_test_params_t test_params_inacc = volk_gnsssdr_test_params_t(1e-3, test_params.scalar(), volk_gnsssdr_test_params_t test_params_inacc = volk_gnsssdr_test_params_t(1e-3, test_params.scalar(),
test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex()); test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex());
@ -98,8 +97,7 @@ std::vector<volk_gnsssdr_test_case_t> init_test_list(volk_gnsssdr_test_params_t
QA(VOLK_INIT_PUPP(volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic, volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn, test_params_int16)) QA(VOLK_INIT_PUPP(volk_gnsssdr_16ic_x2_rotator_dotprodxnpuppet_16ic, volk_gnsssdr_16ic_x2_rotator_dot_prod_16ic_xn, test_params_int16))
QA(VOLK_INIT_PUPP(volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic, volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn, test_params_int16)) QA(VOLK_INIT_PUPP(volk_gnsssdr_16ic_16i_rotator_dotprodxnpuppet_16ic, volk_gnsssdr_16ic_16i_rotator_dot_prod_16ic_xn, test_params_int16))
QA(VOLK_INIT_PUPP(volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc, volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn, test_params_int1)) QA(VOLK_INIT_PUPP(volk_gnsssdr_32fc_x2_rotator_dotprodxnpuppet_32fc, volk_gnsssdr_32fc_x2_rotator_dot_prod_32fc_xn, test_params_int1))
QA(VOLK_INIT_PUPP(volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc, volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn, test_params_int1)) QA(VOLK_INIT_PUPP(volk_gnsssdr_32fc_32f_rotator_dotprodxnpuppet_32fc, volk_gnsssdr_32fc_32f_rotator_dot_prod_32fc_xn, test_params_int1));
;
return test_cases; return test_cases;
} }

View File

@ -37,7 +37,8 @@
#include <vector> // for vector #include <vector> // for vector
float uniform() { float uniform()
{
std::random_device r; std::random_device r;
std::default_random_engine e1(r()); std::default_random_engine e1(r());
std::uniform_real_distribution<float> uniform_dist(-1, 1); std::uniform_real_distribution<float> uniform_dist(-1, 1);
@ -60,8 +61,10 @@ void load_random_data(void *data, volk_gnsssdr_type_t type, unsigned int n)
if (type.is_float) if (type.is_float)
{ {
if(type.size == 8) random_floats<double>((double *)data, n); if (type.size == 8)
else random_floats<float>((float *)data, n); random_floats<double>((double *)data, n);
else
random_floats<float>((float *)data, n);
} }
else else
{ {
@ -75,22 +78,30 @@ void load_random_data(void *data, volk_gnsssdr_type_t type, unsigned int n)
switch (type.size) switch (type.size)
{ {
case 8: case 8:
if(type.is_signed) ((int64_t *)data)[i] = (int64_t) scaled_rand; if (type.is_signed)
else ((uint64_t *)data)[i] = (uint64_t) scaled_rand; ((int64_t *)data)[i] = (int64_t)scaled_rand;
else
((uint64_t *)data)[i] = (uint64_t)scaled_rand;
break; break;
case 4: case 4:
if(type.is_signed) ((int32_t *)data)[i] = (int32_t) scaled_rand; if (type.is_signed)
else ((uint32_t *)data)[i] = (uint32_t) scaled_rand; ((int32_t *)data)[i] = (int32_t)scaled_rand;
else
((uint32_t *)data)[i] = (uint32_t)scaled_rand;
break; break;
case 2: case 2:
// 16 bit multiplication saturates very fast // 16 bit multiplication saturates very fast
// we produce here only 3 bits input range // we produce here only 3 bits input range
if(type.is_signed) ((int16_t *)data)[i] = (int16_t)((int16_t) scaled_rand % 8); if (type.is_signed)
else ((uint16_t *)data)[i] = (uint16_t) (int16_t)((int16_t) scaled_rand % 8); ((int16_t *)data)[i] = (int16_t)((int16_t)scaled_rand % 8);
else
((uint16_t *)data)[i] = (uint16_t)(int16_t)((int16_t)scaled_rand % 8);
break; break;
case 1: case 1:
if(type.is_signed) ((int8_t *)data)[i] = (int8_t) scaled_rand; if (type.is_signed)
else ((uint8_t *)data)[i] = (uint8_t) scaled_rand; ((int8_t *)data)[i] = (int8_t)scaled_rand;
else
((uint8_t *)data)[i] = (uint8_t)scaled_rand;
break; break;
default: default:
throw "load_random_data: no support for data size > 8 or < 1"; //no shenanigans here throw "load_random_data: no support for data size > 8 or < 1"; //no shenanigans here
@ -99,17 +110,20 @@ void load_random_data(void *data, volk_gnsssdr_type_t type, unsigned int n)
} }
} }
/*!
 * \brief Collects the implementation names listed in a kernel descriptor.
 *
 * \param desc Kernel descriptor; the first desc.n_impls entries of
 *             desc.impl_names are read.
 * \return One std::string per implementation, in descriptor order.
 */
static std::vector<std::string> get_arch_list(volk_gnsssdr_func_desc_t desc)
{
    std::vector<std::string> archlist;
    // The final element count is known up front, so allocate once instead of
    // letting the vector reallocate while it grows.
    archlist.reserve(desc.n_impls);
    for (size_t i = 0; i < desc.n_impls; i++)
        {
            archlist.push_back(std::string(desc.impl_names[i]));
        }
    return archlist;
}
volk_gnsssdr_type_t volk_gnsssdr_type_from_string(std::string name) { volk_gnsssdr_type_t volk_gnsssdr_type_from_string(std::string name)
{
volk_gnsssdr_type_t type; volk_gnsssdr_type_t type;
type.is_float = false; type.is_float = false;
type.is_scalar = false; type.is_scalar = false;
@ -118,19 +132,22 @@ volk_gnsssdr_type_t volk_gnsssdr_type_from_string(std::string name) {
type.size = 0; type.size = 0;
type.str = name; type.str = name;
if(name.size() < 2) { if (name.size() < 2)
{
throw std::string("name too short to be a datatype"); throw std::string("name too short to be a datatype");
} }
//is it a scalar? //is it a scalar?
if(name[0] == 's') { if (name[0] == 's')
{
type.is_scalar = true; type.is_scalar = true;
name = name.substr(1, name.size() - 1); name = name.substr(1, name.size() - 1);
} }
//get the data size //get the data size
size_t last_size_pos = name.find_last_of("0123456789"); size_t last_size_pos = name.find_last_of("0123456789");
if(last_size_pos == std::string::npos) { if (last_size_pos == std::string::npos)
{
throw std::string("no size spec in type ").append(name); throw std::string("no size spec in type ").append(name);
} }
//will throw if malformed //will throw if malformed
@ -139,8 +156,10 @@ volk_gnsssdr_type_t volk_gnsssdr_type_from_string(std::string name) {
assert(((size % 8) == 0) && (size <= 64) && (size != 0)); assert(((size % 8) == 0) && (size <= 64) && (size != 0));
type.size = size / 8; //in bytes type.size = size / 8; //in bytes
for(size_t i=last_size_pos+1; i < name.size(); i++) { for (size_t i = last_size_pos + 1; i < name.size(); i++)
switch (name[i]) { {
switch (name[i])
{
case 'f': case 'f':
type.is_float = true; type.is_float = true;
break; break;
@ -163,7 +182,8 @@ volk_gnsssdr_type_t volk_gnsssdr_type_from_string(std::string name) {
static void get_signatures_from_name(std::vector<volk_gnsssdr_type_t> &inputsig, static void get_signatures_from_name(std::vector<volk_gnsssdr_type_t> &inputsig,
std::vector<volk_gnsssdr_type_t> &outputsig, std::vector<volk_gnsssdr_type_t> &outputsig,
std::string name) { std::string name)
{
boost::char_separator<char> sep("_"); boost::char_separator<char> sep("_");
boost::tokenizer<boost::char_separator<char> > tok(name, sep); boost::tokenizer<boost::char_separator<char> > tok(name, sep);
std::vector<std::string> toked; std::vector<std::string> toked;
@ -176,79 +196,107 @@ static void get_signatures_from_name(std::vector<volk_gnsssdr_type_t> &inputsig,
//ok. we're assuming a string in the form //ok. we're assuming a string in the form
//(sig)_(multiplier-opt)_..._(name)_(sig)_(multiplier-opt)_..._(alignment) //(sig)_(multiplier-opt)_..._(name)_(sig)_(multiplier-opt)_..._(alignment)
enum { SIDE_INPUT, SIDE_NAME, SIDE_OUTPUT } side = SIDE_INPUT; enum
{
SIDE_INPUT,
SIDE_NAME,
SIDE_OUTPUT
} side = SIDE_INPUT;
std::string fn_name; std::string fn_name;
volk_gnsssdr_type_t type; volk_gnsssdr_type_t type;
BOOST_FOREACH(std::string token, toked) { BOOST_FOREACH (std::string token, toked)
try { {
try
{
type = volk_gnsssdr_type_from_string(token); type = volk_gnsssdr_type_from_string(token);
if (side == SIDE_NAME) side = SIDE_OUTPUT; //if this is the first one after the name... if (side == SIDE_NAME) side = SIDE_OUTPUT; //if this is the first one after the name...
if(side == SIDE_INPUT) inputsig.push_back(type); if (side == SIDE_INPUT)
else outputsig.push_back(type); inputsig.push_back(type);
} catch (...){ else
if(token[0] == 'x' && (token.size() > 1) && (token[1] > '0' || token[1] < '9')) { outputsig.push_back(type);
if(side == SIDE_INPUT) assert(inputsig.size() > 0); }
else assert(outputsig.size() > 0); catch (...)
{
if (token[0] == 'x' && (token.size() > 1) && (token[1] > '0' || token[1] < '9'))
{
if (side == SIDE_INPUT)
assert(inputsig.size() > 0);
else
assert(outputsig.size() > 0);
int multiplier = boost::lexical_cast<int>(token.substr(1, token.size() - 1)); //will throw if invalid /////////// int multiplier = boost::lexical_cast<int>(token.substr(1, token.size() - 1)); //will throw if invalid ///////////
for(int i=1; i<multiplier; i++) { for (int i = 1; i < multiplier; i++)
if(side == SIDE_INPUT) inputsig.push_back(inputsig.back()); {
else outputsig.push_back(outputsig.back()); if (side == SIDE_INPUT)
inputsig.push_back(inputsig.back());
else
outputsig.push_back(outputsig.back());
} }
} }
else if(side == SIDE_INPUT) { //it's the function name, at least it better be else if (side == SIDE_INPUT)
{ //it's the function name, at least it better be
side = SIDE_NAME; side = SIDE_NAME;
fn_name.append("_"); fn_name.append("_");
fn_name.append(token); fn_name.append(token);
} }
else if(side == SIDE_OUTPUT) { else if (side == SIDE_OUTPUT)
{
if (token != toked.back()) throw; //the last token in the name is the alignment if (token != toked.back()) throw; //the last token in the name is the alignment
} }
} }
} }
//we don't need an output signature (some fn's operate on the input data, "in place"), but we do need at least one input! //we don't need an output signature (some fn's operate on the input data, "in place"), but we do need at least one input!
assert(inputsig.size() != 0); assert(inputsig.size() != 0);
} }
/*!
 * \brief Invokes a one-buffer kernel wrapper \p iter times.
 *
 * Every call receives buffs[0], \p vlen and the C string of \p arch.
 */
inline void run_cast_test1(volk_gnsssdr_fn_1arg func, std::vector<void *> &buffs, unsigned int vlen, unsigned int iter, std::string arch)
{
    // arch is a local copy that is never modified, so its C string is stable.
    const char *arch_name = arch.c_str();
    for (unsigned int remaining = iter; remaining != 0; --remaining)
        {
            func(buffs[0], vlen, arch_name);
        }
}
/*!
 * \brief Invokes a two-buffer kernel wrapper \p iter times.
 *
 * Every call receives buffs[0], buffs[1], \p vlen and the C string of \p arch.
 */
inline void run_cast_test2(volk_gnsssdr_fn_2arg func, std::vector<void *> &buffs, unsigned int vlen, unsigned int iter, std::string arch)
{
    // arch is a local copy that is never modified, so its C string is stable.
    const char *arch_name = arch.c_str();
    for (unsigned int remaining = iter; remaining != 0; --remaining)
        {
            func(buffs[0], buffs[1], vlen, arch_name);
        }
}
/*!
 * \brief Invokes a three-buffer kernel wrapper \p iter times.
 *
 * Every call receives buffs[0..2], \p vlen and the C string of \p arch.
 */
inline void run_cast_test3(volk_gnsssdr_fn_3arg func, std::vector<void *> &buffs, unsigned int vlen, unsigned int iter, std::string arch)
{
    // arch is a local copy that is never modified, so its C string is stable.
    const char *arch_name = arch.c_str();
    for (unsigned int remaining = iter; remaining != 0; --remaining)
        {
            func(buffs[0], buffs[1], buffs[2], vlen, arch_name);
        }
}
/*!
 * \brief Invokes a four-buffer kernel wrapper \p iter times.
 *
 * Every call receives buffs[0..3], \p vlen and the C string of \p arch.
 */
inline void run_cast_test4(volk_gnsssdr_fn_4arg func, std::vector<void *> &buffs, unsigned int vlen, unsigned int iter, std::string arch)
{
    // arch is a local copy that is never modified, so its C string is stable.
    const char *arch_name = arch.c_str();
    for (unsigned int remaining = iter; remaining != 0; --remaining)
        {
            func(buffs[0], buffs[1], buffs[2], buffs[3], vlen, arch_name);
        }
}
/*!
 * \brief Invokes a one-buffer, float-scalar kernel wrapper \p iter times.
 *
 * Every call receives buffs[0], \p scalar, \p vlen and the C string of
 * \p arch.
 */
inline void run_cast_test1_s32f(volk_gnsssdr_fn_1arg_s32f func, std::vector<void *> &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch)
{
    // arch is a local copy that is never modified, so its C string is stable.
    const char *arch_name = arch.c_str();
    for (unsigned int remaining = iter; remaining != 0; --remaining)
        {
            func(buffs[0], scalar, vlen, arch_name);
        }
}
/*!
 * \brief Invokes a two-buffer, float-scalar kernel wrapper \p iter times.
 *
 * Every call receives buffs[0], buffs[1], \p scalar, \p vlen and the C string
 * of \p arch.
 */
inline void run_cast_test2_s32f(volk_gnsssdr_fn_2arg_s32f func, std::vector<void *> &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch)
{
    // arch is a local copy that is never modified, so its C string is stable.
    const char *arch_name = arch.c_str();
    for (unsigned int remaining = iter; remaining != 0; --remaining)
        {
            func(buffs[0], buffs[1], scalar, vlen, arch_name);
        }
}
/*!
 * \brief Invokes a three-buffer, float-scalar kernel wrapper \p iter times.
 *
 * Every call receives buffs[0..2], \p scalar, \p vlen and the C string of
 * \p arch.
 */
inline void run_cast_test3_s32f(volk_gnsssdr_fn_3arg_s32f func, std::vector<void *> &buffs, float scalar, unsigned int vlen, unsigned int iter, std::string arch)
{
    // arch is a local copy that is never modified, so its C string is stable.
    const char *arch_name = arch.c_str();
    for (unsigned int remaining = iter; remaining != 0; --remaining)
        {
            func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch_name);
        }
}
/*!
 * \brief Invokes a one-buffer, lv_32fc_t-scalar kernel wrapper \p iter times.
 *
 * Every call receives buffs[0], \p scalar, \p vlen and the C string of
 * \p arch.
 */
inline void run_cast_test1_s32fc(volk_gnsssdr_fn_1arg_s32fc func, std::vector<void *> &buffs, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, std::string arch)
{
    // arch is a local copy that is never modified, so its C string is stable.
    const char *arch_name = arch.c_str();
    for (unsigned int remaining = iter; remaining != 0; --remaining)
        {
            func(buffs[0], scalar, vlen, arch_name);
        }
}
/*!
 * \brief Invokes a two-buffer, lv_32fc_t-scalar kernel wrapper \p iter times.
 *
 * Every call receives buffs[0], buffs[1], \p scalar, \p vlen and the C string
 * of \p arch.
 */
inline void run_cast_test2_s32fc(volk_gnsssdr_fn_2arg_s32fc func, std::vector<void *> &buffs, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, std::string arch)
{
    // arch is a local copy that is never modified, so its C string is stable.
    const char *arch_name = arch.c_str();
    for (unsigned int remaining = iter; remaining != 0; --remaining)
        {
            func(buffs[0], buffs[1], scalar, vlen, arch_name);
        }
}
/*!
 * \brief Invokes a three-buffer, lv_32fc_t-scalar kernel wrapper \p iter
 *        times.
 *
 * Every call receives buffs[0..2], \p scalar, \p vlen and the C string of
 * \p arch.
 */
inline void run_cast_test3_s32fc(volk_gnsssdr_fn_3arg_s32fc func, std::vector<void *> &buffs, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, std::string arch)
{
    // arch is a local copy that is never modified, so its C string is stable.
    const char *arch_name = arch.c_str();
    for (unsigned int remaining = iter; remaining != 0; --remaining)
        {
            func(buffs[0], buffs[1], buffs[2], scalar, vlen, arch_name);
        }
}
@ -299,26 +347,32 @@ inline void run_cast_test3_s16ic(volk_gnsssdr_fn_3arg_s16ic func, std::vector<vo
// *************** ADDED BY GNSS-SDR. END // *************** ADDED BY GNSS-SDR. END
template <class t> template <class t>
bool fcompare(t *in1, t *in2, unsigned int vlen, float tol) { bool fcompare(t *in1, t *in2, unsigned int vlen, float tol)
{
bool fail = false; bool fail = false;
int print_max_errs = 10; int print_max_errs = 10;
for(unsigned int i=0; i<vlen; i++) { for (unsigned int i = 0; i < vlen; i++)
{
// for very small numbers we'll see round off errors due to limited // for very small numbers we'll see round off errors due to limited
// precision. So a special test case... // precision. So a special test case...
if(fabs(((t *)(in1))[i]) < 1e-30) { if (fabs(((t *)(in1))[i]) < 1e-30)
{
if (fabs(((t *)(in2))[i]) > tol) if (fabs(((t *)(in2))[i]) > tol)
{ {
fail = true; fail = true;
if(print_max_errs-- > 0) { if (print_max_errs-- > 0)
{
std::cout << "offset " << i << " in1: " << t(((t *)(in1))[i]) << " in2: " << t(((t *)(in2))[i]); std::cout << "offset " << i << " in1: " << t(((t *)(in1))[i]) << " in2: " << t(((t *)(in2))[i]);
std::cout << " tolerance was: " << tol << std::endl; std::cout << " tolerance was: " << tol << std::endl;
} }
} }
} }
// the primary test is the percent different greater than given tol // the primary test is the percent different greater than given tol
else if(fabs(((t *)(in1))[i] - ((t *)(in2))[i])/fabs(((t *)in1)[i]) > tol) { else if (fabs(((t *)(in1))[i] - ((t *)(in2))[i]) / fabs(((t *)in1)[i]) > tol)
{
fail = true; fail = true;
if(print_max_errs-- > 0) { if (print_max_errs-- > 0)
{
std::cout << "offset " << i << " in1: " << t(((t *)(in1))[i]) << " in2: " << t(((t *)(in2))[i]); std::cout << "offset " << i << " in1: " << t(((t *)(in1))[i]) << " in2: " << t(((t *)(in2))[i]);
std::cout << " tolerance was: " << tol << std::endl; std::cout << " tolerance was: " << tol << std::endl;
} }
@ -329,30 +383,36 @@ bool fcompare(t *in1, t *in2, unsigned int vlen, float tol) {
} }
template <class t> template <class t>
bool ccompare(t *in1, t *in2, unsigned int vlen, float tol) { bool ccompare(t *in1, t *in2, unsigned int vlen, float tol)
{
bool fail = false; bool fail = false;
int print_max_errs = 10; int print_max_errs = 10;
for(unsigned int i=0; i<2*vlen; i+=2) { for (unsigned int i = 0; i < 2 * vlen; i += 2)
{
t diff[2] = {in1[i] - in2[i], in1[i + 1] - in2[i + 1]}; t diff[2] = {in1[i] - in2[i], in1[i + 1] - in2[i + 1]};
t err = std::sqrt(diff[0] * diff[0] + diff[1] * diff[1]); t err = std::sqrt(diff[0] * diff[0] + diff[1] * diff[1]);
t norm = std::sqrt(in1[i] * in1[i] + in1[i + 1] * in1[i + 1]); t norm = std::sqrt(in1[i] * in1[i] + in1[i + 1] * in1[i + 1]);
// for very small numbers we'll see round off errors due to limited // for very small numbers we'll see round off errors due to limited
// precision. So a special test case... // precision. So a special test case...
if (norm < 1e-30) { if (norm < 1e-30)
{
if (err > tol) if (err > tol)
{ {
fail = true; fail = true;
if(print_max_errs-- > 0) { if (print_max_errs-- > 0)
{
std::cout << "offset " << i / 2 << " in1: " << in1[i] << " + " << in1[i + 1] << "j in2: " << in2[i] << " + " << in2[i + 1] << "j"; std::cout << "offset " << i / 2 << " in1: " << in1[i] << " + " << in1[i + 1] << "j in2: " << in2[i] << " + " << in2[i + 1] << "j";
std::cout << " tolerance was: " << tol << std::endl; std::cout << " tolerance was: " << tol << std::endl;
} }
} }
} }
// the primary test is the percent different greater than given tol // the primary test is the percent different greater than given tol
else if((err / norm) > tol) { else if ((err / norm) > tol)
{
fail = true; fail = true;
if(print_max_errs-- > 0) { if (print_max_errs-- > 0)
{
std::cout << "offset " << i / 2 << " in1: " << in1[i] << " + " << in1[i + 1] << "j in2: " << in2[i] << " + " << in2[i + 1] << "j"; std::cout << "offset " << i / 2 << " in1: " << in1[i] << " + " << in1[i + 1] << "j in2: " << in2[i] << " + " << in2[i + 1] << "j";
std::cout << " tolerance was: " << tol << std::endl; std::cout << " tolerance was: " << tol << std::endl;
} }
@ -363,13 +423,17 @@ bool ccompare(t *in1, t *in2, unsigned int vlen, float tol) {
} }
template <class t> template <class t>
bool icompare(t *in1, t *in2, unsigned int vlen, unsigned int tol) { bool icompare(t *in1, t *in2, unsigned int vlen, unsigned int tol)
{
bool fail = false; bool fail = false;
int print_max_errs = 10; int print_max_errs = 10;
for(unsigned int i=0; i<vlen; i++) { for (unsigned int i = 0; i < vlen; i++)
if(((unsigned int)abs(int(((t *)(in1))[i]) - int(((t *)(in2))[i]))) > tol) { {
if (((unsigned int)abs(int(((t *)(in1))[i]) - int(((t *)(in2))[i]))) > tol)
{
fail = true; fail = true;
if(print_max_errs-- > 0) { if (print_max_errs-- > 0)
{
std::cout << "offset " << i << " in1: " << static_cast<int>(t(((t *)(in1))[i])) << " in2: " << static_cast<int>(t(((t *)(in2))[i])); std::cout << "offset " << i << " in1: " << static_cast<int>(t(((t *)(in1))[i])) << " in2: " << static_cast<int>(t(((t *)(in2))[i]));
std::cout << " tolerance was: " << tol << std::endl; std::cout << " tolerance was: " << tol << std::endl;
} }
@ -379,21 +443,27 @@ bool icompare(t *in1, t *in2, unsigned int vlen, unsigned int tol) {
return fail; return fail;
} }
class volk_gnsssdr_qa_aligned_mem_pool{ class volk_gnsssdr_qa_aligned_mem_pool
{
public: public:
void *get_new(size_t size){ void *get_new(size_t size)
{
size_t alignment = volk_gnsssdr_get_alignment(); size_t alignment = volk_gnsssdr_get_alignment();
void *ptr = volk_gnsssdr_malloc(size, alignment); void *ptr = volk_gnsssdr_malloc(size, alignment);
memset(ptr, 0x00, size); memset(ptr, 0x00, size);
_mems.push_back(ptr); _mems.push_back(ptr);
return ptr; return ptr;
} }
~volk_gnsssdr_qa_aligned_mem_pool() { ~volk_gnsssdr_qa_aligned_mem_pool()
for(unsigned int ii = 0; ii < _mems.size(); ++ii) { {
for (unsigned int ii = 0; ii < _mems.size(); ++ii)
{
volk_gnsssdr_free(_mems[ii]); volk_gnsssdr_free(_mems[ii]);
} }
} }
private: std::vector<void * > _mems;
private:
std::vector<void *> _mems;
}; };
bool run_volk_gnsssdr_tests(volk_gnsssdr_func_desc_t desc, bool run_volk_gnsssdr_tests(volk_gnsssdr_func_desc_t desc,
@ -401,8 +471,7 @@ bool run_volk_gnsssdr_tests(volk_gnsssdr_func_desc_t desc,
std::string name, std::string name,
volk_gnsssdr_test_params_t test_params, volk_gnsssdr_test_params_t test_params,
std::vector<volk_gnsssdr_test_results_t> *results, std::vector<volk_gnsssdr_test_results_t> *results,
std::string puppet_master_name std::string puppet_master_name)
)
{ {
return run_volk_gnsssdr_tests(desc, manual_func, name, test_params.tol(), test_params.scalar(), return run_volk_gnsssdr_tests(desc, manual_func, name, test_params.tol(), test_params.scalar(),
test_params.vlen(), test_params.iter(), results, puppet_master_name, test_params.vlen(), test_params.iter(), results, puppet_master_name,
@ -439,7 +508,8 @@ bool run_volk_gnsssdr_tests(volk_gnsssdr_func_desc_t desc,
//first let's get a list of available architectures for the test //first let's get a list of available architectures for the test
std::vector<std::string> arch_list = get_arch_list(desc); std::vector<std::string> arch_list = get_arch_list(desc);
if((!benchmark_mode) && (arch_list.size() < 2)) { if ((!benchmark_mode) && (arch_list.size() < 2))
{
std::cout << "no architectures to test" << std::endl; std::cout << "no architectures to test" << std::endl;
return false; return false;
} }
@ -449,10 +519,12 @@ bool run_volk_gnsssdr_tests(volk_gnsssdr_func_desc_t desc,
//now we have to get a function signature by parsing the name //now we have to get a function signature by parsing the name
std::vector<volk_gnsssdr_type_t> inputsig, outputsig; std::vector<volk_gnsssdr_type_t> inputsig, outputsig;
try { try
{
get_signatures_from_name(inputsig, outputsig, name); get_signatures_from_name(inputsig, outputsig, name);
} }
catch (boost::bad_lexical_cast& error) { catch (boost::bad_lexical_cast &error)
{
std::cerr << "Error: unable to get function signature from kernel name" << std::endl; std::cerr << "Error: unable to get function signature from kernel name" << std::endl;
std::cerr << " - " << name << std::endl; std::cerr << " - " << name << std::endl;
return false; return false;
@ -460,30 +532,37 @@ bool run_volk_gnsssdr_tests(volk_gnsssdr_func_desc_t desc,
//pull the input scalars into their own vector //pull the input scalars into their own vector
std::vector<volk_gnsssdr_type_t> inputsc; std::vector<volk_gnsssdr_type_t> inputsc;
for(size_t i=0; i<inputsig.size(); i++) { for (size_t i = 0; i < inputsig.size(); i++)
if(inputsig[i].is_scalar) { {
if (inputsig[i].is_scalar)
{
inputsc.push_back(inputsig[i]); inputsc.push_back(inputsig[i]);
inputsig.erase(inputsig.begin() + i); inputsig.erase(inputsig.begin() + i);
i -= 1; i -= 1;
} }
} }
std::vector<void *> inbuffs; std::vector<void *> inbuffs;
BOOST_FOREACH(volk_gnsssdr_type_t sig, inputsig) { BOOST_FOREACH (volk_gnsssdr_type_t sig, inputsig)
{
if (!sig.is_scalar) //we don't make buffers for scalars if (!sig.is_scalar) //we don't make buffers for scalars
inbuffs.push_back(mem_pool.get_new(vlen * sig.size * (sig.is_complex ? 2 : 1))); inbuffs.push_back(mem_pool.get_new(vlen * sig.size * (sig.is_complex ? 2 : 1)));
} }
for(size_t i=0; i<inbuffs.size(); i++) { for (size_t i = 0; i < inbuffs.size(); i++)
{
load_random_data(inbuffs[i], inputsig[i], vlen); load_random_data(inbuffs[i], inputsig[i], vlen);
} }
//ok let's make a vector of vector of void buffers, which holds the input/output vectors for each arch //ok let's make a vector of vector of void buffers, which holds the input/output vectors for each arch
std::vector<std::vector<void *> > test_data; std::vector<std::vector<void *> > test_data;
for(size_t i=0; i<arch_list.size(); i++) { for (size_t i = 0; i < arch_list.size(); i++)
{
std::vector<void *> arch_buffs; std::vector<void *> arch_buffs;
for(size_t j=0; j<outputsig.size(); j++) { for (size_t j = 0; j < outputsig.size(); j++)
{
arch_buffs.push_back(mem_pool.get_new(vlen * outputsig[j].size * (outputsig[j].is_complex ? 2 : 1))); arch_buffs.push_back(mem_pool.get_new(vlen * outputsig[j].size * (outputsig[j].is_complex ? 2 : 1)));
} }
for(size_t j=0; j<inputsig.size(); j++) { for (size_t j = 0; j < inputsig.size(); j++)
{
void *arch_inbuff = mem_pool.get_new(vlen * inputsig[j].size * (inputsig[j].is_complex ? 2 : 1)); void *arch_inbuff = mem_pool.get_new(vlen * inputsig[j].size * (inputsig[j].is_complex ? 2 : 1));
memcpy(arch_inbuff, inbuffs[j], vlen * inputsig[j].size * (inputsig[j].is_complex ? 2 : 1)); memcpy(arch_inbuff, inbuffs[j], vlen * inputsig[j].size * (inputsig[j].is_complex ? 2 : 1));
arch_buffs.push_back(arch_inbuff); arch_buffs.push_back(arch_inbuff);
@ -499,7 +578,8 @@ bool run_volk_gnsssdr_tests(volk_gnsssdr_func_desc_t desc,
vlen = vlen - vlen_twiddle; vlen = vlen - vlen_twiddle;
std::chrono::time_point<std::chrono::system_clock> start, end; std::chrono::time_point<std::chrono::system_clock> start, end;
std::vector<double> profile_times; std::vector<double> profile_times;
for(size_t i = 0; i < arch_list.size(); i++) { for (size_t i = 0; i < arch_list.size(); i++)
{
start = std::chrono::system_clock::now(); start = std::chrono::system_clock::now();
switch (both_sigs.size()) switch (both_sigs.size())
@ -540,7 +620,8 @@ bool run_volk_gnsssdr_tests(volk_gnsssdr_func_desc_t desc,
} }
} }
//ADDED BY GNSS-SDR. END //ADDED BY GNSS-SDR. END
else throw "unsupported 1 arg function >1 scalars"; else
throw "unsupported 1 arg function >1 scalars";
break; break;
case 2: case 2:
if (inputsc.size() == 0) if (inputsc.size() == 0)
@ -578,7 +659,8 @@ bool run_volk_gnsssdr_tests(volk_gnsssdr_func_desc_t desc,
} }
} }
//ADDED BY GNSS-SDR. END //ADDED BY GNSS-SDR. END
else throw "unsupported 2 arg function >1 scalars"; else
throw "unsupported 2 arg function >1 scalars";
break; break;
case 3: case 3:
if (inputsc.size() == 0) if (inputsc.size() == 0)
@ -618,7 +700,8 @@ bool run_volk_gnsssdr_tests(volk_gnsssdr_func_desc_t desc,
} }
} }
//ADDED BY GNSS-SDR. END //ADDED BY GNSS-SDR. END
else throw "unsupported 3 arg function >1 scalars"; else
throw "unsupported 3 arg function >1 scalars";
break; break;
default: default:
throw "no function handler for this signature"; throw "no function handler for this signature";
@ -642,8 +725,10 @@ bool run_volk_gnsssdr_tests(volk_gnsssdr_func_desc_t desc,
//and now compare each output to the generic output //and now compare each output to the generic output
//first we have to know which output is the generic one, they aren't in order... //first we have to know which output is the generic one, they aren't in order...
size_t generic_offset = 0; size_t generic_offset = 0;
for(size_t i=0; i<arch_list.size(); i++) { for (size_t i = 0; i < arch_list.size(); i++)
if (arch_list[i] == "generic") { {
if (arch_list[i] == "generic")
{
generic_offset = i; generic_offset = i;
} }
} }
@ -795,9 +880,12 @@ bool run_volk_gnsssdr_tests(volk_gnsssdr_func_desc_t desc,
std::cout << "Best aligned arch: " << best_arch_a << std::endl; std::cout << "Best aligned arch: " << best_arch_a << std::endl;
std::cout << "Best unaligned arch: " << best_arch_u << std::endl; std::cout << "Best unaligned arch: " << best_arch_u << std::endl;
if(puppet_master_name == "NULL") { if (puppet_master_name == "NULL")
{
results->back().config_name = name; results->back().config_name = name;
} else { }
else
{
results->back().config_name = puppet_master_name; results->back().config_name = puppet_master_name;
} }
results->back().best_arch_a = best_arch_a; results->back().best_arch_a = best_arch_a;

View File

@ -35,7 +35,8 @@
/************************************************ /************************************************
* VOLK QA type definitions * * VOLK QA type definitions *
************************************************/ ************************************************/
struct volk_gnsssdr_type_t { struct volk_gnsssdr_type_t
{
bool is_float; bool is_float;
bool is_scalar; bool is_scalar;
bool is_signed; bool is_signed;
@ -44,7 +45,8 @@ struct volk_gnsssdr_type_t {
std::string str; std::string str;
}; };
class volk_gnsssdr_test_time_t { class volk_gnsssdr_test_time_t
{
public: public:
std::string name; std::string name;
double time; double time;
@ -52,7 +54,8 @@ class volk_gnsssdr_test_time_t {
bool pass; bool pass;
}; };
class volk_gnsssdr_test_results_t { class volk_gnsssdr_test_results_t
{
public: public:
std::string name; std::string name;
std::string config_name; std::string config_name;
@ -63,7 +66,8 @@ class volk_gnsssdr_test_results_t {
std::string best_arch_u; std::string best_arch_u;
}; };
class volk_gnsssdr_test_params_t { class volk_gnsssdr_test_params_t
{
private: private:
float _tol; float _tol;
lv_32fc_t _scalar; lv_32fc_t _scalar;
@ -71,12 +75,11 @@ class volk_gnsssdr_test_params_t {
unsigned int _iter; unsigned int _iter;
bool _benchmark_mode; bool _benchmark_mode;
std::string _kernel_regex; std::string _kernel_regex;
public: public:
// ctor // ctor
volk_gnsssdr_test_params_t(float tol, lv_32fc_t scalar, unsigned int vlen, unsigned int iter, volk_gnsssdr_test_params_t(float tol, lv_32fc_t scalar, unsigned int vlen, unsigned int iter,
bool benchmark_mode, std::string kernel_regex) : bool benchmark_mode, std::string kernel_regex) : _tol(tol), _scalar(scalar), _vlen(vlen), _iter(iter), _benchmark_mode(benchmark_mode), _kernel_regex(kernel_regex){};
_tol(tol), _scalar(scalar), _vlen(vlen), _iter(iter),
_benchmark_mode(benchmark_mode), _kernel_regex(kernel_regex) {};
// setters // setters
void set_tol(float tol) { _tol = tol; }; void set_tol(float tol) { _tol = tol; };
void set_scalar(lv_32fc_t scalar) { _scalar = scalar; }; void set_scalar(lv_32fc_t scalar) { _scalar = scalar; };
@ -93,13 +96,15 @@ class volk_gnsssdr_test_params_t {
std::string kernel_regex() { return _kernel_regex; }; std::string kernel_regex() { return _kernel_regex; };
}; };
class volk_gnsssdr_test_case_t { class volk_gnsssdr_test_case_t
{
private: private:
volk_gnsssdr_func_desc_t _desc; volk_gnsssdr_func_desc_t _desc;
void (*_kernel_ptr)(); void (*_kernel_ptr)();
std::string _name; std::string _name;
volk_gnsssdr_test_params_t _test_parameters; volk_gnsssdr_test_params_t _test_parameters;
std::string _puppet_master_name; std::string _puppet_master_name;
public: public:
volk_gnsssdr_func_desc_t desc() { return _desc; }; volk_gnsssdr_func_desc_t desc() { return _desc; };
void (*kernel_ptr())() { return _kernel_ptr; }; void (*kernel_ptr())() { return _kernel_ptr; };
@ -108,16 +113,10 @@ class volk_gnsssdr_test_case_t {
volk_gnsssdr_test_params_t test_parameters() { return _test_parameters; }; volk_gnsssdr_test_params_t test_parameters() { return _test_parameters; };
// normal ctor // normal ctor
volk_gnsssdr_test_case_t(volk_gnsssdr_func_desc_t desc, void (*kernel_ptr)(), std::string name, volk_gnsssdr_test_case_t(volk_gnsssdr_func_desc_t desc, void (*kernel_ptr)(), std::string name,
volk_gnsssdr_test_params_t test_parameters) : volk_gnsssdr_test_params_t test_parameters) : _desc(desc), _kernel_ptr(kernel_ptr), _name(name), _test_parameters(test_parameters), _puppet_master_name("NULL"){};
_desc(desc), _kernel_ptr(kernel_ptr), _name(name), _test_parameters(test_parameters),
_puppet_master_name("NULL")
{};
// ctor for puppets // ctor for puppets
volk_gnsssdr_test_case_t(volk_gnsssdr_func_desc_t desc, void (*kernel_ptr)(), std::string name, volk_gnsssdr_test_case_t(volk_gnsssdr_func_desc_t desc, void (*kernel_ptr)(), std::string name,
std::string puppet_master_name, volk_gnsssdr_test_params_t test_parameters) : std::string puppet_master_name, volk_gnsssdr_test_params_t test_parameters) : _desc(desc), _kernel_ptr(kernel_ptr), _name(name), _test_parameters(test_parameters), _puppet_master_name(puppet_master_name){};
_desc(desc), _kernel_ptr(kernel_ptr), _name(name), _test_parameters(test_parameters),
_puppet_master_name(puppet_master_name)
{};
}; };
/************************************************ /************************************************
@ -134,8 +133,7 @@ bool run_volk_gnsssdr_tests(
std::string, std::string,
volk_gnsssdr_test_params_t, volk_gnsssdr_test_params_t,
std::vector<volk_gnsssdr_test_results_t> *results = NULL, std::vector<volk_gnsssdr_test_results_t> *results = NULL,
std::string puppet_master_name = "NULL" std::string puppet_master_name = "NULL");
);
bool run_volk_gnsssdr_tests( bool run_volk_gnsssdr_tests(
volk_gnsssdr_func_desc_t, volk_gnsssdr_func_desc_t,
@ -147,12 +145,12 @@ bool run_volk_gnsssdr_tests(
unsigned int, unsigned int,
std::vector<volk_gnsssdr_test_results_t> *results = NULL, std::vector<volk_gnsssdr_test_results_t> *results = NULL,
std::string puppet_master_name = "NULL", std::string puppet_master_name = "NULL",
bool benchmark_mode = false bool benchmark_mode = false);
);
#define VOLK_RUN_TESTS(func, tol, scalar, len, iter) \ #define VOLK_RUN_TESTS(func, tol, scalar, len, iter) \
BOOST_AUTO_TEST_CASE(func##_test) { \ BOOST_AUTO_TEST_CASE(func##_test) \
{ \
BOOST_CHECK_EQUAL(run_volk_gnsssdr_tests( \ BOOST_CHECK_EQUAL(run_volk_gnsssdr_tests( \
func##_get_func_desc(), (void (*)())func##_manual, \ func##_get_func_desc(), (void (*)())func##_manual, \
std::string(#func), tol, scalar, len, iter, 0, "NULL"), \ std::string(#func), tol, scalar, len, iter, 0, "NULL"), \

View File

@ -49,20 +49,24 @@ int main()
std::vector<std::string> qa_failures; std::vector<std::string> qa_failures;
std::vector<volk_gnsssdr_test_results_t> results; std::vector<volk_gnsssdr_test_results_t> results;
// Test every kernel reporting failures when they occur // Test every kernel reporting failures when they occur
for(unsigned int ii = 0; ii < test_cases.size(); ++ii) { for (unsigned int ii = 0; ii < test_cases.size(); ++ii)
{
bool qa_result = false; bool qa_result = false;
volk_gnsssdr_test_case_t test_case = test_cases[ii]; volk_gnsssdr_test_case_t test_case = test_cases[ii];
try { try
{
qa_result = run_volk_gnsssdr_tests(test_case.desc(), test_case.kernel_ptr(), test_case.name(), qa_result = run_volk_gnsssdr_tests(test_case.desc(), test_case.kernel_ptr(), test_case.name(),
test_case.test_parameters(), &results, test_case.puppet_master_name()); test_case.test_parameters(), &results, test_case.puppet_master_name());
} }
catch(...) { catch (...)
{
// TODO: what exceptions might we need to catch and how do we handle them? // TODO: what exceptions might we need to catch and how do we handle them?
std::cerr << "Exception found on kernel: " << test_case.name() << std::endl; std::cerr << "Exception found on kernel: " << test_case.name() << std::endl;
qa_result = false; qa_result = false;
} }
if(qa_result) { if (qa_result)
{
std::cerr << "Failure on " << test_case.name() << std::endl; std::cerr << "Failure on " << test_case.name() << std::endl;
qa_failures.push_back(test_case.name()); qa_failures.push_back(test_case.name());
} }
@ -74,9 +78,11 @@ int main()
// Summarize QA results // Summarize QA results
std::cerr << "Kernel QA finished: " << qa_failures.size() << " failures out of " std::cerr << "Kernel QA finished: " << qa_failures.size() << " failures out of "
<< test_cases.size() << " tests." << std::endl; << test_cases.size() << " tests." << std::endl;
if(qa_failures.size() > 0) { if (qa_failures.size() > 0)
{
std::cerr << "The following kernels failed QA:" << std::endl; std::cerr << "The following kernels failed QA:" << std::endl;
for(unsigned int ii = 0; ii < qa_failures.size(); ++ii) { for (unsigned int ii = 0; ii < qa_failures.size(); ++ii)
{
std::cerr << " " << qa_failures[ii] << std::endl; std::cerr << " " << qa_failures[ii] << std::endl;
} }
qa_ret_val = 1; qa_ret_val = 1;
@ -95,26 +101,28 @@ void print_qa_xml(std::vector<volk_gnsssdr_test_results_t> results, unsigned int
qa_file.open(".unittest/kernels.xml"); qa_file.open(".unittest/kernels.xml");
qa_file << "<?xml version=\"1.0\" encoding=\"UTF-8\"?>" << std::endl; qa_file << "<?xml version=\"1.0\" encoding=\"UTF-8\"?>" << std::endl;
qa_file << "<testsuites name=\"kernels\" " << qa_file << "<testsuites name=\"kernels\" "
"tests=\"" << results.size() << "\" " << << "tests=\"" << results.size() << "\" "
"failures=\"" << nfails << "\" id=\"1\">" << std::endl; << "failures=\"" << nfails << "\" id=\"1\">" << std::endl;
// Results are in a vector by kernel. Each element has a result // Results are in a vector by kernel. Each element has a result
// map containing time and arch name with test result // map containing time and arch name with test result
for(unsigned int ii=0; ii < results.size(); ++ii) { for (unsigned int ii = 0; ii < results.size(); ++ii)
{
volk_gnsssdr_test_results_t result = results[ii]; volk_gnsssdr_test_results_t result = results[ii];
qa_file << " <testsuite name=\"" << result.name << "\">" << std::endl; qa_file << " <testsuite name=\"" << result.name << "\">" << std::endl;
std::map<std::string, volk_gnsssdr_test_time_t>::iterator kernel_time_pair; std::map<std::string, volk_gnsssdr_test_time_t>::iterator kernel_time_pair;
for(kernel_time_pair = result.results.begin(); kernel_time_pair != result.results.end(); ++kernel_time_pair) { for (kernel_time_pair = result.results.begin(); kernel_time_pair != result.results.end(); ++kernel_time_pair)
{
volk_gnsssdr_test_time_t test_time = kernel_time_pair->second; volk_gnsssdr_test_time_t test_time = kernel_time_pair->second;
qa_file << " <testcase name=\"" << test_time.name << "\" " << qa_file << " <testcase name=\"" << test_time.name << "\" "
"classname=\"" << result.name << "\" " << << "classname=\"" << result.name << "\" "
"time=\"" << test_time.time << "\">" << std::endl; << "time=\"" << test_time.time << "\">" << std::endl;
if (!test_time.pass) if (!test_time.pass)
qa_file << " <failure " << qa_file << " <failure "
"message=\"fail on arch " << test_time.name << "\">" << << "message=\"fail on arch " << test_time.name << "\">"
"</failure>" << std::endl; << "</failure>" << std::endl;
qa_file << " </testcase>" << std::endl; qa_file << " </testcase>" << std::endl;
} }
qa_file << " </testsuite>" << std::endl; qa_file << " </testsuite>" << std::endl;
@ -123,6 +131,4 @@ void print_qa_xml(std::vector<volk_gnsssdr_test_results_t> results, unsigned int
qa_file << "</testsuites>" << std::endl; qa_file << "</testsuites>" << std::endl;
qa_file.close(); qa_file.close();
} }

View File

@ -51,7 +51,8 @@ void *volk_gnsssdr_malloc(size_t size, size_t alignment)
{ {
fprintf(stderr, fprintf(stderr,
"VOLK_GNSSSDR: Error allocating memory " "VOLK_GNSSSDR: Error allocating memory "
"(posix_memalign: error %d: %s)\n", err, strerror(err)); "(posix_memalign: error %d: %s)\n",
err, strerror(err));
return NULL; return NULL;
} }
} }
@ -112,8 +113,7 @@ volk_gnsssdr_malloc(size_t size, size_t alignment)
return user; return user;
} }
void void volk_gnsssdr_free(void *ptr)
volk_gnsssdr_free(void *ptr)
{ {
struct block_info *info; struct block_info *info;

View File

@ -31,7 +31,8 @@ void volk_gnsssdr_get_config_path(char *path)
//allows config redirection via env variable //allows config redirection via env variable
home = getenv("VOLK_CONFIGPATH"); home = getenv("VOLK_CONFIGPATH");
if(home!=NULL){ if (home != NULL)
{
strncpy(path, home, 512); strncpy(path, home, 512);
strcat(path, suffix2); strcat(path, suffix2);
return; return;

View File

@ -23,7 +23,8 @@
#include <stdbool.h> #include <stdbool.h>
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C"
{
#endif #endif
int volk_gnsssdr_get_index( int volk_gnsssdr_get_index(

View File

@ -37,13 +37,17 @@ struct volk_gnsssdr_machine *get_machine(void)
if (machine != NULL) if (machine != NULL)
return machine; return machine;
else { else
{
unsigned int max_score = 0; unsigned int max_score = 0;
unsigned int i; unsigned int i;
struct volk_gnsssdr_machine *max_machine = NULL; struct volk_gnsssdr_machine *max_machine = NULL;
for(i=0; i<n_volk_gnsssdr_machines; i++) { for (i = 0; i < n_volk_gnsssdr_machines; i++)
if(!(volk_gnsssdr_machines[i]->caps & (~volk_gnsssdr_get_lvarch()))) { {
if(volk_gnsssdr_machines[i]->caps > max_score) { if (!(volk_gnsssdr_machines[i]->caps & (~volk_gnsssdr_get_lvarch())))
{
if (volk_gnsssdr_machines[i]->caps > max_score)
{
max_score = volk_gnsssdr_machines[i]->caps; max_score = volk_gnsssdr_machines[i]->caps;
max_machine = volk_gnsssdr_machines[i]; max_machine = volk_gnsssdr_machines[i];
} }
@ -63,8 +67,10 @@ void volk_gnsssdr_list_machines(void)
extern unsigned int n_volk_gnsssdr_machines; extern unsigned int n_volk_gnsssdr_machines;
unsigned int i; unsigned int i;
for(i=0; i<n_volk_gnsssdr_machines; i++) { for (i = 0; i < n_volk_gnsssdr_machines; i++)
if(!(volk_gnsssdr_machines[i]->caps & (~volk_gnsssdr_get_lvarch()))) { {
if (!(volk_gnsssdr_machines[i]->caps & (~volk_gnsssdr_get_lvarch())))
{
printf("%s;", volk_gnsssdr_machines[i]->name); printf("%s;", volk_gnsssdr_machines[i]->name);
} }
} }
@ -79,13 +85,17 @@ const char* volk_gnsssdr_get_machine(void)
if (machine != NULL) if (machine != NULL)
return machine->name; return machine->name;
else { else
{
unsigned int max_score = 0; unsigned int max_score = 0;
unsigned int i; unsigned int i;
struct volk_gnsssdr_machine *max_machine = NULL; struct volk_gnsssdr_machine *max_machine = NULL;
for(i=0; i<n_volk_gnsssdr_machines; i++) { for (i = 0; i < n_volk_gnsssdr_machines; i++)
if(!(volk_gnsssdr_machines[i]->caps & (~volk_gnsssdr_get_lvarch()))) { {
if(volk_gnsssdr_machines[i]->caps > max_score) { if (!(volk_gnsssdr_machines[i]->caps & (~volk_gnsssdr_get_lvarch())))
{
if (volk_gnsssdr_machines[i]->caps > max_score)
{
max_score = volk_gnsssdr_machines[i]->caps; max_score = volk_gnsssdr_machines[i]->caps;
max_machine = volk_gnsssdr_machines[i]; max_machine = volk_gnsssdr_machines[i];
} }
@ -118,8 +128,7 @@ bool volk_gnsssdr_is_aligned(const void *ptr)
static inline void __${kern.name}_d(${kern.arglist_full}) static inline void __${kern.name}_d(${kern.arglist_full})
{ {
%if kern.has_dispatcher: % if kern.has_dispatcher : ${kern.name} _dispatcher(${kern.arglist_names});
${kern.name}_dispatcher(${kern.arglist_names});
return; return;
%endif %endif
@ -183,14 +192,13 @@ void ${kern.name}_manual(${kern.arglist_full}, const char* impl_name)
const int index = volk_gnsssdr_get_index( const int index = volk_gnsssdr_get_index(
get_machine()->${kern.name} _impl_names, get_machine()->${kern.name} _impl_names,
get_machine()->${kern.name} _n_impls, get_machine()->${kern.name} _n_impls,
impl_name impl_name);
);
get_machine()->${kern.name} _impls[index]( get_machine()->${kern.name} _impls[index](
${kern.arglist_names} ${kern.arglist_names});
);
} }
volk_gnsssdr_func_desc_t ${kern.name}_get_func_desc(void) { volk_gnsssdr_func_desc_t ${kern.name} _get_func_desc(void)
{
const char **impl_names = get_machine()->${kern.name} _impl_names; const char **impl_names = get_machine()->${kern.name} _impl_names;
const int *impl_deps = get_machine()->${kern.name} _impl_deps; const int *impl_deps = get_machine()->${kern.name} _impl_deps;
const bool *alignment = get_machine()->${kern.name} _impl_alignment; const bool *alignment = get_machine()->${kern.name} _impl_alignment;
@ -199,8 +207,7 @@ volk_gnsssdr_func_desc_t ${kern.name}_get_func_desc(void) {
impl_names, impl_names,
impl_deps, impl_deps,
alignment, alignment,
n_impls n_impls};
};
return desc; return desc;
} }

View File

@ -21,7 +21,8 @@
%for i, arch in enumerate(archs): %for i, arch in enumerate(archs):
//#ifndef LV_${arch.name.upper()} //#ifndef LV_${arch.name.upper()}
#define LV_${arch.name.upper()} ${i} #define LV_$ \
{arch.name.upper()} $ { i }
//#endif //#endif
%endfor %endfor

View File

@ -40,9 +40,12 @@ struct VOLK_CPU volk_gnsssdr_cpu;
* check for AVX capability before executing. * check for AVX capability before executing.
*/ */
#if ((__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 2) || (__clang_major__ >= 3)) && defined(HAVE_XGETBV) #if ((__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 2) || (__clang_major__ >= 3)) && defined(HAVE_XGETBV)
static inline unsigned long long _xgetbv(unsigned int index){ static inline unsigned long long _xgetbv(unsigned int index)
{
unsigned int eax, edx; unsigned int eax, edx;
__VOLK_ASM __VOLK_VOLATILE ("xgetbv" : "=a"(eax), "=d"(edx) : "c"(index)); __VOLK_ASM __VOLK_VOLATILE("xgetbv"
: "=a"(eax), "=d"(edx)
: "c"(index));
return ((unsigned long long)edx << 32) | eax; return ((unsigned long long)edx << 32) | eax;
} }
#define __xgetbv() _xgetbv(0) #define __xgetbv() _xgetbv(0)
@ -67,7 +70,8 @@ struct VOLK_CPU volk_gnsssdr_cpu;
#endif //defined(VOLK_CPU_x86) #endif //defined(VOLK_CPU_x86)
static inline unsigned int cpuid_count_x86_bit(unsigned int level, unsigned int count, unsigned int reg, unsigned int bit) { static inline unsigned int cpuid_count_x86_bit(unsigned int level, unsigned int count, unsigned int reg, unsigned int bit)
{
#if defined(VOLK_CPU_x86) #if defined(VOLK_CPU_x86)
unsigned int regs[4] = {0}; unsigned int regs[4] = {0};
cpuid_x86_count(level, count, regs); cpuid_x86_count(level, count, regs);
@ -77,7 +81,8 @@ static inline unsigned int cpuid_count_x86_bit(unsigned int level, unsigned int
#endif #endif
} }
static inline unsigned int cpuid_x86_bit(unsigned int reg, unsigned int op, unsigned int bit) { static inline unsigned int cpuid_x86_bit(unsigned int reg, unsigned int op, unsigned int bit)
{
#if defined(VOLK_CPU_x86) #if defined(VOLK_CPU_x86)
unsigned int regs[4]; unsigned int regs[4];
memset(regs, 0, sizeof(unsigned int) * 4); memset(regs, 0, sizeof(unsigned int) * 4);
@ -88,7 +93,8 @@ static inline unsigned int cpuid_x86_bit(unsigned int reg, unsigned int op, unsi
#endif #endif
} }
static inline unsigned int check_extended_cpuid(unsigned int val) { static inline unsigned int check_extended_cpuid(unsigned int val)
{
#if defined(VOLK_CPU_x86) #if defined(VOLK_CPU_x86)
unsigned int regs[4]; unsigned int regs[4];
memset(regs, 0, sizeof(unsigned int) * 4); memset(regs, 0, sizeof(unsigned int) * 4);
@ -99,7 +105,8 @@ static inline unsigned int check_extended_cpuid(unsigned int val) {
#endif #endif
} }
static inline unsigned int get_avx_enabled(void) { static inline unsigned int get_avx_enabled(void)
{
#if defined(VOLK_CPU_x86) #if defined(VOLK_CPU_x86)
return __xgetbv() & 0x6; return __xgetbv() & 0x6;
#else #else
@ -107,7 +114,8 @@ static inline unsigned int get_avx_enabled(void) {
#endif #endif
} }
static inline unsigned int get_avx2_enabled(void) { static inline unsigned int get_avx2_enabled(void)
{
#if defined(VOLK_CPU_x86) #if defined(VOLK_CPU_x86)
return __xgetbv() & 0x6; return __xgetbv() & 0x6;
#else #else
@ -123,7 +131,8 @@ static inline unsigned int get_avx2_enabled(void) {
#define VOLK_CPU_ARM #define VOLK_CPU_ARM
#endif #endif
static int has_neon(void){ static int has_neon(void)
{
#if defined(VOLK_CPU_ARM) #if defined(VOLK_CPU_ARM)
FILE *auxvec_f; FILE *auxvec_f;
unsigned long auxvec[2]; unsigned long auxvec[2];
@ -134,7 +143,8 @@ static int has_neon(void){
size_t r = 1; size_t r = 1;
//so auxv is basically 32b of ID and 32b of value //so auxv is basically 32b of ID and 32b of value
//so it goes like this //so it goes like this
while(!found_neon && r) { while (!found_neon && r)
{
r = fread(auxvec, sizeof(unsigned long), 2, auxvec_f); r = fread(auxvec, sizeof(unsigned long), 2, auxvec_f);
if ((auxvec[0] == AT_HWCAP) && (auxvec[1] & HWCAP_NEON)) if ((auxvec[0] == AT_HWCAP) && (auxvec[1] & HWCAP_NEON))
found_neon = 1; found_neon = 1;
@ -148,50 +158,59 @@ static int has_neon(void){
} }
%for arch in archs: %for arch in archs:
static int i_can_has_${arch.name} (void) { static int i_can_has_${arch.name} (void)
{
%for check, params in arch.checks: %for check, params in arch.checks:
if (${check}(<% joined_params = ', '.join(params)%>${joined_params}) == 0) return 0; if (${check}(<% joined_params = ', '.join(params)%>${joined_params}) == 0) return 0;
%endfor % endfor return 1;
return 1;
} }
% endfor % endfor
#if defined(HAVE_FENV_H) #if defined(HAVE_FENV_H)
#if defined(FE_TONEAREST) #if defined(FE_TONEAREST)
#include <fenv.h> #include <fenv.h>
static inline void set_float_rounding(void){ static inline void
set_float_rounding(void)
{
fesetround(FE_TONEAREST); fesetround(FE_TONEAREST);
} }
#else #else
static inline void set_float_rounding(void){ static inline void
set_float_rounding(void)
{
//do nothing //do nothing
} }
#endif #endif
#elif defined(_MSC_VER) #elif defined(_MSC_VER)
#include <float.h> #include <float.h>
static inline void set_float_rounding(void){ static inline void
set_float_rounding(void)
{
unsigned int cwrd; unsigned int cwrd;
_controlfp_s(&cwrd, 0, 0); _controlfp_s(&cwrd, 0, 0);
_controlfp_s(&cwrd, _RC_NEAR, _MCW_RC); _controlfp_s(&cwrd, _RC_NEAR, _MCW_RC);
} }
#else #else
static inline void set_float_rounding(void){ static inline void
set_float_rounding(void)
{
//do nothing //do nothing
} }
#endif #endif
void volk_gnsssdr_cpu_init() { void volk_gnsssdr_cpu_init()
{
%for arch in archs: %for arch in archs:
volk_gnsssdr_cpu.has_${arch.name} = &i_can_has_${arch.name}; volk_gnsssdr_cpu.has_${arch.name} = &i_can_has_${arch.name};
% endfor % endfor
set_float_rounding(); set_float_rounding();
} }
unsigned int volk_gnsssdr_get_lvarch() { unsigned int volk_gnsssdr_get_lvarch()
{
unsigned int retval = 0; unsigned int retval = 0;
volk_gnsssdr_cpu_init(); volk_gnsssdr_cpu_init();
%for arch in archs: %for arch in archs:
retval += volk_gnsssdr_cpu.has_${arch.name}() << LV_${arch.name.upper()}; retval += volk_gnsssdr_cpu.has_${arch.name}() << LV_${arch.name.upper()};
%endfor % endfor return retval;
return retval;
} }

View File

@ -23,7 +23,8 @@
__VOLK_DECL_BEGIN __VOLK_DECL_BEGIN
struct VOLK_CPU { struct VOLK_CPU
{
%for arch in archs: %for arch in archs:
int (*has_${arch.name}) (); int (*has_${arch.name}) ();
% endfor % endfor

View File

@ -20,7 +20,11 @@
<% arch_names = this_machine.arch_names %> <% arch_names = this_machine.arch_names %>
%for arch in this_machine.archs: %for arch in this_machine.archs:
#define LV_HAVE_${arch.name.upper()} 1 #define LV_HAVE_$ \
{ \
arch.name.upper() \
} \
1
%endfor %endfor
#include <volk_gnsssdr/volk_gnsssdr_common.h> #include <volk_gnsssdr/volk_gnsssdr_common.h>
@ -35,7 +39,9 @@
#include <volk_gnsssdr/${kern.name}.h> #include <volk_gnsssdr/${kern.name}.h>
%endfor %endfor
struct volk_gnsssdr_machine volk_gnsssdr_machine_${this_machine.name} = { struct volk_gnsssdr_machine volk_gnsssdr_machine_$
{
this_machine.name} = {
<% make_arch_have_list = (' | '.join(['(1 << LV_%s)'%a.name.upper() for a in this_machine.archs])) %> ${make_arch_have_list}, <% make_arch_have_list = (' | '.join(['(1 << LV_%s)'%a.name.upper() for a in this_machine.archs])) %> ${make_arch_have_list},
<% this_machine_name = "\""+this_machine.name+"\"" %> ${this_machine_name}, <% this_machine_name = "\""+this_machine.name+"\"" %> ${this_machine_name},
${this_machine.alignment}, ${this_machine.alignment},

View File

@ -27,7 +27,8 @@
__VOLK_DECL_BEGIN __VOLK_DECL_BEGIN
struct volk_gnsssdr_machine { struct volk_gnsssdr_machine
{
const unsigned int caps; //capabilities (i.e., archs compiled into this machine, in the volk_gnsssdr_get_lvarch format) const unsigned int caps; //capabilities (i.e., archs compiled into this machine, in the volk_gnsssdr_get_lvarch format)
const char *name; const char *name;
const size_t alignment; //the maximum byte alignment required for functions in this library const size_t alignment; //the maximum byte alignment required for functions in this library
@ -43,7 +44,10 @@ struct volk_gnsssdr_machine {
%for machine in machines: %for machine in machines:
#ifdef LV_MACHINE_${machine.name.upper() } #ifdef LV_MACHINE_${machine.name.upper() }
extern struct volk_gnsssdr_machine volk_gnsssdr_machine_${machine.name}; extern struct volk_gnsssdr_machine volk_gnsssdr_machine_$
{
machine.name
};
#endif #endif
% endfor % endfor